author | martin |
Mon, 31 Aug 2009 15:00:04 -0700 | |
changeset 3714 | 6a4eb8f53f91 |
parent 1092 | 5a73ac754ac7 |
child 5506 | 202f599c92aa |
permissions | -rw-r--r-- |
2 | 1 |
/* |
715 | 2 |
* Copyright 2000-2008 Sun Microsystems, Inc. All Rights Reserved. |
2 | 3 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 |
* |
|
5 |
* This code is free software; you can redistribute it and/or modify it |
|
6 |
* under the terms of the GNU General Public License version 2 only, as |
|
7 |
* published by the Free Software Foundation. Sun designates this |
|
8 |
* particular file as subject to the "Classpath" exception as provided |
|
9 |
* by Sun in the LICENSE file that accompanied this code. |
|
10 |
* |
|
11 |
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
12 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
13 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
14 |
* version 2 for more details (a copy is included in the LICENSE file that |
|
15 |
* accompanied this code). |
|
16 |
* |
|
17 |
* You should have received a copy of the GNU General Public License version |
|
18 |
* 2 along with this work; if not, write to the Free Software Foundation, |
|
19 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
20 |
* |
|
21 |
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, |
|
22 |
* CA 95054 USA or visit www.sun.com if you need additional information or |
|
23 |
* have any questions. |
|
24 |
*/ |
|
25 |
||
26 |
package sun.nio.cs; |
|
27 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
28 |
import java.nio.Buffer; |
2 | 29 |
import java.nio.ByteBuffer; |
30 |
import java.nio.CharBuffer; |
|
31 |
import java.nio.charset.Charset; |
|
32 |
import java.nio.charset.CharsetDecoder; |
|
33 |
import java.nio.charset.CharsetEncoder; |
|
34 |
import java.nio.charset.CoderResult; |
|
35 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
36 |
/* Legal UTF-8 Byte Sequences |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
37 |
* |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
38 |
* # Code Points Bits Bit/Byte pattern |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
39 |
* 1 7 0xxxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
40 |
* U+0000..U+007F 00..7F |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
41 |
* |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
42 |
* 2 11 110xxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
43 |
* U+0080..U+07FF C2..DF 80..BF |
2 | 44 |
* |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
45 |
* 3 16 1110xxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
46 |
* U+0800..U+0FFF E0 A0..BF 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
47 |
* U+1000..U+FFFF E1..EF 80..BF 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
48 |
* |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
49 |
* 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
50 |
* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
51 |
* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
52 |
* U+100000..U+10FFFF F4 80..8F 80..BF 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
53 |
* |
2 | 54 |
*/ |
55 |
||
56 |
class UTF_8 extends Unicode |
|
57 |
{ |
|
58 |
/**
 * Creates the charset object, passing the canonical name {@code "UTF-8"}
 * and the alias table {@code StandardCharsets.aliases_UTF_8} to the
 * superclass constructor.
 */
public UTF_8() {
    super("UTF-8", StandardCharsets.aliases_UTF_8);
}
|
61 |
||
62 |
public String historicalName() { |
|
63 |
return "UTF8"; |
|
64 |
} |
|
65 |
||
66 |
public CharsetDecoder newDecoder() { |
|
67 |
return new Decoder(this); |
|
68 |
} |
|
69 |
||
70 |
public CharsetEncoder newEncoder() { |
|
71 |
return new Encoder(this); |
|
72 |
} |
|
73 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
74 |
static final void updatePositions(Buffer src, int sp, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
75 |
Buffer dst, int dp) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
76 |
src.position(sp - src.arrayOffset()); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
77 |
dst.position(dp - dst.arrayOffset()); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
78 |
} |
2 | 79 |
|
80 |
private static class Decoder extends CharsetDecoder { |
|
81 |
/**
 * Creates a decoder for the given charset, passing {@code 1.0f} for both
 * float arguments of the {@code CharsetDecoder} constructor
 * (average and maximum chars produced per input byte).
 *
 * @param cs the charset this decoder belongs to
 */
private Decoder(Charset cs) {
    super(cs, 1.0f, 1.0f);
}
|
84 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
85 |
private static boolean isNotContinuation(int b) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
86 |
return (b & 0xc0) != 0x80; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
87 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
88 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
89 |
// [C2..DF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
90 |
private static boolean isMalformed2(int b1, int b2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
91 |
return (b1 & 0x1e) == 0x0 || (b2 & 0xc0) != 0x80; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
92 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
93 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
94 |
// [E0] [A0..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
95 |
// [E1..EF] [80..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
96 |
private static boolean isMalformed3(int b1, int b2, int b3) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
97 |
return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
98 |
(b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
99 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
100 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
101 |
// [F0] [90..BF] [80..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
102 |
// [F1..F3] [80..BF] [80..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
103 |
// [F4] [80..8F] [80..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
104 |
// only check 80-bf range here, the [0xf0,0x80...] and [0xf4,0x90-...] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
105 |
// will be checked by Surrogate.neededFor(uc) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
106 |
private static boolean isMalformed4(int b2, int b3, int b4) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
107 |
return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
108 |
(b4 & 0xc0) != 0x80; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
109 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
110 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
111 |
private static CoderResult lookupN(ByteBuffer src, int n) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
112 |
{ |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
113 |
for (int i = 1; i < n; i++) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
114 |
if (isNotContinuation(src.get())) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
115 |
return CoderResult.malformedForLength(i); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
116 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
117 |
return CoderResult.malformedForLength(n); |
2 | 118 |
} |
119 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
120 |
private static CoderResult malformedN(ByteBuffer src, int nb) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
121 |
switch (nb) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
122 |
case 1: |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
123 |
int b1 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
124 |
if ((b1 >> 2) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
125 |
// 5 bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
126 |
if (src.remaining() < 4) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
127 |
return CoderResult.UNDERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
128 |
return lookupN(src, 5); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
129 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
130 |
if ((b1 >> 1) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
131 |
// 6 bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
132 |
if (src.remaining() < 5) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
133 |
return CoderResult.UNDERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
134 |
return lookupN(src, 6); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
135 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
136 |
return CoderResult.malformedForLength(1); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
137 |
case 2: // always 1 |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
138 |
return CoderResult.malformedForLength(1); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
139 |
case 3: |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
140 |
b1 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
141 |
int b2 = src.get(); // no need to lookup b3 |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
142 |
return CoderResult.malformedForLength( |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
143 |
((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
144 |
isNotContinuation(b2))?1:2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
145 |
case 4: // we don't care the speed here |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
146 |
b1 = src.get() & 0xff; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
147 |
b2 = src.get() & 0xff; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
148 |
if (b1 > 0xf4 || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
149 |
(b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
150 |
(b1 == 0xf4 && (b2 & 0xf0) != 0x80) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
151 |
isNotContinuation(b2)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
152 |
return CoderResult.malformedForLength(1); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
153 |
if (isNotContinuation(src.get())) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
154 |
return CoderResult.malformedForLength(2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
155 |
return CoderResult.malformedForLength(3); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
156 |
default: |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
157 |
assert false; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
158 |
return null; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
159 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
160 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
161 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
162 |
private static CoderResult malformed(ByteBuffer src, int sp, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
163 |
CharBuffer dst, int dp, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
164 |
int nb) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
165 |
{ |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
166 |
src.position(sp - src.arrayOffset()); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
167 |
CoderResult cr = malformedN(src, nb); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
168 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
169 |
return cr; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
170 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
171 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
172 |
private static CoderResult malformed(ByteBuffer src, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
173 |
int mark, int nb) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
174 |
{ |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
175 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
176 |
CoderResult cr = malformedN(src, nb); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
177 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
178 |
return cr; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
179 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
180 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
181 |
private static CoderResult xflow(Buffer src, int sp, int sl, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
182 |
Buffer dst, int dp, int nb) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
183 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
184 |
return (nb == 0 || sl - sp < nb) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
185 |
?CoderResult.UNDERFLOW:CoderResult.OVERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
186 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
187 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
188 |
private static CoderResult xflow(Buffer src, int mark, int nb) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
189 |
CoderResult cr = (nb == 0 || src.remaining() < (nb - 1)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
190 |
?CoderResult.UNDERFLOW:CoderResult.OVERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
191 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
192 |
return cr; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
193 |
} |
2 | 194 |
|
195 |
private CoderResult decodeArrayLoop(ByteBuffer src, |
|
196 |
CharBuffer dst) |
|
197 |
{ |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
198 |
// This method is optimized for ASCII input. |
2 | 199 |
byte[] sa = src.array(); |
200 |
int sp = src.arrayOffset() + src.position(); |
|
201 |
int sl = src.arrayOffset() + src.limit(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
202 |
|
2 | 203 |
char[] da = dst.array(); |
204 |
int dp = dst.arrayOffset() + dst.position(); |
|
205 |
int dl = dst.arrayOffset() + dst.limit(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
206 |
int dlASCII = dp + Math.min(sl - sp, dl - dp); |
2 | 207 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
208 |
// ASCII only loop |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
209 |
while (dp < dlASCII && sa[sp] >= 0) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
210 |
da[dp++] = (char)sa[sp++]; |
2 | 211 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
212 |
while (sp < sl) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
213 |
int b1 = sa[sp]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
214 |
if (b1 >= 0) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
215 |
// 1 byte, 7 bits: 0xxxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
216 |
if (dp >= dl) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
217 |
return xflow(src, sp, sl, dst, dp, 1); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
218 |
da[dp++] = (char)b1; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
219 |
sp++; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
220 |
} else if ((b1 >> 5) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
221 |
// 2 bytes, 11 bits: 110xxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
222 |
if (sl - sp < 2 || dp >= dl) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
223 |
return xflow(src, sp, sl, dst, dp, 2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
224 |
int b2 = sa[sp + 1]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
225 |
if (isMalformed2(b1, b2)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
226 |
return malformed(src, sp, dst, dp, 2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
227 |
da[dp++] = (char) (((b1 << 6) ^ b2) ^ 0x0f80); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
228 |
sp += 2; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
229 |
} else if ((b1 >> 4) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
230 |
// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
231 |
if (sl - sp < 3 || dp >= dl) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
232 |
return xflow(src, sp, sl, dst, dp, 3); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
233 |
int b2 = sa[sp + 1]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
234 |
int b3 = sa[sp + 2]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
235 |
if (isMalformed3(b1, b2, b3)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
236 |
return malformed(src, sp, dst, dp, 3); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
237 |
da[dp++] = (char) (((b1 << 12) ^ (b2 << 6) ^ b3) ^ 0x1f80); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
238 |
sp += 3; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
239 |
} else if ((b1 >> 3) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
240 |
// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
241 |
if (sl - sp < 4 || dl - dp < 2) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
242 |
return xflow(src, sp, sl, dst, dp, 4); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
243 |
int b2 = sa[sp + 1]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
244 |
int b3 = sa[sp + 2]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
245 |
int b4 = sa[sp + 3]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
246 |
int uc = ((b1 & 0x07) << 18) | |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
247 |
((b2 & 0x3f) << 12) | |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
248 |
((b3 & 0x3f) << 06) | |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
249 |
(b4 & 0x3f); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
250 |
if (isMalformed4(b2, b3, b4) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
251 |
!Surrogate.neededFor(uc)) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
252 |
return malformed(src, sp, dst, dp, 4); |
2 | 253 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
254 |
da[dp++] = Surrogate.high(uc); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
255 |
da[dp++] = Surrogate.low(uc); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
256 |
sp += 4; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
257 |
} else |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
258 |
return malformed(src, sp, dst, dp, 1); |
2 | 259 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
260 |
return xflow(src, sp, sl, dst, dp, 0); |
2 | 261 |
} |
262 |
||
263 |
private CoderResult decodeBufferLoop(ByteBuffer src, |
|
264 |
CharBuffer dst) |
|
265 |
{ |
|
266 |
int mark = src.position(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
267 |
int limit = src.limit(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
268 |
while (mark < limit) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
269 |
int b1 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
270 |
if (b1 >= 0) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
271 |
// 1 byte, 7 bits: 0xxxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
272 |
if (dst.remaining() < 1) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
273 |
return xflow(src, mark, 1); //overflow |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
274 |
dst.put((char)b1); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
275 |
mark++; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
276 |
} else if ((b1 >> 5) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
277 |
// 2 bytes, 11 bits: 110xxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
278 |
if (limit - mark < 2|| dst.remaining() < 1) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
279 |
return xflow(src, mark, 2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
280 |
int b2 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
281 |
if (isMalformed2(b1, b2)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
282 |
return malformed(src, mark, 2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
283 |
dst.put((char) (((b1 << 6) ^ b2) ^ 0x0f80)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
284 |
mark += 2; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
285 |
} else if ((b1 >> 4) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
286 |
// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
287 |
if (limit - mark < 3 || dst.remaining() < 1) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
288 |
return xflow(src, mark, 3); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
289 |
int b2 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
290 |
int b3 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
291 |
if (isMalformed3(b1, b2, b3)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
292 |
return malformed(src, mark, 3); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
293 |
dst.put((char) (((b1 << 12) ^ (b2 << 6) ^ b3) ^ 0x1f80)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
294 |
mark += 3; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
295 |
} else if ((b1 >> 3) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
296 |
// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
297 |
if (limit - mark < 4 || dst.remaining() < 2) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
298 |
return xflow(src, mark, 4); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
299 |
int b2 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
300 |
int b3 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
301 |
int b4 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
302 |
int uc = ((b1 & 0x07) << 18) | |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
303 |
((b2 & 0x3f) << 12) | |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
304 |
((b3 & 0x3f) << 06) | |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
305 |
(b4 & 0x3f); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
306 |
if (isMalformed4(b2, b3, b4) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
307 |
!Surrogate.neededFor(uc)) { // shortest form check |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
308 |
return malformed(src, mark, 4); |
2 | 309 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
310 |
dst.put(Surrogate.high(uc)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
311 |
dst.put(Surrogate.low(uc)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
312 |
mark += 4; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
313 |
} else { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
314 |
return malformed(src, mark, 1); |
2 | 315 |
} |
316 |
} |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
317 |
return xflow(src, mark, 0); |
2 | 318 |
} |
319 |
||
320 |
protected CoderResult decodeLoop(ByteBuffer src, |
|
321 |
CharBuffer dst) |
|
322 |
{ |
|
323 |
if (src.hasArray() && dst.hasArray()) |
|
324 |
return decodeArrayLoop(src, dst); |
|
325 |
else |
|
326 |
return decodeBufferLoop(src, dst); |
|
327 |
} |
|
328 |
} |
|
329 |
||
330 |
private static class Encoder extends CharsetEncoder { |
|
331 |
||
332 |
// Creates an encoder for the given charset, passing 1.1f and 4.0f to
// CharsetEncoder as the average and maximum bytes-per-char values.
private Encoder(Charset cs) {
    super(cs,
          1.1f,   // averageBytesPerChar
          4.0f);  // maxBytesPerChar
}
|
335 |
||
336 |
public boolean canEncode(char c) { |
|
3714 | 337 |
return !Character.isSurrogate(c); |
2 | 338 |
} |
339 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
340 |
public boolean isLegalReplacement(byte[] repl) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
341 |
return ((repl.length == 1 && repl[0] >= 0) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
342 |
super.isLegalReplacement(repl)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
343 |
} |
2 | 344 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
345 |
private static CoderResult overflow(CharBuffer src, int sp, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
346 |
ByteBuffer dst, int dp) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
347 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
348 |
return CoderResult.OVERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
349 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
350 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
351 |
private static CoderResult overflow(CharBuffer src, int mark) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
352 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
353 |
return CoderResult.OVERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
354 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
355 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
356 |
private Surrogate.Parser sgp; |
2 | 357 |
private CoderResult encodeArrayLoop(CharBuffer src, |
358 |
ByteBuffer dst) |
|
359 |
{ |
|
360 |
char[] sa = src.array(); |
|
361 |
int sp = src.arrayOffset() + src.position(); |
|
362 |
int sl = src.arrayOffset() + src.limit(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
363 |
|
2 | 364 |
byte[] da = dst.array(); |
365 |
int dp = dst.arrayOffset() + dst.position(); |
|
366 |
int dl = dst.arrayOffset() + dst.limit(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
367 |
int dlASCII = dp + Math.min(sl - sp, dl - dp); |
2 | 368 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
369 |
//ASCII only loop |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
370 |
while (dp < dlASCII && sa[sp] < '\u0080') |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
371 |
da[dp++] = (byte) sa[sp++]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
372 |
while (sp < sl) { |
3714 | 373 |
char c = sa[sp]; |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
374 |
if (c < 0x80) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
375 |
// Have at most seven bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
376 |
if (dp >= dl) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
377 |
return overflow(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
378 |
da[dp++] = (byte)c; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
379 |
} else if (c < 0x800) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
380 |
// 2 bytes, 11 bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
381 |
if (dl - dp < 2) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
382 |
return overflow(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
383 |
da[dp++] = (byte)(0xc0 | ((c >> 06))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
384 |
da[dp++] = (byte)(0x80 | (c & 0x3f)); |
3714 | 385 |
} else if (Character.isSurrogate(c)) { |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
386 |
// Have a surrogate pair |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
387 |
if (sgp == null) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
388 |
sgp = new Surrogate.Parser(); |
3714 | 389 |
int uc = sgp.parse(c, sa, sp, sl); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
390 |
if (uc < 0) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
391 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
392 |
return sgp.error(); |
2 | 393 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
394 |
if (dl - dp < 4) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
395 |
return overflow(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
396 |
da[dp++] = (byte)(0xf0 | ((uc >> 18))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
397 |
da[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
398 |
da[dp++] = (byte)(0x80 | ((uc >> 06) & 0x3f)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
399 |
da[dp++] = (byte)(0x80 | (uc & 0x3f)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
400 |
sp++; // 2 chars |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
401 |
} else { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
402 |
// 3 bytes, 16 bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
403 |
if (dl - dp < 3) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
404 |
return overflow(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
405 |
da[dp++] = (byte)(0xe0 | ((c >> 12))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
406 |
da[dp++] = (byte)(0x80 | ((c >> 06) & 0x3f)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
407 |
da[dp++] = (byte)(0x80 | (c & 0x3f)); |
2 | 408 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
409 |
sp++; |
2 | 410 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
411 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
412 |
return CoderResult.UNDERFLOW; |
2 | 413 |
} |
414 |
||
415 |
private CoderResult encodeBufferLoop(CharBuffer src, |
|
416 |
ByteBuffer dst) |
|
417 |
{ |
|
418 |
int mark = src.position(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
419 |
while (src.hasRemaining()) { |
3714 | 420 |
char c = src.get(); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
421 |
if (c < 0x80) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
422 |
// Have at most seven bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
423 |
if (!dst.hasRemaining()) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
424 |
return overflow(src, mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
425 |
dst.put((byte)c); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
426 |
} else if (c < 0x800) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
427 |
// 2 bytes, 11 bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
428 |
if (dst.remaining() < 2) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
429 |
return overflow(src, mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
430 |
dst.put((byte)(0xc0 | ((c >> 06)))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
431 |
dst.put((byte)(0x80 | (c & 0x3f))); |
3714 | 432 |
} else if (Character.isSurrogate(c)) { |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
433 |
// Have a surrogate pair |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
434 |
if (sgp == null) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
435 |
sgp = new Surrogate.Parser(); |
3714 | 436 |
int uc = sgp.parse(c, src); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
437 |
if (uc < 0) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
438 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
439 |
return sgp.error(); |
2 | 440 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
441 |
if (dst.remaining() < 4) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
442 |
return overflow(src, mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
443 |
dst.put((byte)(0xf0 | ((uc >> 18)))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
444 |
dst.put((byte)(0x80 | ((uc >> 12) & 0x3f))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
445 |
dst.put((byte)(0x80 | ((uc >> 06) & 0x3f))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
446 |
dst.put((byte)(0x80 | (uc & 0x3f))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
447 |
mark++; //2 chars |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
448 |
} else { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
449 |
// 3 bytes, 16 bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
450 |
if (dst.remaining() < 3) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
451 |
return overflow(src, mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
452 |
dst.put((byte)(0xe0 | ((c >> 12)))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
453 |
dst.put((byte)(0x80 | ((c >> 06) & 0x3f))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
454 |
dst.put((byte)(0x80 | (c & 0x3f))); |
2 | 455 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
456 |
mark++; |
2 | 457 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
458 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
459 |
return CoderResult.UNDERFLOW; |
2 | 460 |
} |
461 |
||
462 |
protected final CoderResult encodeLoop(CharBuffer src, |
|
463 |
ByteBuffer dst) |
|
464 |
{ |
|
465 |
if (src.hasArray() && dst.hasArray()) |
|
466 |
return encodeArrayLoop(src, dst); |
|
467 |
else |
|
468 |
return encodeBufferLoop(src, dst); |
|
469 |
} |
|
470 |
} |
|
471 |
} |