author | martin |
Wed, 30 Jun 2010 16:11:32 -0700 | |
changeset 5991 | 288afdbbca28 |
parent 5986 | 04eb44085c00 |
child 5992 | 15c59951d875 |
permissions | -rw-r--r-- |
2 | 1 |
/* |
5506 | 2 |
* Copyright (c) 2000, 2008, Oracle and/or its affiliates. All rights reserved. |
2 | 3 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 |
* |
|
5 |
* This code is free software; you can redistribute it and/or modify it |
|
6 |
* under the terms of the GNU General Public License version 2 only, as |
|
5506 | 7 |
* published by the Free Software Foundation. Oracle designates this |
2 | 8 |
* particular file as subject to the "Classpath" exception as provided |
5506 | 9 |
* by Oracle in the LICENSE file that accompanied this code. |
2 | 10 |
* |
11 |
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
12 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
13 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
14 |
* version 2 for more details (a copy is included in the LICENSE file that |
|
15 |
* accompanied this code). |
|
16 |
* |
|
17 |
* You should have received a copy of the GNU General Public License version |
|
18 |
* 2 along with this work; if not, write to the Free Software Foundation, |
|
19 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
20 |
* |
|
5506 | 21 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
22 |
* or visit www.oracle.com if you need additional information or have any |
|
23 |
* questions. |
|
2 | 24 |
*/ |
25 |
||
26 |
package sun.nio.cs; |
|
27 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
28 |
import java.nio.Buffer; |
2 | 29 |
import java.nio.ByteBuffer; |
30 |
import java.nio.CharBuffer; |
|
31 |
import java.nio.charset.Charset; |
|
32 |
import java.nio.charset.CharsetDecoder; |
|
33 |
import java.nio.charset.CharsetEncoder; |
|
34 |
import java.nio.charset.CoderResult; |
|
35 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
36 |
/* Legal UTF-8 Byte Sequences |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
37 |
* |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
38 |
* # Code Points Bits Bit/Byte pattern |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
39 |
* 1 7 0xxxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
40 |
* U+0000..U+007F 00..7F |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
41 |
* |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
42 |
* 2 11 110xxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
43 |
* U+0080..U+07FF C2..DF 80..BF |
2 | 44 |
* |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
45 |
* 3 16 1110xxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
46 |
* U+0800..U+0FFF E0 A0..BF 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
47 |
* U+1000..U+FFFF E1..EF 80..BF 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
48 |
* |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
49 |
* 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
50 |
* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
51 |
* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
52 |
* U+100000..U+10FFFF F4 80..8F 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
53 |
* |
2 | 54 |
*/ |
55 |
||
56 |
class UTF_8 extends Unicode |
|
57 |
{ |
|
58 |
/**
 * Constructs the charset, passing the name {@code "UTF-8"} and the alias
 * table {@code StandardCharsets.aliases_UTF_8} to the superclass constructor.
 */
public UTF_8() {
    super("UTF-8", StandardCharsets.aliases_UTF_8);
}
|
61 |
||
62 |
public String historicalName() { |
|
63 |
return "UTF8"; |
|
64 |
} |
|
65 |
||
66 |
public CharsetDecoder newDecoder() { |
|
67 |
return new Decoder(this); |
|
68 |
} |
|
69 |
||
70 |
public CharsetEncoder newEncoder() { |
|
71 |
return new Encoder(this); |
|
72 |
} |
|
73 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
74 |
static final void updatePositions(Buffer src, int sp, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
75 |
Buffer dst, int dp) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
76 |
src.position(sp - src.arrayOffset()); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
77 |
dst.position(dp - dst.arrayOffset()); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
78 |
} |
2 | 79 |
|
80 |
private static class Decoder extends CharsetDecoder { |
|
81 |
/**
 * Creates a decoder for the given charset, passing 1.0 for both the
 * average and the maximum chars-per-byte arguments of CharsetDecoder.
 *
 * @param cs the charset this decoder belongs to
 */
private Decoder(Charset cs) {
    super(cs, 1.0f, 1.0f);
}
|
84 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
85 |
private static boolean isNotContinuation(int b) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
86 |
return (b & 0xc0) != 0x80; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
87 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
88 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
89 |
// [C2..DF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
90 |
private static boolean isMalformed2(int b1, int b2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
91 |
return (b1 & 0x1e) == 0x0 || (b2 & 0xc0) != 0x80; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
92 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
93 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
94 |
// [E0] [A0..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
95 |
// [E1..EF] [80..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
96 |
private static boolean isMalformed3(int b1, int b2, int b3) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
97 |
return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
98 |
(b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
99 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
100 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
101 |
// [F0] [90..BF] [80..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
102 |
// [F1..F3] [80..BF] [80..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
103 |
// [F4] [80..8F] [80..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
104 |
// only check 80-bf range here, the [0xf0,0x80...] and [0xf4,0x90-...] |
5986
04eb44085c00
6934265: Add public method Character.isBmpCodePoint
martin
parents:
5506
diff
changeset
|
105 |
// will be checked by Character.isSupplementaryCodePoint(uc) |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
106 |
private static boolean isMalformed4(int b2, int b3, int b4) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
107 |
return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
108 |
(b4 & 0xc0) != 0x80; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
109 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
110 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
111 |
private static CoderResult lookupN(ByteBuffer src, int n) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
112 |
{ |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
113 |
for (int i = 1; i < n; i++) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
114 |
if (isNotContinuation(src.get())) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
115 |
return CoderResult.malformedForLength(i); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
116 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
117 |
return CoderResult.malformedForLength(n); |
2 | 118 |
} |
119 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
120 |
private static CoderResult malformedN(ByteBuffer src, int nb) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
121 |
switch (nb) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
122 |
case 1: |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
123 |
int b1 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
124 |
if ((b1 >> 2) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
125 |
// 5 bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
126 |
if (src.remaining() < 4) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
127 |
return CoderResult.UNDERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
128 |
return lookupN(src, 5); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
129 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
130 |
if ((b1 >> 1) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
131 |
// 6 bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
132 |
if (src.remaining() < 5) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
133 |
return CoderResult.UNDERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
134 |
return lookupN(src, 6); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
135 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
136 |
return CoderResult.malformedForLength(1); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
137 |
case 2: // always 1 |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
138 |
return CoderResult.malformedForLength(1); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
139 |
case 3: |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
140 |
b1 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
141 |
int b2 = src.get(); // no need to lookup b3 |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
142 |
return CoderResult.malformedForLength( |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
143 |
((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
144 |
isNotContinuation(b2))?1:2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
145 |
case 4: // we don't care the speed here |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
146 |
b1 = src.get() & 0xff; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
147 |
b2 = src.get() & 0xff; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
148 |
if (b1 > 0xf4 || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
149 |
(b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
150 |
(b1 == 0xf4 && (b2 & 0xf0) != 0x80) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
151 |
isNotContinuation(b2)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
152 |
return CoderResult.malformedForLength(1); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
153 |
if (isNotContinuation(src.get())) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
154 |
return CoderResult.malformedForLength(2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
155 |
return CoderResult.malformedForLength(3); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
156 |
default: |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
157 |
assert false; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
158 |
return null; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
159 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
160 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
161 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
162 |
private static CoderResult malformed(ByteBuffer src, int sp, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
163 |
CharBuffer dst, int dp, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
164 |
int nb) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
165 |
{ |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
166 |
src.position(sp - src.arrayOffset()); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
167 |
CoderResult cr = malformedN(src, nb); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
168 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
169 |
return cr; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
170 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
171 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
172 |
private static CoderResult malformed(ByteBuffer src, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
173 |
int mark, int nb) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
174 |
{ |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
175 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
176 |
CoderResult cr = malformedN(src, nb); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
177 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
178 |
return cr; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
179 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
180 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
181 |
private static CoderResult xflow(Buffer src, int sp, int sl, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
182 |
Buffer dst, int dp, int nb) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
183 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
184 |
return (nb == 0 || sl - sp < nb) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
185 |
?CoderResult.UNDERFLOW:CoderResult.OVERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
186 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
187 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
188 |
private static CoderResult xflow(Buffer src, int mark, int nb) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
189 |
CoderResult cr = (nb == 0 || src.remaining() < (nb - 1)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
190 |
?CoderResult.UNDERFLOW:CoderResult.OVERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
191 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
192 |
return cr; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
193 |
} |
2 | 194 |
|
195 |
private CoderResult decodeArrayLoop(ByteBuffer src, |
|
196 |
CharBuffer dst) |
|
197 |
{ |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
198 |
// This method is optimized for ASCII input. |
2 | 199 |
byte[] sa = src.array(); |
200 |
int sp = src.arrayOffset() + src.position(); |
|
201 |
int sl = src.arrayOffset() + src.limit(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
202 |
|
2 | 203 |
char[] da = dst.array(); |
204 |
int dp = dst.arrayOffset() + dst.position(); |
|
205 |
int dl = dst.arrayOffset() + dst.limit(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
206 |
int dlASCII = dp + Math.min(sl - sp, dl - dp); |
2 | 207 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
208 |
// ASCII only loop |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
209 |
while (dp < dlASCII && sa[sp] >= 0) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
210 |
da[dp++] = (char)sa[sp++]; |
2 | 211 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
212 |
while (sp < sl) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
213 |
int b1 = sa[sp]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
214 |
if (b1 >= 0) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
215 |
// 1 byte, 7 bits: 0xxxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
216 |
if (dp >= dl) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
217 |
return xflow(src, sp, sl, dst, dp, 1); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
218 |
da[dp++] = (char)b1; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
219 |
sp++; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
220 |
} else if ((b1 >> 5) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
221 |
// 2 bytes, 11 bits: 110xxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
222 |
if (sl - sp < 2 || dp >= dl) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
223 |
return xflow(src, sp, sl, dst, dp, 2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
224 |
int b2 = sa[sp + 1]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
225 |
if (isMalformed2(b1, b2)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
226 |
return malformed(src, sp, dst, dp, 2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
227 |
da[dp++] = (char) (((b1 << 6) ^ b2) ^ 0x0f80); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
228 |
sp += 2; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
229 |
} else if ((b1 >> 4) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
230 |
// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
231 |
if (sl - sp < 3 || dp >= dl) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
232 |
return xflow(src, sp, sl, dst, dp, 3); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
233 |
int b2 = sa[sp + 1]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
234 |
int b3 = sa[sp + 2]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
235 |
if (isMalformed3(b1, b2, b3)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
236 |
return malformed(src, sp, dst, dp, 3); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
237 |
da[dp++] = (char) (((b1 << 12) ^ (b2 << 6) ^ b3) ^ 0x1f80); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
238 |
sp += 3; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
239 |
} else if ((b1 >> 3) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
240 |
// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
241 |
if (sl - sp < 4 || dl - dp < 2) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
242 |
return xflow(src, sp, sl, dst, dp, 4); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
243 |
int b2 = sa[sp + 1]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
244 |
int b3 = sa[sp + 2]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
245 |
int b4 = sa[sp + 3]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
246 |
int uc = ((b1 & 0x07) << 18) | |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
247 |
((b2 & 0x3f) << 12) | |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
248 |
((b3 & 0x3f) << 06) | |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
249 |
(b4 & 0x3f); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
250 |
if (isMalformed4(b2, b3, b4) || |
5986
04eb44085c00
6934265: Add public method Character.isBmpCodePoint
martin
parents:
5506
diff
changeset
|
251 |
// shortest form check |
04eb44085c00
6934265: Add public method Character.isBmpCodePoint
martin
parents:
5506
diff
changeset
|
252 |
!Character.isSupplementaryCodePoint(uc)) { |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
253 |
return malformed(src, sp, dst, dp, 4); |
2 | 254 |
} |
5991
288afdbbca28
6933322: Add methods highSurrogate(), lowSurrogate() to class Character
martin
parents:
5986
diff
changeset
|
255 |
da[dp++] = Character.highSurrogate(uc); |
288afdbbca28
6933322: Add methods highSurrogate(), lowSurrogate() to class Character
martin
parents:
5986
diff
changeset
|
256 |
da[dp++] = Character.lowSurrogate(uc); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
257 |
sp += 4; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
258 |
} else |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
259 |
return malformed(src, sp, dst, dp, 1); |
2 | 260 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
261 |
return xflow(src, sp, sl, dst, dp, 0); |
2 | 262 |
} |
263 |
||
264 |
private CoderResult decodeBufferLoop(ByteBuffer src, |
|
265 |
CharBuffer dst) |
|
266 |
{ |
|
267 |
int mark = src.position(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
268 |
int limit = src.limit(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
269 |
while (mark < limit) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
270 |
int b1 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
271 |
if (b1 >= 0) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
272 |
// 1 byte, 7 bits: 0xxxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
273 |
if (dst.remaining() < 1) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
274 |
return xflow(src, mark, 1); //overflow |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
275 |
dst.put((char)b1); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
276 |
mark++; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
277 |
} else if ((b1 >> 5) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
278 |
// 2 bytes, 11 bits: 110xxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
279 |
if (limit - mark < 2|| dst.remaining() < 1) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
280 |
return xflow(src, mark, 2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
281 |
int b2 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
282 |
if (isMalformed2(b1, b2)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
283 |
return malformed(src, mark, 2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
284 |
dst.put((char) (((b1 << 6) ^ b2) ^ 0x0f80)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
285 |
mark += 2; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
286 |
} else if ((b1 >> 4) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
287 |
// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
288 |
if (limit - mark < 3 || dst.remaining() < 1) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
289 |
return xflow(src, mark, 3); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
290 |
int b2 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
291 |
int b3 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
292 |
if (isMalformed3(b1, b2, b3)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
293 |
return malformed(src, mark, 3); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
294 |
dst.put((char) (((b1 << 12) ^ (b2 << 6) ^ b3) ^ 0x1f80)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
295 |
mark += 3; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
296 |
} else if ((b1 >> 3) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
297 |
// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
298 |
if (limit - mark < 4 || dst.remaining() < 2) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
299 |
return xflow(src, mark, 4); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
300 |
int b2 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
301 |
int b3 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
302 |
int b4 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
303 |
int uc = ((b1 & 0x07) << 18) | |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
304 |
((b2 & 0x3f) << 12) | |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
305 |
((b3 & 0x3f) << 06) | |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
306 |
(b4 & 0x3f); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
307 |
if (isMalformed4(b2, b3, b4) || |
5986
04eb44085c00
6934265: Add public method Character.isBmpCodePoint
martin
parents:
5506
diff
changeset
|
308 |
// shortest form check |
04eb44085c00
6934265: Add public method Character.isBmpCodePoint
martin
parents:
5506
diff
changeset
|
309 |
!Character.isSupplementaryCodePoint(uc)) { |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
310 |
return malformed(src, mark, 4); |
2 | 311 |
} |
5991
288afdbbca28
6933322: Add methods highSurrogate(), lowSurrogate() to class Character
martin
parents:
5986
diff
changeset
|
312 |
dst.put(Character.highSurrogate(uc)); |
288afdbbca28
6933322: Add methods highSurrogate(), lowSurrogate() to class Character
martin
parents:
5986
diff
changeset
|
313 |
dst.put(Character.lowSurrogate(uc)); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
314 |
mark += 4; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
315 |
} else { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
316 |
return malformed(src, mark, 1); |
2 | 317 |
} |
318 |
} |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
319 |
return xflow(src, mark, 0); |
2 | 320 |
} |
321 |
||
322 |
protected CoderResult decodeLoop(ByteBuffer src, |
|
323 |
CharBuffer dst) |
|
324 |
{ |
|
325 |
if (src.hasArray() && dst.hasArray()) |
|
326 |
return decodeArrayLoop(src, dst); |
|
327 |
else |
|
328 |
return decodeBufferLoop(src, dst); |
|
329 |
} |
|
330 |
} |
|
331 |
||
332 |
private static class Encoder extends CharsetEncoder { |
|
333 |
||
334 |
private Encoder(Charset cs) { |
|
335 |
super(cs, 1.1f, 4.0f); |
|
336 |
} |
|
337 |
||
338 |
public boolean canEncode(char c) { |
|
3714 | 339 |
return !Character.isSurrogate(c); |
2 | 340 |
} |
341 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
342 |
public boolean isLegalReplacement(byte[] repl) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
343 |
return ((repl.length == 1 && repl[0] >= 0) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
344 |
super.isLegalReplacement(repl)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
345 |
} |
2 | 346 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
347 |
private static CoderResult overflow(CharBuffer src, int sp, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
348 |
ByteBuffer dst, int dp) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
349 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
350 |
return CoderResult.OVERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
351 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
352 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
353 |
private static CoderResult overflow(CharBuffer src, int mark) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
354 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
355 |
return CoderResult.OVERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
356 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
357 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
358 |
private Surrogate.Parser sgp; |
2 | 359 |
private CoderResult encodeArrayLoop(CharBuffer src, |
360 |
ByteBuffer dst) |
|
361 |
{ |
|
362 |
char[] sa = src.array(); |
|
363 |
int sp = src.arrayOffset() + src.position(); |
|
364 |
int sl = src.arrayOffset() + src.limit(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
365 |
|
2 | 366 |
byte[] da = dst.array(); |
367 |
int dp = dst.arrayOffset() + dst.position(); |
|
368 |
int dl = dst.arrayOffset() + dst.limit(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
369 |
int dlASCII = dp + Math.min(sl - sp, dl - dp); |
2 | 370 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
371 |
//ASCII only loop |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
372 |
while (dp < dlASCII && sa[sp] < '\u0080') |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
373 |
da[dp++] = (byte) sa[sp++]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
374 |
while (sp < sl) { |
3714 | 375 |
char c = sa[sp]; |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
376 |
if (c < 0x80) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
377 |
// Have at most seven bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
378 |
if (dp >= dl) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
379 |
return overflow(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
380 |
da[dp++] = (byte)c; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
381 |
} else if (c < 0x800) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
382 |
// 2 bytes, 11 bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
383 |
if (dl - dp < 2) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
384 |
return overflow(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
385 |
da[dp++] = (byte)(0xc0 | ((c >> 06))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
386 |
da[dp++] = (byte)(0x80 | (c & 0x3f)); |
3714 | 387 |
} else if (Character.isSurrogate(c)) { |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
388 |
// Have a surrogate pair |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
389 |
if (sgp == null) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
390 |
sgp = new Surrogate.Parser(); |
3714 | 391 |
int uc = sgp.parse(c, sa, sp, sl); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
392 |
if (uc < 0) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
393 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
394 |
return sgp.error(); |
2 | 395 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
396 |
if (dl - dp < 4) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
397 |
return overflow(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
398 |
da[dp++] = (byte)(0xf0 | ((uc >> 18))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
399 |
da[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
400 |
da[dp++] = (byte)(0x80 | ((uc >> 06) & 0x3f)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
401 |
da[dp++] = (byte)(0x80 | (uc & 0x3f)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
402 |
sp++; // 2 chars |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
403 |
} else { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
404 |
// 3 bytes, 16 bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
405 |
if (dl - dp < 3) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
406 |
return overflow(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
407 |
da[dp++] = (byte)(0xe0 | ((c >> 12))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
408 |
da[dp++] = (byte)(0x80 | ((c >> 06) & 0x3f)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
409 |
da[dp++] = (byte)(0x80 | (c & 0x3f)); |
2 | 410 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
411 |
sp++; |
2 | 412 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
413 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
414 |
return CoderResult.UNDERFLOW; |
2 | 415 |
} |
416 |
||
417 |
private CoderResult encodeBufferLoop(CharBuffer src, |
|
418 |
ByteBuffer dst) |
|
419 |
{ |
|
420 |
int mark = src.position(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
421 |
while (src.hasRemaining()) { |
3714 | 422 |
char c = src.get(); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
423 |
if (c < 0x80) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
424 |
// Have at most seven bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
425 |
if (!dst.hasRemaining()) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
426 |
return overflow(src, mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
427 |
dst.put((byte)c); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
428 |
} else if (c < 0x800) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
429 |
// 2 bytes, 11 bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
430 |
if (dst.remaining() < 2) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
431 |
return overflow(src, mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
432 |
dst.put((byte)(0xc0 | ((c >> 06)))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
433 |
dst.put((byte)(0x80 | (c & 0x3f))); |
3714 | 434 |
} else if (Character.isSurrogate(c)) { |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
435 |
// Have a surrogate pair |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
436 |
if (sgp == null) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
437 |
sgp = new Surrogate.Parser(); |
3714 | 438 |
int uc = sgp.parse(c, src); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
439 |
if (uc < 0) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
440 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
441 |
return sgp.error(); |
2 | 442 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
443 |
if (dst.remaining() < 4) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
444 |
return overflow(src, mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
445 |
dst.put((byte)(0xf0 | ((uc >> 18)))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
446 |
dst.put((byte)(0x80 | ((uc >> 12) & 0x3f))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
447 |
dst.put((byte)(0x80 | ((uc >> 06) & 0x3f))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
448 |
dst.put((byte)(0x80 | (uc & 0x3f))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
449 |
mark++; //2 chars |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
450 |
} else { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
451 |
// 3 bytes, 16 bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
452 |
if (dst.remaining() < 3) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
453 |
return overflow(src, mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
454 |
dst.put((byte)(0xe0 | ((c >> 12)))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
455 |
dst.put((byte)(0x80 | ((c >> 06) & 0x3f))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
456 |
dst.put((byte)(0x80 | (c & 0x3f))); |
2 | 457 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
458 |
mark++; |
2 | 459 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
460 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
461 |
return CoderResult.UNDERFLOW; |
2 | 462 |
} |
463 |
||
464 |
protected final CoderResult encodeLoop(CharBuffer src, |
|
465 |
ByteBuffer dst) |
|
466 |
{ |
|
467 |
if (src.hasArray() && dst.hasArray()) |
|
468 |
return encodeArrayLoop(src, dst); |
|
469 |
else |
|
470 |
return encodeBufferLoop(src, dst); |
|
471 |
} |
|
472 |
} |
|
473 |
} |