author | sherman |
Wed, 13 Dec 2017 07:51:57 -0800 | |
changeset 48262 | daf3b49f4839 |
parent 47216 | 71c04702a3d5 |
child 49443 | e5679a6661d6 |
permissions | -rw-r--r-- |
2 | 1 |
/* |
45894
995421c69f66
8184665: Skip name and alias checks for standard Charsets
redestad
parents:
33663
diff
changeset
|
2 |
* Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved. |
2 | 3 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 |
* |
|
5 |
* This code is free software; you can redistribute it and/or modify it |
|
6 |
* under the terms of the GNU General Public License version 2 only, as |
|
5506 | 7 |
* published by the Free Software Foundation. Oracle designates this |
2 | 8 |
* particular file as subject to the "Classpath" exception as provided |
5506 | 9 |
* by Oracle in the LICENSE file that accompanied this code. |
2 | 10 |
* |
11 |
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
12 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
13 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
14 |
* version 2 for more details (a copy is included in the LICENSE file that |
|
15 |
* accompanied this code). |
|
16 |
* |
|
17 |
* You should have received a copy of the GNU General Public License version |
|
18 |
* 2 along with this work; if not, write to the Free Software Foundation, |
|
19 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
20 |
* |
|
5506 | 21 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
22 |
* or visit www.oracle.com if you need additional information or have any |
|
23 |
* questions. |
|
2 | 24 |
*/ |
25 |
||
26 |
package sun.nio.cs; |
|
27 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
28 |
import java.nio.Buffer; |
2 | 29 |
import java.nio.ByteBuffer; |
30 |
import java.nio.CharBuffer; |
|
31 |
import java.nio.charset.Charset; |
|
32 |
import java.nio.charset.CharsetDecoder; |
|
33 |
import java.nio.charset.CharsetEncoder; |
|
34 |
import java.nio.charset.CoderResult; |
|
9547
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
7668
diff
changeset
|
35 |
import java.nio.charset.CodingErrorAction; |
2 | 36 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
37 |
/* Legal UTF-8 Byte Sequences |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
38 |
* |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
39 |
* # Code Points Bits Bit/Byte pattern |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
40 |
* 1 7 0xxxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
41 |
* U+0000..U+007F 00..7F |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
42 |
* |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
43 |
* 2 11 110xxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
44 |
* U+0080..U+07FF C2..DF 80..BF |
2 | 45 |
* |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
46 |
* 3 16 1110xxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
47 |
* U+0800..U+0FFF E0 A0..BF 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
48 |
* U+1000..U+FFFF E1..EF 80..BF 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
49 |
* |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
50 |
* 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
51 |
* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
52 |
* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
53 |
* U+100000..U+10FFFF F4 80..8F 80..BF 80..BF |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
54 |
* |
2 | 55 |
*/ |
56 |
||
47026
94c45ad89b9c
8186517: sun.nio.cs.StandardCharsets$Aliases and Classes can be lazily loaded
redestad
parents:
45894
diff
changeset
|
57 |
public final class UTF_8 extends Unicode { |
94c45ad89b9c
8186517: sun.nio.cs.StandardCharsets$Aliases and Classes can be lazily loaded
redestad
parents:
45894
diff
changeset
|
58 |
|
94c45ad89b9c
8186517: sun.nio.cs.StandardCharsets$Aliases and Classes can be lazily loaded
redestad
parents:
45894
diff
changeset
|
59 |
public static final UTF_8 INSTANCE = new UTF_8(); |
94c45ad89b9c
8186517: sun.nio.cs.StandardCharsets$Aliases and Classes can be lazily loaded
redestad
parents:
45894
diff
changeset
|
60 |
|
2 | 61 |
/**
 * Creates the charset under the canonical name "UTF-8", passing the alias
 * list obtained from {@code StandardCharsets.aliases_UTF_8()} to the
 * superclass constructor.
 */
public UTF_8() {
    super("UTF-8", StandardCharsets.aliases_UTF_8());
}
64 |
||
65 |
public String historicalName() { |
|
66 |
return "UTF8"; |
|
67 |
} |
|
68 |
||
69 |
public CharsetDecoder newDecoder() { |
|
70 |
return new Decoder(this); |
|
71 |
} |
|
72 |
||
73 |
public CharsetEncoder newEncoder() { |
|
74 |
return new Encoder(this); |
|
75 |
} |
|
76 |
||
47026
94c45ad89b9c
8186517: sun.nio.cs.StandardCharsets$Aliases and Classes can be lazily loaded
redestad
parents:
45894
diff
changeset
|
77 |
static final void updatePositions(Buffer src, int sp, |
10898 | 78 |
Buffer dst, int dp) { |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
79 |
src.position(sp - src.arrayOffset()); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
80 |
dst.position(dp - dst.arrayOffset()); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
81 |
} |
2 | 82 |
|
48262 | 83 |
private static class Decoder extends CharsetDecoder { |
84 |
||
2 | 85 |
/**
 * Creates a decoder for the given charset, passing 1.0f for both float
 * arguments of the {@code CharsetDecoder} constructor.
 *
 * @param cs the charset this decoder belongs to
 */
private Decoder(Charset cs) {
    super(cs, 1.0f, 1.0f);
}
|
88 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
89 |
private static boolean isNotContinuation(int b) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
90 |
return (b & 0xc0) != 0x80; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
91 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
92 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
93 |
// [E0] [A0..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
94 |
// [E1..EF] [80..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
95 |
private static boolean isMalformed3(int b1, int b2, int b3) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
96 |
return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
97 |
(b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
98 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
99 |
|
10898 | 100 |
// only used when there is only one byte left in src buffer |
101 |
private static boolean isMalformed3_2(int b1, int b2) { |
|
102 |
return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || |
|
103 |
(b2 & 0xc0) != 0x80; |
|
104 |
} |
|
105 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
106 |
// [F0] [90..BF] [80..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
107 |
// [F1..F3] [80..BF] [80..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
108 |
// [F4] [80..8F] [80..BF] [80..BF] |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
109 |
// only check the 80-bf range here, the [0xf0,0x80...] and [0xf4,0x90-...] |
5986
04eb44085c00
6934265: Add public method Character.isBmpCodePoint
martin
parents:
5506
diff
changeset
|
110 |
// will be checked by Character.isSupplementaryCodePoint(uc) |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
111 |
private static boolean isMalformed4(int b2, int b3, int b4) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
112 |
return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
113 |
(b4 & 0xc0) != 0x80; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
114 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
115 |
|
23880
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
23010
diff
changeset
|
116 |
// only used when there are fewer than 4 bytes left in the src buffer. |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
23010
diff
changeset
|
117 |
// both b1 and b2 should be "& 0xff" before passed in. |
10898 | 118 |
private static boolean isMalformed4_2(int b1, int b2) { |
23880
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
23010
diff
changeset
|
119 |
return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
23010
diff
changeset
|
120 |
(b1 == 0xf4 && (b2 & 0xf0) != 0x80) || |
10898 | 121 |
(b2 & 0xc0) != 0x80; |
122 |
} |
|
123 |
||
23880
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
23010
diff
changeset
|
124 |
// tests if b3 is malformed as the third byte of a |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
23010
diff
changeset
|
125 |
// legal 4-byte utf-8 byte sequence. |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
23010
diff
changeset
|
126 |
// only used when there is less than 4 bytes left in src buffer, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
23010
diff
changeset
|
127 |
// after isMalformed4_2 has been invoked. |
10898 | 128 |
private static boolean isMalformed4_3(int b3) { |
129 |
return (b3 & 0xc0) != 0x80; |
|
130 |
} |
|
131 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
132 |
private static CoderResult lookupN(ByteBuffer src, int n) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
133 |
{ |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
134 |
for (int i = 1; i < n; i++) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
135 |
if (isNotContinuation(src.get())) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
136 |
return CoderResult.malformedForLength(i); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
137 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
138 |
return CoderResult.malformedForLength(n); |
2 | 139 |
} |
140 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
141 |
private static CoderResult malformedN(ByteBuffer src, int nb) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
142 |
switch (nb) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
143 |
case 1: |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
144 |
case 2: // always 1 |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
145 |
return CoderResult.malformedForLength(1); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
146 |
case 3: |
10898 | 147 |
int b1 = src.get(); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
148 |
int b2 = src.get(); // no need to lookup b3 |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
149 |
return CoderResult.malformedForLength( |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
150 |
((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || |
10898 | 151 |
isNotContinuation(b2)) ? 1 : 2); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
152 |
case 4: // we don't care the speed here |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
153 |
b1 = src.get() & 0xff; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
154 |
b2 = src.get() & 0xff; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
155 |
if (b1 > 0xf4 || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
156 |
(b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
157 |
(b1 == 0xf4 && (b2 & 0xf0) != 0x80) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
158 |
isNotContinuation(b2)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
159 |
return CoderResult.malformedForLength(1); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
160 |
if (isNotContinuation(src.get())) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
161 |
return CoderResult.malformedForLength(2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
162 |
return CoderResult.malformedForLength(3); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
163 |
default: |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
164 |
assert false; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
165 |
return null; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
166 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
167 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
168 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
169 |
private static CoderResult malformed(ByteBuffer src, int sp, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
170 |
CharBuffer dst, int dp, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
171 |
int nb) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
172 |
{ |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
173 |
src.position(sp - src.arrayOffset()); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
174 |
CoderResult cr = malformedN(src, nb); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
175 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
176 |
return cr; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
177 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
178 |
|
10898 | 179 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
180 |
private static CoderResult malformed(ByteBuffer src, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
181 |
int mark, int nb) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
182 |
{ |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
183 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
184 |
CoderResult cr = malformedN(src, nb); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
185 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
186 |
return cr; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
187 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
188 |
|
10898 | 189 |
private static CoderResult malformedForLength(ByteBuffer src, |
190 |
int sp, |
|
191 |
CharBuffer dst, |
|
192 |
int dp, |
|
193 |
int malformedNB) |
|
194 |
{ |
|
195 |
updatePositions(src, sp, dst, dp); |
|
196 |
return CoderResult.malformedForLength(malformedNB); |
|
197 |
} |
|
198 |
||
199 |
private static CoderResult malformedForLength(ByteBuffer src, |
|
200 |
int mark, |
|
201 |
int malformedNB) |
|
202 |
{ |
|
203 |
src.position(mark); |
|
204 |
return CoderResult.malformedForLength(malformedNB); |
|
205 |
} |
|
206 |
||
207 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
208 |
private static CoderResult xflow(Buffer src, int sp, int sl, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
209 |
Buffer dst, int dp, int nb) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
210 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
211 |
return (nb == 0 || sl - sp < nb) |
10898 | 212 |
? CoderResult.UNDERFLOW : CoderResult.OVERFLOW; |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
213 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
214 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
215 |
private static CoderResult xflow(Buffer src, int mark, int nb) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
216 |
src.position(mark); |
10898 | 217 |
return (nb == 0 || src.remaining() < nb) |
218 |
? CoderResult.UNDERFLOW : CoderResult.OVERFLOW; |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
219 |
} |
2 | 220 |
|
221 |
private CoderResult decodeArrayLoop(ByteBuffer src, |
|
222 |
CharBuffer dst) |
|
223 |
{ |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
224 |
// This method is optimized for ASCII input. |
2 | 225 |
byte[] sa = src.array(); |
226 |
int sp = src.arrayOffset() + src.position(); |
|
227 |
int sl = src.arrayOffset() + src.limit(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
228 |
|
2 | 229 |
char[] da = dst.array(); |
230 |
int dp = dst.arrayOffset() + dst.position(); |
|
231 |
int dl = dst.arrayOffset() + dst.limit(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
232 |
int dlASCII = dp + Math.min(sl - sp, dl - dp); |
2 | 233 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
234 |
// ASCII only loop |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
235 |
while (dp < dlASCII && sa[sp] >= 0) |
5992 | 236 |
da[dp++] = (char) sa[sp++]; |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
237 |
while (sp < sl) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
238 |
int b1 = sa[sp]; |
5992 | 239 |
if (b1 >= 0) { |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
240 |
// 1 byte, 7 bits: 0xxxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
241 |
if (dp >= dl) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
242 |
return xflow(src, sp, sl, dst, dp, 1); |
5992 | 243 |
da[dp++] = (char) b1; |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
244 |
sp++; |
10898 | 245 |
} else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
246 |
// 2 bytes, 11 bits: 110xxxxx 10xxxxxx |
10898 | 247 |
// [C2..DF] [80..BF] |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
248 |
if (sl - sp < 2 || dp >= dl) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
249 |
return xflow(src, sp, sl, dst, dp, 2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
250 |
int b2 = sa[sp + 1]; |
10898 | 251 |
// Now we check the first byte of 2-byte sequence as |
252 |
// if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) |
|
253 |
// no longer need to check b1 against c1 & c0 for |
|
254 |
// malformed as we did in previous version |
|
255 |
// (b1 & 0x1e) == 0x0 || (b2 & 0xc0) != 0x80; |
|
256 |
// only need to check the second byte b2. |
|
257 |
if (isNotContinuation(b2)) |
|
258 |
return malformedForLength(src, sp, dst, dp, 1); |
|
5992 | 259 |
da[dp++] = (char) (((b1 << 6) ^ b2) |
260 |
^ |
|
261 |
(((byte) 0xC0 << 6) ^ |
|
262 |
((byte) 0x80 << 0))); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
263 |
sp += 2; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
264 |
} else if ((b1 >> 4) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
265 |
// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx |
10898 | 266 |
int srcRemaining = sl - sp; |
267 |
if (srcRemaining < 3 || dp >= dl) { |
|
268 |
if (srcRemaining > 1 && isMalformed3_2(b1, sa[sp + 1])) |
|
269 |
return malformedForLength(src, sp, dst, dp, 1); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
270 |
return xflow(src, sp, sl, dst, dp, 3); |
10898 | 271 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
272 |
int b2 = sa[sp + 1]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
273 |
int b3 = sa[sp + 2]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
274 |
if (isMalformed3(b1, b2, b3)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
275 |
return malformed(src, sp, dst, dp, 3); |
10898 | 276 |
char c = (char) |
5992 | 277 |
((b1 << 12) ^ |
278 |
(b2 << 6) ^ |
|
279 |
(b3 ^ |
|
280 |
(((byte) 0xE0 << 12) ^ |
|
281 |
((byte) 0x80 << 6) ^ |
|
282 |
((byte) 0x80 << 0)))); |
|
10898 | 283 |
if (Character.isSurrogate(c)) |
284 |
return malformedForLength(src, sp, dst, dp, 3); |
|
285 |
da[dp++] = c; |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
286 |
sp += 3; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
287 |
} else if ((b1 >> 3) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
288 |
// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
10898 | 289 |
int srcRemaining = sl - sp; |
290 |
if (srcRemaining < 4 || dl - dp < 2) { |
|
23880
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
23010
diff
changeset
|
291 |
b1 &= 0xff; |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
23010
diff
changeset
|
292 |
if (b1 > 0xf4 || |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
23010
diff
changeset
|
293 |
srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1] & 0xff)) |
10898 | 294 |
return malformedForLength(src, sp, dst, dp, 1); |
295 |
if (srcRemaining > 2 && isMalformed4_3(sa[sp + 2])) |
|
296 |
return malformedForLength(src, sp, dst, dp, 2); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
297 |
return xflow(src, sp, sl, dst, dp, 4); |
10898 | 298 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
299 |
int b2 = sa[sp + 1]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
300 |
int b3 = sa[sp + 2]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
301 |
int b4 = sa[sp + 3]; |
5992 | 302 |
int uc = ((b1 << 18) ^ |
303 |
(b2 << 12) ^ |
|
304 |
(b3 << 6) ^ |
|
305 |
(b4 ^ |
|
306 |
(((byte) 0xF0 << 18) ^ |
|
307 |
((byte) 0x80 << 12) ^ |
|
308 |
((byte) 0x80 << 6) ^ |
|
309 |
((byte) 0x80 << 0)))); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
310 |
if (isMalformed4(b2, b3, b4) || |
5986
04eb44085c00
6934265: Add public method Character.isBmpCodePoint
martin
parents:
5506
diff
changeset
|
311 |
// shortest form check |
04eb44085c00
6934265: Add public method Character.isBmpCodePoint
martin
parents:
5506
diff
changeset
|
312 |
!Character.isSupplementaryCodePoint(uc)) { |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
313 |
return malformed(src, sp, dst, dp, 4); |
2 | 314 |
} |
5991
288afdbbca28
6933322: Add methods highSurrogate(), lowSurrogate() to class Character
martin
parents:
5986
diff
changeset
|
315 |
da[dp++] = Character.highSurrogate(uc); |
288afdbbca28
6933322: Add methods highSurrogate(), lowSurrogate() to class Character
martin
parents:
5986
diff
changeset
|
316 |
da[dp++] = Character.lowSurrogate(uc); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
317 |
sp += 4; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
318 |
} else |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
319 |
return malformed(src, sp, dst, dp, 1); |
2 | 320 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
321 |
return xflow(src, sp, sl, dst, dp, 0); |
2 | 322 |
} |
323 |
||
324 |
private CoderResult decodeBufferLoop(ByteBuffer src, |
|
325 |
CharBuffer dst) |
|
326 |
{ |
|
327 |
int mark = src.position(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
328 |
int limit = src.limit(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
329 |
while (mark < limit) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
330 |
int b1 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
331 |
if (b1 >= 0) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
332 |
// 1 byte, 7 bits: 0xxxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
333 |
if (dst.remaining() < 1) |
5992 | 334 |
return xflow(src, mark, 1); // overflow |
335 |
dst.put((char) b1); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
336 |
mark++; |
10898 | 337 |
} else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
338 |
// 2 bytes, 11 bits: 110xxxxx 10xxxxxx |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
339 |
if (limit - mark < 2|| dst.remaining() < 1) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
340 |
return xflow(src, mark, 2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
341 |
int b2 = src.get(); |
10898 | 342 |
if (isNotContinuation(b2)) |
343 |
return malformedForLength(src, mark, 1); |
|
344 |
dst.put((char) (((b1 << 6) ^ b2) |
|
5992 | 345 |
^ |
346 |
(((byte) 0xC0 << 6) ^ |
|
347 |
((byte) 0x80 << 0)))); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
348 |
mark += 2; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
349 |
} else if ((b1 >> 4) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
350 |
// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx |
10898 | 351 |
int srcRemaining = limit - mark; |
352 |
if (srcRemaining < 3 || dst.remaining() < 1) { |
|
353 |
if (srcRemaining > 1 && isMalformed3_2(b1, src.get())) |
|
354 |
return malformedForLength(src, mark, 1); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
355 |
return xflow(src, mark, 3); |
10898 | 356 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
357 |
int b2 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
358 |
int b3 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
359 |
if (isMalformed3(b1, b2, b3)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
360 |
return malformed(src, mark, 3); |
10898 | 361 |
char c = (char) |
362 |
((b1 << 12) ^ |
|
363 |
(b2 << 6) ^ |
|
364 |
(b3 ^ |
|
365 |
(((byte) 0xE0 << 12) ^ |
|
366 |
((byte) 0x80 << 6) ^ |
|
367 |
((byte) 0x80 << 0)))); |
|
368 |
if (Character.isSurrogate(c)) |
|
369 |
return malformedForLength(src, mark, 3); |
|
370 |
dst.put(c); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
371 |
mark += 3; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
372 |
} else if ((b1 >> 3) == -2) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
373 |
// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
10898 | 374 |
int srcRemaining = limit - mark; |
375 |
if (srcRemaining < 4 || dst.remaining() < 2) { |
|
23880
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
23010
diff
changeset
|
376 |
b1 &= 0xff; |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
23010
diff
changeset
|
377 |
if (b1 > 0xf4 || |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
23010
diff
changeset
|
378 |
srcRemaining > 1 && isMalformed4_2(b1, src.get() & 0xff)) |
10898 | 379 |
return malformedForLength(src, mark, 1); |
380 |
if (srcRemaining > 2 && isMalformed4_3(src.get())) |
|
381 |
return malformedForLength(src, mark, 2); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
382 |
return xflow(src, mark, 4); |
10898 | 383 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
384 |
int b2 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
385 |
int b3 = src.get(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
386 |
int b4 = src.get(); |
5992 | 387 |
int uc = ((b1 << 18) ^ |
388 |
(b2 << 12) ^ |
|
389 |
(b3 << 6) ^ |
|
390 |
(b4 ^ |
|
391 |
(((byte) 0xF0 << 18) ^ |
|
392 |
((byte) 0x80 << 12) ^ |
|
393 |
((byte) 0x80 << 6) ^ |
|
394 |
((byte) 0x80 << 0)))); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
395 |
if (isMalformed4(b2, b3, b4) || |
5986
04eb44085c00
6934265: Add public method Character.isBmpCodePoint
martin
parents:
5506
diff
changeset
|
396 |
// shortest form check |
04eb44085c00
6934265: Add public method Character.isBmpCodePoint
martin
parents:
5506
diff
changeset
|
397 |
!Character.isSupplementaryCodePoint(uc)) { |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
398 |
return malformed(src, mark, 4); |
2 | 399 |
} |
5991
288afdbbca28
6933322: Add methods highSurrogate(), lowSurrogate() to class Character
martin
parents:
5986
diff
changeset
|
400 |
dst.put(Character.highSurrogate(uc)); |
288afdbbca28
6933322: Add methods highSurrogate(), lowSurrogate() to class Character
martin
parents:
5986
diff
changeset
|
401 |
dst.put(Character.lowSurrogate(uc)); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
402 |
mark += 4; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
403 |
} else { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
404 |
return malformed(src, mark, 1); |
2 | 405 |
} |
406 |
} |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
407 |
return xflow(src, mark, 0); |
2 | 408 |
} |
409 |
||
410 |
protected CoderResult decodeLoop(ByteBuffer src, |
|
411 |
CharBuffer dst) |
|
412 |
{ |
|
413 |
if (src.hasArray() && dst.hasArray()) |
|
414 |
return decodeArrayLoop(src, dst); |
|
415 |
else |
|
416 |
return decodeBufferLoop(src, dst); |
|
417 |
} |
|
9547
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
7668
diff
changeset
|
418 |
|
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
7668
diff
changeset
|
419 |
private static ByteBuffer getByteBuffer(ByteBuffer bb, byte[] ba, int sp) |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
7668
diff
changeset
|
420 |
{ |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
7668
diff
changeset
|
421 |
if (bb == null) |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
7668
diff
changeset
|
422 |
bb = ByteBuffer.wrap(ba); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
7668
diff
changeset
|
423 |
bb.position(sp); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
7668
diff
changeset
|
424 |
return bb; |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
7668
diff
changeset
|
425 |
} |
2 | 426 |
} |
427 |
||
48262 | 428 |
private static final class Encoder extends CharsetEncoder { |
2 | 429 |
|
430 |
private Encoder(Charset cs) { |
|
7282
b1f801518f89
6957230: CharsetEncoder.maxBytesPerChar() reports 4 for UTF-8; should be 3
sherman
parents:
5992
diff
changeset
|
431 |
super(cs, 1.1f, 3.0f); |
2 | 432 |
} |
433 |
||
434 |
public boolean canEncode(char c) { |
|
3714 | 435 |
return !Character.isSurrogate(c); |
2 | 436 |
} |
437 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
438 |
public boolean isLegalReplacement(byte[] repl) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
439 |
return ((repl.length == 1 && repl[0] >= 0) || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
440 |
super.isLegalReplacement(repl)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
441 |
} |
2 | 442 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
443 |
private static CoderResult overflow(CharBuffer src, int sp, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
444 |
ByteBuffer dst, int dp) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
445 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
446 |
return CoderResult.OVERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
447 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
448 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
449 |
private static CoderResult overflow(CharBuffer src, int mark) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
450 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
451 |
return CoderResult.OVERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
452 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
453 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
454 |
private Surrogate.Parser sgp; |
2 | 455 |
private CoderResult encodeArrayLoop(CharBuffer src, |
456 |
ByteBuffer dst) |
|
457 |
{ |
|
458 |
char[] sa = src.array(); |
|
459 |
int sp = src.arrayOffset() + src.position(); |
|
460 |
int sl = src.arrayOffset() + src.limit(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
461 |
|
2 | 462 |
byte[] da = dst.array(); |
463 |
int dp = dst.arrayOffset() + dst.position(); |
|
464 |
int dl = dst.arrayOffset() + dst.limit(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
465 |
int dlASCII = dp + Math.min(sl - sp, dl - dp); |
2 | 466 |
|
5992 | 467 |
// ASCII only loop |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
468 |
while (dp < dlASCII && sa[sp] < '\u0080') |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
469 |
da[dp++] = (byte) sa[sp++]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
470 |
while (sp < sl) { |
3714 | 471 |
char c = sa[sp]; |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
472 |
if (c < 0x80) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
473 |
// Have at most seven bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
474 |
if (dp >= dl) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
475 |
return overflow(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
476 |
da[dp++] = (byte)c; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
477 |
} else if (c < 0x800) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
478 |
// 2 bytes, 11 bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
479 |
if (dl - dp < 2) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
480 |
return overflow(src, sp, dst, dp); |
5992 | 481 |
da[dp++] = (byte)(0xc0 | (c >> 6)); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
482 |
da[dp++] = (byte)(0x80 | (c & 0x3f)); |
3714 | 483 |
} else if (Character.isSurrogate(c)) { |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
484 |
// Have a surrogate pair |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
485 |
if (sgp == null) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
486 |
sgp = new Surrogate.Parser(); |
3714 | 487 |
int uc = sgp.parse(c, sa, sp, sl); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
488 |
if (uc < 0) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
489 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
490 |
return sgp.error(); |
2 | 491 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
492 |
if (dl - dp < 4) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
493 |
return overflow(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
494 |
da[dp++] = (byte)(0xf0 | ((uc >> 18))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
495 |
da[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); |
5992 | 496 |
da[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
497 |
da[dp++] = (byte)(0x80 | (uc & 0x3f)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
498 |
sp++; // 2 chars |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
499 |
} else { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
500 |
// 3 bytes, 16 bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
501 |
if (dl - dp < 3) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
502 |
return overflow(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
503 |
da[dp++] = (byte)(0xe0 | ((c >> 12))); |
5992 | 504 |
da[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
505 |
da[dp++] = (byte)(0x80 | (c & 0x3f)); |
2 | 506 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
507 |
sp++; |
2 | 508 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
509 |
updatePositions(src, sp, dst, dp); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
510 |
return CoderResult.UNDERFLOW; |
2 | 511 |
} |
512 |
||
513 |
private CoderResult encodeBufferLoop(CharBuffer src, |
|
514 |
ByteBuffer dst) |
|
515 |
{ |
|
516 |
int mark = src.position(); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
517 |
while (src.hasRemaining()) { |
3714 | 518 |
char c = src.get(); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
519 |
if (c < 0x80) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
520 |
// Have at most seven bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
521 |
if (!dst.hasRemaining()) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
522 |
return overflow(src, mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
523 |
dst.put((byte)c); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
524 |
} else if (c < 0x800) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
525 |
// 2 bytes, 11 bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
526 |
if (dst.remaining() < 2) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
527 |
return overflow(src, mark); |
5992 | 528 |
dst.put((byte)(0xc0 | (c >> 6))); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
529 |
dst.put((byte)(0x80 | (c & 0x3f))); |
3714 | 530 |
} else if (Character.isSurrogate(c)) { |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
531 |
// Have a surrogate pair |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
532 |
if (sgp == null) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
533 |
sgp = new Surrogate.Parser(); |
3714 | 534 |
int uc = sgp.parse(c, src); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
535 |
if (uc < 0) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
536 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
537 |
return sgp.error(); |
2 | 538 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
539 |
if (dst.remaining() < 4) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
540 |
return overflow(src, mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
541 |
dst.put((byte)(0xf0 | ((uc >> 18)))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
542 |
dst.put((byte)(0x80 | ((uc >> 12) & 0x3f))); |
5992 | 543 |
dst.put((byte)(0x80 | ((uc >> 6) & 0x3f))); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
544 |
dst.put((byte)(0x80 | (uc & 0x3f))); |
5992 | 545 |
mark++; // 2 chars |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
546 |
} else { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
547 |
// 3 bytes, 16 bits |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
548 |
if (dst.remaining() < 3) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
549 |
return overflow(src, mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
550 |
dst.put((byte)(0xe0 | ((c >> 12)))); |
5992 | 551 |
dst.put((byte)(0x80 | ((c >> 6) & 0x3f))); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
552 |
dst.put((byte)(0x80 | (c & 0x3f))); |
2 | 553 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
554 |
mark++; |
2 | 555 |
} |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
556 |
src.position(mark); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
715
diff
changeset
|
557 |
return CoderResult.UNDERFLOW; |
2 | 558 |
} |
559 |
||
560 |
protected final CoderResult encodeLoop(CharBuffer src, |
|
561 |
ByteBuffer dst) |
|
562 |
{ |
|
563 |
if (src.hasArray() && dst.hasArray()) |
|
564 |
return encodeArrayLoop(src, dst); |
|
565 |
else |
|
566 |
return encodeBufferLoop(src, dst); |
|
567 |
} |
|
9547
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
7668
diff
changeset
|
568 |
|
2 | 569 |
} |
570 |
} |