author | jiangli |
Wed, 27 Sep 2017 17:55:20 -0400 | |
changeset 47548 | 664b9d44db74 |
parent 47216 | 71c04702a3d5 |
permissions | -rw-r--r-- |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
1 |
/* |
14342
8435a30053c1
7197491: update copyright year to match last edit in jdk8 jdk repository
alanb
parents:
10898
diff
changeset
|
2 |
* Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved. |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
3 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
4 |
* |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
5 |
* This code is free software; you can redistribute it and/or modify it |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
6 |
* under the terms of the GNU General Public License version 2 only, as |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
7 |
* published by the Free Software Foundation. |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
8 |
* |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
9 |
* This code is distributed in the hope that it will be useful, but WITHOUT |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
10 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
11 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
12 |
* version 2 for more details (a copy is included in the LICENSE file that |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
13 |
* accompanied this code). |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
14 |
* |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
15 |
* You should have received a copy of the GNU General Public License version |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
16 |
* 2 along with this work; if not, write to the Free Software Foundation, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
17 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
18 |
* |
5506 | 19 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
20 |
* or visit www.oracle.com if you need additional information or have any |
|
21 |
* questions. |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
22 |
*/ |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
23 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
24 |
/* |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
25 |
* @test |
23880
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
26 |
* @bug 4486841 7040220 7096080 8039751 |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
27 |
* @summary Test UTF-8 charset |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
28 |
*/ |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
29 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
30 |
import java.nio.charset.*; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
31 |
import java.nio.*; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
32 |
import java.util.*; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
33 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
34 |
public class TestUTF8 { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
35 |
static char[] decode(byte[] bb, String csn, boolean testDirect) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
36 |
throws Exception { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
37 |
CharsetDecoder dec = Charset.forName(csn).newDecoder(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
38 |
ByteBuffer bbf; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
39 |
CharBuffer cbf; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
40 |
if (testDirect) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
41 |
bbf = ByteBuffer.allocateDirect(bb.length); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
42 |
cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
43 |
bbf.put(bb).flip(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
44 |
} else { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
45 |
bbf = ByteBuffer.wrap(bb); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
46 |
cbf = CharBuffer.allocate(bb.length); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
47 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
48 |
CoderResult cr = dec.decode(bbf, cbf, true); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
49 |
if (cr != CoderResult.UNDERFLOW) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
50 |
throw new RuntimeException("Decoding err: " + csn); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
51 |
char[] cc = new char[cbf.position()]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
52 |
cbf.flip(); cbf.get(cc); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
53 |
return cc; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
54 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
55 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
56 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
57 |
static CoderResult decodeCR(byte[] bb, String csn, boolean testDirect) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
58 |
throws Exception { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
59 |
CharsetDecoder dec = Charset.forName(csn).newDecoder(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
60 |
ByteBuffer bbf; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
61 |
CharBuffer cbf; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
62 |
if (testDirect) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
63 |
bbf = ByteBuffer.allocateDirect(bb.length); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
64 |
cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
65 |
bbf.put(bb).flip(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
66 |
} else { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
67 |
bbf = ByteBuffer.wrap(bb); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
68 |
cbf = CharBuffer.allocate(bb.length); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
69 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
70 |
return dec.decode(bbf, cbf, true); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
71 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
72 |
|
9547
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
73 |
// copy/paste of the StringCoding.decode() |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
74 |
static char[] decode(Charset cs, byte[] ba, int off, int len) { |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
75 |
CharsetDecoder cd = cs.newDecoder(); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
76 |
int en = (int)(len * cd.maxCharsPerByte()); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
77 |
char[] ca = new char[en]; |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
78 |
if (len == 0) |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
79 |
return ca; |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
80 |
cd.onMalformedInput(CodingErrorAction.REPLACE) |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
81 |
.onUnmappableCharacter(CodingErrorAction.REPLACE) |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
82 |
.reset(); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
83 |
|
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
84 |
ByteBuffer bb = ByteBuffer.wrap(ba, off, len); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
85 |
CharBuffer cb = CharBuffer.wrap(ca); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
86 |
try { |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
87 |
CoderResult cr = cd.decode(bb, cb, true); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
88 |
if (!cr.isUnderflow()) |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
89 |
cr.throwException(); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
90 |
cr = cd.flush(cb); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
91 |
if (!cr.isUnderflow()) |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
92 |
cr.throwException(); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
93 |
} catch (CharacterCodingException x) { |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
94 |
throw new Error(x); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
95 |
} |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
96 |
return Arrays.copyOf(ca, cb.position()); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
97 |
} |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
98 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
99 |
static byte[] encode(char[] cc, String csn, boolean testDirect) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
100 |
throws Exception { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
101 |
ByteBuffer bbf; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
102 |
CharBuffer cbf; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
103 |
CharsetEncoder enc = Charset.forName(csn).newEncoder(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
104 |
if (testDirect) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
105 |
bbf = ByteBuffer.allocateDirect(cc.length * 4); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
106 |
cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
107 |
cbf.put(cc).flip(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
108 |
} else { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
109 |
bbf = ByteBuffer.allocate(cc.length * 4); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
110 |
cbf = CharBuffer.wrap(cc); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
111 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
112 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
113 |
CoderResult cr = enc.encode(cbf, bbf, true); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
114 |
if (cr != CoderResult.UNDERFLOW) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
115 |
throw new RuntimeException("Encoding err: " + csn); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
116 |
byte[] bb = new byte[bbf.position()]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
117 |
bbf.flip(); bbf.get(bb); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
118 |
return bb; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
119 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
120 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
121 |
static CoderResult encodeCR(char[] cc, String csn, boolean testDirect) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
122 |
throws Exception { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
123 |
ByteBuffer bbf; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
124 |
CharBuffer cbf; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
125 |
CharsetEncoder enc = Charset.forName(csn).newEncoder(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
126 |
if (testDirect) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
127 |
bbf = ByteBuffer.allocateDirect(cc.length * 4); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
128 |
cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
129 |
cbf.put(cc).flip(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
130 |
} else { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
131 |
bbf = ByteBuffer.allocate(cc.length * 4); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
132 |
cbf = CharBuffer.wrap(cc); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
133 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
134 |
return enc.encode(cbf, bbf, true); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
135 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
136 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
137 |
static char[] getUTFChars() { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
138 |
char[] cc = new char[0x10000 - 0xe000 + 0xd800 + //bmp |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
139 |
(0x110000 - 0x10000) * 2]; //supp |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
140 |
int pos = 0; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
141 |
int i = 0; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
142 |
for (i = 0; i < 0xd800; i++) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
143 |
cc[pos++] = (char)i; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
144 |
for (i = 0xe000; i < 0x10000; i++) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
145 |
cc[pos++] = (char)i; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
146 |
for (i = 0x10000; i < 0x110000; i++) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
147 |
pos += Character.toChars(i, cc, pos); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
148 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
149 |
return cc; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
150 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
151 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
152 |
static int to3ByteUTF8(char c, byte[] bb, int pos) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
153 |
bb[pos++] = (byte)(0xe0 | ((c >> 12))); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
154 |
bb[pos++] = (byte)(0x80 | ((c >> 06) & 0x3f)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
155 |
bb[pos++] = (byte)(0x80 | ((c >> 00) & 0x3f)); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
156 |
return 3; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
157 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
158 |
|
10898 | 159 |
static int to4ByteUTF8(int uc, byte[] bb, int pos) { |
160 |
bb[pos++] = (byte)(0xf0 | ((uc >> 18))); |
|
161 |
bb[pos++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); |
|
162 |
bb[pos++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); |
|
163 |
bb[pos++] = (byte)(0x80 | (uc & 0x3f)); |
|
164 |
return 4; |
|
165 |
} |
|
166 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
167 |
static void checkRoundtrip(String csn) throws Exception { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
168 |
System.out.printf(" Check roundtrip <%s>...", csn); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
169 |
char[] cc = getUTFChars(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
170 |
byte[] bb = encode(cc, csn, false); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
171 |
char[] ccO = decode(bb, csn, false); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
172 |
|
10898 | 173 |
if (!Arrays.equals(cc, ccO)) |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
174 |
System.out.printf(" non-direct failed"); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
175 |
bb = encode(cc, csn, true); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
176 |
ccO = decode(bb, csn, true); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
177 |
if (!Arrays.equals(cc, ccO)) { |
9547
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
178 |
System.out.print(" (direct) failed"); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
179 |
} |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
180 |
// String.getBytes()/toCharArray() goes to ArrayDe/Encoder path |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
181 |
if (!Arrays.equals(bb, new String(cc).getBytes(csn))) { |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
182 |
System.out.printf(" String.getBytes() failed"); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
183 |
} |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
184 |
if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) { |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
185 |
System.out.printf(" String.toCharArray() failed"); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
186 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
187 |
System.out.println(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
188 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
189 |
|
10898 | 190 |
static void check4ByteSurrs(String csn) throws Exception { |
191 |
System.out.printf(" Check 4-byte Surrogates <%s>...%n", csn); |
|
192 |
byte[] bb = new byte[(0x110000 - 0x10000) * 4]; |
|
193 |
char[] cc = new char[(0x110000 - 0x10000) * 2]; |
|
194 |
int bpos = 0; |
|
195 |
int cpos = 0; |
|
196 |
for (int i = 0x10000; i < 0x110000; i++) { |
|
197 |
Character.toChars(i, cc, cpos); |
|
198 |
bpos += to4ByteUTF8(i, bb, bpos); |
|
199 |
cpos += 2; |
|
200 |
} |
|
201 |
checkSurrs(csn, bb, cc); |
|
202 |
} |
|
203 |
||
204 |
||
205 |
static void checkSurrs(String csn, byte[] bb, char[] cc) |
|
206 |
throws Exception |
|
207 |
{ |
|
208 |
char[] ccO = decode(bb, csn, false); |
|
209 |
if (!Arrays.equals(cc, ccO)) { |
|
210 |
System.out.printf(" decoding failed%n"); |
|
211 |
} |
|
212 |
ccO = decode(bb, csn, true); |
|
213 |
if (!Arrays.equals(cc, ccO)) { |
|
214 |
System.out.printf(" decoding(direct) failed%n"); |
|
215 |
} |
|
216 |
if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) { |
|
217 |
System.out.printf(" String.toCharArray() failed"); |
|
218 |
} |
|
219 |
if (!Arrays.equals(bb, new String(cc).getBytes(csn))) { |
|
220 |
System.out.printf(" String.getBytes() failed"); |
|
221 |
} |
|
222 |
} |
|
223 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
224 |
static void check6ByteSurrs(String csn) throws Exception { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
225 |
System.out.printf(" Check 6-byte Surrogates <%s>...%n", csn); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
226 |
byte[] bb = new byte[(0x110000 - 0x10000) * 6]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
227 |
char[] cc = new char[(0x110000 - 0x10000) * 2]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
228 |
int bpos = 0; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
229 |
int cpos = 0; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
230 |
for (int i = 0x10000; i < 0x110000; i++) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
231 |
Character.toChars(i, cc, cpos); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
232 |
bpos += to3ByteUTF8(cc[cpos], bb, bpos); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
233 |
bpos += to3ByteUTF8(cc[cpos + 1], bb, bpos); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
234 |
cpos += 2; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
235 |
} |
10898 | 236 |
checkSurrs(csn, bb, cc); |
237 |
} |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
238 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
239 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
240 |
static void compare(String csn1, String csn2) throws Exception { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
241 |
System.out.printf(" Diff <%s> <%s>...%n", csn1, csn2); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
242 |
char[] cc = getUTFChars(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
243 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
244 |
byte[] bb1 = encode(cc, csn1, false); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
245 |
byte[] bb2 = encode(cc, csn2, false); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
246 |
if (!Arrays.equals(bb1, bb2)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
247 |
System.out.printf(" encoding failed%n"); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
248 |
char[] cc1 = decode(bb1, csn1, false); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
249 |
char[] cc2 = decode(bb1, csn2, false); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
250 |
if (!Arrays.equals(cc1, cc2)) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
251 |
System.out.printf(" decoding failed%n"); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
252 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
253 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
254 |
bb1 = encode(cc, csn1, true); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
255 |
bb2 = encode(cc, csn2, true); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
256 |
if (!Arrays.equals(bb1, bb2)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
257 |
System.out.printf(" encoding (direct) failed%n"); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
258 |
cc1 = decode(bb1, csn1, true); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
259 |
cc2 = decode(bb1, csn2, true); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
260 |
if (!Arrays.equals(cc1, cc2)) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
261 |
System.out.printf(" decoding (direct) failed%n"); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
262 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
263 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
264 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
265 |
// The first byte is the length of malformed bytes |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
266 |
static byte[][] malformed = { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
267 |
// One-byte sequences: |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
268 |
{1, (byte)0xFF }, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
269 |
{1, (byte)0xC0 }, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
270 |
{1, (byte)0x80 }, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
271 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
272 |
{1, (byte)0xFF, (byte)0xFF}, // all ones |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
273 |
{1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
274 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
275 |
// Two-byte sequences: |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
276 |
{1, (byte)0xC0, (byte)0x80}, // invalid first byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
277 |
{1, (byte)0xC1, (byte)0xBF}, // invalid first byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
278 |
{1, (byte)0xC2, (byte)0x00}, // invalid second byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
279 |
{1, (byte)0xC2, (byte)0xC0}, // invalid second byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
280 |
{1, (byte)0xD0, (byte)0x00}, // invalid second byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
281 |
{1, (byte)0xD0, (byte)0xC0}, // invalid second byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
282 |
{1, (byte)0xDF, (byte)0x00}, // invalid second byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
283 |
{1, (byte)0xDF, (byte)0xC0}, // invalid second byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
284 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
285 |
// Three-byte sequences |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
286 |
{1, (byte)0xE0, (byte)0x80, (byte)0x80}, // 111x first byte first nibble |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
287 |
{1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
288 |
{1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
289 |
{1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
290 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
291 |
{1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
292 |
{2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
293 |
{2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte |
23880
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
294 |
{2, (byte)0xE1, (byte)0x80, (byte)0x42}, // invalid third byte |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
295 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
296 |
{1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
297 |
{1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
298 |
{1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte |
10898 | 299 |
{1, (byte)0xE0, (byte)0x41,}, // invalid second byte & 2 bytes |
23880
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
300 |
{1, (byte)0xE1, (byte)0x40,}, // invalid second byte & 2 bytes |
10898 | 301 |
{3, (byte)0xED, (byte)0xAE, (byte)0x80 }, // 3 bytes surrogate |
302 |
{3, (byte)0xED, (byte)0xB0, (byte)0x80 }, // 3 bytes surrogate |
|
303 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
304 |
|
23880
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
305 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
306 |
// Four-byte sequences |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
307 |
{1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
308 |
{1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
309 |
{1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
310 |
{1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
311 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
312 |
{1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
313 |
{1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid second byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
314 |
{1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte |
10898 | 315 |
{1, (byte)0xF0, (byte)41 }, // invalid second byte |
316 |
// & only 2 bytes |
|
317 |
||
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
318 |
{2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte |
10898 | 319 |
{3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid fourth byte |
320 |
{2, (byte)0xF0, (byte)0x90, (byte)0x41 }, // invalid third byte |
|
321 |
// & 3 bytes input |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
322 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
323 |
{1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
324 |
{2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
325 |
{3, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid fourth byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
326 |
{1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
327 |
{1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
328 |
{1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
329 |
|
23880
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
330 |
// #8039751 |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
331 |
{1, (byte)0xF6, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 1st byte |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
332 |
{1, (byte)0xF6, (byte)0x80, (byte)0x80, }, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
333 |
{1, (byte)0xF6, (byte)0x80, }, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
334 |
{1, (byte)0xF6, }, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
335 |
{1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 1st byte |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
336 |
{1, (byte)0xF5, (byte)0x80, (byte)0x80, }, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
337 |
{1, (byte)0xF5, (byte)0x80, }, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
338 |
{1, (byte)0xF5 }, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
339 |
|
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
340 |
{1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0x80 }, // out-range 2nd byte |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
341 |
{1, (byte)0xF4, (byte)0x90, (byte)0x80 }, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
342 |
{1, (byte)0xF4, (byte)0x90 }, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
343 |
|
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
344 |
{1, (byte)0xF4, (byte)0x7f, (byte)0x80, (byte)0x80 }, // out-range/ascii 2nd byte |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
345 |
{1, (byte)0xF4, (byte)0x7f, (byte)0x80 }, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
346 |
{1, (byte)0xF4, (byte)0x7f }, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
347 |
|
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
348 |
{1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 2nd byte |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
349 |
{1, (byte)0xF0, (byte)0x80, (byte)0x80 }, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
350 |
{1, (byte)0xF0, (byte)0x80 }, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
351 |
|
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
352 |
{1, (byte)0xF0, (byte)0xc0, (byte)0x80, (byte)0x80 }, // out-range 2nd byte |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
353 |
{1, (byte)0xF0, (byte)0xc0, (byte)0x80 }, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
354 |
{1, (byte)0xF0, (byte)0xc0 }, |
7d6b060131d3
8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents:
14342
diff
changeset
|
355 |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
356 |
// Five-byte sequences |
10898 | 357 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid first byte |
358 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded |
|
359 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded |
|
360 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded |
|
361 |
{1, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
362 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
363 |
{1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80}, |
10898 | 364 |
{1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 }, |
365 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF }, |
|
366 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 }, |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
367 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
368 |
// Six-byte sequences |
10898 | 369 |
{1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded |
370 |
{1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded |
|
371 |
{1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded |
|
372 |
{1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
373 |
{1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, |
10898 | 374 |
{1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 }, |
375 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 }, |
|
376 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 }, |
|
377 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 }, |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
378 |
}; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
379 |
|
10898 | 380 |
// The first byte is the length of malformed bytes |
381 |
static byte[][] malformed_cesu8 = { |
|
382 |
// One-byte sequences: |
|
383 |
{1, (byte)0xFF }, |
|
384 |
{1, (byte)0xC0 }, |
|
385 |
{1, (byte)0x80 }, |
|
386 |
||
387 |
{1, (byte)0xFF, (byte)0xFF}, // all ones |
|
388 |
{1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble |
|
389 |
||
390 |
// Two-byte sequences: |
|
391 |
{1, (byte)0xC0, (byte)0x80}, // invalid first byte |
|
392 |
{1, (byte)0xC1, (byte)0xBF}, // invalid first byte |
|
393 |
{1, (byte)0xC2, (byte)0x00}, // invalid second byte |
|
394 |
{1, (byte)0xC2, (byte)0xC0}, // invalid second byte |
|
395 |
{1, (byte)0xD0, (byte)0x00}, // invalid second byte |
|
396 |
{1, (byte)0xD0, (byte)0xC0}, // invalid second byte |
|
397 |
{1, (byte)0xDF, (byte)0x00}, // invalid second byte |
|
398 |
{1, (byte)0xDF, (byte)0xC0}, // invalid second byte |
|
399 |
||
400 |
// Three-byte sequences |
|
401 |
{1, (byte)0xE0, (byte)0x80, (byte)0x80}, // 111x first byte first nibble |
|
402 |
{1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded |
|
403 |
{1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded |
|
404 |
{1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded |
|
405 |
||
406 |
{1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte |
|
407 |
{2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte |
|
408 |
{2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte |
|
409 |
{1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones |
|
410 |
{1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte |
|
411 |
{1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte |
|
412 |
{1, (byte)0xE0, (byte)0x41,}, // invalid second byte & 2 bytes |
|
413 |
||
414 |
// CESU-8 does not have 4-, 5- or 6-byte sequences |
|
415 |
// Four-byte sequences |
|
416 |
{1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded |
|
417 |
{1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded |
|
418 |
{1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded |
|
419 |
{1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded |
|
420 |
||
421 |
{1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones |
|
422 |
{1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid second byte |
|
423 |
{1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte |
|
424 |
{1, (byte)0xF0, (byte)41 }, // invalid second byte |
|
425 |
// & only 2 bytes |
|
426 |
{1, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte |
|
427 |
{1, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid fourth byte |
|
428 |
{1, (byte)0xF0, (byte)0x90, (byte)0x41 }, // invalid third byte |
|
429 |
// & 3 bytes input |
|
430 |
||
431 |
{1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte |
|
432 |
{1, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte |
|
433 |
{1, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid fourth byte |
|
434 |
{1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte |
|
435 |
{1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte |
|
436 |
{1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte |
|
437 |
||
438 |
// Five-byte sequences |
|
439 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid first byte |
|
440 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded |
|
441 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded |
|
442 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded |
|
443 |
{1, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded |
|
444 |
||
445 |
{1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80}, |
|
446 |
{1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 }, |
|
447 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF }, |
|
448 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 }, |
|
449 |
||
450 |
// Six-byte sequences |
|
451 |
{1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded |
|
452 |
{1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded |
|
453 |
{1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded |
|
454 |
{1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded |
|
455 |
{1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, |
|
456 |
{1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 }, |
|
457 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 }, |
|
458 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 }, |
|
459 |
{1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 }, |
|
460 |
}; |
|
461 |
||
462 |
||
463 |
static void checkMalformed(String csn, byte[][] malformed) throws Exception { |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
464 |
boolean failed = false; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
465 |
System.out.printf(" Check malformed <%s>...%n", csn); |
9547
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
466 |
Charset cs = Charset.forName(csn); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
467 |
for (boolean direct: new boolean[] {false, true}) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
468 |
for (byte[] bins : malformed) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
469 |
int mlen = bins[0]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
470 |
byte[] bin = Arrays.copyOfRange(bins, 1, bins.length); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
471 |
CoderResult cr = decodeCR(bin, csn, direct); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
472 |
String ashex = ""; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
473 |
for (int i = 0; i < bin.length; i++) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
474 |
if (i > 0) ashex += " "; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
475 |
ashex += Integer.toBinaryString((int)bin[i] & 0xff); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
476 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
477 |
if (!cr.isMalformed()) { |
9547
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
478 |
System.out.printf(" FAIL(direct=%b): [%s] not malformed.%n", direct, ashex); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
479 |
failed = true; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
480 |
} else if (cr.length() != mlen) { |
9547
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
481 |
System.out.printf(" FAIL(direct=%b): [%s] malformed[len=%d].%n", direct, ashex, cr.length()); |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
482 |
failed = true; |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
483 |
} |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
484 |
if (!Arrays.equals(decode(cs, bin, 0, bin.length), |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
485 |
new String(bin, csn).toCharArray())) { |
454881baaca0
7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents:
5506
diff
changeset
|
486 |
System.out.printf(" FAIL(new String(bb, %s)) failed%n", csn); |
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
487 |
failed = true; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
488 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
489 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
490 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
491 |
if (failed) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
492 |
throw new RuntimeException("Check malformed failed " + csn); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
493 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
494 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
495 |
static boolean check(CharsetDecoder dec, byte[] utf8s, boolean direct, int[] flow) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
496 |
int inPos = flow[0]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
497 |
int inLen = flow[1]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
498 |
int outPos = flow[2]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
499 |
int outLen = flow[3]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
500 |
int expedInPos = flow[4]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
501 |
int expedOutPos = flow[5]; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
502 |
CoderResult expedCR = (flow[6]==0)?CoderResult.UNDERFLOW |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
503 |
:CoderResult.OVERFLOW; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
504 |
ByteBuffer bbf; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
505 |
CharBuffer cbf; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
506 |
if (direct) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
507 |
bbf = ByteBuffer.allocateDirect(inPos + utf8s.length); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
508 |
cbf = ByteBuffer.allocateDirect((outPos + outLen)*2).asCharBuffer(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
509 |
} else { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
510 |
bbf = ByteBuffer.allocate(inPos + utf8s.length); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
511 |
cbf = CharBuffer.allocate(outPos + outLen); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
512 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
513 |
bbf.position(inPos); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
514 |
bbf.put(utf8s).flip().position(inPos).limit(inPos + inLen); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
515 |
cbf.position(outPos); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
516 |
dec.reset(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
517 |
CoderResult cr = dec.decode(bbf, cbf, false); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
518 |
if (cr != expedCR || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
519 |
bbf.position() != expedInPos || |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
520 |
cbf.position() != expedOutPos) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
521 |
System.out.printf("Expected(direct=%5b): [", direct); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
522 |
for (int i:flow) System.out.print(" " + i); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
523 |
System.out.println("] CR=" + cr + |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
524 |
", inPos=" + bbf.position() + |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
525 |
", outPos=" + cbf.position()); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
526 |
return false; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
527 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
528 |
return true; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
529 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
530 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
531 |
static void checkUnderOverflow(String csn) throws Exception { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
532 |
System.out.printf(" Check under/overflow <%s>...%n", csn); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
533 |
CharsetDecoder dec = Charset.forName(csn).newDecoder(); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
534 |
boolean failed = false; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
535 |
byte[] utf8s = new String("\u007f\u07ff\ue000\ud800\udc00").getBytes("UTF-8"); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
536 |
int inlen = utf8s.length; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
537 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
538 |
for (int inoff = 0; inoff < 20; inoff++) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
539 |
for (int outoff = 0; outoff < 20; outoff++) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
540 |
int[][] Flows = { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
541 |
//inpos, inLen, outPos, outLen, inPosEP, outposEP, under(0)/over(1) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
542 |
{inoff, inlen, outoff, 1, inoff + 1, outoff + 1, 1}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
543 |
{inoff, inlen, outoff, 2, inoff + 3, outoff + 2, 1}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
544 |
{inoff, inlen, outoff, 3, inoff + 6, outoff + 3, 1}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
545 |
{inoff, inlen, outoff, 4, inoff + 6, outoff + 3, 1}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
546 |
{inoff, inlen, outoff, 5, inoff + 10,outoff + 5, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
547 |
// underflow |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
548 |
{inoff, 1, outoff, 5, inoff + 1, outoff + 1, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
549 |
{inoff, 2, outoff, 5, inoff + 1, outoff + 1, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
550 |
{inoff, 3, outoff, 5, inoff + 3, outoff + 2, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
551 |
{inoff, 4, outoff, 5, inoff + 3, outoff + 2, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
552 |
{inoff, 5, outoff, 5, inoff + 3, outoff + 2, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
553 |
{inoff, 6, outoff, 5, inoff + 6, outoff + 3, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
554 |
{inoff, 7, outoff, 5, inoff + 6, outoff + 3, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
555 |
{inoff, 8, outoff, 5, inoff + 6, outoff + 3, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
556 |
{inoff, 9, outoff, 5, inoff + 6, outoff + 3, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
557 |
{inoff, 10, outoff, 5, inoff + 10,outoff + 5, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
558 |
// 2-byte underflow/overflow |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
559 |
{inoff, 2, outoff, 1, inoff + 1, outoff + 1, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
560 |
{inoff, 3, outoff, 1, inoff + 1, outoff + 1, 1}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
561 |
// 3-byte underflow/overflow |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
562 |
{inoff, 4, outoff, 2, inoff + 3, outoff + 2, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
563 |
{inoff, 5, outoff, 2, inoff + 3, outoff + 2, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
564 |
{inoff, 6, outoff, 2, inoff + 3, outoff + 2, 1}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
565 |
// 4-byte underflow/overflow |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
566 |
{inoff, 7, outoff, 4, inoff + 6, outoff + 3, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
567 |
{inoff, 8, outoff, 4, inoff + 6, outoff + 3, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
568 |
{inoff, 9, outoff, 4, inoff + 6, outoff + 3, 0}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
569 |
{inoff, 10, outoff, 4, inoff + 6, outoff + 3, 1}, |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
570 |
}; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
571 |
for (boolean direct: new boolean[] {false, true}) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
572 |
for (int[] flow: Flows) { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
573 |
if (!check(dec, utf8s, direct, flow)) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
574 |
failed = true; |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
575 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
576 |
}}} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
577 |
if (failed) |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
578 |
throw new RuntimeException("Check under/overflow failed " + csn); |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
579 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
580 |
|
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
581 |
public static void main(String[] args) throws Exception { |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
582 |
checkRoundtrip("UTF-8"); |
10898 | 583 |
check4ByteSurrs("UTF-8"); |
584 |
checkMalformed("UTF-8", malformed); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
585 |
checkUnderOverflow("UTF-8"); |
10898 | 586 |
checkRoundtrip("CESU-8"); |
587 |
check6ByteSurrs("CESU-8"); |
|
588 |
checkMalformed("CESU-8", malformed_cesu8); |
|
1091
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
589 |
} |
136d19d6c372
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff
changeset
|
590 |
} |