author | serb |
Sun, 24 Jun 2018 16:35:21 -0700 | |
changeset 50834 | 9cf279436b9d |
parent 47216 | 71c04702a3d5 |
child 51823 | 2a51125b2794 |
permissions | -rw-r--r-- |
1 | 1 |
/* |
40901 | 2 |
* Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved. |
1 | 3 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 |
* |
|
5 |
* This code is free software; you can redistribute it and/or modify it |
|
6 |
* under the terms of the GNU General Public License version 2 only, as |
|
7 |
* published by the Free Software Foundation. |
|
8 |
* |
|
9 |
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
10 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
11 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
12 |
* version 2 for more details (a copy is included in the LICENSE file that |
|
13 |
* accompanied this code). |
|
14 |
* |
|
15 |
* You should have received a copy of the GNU General Public License version |
|
16 |
* 2 along with this work; if not, write to the Free Software Foundation, |
|
17 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
18 |
* |
|
5547
f4b087cbb361
6941466: Oracle rebranding changes for Hotspot repositories
trims
parents:
1
diff
changeset
|
19 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
f4b087cbb361
6941466: Oracle rebranding changes for Hotspot repositories
trims
parents:
1
diff
changeset
|
20 |
* or visit www.oracle.com if you need additional information or have any |
f4b087cbb361
6941466: Oracle rebranding changes for Hotspot repositories
trims
parents:
1
diff
changeset
|
21 |
* questions. |
1 | 22 |
* |
23 |
*/ |
|
24 |
||
7397 | 25 |
#include "precompiled.hpp" |
26 |
#include "utilities/utf8.hpp" |
|
1 | 27 |
|
28 |
// Assume the utf8 string is in legal form and has been |
|
29 |
// checked in the class file parser/format checker. |
|
33628 | 30 |
template<typename T> char* UTF8::next(const char* str, T* value) { |
1 | 31 |
unsigned const char *ptr = (const unsigned char *)str; |
32 |
unsigned char ch, ch2, ch3; |
|
33 |
int length = -1; /* bad length */ |
|
34 |
jchar result; |
|
35 |
switch ((ch = ptr[0]) >> 4) { |
|
36 |
default: |
|
37 |
result = ch; |
|
38 |
length = 1; |
|
39 |
break; |
|
40 |
||
41 |
case 0x8: case 0x9: case 0xA: case 0xB: case 0xF: |
|
42 |
/* Shouldn't happen. */ |
|
43 |
break; |
|
44 |
||
45 |
case 0xC: case 0xD: |
|
46 |
/* 110xxxxx 10xxxxxx */ |
|
47 |
if (((ch2 = ptr[1]) & 0xC0) == 0x80) { |
|
48 |
unsigned char high_five = ch & 0x1F; |
|
49 |
unsigned char low_six = ch2 & 0x3F; |
|
50 |
result = (high_five << 6) + low_six; |
|
51 |
length = 2; |
|
52 |
break; |
|
53 |
} |
|
54 |
break; |
|
55 |
||
56 |
case 0xE: |
|
57 |
/* 1110xxxx 10xxxxxx 10xxxxxx */ |
|
58 |
if (((ch2 = ptr[1]) & 0xC0) == 0x80) { |
|
59 |
if (((ch3 = ptr[2]) & 0xC0) == 0x80) { |
|
60 |
unsigned char high_four = ch & 0x0f; |
|
61 |
unsigned char mid_six = ch2 & 0x3f; |
|
62 |
unsigned char low_six = ch3 & 0x3f; |
|
63 |
result = (((high_four << 6) + mid_six) << 6) + low_six; |
|
64 |
length = 3; |
|
65 |
} |
|
66 |
} |
|
67 |
break; |
|
68 |
} /* end of switch */ |
|
69 |
||
70 |
if (length <= 0) { |
|
33628 | 71 |
*value = (T)ptr[0]; /* default bad result; */ |
1 | 72 |
return (char*)(ptr + 1); // make progress somehow |
73 |
} |
|
74 |
||
33628 | 75 |
*value = (T)result; |
1 | 76 |
|
77 |
// The assert is correct but the .class file is wrong |
|
78 |
// assert(UNICODE::utf8_size(result) == length, "checking reverse computation"); |
|
79 |
return (char *)(ptr + length); |
|
80 |
} |
|
81 |
||
82 |
char* UTF8::next_character(const char* str, jint* value) { |
|
83 |
unsigned const char *ptr = (const unsigned char *)str; |
|
84 |
/* See if it's legal supplementary character: |
|
85 |
11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx */ |
|
86 |
if (is_supplementary_character(ptr)) { |
|
87 |
*value = get_supplementary_character(ptr); |
|
88 |
return (char *)(ptr + 6); |
|
89 |
} |
|
90 |
jchar result; |
|
91 |
char* next_ch = next(str, &result); |
|
92 |
*value = result; |
|
93 |
return next_ch; |
|
94 |
} |
|
95 |
||
96 |
// Count bytes of the form 10xxxxxx and deduct this count |
|
97 |
// from the total byte count. The utf8 string must be in |
|
98 |
// legal form which has been verified in the format checker. |
|
33628 | 99 |
int UTF8::unicode_length(const char* str, int len, bool& is_latin1, bool& has_multibyte) { |
1 | 100 |
int num_chars = len; |
33628 | 101 |
has_multibyte = false; |
102 |
is_latin1 = true; |
|
103 |
unsigned char prev = 0; |
|
1 | 104 |
for (int i = 0; i < len; i++) { |
33628 | 105 |
unsigned char c = str[i]; |
106 |
if ((c & 0xC0) == 0x80) { |
|
107 |
// Multibyte, check if valid latin1 character. |
|
108 |
has_multibyte = true; |
|
109 |
if (prev > 0xC3) { |
|
110 |
is_latin1 = false; |
|
111 |
} |
|
1 | 112 |
--num_chars; |
113 |
} |
|
33628 | 114 |
prev = c; |
1 | 115 |
} |
116 |
return num_chars; |
|
117 |
} |
|
118 |
||
119 |
// Count bytes of the utf8 string except those in form |
|
120 |
// 10xxxxxx which only appear in multibyte characters. |
|
121 |
// The utf8 string must be in legal form and has been |
|
122 |
// verified in the format checker. |
|
33628 | 123 |
int UTF8::unicode_length(const char* str, bool& is_latin1, bool& has_multibyte) { |
1 | 124 |
int num_chars = 0; |
33628 | 125 |
has_multibyte = false; |
126 |
is_latin1 = true; |
|
127 |
unsigned char prev = 0; |
|
1 | 128 |
for (const char* p = str; *p; p++) { |
33628 | 129 |
unsigned char c = (*p); |
130 |
if ((c & 0xC0) == 0x80) { |
|
131 |
// Multibyte, check if valid latin1 character. |
|
132 |
has_multibyte = true; |
|
133 |
if (prev > 0xC3) { |
|
134 |
is_latin1 = false; |
|
135 |
} |
|
136 |
} else { |
|
1 | 137 |
num_chars++; |
138 |
} |
|
33628 | 139 |
prev = c; |
1 | 140 |
} |
141 |
return num_chars; |
|
142 |
} |
|
143 |
||
33628 | 144 |
// Writes a jchar as utf8 and returns the end |
1 | 145 |
static u_char* utf8_write(u_char* base, jchar ch) { |
146 |
if ((ch != 0) && (ch <=0x7f)) { |
|
147 |
base[0] = (u_char) ch; |
|
148 |
return base + 1; |
|
149 |
} |
|
150 |
||
151 |
if (ch <= 0x7FF) { |
|
152 |
/* 11 bits or less. */ |
|
153 |
unsigned char high_five = ch >> 6; |
|
154 |
unsigned char low_six = ch & 0x3F; |
|
155 |
base[0] = high_five | 0xC0; /* 110xxxxx */ |
|
156 |
base[1] = low_six | 0x80; /* 10xxxxxx */ |
|
157 |
return base + 2; |
|
158 |
} |
|
159 |
/* possibly full 16 bits. */ |
|
160 |
char high_four = ch >> 12; |
|
161 |
char mid_six = (ch >> 6) & 0x3F; |
|
162 |
char low_six = ch & 0x3f; |
|
163 |
base[0] = high_four | 0xE0; /* 1110xxxx */ |
|
164 |
base[1] = mid_six | 0x80; /* 10xxxxxx */ |
|
165 |
base[2] = low_six | 0x80; /* 10xxxxxx */ |
|
166 |
return base + 3; |
|
167 |
} |
|
168 |
||
33628 | 169 |
// Decodes exactly 'unicode_length' characters from 'utf8_str' into
// 'unicode_str'. The caller supplies the character count; no terminator is
// looked for and none is written.
template<typename T> void UTF8::convert_to_unicode(const char* utf8_str, T* unicode_str, int unicode_length) {
  const char* cursor = utf8_str;
  int out = 0;

  // Fast path: copy the leading run of plain ASCII bytes directly.
  while (out < unicode_length) {
    const unsigned char byte = (unsigned char)*cursor;
    if (byte > 0x7F) {
      break;
    }
    unicode_str[out] = (T)byte;
    out++;
    cursor++;
  }

  // General path: decode whatever is left one character at a time.
  while (out < unicode_length) {
    cursor = UTF8::next(cursor, &unicode_str[out]);
    out++;
  }
}
|
185 |
||
33628 | 186 |
// Explicit instantiation for all supported string types. |
187 |
template char* UTF8::next<jchar>(const char* str, jchar* value); |
|
188 |
template char* UTF8::next<jbyte>(const char* str, jbyte* value); |
|
189 |
template void UTF8::convert_to_unicode<jchar>(const char* utf8_str, jchar* unicode_str, int unicode_length); |
|
190 |
template void UTF8::convert_to_unicode<jbyte>(const char* utf8_str, jbyte* unicode_str, int unicode_length); |
|
191 |
||
14477
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
192 |
// returns the quoted ascii length of a 0-terminated utf8 string |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
193 |
int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
194 |
const char *ptr = utf8_str; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
195 |
const char* end = ptr + utf8_length; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
196 |
int result = 0; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
197 |
while (ptr < end) { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
198 |
jchar c; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
199 |
ptr = UTF8::next(ptr, &c); |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
200 |
if (c >= 32 && c < 127) { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
201 |
result++; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
202 |
} else { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
203 |
result += 6; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
204 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
205 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
206 |
return result; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
207 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
208 |
|
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
209 |
// converts a utf8 string to quoted ascii |
16602
5df51d3bc550
8011048: Possible reading from unmapped memory in UTF8::as_quoted_ascii()
iklam
parents:
14477
diff
changeset
|
210 |
void UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen) { |
14477
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
211 |
const char *ptr = utf8_str; |
16602
5df51d3bc550
8011048: Possible reading from unmapped memory in UTF8::as_quoted_ascii()
iklam
parents:
14477
diff
changeset
|
212 |
const char *utf8_end = ptr + utf8_length; |
14477
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
213 |
char* p = buf; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
214 |
char* end = buf + buflen; |
16602
5df51d3bc550
8011048: Possible reading from unmapped memory in UTF8::as_quoted_ascii()
iklam
parents:
14477
diff
changeset
|
215 |
while (ptr < utf8_end) { |
14477
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
216 |
jchar c; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
217 |
ptr = UTF8::next(ptr, &c); |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
218 |
if (c >= 32 && c < 127) { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
219 |
if (p + 1 >= end) break; // string is truncated |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
220 |
*p++ = (char)c; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
221 |
} else { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
222 |
if (p + 6 >= end) break; // string is truncated |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
223 |
sprintf(p, "\\u%04x", c); |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
224 |
p += 6; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
225 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
226 |
} |
16602
5df51d3bc550
8011048: Possible reading from unmapped memory in UTF8::as_quoted_ascii()
iklam
parents:
14477
diff
changeset
|
227 |
assert(p < end, "sanity"); |
14477
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
228 |
*p = '\0'; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
229 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
230 |
|
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
231 |
|
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
232 |
const char* UTF8::from_quoted_ascii(const char* quoted_ascii_str) { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
233 |
const char *ptr = quoted_ascii_str; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
234 |
char* result = NULL; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
235 |
while (*ptr != '\0') { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
236 |
char c = *ptr; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
237 |
if (c < 32 || c >= 127) break; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
238 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
239 |
if (*ptr == '\0') { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
240 |
// nothing to do so return original string |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
241 |
return quoted_ascii_str; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
242 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
243 |
// everything up to this point was ok. |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
244 |
int length = ptr - quoted_ascii_str; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
245 |
char* buffer = NULL; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
246 |
for (int round = 0; round < 2; round++) { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
247 |
while (*ptr != '\0') { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
248 |
if (*ptr != '\\') { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
249 |
if (buffer != NULL) { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
250 |
buffer[length] = *ptr; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
251 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
252 |
length++; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
253 |
} else { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
254 |
switch (ptr[1]) { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
255 |
case 'u': { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
256 |
ptr += 2; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
257 |
jchar value=0; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
258 |
for (int i=0; i<4; i++) { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
259 |
char c = *ptr++; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
260 |
switch (c) { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
261 |
case '0': case '1': case '2': case '3': case '4': |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
262 |
case '5': case '6': case '7': case '8': case '9': |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
263 |
value = (value << 4) + c - '0'; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
264 |
break; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
265 |
case 'a': case 'b': case 'c': |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
266 |
case 'd': case 'e': case 'f': |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
267 |
value = (value << 4) + 10 + c - 'a'; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
268 |
break; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
269 |
case 'A': case 'B': case 'C': |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
270 |
case 'D': case 'E': case 'F': |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
271 |
value = (value << 4) + 10 + c - 'A'; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
272 |
break; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
273 |
default: |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
274 |
ShouldNotReachHere(); |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
275 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
276 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
277 |
if (buffer == NULL) { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
278 |
char utf8_buffer[4]; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
279 |
char* next = (char*)utf8_write((u_char*)utf8_buffer, value); |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
280 |
length += next - utf8_buffer; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
281 |
} else { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
282 |
char* next = (char*)utf8_write((u_char*)&buffer[length], value); |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
283 |
length += next - &buffer[length]; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
284 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
285 |
break; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
286 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
287 |
case 't': if (buffer != NULL) buffer[length] = '\t'; ptr += 2; length++; break; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
288 |
case 'n': if (buffer != NULL) buffer[length] = '\n'; ptr += 2; length++; break; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
289 |
case 'r': if (buffer != NULL) buffer[length] = '\r'; ptr += 2; length++; break; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
290 |
case 'f': if (buffer != NULL) buffer[length] = '\f'; ptr += 2; length++; break; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
291 |
default: |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
292 |
ShouldNotReachHere(); |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
293 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
294 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
295 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
296 |
if (round == 0) { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
297 |
buffer = NEW_RESOURCE_ARRAY(char, length + 1); |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
298 |
ptr = quoted_ascii_str; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
299 |
} else { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
300 |
buffer[length] = '\0'; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
301 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
302 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
303 |
return buffer; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
304 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
305 |
|
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
306 |
|
1 | 307 |
// Returns NULL if 'c' it not found. This only works as long |
308 |
// as 'c' is an ASCII character |
|
8076
96d498ec7ae1
6990754: Use native memory and reference counting to implement SymbolTable
coleenp
parents:
7397
diff
changeset
|
309 |
const jbyte* UTF8::strrchr(const jbyte* base, int length, jbyte c) { |
1 | 310 |
assert(length >= 0, "sanity check"); |
311 |
assert(c >= 0, "does not work for non-ASCII characters"); |
|
312 |
// Skip backwards in string until 'c' is found or end is reached |
|
313 |
while(--length >= 0 && base[length] != c); |
|
314 |
return (length < 0) ? NULL : &base[length]; |
|
315 |
} |
|
316 |
||
8076
96d498ec7ae1
6990754: Use native memory and reference counting to implement SymbolTable
coleenp
parents:
7397
diff
changeset
|
317 |
bool UTF8::equal(const jbyte* base1, int length1, const jbyte* base2, int length2) { |
1 | 318 |
// Length must be the same |
319 |
if (length1 != length2) return false; |
|
320 |
for (int i = 0; i < length1; i++) { |
|
321 |
if (base1[i] != base2[i]) return false; |
|
322 |
} |
|
323 |
return true; |
|
324 |
} |
|
325 |
||
326 |
// Returns true if the six bytes at 'str' have the form
//   11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx
// Bytes are examined left to right and the check stops at the first
// mismatch, so later bytes are only read if all earlier ones matched.
bool UTF8::is_supplementary_character(const unsigned char* str) {
  // first half: 11101101 1010xxxx 10xxxxxx
  if (str[0] != 0xED)          return false;
  if ((str[1] & 0xF0) != 0xA0) return false;
  if ((str[2] & 0xC0) != 0x80) return false;
  // second half: 11101101 1011xxxx 10xxxxxx
  if (str[3] != 0xED)          return false;
  if ((str[4] & 0xF0) != 0xB0) return false;
  return (str[5] & 0xC0) == 0x80;
}
|
330 |
||
331 |
// Combines the payload bits of a six byte supplementary character encoding
// into a single value. Only str[1], str[2], str[4] and str[5] carry payload;
// the form of the sequence is not checked here.
jint UTF8::get_supplementary_character(const unsigned char* str) {
  const jint upper = ((str[1] & 0x0f) << 16) + ((str[2] & 0x3f) << 10);
  const jint lower = ((str[4] & 0x0f) << 6) + (str[5] & 0x3f);
  return 0x10000 + upper + lower;
}
|
335 |
||
36508 | 336 |
// Returns true if the 'length' bytes at 'buffer' are legal (modified) UTF-8:
// no zero bytes, no stray continuation bytes, no lead bytes of the form
// 1111xxxx, and every multi-byte sequence complete. Unless 'version_leq_47'
// is set, two and three byte sequences must also not encode a value that
// would fit a shorter form (zero as a two byte sequence is allowed).
bool UTF8::is_legal_utf8(const unsigned char* buffer, int length,
                         bool version_leq_47) {
  int pos = 0;

  // Fast path: skip four bytes at a time while all of them are non-zero ASCII.
  // For an unsigned char v,
  //   (v | (v - 1)) is <  128 (highest bit 0) for 0 < v < 128;
  //   (v | (v - 1)) is >= 128 (highest bit 1) for v == 0 or v >= 128.
  for (int quads = length >> 2; quads > 0; quads--) {
    int combined = 0;
    for (int j = 0; j < 4; j++) {
      const unsigned char b = buffer[pos + j];
      combined |= b | (b - 1);
    }
    if ((unsigned char)combined >= 128) {
      break;
    }
    pos += 4;
  }

  // Slow path: examine the remaining bytes one character at a time.
  for (; pos < length; pos++) {
    const unsigned char lead = buffer[pos];

    // no embedded zeros
    if (lead == 0) {
      return false;
    }
    if (lead < 128) {
      continue;
    }

    // see if it's legal supplementary character (needs six bytes)
    if ((pos + 5) < length && UTF8::is_supplementary_character(&buffer[pos])) {
      pos += 5;
      continue;
    }

    unsigned short c;
    switch (lead >> 4) {
      case 0xC: case 0xD: {  // 110xxxxx 10xxxxxx
        pos++;
        if (pos >= length || (buffer[pos] & 0xC0) != 0x80) {
          return false;
        }
        c = ((lead & 0x1F) << 6) + (buffer[pos] & 0x3F);
        if (!(version_leq_47 || c == 0 || c >= 0x80)) {
          return false;
        }
        break;
      }
      case 0xE: {  // 1110xxxx 10xxxxxx 10xxxxxx
        pos += 2;
        if (pos >= length ||
            (buffer[pos - 1] & 0xC0) != 0x80 ||
            (buffer[pos] & 0xC0) != 0x80) {
          return false;
        }
        c = ((lead & 0xF) << 12) + ((buffer[pos - 1] & 0x3F) << 6) + (buffer[pos] & 0x3F);
        if (!(version_leq_47 || c >= 0x800)) {
          return false;
        }
        break;
      }
      default:
        // lead >= 128 here, so what remains is 0x8..0xB (continuation byte
        // without a lead byte) and 0xF (1111xxxx).
        return false;
    }
  }
  return true;
}
|
397 |
||
1 | 398 |
//------------------------------------------------------------------------------------- |
399 |
||
33628 | 400 |
// Returns true if 'c' fits into eight bits (0x00..0xFF).
bool UNICODE::is_latin1(jchar c) {
  return c < 0x0100;
}
|
403 |
||
404 |
// Returns true if every one of the 'length' characters at 'base' fits into
// eight bits (0x00..0xFF). An empty range is latin1.
bool UNICODE::is_latin1(jchar* base, int length) {
  int remaining = length;
  const jchar* cursor = base;
  while (remaining > 0) {
    if (*cursor > 0x00FF) {
      return false;
    }
    cursor++;
    remaining--;
  }
  return true;
}
|
1 | 412 |
|
413 |
// Returns the number of bytes needed to encode 'c':
//   0x0001..0x007F          -> 1 (ASCII character)
//   0x0000, 0x0080..0x07FF  -> 2 (zero is not encoded as a single byte)
//   0x0800..0xFFFF          -> 3
int UNICODE::utf8_size(jchar c) {
  if (c == 0 || c > 0x007F) {
    return (c <= 0x07FF) ? 2 : 3;
  }
  return 1;
}
423 |
||
33628 | 424 |
// Returns the number of bytes needed to encode the latin1 character 'c'.
// Since 'c' is signed, (c >= 0x01) is equivalent to (0x01 <= c) && (c <= 0x7F),
// i.e. an ASCII character that takes one byte. Non-ASCII characters and 0x00
// (two-byte encoded as 0xC080 in modified UTF-8) take two bytes.
int UNICODE::utf8_size(jbyte c) {
  return (c >= 0x01) ? 1 : 2;
}
435 |
||
42057
6a5b8ebcd3f2
8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents:
40901
diff
changeset
|
436 |
template<typename T> |
6a5b8ebcd3f2
8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents:
40901
diff
changeset
|
437 |
int UNICODE::utf8_length(T* base, int length) { |
1 | 438 |
int result = 0; |
439 |
for (int index = 0; index < length; index++) { |
|
42057
6a5b8ebcd3f2
8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents:
40901
diff
changeset
|
440 |
T c = base[index]; |
33628 | 441 |
result += utf8_size(c); |
442 |
} |
|
443 |
return result; |
|
444 |
} |
|
445 |
||
42057
6a5b8ebcd3f2
8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents:
40901
diff
changeset
|
446 |
template<typename T> |
6a5b8ebcd3f2
8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents:
40901
diff
changeset
|
447 |
char* UNICODE::as_utf8(T* base, int& length) { |
1 | 448 |
int utf8_len = utf8_length(base, length); |
24237
7b210ef8c830
6664815: Eliminate redundant memcpy operation in jni_GetStringUTFRegion
mgerdin
parents:
16602
diff
changeset
|
449 |
u_char* buf = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1); |
7b210ef8c830
6664815: Eliminate redundant memcpy operation in jni_GetStringUTFRegion
mgerdin
parents:
16602
diff
changeset
|
450 |
char* result = as_utf8(base, length, (char*) buf, utf8_len + 1); |
7b210ef8c830
6664815: Eliminate redundant memcpy operation in jni_GetStringUTFRegion
mgerdin
parents:
16602
diff
changeset
|
451 |
assert((int) strlen(result) == utf8_len, "length prediction must be correct"); |
42057
6a5b8ebcd3f2
8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents:
40901
diff
changeset
|
452 |
// Set string length to uft8 length |
6a5b8ebcd3f2
8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents:
40901
diff
changeset
|
453 |
length = utf8_len; |
33628 | 454 |
return (char*) result; |
455 |
} |
|
456 |
||
1 | 457 |
// Encodes up to 'length' jchars from 'base' as modified UTF-8 into 'buf'.
//   buf    - destination buffer
//   buflen - capacity of 'buf' in bytes, including room for the NUL
// Encoding stops early (the result is truncated) as soon as the next
// character plus the terminating NUL would no longer fit. A NUL is always
// written, so 'buf' must be able to hold at least one byte.
// Returns 'buf'.
char* UNICODE::as_utf8(jchar* base, int length, char* buf, int buflen) {
  u_char* out = (u_char*) buf;
  int remaining = buflen;
  for (int i = 0; i < length; i++) {
    const jchar ch = base[i];
    remaining -= utf8_size(ch);
    if (remaining <= 0) {
      break;  // string is truncated
    }
    out = utf8_write(out, ch);
  }
  *out = '\0';
  return buf;
}
|
468 |
||
33628 | 469 |
// Encodes up to 'length' jbytes from 'base' as modified UTF-8 into 'buf'.
//   buf    - destination buffer
//   buflen - capacity of 'buf' in bytes, including room for the NUL
// Encoding stops early (the result is truncated) as soon as the next
// character plus the terminating NUL would no longer fit. A NUL is always
// written, so 'buf' must be able to hold at least one byte.
// Returns 'buf'.
char* UNICODE::as_utf8(jbyte* base, int length, char* buf, int buflen) {
  u_char* p = (u_char*)buf;
  for (int index = 0; index < length; index++) {
    jbyte c = base[index];
    int sz = utf8_size(c);
    buflen -= sz;
    if (buflen <= 0) break;      // string is truncated
    if (sz == 1) {
      // Copy ASCII characters (UTF-8 is ASCII compatible)
      *p++ = c;
    } else {
      // Non-ASCII character or 0x00 which should
      // be encoded as 0xC080 in "modified" UTF8.
      // The mask keeps only the low eight bits after widening the
      // (possibly negative) jbyte to jchar.
      p = utf8_write(p, ((jchar) c) & 0xff);
    }
  }
  *p = '\0';
  return buf;
}
|
489 |
||
1 | 490 |
void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) { |
491 |
for(int index = 0; index < length; index++) { |
|
492 |
utf8_buffer = (char*)utf8_write((u_char*)utf8_buffer, base[index]); |
|
493 |
} |
|
494 |
*utf8_buffer = '\0'; |
|
495 |
} |
|
14477
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
496 |
|
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
497 |
// returns the quoted ascii length of a unicode string |
33628 | 498 |
template<typename T> |
499 |
int UNICODE::quoted_ascii_length(T* base, int length) { |
|
14477
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
500 |
int result = 0; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
501 |
for (int i = 0; i < length; i++) { |
33628 | 502 |
T c = base[i]; |
14477
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
503 |
if (c >= 32 && c < 127) { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
504 |
result++; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
505 |
} else { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
506 |
result += 6; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
507 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
508 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
509 |
return result; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
510 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
511 |
|
33628 | 512 |
// converts a unicode string to quoted ascii |
513 |
template<typename T> |
|
514 |
void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen) { |
|
14477
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
515 |
char* p = buf; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
516 |
char* end = buf + buflen; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
517 |
for (int index = 0; index < length; index++) { |
33628 | 518 |
T c = base[index]; |
14477
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
519 |
if (c >= 32 && c < 127) { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
520 |
if (p + 1 >= end) break; // string is truncated |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
521 |
*p++ = (char)c; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
522 |
} else { |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
523 |
if (p + 6 >= end) break; // string is truncated |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
524 |
sprintf(p, "\\u%04x", c); |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
525 |
p += 6; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
526 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
527 |
} |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
528 |
*p = '\0'; |
95e66ea71f71
6830717: replay of compilations would help with debugging
minqi
parents:
8921
diff
changeset
|
529 |
} |
24237
7b210ef8c830
6664815: Eliminate redundant memcpy operation in jni_GetStringUTFRegion
mgerdin
parents:
16602
diff
changeset
|
530 |
|
33628 | 531 |
// Explicit instantiation for all supported types. |
42057
6a5b8ebcd3f2
8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents:
40901
diff
changeset
|
532 |
template int UNICODE::utf8_length(jbyte* base, int length); |
6a5b8ebcd3f2
8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents:
40901
diff
changeset
|
533 |
template int UNICODE::utf8_length(jchar* base, int length); |
6a5b8ebcd3f2
8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents:
40901
diff
changeset
|
534 |
template char* UNICODE::as_utf8(jbyte* base, int& length); |
6a5b8ebcd3f2
8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents:
40901
diff
changeset
|
535 |
template char* UNICODE::as_utf8(jchar* base, int& length); |
33628 | 536 |
template int UNICODE::quoted_ascii_length<jbyte>(jbyte* base, int length); |
537 |
template int UNICODE::quoted_ascii_length<jchar>(jchar* base, int length); |
|
538 |
template void UNICODE::as_quoted_ascii<jbyte>(const jbyte* base, int length, char* buf, int buflen); |
|
539 |
template void UNICODE::as_quoted_ascii<jchar>(const jchar* base, int length, char* buf, int buflen); |