author | jlaskey |
Fri, 18 May 2018 08:43:49 -0300 | |
changeset 50175 | 589ed2770141 |
parent 47216 | 71c04702a3d5 |
permissions | -rw-r--r-- |
2 | 1 |
/* |
24374
a38282cba2fc
8041791: String.toLowerCase regression - violates Unicode standard
naoto
parents:
23010
diff
changeset
|
2 |
* Copyright (c) 2003, 2014, Oracle and/or its affiliates. All rights reserved. |
2 | 3 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 |
* |
|
5 |
* This code is free software; you can redistribute it and/or modify it |
|
6 |
* under the terms of the GNU General Public License version 2 only, as |
|
7 |
* published by the Free Software Foundation. |
|
8 |
* |
|
9 |
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
10 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
11 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
12 |
* version 2 for more details (a copy is included in the LICENSE file that |
|
13 |
* accompanied this code). |
|
14 |
* |
|
15 |
* You should have received a copy of the GNU General Public License version |
|
16 |
* 2 along with this work; if not, write to the Free Software Foundation, |
|
17 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
18 |
* |
|
5506 | 19 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
20 |
* or visit www.oracle.com if you need additional information or have any |
|
21 |
* questions. |
|
2 | 22 |
*/ |
23 |
||
24 |
/* |
|
25 |
@test |
|
33663 | 26 |
@bug 4217441 4533872 4900935 8020037 8032012 8041791 8042589 8054307 |
2 | 27 |
@summary toLowerCase should lower-case Greek Sigma correctly depending |
28 |
on the context (final/non-final). Also it should handle |
|
29 |
Locale specific (lt, tr, and az) lowercasings and supplementary |
|
30 |
characters correctly. |
|
31 |
*/ |
|
32 |
||
33 |
import java.util.Locale; |
|
34 |
||
35 |
/**
 * Regression test for {@link String#toLowerCase(Locale)}.
 *
 * <p>Covers context-sensitive Greek sigma, the Lithuanian, Turkish and Azeri
 * special casings, supplementary characters, unpaired surrogates, and bulk
 * comparisons against {@link Character#toLowerCase(int)}.
 *
 * <p>Every case goes through {@link #test(String, Locale, String)}, which also
 * re-runs it with a set of fixed prefixes and suffixes. The first mismatch
 * aborts the run with a {@link RuntimeException}.
 */
public class ToLowerCase {

    /**
     * Value the bulk loops treat as "no lower-case mapping"; the original
     * test labelled this literal as Character.ERROR.
     */
    private static final int CHARACTER_ERROR = -1;

    /**
     * Pairs of {input, expected lower-case form} that {@link #test} places
     * before and after every test string: already-lower-case, mixed-case and
     * upper-case ASCII, each alone, followed by a BMP ideograph (U+4E00), by a
     * caseless supplementary character (U+10000), and by a cased supplementary
     * character (U+1041C, which lower-cases to U+10444).
     */
    private static final String[][] CONTEXTS = {
        {"abc", "abc"},
        {"aBc", "abc"},
        {"ABC", "abc"},
        {"ab\u4e00", "ab\u4e00"},
        {"aB\u4e00", "ab\u4e00"},
        {"AB\u4e00", "ab\u4e00"},
        {"ab\uD800\uDC00", "ab\uD800\uDC00"},
        {"aB\uD800\uDC00", "ab\uD800\uDC00"},
        {"AB\uD800\uDC00", "ab\uD800\uDC00"},
        {"ab\uD801\uDC1C", "ab\uD801\uDC44"},
        {"aB\uD801\uDC1C", "ab\uD801\uDC44"},
        {"AB\uD801\uDC1C", "ab\uD801\uDC44"},
    };

    /**
     * Runs every lower-casing check.
     *
     * @param args ignored
     * @throws RuntimeException on the first case whose result differs from
     *         the expected string
     */
    public static void main(String[] args) {
        // Locale.forLanguageTag yields the same language/country as the
        // Locale(String, ...) constructors, which are deprecated in newer JDKs.
        Locale turkish = Locale.forLanguageTag("tr-TR");
        Locale lt = Locale.forLanguageTag("lt"); // Lithuanian
        Locale az = Locale.forLanguageTag("az"); // Azeri

        // Greek Sigma final/non-final tests
        test("\u03A3", Locale.US, "\u03C3");
        test("LAST\u03A3", Locale.US, "last\u03C2");
        test("MID\u03A3DLE", Locale.US, "mid\u03C3dle");
        test("WORD1 \u03A3 WORD3", Locale.US, "word1 \u03C3 word3");
        test("WORD1 LAST\u03A3 WORD3", Locale.US, "word1 last\u03C2 word3");
        test("WORD1 MID\u03A3DLE WORD3", Locale.US, "word1 mid\u03C3dle word3");
        test("\u0399\u0395\u03a3\u03a5\u03a3 \u03a7\u03a1\u0399\u03a3\u03a4\u039f\u03a3", Locale.US,
             "\u03b9\u03b5\u03c3\u03c5\u03c2 \u03c7\u03c1\u03b9\u03c3\u03c4\u03bf\u03c2"); // "IESUS XRISTOS"

        // Explicit dot above for I's and J's whenever there are more accents above (Lithuanian)
        test("I", lt, "i");
        test("I\u0300", lt, "i\u0307\u0300"); // "I" followed by COMBINING GRAVE ACCENT (cc==230)
        test("I\u0316", lt, "i\u0316"); // "I" followed by COMBINING GRAVE ACCENT BELOW (cc!=230)
        test("J", lt, "j");
        test("J\u0300", lt, "j\u0307\u0300"); // "J" followed by COMBINING GRAVE ACCENT (cc==230)
        test("J\u0316", lt, "j\u0316"); // "J" followed by COMBINING GRAVE ACCENT BELOW (cc!=230)
        test("\u012E", lt, "\u012F");
        test("\u012E\u0300", lt, "\u012F\u0307\u0300"); // "I (w/ OGONEK)" followed by COMBINING GRAVE ACCENT (cc==230)
        test("\u012E\u0316", lt, "\u012F\u0316"); // "I (w/ OGONEK)" followed by COMBINING GRAVE ACCENT BELOW (cc!=230)
        test("\u00CC", lt, "i\u0307\u0300");
        test("\u00CD", lt, "i\u0307\u0301");
        test("\u0128", lt, "i\u0307\u0303");
        test("I\u0300", Locale.US, "i\u0300"); // "I" followed by COMBINING GRAVE ACCENT (cc==230)
        test("J\u0300", Locale.US, "j\u0300"); // "J" followed by COMBINING GRAVE ACCENT (cc==230)
        test("\u012E\u0300", Locale.US, "\u012F\u0300"); // "I (w/ OGONEK)" followed by COMBINING GRAVE ACCENT (cc==230)
        test("\u00CC", Locale.US, "\u00EC");
        test("\u00CD", Locale.US, "\u00ED");
        test("\u0128", Locale.US, "\u0129");

        // I-dot tests
        test("\u0130", turkish, "i");
        test("\u0130", az, "i");
        test("\u0130", lt, "\u0069\u0307");
        test("\u0130", Locale.US, "\u0069\u0307");
        test("\u0130", Locale.JAPAN, "\u0069\u0307");
        test("\u0130", Locale.ROOT, "\u0069\u0307");

        // Remove dot_above in the sequence I + dot_above (Turkish and Azeri)
        test("I\u0307", turkish, "i");
        test("I\u0307", az, "i");
        test("J\u0307", turkish, "j\u0307");
        test("J\u0307", az, "j\u0307");

        // Unless an I is before a dot_above, it turns into a dotless i (Turkish and Azeri)
        test("I", turkish, "\u0131");
        test("I", az, "\u0131");
        test("I", Locale.US, "i");
        test("IABC", turkish, "\u0131abc");
        test("IABC", az, "\u0131abc");
        test("IABC", Locale.US, "iabc");

        // Supplementary character tests
        //
        //   U+10400 ("\uD801\uDC00"): DESERET CAPITAL LETTER LONG I
        //   U+10401 ("\uD801\uDC01"): DESERET CAPITAL LETTER LONG E
        //   U+10402 ("\uD801\uDC02"): DESERET CAPITAL LETTER LONG A
        //   U+10428 ("\uD801\uDC28"): DESERET SMALL LETTER LONG I
        //   U+10429 ("\uD801\uDC29"): DESERET SMALL LETTER LONG E
        //   U+1042A ("\uD801\uDC2A"): DESERET SMALL LETTER LONG A
        //
        // valid code point tests:
        test("\uD801\uDC00\uD801\uDC01\uD801\uDC02", Locale.US, "\uD801\uDC28\uD801\uDC29\uD801\uDC2A");
        test("\uD801\uDC00A\uD801\uDC01B\uD801\uDC02C", Locale.US, "\uD801\uDC28a\uD801\uDC29b\uD801\uDC2Ac");
        // invalid code point tests:
        test("\uD800\uD800\uD801A\uDC00\uDC00\uDC00B", Locale.US, "\uD800\uD800\uD801a\uDC00\uDC00\uDC00b");

        // lower/uppercase + surrogates
        test("a\uD801\uDC1c", Locale.ROOT, "a\uD801\uDC44");
        test("A\uD801\uDC1c", Locale.ROOT, "a\uD801\uDC44");
        test("a\uD801\uDC00\uD801\uDC01\uD801\uDC02", Locale.US, "a\uD801\uDC28\uD801\uDC29\uD801\uDC2A");
        test("A\uD801\uDC00\uD801\uDC01\uD801\uDC02", Locale.US, "a\uD801\uDC28\uD801\uDC29\uD801\uDC2A");

        testBmpAndSupplementary();
        testLatin1();
        testNonLatin1ToLatin1();
    }

    /**
     * Bulk check ("bmp + supp1"): builds one string from every code point in
     * [0, 0x20000) and expects it to lower-case to the concatenation of the
     * per-code-point {@link Character#toLowerCase(int)} results.
     */
    private static void testBmpAndSupplementary() {
        StringBuilder src = new StringBuilder(0x20000);
        StringBuilder exp = new StringBuilder(0x20000);
        for (int cp = 0; cp < 0x20000; cp++) {
            // Skip high surrogates: appended on their own they would pair
            // up with the low surrogates that follow later in the string.
            if (cp >= Character.MIN_HIGH_SURROGATE && cp <= Character.MAX_HIGH_SURROGATE) {
                continue;
            }
            if (cp == 0x0130) {
                // Although UnicodeData.txt has the lower case char as U+0069, it should be
                // handled with the rules in SpecialCasing.txt, i.e., U+0069 U+0307 in
                // non Turkic locales.
                continue;
            }
            int lowerCase = Character.toLowerCase(cp);
            if (lowerCase == CHARACTER_ERROR) {
                continue;
            }
            src.appendCodePoint(cp);
            exp.appendCodePoint(lowerCase);
        }
        test(src.toString(), Locale.US, exp.toString());
    }

    /**
     * Bulk check ("latin1"): same comparison as the BMP/supplementary check,
     * restricted to code points below 0x100.
     */
    private static void testLatin1() {
        StringBuilder src = new StringBuilder(0x100);
        StringBuilder exp = new StringBuilder(0x100);
        for (int cp = 0; cp < 0x100; cp++) {
            int lowerCase = Character.toLowerCase(cp);
            if (lowerCase == CHARACTER_ERROR) {
                continue;
            }
            src.appendCodePoint(cp);
            exp.appendCodePoint(lowerCase);
        }
        test(src.toString(), Locale.US, exp.toString());
    }

    /**
     * Bulk check ("non-latin1 -> latin1"): collects the BMP code points at or
     * above 0x100 whose lower-case form is below 0x100, prefixed with "abc".
     * U+0130 is left out because its String lower-casing is a two-char
     * sequence rather than the single-char Character mapping.
     */
    private static void testNonLatin1ToLatin1() {
        StringBuilder src = new StringBuilder(0x100).append("abc");
        StringBuilder exp = new StringBuilder(0x100).append("abc");
        for (int cp = 0x100; cp < 0x10000; cp++) {
            int lowerCase = Character.toLowerCase(cp);
            if (lowerCase < 0x100 && cp != 0x0130) {
                src.appendCodePoint(cp);
                exp.appendCodePoint(lowerCase);
            }
        }
        test(src.toString(), Locale.US, exp.toString());
    }

    /**
     * Checks {@code in.toLowerCase(locale)} on its own, and again with each
     * {@link #CONTEXTS} entry placed before and after it, separated by a
     * single space.
     *
     * @param in       string to lower-case
     * @param locale   locale whose casing rules apply
     * @param expected expected lower-case form of {@code in}
     * @throws RuntimeException if any variant produces an unexpected result
     */
    static void test(String in, Locale locale, String expected) {
        test0(in, locale, expected);
        for (String[] context : CONTEXTS) {
            test0(context[0] + " " + in, locale, context[1] + " " + expected);
            test0(in + " " + context[0], locale, expected + " " + context[1]);
        }
    }

    /**
     * Performs a single lower-casing check.
     *
     * @param in       string to lower-case
     * @param locale   locale whose casing rules apply
     * @param expected expected result of {@code in.toLowerCase(locale)}
     * @throws RuntimeException if the actual result differs from
     *         {@code expected}; the message carries the same diagnostic
     *         that is written to {@code System.err}
     */
    static void test0(String in, Locale locale, String expected) {
        String result = in.toLowerCase(locale);
        if (!result.equals(expected)) {
            String message = "input: " + in + ", locale: " + locale +
                             ", expected: " + expected + ", actual: " + result;
            System.err.println(message);
            // Carry the diagnostic in the exception too, so it is not lost
            // when only the exception is reported.
            throw new RuntimeException(message);
        }
    }
}