# HG changeset patch # User yhuang # Date 1320121811 25200 # Node ID d9ac699afc2ac4eeb3af537167c0e27b74168019 # Parent 8fcdae2a7ec7a421d39619610a579c59af76b67c 6755060: Collator.compare() does not compare correctly for the Thai locale Reviewed-by: naoto diff -r 8fcdae2a7ec7 -r d9ac699afc2a jdk/src/share/classes/sun/text/resources/CollationData_th.java --- a/jdk/src/share/classes/sun/text/resources/CollationData_th.java Wed Aug 17 14:18:26 2011 -0700 +++ b/jdk/src/share/classes/sun/text/resources/CollationData_th.java Mon Oct 31 21:30:11 2011 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -103,18 +103,13 @@ // // Normal vowels // + + "< \u0E4D " // NIKHAHIT + "< \u0E30 " // SARA A + "< \u0E31 " // MAI HAN-AKAT + "< \u0E32 " // SARA AA - // Normalizer will decompose this character to \u0e4d\u0e32. This is - // a Bad Thing, because we want the separate characters to sort - // differently than this individual one. Since there's no public way to - // set the decomposition to be used when creating a collator, there's - // no way around this right now. - // It's best to go ahead and leave the character in, because it occurs - // this way a lot more often than it occurs as separate characters. - + "< \u0E33 " // SARA AM + // Normalizer will decompose this character to \u0e4d\u0e32. 
+ + "< \u0E33 = \u0E4D\u0E32 " // SARA AM + "< \u0E34 " // SARA I @@ -133,62 +128,58 @@ + "< \u0E43 " // SARA AI MAIMUAN + "< \u0E44 " // SARA AI MAIMALAI - // - // Digits - // - + "< \u0E50 " // DIGIT ZERO - + "< \u0E51 " // DIGIT ONE - + "< \u0E52 " // DIGIT TWO - + "< \u0E53 " // DIGIT THREE - + "< \u0E54 " // DIGIT FOUR - + "< \u0E55 " // DIGIT FIVE - + "< \u0E56 " // DIGIT SIX - + "< \u0E57 " // DIGIT SEVEN - + "< \u0E58 " // DIGIT EIGHT - + "< \u0E59 " // DIGIT NINE + + //according to CLDR, it's after 0e44 + + "< \u0E3A " // PHINTHU + + - // Sorta tonal marks, but maybe not really - + "< \u0E4D " // NIKHAHIT + // This rare symbol comes after all characters. + + "< \u0E45 " // LAKKHANGYAO + + "& \u0E32 , \u0E45 " // According to CLDR, 0E45 is after 0E32 in tertiary level + + + - // - // Thai symbols are supposed to sort "after white space". - // I'm treating this as making them sort just after the normal Latin-1 - // symbols, which are in turn after the white space. - // - + "&'\u007d'" // right-brace - + "< \u0E2F " // PAIYANNOI (ellipsis, abbreviation) - + "< \u0E46 " // MAIYAMOK - + "< \u0E4F " // FONGMAN - + "< \u0E5A " // ANGKHANKHU - + "< \u0E5B " // KHOMUT - + "< \u0E3F " // CURRENCY SYMBOL BAHT + // Below are thai punctuation marks and Tonal(Accent) marks. According to CLDR 1.9 and + // ISO/IEC 14651, Annex C, C.2.1 Thai ordering principles, 0E2F to 0E5B are punctuation marks that need to be ignored + // in the first three levels. 0E4E to 0E4B are tonal marks to be compared in secondary level. + // In real implementation, set punctuation marks in tertiary as there is no fourth level in Java. + // Set all these special marks after \u0301, the acute accent. 
+ + "& \u0301 " // acute accent - // These symbols are supposed to be "after all characters" - + "< \u0E4E " // YAMAKKAN + //punctuation marks + + ", \u0E2F " // PAIYANNOI (ellipsis, abbreviation) + + ", \u0E46 " // MAIYAMOK + + ", \u0E4F " // FONGMAN + + ", \u0E5A " // ANGKHANKHU + + ", \u0E5B " // KHOMUT - // This rare symbol also comes after all characters. But when it is - // used in combination with RU and LU, the combination is treated as - // a separate letter, ala "CH" sorting after "C" in traditional Spanish. - + "< \u0E45 " // LAKKHANGYAO - + "& \u0E24 < \u0E24\u0E45 " - + "& \u0E26 < \u0E26\u0E45 " - - // Tonal marks are primary ignorables but are treated as secondary - // differences - + "& \u0301 " // acute accent + //tonal marks + + "; \u0E4E " // YAMAKKAN + + "; \u0E4C " // THANTHAKHAT + "; \u0E47 " // MAITAIKHU + "; \u0E48 " // MAI EK + "; \u0E49 " // MAI THO + "; \u0E4A " // MAI TRI + "; \u0E4B " // MAI CHATTAWA - + "; \u0E4C " // THANTHAKHAT + + // + // Digits are equal to their corresponding Arabic digits in the first level + // + + "& 0 = \u0E50 " // DIGIT ZERO + + "& 1 = \u0E51 " // DIGIT ONE + + "& 2 = \u0E52 " // DIGIT TWO + + "& 3 = \u0E53 " // DIGIT THREE + + "& 4 = \u0E54 " // DIGIT FOUR + + "& 5 = \u0E55 " // DIGIT FIVE + + "& 6 = \u0E56 " // DIGIT SIX + + "& 7 = \u0E57 " // DIGIT SEVEN + + "& 8 = \u0E58 " // DIGIT EIGHT + + "& 9 = \u0E59 " // DIGIT NINE - // These are supposed to be ignored, so I'm treating them as controls - + "& \u0001 " - + "= \u0E3A " // PHINTHU - + "= '.' " // period - } + } }; } } diff -r 8fcdae2a7ec7 -r d9ac699afc2a jdk/test/sun/text/resources/Collator/Bug6755060.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/jdk/test/sun/text/resources/Collator/Bug6755060.java Mon Oct 31 21:30:11 2011 -0700 @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test + * @bug 6755060 + * @summary updating collation tables for thai to make it consistent with CLDR 1.9 + */ + +import java.text.*; +import java.util.*; + +public class Bug6755060 { + + /******************************************************** + *********************************************************/ + public static void main (String[] args) { + + Locale reservedLocale = Locale.getDefault(); + + try{ + + int errors=0; + + Locale loc = new Locale ("th", "TH"); // Thai + + Locale.setDefault (loc); + Collator col = Collator.getInstance (); + + /* + * The original data "data" are the data to be sorted provided by the submitter of the CR. + * It's in correct order in accord with thai collation in CLDR 1.9. If we use old Java without this fix, + * the output order will be incorrect. Correct order will be turned into incorrect order. + + * If fix is there, "data" after sorting will be unchanged, same as "sortedData". 
If fix is lost (regression), + * "data" after sorting will be changed, not as "sortedData".(not correct anymore) + + * The submitter of the CR also gives a expected "sortedData" in the CR, but it's in accord with collation in CLDR 1.4. + * His data to be sorted are actually well sorted in accord with CLDR 1.9. + */ + + String[] data = {"\u0e01", "\u0e01\u0e2f", "\u0e01\u0e46", "\u0e01\u0e4f", "\u0e01\u0e5a", "\u0e01\u0e5b", "\u0e01\u0e4e", "\u0e01\u0e4c", "\u0e01\u0e48", "\u0e01\u0e01", "\u0e01\u0e4b\u0e01", "\u0e01\u0e4d", "\u0e01\u0e30", "\u0e01\u0e31\u0e01", "\u0e01\u0e32", "\u0e01\u0e33", "\u0e01\u0e34", "\u0e01\u0e35", "\u0e01\u0e36", "\u0e01\u0e37", "\u0e01\u0e38", "\u0e01\u0e39", "\u0e40\u0e01", "\u0e40\u0e01\u0e48", "\u0e40\u0e01\u0e49", "\u0e40\u0e01\u0e4b", "\u0e41\u0e01", "\u0e42\u0e01", "\u0e43\u0e01", "\u0e44\u0e01", "\u0e01\u0e3a", "\u0e24\u0e32", "\u0e24\u0e45", "\u0e40\u0e25", "\u0e44\u0e26"}; + + String[] sortedData = {"\u0e01", "\u0e01\u0e2f", "\u0e01\u0e46", "\u0e01\u0e4f", "\u0e01\u0e5a", "\u0e01\u0e5b", "\u0e01\u0e4e", "\u0e01\u0e4c", "\u0e01\u0e48", "\u0e01\u0e01", "\u0e01\u0e4b\u0e01", "\u0e01\u0e4d", "\u0e01\u0e30", "\u0e01\u0e31\u0e01", "\u0e01\u0e32", "\u0e01\u0e33", "\u0e01\u0e34", "\u0e01\u0e35", "\u0e01\u0e36", "\u0e01\u0e37", "\u0e01\u0e38", "\u0e01\u0e39", "\u0e40\u0e01", "\u0e40\u0e01\u0e48", "\u0e40\u0e01\u0e49", "\u0e40\u0e01\u0e4b", "\u0e41\u0e01", "\u0e42\u0e01", "\u0e43\u0e01", "\u0e44\u0e01", "\u0e01\u0e3a", "\u0e24\u0e32", "\u0e24\u0e45", "\u0e40\u0e25", "\u0e44\u0e26"}; + + Arrays.sort (data, col); + + System.out.println ("Using " + loc.getDisplayName()); + for (int i = 0; i < data.length; i++) { + System.out.println(data[i] + " : " + sortedData[i]); + if (sortedData[i].compareTo(data[i]) != 0) { + errors++; + } + }//end for + + if (errors > 0){ + StringBuffer expected = new StringBuffer(), actual = new StringBuffer(); + expected.append(sortedData[0]); + actual.append(data[0]); + + for (int i=1; i