101 + "< \u0E2E " // HO NOKHUK |
101 + "< \u0E2E " // HO NOKHUK |
102 |
102 |
103 // |
103 // |
104 // Normal vowels |
104 // Normal vowels |
105 // |
105 // |
|
106 + "< \u0E4D " // NIKHAHIT |
106 + "< \u0E30 " // SARA A |
107 + "< \u0E30 " // SARA A |
107 + "< \u0E31 " // MAI HAN-AKAT |
108 + "< \u0E31 " // MAI HAN-AKAT |
108 + "< \u0E32 " // SARA AA |
109 + "< \u0E32 " // SARA AA |
109 |
110 |
110 // Normalizer will decompose this character to \u0e4d\u0e32. This is |
111 // Normalizer will decompose this character to \u0e4d\u0e32. |
111 // a Bad Thing, because we want the separate characters to sort |
112 + "< \u0E33 = \u0E4D\u0E32 " // SARA AM |
112 // differently than this individual one. Since there's no public way to |
|
113 // set the decomposition to be used when creating a collator, there's |
|
114 // no way around this right now. |
|
115 // It's best to go ahead and leave the character in, because it occurs |
|
116 // this way a lot more often than it occurs as separate characters. |
|
117 + "< \u0E33 " // SARA AM |
|
118 |
113 |
119 + "< \u0E34 " // SARA I |
114 + "< \u0E34 " // SARA I |
120 |
115 |
121 + "< \u0E35 " // SARA II |
116 + "< \u0E35 " // SARA II |
122 + "< \u0E36 " // SARA UE |
117 + "< \u0E36 " // SARA UE |
131 + "< \u0E41 " // SARA AE |
126 + "< \u0E41 " // SARA AE |
132 + "< \u0E42 " // SARA O |
127 + "< \u0E42 " // SARA O |
133 + "< \u0E43 " // SARA AI MAIMUAN |
128 + "< \u0E43 " // SARA AI MAIMUAN |
134 + "< \u0E44 " // SARA AI MAIMALAI |
129 + "< \u0E44 " // SARA AI MAIMALAI |
135 |
130 |
136 // |
|
137 // Digits |
|
138 // |
|
139 + "< \u0E50 " // DIGIT ZERO |
|
140 + "< \u0E51 " // DIGIT ONE |
|
141 + "< \u0E52 " // DIGIT TWO |
|
142 + "< \u0E53 " // DIGIT THREE |
|
143 + "< \u0E54 " // DIGIT FOUR |
|
144 + "< \u0E55 " // DIGIT FIVE |
|
145 + "< \u0E56 " // DIGIT SIX |
|
146 + "< \u0E57 " // DIGIT SEVEN |
|
147 + "< \u0E58 " // DIGIT EIGHT |
|
148 + "< \u0E59 " // DIGIT NINE |
|
149 |
131 |
150 // Sorta tonal marks, but maybe not really |
132 // According to CLDR, it's after 0E44 |
151 + "< \u0E4D " // NIKHAHIT |
133 + "< \u0E3A " // PHINTHU |
152 |
134 |
153 // |
|
154 // Thai symbols are supposed to sort "after white space". |
|
155 // I'm treating this as making them sort just after the normal Latin-1 |
|
156 // symbols, which are in turn after the white space. |
|
157 // |
|
158 + "&'\u007d'" // right-brace |
|
159 + "< \u0E2F " // PAIYANNOI (ellipsis, abbreviation) |
|
160 + "< \u0E46 " // MAIYAMOK |
|
161 + "< \u0E4F " // FONGMAN |
|
162 + "< \u0E5A " // ANGKHANKHU |
|
163 + "< \u0E5B " // KHOMUT |
|
164 + "< \u0E3F " // CURRENCY SYMBOL BAHT |
|
165 |
135 |
166 // These symbols are supposed to be "after all characters" |
|
167 + "< \u0E4E " // YAMAKKAN |
|
168 |
136 |
169 // This rare symbol also comes after all characters. But when it is |
137 // This rare symbol comes after all characters. |
170 // used in combination with RU and LU, the combination is treated as |
|
171 // a separate letter, ala "CH" sorting after "C" in traditional Spanish. |
|
172 + "< \u0E45 " // LAKKHANGYAO |
138 + "< \u0E45 " // LAKKHANGYAO |
173 + "& \u0E24 < \u0E24\u0E45 " |
139 + "& \u0E32 , \0E45 " // According to CLDR, 0E45 is after 0E32 in tertiary level |
174 + "& \u0E26 < \u0E26\u0E45 " |
|
175 |
140 |
176 // Tonal marks are primary ignorables but are treated as secondary |
141 |
177 // differences |
142 |
|
143 |
|
144 // Below are Thai punctuation marks and tonal (accent) marks. According to CLDR 1.9 and |
|
145 // ISO/IEC 14651, Annex C, C.2.1 Thai ordering principles, 0E2F to 0E5B are punctuation marks that need to be ignored |
|
146 // in the first three levels. 0E4E to 0E4B are tonal marks to be compared at the secondary level. |
|
147 // In the actual implementation, punctuation marks are set at the tertiary level, as there is no fourth level in Java. |
|
148 // Set all these special marks after \u0301, the acute accent. |
178 + "& \u0301 " // acute accent |
149 + "& \u0301 " // acute accent |
|
150 |
|
151 // punctuation marks |
|
152 + ", \u0E2F " // PAIYANNOI (ellipsis, abbreviation) |
|
153 + ", \u0E46 " // MAIYAMOK |
|
154 + ", \u0E4F " // FONGMAN |
|
155 + ", \u0E5A " // ANGKHANKHU |
|
156 + ", \u0E5B " // KHOMUT |
|
157 |
|
158 //tonal marks |
|
159 + "; \u0E4E " // YAMAKKAN |
|
160 + "; \u0E4C " // THANTHAKHAT |
179 + "; \u0E47 " // MAITAIKHU |
161 + "; \u0E47 " // MAITAIKHU |
180 + "; \u0E48 " // MAI EK |
162 + "; \u0E48 " // MAI EK |
181 + "; \u0E49 " // MAI THO |
163 + "; \u0E49 " // MAI THO |
182 + "; \u0E4A " // MAI TRI |
164 + "; \u0E4A " // MAI TRI |
183 + "; \u0E4B " // MAI CHATTAWA |
165 + "; \u0E4B " // MAI CHATTAWA |
184 + "; \u0E4C " // THANTHAKHAT |
166 |
|
167 // |
|
168 // Digits are equal to their corresponding Arabic digits in the first level |
|
169 // |
|
170 + "& 0 = \u0E50 " // DIGIT ZERO |
|
171 + "& 1 = \u0E51 " // DIGIT ONE |
|
172 + "& 2 = \u0E52 " // DIGIT TWO |
|
173 + "& 3 = \u0E53 " // DIGIT THREE |
|
174 + "& 4 = \u0E54 " // DIGIT FOUR |
|
175 + "& 5 = \u0E55 " // DIGIT FIVE |
|
176 + "& 6 = \u0E56 " // DIGIT SIX |
|
177 + "& 7 = \u0E57 " // DIGIT SEVEN |
|
178 + "& 8 = \u0E58 " // DIGIT EIGHT |
|
179 + "& 9 = \u0E59 " // DIGIT NINE |
185 |
180 |
186 |
181 |
187 // These are supposed to be ignored, so I'm treating them as controls |
182 } |
188 + "& \u0001 " |
|
189 + "= \u0E3A " // PHINTHU |
|
190 + "= '.' " // period |
|
191 } |
|
192 }; |
183 }; |
193 } |
184 } |
194 } |
185 } |