26 package java.util.regex; |
26 package java.util.regex; |
27 |
27 |
28 import java.util.Objects; |
28 import java.util.Objects; |
29 |
29 |
30 final class Grapheme { |
30 final class Grapheme { |
|
31 |
|
32 /** |
|
33 * Determines if there is an extended grapheme cluster boundary between two |
|
34 * continuing characters {@code cp1} and {@code cp2}. |
|
35 * <p> |
|
36 * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification |
|
37 * for the extended grapheme cluster boundary rules |
|
38 * <p> |
|
39 * Note: this method does not take care of stateful breaking. |
|
40 */ |
|
41 static boolean isBoundary(int cp1, int cp2) { |
|
42 return rules[getType(cp1)][getType(cp2)]; |
|
43 } |
31 |
44 |
32 /** |
45 /** |
33 * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes |
46 * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes |
34 * the start of the char sequence is a boundary. |
47 * the start of the char sequence is a boundary. |
35 * <p> |
48 * <p> |
48 |
61 |
49 int ch0 = Character.codePointAt(src, 0); |
62 int ch0 = Character.codePointAt(src, 0); |
50 int ret = Character.charCount(ch0); |
63 int ret = Character.charCount(ch0); |
51 int ch1; |
64 int ch1; |
52 // indicates whether gb11 or gb12 is underway |
65 // indicates whether gb11 or gb12 is underway |
53 boolean gb11 = EmojiData.isExtendedPictographic(ch0); |
66 int t0 = getGraphemeType(ch0); |
54 int riCount = getType(ch0) == RI ? 1 : 0; |
67 int riCount = t0 == RI ? 1 : 0; |
|
68 boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC; |
55 while (ret < limit) { |
69 while (ret < limit) { |
56 ch1 = Character.codePointAt(src, ret); |
70 ch1 = Character.codePointAt(src, ret); |
57 int t0 = getType(ch0); |
71 int t1 = getGraphemeType(ch1); |
58 int t1 = getType(ch1); |
|
59 |
72 |
60 if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) { |
73 if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) { |
61 gb11 = false; |
74 gb11 = false; |
62 } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) { |
75 } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) { |
63 // continue for gb12 |
76 // continue for gb12 |
64 } else if (rules[t0][t1]) { |
77 } else if (rules[t0][t1]) { |
65 if (ret > off) { |
78 if (ret > off) { |
66 break; |
79 break; |
67 } else { |
80 } else { |
68 gb11 = EmojiData.isExtendedPictographic(ch1); |
81 gb11 = t1 == EXTENDED_PICTOGRAPHIC; |
69 riCount = 0; |
82 riCount = 0; |
70 } |
83 } |
71 } |
84 } |
72 |
85 |
73 riCount += getType(ch1) == RI ? 1 : 0; |
86 riCount += (t1 == RI) ? 1 : 0; |
74 ch0 = ch1; |
87 t0 = t1; |
|
88 |
75 ret += Character.charCount(ch1); |
89 ret += Character.charCount(ch1); |
76 } |
90 } |
77 return ret; |
91 return ret; |
78 } |
92 } |
79 |
93 |
161 cp >= 0x109A && cp <= 0x109C || |
175 cp >= 0x109A && cp <= 0x109C || |
162 cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 || |
176 cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 || |
163 cp == 0xAA7B || cp == 0xAA7D; |
177 cp == 0xAA7B || cp == 0xAA7D; |
164 } |
178 } |
165 |
179 |
|
180 private static int getGraphemeType(int cp) { |
|
181 if (cp < 0x007F) { // ASCII |
|
182 if (cp < 32) { // Control characters |
|
183 if (cp == 0x000D) |
|
184 return CR; |
|
185 if (cp == 0x000A) |
|
186 return LF; |
|
187 return CONTROL; |
|
188 } |
|
189 return OTHER; |
|
190 } |
|
191 return getType(cp); |
|
192 } |
|
193 |
166 @SuppressWarnings("fallthrough") |
194 @SuppressWarnings("fallthrough") |
167 private static int getType(int cp) { |
195 private static int getType(int cp) { |
168 if (EmojiData.isExtendedPictographic(cp)) { |
196 if (EmojiData.isExtendedPictographic(cp)) { |
169 return EXTENDED_PICTOGRAPHIC; |
197 return EXTENDED_PICTOGRAPHIC; |
170 } |
198 } |
171 |
199 |
172 int type = Character.getType(cp); |
200 int type = Character.getType(cp); |
173 switch(type) { |
201 switch(type) { |
174 case Character.CONTROL: |
|
175 if (cp == 0x000D) |
|
176 return CR; |
|
177 if (cp == 0x000A) |
|
178 return LF; |
|
179 return CONTROL; |
|
180 case Character.UNASSIGNED: |
202 case Character.UNASSIGNED: |
181 // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control |
203 // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control |
182 // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other" |
204 // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other" |
183 // so type it as "Other" to make the test happy |
205 // so type it as "Other" to make the test happy |
184 if (cp == 0x0378) |
206 if (cp == 0x0378) |
185 return OTHER; |
207 return OTHER; |
186 |
208 |
|
209 case Character.CONTROL: |
187 case Character.LINE_SEPARATOR: |
210 case Character.LINE_SEPARATOR: |
188 case Character.PARAGRAPH_SEPARATOR: |
211 case Character.PARAGRAPH_SEPARATOR: |
189 case Character.SURROGATE: |
212 case Character.SURROGATE: |
190 return CONTROL; |
213 return CONTROL; |
191 case Character.FORMAT: |
214 case Character.FORMAT: |