23 * questions. |
23 * questions. |
24 */ |
24 */ |
25 |
25 |
26 package java.util.regex; |
26 package java.util.regex; |
27 |
27 |
|
28 import java.util.Objects; |
|
29 |
28 final class Grapheme { |
30 final class Grapheme { |
29 |
31 |
30 /** |
32 /** |
31 * Determines if there is an extended grapheme cluster boundary between two |
33 * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes |
32 * continuing characters {@code cp1} and {@code cp2}. |
34 * the start of the char sequence is a boundary. |
33 * <p> |
35 * <p> |
34 * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification |
36 * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification |
35 * for the extended grapheme cluster boundary rules |
37 * for the extended grapheme cluster boundary rules. The following implementation |
|
38 * is based on version 12.0 of the annex. |
|
39 * (http://www.unicode.org/reports/tr29/tr29-35.html) |
|
40 * |
|
41 * @param src the {@code CharSequence} to be scanned |
|
42 * @param off offset to start looking for the next boundary in the src |
|
43 * @param limit limit offset in the src (exclusive) |
|
44 * @return the next possible boundary |
36 */ |
45 */ |
37 static boolean isBoundary(int cp1, int cp2) { |
46 static int nextBoundary(CharSequence src, int off, int limit) { |
38 return rules[getType(cp1)][getType(cp2)]; |
47 Objects.checkFromToIndex(off, limit, src.length()); |
|
48 |
|
49 int ch0 = Character.codePointAt(src, 0); |
|
50 int ret = Character.charCount(ch0); |
|
51 int ch1; |
|
52 // indicates whether gb11 or gb12 is underway |
|
53 boolean gb11 = EmojiData.isExtendedPictographic(ch0); |
|
54 int riCount = getType(ch0) == RI ? 1 : 0; |
|
55 while (ret < limit) { |
|
56 ch1 = Character.codePointAt(src, ret); |
|
57 int t0 = getType(ch0); |
|
58 int t1 = getType(ch1); |
|
59 |
|
60 if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) { |
|
61 gb11 = false; |
|
62 } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) { |
|
63 // continue for gb12 |
|
64 } else if (rules[t0][t1]) { |
|
65 if (ret > off) { |
|
66 break; |
|
67 } else { |
|
68 gb11 = EmojiData.isExtendedPictographic(ch1); |
|
69 riCount = 0; |
|
70 } |
|
71 } |
|
72 |
|
73 riCount += getType(ch1) == RI ? 1 : 0; |
|
74 ch0 = ch1; |
|
75 ret += Character.charCount(ch1); |
|
76 } |
|
77 return ret; |
39 } |
78 } |
40 |
79 |
41 // types |
80 // types |
42 private static final int OTHER = 0; |
81 private static final int OTHER = 0; |
43 private static final int CR = 1; |
82 private static final int CR = 1; |
44 private static final int LF = 2; |
83 private static final int LF = 2; |
45 private static final int CONTROL = 3; |
84 private static final int CONTROL = 3; |
46 private static final int EXTEND = 4; |
85 private static final int EXTEND = 4; |
47 private static final int RI = 5; |
86 private static final int ZWJ = 5; |
48 private static final int PREPEND = 6; |
87 private static final int RI = 6; |
49 private static final int SPACINGMARK = 7; |
88 private static final int PREPEND = 7; |
50 private static final int L = 8; |
89 private static final int SPACINGMARK = 8; |
51 private static final int V = 9; |
90 private static final int L = 9; |
52 private static final int T = 10; |
91 private static final int V = 10; |
53 private static final int LV = 11; |
92 private static final int T = 11; |
54 private static final int LVT = 12; |
93 private static final int LV = 12; |
|
94 private static final int LVT = 13; |
|
95 private static final int EXTENDED_PICTOGRAPHIC = 14; |
55 |
96 |
56 private static final int FIRST_TYPE = 0; |
97 private static final int FIRST_TYPE = 0; |
57 private static final int LAST_TYPE = 12; |
98 private static final int LAST_TYPE = 14; |
58 |
99 |
59 private static boolean[][] rules; |
100 private static boolean[][] rules; |
60 static { |
101 static { |
61 rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1]; |
102 rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1]; |
62 // default, any + any |
103 // GB 999 Any + Any -> default |
63 for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) |
104 for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) |
64 for (int j = FIRST_TYPE; j <= LAST_TYPE; j++) |
105 for (int j = FIRST_TYPE; j <= LAST_TYPE; j++) |
65 rules[i][j] = true; |
106 rules[i][j] = true; |
66 // GB 6 L x (L | V | LV | VT) |
107 // GB 6 L x (L | V | LV | VT) |
67 rules[L][L] = false; |
108 rules[L][L] = false; |
74 rules[V][V] = false; |
115 rules[V][V] = false; |
75 rules[V][T] = false; |
116 rules[V][T] = false; |
76 // GB 8 (LVT | T) x T |
117 // GB 8 (LVT | T) x T |
77 rules[LVT][T] = false; |
118 rules[LVT][T] = false; |
78 rules[T][T] = false; |
119 rules[T][T] = false; |
79 // GB 8a RI x RI |
120 // GB 9 x (Extend|ZWJ) |
80 rules[RI][RI] = false; |
|
81 // GB 9 x Extend |
|
82 // GB 9a x Spacing Mark |
121 // GB 9a x Spacing Mark |
83 // GB 9b Prepend x |
122 // GB 9b Prepend x |
84 for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) { |
123 for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) { |
85 rules[i][EXTEND] = false; |
124 rules[i][EXTEND] = false; |
|
125 rules[i][ZWJ] = false; |
86 rules[i][SPACINGMARK] = false; |
126 rules[i][SPACINGMARK] = false; |
87 rules[PREPEND][i] = false; |
127 rules[PREPEND][i] = false; |
88 } |
128 } |
89 // GB 4 (Control | CR | LF) + |
129 // GB 4 (Control | CR | LF) + |
90 // GB 5 + (Control | CR | LF) |
130 // GB 5 + (Control | CR | LF) |
121 cp == 0xAA7B || cp == 0xAA7D; |
163 cp == 0xAA7B || cp == 0xAA7D; |
122 } |
164 } |
123 |
165 |
124 @SuppressWarnings("fallthrough") |
166 @SuppressWarnings("fallthrough") |
125 private static int getType(int cp) { |
167 private static int getType(int cp) { |
|
168 if (EmojiData.isExtendedPictographic(cp)) { |
|
169 return EXTENDED_PICTOGRAPHIC; |
|
170 } |
|
171 |
126 int type = Character.getType(cp); |
172 int type = Character.getType(cp); |
127 switch(type) { |
173 switch(type) { |
128 case Character.CONTROL: |
174 case Character.CONTROL: |
129 if (cp == 0x000D) |
175 if (cp == 0x000D) |
130 return CR; |
176 return CR; |
131 if (cp == 0x000A) |
177 if (cp == 0x000A) |
132 return LF; |
178 return LF; |
133 return CONTROL; |
179 return CONTROL; |
134 case Character.UNASSIGNED: |
180 case Character.UNASSIGNED: |
135 // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control |
181 // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control |
136 // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other" |
182 // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other" |
137 // so type it as "Other" to make the test happy |
183 // so type it as "Other" to make the test happy |
138 if (cp == 0x0378) |
184 if (cp == 0x0378) |
139 return OTHER; |
185 return OTHER; |
140 |
186 |
141 case Character.LINE_SEPARATOR: |
187 case Character.LINE_SEPARATOR: |
142 case Character.PARAGRAPH_SEPARATOR: |
188 case Character.PARAGRAPH_SEPARATOR: |
143 case Character.SURROGATE: |
189 case Character.SURROGATE: |
144 return CONTROL; |
190 return CONTROL; |
145 case Character.FORMAT: |
191 case Character.FORMAT: |
146 if (cp == 0x200C || cp == 0x200D) |
192 if (cp == 0x200C || |
|
193 cp >= 0xE0020 && cp <= 0xE007F) |
147 return EXTEND; |
194 return EXTEND; |
|
195 if (cp == 0x200D) |
|
196 return ZWJ; |
|
197 if (cp >= 0x0600 && cp <= 0x0605 || |
|
198 cp == 0x06DD || cp == 0x070F || cp == 0x08E2 || |
|
199 cp == 0x110BD || cp == 0x110CD) |
|
200 return PREPEND; |
148 return CONTROL; |
201 return CONTROL; |
149 case Character.NON_SPACING_MARK: |
202 case Character.NON_SPACING_MARK: |
150 case Character.ENCLOSING_MARK: |
203 case Character.ENCLOSING_MARK: |
151 // NOTE: |
204 // NOTE: |
152 // #tr29 "plus a few General_Category = Spacing_Mark needed for |
205 // #tr29 "plus a few General_Category = Spacing_Mark needed for |
153 // canonical equivalence." |
206 // canonical equivalence." |
154 // but for "extended grapheme clusters" support, there is no |
207 // but for "extended grapheme clusters" support, there is no |
155 // need actually to diff "extend" and "spackmark" given GB9, GB9a |
208 // need actually to diff "extend" and "spackmark" given GB9, GB9a |
156 return EXTEND; |
209 return EXTEND; |
157 case Character.COMBINING_SPACING_MARK: |
210 case Character.COMBINING_SPACING_MARK: |
158 if (isExcludedSpacingMark(cp)) |
211 if (isExcludedSpacingMark(cp)) |
159 return OTHER; |
212 return OTHER; |
160 // NOTE: |
213 // NOTE: |
161 // 0x11720 and 0x11721 are mentioned in #tr29 as |
214 // 0x11720 and 0x11721 are mentioned in #tr29 as |