author | rgoel |
Mon, 11 Mar 2019 17:34:23 +0530 | |
changeset 54054 | 1def2d745747 |
parent 50045 | d9d55f64d136 |
permissions | -rw-r--r-- |
31680 | 1 |
/* |
54054
1def2d745747
8220414: Correct copyright headers in Norm2AllModes.java and Normalizer2.java
rgoel
parents:
50045
diff
changeset
|
2 |
* Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved. |
31680 | 3 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 |
* |
|
5 |
* This code is free software; you can redistribute it and/or modify it |
|
6 |
* under the terms of the GNU General Public License version 2 only, as |
|
7 |
* published by the Free Software Foundation. Oracle designates this |
|
8 |
* particular file as subject to the "Classpath" exception as provided |
|
9 |
* by Oracle in the LICENSE file that accompanied this code. |
|
10 |
* |
|
11 |
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
12 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
13 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
14 |
* version 2 for more details (a copy is included in the LICENSE file that |
|
15 |
* accompanied this code). |
|
16 |
* |
|
17 |
* You should have received a copy of the GNU General Public License version |
|
18 |
* 2 along with this work; if not, write to the Free Software Foundation, |
|
19 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
20 |
* |
|
21 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
22 |
* or visit www.oracle.com if you need additional information or have any |
|
23 |
* questions. |
|
24 |
*/ |
|
25 |
||
26 |
/* |
|
27 |
******************************************************************************* |
|
28 |
* Copyright (C) 2009-2014, International Business Machines |
|
29 |
* Corporation and others. All Rights Reserved. |
|
30 |
******************************************************************************* |
|
31 |
*/ |
|
32 |
||
33 |
package sun.text.normalizer; |
|
34 |
||
35 |
/** |
|
36 |
* Unicode normalization functionality for standard Unicode normalization or |
|
37 |
* for using custom mapping tables. |
|
38 |
* All instances of this class are unmodifiable/immutable. |
|
39 |
* The Normalizer2 class is not intended for public subclassing. |
|
40 |
* <p> |
|
41 |
* The primary functions are to produce a normalized string and to detect whether |
|
42 |
* a string is already normalized. |
|
43 |
* The most commonly used normalization forms are those defined in |
|
44 |
* http://www.unicode.org/unicode/reports/tr15/ |
|
45 |
* However, this API supports additional normalization forms for specialized purposes. |
|
46 |
* For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) |
|
47 |
* and can be used in implementations of UTS #46. |
|
48 |
* <p> |
|
49 |
* Not only are the standard compose and decompose modes supplied, |
|
50 |
* but additional modes are provided as documented in the Mode enum. |
|
51 |
* <p> |
|
52 |
* Some of the functions in this class identify normalization boundaries. |
|
53 |
* At a normalization boundary, the portions of the string |
|
54 |
* before it and starting from it do not interact and can be handled independently. |
|
55 |
* <p> |
|
56 |
* The spanQuickCheckYes() stops at a normalization boundary. |
|
57 |
* When the goal is a normalized string, then the text before the boundary |
|
58 |
* can be copied, and the remainder can be processed with normalizeSecondAndAppend(). |
|
59 |
* <p> |
|
60 |
* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether |
|
61 |
* a character is guaranteed to be at a normalization boundary, |
|
62 |
* regardless of context. |
|
63 |
* This is used for moving from one normalization boundary to the next |
|
64 |
* or preceding boundary, and for performing iterative normalization. |
|
65 |
* <p> |
|
66 |
* Iterative normalization is useful when only a small portion of a |
|
67 |
* longer string needs to be processed. |
|
68 |
* For example, in ICU, iterative normalization is used by the NormalizationTransliterator |
|
69 |
* (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() |
|
70 |
* (to process only the substring for which sort key bytes are computed). |
|
71 |
* <p> |
|
72 |
* The set of normalization boundaries returned by these functions may not be |
|
73 |
* complete: There may be more boundaries that could be returned. |
|
74 |
* Different functions may return different boundaries. |
|
75 |
* @stable ICU 4.4 |
|
76 |
* @author Markus W. Scherer |
|
77 |
*/ |
|
78 |
abstract class Normalizer2 { |
|
79 |
||
80 |
/** |
|
81 |
* Returns a Normalizer2 instance for Unicode NFC normalization. |
|
82 |
* Same as getInstance(null, "nfc", Mode.COMPOSE). |
|
83 |
* Returns an unmodifiable singleton instance. |
|
84 |
* @return the requested Normalizer2, if successful |
|
85 |
* @stable ICU 49 |
|
86 |
*/ |
|
87 |
public static Normalizer2 getNFCInstance() { |
|
88 |
return Norm2AllModes.getNFCInstance().comp; |
|
89 |
} |
|
90 |
||
91 |
/** |
|
92 |
* Returns a Normalizer2 instance for Unicode NFD normalization. |
|
93 |
* Same as getInstance(null, "nfc", Mode.DECOMPOSE). |
|
94 |
* Returns an unmodifiable singleton instance. |
|
95 |
* @return the requested Normalizer2, if successful |
|
96 |
* @stable ICU 49 |
|
97 |
*/ |
|
98 |
public static Normalizer2 getNFDInstance() { |
|
99 |
return Norm2AllModes.getNFCInstance().decomp; |
|
100 |
} |
|
101 |
||
102 |
/** |
|
103 |
* Returns a Normalizer2 instance for Unicode NFKC normalization. |
|
104 |
* Same as getInstance(null, "nfkc", Mode.COMPOSE). |
|
105 |
* Returns an unmodifiable singleton instance. |
|
106 |
* @return the requested Normalizer2, if successful |
|
107 |
* @stable ICU 49 |
|
108 |
*/ |
|
109 |
public static Normalizer2 getNFKCInstance() { |
|
110 |
return Norm2AllModes.getNFKCInstance().comp; |
|
111 |
} |
|
112 |
||
113 |
/** |
|
114 |
* Returns a Normalizer2 instance for Unicode NFKD normalization. |
|
115 |
* Same as getInstance(null, "nfkc", Mode.DECOMPOSE). |
|
116 |
* Returns an unmodifiable singleton instance. |
|
117 |
* @return the requested Normalizer2, if successful |
|
118 |
* @stable ICU 49 |
|
119 |
*/ |
|
120 |
public static Normalizer2 getNFKDInstance() { |
|
121 |
return Norm2AllModes.getNFKCInstance().decomp; |
|
122 |
} |
|
123 |
||
124 |
/** |
|
125 |
* Returns the normalized form of the source string. |
|
126 |
* @param src source string |
|
127 |
* @return normalized src |
|
128 |
* @stable ICU 4.4 |
|
129 |
*/ |
|
130 |
public String normalize(CharSequence src) { |
|
131 |
if(src instanceof String) { |
|
132 |
// Fastpath: Do not construct a new String if the src is a String |
|
133 |
// and is already normalized. |
|
134 |
int spanLength=spanQuickCheckYes(src); |
|
135 |
if(spanLength==src.length()) { |
|
136 |
return (String)src; |
|
137 |
} |
|
50045 | 138 |
if (spanLength != 0) { |
139 |
StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength); |
|
140 |
return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString(); |
|
141 |
} |
|
31680 | 142 |
} |
143 |
return normalize(src, new StringBuilder(src.length())).toString(); |
|
144 |
} |
|
145 |
||
146 |
/** |
|
147 |
* Writes the normalized form of the source string to the destination string |
|
148 |
* (replacing its contents) and returns the destination string. |
|
149 |
* The source and destination strings must be different objects. |
|
150 |
* @param src source string |
|
151 |
* @param dest destination string; its contents is replaced with normalized src |
|
152 |
* @return dest |
|
153 |
* @stable ICU 4.4 |
|
154 |
*/ |
|
155 |
public abstract StringBuilder normalize(CharSequence src, StringBuilder dest); |
|
156 |
||
157 |
/** |
|
158 |
* Writes the normalized form of the source string to the destination Appendable |
|
159 |
* and returns the destination Appendable. |
|
160 |
* The source and destination strings must be different objects. |
|
161 |
* |
|
162 |
* <p>Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}. |
|
163 |
* |
|
164 |
* @param src source string |
|
165 |
* @param dest destination Appendable; gets normalized src appended |
|
166 |
* @return dest |
|
167 |
* @stable ICU 4.6 |
|
168 |
*/ |
|
169 |
public abstract Appendable normalize(CharSequence src, Appendable dest); |
|
170 |
||
171 |
/** |
|
172 |
* Appends the normalized form of the second string to the first string |
|
173 |
* (merging them at the boundary) and returns the first string. |
|
174 |
* The result is normalized if the first string was normalized. |
|
175 |
* The first and second strings must be different objects. |
|
176 |
* @param first string, should be normalized |
|
177 |
* @param second string, will be normalized |
|
178 |
* @return first |
|
179 |
* @stable ICU 4.4 |
|
180 |
*/ |
|
181 |
public abstract StringBuilder normalizeSecondAndAppend( |
|
182 |
StringBuilder first, CharSequence second); |
|
183 |
||
184 |
/** |
|
185 |
* Appends the second string to the first string |
|
186 |
* (merging them at the boundary) and returns the first string. |
|
187 |
* The result is normalized if both the strings were normalized. |
|
188 |
* The first and second strings must be different objects. |
|
189 |
* @param first string, should be normalized |
|
190 |
* @param second string, should be normalized |
|
191 |
* @return first |
|
192 |
* @stable ICU 4.4 |
|
193 |
*/ |
|
194 |
public abstract StringBuilder append(StringBuilder first, CharSequence second); |
|
195 |
||
196 |
/** |
|
197 |
* Gets the decomposition mapping of c. |
|
198 |
* Roughly equivalent to normalizing the String form of c |
|
199 |
* on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function |
|
200 |
* returns null if c does not have a decomposition mapping in this instance's data. |
|
201 |
* This function is independent of the mode of the Normalizer2. |
|
202 |
* @param c code point |
|
203 |
* @return c's decomposition mapping, if any; otherwise null |
|
204 |
* @stable ICU 4.6 |
|
205 |
*/ |
|
206 |
public abstract String getDecomposition(int c); |
|
207 |
||
208 |
/** |
|
209 |
* Gets the combining class of c. |
|
210 |
* The default implementation returns 0 |
|
211 |
* but all standard implementations return the Unicode Canonical_Combining_Class value. |
|
212 |
* @param c code point |
|
213 |
* @return c's combining class |
|
214 |
* @stable ICU 49 |
|
215 |
*/ |
|
216 |
public int getCombiningClass(int c) { return 0; } |
|
217 |
||
218 |
/** |
|
219 |
* Tests if the string is normalized. |
|
220 |
* Internally, in cases where the quickCheck() method would return "maybe" |
|
221 |
* (which is only possible for the two COMPOSE modes) this method |
|
222 |
* resolves to "yes" or "no" to provide a definitive result, |
|
223 |
* at the cost of doing more work in those cases. |
|
224 |
* @param s input string |
|
225 |
* @return true if s is normalized |
|
226 |
* @stable ICU 4.4 |
|
227 |
*/ |
|
228 |
public abstract boolean isNormalized(CharSequence s); |
|
229 |
||
230 |
/** |
|
231 |
* Returns the end of the normalized substring of the input string. |
|
232 |
* In other words, with <code>end=spanQuickCheckYes(s);</code> |
|
233 |
* the substring <code>s.subSequence(0, end)</code> |
|
234 |
* will pass the quick check with a "yes" result. |
|
235 |
* <p> |
|
236 |
* The returned end index is usually one or more characters before the |
|
237 |
* "no" or "maybe" character: The end index is at a normalization boundary. |
|
238 |
* (See the class documentation for more about normalization boundaries.) |
|
239 |
* <p> |
|
240 |
* When the goal is a normalized string and most input strings are expected |
|
241 |
* to be normalized already, then call this method, |
|
242 |
* and if it returns a prefix shorter than the input string, |
|
243 |
* copy that prefix and use normalizeSecondAndAppend() for the remainder. |
|
244 |
* @param s input string |
|
245 |
* @return "yes" span end index |
|
246 |
* @stable ICU 4.4 |
|
247 |
*/ |
|
248 |
public abstract int spanQuickCheckYes(CharSequence s); |
|
249 |
||
250 |
/** |
|
251 |
* Tests if the character always has a normalization boundary before it, |
|
252 |
* regardless of context. |
|
253 |
* If true, then the character does not normalization-interact with |
|
254 |
* preceding characters. |
|
255 |
* In other words, a string containing this character can be normalized |
|
256 |
* by processing portions before this character and starting from this |
|
257 |
* character independently. |
|
258 |
* This is used for iterative normalization. See the class documentation for details. |
|
259 |
* @param c character to test |
|
260 |
* @return true if c has a normalization boundary before it |
|
261 |
* @stable ICU 4.4 |
|
262 |
*/ |
|
263 |
public abstract boolean hasBoundaryBefore(int c); |
|
264 |
||
265 |
/** |
|
266 |
* Sole constructor. (For invocation by subclass constructors, |
|
267 |
* typically implicit.) |
|
268 |
* @internal |
|
269 |
* deprecated This API is ICU internal only. |
|
270 |
*/ |
|
271 |
protected Normalizer2() { |
|
272 |
} |
|
273 |
} |