65 * (Refer to the <a |
65 * (Refer to the <a |
66 * href="http://www.unicode.org/reports/tr27/#notation"><i> |
66 * href="http://www.unicode.org/reports/tr27/#notation"><i> |
67 * definition</i></a> of the U+<i>n</i> notation in the Unicode |
67 * definition</i></a> of the U+<i>n</i> notation in the Unicode |
68 * standard.) |
68 * standard.) |
69 * |
69 * |
70 * <p>The set of characters from U+0000 to U+FFFF is sometimes |
70 * <p><a name="BMP">The set of characters from U+0000 to U+FFFF</a> is |
71 * referred to as the <em>Basic Multilingual Plane (BMP)</em>. <a |
71 * sometimes referred to as the <em>Basic Multilingual Plane (BMP)</em>. |
72 * name="supplementary">Characters</a> whose code points are greater |
72 * <a name="supplementary">Characters</a> whose code points are greater |
73 * than U+FFFF are called <em>supplementary character</em>s. The Java |
73 * than U+FFFF are called <em>supplementary character</em>s. The Java |
74 * 2 platform uses the UTF-16 representation in <code>char</code> |
74 * platform uses the UTF-16 representation in <code>char</code> arrays and |
75 * arrays and in the <code>String</code> and <code>StringBuffer</code> |
75 * in the <code>String</code> and <code>StringBuffer</code> classes. In |
76 * classes. In this representation, supplementary characters are |
76 * this representation, supplementary characters are represented as a pair |
77 * represented as a pair of <code>char</code> values, the first from |
77 * of <code>char</code> values, the first from the <em>high-surrogates</em> |
78 * the <em>high-surrogates</em> range (\uD800-\uDBFF), the |
78 * range (\uD800-\uDBFF), the second from the |
79 * second from the <em>low-surrogates</em> range |
79 * <em>low-surrogates</em> range (\uDC00-\uDFFF). |
80 * (\uDC00-\uDFFF). |
|
81 * |
80 * |
82 * <p>A <code>char</code> value, therefore, represents Basic |
81 * <p>A <code>char</code> value, therefore, represents Basic |
83 * Multilingual Plane (BMP) code points, including the surrogate |
82 * Multilingual Plane (BMP) code points, including the surrogate |
84 * code points, or code units of the UTF-16 encoding. An |
83 * code points, or code units of the UTF-16 encoding. An |
85 * <code>int</code> value represents all Unicode code points, |
84 * <code>int</code> value represents all Unicode code points, |
3922 return plane < ((MAX_CODE_POINT + 1) >>> 16); |
3921 return plane < ((MAX_CODE_POINT + 1) >>> 16); |
3923 } |
3922 } |
3924 |
3923 |
3925 /** |
3924 /** |
3926 * Determines whether the specified character (Unicode code point) |
3925 * Determines whether the specified character (Unicode code point) |
|
3926 * is in the <a href="#BMP">Basic Multilingual Plane (BMP)</a>. |
|
3927 * Such code points can be represented using a single {@code char}. |
|
3928 * |
|
3929 * @param codePoint the character (Unicode code point) to be tested |
|
3930 * @return {@code true} if the specified code point is between |
|
3931 * {@link #MIN_VALUE} and {@link #MAX_VALUE} inclusive; |
|
3932 * {@code false} otherwise. |
|
3933 * @since 1.7 |
|
3934 */ |
|
3935 public static boolean isBmpCodePoint(int codePoint) { |
|
3936 return codePoint >>> 16 == 0; |
|
3937 // Optimized form of: |
|
3938 // codePoint >= MIN_VALUE && codePoint <= MAX_VALUE |
|
3939 // We consistently use logical shift (>>>) to facilitate |
|
3940 // additional runtime optimizations. |
|
3941 } |
|
3942 |
|
3943 /** |
|
3944 * Determines whether the specified character (Unicode code point) |
3927 * is in the <a href="#supplementary">supplementary character</a> range. |
3945 * is in the <a href="#supplementary">supplementary character</a> range. |
3928 * |
3946 * |
3929 * @param codePoint the character (Unicode code point) to be tested |
3947 * @param codePoint the character (Unicode code point) to be tested |
3930 * @return {@code true} if the specified code point is between |
3948 * @return {@code true} if the specified code point is between |
3931 * {@link #MIN_SUPPLEMENTARY_CODE_POINT} and |
3949 * {@link #MIN_SUPPLEMENTARY_CODE_POINT} and |
4317 * high-surrogate value is not stored in |
4335 * high-surrogate value is not stored in |
4318 * <code>dst[dstIndex]</code>.) |
4336 * <code>dst[dstIndex]</code>.) |
4319 * @since 1.5 |
4337 * @since 1.5 |
4320 */ |
4338 */ |
4321 public static int toChars(int codePoint, char[] dst, int dstIndex) { |
4339 public static int toChars(int codePoint, char[] dst, int dstIndex) { |
4322 if (codePoint < 0 || codePoint > MAX_CODE_POINT) { |
4340 if (isBmpCodePoint(codePoint)) { |
|
4341 dst[dstIndex] = (char) codePoint; |
|
4342 return 1; |
|
4343 } else if (isValidCodePoint(codePoint)) { |
|
4344 toSurrogates(codePoint, dst, dstIndex); |
|
4345 return 2; |
|
4346 } else { |
4323 throw new IllegalArgumentException(); |
4347 throw new IllegalArgumentException(); |
4324 } |
4348 } |
4325 if (codePoint < MIN_SUPPLEMENTARY_CODE_POINT) { |
|
4326 dst[dstIndex] = (char) codePoint; |
|
4327 return 1; |
|
4328 } |
|
4329 toSurrogates(codePoint, dst, dstIndex); |
|
4330 return 2; |
|
4331 } |
4349 } |
4332 |
4350 |
4333 /** |
4351 /** |
4334 * Converts the specified character (Unicode code point) to its |
4352 * Converts the specified character (Unicode code point) to its |
4335 * UTF-16 representation stored in a <code>char</code> array. If |
4353 * UTF-16 representation stored in a <code>char</code> array. If |
4345 * @exception IllegalArgumentException if the specified |
4363 * @exception IllegalArgumentException if the specified |
4346 * <code>codePoint</code> is not a valid Unicode code point. |
4364 * <code>codePoint</code> is not a valid Unicode code point. |
4347 * @since 1.5 |
4365 * @since 1.5 |
4348 */ |
4366 */ |
4349 public static char[] toChars(int codePoint) { |
4367 public static char[] toChars(int codePoint) { |
4350 if (codePoint < 0 || codePoint > MAX_CODE_POINT) { |
4368 if (isBmpCodePoint(codePoint)) { |
|
4369 return new char[] { (char) codePoint }; |
|
4370 } else if (isValidCodePoint(codePoint)) { |
|
4371 char[] result = new char[2]; |
|
4372 toSurrogates(codePoint, result, 0); |
|
4373 return result; |
|
4374 } else { |
4351 throw new IllegalArgumentException(); |
4375 throw new IllegalArgumentException(); |
4352 } |
4376 } |
4353 if (codePoint < MIN_SUPPLEMENTARY_CODE_POINT) { |
|
4354 return new char[] { (char) codePoint }; |
|
4355 } |
|
4356 char[] result = new char[2]; |
|
4357 toSurrogates(codePoint, result, 0); |
|
4358 return result; |
|
4359 } |
4377 } |
4360 |
4378 |
4361 static void toSurrogates(int codePoint, char[] dst, int index) { |
4379 static void toSurrogates(int codePoint, char[] dst, int index) { |
4362 // We write elements "backwards" to guarantee all-or-nothing |
4380 // We write elements "backwards" to guarantee all-or-nothing |
4363 dst[index+1] = (char)((codePoint & 0x3ff) + MIN_LOW_SURROGATE); |
4381 dst[index+1] = (char)((codePoint & 0x3ff) + MIN_LOW_SURROGATE); |
6257 * @return a <code>char[]</code> with the uppercased character. |
6275 * @return a <code>char[]</code> with the uppercased character. |
6258 * @since 1.4 |
6276 * @since 1.4 |
6259 */ |
6277 */ |
6260 static char[] toUpperCaseCharArray(int codePoint) { |
6278 static char[] toUpperCaseCharArray(int codePoint) { |
6261 // As of Unicode 4.0, 1:M uppercasings only happen in the BMP. |
6279 // As of Unicode 4.0, 1:M uppercasings only happen in the BMP. |
6262 assert isValidCodePoint(codePoint) && |
6280 assert isBmpCodePoint(codePoint); |
6263 !isSupplementaryCodePoint(codePoint); |
|
6264 return CharacterData.of(codePoint).toUpperCaseCharArray(codePoint); |
6281 return CharacterData.of(codePoint).toUpperCaseCharArray(codePoint); |
6265 } |
6282 } |
6266 |
6283 |
6267 /** |
6284 /** |
6268 * The number of bits used to represent a <tt>char</tt> value in unsigned |
6285 * The number of bits used to represent a <tt>char</tt> value in unsigned |