jdk/src/share/classes/java/text/Normalizer.java
author alanb
Thu, 18 Aug 2011 16:47:20 +0100
changeset 10347 1c9efe1ec7d3
parent 5506 202f599c92aa
child 19054 a64012cb49d6
permissions -rw-r--r--
7015589: (spec) BufferedWriter.close leaves stream open if close of underlying Writer fails Reviewed-by: forax, mduigou
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
     1
/*
5506
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 2
diff changeset
     2
 * Copyright (c) 2005, 2006, Oracle and/or its affiliates. All rights reserved.
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
     3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
90ce3da70b43 Initial load
duke
parents:
diff changeset
     4
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
     5
 * This code is free software; you can redistribute it and/or modify it
90ce3da70b43 Initial load
duke
parents:
diff changeset
     6
 * under the terms of the GNU General Public License version 2 only, as
5506
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 2
diff changeset
     7
 * published by the Free Software Foundation.  Oracle designates this
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
     8
 * particular file as subject to the "Classpath" exception as provided
5506
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 2
diff changeset
     9
 * by Oracle in the LICENSE file that accompanied this code.
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    10
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    11
 * This code is distributed in the hope that it will be useful, but WITHOUT
90ce3da70b43 Initial load
duke
parents:
diff changeset
    12
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
90ce3da70b43 Initial load
duke
parents:
diff changeset
    13
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
90ce3da70b43 Initial load
duke
parents:
diff changeset
    14
 * version 2 for more details (a copy is included in the LICENSE file that
90ce3da70b43 Initial load
duke
parents:
diff changeset
    15
 * accompanied this code).
90ce3da70b43 Initial load
duke
parents:
diff changeset
    16
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    17
 * You should have received a copy of the GNU General Public License version
90ce3da70b43 Initial load
duke
parents:
diff changeset
    18
 * 2 along with this work; if not, write to the Free Software Foundation,
90ce3da70b43 Initial load
duke
parents:
diff changeset
    19
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    20
 *
5506
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 2
diff changeset
    21
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 2
diff changeset
    22
 * or visit www.oracle.com if you need additional information or have any
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 2
diff changeset
    23
 * questions.
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    24
 */
90ce3da70b43 Initial load
duke
parents:
diff changeset
    25
90ce3da70b43 Initial load
duke
parents:
diff changeset
    26
/*
90ce3da70b43 Initial load
duke
parents:
diff changeset
    27
 *******************************************************************************
90ce3da70b43 Initial load
duke
parents:
diff changeset
    28
 * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                     *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    29
 *                                                                             *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    30
 * The original version of this source code and documentation is copyrighted   *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    31
 * and owned by IBM, These materials are provided under terms of a License     *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    32
 * Agreement between IBM and Sun. This technology is protected by multiple     *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    33
 * US and International patents. This notice and attribution to IBM may not    *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    34
 * to removed.                                                                 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    35
 *******************************************************************************
90ce3da70b43 Initial load
duke
parents:
diff changeset
    36
 */
90ce3da70b43 Initial load
duke
parents:
diff changeset
    37
90ce3da70b43 Initial load
duke
parents:
diff changeset
    38
package java.text;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    39
90ce3da70b43 Initial load
duke
parents:
diff changeset
    40
import sun.text.normalizer.NormalizerBase;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    41
import sun.text.normalizer.NormalizerImpl;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    42
90ce3da70b43 Initial load
duke
parents:
diff changeset
    43
/**
90ce3da70b43 Initial load
duke
parents:
diff changeset
    44
 * This class provides the method <code>normalize</code> which transforms Unicode
90ce3da70b43 Initial load
duke
parents:
diff changeset
    45
 * text into an equivalent composed or decomposed form, allowing for easier
90ce3da70b43 Initial load
duke
parents:
diff changeset
    46
 * sorting and searching of text.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    47
 * The <code>normalize</code> method supports the standard normalization forms
90ce3da70b43 Initial load
duke
parents:
diff changeset
    48
 * described in
90ce3da70b43 Initial load
duke
parents:
diff changeset
    49
 * <a href="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
90ce3da70b43 Initial load
duke
parents:
diff changeset
    50
 * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    51
 * <p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    52
 * Characters with accents or other adornments can be encoded in
90ce3da70b43 Initial load
duke
parents:
diff changeset
    53
 * several different ways in Unicode.  For example, take the character A-acute.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    54
 * In Unicode, this can be encoded as a single character (the "composed" form):
90ce3da70b43 Initial load
duke
parents:
diff changeset
    55
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    56
 * <p><pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    57
 *      U+00C1    LATIN CAPITAL LETTER A WITH ACUTE</pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    58
 * </p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    59
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    60
 * or as two separate characters (the "decomposed" form):
90ce3da70b43 Initial load
duke
parents:
diff changeset
    61
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    62
 * <p><pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    63
 *      U+0041    LATIN CAPITAL LETTER A
90ce3da70b43 Initial load
duke
parents:
diff changeset
    64
 *      U+0301    COMBINING ACUTE ACCENT</pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    65
 * </p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    66
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    67
 * To a user of your program, however, both of these sequences should be
90ce3da70b43 Initial load
duke
parents:
diff changeset
    68
 * treated as the same "user-level" character "A with acute accent".  When you
90ce3da70b43 Initial load
duke
parents:
diff changeset
    69
 * are searching or comparing text, you must ensure that these two sequences are
90ce3da70b43 Initial load
duke
parents:
diff changeset
    70
 * treated as equivalent.  In addition, you must handle characters with more than
90ce3da70b43 Initial load
duke
parents:
diff changeset
    71
 * one accent. Sometimes the order of a character's combining accents is
90ce3da70b43 Initial load
duke
parents:
diff changeset
    72
 * significant, while in other cases accent sequences in different orders are
90ce3da70b43 Initial load
duke
parents:
diff changeset
    73
 * really equivalent.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    74
 * <p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    75
 * Similarly, the string "ffi" can be encoded as three separate letters:
90ce3da70b43 Initial load
duke
parents:
diff changeset
    76
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    77
 * <p><pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    78
 *      U+0066    LATIN SMALL LETTER F
90ce3da70b43 Initial load
duke
parents:
diff changeset
    79
 *      U+0066    LATIN SMALL LETTER F
90ce3da70b43 Initial load
duke
parents:
diff changeset
    80
 *      U+0069    LATIN SMALL LETTER I</pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    81
 * </p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    82
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    83
 * or as the single character
90ce3da70b43 Initial load
duke
parents:
diff changeset
    84
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    85
 * <p><pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    86
 *      U+FB03    LATIN SMALL LIGATURE FFI</pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    87
 * </p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    88
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    89
 * The ffi ligature is not a distinct semantic character, and strictly speaking
90ce3da70b43 Initial load
duke
parents:
diff changeset
    90
 * it shouldn't be in Unicode at all, but it was included for compatibility
90ce3da70b43 Initial load
duke
parents:
diff changeset
    91
 * with existing character sets that already provided it.  The Unicode standard
90ce3da70b43 Initial load
duke
parents:
diff changeset
    92
 * identifies such characters by giving them "compatibility" decompositions
90ce3da70b43 Initial load
duke
parents:
diff changeset
    93
 * into the corresponding semantic characters.  When sorting and searching, you
90ce3da70b43 Initial load
duke
parents:
diff changeset
    94
 * will often want to use these mappings.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    95
 * <p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    96
 * The <code>normalize</code> method helps solve these problems by transforming
90ce3da70b43 Initial load
duke
parents:
diff changeset
    97
 * text into the canonical composed and decomposed forms as shown in the first
90ce3da70b43 Initial load
duke
parents:
diff changeset
    98
 * example above. In addition, you can have it perform compatibility
90ce3da70b43 Initial load
duke
parents:
diff changeset
    99
 * decompositions so that you can treat compatibility characters the same as
90ce3da70b43 Initial load
duke
parents:
diff changeset
   100
 * their equivalents.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   101
 * Finally, the <code>normalize</code> method rearranges accents into the
90ce3da70b43 Initial load
duke
parents:
diff changeset
   102
 * proper canonical order, so that you do not have to worry about accent
90ce3da70b43 Initial load
duke
parents:
diff changeset
   103
 * rearrangement on your own.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   104
 * <p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   105
 * The W3C generally recommends exchanging text in NFC.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   106
 * Note also that most legacy character encodings use only precomposed forms and
90ce3da70b43 Initial load
duke
parents:
diff changeset
   107
 * often do not encode any combining marks by themselves. For conversion to such
90ce3da70b43 Initial load
duke
parents:
diff changeset
   108
 * character encodings the Unicode text needs to be normalized to NFC.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   109
 * For more usage examples, see the Unicode Standard Annex.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   110
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
   111
 * @since 1.6
90ce3da70b43 Initial load
duke
parents:
diff changeset
   112
 */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   113
public final class Normalizer {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   114
90ce3da70b43 Initial load
duke
parents:
diff changeset
   115
   private Normalizer() {};
90ce3da70b43 Initial load
duke
parents:
diff changeset
   116
90ce3da70b43 Initial load
duke
parents:
diff changeset
   117
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   118
     * This enum provides constants of the four Unicode normalization forms
90ce3da70b43 Initial load
duke
parents:
diff changeset
   119
     * that are described in
90ce3da70b43 Initial load
duke
parents:
diff changeset
   120
     * <a href="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
90ce3da70b43 Initial load
duke
parents:
diff changeset
   121
     * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   122
     * and two methods to access them.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   123
     *
90ce3da70b43 Initial load
duke
parents:
diff changeset
   124
     * @since 1.6
90ce3da70b43 Initial load
duke
parents:
diff changeset
   125
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   126
    public static enum Form {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   127
90ce3da70b43 Initial load
duke
parents:
diff changeset
   128
        /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   129
         * Canonical decomposition.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   130
         */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   131
        NFD,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   132
90ce3da70b43 Initial load
duke
parents:
diff changeset
   133
        /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   134
         * Canonical decomposition, followed by canonical composition.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   135
         */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   136
        NFC,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   137
90ce3da70b43 Initial load
duke
parents:
diff changeset
   138
        /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   139
         * Compatibility decomposition.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   140
         */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   141
        NFKD,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   142
90ce3da70b43 Initial load
duke
parents:
diff changeset
   143
        /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   144
         * Compatibility decomposition, followed by canonical composition.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   145
         */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   146
        NFKC
90ce3da70b43 Initial load
duke
parents:
diff changeset
   147
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   148
90ce3da70b43 Initial load
duke
parents:
diff changeset
   149
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   150
     * Normalize a sequence of char values.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   151
     * The sequence will be normalized according to the specified normalization
90ce3da70b43 Initial load
duke
parents:
diff changeset
   152
     * from.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   153
     * @param src        The sequence of char values to normalize.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   154
     * @param form       The normalization form; one of
90ce3da70b43 Initial load
duke
parents:
diff changeset
   155
     *                   {@link java.text.Normalizer.Form#NFC},
90ce3da70b43 Initial load
duke
parents:
diff changeset
   156
     *                   {@link java.text.Normalizer.Form#NFD},
90ce3da70b43 Initial load
duke
parents:
diff changeset
   157
     *                   {@link java.text.Normalizer.Form#NFKC},
90ce3da70b43 Initial load
duke
parents:
diff changeset
   158
     *                   {@link java.text.Normalizer.Form#NFKD}
90ce3da70b43 Initial load
duke
parents:
diff changeset
   159
     * @return The normalized String
90ce3da70b43 Initial load
duke
parents:
diff changeset
   160
     * @throws NullPointerException If <code>src</code> or <code>form</code>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   161
     * is null.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   162
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   163
    public static String normalize(CharSequence src, Form form) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   164
        return NormalizerBase.normalize(src.toString(), form);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   165
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   166
90ce3da70b43 Initial load
duke
parents:
diff changeset
   167
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   168
     * Determines if the given sequence of char values is normalized.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   169
     * @param src        The sequence of char values to be checked.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   170
     * @param form       The normalization form; one of
90ce3da70b43 Initial load
duke
parents:
diff changeset
   171
     *                   {@link java.text.Normalizer.Form#NFC},
90ce3da70b43 Initial load
duke
parents:
diff changeset
   172
     *                   {@link java.text.Normalizer.Form#NFD},
90ce3da70b43 Initial load
duke
parents:
diff changeset
   173
     *                   {@link java.text.Normalizer.Form#NFKC},
90ce3da70b43 Initial load
duke
parents:
diff changeset
   174
     *                   {@link java.text.Normalizer.Form#NFKD}
90ce3da70b43 Initial load
duke
parents:
diff changeset
   175
     * @return true if the sequence of char values is normalized;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   176
     * false otherwise.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   177
     * @throws NullPointerException If <code>src</code> or <code>form</code>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   178
     * is null.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   179
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   180
    public static boolean isNormalized(CharSequence src, Form form) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   181
        return NormalizerBase.isNormalized(src.toString(), form);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   182
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   183
}