src/java.base/share/classes/java/text/Normalizer.java
author naoto
Tue, 15 Oct 2019 09:25:59 -0700
changeset 58603 2312d1a04c49
parent 58288 48e480e56aad
permissions -rw-r--r--
8212749: DecimalFormat.setGroupingSize(int) allows setting negative grouping size 8231984: Clarify semantics of DecimalFormat.getGroupingSize(0) Reviewed-by: rriggs
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
     1
/*
58288
48e480e56aad 8231186: Replace html tag <code>foo</code> with javadoc tag {@code foo} in java.base
jboes
parents: 47216
diff changeset
     2
 * Copyright (c) 2005, 2019, Oracle and/or its affiliates. All rights reserved.
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
     3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
90ce3da70b43 Initial load
duke
parents:
diff changeset
     4
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
     5
 * This code is free software; you can redistribute it and/or modify it
90ce3da70b43 Initial load
duke
parents:
diff changeset
     6
 * under the terms of the GNU General Public License version 2 only, as
5506
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 2
diff changeset
     7
 * published by the Free Software Foundation.  Oracle designates this
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
     8
 * particular file as subject to the "Classpath" exception as provided
5506
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 2
diff changeset
     9
 * by Oracle in the LICENSE file that accompanied this code.
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    10
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    11
 * This code is distributed in the hope that it will be useful, but WITHOUT
90ce3da70b43 Initial load
duke
parents:
diff changeset
    12
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
90ce3da70b43 Initial load
duke
parents:
diff changeset
    13
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
90ce3da70b43 Initial load
duke
parents:
diff changeset
    14
 * version 2 for more details (a copy is included in the LICENSE file that
90ce3da70b43 Initial load
duke
parents:
diff changeset
    15
 * accompanied this code).
90ce3da70b43 Initial load
duke
parents:
diff changeset
    16
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    17
 * You should have received a copy of the GNU General Public License version
90ce3da70b43 Initial load
duke
parents:
diff changeset
    18
 * 2 along with this work; if not, write to the Free Software Foundation,
90ce3da70b43 Initial load
duke
parents:
diff changeset
    19
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    20
 *
5506
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 2
diff changeset
    21
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 2
diff changeset
    22
 * or visit www.oracle.com if you need additional information or have any
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 2
diff changeset
    23
 * questions.
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    24
 */
90ce3da70b43 Initial load
duke
parents:
diff changeset
    25
90ce3da70b43 Initial load
duke
parents:
diff changeset
    26
/*
90ce3da70b43 Initial load
duke
parents:
diff changeset
    27
 *******************************************************************************
90ce3da70b43 Initial load
duke
parents:
diff changeset
    28
 * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                     *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    29
 *                                                                             *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    30
 * The original version of this source code and documentation is copyrighted   *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    31
 * and owned by IBM, These materials are provided under terms of a License     *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    32
 * Agreement between IBM and Sun. This technology is protected by multiple     *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    33
 * US and International patents. This notice and attribution to IBM may not    *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    34
 * to removed.                                                                 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    35
 *******************************************************************************
90ce3da70b43 Initial load
duke
parents:
diff changeset
    36
 */
90ce3da70b43 Initial load
duke
parents:
diff changeset
    37
90ce3da70b43 Initial load
duke
parents:
diff changeset
    38
package java.text;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    39
90ce3da70b43 Initial load
duke
parents:
diff changeset
    40
import sun.text.normalizer.NormalizerBase;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    41
90ce3da70b43 Initial load
duke
parents:
diff changeset
    42
/**
58288
48e480e56aad 8231186: Replace html tag <code>foo</code> with javadoc tag {@code foo} in java.base
jboes
parents: 47216
diff changeset
    43
 * This class provides the method {@code normalize} which transforms Unicode
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    44
 * text into an equivalent composed or decomposed form, allowing for easier
90ce3da70b43 Initial load
duke
parents:
diff changeset
    45
 * sorting and searching of text.
58288
48e480e56aad 8231186: Replace html tag <code>foo</code> with javadoc tag {@code foo} in java.base
jboes
parents: 47216
diff changeset
    46
 * The {@code normalize} method supports the standard normalization forms
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    47
 * described in
90ce3da70b43 Initial load
duke
parents:
diff changeset
    48
 * <a href="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
90ce3da70b43 Initial load
duke
parents:
diff changeset
    49
 * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    50
 * <p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    51
 * Characters with accents or other adornments can be encoded in
90ce3da70b43 Initial load
duke
parents:
diff changeset
    52
 * several different ways in Unicode.  For example, take the character A-acute.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    53
 * In Unicode, this can be encoded as a single character (the "composed" form):
90ce3da70b43 Initial load
duke
parents:
diff changeset
    54
 *
21334
c60dfce46a77 8026982: javadoc errors in core libs
rriggs
parents: 19054
diff changeset
    55
 * <pre>
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    56
 *      U+00C1    LATIN CAPITAL LETTER A WITH ACUTE</pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    57
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    58
 * or as two separate characters (the "decomposed" form):
90ce3da70b43 Initial load
duke
parents:
diff changeset
    59
 *
21334
c60dfce46a77 8026982: javadoc errors in core libs
rriggs
parents: 19054
diff changeset
    60
 * <pre>
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    61
 *      U+0041    LATIN CAPITAL LETTER A
90ce3da70b43 Initial load
duke
parents:
diff changeset
    62
 *      U+0301    COMBINING ACUTE ACCENT</pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    63
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    64
 * To a user of your program, however, both of these sequences should be
90ce3da70b43 Initial load
duke
parents:
diff changeset
    65
 * treated as the same "user-level" character "A with acute accent".  When you
90ce3da70b43 Initial load
duke
parents:
diff changeset
    66
 * are searching or comparing text, you must ensure that these two sequences are
90ce3da70b43 Initial load
duke
parents:
diff changeset
    67
 * treated as equivalent.  In addition, you must handle characters with more than
90ce3da70b43 Initial load
duke
parents:
diff changeset
    68
 * one accent. Sometimes the order of a character's combining accents is
90ce3da70b43 Initial load
duke
parents:
diff changeset
    69
 * significant, while in other cases accent sequences in different orders are
90ce3da70b43 Initial load
duke
parents:
diff changeset
    70
 * really equivalent.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    71
 * <p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    72
 * Similarly, the string "ffi" can be encoded as three separate letters:
90ce3da70b43 Initial load
duke
parents:
diff changeset
    73
 *
21334
c60dfce46a77 8026982: javadoc errors in core libs
rriggs
parents: 19054
diff changeset
    74
 * <pre>
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    75
 *      U+0066    LATIN SMALL LETTER F
90ce3da70b43 Initial load
duke
parents:
diff changeset
    76
 *      U+0066    LATIN SMALL LETTER F
90ce3da70b43 Initial load
duke
parents:
diff changeset
    77
 *      U+0069    LATIN SMALL LETTER I</pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    78
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    79
 * or as the single character
90ce3da70b43 Initial load
duke
parents:
diff changeset
    80
 *
21334
c60dfce46a77 8026982: javadoc errors in core libs
rriggs
parents: 19054
diff changeset
    81
 * <pre>
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    82
 *      U+FB03    LATIN SMALL LIGATURE FFI</pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    83
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    84
 * The ffi ligature is not a distinct semantic character, and strictly speaking
90ce3da70b43 Initial load
duke
parents:
diff changeset
    85
 * it shouldn't be in Unicode at all, but it was included for compatibility
90ce3da70b43 Initial load
duke
parents:
diff changeset
    86
 * with existing character sets that already provided it.  The Unicode standard
90ce3da70b43 Initial load
duke
parents:
diff changeset
    87
 * identifies such characters by giving them "compatibility" decompositions
90ce3da70b43 Initial load
duke
parents:
diff changeset
    88
 * into the corresponding semantic characters.  When sorting and searching, you
90ce3da70b43 Initial load
duke
parents:
diff changeset
    89
 * will often want to use these mappings.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    90
 * <p>
58288
48e480e56aad 8231186: Replace html tag <code>foo</code> with javadoc tag {@code foo} in java.base
jboes
parents: 47216
diff changeset
    91
 * The {@code normalize} method helps solve these problems by transforming
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    92
 * text into the canonical composed and decomposed forms as shown in the first
90ce3da70b43 Initial load
duke
parents:
diff changeset
    93
 * example above. In addition, you can have it perform compatibility
90ce3da70b43 Initial load
duke
parents:
diff changeset
    94
 * decompositions so that you can treat compatibility characters the same as
90ce3da70b43 Initial load
duke
parents:
diff changeset
    95
 * their equivalents.
58288
48e480e56aad 8231186: Replace html tag <code>foo</code> with javadoc tag {@code foo} in java.base
jboes
parents: 47216
diff changeset
    96
 * Finally, the {@code normalize} method rearranges accents into the
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    97
 * proper canonical order, so that you do not have to worry about accent
90ce3da70b43 Initial load
duke
parents:
diff changeset
    98
 * rearrangement on your own.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    99
 * <p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   100
 * The W3C generally recommends exchanging text in NFC.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   101
 * Note also that most legacy character encodings use only precomposed forms and
90ce3da70b43 Initial load
duke
parents:
diff changeset
   102
 * often do not encode any combining marks by themselves. For conversion to such
90ce3da70b43 Initial load
duke
parents:
diff changeset
   103
 * character encodings the Unicode text needs to be normalized to NFC.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   104
 * For more usage examples, see the Unicode Standard Annex.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   105
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
   106
 * @since 1.6
90ce3da70b43 Initial load
duke
parents:
diff changeset
   107
 */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   108
public final class Normalizer {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   109
90ce3da70b43 Initial load
duke
parents:
diff changeset
   110
   // Private so the class cannot be instantiated from outside; its visible API consists of static methods.
   private Normalizer() {};
90ce3da70b43 Initial load
duke
parents:
diff changeset
   111
90ce3da70b43 Initial load
duke
parents:
diff changeset
   112
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   113
     * This enum provides constants of the four Unicode normalization forms
90ce3da70b43 Initial load
duke
parents:
diff changeset
   114
     * that are described in
90ce3da70b43 Initial load
duke
parents:
diff changeset
   115
     * <a href="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
90ce3da70b43 Initial load
duke
parents:
diff changeset
   116
     * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   117
     * and two methods to access them.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   118
     *
90ce3da70b43 Initial load
duke
parents:
diff changeset
   119
     * @since 1.6
90ce3da70b43 Initial load
duke
parents:
diff changeset
   120
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   121
    public static enum Form {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   122
90ce3da70b43 Initial load
duke
parents:
diff changeset
   123
        // The static methods of the enclosing class forward the chosen constant,
        // unchanged, to NormalizerBase together with the text to process.
        /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   124
         * Canonical decomposition.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   125
         */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   126
        NFD,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   127
90ce3da70b43 Initial load
duke
parents:
diff changeset
   128
        /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   129
         * Canonical decomposition, followed by canonical composition.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   130
         */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   131
        NFC,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   132
90ce3da70b43 Initial load
duke
parents:
diff changeset
   133
        /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   134
         * Compatibility decomposition.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   135
         */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   136
        NFKD,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   137
90ce3da70b43 Initial load
duke
parents:
diff changeset
   138
        /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   139
         * Compatibility decomposition, followed by canonical composition.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   140
         */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   141
        NFKC
90ce3da70b43 Initial load
duke
parents:
diff changeset
   142
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   143
90ce3da70b43 Initial load
duke
parents:
diff changeset
   144
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   145
     * Normalize a sequence of char values.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   146
     * The sequence will be normalized according to the specified normalization
90ce3da70b43 Initial load
duke
parents:
diff changeset
   147
     * form.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   148
     * @param src        The sequence of char values to normalize.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   149
     * @param form       The normalization form; one of
90ce3da70b43 Initial load
duke
parents:
diff changeset
   150
     *                   {@link java.text.Normalizer.Form#NFC},
90ce3da70b43 Initial load
duke
parents:
diff changeset
   151
     *                   {@link java.text.Normalizer.Form#NFD},
90ce3da70b43 Initial load
duke
parents:
diff changeset
   152
     *                   {@link java.text.Normalizer.Form#NFKC},
90ce3da70b43 Initial load
duke
parents:
diff changeset
   153
     *                   {@link java.text.Normalizer.Form#NFKD}
90ce3da70b43 Initial load
duke
parents:
diff changeset
   154
     * @return The normalized String
58288
48e480e56aad 8231186: Replace html tag <code>foo</code> with javadoc tag {@code foo} in java.base
jboes
parents: 47216
diff changeset
   155
     * @throws NullPointerException If {@code src} or {@code form}
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   156
     * is null.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   157
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   158
    public static String normalize(CharSequence src, Form form) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   159
        // The CharSequence is converted to a String before delegating to NormalizerBase;
        // calling toString() on a null src is what raises the documented NullPointerException.
        // NOTE(review): a null form is presumably rejected inside NormalizerBase - confirm.
        return NormalizerBase.normalize(src.toString(), form);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   160
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   161
90ce3da70b43 Initial load
duke
parents:
diff changeset
   162
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   163
     * Determines if the given sequence of char values is normalized.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   164
     * @param src        The sequence of char values to be checked.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   165
     * @param form       The normalization form; one of
90ce3da70b43 Initial load
duke
parents:
diff changeset
   166
     *                   {@link java.text.Normalizer.Form#NFC},
90ce3da70b43 Initial load
duke
parents:
diff changeset
   167
     *                   {@link java.text.Normalizer.Form#NFD},
90ce3da70b43 Initial load
duke
parents:
diff changeset
   168
     *                   {@link java.text.Normalizer.Form#NFKC},
90ce3da70b43 Initial load
duke
parents:
diff changeset
   169
     *                   {@link java.text.Normalizer.Form#NFKD}
90ce3da70b43 Initial load
duke
parents:
diff changeset
   170
     * @return true if the sequence of char values is normalized;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   171
     * false otherwise.
58288
48e480e56aad 8231186: Replace html tag <code>foo</code> with javadoc tag {@code foo} in java.base
jboes
parents: 47216
diff changeset
   172
     * @throws NullPointerException If {@code src} or {@code form}
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   173
     * is null.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   174
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   175
    public static boolean isNormalized(CharSequence src, Form form) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   176
        // The CharSequence is converted to a String before delegating to NormalizerBase;
        // calling toString() on a null src is what raises the documented NullPointerException.
        // NOTE(review): a null form is presumably rejected inside NormalizerBase - confirm.
        return NormalizerBase.isNormalized(src.toString(), form);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   177
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   178
}