src/java.base/share/classes/java/util/regex/Grapheme.java
changeset 47216 71c04702a3d5
parent 35783 2690535d72cc
child 55013 8dae495a59e7
equal deleted inserted replaced
47215:4ebc2e2fb97c 47216:71c04702a3d5
       
     1 /*
       
     2  * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.  Oracle designates this
       
     8  * particular file as subject to the "Classpath" exception as provided
       
     9  * by Oracle in the LICENSE file that accompanied this code.
       
    10  *
       
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    14  * version 2 for more details (a copy is included in the LICENSE file that
       
    15  * accompanied this code).
       
    16  *
       
    17  * You should have received a copy of the GNU General Public License version
       
    18  * 2 along with this work; if not, write to the Free Software Foundation,
       
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    20  *
       
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    22  * or visit www.oracle.com if you need additional information or have any
       
    23  * questions.
       
    24  */
       
    25 
       
    26 package java.util.regex;
       
    27 
       
    28 final class Grapheme {
       
    29 
       
    30     /**
       
    31      * Determines if there is an extended  grapheme cluster boundary between two
       
    32      * continuing characters {@code cp1} and {@code cp2}.
       
    33      * <p>
       
    34      * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
       
    35      * for the extended grapheme cluster boundary rules
       
    36      */
       
    37     static boolean isBoundary(int cp1, int cp2) {
       
    38         return rules[getType(cp1)][getType(cp2)];
       
    39     }
       
    40 
       
    41     // types
       
    42     private static final int OTHER = 0;
       
    43     private static final int CR = 1;
       
    44     private static final int LF = 2;
       
    45     private static final int CONTROL = 3;
       
    46     private static final int EXTEND = 4;
       
    47     private static final int RI = 5;
       
    48     private static final int PREPEND = 6;
       
    49     private static final int SPACINGMARK = 7;
       
    50     private static final int L = 8;
       
    51     private static final int V = 9;
       
    52     private static final int T = 10;
       
    53     private static final int LV = 11;
       
    54     private static final int LVT = 12;
       
    55 
       
    56     private static final int FIRST_TYPE = 0;
       
    57     private static final int LAST_TYPE = 12;
       
    58 
       
    59     private static boolean[][] rules;
       
    60     static {
       
    61         rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1];
       
    62         // default, any + any
       
    63         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++)
       
    64             for (int j = FIRST_TYPE; j <= LAST_TYPE; j++)
       
    65                 rules[i][j] = true;
       
    66         // GB 6 L x (L | V | LV | VT)
       
    67         rules[L][L] = false;
       
    68         rules[L][V] = false;
       
    69         rules[L][LV] = false;
       
    70         rules[L][LVT] = false;
       
    71         // GB 7 (LV | V) x (V | T)
       
    72         rules[LV][V] = false;
       
    73         rules[LV][T] = false;
       
    74         rules[V][V] = false;
       
    75         rules[V][T] = false;
       
    76         // GB 8 (LVT | T) x T
       
    77         rules[LVT][T] = false;
       
    78         rules[T][T] = false;
       
    79         // GB 8a RI x RI
       
    80         rules[RI][RI] = false;
       
    81         // GB 9 x Extend
       
    82         // GB 9a x Spacing Mark
       
    83         // GB 9b Prepend x
       
    84         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) {
       
    85             rules[i][EXTEND] = false;
       
    86             rules[i][SPACINGMARK] = false;
       
    87             rules[PREPEND][i] = false;
       
    88         }
       
    89         // GB 4  (Control | CR | LF) +
       
    90         // GB 5  + (Control | CR | LF)
       
    91         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++)
       
    92             for (int j = CR; j <= CONTROL; j++) {
       
    93                 rules[i][j] = true;
       
    94                 rules[j][i] = true;
       
    95             }
       
    96         // GB 3 CR x LF
       
    97         rules[CR][LF] = false;
       
    98         // GB 10 Any + Any  -> default
       
    99     }
       
   100 
       
   101     // Hangul syllables
       
   102     private static final int SYLLABLE_BASE = 0xAC00;
       
   103     private static final int LCOUNT = 19;
       
   104     private static final int VCOUNT = 21;
       
   105     private static final int TCOUNT = 28;
       
   106     private static final int NCOUNT = VCOUNT * TCOUNT; // 588
       
   107     private static final int SCOUNT = LCOUNT * NCOUNT; // 11172
       
   108 
       
   109     // #tr29: SpacingMark exceptions: The following (which have
       
   110     // General_Category = Spacing_Mark and would otherwise be included)
       
   111     // are specifically excluded
       
   112     private static boolean isExcludedSpacingMark(int cp) {
       
   113        return  cp == 0x102B || cp == 0x102C || cp == 0x1038 ||
       
   114                cp >= 0x1062 && cp <= 0x1064 ||
       
   115                cp >= 0x1062 && cp <= 0x106D ||
       
   116                cp == 0x1083 ||
       
   117                cp >= 0x1087 && cp <= 0x108C ||
       
   118                cp == 0x108F ||
       
   119                cp >= 0x109A && cp <= 0x109C ||
       
   120                cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 ||
       
   121                cp == 0xAA7B || cp == 0xAA7D;
       
   122     }
       
   123 
       
   124     @SuppressWarnings("fallthrough")
       
   125     private static int getType(int cp) {
       
   126         int type = Character.getType(cp);
       
   127         switch(type) {
       
   128         case Character.CONTROL:
       
   129             if (cp == 0x000D)
       
   130                 return CR;
       
   131             if (cp == 0x000A)
       
   132                 return LF;
       
   133             return CONTROL;
       
   134          case Character.UNASSIGNED:
       
   135             // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
       
   136             // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
       
   137             // so type it as "Other" to make the test happy
       
   138              if (cp == 0x0378)
       
   139                  return OTHER;
       
   140 
       
   141         case Character.LINE_SEPARATOR:
       
   142         case Character.PARAGRAPH_SEPARATOR:
       
   143         case Character.SURROGATE:
       
   144             return CONTROL;
       
   145         case Character.FORMAT:
       
   146             if (cp == 0x200C || cp == 0x200D)
       
   147                 return EXTEND;
       
   148             return CONTROL;
       
   149         case Character.NON_SPACING_MARK:
       
   150         case Character.ENCLOSING_MARK:
       
   151              // NOTE:
       
   152              // #tr29 "plus a few General_Category = Spacing_Mark needed for
       
   153              // canonical equivalence."
       
   154              // but for "extended grapheme clusters" support, there is no
       
   155              // need actually to diff "extend" and "spackmark" given GB9, GB9a
       
   156              return EXTEND;
       
   157         case  Character.COMBINING_SPACING_MARK:
       
   158             if (isExcludedSpacingMark(cp))
       
   159                 return OTHER;
       
   160             // NOTE:
       
   161             // 0x11720 and 0x11721 are mentioned in #tr29 as
       
   162             // OTHER_LETTER but it appears their category has been updated to
       
   163             // COMBING_SPACING_MARK already (verified in ver.8)
       
   164             return SPACINGMARK;
       
   165         case Character.OTHER_SYMBOL:
       
   166             if (cp >= 0x1F1E6 && cp <= 0x1F1FF)
       
   167                 return RI;
       
   168             return OTHER;
       
   169         case Character.MODIFIER_LETTER:
       
   170             // WARNING:
       
   171             // not mentioned in #tr29 but listed in GraphemeBreakProperty.txt
       
   172             if (cp == 0xFF9E || cp == 0xFF9F)
       
   173                 return EXTEND;
       
   174             return OTHER;
       
   175         case Character.OTHER_LETTER:
       
   176             if (cp == 0x0E33 || cp == 0x0EB3)
       
   177                 return SPACINGMARK;
       
   178             // hangul jamo
       
   179             if (cp >= 0x1100 && cp <= 0x11FF) {
       
   180                 if (cp <= 0x115F)
       
   181                     return L;
       
   182                 if (cp <= 0x11A7)
       
   183                     return V;
       
   184                 return T;
       
   185             }
       
   186             // hangul syllables
       
   187             int sindex = cp - SYLLABLE_BASE;
       
   188             if (sindex >= 0 && sindex < SCOUNT) {
       
   189 
       
   190                 if (sindex % TCOUNT == 0)
       
   191                     return LV;
       
   192                 return LVT;
       
   193             }
       
   194             //  hangul jamo_extended A
       
   195             if (cp >= 0xA960 && cp <= 0xA97C)
       
   196                 return L;
       
   197             //  hangul jamo_extended B
       
   198             if (cp >= 0xD7B0 && cp <= 0xD7C6)
       
   199                 return V;
       
   200             if (cp >= 0xD7CB && cp <= 0xD7FB)
       
   201                 return T;
       
   202         }
       
   203         return OTHER;
       
   204     }
       
   205 }