src/java.base/share/classes/java/util/regex/Grapheme.java
changeset 55013 8dae495a59e7
parent 47216 71c04702a3d5
child 55141 db105c4c5776
equal deleted inserted replaced
55012:fb0cfce19262 55013:8dae495a59e7
     1 /*
     1 /*
     2  * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
     2  * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4  *
     4  *
     5  * This code is free software; you can redistribute it and/or modify it
     5  * This code is free software; you can redistribute it and/or modify it
     6  * under the terms of the GNU General Public License version 2 only, as
     6  * under the terms of the GNU General Public License version 2 only, as
     7  * published by the Free Software Foundation.  Oracle designates this
     7  * published by the Free Software Foundation.  Oracle designates this
    23  * questions.
    23  * questions.
    24  */
    24  */
    25 
    25 
    26 package java.util.regex;
    26 package java.util.regex;
    27 
    27 
       
    28 import java.util.Objects;
       
    29 
    28 final class Grapheme {
    30 final class Grapheme {
    29 
    31 
    30     /**
    32     /**
    31      * Determines if there is an extended  grapheme cluster boundary between two
    33      * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
    32      * continuing characters {@code cp1} and {@code cp2}.
    34      * the start of the char sequence is a boundary.
    33      * <p>
    35      * <p>
    34      * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
    36      * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
    35      * for the extended grapheme cluster boundary rules
    37      * for the extended grapheme cluster boundary rules. The following implementation
       
    38      * is based on version 12.0 of the annex.
       
    39      * (http://www.unicode.org/reports/tr29/tr29-35.html)
       
    40      *
       
    41      * @param src the {@code CharSequence} to be scanned
       
    42      * @param off offset to start looking for the next boundary in the src
       
    43      * @param limit limit offset in the src (exclusive)
       
    44      * @return the next possible boundary
    36      */
    45      */
    37     static boolean isBoundary(int cp1, int cp2) {
    46     static int nextBoundary(CharSequence src, int off, int limit) {
    38         return rules[getType(cp1)][getType(cp2)];
    47         Objects.checkFromToIndex(off, limit, src.length());
       
    48 
       
    49         int ch0 = Character.codePointAt(src, 0);
       
    50         int ret = Character.charCount(ch0);
       
    51         int ch1;
       
    52         // indicates whether gb11 or gb12 is underway
       
    53         boolean gb11 = EmojiData.isExtendedPictographic(ch0);
       
    54         int riCount = getType(ch0) == RI ? 1 : 0;
       
    55         while (ret < limit) {
       
    56             ch1 = Character.codePointAt(src, ret);
       
    57             int t0 = getType(ch0);
       
    58             int t1 = getType(ch1);
       
    59 
       
    60             if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
       
    61                 gb11 = false;
       
    62             } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
       
    63                 // continue for gb12
       
    64             } else if (rules[t0][t1]) {
       
    65                 if (ret > off) {
       
    66                     break;
       
    67                 } else {
       
    68                     gb11 = EmojiData.isExtendedPictographic(ch1);
       
    69                     riCount = 0;
       
    70                 }
       
    71             }
       
    72 
       
    73             riCount += getType(ch1) == RI ? 1 : 0;
       
    74             ch0 = ch1;
       
    75             ret += Character.charCount(ch1);
       
    76         }
       
    77         return ret;
    39     }
    78     }
    40 
    79 
    41     // types
    80     // types
    42     private static final int OTHER = 0;
    81     private static final int OTHER = 0;
    43     private static final int CR = 1;
    82     private static final int CR = 1;
    44     private static final int LF = 2;
    83     private static final int LF = 2;
    45     private static final int CONTROL = 3;
    84     private static final int CONTROL = 3;
    46     private static final int EXTEND = 4;
    85     private static final int EXTEND = 4;
    47     private static final int RI = 5;
    86     private static final int ZWJ = 5;
    48     private static final int PREPEND = 6;
    87     private static final int RI = 6;
    49     private static final int SPACINGMARK = 7;
    88     private static final int PREPEND = 7;
    50     private static final int L = 8;
    89     private static final int SPACINGMARK = 8;
    51     private static final int V = 9;
    90     private static final int L = 9;
    52     private static final int T = 10;
    91     private static final int V = 10;
    53     private static final int LV = 11;
    92     private static final int T = 11;
    54     private static final int LVT = 12;
    93     private static final int LV = 12;
       
    94     private static final int LVT = 13;
       
    95     private static final int EXTENDED_PICTOGRAPHIC = 14;
    55 
    96 
    56     private static final int FIRST_TYPE = 0;
    97     private static final int FIRST_TYPE = 0;
    57     private static final int LAST_TYPE = 12;
    98     private static final int LAST_TYPE = 14;
    58 
    99 
    59     private static boolean[][] rules;
   100     private static boolean[][] rules;
    60     static {
   101     static {
    61         rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1];
   102         rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1];
    62         // default, any + any
   103         // GB 999 Any + Any  -> default
    63         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++)
   104         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++)
    64             for (int j = FIRST_TYPE; j <= LAST_TYPE; j++)
   105             for (int j = FIRST_TYPE; j <= LAST_TYPE; j++)
    65                 rules[i][j] = true;
   106                 rules[i][j] = true;
    66         // GB 6 L x (L | V | LV | LVT)
   107         // GB 6 L x (L | V | LV | LVT)
    67         rules[L][L] = false;
   108         rules[L][L] = false;
    74         rules[V][V] = false;
   115         rules[V][V] = false;
    75         rules[V][T] = false;
   116         rules[V][T] = false;
    76         // GB 8 (LVT | T) x T
   117         // GB 8 (LVT | T) x T
    77         rules[LVT][T] = false;
   118         rules[LVT][T] = false;
    78         rules[T][T] = false;
   119         rules[T][T] = false;
    79         // GB 8a RI x RI
   120         // GB 9 x (Extend|ZWJ)
    80         rules[RI][RI] = false;
       
    81         // GB 9 x Extend
       
    82         // GB 9a x Spacing Mark
   121         // GB 9a x Spacing Mark
    83         // GB 9b Prepend x
   122         // GB 9b Prepend x
    84         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) {
   123         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) {
    85             rules[i][EXTEND] = false;
   124             rules[i][EXTEND] = false;
       
   125             rules[i][ZWJ] = false;
    86             rules[i][SPACINGMARK] = false;
   126             rules[i][SPACINGMARK] = false;
    87             rules[PREPEND][i] = false;
   127             rules[PREPEND][i] = false;
    88         }
   128         }
    89         // GB 4  (Control | CR | LF) +
   129         // GB 4  (Control | CR | LF) +
    90         // GB 5  + (Control | CR | LF)
   130         // GB 5  + (Control | CR | LF)
    93                 rules[i][j] = true;
   133                 rules[i][j] = true;
    94                 rules[j][i] = true;
   134                 rules[j][i] = true;
    95             }
   135             }
    96         // GB 3 CR x LF
   136         // GB 3 CR x LF
    97         rules[CR][LF] = false;
   137         rules[CR][LF] = false;
    98         // GB 10 Any + Any  -> default
   138         // GB 11 Extended_Pictographic x (Extend|ZWJ)
       
   139         rules[EXTENDED_PICTOGRAPHIC][EXTEND] = false;
       
   140         rules[EXTENDED_PICTOGRAPHIC][ZWJ] = false;
    99     }
   141     }
   100 
   142 
   101     // Hangul syllables
   143     // Hangul syllables
   102     private static final int SYLLABLE_BASE = 0xAC00;
   144     private static final int SYLLABLE_BASE = 0xAC00;
   103     private static final int LCOUNT = 19;
   145     private static final int LCOUNT = 19;
   121                cp == 0xAA7B || cp == 0xAA7D;
   163                cp == 0xAA7B || cp == 0xAA7D;
   122     }
   164     }
   123 
   165 
   124     @SuppressWarnings("fallthrough")
   166     @SuppressWarnings("fallthrough")
   125     private static int getType(int cp) {
   167     private static int getType(int cp) {
       
   168         if (EmojiData.isExtendedPictographic(cp)) {
       
   169             return EXTENDED_PICTOGRAPHIC;
       
   170         }
       
   171 
   126         int type = Character.getType(cp);
   172         int type = Character.getType(cp);
   127         switch(type) {
   173         switch(type) {
   128         case Character.CONTROL:
   174         case Character.CONTROL:
   129             if (cp == 0x000D)
   175             if (cp == 0x000D)
   130                 return CR;
   176                 return CR;
   131             if (cp == 0x000A)
   177             if (cp == 0x000A)
   132                 return LF;
   178                 return LF;
   133             return CONTROL;
   179             return CONTROL;
   134          case Character.UNASSIGNED:
   180         case Character.UNASSIGNED:
   135             // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
   181             // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
   136             // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
   182             // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
   137             // so type it as "Other" to make the test happy
   183             // so type it as "Other" to make the test happy
   138              if (cp == 0x0378)
   184             if (cp == 0x0378)
   139                  return OTHER;
   185                 return OTHER;
   140 
   186 
   141         case Character.LINE_SEPARATOR:
   187         case Character.LINE_SEPARATOR:
   142         case Character.PARAGRAPH_SEPARATOR:
   188         case Character.PARAGRAPH_SEPARATOR:
   143         case Character.SURROGATE:
   189         case Character.SURROGATE:
   144             return CONTROL;
   190             return CONTROL;
   145         case Character.FORMAT:
   191         case Character.FORMAT:
   146             if (cp == 0x200C || cp == 0x200D)
   192             if (cp == 0x200C ||
       
   193                 cp >= 0xE0020 && cp <= 0xE007F)
   147                 return EXTEND;
   194                 return EXTEND;
       
   195             if (cp == 0x200D)
       
   196                 return ZWJ;
       
   197             if (cp >= 0x0600 && cp <= 0x0605 ||
       
   198                 cp == 0x06DD || cp == 0x070F || cp == 0x08E2 ||
       
   199                 cp == 0x110BD || cp == 0x110CD)
       
   200                 return PREPEND;
   148             return CONTROL;
   201             return CONTROL;
   149         case Character.NON_SPACING_MARK:
   202         case Character.NON_SPACING_MARK:
   150         case Character.ENCLOSING_MARK:
   203         case Character.ENCLOSING_MARK:
   151              // NOTE:
   204             // NOTE:
   152              // #tr29 "plus a few General_Category = Spacing_Mark needed for
   205             // #tr29 "plus a few General_Category = Spacing_Mark needed for
   153              // canonical equivalence."
   206             // canonical equivalence."
   154              // but for "extended grapheme clusters" support, there is no
   207             // but for "extended grapheme clusters" support, there is no
   155              // need actually to diff "extend" and "spacingmark" given GB9, GB9a
   208             // need actually to diff "extend" and "spacingmark" given GB9, GB9a
   156              return EXTEND;
   209             return EXTEND;
   157         case  Character.COMBINING_SPACING_MARK:
   210         case  Character.COMBINING_SPACING_MARK:
   158             if (isExcludedSpacingMark(cp))
   211             if (isExcludedSpacingMark(cp))
   159                 return OTHER;
   212                 return OTHER;
   160             // NOTE:
   213             // NOTE:
   161             // 0x11720 and 0x11721 are mentioned in #tr29 as
   214             // 0x11720 and 0x11721 are mentioned in #tr29 as
   165         case Character.OTHER_SYMBOL:
   218         case Character.OTHER_SYMBOL:
   166             if (cp >= 0x1F1E6 && cp <= 0x1F1FF)
   219             if (cp >= 0x1F1E6 && cp <= 0x1F1FF)
   167                 return RI;
   220                 return RI;
   168             return OTHER;
   221             return OTHER;
   169         case Character.MODIFIER_LETTER:
   222         case Character.MODIFIER_LETTER:
       
   223         case Character.MODIFIER_SYMBOL:
   170             // WARNING:
   224             // WARNING:
   171             // not mentioned in #tr29 but listed in GraphemeBreakProperty.txt
   225             // not mentioned in #tr29 but listed in GraphemeBreakProperty.txt
   172             if (cp == 0xFF9E || cp == 0xFF9F)
   226             if (cp == 0xFF9E || cp == 0xFF9F ||
       
   227                 cp >= 0x1F3FB && cp <= 0x1F3FF)
   173                 return EXTEND;
   228                 return EXTEND;
   174             return OTHER;
   229             return OTHER;
   175         case Character.OTHER_LETTER:
   230         case Character.OTHER_LETTER:
   176             if (cp == 0x0E33 || cp == 0x0EB3)
   231             if (cp == 0x0E33 || cp == 0x0EB3)
   177                 return SPACINGMARK;
   232                 return SPACINGMARK;
   197             //  hangul jamo_extended B
   252             //  hangul jamo_extended B
   198             if (cp >= 0xD7B0 && cp <= 0xD7C6)
   253             if (cp >= 0xD7B0 && cp <= 0xD7C6)
   199                 return V;
   254                 return V;
   200             if (cp >= 0xD7CB && cp <= 0xD7FB)
   255             if (cp >= 0xD7CB && cp <= 0xD7FB)
   201                 return T;
   256                 return T;
       
   257 
       
   258             // Prepend
       
   259             switch (cp) {
       
   260                 case 0x0D4E:
       
   261                 case 0x111C2:
       
   262                 case 0x111C3:
       
   263                 case 0x11A3A:
       
   264                 case 0x11A84:
       
   265                 case 0x11A85:
       
   266                 case 0x11A86:
       
   267                 case 0x11A87:
       
   268                 case 0x11A88:
       
   269                 case 0x11A89:
       
   270                 case 0x11D46:
       
   271                     return PREPEND;
       
   272             }
   202         }
   273         }
   203         return OTHER;
   274         return OTHER;
   204     }
   275     }
   205 }
   276 }