src/java.base/share/classes/java/util/regex/Grapheme.java
changeset 55141 db105c4c5776
parent 55013 8dae495a59e7
equal deleted inserted replaced
55140:d4890c3721be 55141:db105c4c5776
    26 package java.util.regex;
    26 package java.util.regex;
    27 
    27 
    28 import java.util.Objects;
    28 import java.util.Objects;
    29 
    29 
    30 final class Grapheme {
    30 final class Grapheme {
       
    31 
       
    32     /**
       
    33      * Determines if there is an extended grapheme cluster boundary between two
       
    34      * adjacent code points {@code cp1} and {@code cp2}.
       
    35      * <p>
       
    36      * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
       
    37      * for the extended grapheme cluster boundary rules.
       
    38      * <p>
       
    39      * Note: this is a lookup on the two types only; it does not take care of stateful breaking.
       
    40      */
       
    41     static boolean isBoundary(int cp1, int cp2) {
       
    42         return rules[getGraphemeType(cp1)][getGraphemeType(cp2)]; // getType alone no longer tells CR/LF from CONTROL
       
    43     }
    31 
    44 
    32     /**
    45     /**
    33      * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
    46      * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
    34      * the start of the char sequence is a boundary.
    47      * the start of the char sequence is a boundary.
    35      * <p>
    48      * <p>
    48 
    61 
    49         int ch0 = Character.codePointAt(src, 0);
    62         int ch0 = Character.codePointAt(src, 0);
    50         int ret = Character.charCount(ch0);
    63         int ret = Character.charCount(ch0);
    51         int ch1;
    64         int ch1;
    52         // indicates whether gb11 or gb12 is underway
    65         // indicates whether gb11 or gb12 is underway
    53         boolean gb11 = EmojiData.isExtendedPictographic(ch0);
    66         int t0 = getGraphemeType(ch0);
    54         int riCount = getType(ch0) == RI ? 1 : 0;
    67         int riCount = t0 == RI ? 1 : 0;
       
    68         boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
    55         while (ret < limit) {
    69         while (ret < limit) {
    56             ch1 = Character.codePointAt(src, ret);
    70             ch1 = Character.codePointAt(src, ret);
    57             int t0 = getType(ch0);
    71             int t1 = getGraphemeType(ch1);
    58             int t1 = getType(ch1);
       
    59 
    72 
    60             if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
    73             if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
    61                 gb11 = false;
    74                 gb11 = false;
    62             } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
    75             } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
    63                 // continue for gb12
    76                 // continue for gb12
    64             } else if (rules[t0][t1]) {
    77             } else if (rules[t0][t1]) {
    65                 if (ret > off) {
    78                 if (ret > off) {
    66                     break;
    79                     break;
    67                 } else {
    80                 } else {
    68                     gb11 = EmojiData.isExtendedPictographic(ch1);
    81                     gb11 = t1 == EXTENDED_PICTOGRAPHIC;
    69                     riCount = 0;
    82                     riCount = 0;
    70                 }
    83                 }
    71             }
    84             }
    72 
    85 
    73             riCount += getType(ch1) == RI ? 1 : 0;
    86             riCount += (t1 == RI) ? 1 : 0;
    74             ch0 = ch1;
    87             t0 = t1;
       
    88 
    75             ret += Character.charCount(ch1);
    89             ret += Character.charCount(ch1);
    76         }
    90         }
    77         return ret;
    91         return ret;
    78     }
    92     }
    79 
    93 
   161                cp >= 0x109A && cp <= 0x109C ||
   175                cp >= 0x109A && cp <= 0x109C ||
   162                cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 ||
   176                cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 ||
   163                cp == 0xAA7B || cp == 0xAA7D;
   177                cp == 0xAA7B || cp == 0xAA7D;
   164     }
   178     }
   165 
   179 
       
   180     private static int getGraphemeType(int cp) { // answers directly for cp below U+007F, else delegates to getType
       
   181         if (cp < 0x007F) { // ASCII except U+007F, which is left to getType
       
   182             if (cp < 32) { // below U+0020: control characters
       
   183                 if (cp == 0x000D)
       
   184                     return CR; // U+000D
       
   185                 if (cp == 0x000A)
       
   186                     return LF; // U+000A
       
   187                 return CONTROL; // every other value below U+0020
       
   188             }
       
   189             return OTHER; // U+0020..U+007E
       
   190         }
       
   191         return getType(cp); // U+007F and above: full classification
       
   192     }
       
   193 
   166     @SuppressWarnings("fallthrough")
   194     @SuppressWarnings("fallthrough")
   167     private static int getType(int cp) {
   195     private static int getType(int cp) {
   168         if (EmojiData.isExtendedPictographic(cp)) {
   196         if (EmojiData.isExtendedPictographic(cp)) {
   169             return EXTENDED_PICTOGRAPHIC;
   197             return EXTENDED_PICTOGRAPHIC;
   170         }
   198         }
   171 
   199 
   172         int type = Character.getType(cp);
   200         int type = Character.getType(cp);
   173         switch(type) {
   201         switch(type) {
   174         case Character.CONTROL:
       
   175             if (cp == 0x000D)
       
   176                 return CR;
       
   177             if (cp == 0x000A)
       
   178                 return LF;
       
   179             return CONTROL;
       
   180         case Character.UNASSIGNED:
   202         case Character.UNASSIGNED:
   181             // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
   203             // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
   182             // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
   204             // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
   183             // so type it as "Other" to make the test happy
   205             // so type it as "Other" to make the test happy
   184             if (cp == 0x0378)
   206             if (cp == 0x0378)
   185                 return OTHER;
   207                 return OTHER;
   186 
   208 
       
   209         case Character.CONTROL:
   187         case Character.LINE_SEPARATOR:
   210         case Character.LINE_SEPARATOR:
   188         case Character.PARAGRAPH_SEPARATOR:
   211         case Character.PARAGRAPH_SEPARATOR:
   189         case Character.SURROGATE:
   212         case Character.SURROGATE:
   190             return CONTROL;
   213             return CONTROL;
   191         case Character.FORMAT:
   214         case Character.FORMAT: