8032446: Support Unicode 7.0.0 in JDK 9
8130889: Missing "@since 1.8" tags in j.l.Character.java
Reviewed-by: naoto, okutsu
--- a/jdk/make/data/characterdata/CharacterData00.java.template Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/make/data/characterdata/CharacterData00.java.template Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -226,6 +226,11 @@
case 0xA77D : mapChar = 0x1D79; break;
case 0xA78D : mapChar = 0x0265; break;
case 0xA7AA : mapChar = 0x0266; break;
+ case 0xA7AB : mapChar = 0x025C; break;
+ case 0xA7AC : mapChar = 0x0261; break;
+ case 0xA7AD : mapChar = 0x026C; break;
+ case 0xA7B0 : mapChar = 0x029E; break;
+ case 0xA7B1 : mapChar = 0x0287; break;
// default mapChar is already set, so no
// need to redo it here.
// default : mapChar = ch;
@@ -284,10 +289,15 @@
case 0x0250 : mapChar = 0x2C6F; break;
case 0x0251 : mapChar = 0x2C6D; break;
case 0x0252 : mapChar = 0x2C70; break;
+ case 0x025C : mapChar = 0xA7AB; break;
+ case 0x0261 : mapChar = 0xA7AC; break;
case 0x0265 : mapChar = 0xA78D; break;
case 0x0266 : mapChar = 0xA7AA; break;
case 0x026B : mapChar = 0x2C62; break;
+ case 0x026C : mapChar = 0xA7AD; break;
case 0x0271 : mapChar = 0x2C6E; break;
+ case 0x0287 : mapChar = 0xA7B1; break;
+ case 0x029E : mapChar = 0xA7B0; break;
case 0x027D : mapChar = 0x2C64; break;
case 0x1D79 : mapChar = 0xA77D; break;
case 0x1D7D : mapChar = 0x2C63; break;
@@ -503,6 +513,22 @@
// This is the only char with RLO
directionality = Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE;
break;
+ case 0x2066 :
+ // This is the only char with LRI
+ directionality = Character.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE;
+ break;
+ case 0x2067 :
+ // This is the only char with RLI
+ directionality = Character.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE;
+ break;
+ case 0x2068 :
+ // This is the only char with FSI
+ directionality = Character.DIRECTIONALITY_FIRST_STRONG_ISOLATE;
+ break;
+ case 0x2069 :
+ // This is the only char with PDI
+ directionality = Character.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE;
+ break;
default :
directionality = Character.DIRECTIONALITY_UNDEFINED;
break;
@@ -537,11 +563,16 @@
case 0x0250 : mapChar = 0x2C6F; break;
case 0x0251 : mapChar = 0x2C6D; break;
case 0x0252 : mapChar = 0x2C70; break;
+ case 0x025C : mapChar = 0xA7AB; break;
+ case 0x0261 : mapChar = 0xA7AC; break;
case 0x0265 : mapChar = 0xA78D; break;
case 0x0266 : mapChar = 0xA7AA; break;
case 0x026B : mapChar = 0x2C62; break;
+ case 0x026C : mapChar = 0xA7AD; break;
case 0x0271 : mapChar = 0x2C6E; break;
case 0x027D : mapChar = 0x2C64; break;
+ case 0x0287 : mapChar = 0xA7B1; break;
+ case 0x029E : mapChar = 0xA7B0; break;
case 0x1D79 : mapChar = 0xA77D; break;
case 0x1D7D : mapChar = 0x2C63; break;
case 0x2C65 : mapChar = 0x023A; break;
--- a/jdk/make/data/characterdata/CharacterData01.java.template Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/make/data/characterdata/CharacterData01.java.template Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -244,81 +244,118 @@
case 0x10132: retval = 80000; break; // AEGEAN NUMBER EIGHTY THOUSAND
case 0x10133: retval = 90000; break; // AEGEAN NUMBER NINETY THOUSAND
case 0x10323: retval = 50; break; // OLD ITALIC NUMERAL FIFTY
-
- case 0x010144: retval = 50; break; // ACROPHONIC ATTIC FIFTY
- case 0x010145: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED
- case 0x010146: retval = 5000; break; // ACROPHONIC ATTIC FIVE THOUSAND
- case 0x010147: retval = 50000; break; // ACROPHONIC ATTIC FIFTY THOUSAND
- case 0x01014A: retval = 50; break; // ACROPHONIC ATTIC FIFTY TALENTS
- case 0x01014B: retval = 100; break; // ACROPHONIC ATTIC ONE HUNDRED TALENTS
- case 0x01014C: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED TALENTS
- case 0x01014D: retval = 1000; break; // ACROPHONIC ATTIC ONE THOUSAND TALENTS
- case 0x01014E: retval = 5000; break; // ACROPHONIC ATTIC FIVE THOUSAND TALENTS
- case 0x010151: retval = 50; break; // ACROPHONIC ATTIC FIFTY STATERS
- case 0x010152: retval = 100; break; // ACROPHONIC ATTIC ONE HUNDRED STATERS
- case 0x010153: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED STATERS
- case 0x010154: retval = 1000; break; // ACROPHONIC ATTIC ONE THOUSAND STATERS
- case 0x010155: retval = 10000; break; // ACROPHONIC ATTIC TEN THOUSAND STATERS
- case 0x010156: retval = 50000; break; // ACROPHONIC ATTIC FIFTY THOUSAND STATERS
- case 0x010166: retval = 50; break; // ACROPHONIC TROEZENIAN FIFTY
- case 0x010167: retval = 50; break; // ACROPHONIC TROEZENIAN FIFTY ALTERNATE FORM
- case 0x010168: retval = 50; break; // ACROPHONIC HERMIONIAN FIFTY
- case 0x010169: retval = 50; break; // ACROPHONIC THESPIAN FIFTY
- case 0x01016A: retval = 100; break; // ACROPHONIC THESPIAN ONE HUNDRED
- case 0x01016B: retval = 300; break; // ACROPHONIC THESPIAN THREE HUNDRED
- case 0x01016C: retval = 500; break; // ACROPHONIC EPIDAUREAN FIVE HUNDRED
- case 0x01016D: retval = 500; break; // ACROPHONIC TROEZENIAN FIVE HUNDRED
- case 0x01016E: retval = 500; break; // ACROPHONIC THESPIAN FIVE HUNDRED
- case 0x01016F: retval = 500; break; // ACROPHONIC CARYSTIAN FIVE HUNDRED
- case 0x010170: retval = 500; break; // ACROPHONIC NAXIAN FIVE HUNDRED
- case 0x010171: retval = 1000; break; // ACROPHONIC THESPIAN ONE THOUSAND
- case 0x010172: retval = 5000; break; // ACROPHONIC THESPIAN FIVE THOUSAND
- case 0x010174: retval = 50; break; // ACROPHONIC STRATIAN FIFTY MNAS
- case 0x010341: retval = 90; break; // GOTHIC LETTER NINETY
- case 0x01034A: retval = 900; break; // GOTHIC LETTER NINE HUNDRED
- case 0x0103D5: retval = 100; break; // OLD PERSIAN NUMBER HUNDRED
- case 0x01085D: retval = 100; break; // IMPERIAL ARAMAIC NUMBER ONE HUNDRED
- case 0x01085E: retval = 1000; break; // IMPERIAL ARAMAIC NUMBER ONE THOUSAND
- case 0x01085F: retval = 10000; break; // IMPERIAL ARAMAIC NUMBER TEN THOUSAND
- case 0x010919: retval = 100; break; // PHOENICIAN NUMBER ONE HUNDRED
- case 0x010A46: retval = 100; break; // KHAROSHTHI NUMBER ONE HUNDRED
- case 0x010A47: retval = 1000; break; // KHAROSHTHI NUMBER ONE THOUSAND
- case 0x010A7E: retval = 50; break; // OLD SOUTH ARABIAN NUMBER FIFTY
- case 0x010B5E: retval = 100; break; // INSCRIPTIONAL PARTHIAN NUMBER ONE HUNDRED
- case 0x010B5F: retval = 1000; break; // INSCRIPTIONAL PARTHIAN NUMBER ONE THOUSAND
- case 0x010B7E: retval = 100; break; // INSCRIPTIONAL PAHLAVI NUMBER ONE HUNDRED
- case 0x010B7F: retval = 1000; break; // INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND
- case 0x010E6C: retval = 40; break; // RUMI NUMBER FORTY
- case 0x010E6D: retval = 50; break; // RUMI NUMBER FIFTY
- case 0x010E6E: retval = 60; break; // RUMI NUMBER SIXTY
- case 0x010E6F: retval = 70; break; // RUMI NUMBER SEVENTY
- case 0x010E70: retval = 80; break; // RUMI NUMBER EIGHTY
- case 0x010E71: retval = 90; break; // RUMI NUMBER NINETY
- case 0x010E72: retval = 100; break; // RUMI NUMBER ONE HUNDRED
- case 0x010E73: retval = 200; break; // RUMI NUMBER TWO HUNDRED
- case 0x010E74: retval = 300; break; // RUMI NUMBER THREE HUNDRED
- case 0x010E75: retval = 400; break; // RUMI NUMBER FOUR HUNDRED
- case 0x010E76: retval = 500; break; // RUMI NUMBER FIVE HUNDRED
- case 0x010E77: retval = 600; break; // RUMI NUMBER SIX HUNDRED
- case 0x010E78: retval = 700; break; // RUMI NUMBER SEVEN HUNDRED
- case 0x010E79: retval = 800; break; // RUMI NUMBER EIGHT HUNDRED
- case 0x010E7A: retval = 900; break; // RUMI NUMBER NINE HUNDRED
- case 0x01105E: retval = 40; break; // BRAHMI NUMBER FORTY
- case 0x01105F: retval = 50; break; // BRAHMI NUMBER FIFTY
- case 0x011060: retval = 60; break; // BRAHMI NUMBER SIXTY
- case 0x011061: retval = 70; break; // BRAHMI NUMBER SEVENTY
- case 0x011062: retval = 80; break; // BRAHMI NUMBER EIGHTY
- case 0x011063: retval = 90; break; // BRAHMI NUMBER NINETY
- case 0x011064: retval = 100; break; // BRAHMI NUMBER ONE HUNDRED
- case 0x011065: retval = 1000; break; // BRAHMI NUMBER ONE THOUSAND
- case 0x012432: retval = 216000; break; // CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH
- case 0x012433: retval = 432000; break; // CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN
- case 0x01D36C: retval = 40; break; // COUNTING ROD TENS DIGIT FOUR
- case 0x01D36D: retval = 50; break; // COUNTING ROD TENS DIGIT FIVE
- case 0x01D36E: retval = 60; break; // COUNTING ROD TENS DIGIT SIX
- case 0x01D36F: retval = 70; break; // COUNTING ROD TENS DIGIT SEVEN
- case 0x01D370: retval = 80; break; // COUNTING ROD TENS DIGIT EIGHT
- case 0x01D371: retval = 90; break; // COUNTING ROD TENS DIGIT NINE
+ case 0x10144: retval = 50; break; // ACROPHONIC ATTIC FIFTY
+ case 0x10145: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED
+ case 0x10146: retval = 5000; break; // ACROPHONIC ATTIC FIVE THOUSAND
+ case 0x10147: retval = 50000; break; // ACROPHONIC ATTIC FIFTY THOUSAND
+ case 0x1014A: retval = 50; break; // ACROPHONIC ATTIC FIFTY TALENTS
+ case 0x1014B: retval = 100; break; // ACROPHONIC ATTIC ONE HUNDRED TALENTS
+ case 0x1014C: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED TALENTS
+ case 0x1014D: retval = 1000; break; // ACROPHONIC ATTIC ONE THOUSAND TALENTS
+ case 0x1014E: retval = 5000; break; // ACROPHONIC ATTIC FIVE THOUSAND TALENTS
+ case 0x10151: retval = 50; break; // ACROPHONIC ATTIC FIFTY STATERS
+ case 0x10152: retval = 100; break; // ACROPHONIC ATTIC ONE HUNDRED STATERS
+ case 0x10153: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED STATERS
+ case 0x10154: retval = 1000; break; // ACROPHONIC ATTIC ONE THOUSAND STATERS
+ case 0x10155: retval = 10000; break; // ACROPHONIC ATTIC TEN THOUSAND STATERS
+ case 0x10156: retval = 50000; break; // ACROPHONIC ATTIC FIFTY THOUSAND STATERS
+ case 0x10166: retval = 50; break; // ACROPHONIC TROEZENIAN FIFTY
+ case 0x10167: retval = 50; break; // ACROPHONIC TROEZENIAN FIFTY ALTERNATE FORM
+ case 0x10168: retval = 50; break; // ACROPHONIC HERMIONIAN FIFTY
+ case 0x10169: retval = 50; break; // ACROPHONIC THESPIAN FIFTY
+ case 0x1016A: retval = 100; break; // ACROPHONIC THESPIAN ONE HUNDRED
+ case 0x1016B: retval = 300; break; // ACROPHONIC THESPIAN THREE HUNDRED
+ case 0x1016C: retval = 500; break; // ACROPHONIC EPIDAUREAN FIVE HUNDRED
+ case 0x1016D: retval = 500; break; // ACROPHONIC TROEZENIAN FIVE HUNDRED
+ case 0x1016E: retval = 500; break; // ACROPHONIC THESPIAN FIVE HUNDRED
+ case 0x1016F: retval = 500; break; // ACROPHONIC CARYSTIAN FIVE HUNDRED
+ case 0x10170: retval = 500; break; // ACROPHONIC NAXIAN FIVE HUNDRED
+ case 0x10171: retval = 1000; break; // ACROPHONIC THESPIAN ONE THOUSAND
+ case 0x10172: retval = 5000; break; // ACROPHONIC THESPIAN FIVE THOUSAND
+ case 0x10174: retval = 50; break; // ACROPHONIC STRATIAN FIFTY MNAS
+ case 0x102ED: retval = 40; break; // COPTIC EPACT NUMBER FORTY
+ case 0x102EE: retval = 50; break; // COPTIC EPACT NUMBER FIFTY
+ case 0x102EF: retval = 60; break; // COPTIC EPACT NUMBER SIXTY
+ case 0x102F0: retval = 70; break; // COPTIC EPACT NUMBER SEVENTY
+ case 0x102F1: retval = 80; break; // COPTIC EPACT NUMBER EIGHTY
+ case 0x102F2: retval = 90; break; // COPTIC EPACT NUMBER NINETY
+ case 0x102F3: retval = 100; break; // COPTIC EPACT NUMBER ONE HUNDRED
+ case 0x102F4: retval = 200; break; // COPTIC EPACT NUMBER TWO HUNDRED
+ case 0x102F5: retval = 300; break; // COPTIC EPACT NUMBER THREE HUNDRED
+ case 0x102F6: retval = 400; break; // COPTIC EPACT NUMBER FOUR HUNDRED
+ case 0x102F7: retval = 500; break; // COPTIC EPACT NUMBER FIVE HUNDRED
+ case 0x102F8: retval = 600; break; // COPTIC EPACT NUMBER SIX HUNDRED
+ case 0x102F9: retval = 700; break; // COPTIC EPACT NUMBER SEVEN HUNDRED
+ case 0x102FA: retval = 800; break; // COPTIC EPACT NUMBER EIGHT HUNDRED
+ case 0x102FB: retval = 900; break; // COPTIC EPACT NUMBER NINE HUNDRED
+ case 0x10341: retval = 90; break; // GOTHIC LETTER NINETY
+ case 0x1034A: retval = 900; break; // GOTHIC LETTER NINE HUNDRED
+ case 0x103D5: retval = 100; break; // OLD PERSIAN NUMBER HUNDRED
+ case 0x1085D: retval = 100; break; // IMPERIAL ARAMAIC NUMBER ONE HUNDRED
+ case 0x1085E: retval = 1000; break; // IMPERIAL ARAMAIC NUMBER ONE THOUSAND
+ case 0x1085F: retval = 10000; break; // IMPERIAL ARAMAIC NUMBER TEN THOUSAND
+ case 0x108AF: retval = 100; break; // NABATAEAN NUMBER ONE HUNDRED
+ case 0x10919: retval = 100; break; // PHOENICIAN NUMBER ONE HUNDRED
+ case 0x10A46: retval = 100; break; // KHAROSHTHI NUMBER ONE HUNDRED
+ case 0x10A47: retval = 1000; break; // KHAROSHTHI NUMBER ONE THOUSAND
+ case 0x10A7E: retval = 50; break; // OLD SOUTH ARABIAN NUMBER FIFTY
+ case 0x10AEF: retval = 100; break; // MANICHAEAN NUMBER ONE HUNDRED
+ case 0x10B5E: retval = 100; break; // INSCRIPTIONAL PARTHIAN NUMBER ONE HUNDRED
+ case 0x10B5F: retval = 1000; break; // INSCRIPTIONAL PARTHIAN NUMBER ONE THOUSAND
+ case 0x10B7E: retval = 100; break; // INSCRIPTIONAL PAHLAVI NUMBER ONE HUNDRED
+ case 0x10B7F: retval = 1000; break; // INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND
+ case 0x10BAF: retval = 100; break; // PSALTER PAHLAVI NUMBER ONE HUNDRED
+ case 0x10E6C: retval = 40; break; // RUMI NUMBER FORTY
+ case 0x10E6D: retval = 50; break; // RUMI NUMBER FIFTY
+ case 0x10E6E: retval = 60; break; // RUMI NUMBER SIXTY
+ case 0x10E6F: retval = 70; break; // RUMI NUMBER SEVENTY
+ case 0x10E70: retval = 80; break; // RUMI NUMBER EIGHTY
+ case 0x10E71: retval = 90; break; // RUMI NUMBER NINETY
+ case 0x10E72: retval = 100; break; // RUMI NUMBER ONE HUNDRED
+ case 0x10E73: retval = 200; break; // RUMI NUMBER TWO HUNDRED
+ case 0x10E74: retval = 300; break; // RUMI NUMBER THREE HUNDRED
+ case 0x10E75: retval = 400; break; // RUMI NUMBER FOUR HUNDRED
+ case 0x10E76: retval = 500; break; // RUMI NUMBER FIVE HUNDRED
+ case 0x10E77: retval = 600; break; // RUMI NUMBER SIX HUNDRED
+ case 0x10E78: retval = 700; break; // RUMI NUMBER SEVEN HUNDRED
+ case 0x10E79: retval = 800; break; // RUMI NUMBER EIGHT HUNDRED
+ case 0x10E7A: retval = 900; break; // RUMI NUMBER NINE HUNDRED
+ case 0x1105E: retval = 40; break; // BRAHMI NUMBER FORTY
+ case 0x1105F: retval = 50; break; // BRAHMI NUMBER FIFTY
+ case 0x11060: retval = 60; break; // BRAHMI NUMBER SIXTY
+ case 0x11061: retval = 70; break; // BRAHMI NUMBER SEVENTY
+ case 0x11062: retval = 80; break; // BRAHMI NUMBER EIGHTY
+ case 0x11063: retval = 90; break; // BRAHMI NUMBER NINETY
+ case 0x11064: retval = 100; break; // BRAHMI NUMBER ONE HUNDRED
+ case 0x11065: retval = 1000; break; // BRAHMI NUMBER ONE THOUSAND
+ case 0x111ED: retval = 40; break; // SINHALA ARCHAIC NUMBER FORTY
+ case 0x111EE: retval = 50; break; // SINHALA ARCHAIC NUMBER FIFTY
+ case 0x111EF: retval = 60; break; // SINHALA ARCHAIC NUMBER SIXTY
+ case 0x111F0: retval = 70; break; // SINHALA ARCHAIC NUMBER SEVENTY
+ case 0x111F1: retval = 80; break; // SINHALA ARCHAIC NUMBER EIGHTY
+ case 0x111F2: retval = 90; break; // SINHALA ARCHAIC NUMBER NINETY
+ case 0x111F3: retval = 100; break; // SINHALA ARCHAIC NUMBER ONE HUNDRED
+ case 0x111F4: retval = 1000; break; // SINHALA ARCHAIC NUMBER ONE THOUSAND
+ case 0x118ED: retval = 40; break; // WARANG CITI NUMBER FORTY
+ case 0x118EE: retval = 50; break; // WARANG CITI NUMBER FIFTY
+ case 0x118EF: retval = 60; break; // WARANG CITI NUMBER SIXTY
+ case 0x118F0: retval = 70; break; // WARANG CITI NUMBER SEVENTY
+ case 0x118F1: retval = 80; break; // WARANG CITI NUMBER EIGHTY
+ case 0x118F2: retval = 90; break; // WARANG CITI NUMBER NINETY
+ case 0x12432: retval = 216000; break; // CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH
+ case 0x12433: retval = 432000; break; // CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN
+ case 0x12467: retval = 40; break; // CUNEIFORM NUMERIC SIGN ELAMITE FORTY
+ case 0x12468: retval = 50; break; // CUNEIFORM NUMERIC SIGN ELAMITE FIFTY
+ case 0x16B5C: retval = 100; break; // PAHAWH HMONG NUMBER HUNDREDS
+ case 0x16B5D: retval = 10000; break; // PAHAWH HMONG NUMBER TEN THOUSANDS
+ case 0x16B5E: retval = 1000000; break; // PAHAWH HMONG NUMBER MILLIONS
+ case 0x16B5F: retval = 100000000; break;// PAHAWH HMONG NUMBER HUNDRED MILLIONS
+ case 0x1D36C: retval = 40; break; // COUNTING ROD TENS DIGIT FOUR
+ case 0x1D36D: retval = 50; break; // COUNTING ROD TENS DIGIT FIVE
+ case 0x1D36E: retval = 60; break; // COUNTING ROD TENS DIGIT SIX
+ case 0x1D36F: retval = 70; break; // COUNTING ROD TENS DIGIT SEVEN
+ case 0x1D370: retval = 80; break; // COUNTING ROD TENS DIGIT EIGHT
+ case 0x1D371: retval = 90; break; // COUNTING ROD TENS DIGIT NINE
default: retval = -2; break;
}
--- a/jdk/make/data/unicodedata/PropList.txt Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/make/data/unicodedata/PropList.txt Wed Jul 15 11:05:51 2015 +0900
@@ -1,8 +1,8 @@
-# PropList-6.2.0.txt
-# Date: 2012-05-23, 20:34:59 GMT [MD]
+# PropList-7.0.0.txt
+# Date: 2014-02-19, 15:51:26 GMT [MD]
#
# Unicode Character Database
-# Copyright (c) 1991-2012 Unicode, Inc.
+# Copyright (c) 1991-2014 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@@ -13,7 +13,6 @@
0085 ; White_Space # Cc <control-0085>
00A0 ; White_Space # Zs NO-BREAK SPACE
1680 ; White_Space # Zs OGHAM SPACE MARK
-180E ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR
2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE
2028 ; White_Space # Zl LINE SEPARATOR
2029 ; White_Space # Zp PARAGRAPH SEPARATOR
@@ -21,14 +20,16 @@
205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE
3000 ; White_Space # Zs IDEOGRAPHIC SPACE
-# Total code points: 26
+# Total code points: 25
# ================================================
+061C ; Bidi_Control # Cf ARABIC LETTER MARK
200E..200F ; Bidi_Control # Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK
202A..202E ; Bidi_Control # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE
+2066..2069 ; Bidi_Control # Cf [4] LEFT-TO-RIGHT ISOLATE..POP DIRECTIONAL ISOLATE
-# Total code points: 7
+# Total code points: 12
# ================================================
@@ -51,6 +52,7 @@
2E17 ; Dash # Pd DOUBLE OBLIQUE HYPHEN
2E1A ; Dash # Pd HYPHEN WITH DIAERESIS
2E3A..2E3B ; Dash # Pd [2] TWO-EM DASH..THREE-EM DASH
+2E40 ; Dash # Pd DOUBLE HYPHEN
301C ; Dash # Pd WAVE DASH
3030 ; Dash # Pd WAVY DASH
30A0 ; Dash # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN
@@ -59,7 +61,7 @@
FE63 ; Dash # Pd SMALL HYPHEN-MINUS
FF0D ; Dash # Pd FULLWIDTH HYPHEN-MINUS
-# Total code points: 27
+# Total code points: 28
# ================================================
@@ -91,6 +93,7 @@
201F ; Quotation_Mark # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK
2039 ; Quotation_Mark # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK
203A ; Quotation_Mark # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+2E42 ; Quotation_Mark # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK
300C ; Quotation_Mark # Ps LEFT CORNER BRACKET
300D ; Quotation_Mark # Pe RIGHT CORNER BRACKET
300E ; Quotation_Mark # Ps LEFT WHITE CORNER BRACKET
@@ -106,7 +109,7 @@
FF62 ; Quotation_Mark # Ps HALFWIDTH LEFT CORNER BRACKET
FF63 ; Quotation_Mark # Pe HALFWIDTH RIGHT CORNER BRACKET
-# Total code points: 29
+# Total code points: 30
# ================================================
@@ -136,6 +139,7 @@
1361..1368 ; Terminal_Punctuation # Po [8] ETHIOPIC WORDSPACE..ETHIOPIC PARAGRAPH SEPARATOR
166D..166E ; Terminal_Punctuation # Po [2] CANADIAN SYLLABICS CHI SIGN..CANADIAN SYLLABICS FULL STOP
16EB..16ED ; Terminal_Punctuation # Po [3] RUNIC SINGLE PUNCTUATION..RUNIC CROSS PUNCTUATION
+1735..1736 ; Terminal_Punctuation # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
17D4..17D6 ; Terminal_Punctuation # Po [3] KHMER SIGN KHAN..KHMER SIGN CAMNUC PII KUUH
17DA ; Terminal_Punctuation # Po KHMER SIGN KOOMUUT
1802..1805 ; Terminal_Punctuation # Po [4] MONGOLIAN COMMA..MONGOLIAN FOUR DOTS
@@ -149,6 +153,8 @@
203C..203D ; Terminal_Punctuation # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG
2047..2049 ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK
2E2E ; Terminal_Punctuation # Po REVERSED QUESTION MARK
+2E3C ; Terminal_Punctuation # Po STENOGRAPHIC FULL STOP
+2E41 ; Terminal_Punctuation # Po REVERSED COMMA
3001..3002 ; Terminal_Punctuation # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP
A4FE..A4FF ; Terminal_Punctuation # Po [2] LISU PUNCTUATION COMMA..LISU PUNCTUATION FULL STOP
A60D..A60F ; Terminal_Punctuation # Po [3] VAI COMMA..VAI QUESTION MARK
@@ -174,14 +180,27 @@
103D0 ; Terminal_Punctuation # Po OLD PERSIAN WORD DIVIDER
10857 ; Terminal_Punctuation # Po IMPERIAL ARAMAIC SECTION SIGN
1091F ; Terminal_Punctuation # Po PHOENICIAN WORD SEPARATOR
+10A56..10A57 ; Terminal_Punctuation # Po [2] KHAROSHTHI PUNCTUATION DANDA..KHAROSHTHI PUNCTUATION DOUBLE DANDA
+10AF0..10AF5 ; Terminal_Punctuation # Po [6] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION TWO DOTS
10B3A..10B3F ; Terminal_Punctuation # Po [6] TINY TWO DOTS OVER ONE DOT PUNCTUATION..LARGE ONE RING OVER TWO RINGS PUNCTUATION
+10B99..10B9C ; Terminal_Punctuation # Po [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT
11047..1104D ; Terminal_Punctuation # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS
110BE..110C1 ; Terminal_Punctuation # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA
11141..11143 ; Terminal_Punctuation # Po [3] CHAKMA DANDA..CHAKMA QUESTION MARK
111C5..111C6 ; Terminal_Punctuation # Po [2] SHARADA DANDA..SHARADA DOUBLE DANDA
-12470..12473 ; Terminal_Punctuation # Po [4] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON
+111CD ; Terminal_Punctuation # Po SHARADA SUTRA MARK
+11238..1123C ; Terminal_Punctuation # Po [5] KHOJKI DANDA..KHOJKI DOUBLE SECTION MARK
+115C2..115C5 ; Terminal_Punctuation # Po [4] SIDDHAM DANDA..SIDDHAM SEPARATOR BAR
+115C9 ; Terminal_Punctuation # Po SIDDHAM END OF TEXT MARK
+11641..11642 ; Terminal_Punctuation # Po [2] MODI DANDA..MODI DOUBLE DANDA
+12470..12474 ; Terminal_Punctuation # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON
+16A6E..16A6F ; Terminal_Punctuation # Po [2] MRO DANDA..MRO DOUBLE DANDA
+16AF5 ; Terminal_Punctuation # Po BASSA VAH FULL STOP
+16B37..16B39 ; Terminal_Punctuation # Po [3] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN CIM CHEEM
+16B44 ; Terminal_Punctuation # Po PAHAWH HMONG SIGN XAUS
+1BC9F ; Terminal_Punctuation # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
-# Total code points: 176
+# Total code points: 214
# ================================================
@@ -230,6 +249,10 @@
21D5..21DB ; Other_Math # So [7] UP DOWN DOUBLE ARROW..RIGHTWARDS TRIPLE ARROW
21DD ; Other_Math # So RIGHTWARDS SQUIGGLE ARROW
21E4..21E5 ; Other_Math # So [2] LEFTWARDS ARROW TO BAR..RIGHTWARDS ARROW TO BAR
+2308 ; Other_Math # Ps LEFT CEILING
+2309 ; Other_Math # Pe RIGHT CEILING
+230A ; Other_Math # Ps LEFT FLOOR
+230B ; Other_Math # Pe RIGHT FLOOR
23B4..23B5 ; Other_Math # So [2] TOP SQUARE BRACKET..BOTTOM SQUARE BRACKET
23B7 ; Other_Math # So RADICAL SYMBOL BOTTOM
23D0 ; Other_Math # So VERTICAL LINE EXTENSION
@@ -358,7 +381,7 @@
1EEA5..1EEA9 ; Other_Math # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
1EEAB..1EEBB ; Other_Math # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
-# Total code points: 1358
+# Total code points: 1362
# ================================================
@@ -403,8 +426,7 @@
0825..0827 ; Other_Alphabetic # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
0829..082C ; Other_Alphabetic # Mn [4] SAMARITAN VOWEL SIGN LONG I..SAMARITAN VOWEL SIGN SUKUN
08E4..08E9 ; Other_Alphabetic # Mn [6] ARABIC CURLY FATHA..ARABIC CURLY KASRATAN
-08F0..08FE ; Other_Alphabetic # Mn [15] ARABIC OPEN FATHATAN..ARABIC DAMMA WITH DOT
-0900..0902 ; Other_Alphabetic # Mn [3] DEVANAGARI SIGN INVERTED CANDRABINDU..DEVANAGARI SIGN ANUSVARA
+08F0..0902 ; Other_Alphabetic # Mn [19] ARABIC OPEN FATHATAN..DEVANAGARI SIGN ANUSVARA
0903 ; Other_Alphabetic # Mc DEVANAGARI SIGN VISARGA
093A ; Other_Alphabetic # Mn DEVANAGARI VOWEL SIGN OE
093B ; Other_Alphabetic # Mc DEVANAGARI VOWEL SIGN OOE
@@ -457,6 +479,7 @@
0BC6..0BC8 ; Other_Alphabetic # Mc [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI
0BCA..0BCC ; Other_Alphabetic # Mc [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU
0BD7 ; Other_Alphabetic # Mc TAMIL AU LENGTH MARK
+0C00 ; Other_Alphabetic # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE
0C01..0C03 ; Other_Alphabetic # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA
0C3E..0C40 ; Other_Alphabetic # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
0C41..0C44 ; Other_Alphabetic # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR
@@ -464,6 +487,7 @@
0C4A..0C4C ; Other_Alphabetic # Mn [3] TELUGU VOWEL SIGN O..TELUGU VOWEL SIGN AU
0C55..0C56 ; Other_Alphabetic # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK
0C62..0C63 ; Other_Alphabetic # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL
+0C81 ; Other_Alphabetic # Mn KANNADA SIGN CANDRABINDU
0C82..0C83 ; Other_Alphabetic # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA
0CBE ; Other_Alphabetic # Mc KANNADA VOWEL SIGN AA
0CBF ; Other_Alphabetic # Mn KANNADA VOWEL SIGN I
@@ -474,6 +498,7 @@
0CCC ; Other_Alphabetic # Mn KANNADA VOWEL SIGN AU
0CD5..0CD6 ; Other_Alphabetic # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
0CE2..0CE3 ; Other_Alphabetic # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
+0D01 ; Other_Alphabetic # Mn MALAYALAM SIGN CANDRABINDU
0D02..0D03 ; Other_Alphabetic # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D3E..0D40 ; Other_Alphabetic # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II
0D41..0D44 ; Other_Alphabetic # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR
@@ -538,7 +563,8 @@
19B0..19C0 ; Other_Alphabetic # Mc [17] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE VOWEL SIGN IY
19C8..19C9 ; Other_Alphabetic # Mc [2] NEW TAI LUE TONE MARK-1..NEW TAI LUE TONE MARK-2
1A17..1A18 ; Other_Alphabetic # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U
-1A19..1A1B ; Other_Alphabetic # Mc [3] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN AE
+1A19..1A1A ; Other_Alphabetic # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O
+1A1B ; Other_Alphabetic # Mn BUGINESE VOWEL SIGN AE
1A55 ; Other_Alphabetic # Mc TAI THAM CONSONANT SIGN MEDIAL RA
1A56 ; Other_Alphabetic # Mn TAI THAM CONSONANT SIGN MEDIAL LA
1A57 ; Other_Alphabetic # Mc TAI THAM CONSONANT SIGN LA TANG LAI
@@ -564,7 +590,7 @@
1BA2..1BA5 ; Other_Alphabetic # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU
1BA6..1BA7 ; Other_Alphabetic # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG
1BA8..1BA9 ; Other_Alphabetic # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG
-1BAC..1BAD ; Other_Alphabetic # Mc [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA
+1BAC..1BAD ; Other_Alphabetic # Mn [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA
1BE7 ; Other_Alphabetic # Mc BATAK VOWEL SIGN E
1BE8..1BE9 ; Other_Alphabetic # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE
1BEA..1BEC ; Other_Alphabetic # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O
@@ -575,6 +601,7 @@
1C2C..1C33 ; Other_Alphabetic # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T
1C34..1C35 ; Other_Alphabetic # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG
1CF2..1CF3 ; Other_Alphabetic # Mc [2] VEDIC SIGN ARDHAVISARGA..VEDIC SIGN ROTATED ARDHAVISARGA
+1DE7..1DF4 ; Other_Alphabetic # Mn [14] COMBINING LATIN SMALL LETTER ALPHA..COMBINING LATIN SMALL LETTER U WITH DIAERESIS
24B6..24E9 ; Other_Alphabetic # So [52] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN SMALL LETTER Z
2DE0..2DFF ; Other_Alphabetic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS
A674..A67B ; Other_Alphabetic # Mn [8] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC LETTER OMEGA
@@ -616,6 +643,7 @@
ABE8 ; Other_Alphabetic # Mn MEETEI MAYEK VOWEL SIGN UNAP
ABE9..ABEA ; Other_Alphabetic # Mc [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG
FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA
+10376..1037A ; Other_Alphabetic # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII
10A01..10A03 ; Other_Alphabetic # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R
10A05..10A06 ; Other_Alphabetic # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O
10A0C..10A0F ; Other_Alphabetic # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA
@@ -636,14 +664,54 @@
111B3..111B5 ; Other_Alphabetic # Mc [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II
111B6..111BE ; Other_Alphabetic # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O
111BF ; Other_Alphabetic # Mc SHARADA VOWEL SIGN AU
+1122C..1122E ; Other_Alphabetic # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II
+1122F..11231 ; Other_Alphabetic # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI
+11232..11233 ; Other_Alphabetic # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU
+11234 ; Other_Alphabetic # Mn KHOJKI SIGN ANUSVARA
+11237 ; Other_Alphabetic # Mn KHOJKI SIGN SHADDA
+112DF ; Other_Alphabetic # Mn KHUDAWADI SIGN ANUSVARA
+112E0..112E2 ; Other_Alphabetic # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
+112E3..112E8 ; Other_Alphabetic # Mn [6] KHUDAWADI VOWEL SIGN U..KHUDAWADI VOWEL SIGN AU
+11301 ; Other_Alphabetic # Mn GRANTHA SIGN CANDRABINDU
+11302..11303 ; Other_Alphabetic # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA
+1133E..1133F ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I
+11340 ; Other_Alphabetic # Mn GRANTHA VOWEL SIGN II
+11341..11344 ; Other_Alphabetic # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR
+11347..11348 ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI
+1134B..1134C ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN OO..GRANTHA VOWEL SIGN AU
+11357 ; Other_Alphabetic # Mc GRANTHA AU LENGTH MARK
+11362..11363 ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL
+114B0..114B2 ; Other_Alphabetic # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II
+114B3..114B8 ; Other_Alphabetic # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL
+114B9 ; Other_Alphabetic # Mc TIRHUTA VOWEL SIGN E
+114BA ; Other_Alphabetic # Mn TIRHUTA VOWEL SIGN SHORT E
+114BB..114BE ; Other_Alphabetic # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU
+114BF..114C0 ; Other_Alphabetic # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA
+114C1 ; Other_Alphabetic # Mc TIRHUTA SIGN VISARGA
+115AF..115B1 ; Other_Alphabetic # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II
+115B2..115B5 ; Other_Alphabetic # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR
+115B8..115BB ; Other_Alphabetic # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU
+115BC..115BD ; Other_Alphabetic # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA
+115BE ; Other_Alphabetic # Mc SIDDHAM SIGN VISARGA
+11630..11632 ; Other_Alphabetic # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II
+11633..1163A ; Other_Alphabetic # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI
+1163B..1163C ; Other_Alphabetic # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU
+1163D ; Other_Alphabetic # Mn MODI SIGN ANUSVARA
+1163E ; Other_Alphabetic # Mc MODI SIGN VISARGA
+11640 ; Other_Alphabetic # Mn MODI SIGN ARDHACANDRA
116AB ; Other_Alphabetic # Mn TAKRI SIGN ANUSVARA
116AC ; Other_Alphabetic # Mc TAKRI SIGN VISARGA
116AD ; Other_Alphabetic # Mn TAKRI VOWEL SIGN AA
116AE..116AF ; Other_Alphabetic # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II
116B0..116B5 ; Other_Alphabetic # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU
+16B30..16B36 ; Other_Alphabetic # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
16F51..16F7E ; Other_Alphabetic # Mc [46] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN NG
+1BC9E ; Other_Alphabetic # Mn DUPLOYAN DOUBLE MARK
+1F130..1F149 ; Other_Alphabetic # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z
+1F150..1F169 ; Other_Alphabetic # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
+1F170..1F189 ; Other_Alphabetic # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
-# Total code points: 922
+# Total code points: 1116
# ================================================
@@ -746,6 +814,7 @@
1939..193B ; Diacritic # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I
1A75..1A7C ; Diacritic # Mn [8] TAI THAM SIGN TONE-1..TAI THAM SIGN KHUEN-LUE KARAN
1A7F ; Diacritic # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT
+1AB0..1ABD ; Diacritic # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
1B34 ; Diacritic # Mn BALINESE SIGN REREKAN
1B44 ; Diacritic # Mc BALINESE ADEG ADEG
1B6B..1B73 ; Diacritic # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG
@@ -760,8 +829,10 @@
1CE2..1CE8 ; Diacritic # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
1CED ; Diacritic # Mn VEDIC SIGN TIRYAK
1CF4 ; Diacritic # Mn VEDIC TONE CANDRA ABOVE
+1CF8..1CF9 ; Diacritic # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
1D2C..1D6A ; Diacritic # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI
1DC4..1DCF ; Diacritic # Mn [12] COMBINING MACRON-ACUTE..COMBINING ZIGZAG BELOW
+1DF5 ; Diacritic # Mn COMBINING UP TACK ABOVE
1DFD..1DFF ; Diacritic # Mn [3] COMBINING ALMOST EQUAL TO BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
1FBD ; Diacritic # Sk GREEK KORONIS
1FBF..1FC1 ; Diacritic # Sk [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI
@@ -779,6 +850,7 @@
A66F ; Diacritic # Mn COMBINING CYRILLIC VZMET
A67C..A67D ; Diacritic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK
A67F ; Diacritic # Lm CYRILLIC PAYEROK
+A69C..A69D ; Diacritic # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN
A6F0..A6F1 ; Diacritic # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS
A717..A71F ; Diacritic # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK
A720..A721 ; Diacritic # Sk [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE
@@ -791,26 +863,45 @@
A953 ; Diacritic # Mc REJANG VIRAMA
A9B3 ; Diacritic # Mn JAVANESE SIGN CECAK TELU
A9C0 ; Diacritic # Mc JAVANESE PANGKON
+A9E5 ; Diacritic # Mn MYANMAR SIGN SHAN SAW
AA7B ; Diacritic # Mc MYANMAR SIGN PAO KAREN TONE
+AA7C ; Diacritic # Mn MYANMAR SIGN TAI LAING TONE-2
+AA7D ; Diacritic # Mc MYANMAR SIGN TAI LAING TONE-5
AABF ; Diacritic # Mn TAI VIET TONE MAI EK
AAC0 ; Diacritic # Lo TAI VIET TONE MAI NUENG
AAC1 ; Diacritic # Mn TAI VIET TONE MAI THO
AAC2 ; Diacritic # Lo TAI VIET TONE MAI SONG
AAF6 ; Diacritic # Mn MEETEI MAYEK VIRAMA
+AB5B ; Diacritic # Sk MODIFIER BREVE WITH INVERTED BREVE
+AB5C..AB5F ; Diacritic # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
ABEC ; Diacritic # Mc MEETEI MAYEK LUM IYEK
ABED ; Diacritic # Mn MEETEI MAYEK APUN IYEK
FB1E ; Diacritic # Mn HEBREW POINT JUDEO-SPANISH VARIKA
-FE20..FE26 ; Diacritic # Mn [7] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON
+FE20..FE2D ; Diacritic # Mn [14] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON BELOW
FF3E ; Diacritic # Sk FULLWIDTH CIRCUMFLEX ACCENT
FF40 ; Diacritic # Sk FULLWIDTH GRAVE ACCENT
FF70 ; Diacritic # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
FF9E..FF9F ; Diacritic # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
FFE3 ; Diacritic # Sk FULLWIDTH MACRON
+102E0 ; Diacritic # Mn COPTIC EPACT THOUSANDS MARK
+10AE5..10AE6 ; Diacritic # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
110B9..110BA ; Diacritic # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA
11133..11134 ; Diacritic # Mn [2] CHAKMA VIRAMA..CHAKMA MAAYYAA
+11173 ; Diacritic # Mn MAHAJANI SIGN NUKTA
111C0 ; Diacritic # Mc SHARADA SIGN VIRAMA
+11235 ; Diacritic # Mc KHOJKI SIGN VIRAMA
+11236 ; Diacritic # Mn KHOJKI SIGN NUKTA
+112E9..112EA ; Diacritic # Mn [2] KHUDAWADI SIGN NUKTA..KHUDAWADI SIGN VIRAMA
+1133C ; Diacritic # Mn GRANTHA SIGN NUKTA
+1134D ; Diacritic # Mc GRANTHA SIGN VIRAMA
+11366..1136C ; Diacritic # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX
+11370..11374 ; Diacritic # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA
+114C2..114C3 ; Diacritic # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA
+115BF..115C0 ; Diacritic # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA
+1163F ; Diacritic # Mn MODI SIGN VIRAMA
116B6 ; Diacritic # Mc TAKRI SIGN VIRAMA
116B7 ; Diacritic # Mn TAKRI SIGN NUKTA
+16AF0..16AF4 ; Diacritic # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
16F8F..16F92 ; Diacritic # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW
16F93..16F9F ; Diacritic # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8
1D167..1D169 ; Diacritic # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
@@ -818,8 +909,9 @@
1D17B..1D182 ; Diacritic # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE
1D185..1D18B ; Diacritic # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
1D1AA..1D1AD ; Diacritic # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO
+1E8D0..1E8D6 ; Diacritic # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
-# Total code points: 693
+# Total code points: 766
# ================================================
@@ -841,12 +933,16 @@
A015 ; Extender # Lm YI SYLLABLE WU
A60C ; Extender # Lm VAI SYLLABLE LENGTHENER
A9CF ; Extender # Lm JAVANESE PANGRANGKEP
+A9E6 ; Extender # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION
AA70 ; Extender # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION
AADD ; Extender # Lm TAI VIET SYMBOL SAM
AAF3..AAF4 ; Extender # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK
FF70 ; Extender # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
+1135D ; Extender # Lo GRANTHA SIGN PLUTA
+115C6..115C8 ; Extender # Po [3] SIDDHAM REPETITION MARK-1..SIDDHAM REPETITION MARK-3
+16B42..16B43 ; Extender # Lm [2] PAHAWH HMONG SIGN VOS NRUA..PAHAWH HMONG SIGN IB YAM
-# Total code points: 31
+# Total code points: 38
# ================================================
@@ -866,17 +962,22 @@
2170..217F ; Other_Lowercase # Nl [16] SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND
24D0..24E9 ; Other_Lowercase # So [26] CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
2C7C..2C7D ; Other_Lowercase # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V
+A69C..A69D ; Other_Lowercase # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN
A770 ; Other_Lowercase # Lm MODIFIER LETTER US
A7F8..A7F9 ; Other_Lowercase # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
+AB5C..AB5F ; Other_Lowercase # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
-# Total code points: 183
+# Total code points: 189
# ================================================
2160..216F ; Other_Uppercase # Nl [16] ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND
24B6..24CF ; Other_Uppercase # So [26] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z
+1F130..1F149 ; Other_Uppercase # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z
+1F150..1F169 ; Other_Uppercase # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
+1F170..1F189 ; Other_Uppercase # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
-# Total code points: 42
+# Total code points: 120
# ================================================
@@ -918,10 +1019,15 @@
200C..200D ; Other_Grapheme_Extend # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
302E..302F ; Other_Grapheme_Extend # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK
FF9E..FF9F ; Other_Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
+1133E ; Other_Grapheme_Extend # Mc GRANTHA VOWEL SIGN AA
+11357 ; Other_Grapheme_Extend # Mc GRANTHA AU LENGTH MARK
+114B0 ; Other_Grapheme_Extend # Mc TIRHUTA VOWEL SIGN AA
+114BD ; Other_Grapheme_Extend # Mc TIRHUTA VOWEL SIGN SHORT O
+115AF ; Other_Grapheme_Extend # Mc SIDDHAM VOWEL SIGN AA
1D165 ; Other_Grapheme_Extend # Mc MUSICAL SYMBOL COMBINING STEM
1D16E..1D172 ; Other_Grapheme_Extend # Mc [5] MUSICAL SYMBOL COMBINING FLAG-1..MUSICAL SYMBOL COMBINING FLAG-5
-# Total code points: 25
+# Total code points: 30
# ================================================
@@ -966,7 +1072,7 @@
034F ; Other_Default_Ignorable_Code_Point # Mn COMBINING GRAPHEME JOINER
115F..1160 ; Other_Default_Ignorable_Code_Point # Lo [2] HANGUL CHOSEONG FILLER..HANGUL JUNGSEONG FILLER
17B4..17B5 ; Other_Default_Ignorable_Code_Point # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
-2065..2069 ; Other_Default_Ignorable_Code_Point # Cn [5] <reserved-2065>..<reserved-2069>
+2065 ; Other_Default_Ignorable_Code_Point # Cn <reserved-2065>
3164 ; Other_Default_Ignorable_Code_Point # Lo HANGUL FILLER
FFA0 ; Other_Default_Ignorable_Code_Point # Lo HALFWIDTH HANGUL FILLER
FFF0..FFF8 ; Other_Default_Ignorable_Code_Point # Cn [9] <reserved-FFF0>..<reserved-FFF8>
@@ -975,7 +1081,7 @@
E0080..E00FF ; Other_Default_Ignorable_Code_Point # Cn [128] <reserved-E0080>..<reserved-E00FF>
E01F0..E0FFF ; Other_Default_Ignorable_Code_Point # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
-# Total code points: 3780
+# Total code points: 3776
# ================================================
@@ -1060,8 +1166,6 @@
0021 ; STerm # Po EXCLAMATION MARK
002E ; STerm # Po FULL STOP
003F ; STerm # Po QUESTION MARK
-055C ; STerm # Po ARMENIAN EXCLAMATION MARK
-055E ; STerm # Po ARMENIAN QUESTION MARK
0589 ; STerm # Po ARMENIAN FULL STOP
061F ; STerm # Po ARABIC QUESTION MARK
06D4 ; STerm # Po ARABIC FULL STOP
@@ -1084,6 +1188,7 @@
203C..203D ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG
2047..2049 ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK
2E2E ; STerm # Po REVERSED QUESTION MARK
+2E3C ; STerm # Po STENOGRAPHIC FULL STOP
3002 ; STerm # Po IDEOGRAPHIC FULL STOP
A4FF ; STerm # Po LISU PUNCTUATION FULL STOP
A60E..A60F ; STerm # Po [2] VAI FULL STOP..VAI QUESTION MARK
@@ -1107,8 +1212,19 @@
110BE..110C1 ; STerm # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA
11141..11143 ; STerm # Po [3] CHAKMA DANDA..CHAKMA QUESTION MARK
111C5..111C6 ; STerm # Po [2] SHARADA DANDA..SHARADA DOUBLE DANDA
+111CD ; STerm # Po SHARADA SUTRA MARK
+11238..11239 ; STerm # Po [2] KHOJKI DANDA..KHOJKI DOUBLE DANDA
+1123B..1123C ; STerm # Po [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK
+115C2..115C3 ; STerm # Po [2] SIDDHAM DANDA..SIDDHAM DOUBLE DANDA
+115C9 ; STerm # Po SIDDHAM END OF TEXT MARK
+11641..11642 ; STerm # Po [2] MODI DANDA..MODI DOUBLE DANDA
+16A6E..16A6F ; STerm # Po [2] MRO DANDA..MRO DOUBLE DANDA
+16AF5 ; STerm # Po BASSA VAH FULL STOP
+16B37..16B38 ; STerm # Po [2] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS TSHAB CEEB
+16B44 ; STerm # Po PAHAWH HMONG SIGN XAUS
+1BC9F ; STerm # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
-# Total code points: 83
+# Total code points: 99
# ================================================
@@ -1210,7 +1326,10 @@
21D5..21F3 ; Pattern_Syntax # So [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW
21F4..22FF ; Pattern_Syntax # Sm [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION BAG MEMBERSHIP
2300..2307 ; Pattern_Syntax # So [8] DIAMETER SIGN..WAVY LINE
-2308..230B ; Pattern_Syntax # Sm [4] LEFT CEILING..RIGHT FLOOR
+2308 ; Pattern_Syntax # Ps LEFT CEILING
+2309 ; Pattern_Syntax # Pe RIGHT CEILING
+230A ; Pattern_Syntax # Ps LEFT FLOOR
+230B ; Pattern_Syntax # Pe RIGHT FLOOR
230C..231F ; Pattern_Syntax # So [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER
2320..2321 ; Pattern_Syntax # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL
2322..2328 ; Pattern_Syntax # So [7] FROWN..KEYBOARD
@@ -1222,8 +1341,8 @@
239B..23B3 ; Pattern_Syntax # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM
23B4..23DB ; Pattern_Syntax # So [40] TOP SQUARE BRACKET..FUSE
23DC..23E1 ; Pattern_Syntax # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET
-23E2..23F3 ; Pattern_Syntax # So [18] WHITE TRAPEZIUM..HOURGLASS WITH FLOWING SAND
-23F4..23FF ; Pattern_Syntax # Cn [12] <reserved-23F4>..<reserved-23FF>
+23E2..23FA ; Pattern_Syntax # So [25] WHITE TRAPEZIUM..BLACK CIRCLE FOR RECORD
+23FB..23FF ; Pattern_Syntax # Cn [5] <reserved-23FB>..<reserved-23FF>
2400..2426 ; Pattern_Syntax # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO
2427..243F ; Pattern_Syntax # Cn [25] <reserved-2427>..<reserved-243F>
2440..244A ; Pattern_Syntax # So [11] OCR HOOK..OCR DOUBLE BACKSLASH
@@ -1236,9 +1355,7 @@
25F8..25FF ; Pattern_Syntax # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE
2600..266E ; Pattern_Syntax # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN
266F ; Pattern_Syntax # Sm MUSIC SHARP SIGN
-2670..26FF ; Pattern_Syntax # So [144] WEST SYRIAC CROSS..WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE
-2700 ; Pattern_Syntax # Cn <reserved-2700>
-2701..2767 ; Pattern_Syntax # So [103] UPPER BLADE SCISSORS..ROTATED FLORAL HEART BULLET
+2670..2767 ; Pattern_Syntax # So [248] WEST SYRIAC CROSS..ROTATED FLORAL HEART BULLET
2768 ; Pattern_Syntax # Ps MEDIUM LEFT PARENTHESIS ORNAMENT
2769 ; Pattern_Syntax # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT
276A ; Pattern_Syntax # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
@@ -1306,9 +1423,16 @@
2B30..2B44 ; Pattern_Syntax # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET
2B45..2B46 ; Pattern_Syntax # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW
2B47..2B4C ; Pattern_Syntax # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR
-2B4D..2B4F ; Pattern_Syntax # Cn [3] <reserved-2B4D>..<reserved-2B4F>
-2B50..2B59 ; Pattern_Syntax # So [10] WHITE MEDIUM STAR..HEAVY CIRCLED SALTIRE
-2B5A..2BFF ; Pattern_Syntax # Cn [166] <reserved-2B5A>..<reserved-2BFF>
+2B4D..2B73 ; Pattern_Syntax # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR
+2B74..2B75 ; Pattern_Syntax # Cn [2] <reserved-2B74>..<reserved-2B75>
+2B76..2B95 ; Pattern_Syntax # So [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW
+2B96..2B97 ; Pattern_Syntax # Cn [2] <reserved-2B96>..<reserved-2B97>
+2B98..2BB9 ; Pattern_Syntax # So [34] THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD..UP ARROWHEAD IN A RECTANGLE BOX
+2BBA..2BBC ; Pattern_Syntax # Cn [3] <reserved-2BBA>..<reserved-2BBC>
+2BBD..2BC8 ; Pattern_Syntax # So [12] BALLOT BOX WITH LIGHT X..BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED
+2BC9 ; Pattern_Syntax # Cn <reserved-2BC9>
+2BCA..2BD1 ; Pattern_Syntax # So [8] TOP HALF BLACK CIRCLE..UNCERTAINTY SIGN
+2BD2..2BFF ; Pattern_Syntax # Cn [46] <reserved-2BD2>..<reserved-2BFF>
2E00..2E01 ; Pattern_Syntax # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER
2E02 ; Pattern_Syntax # Pi LEFT SUBSTITUTION BRACKET
2E03 ; Pattern_Syntax # Pf RIGHT SUBSTITUTION BRACKET
@@ -1342,7 +1466,11 @@
2E2F ; Pattern_Syntax # Lm VERTICAL TILDE
2E30..2E39 ; Pattern_Syntax # Po [10] RING POINT..TOP HALF SECTION SIGN
2E3A..2E3B ; Pattern_Syntax # Pd [2] TWO-EM DASH..THREE-EM DASH
-2E3C..2E7F ; Pattern_Syntax # Cn [68] <reserved-2E3C>..<reserved-2E7F>
+2E3C..2E3F ; Pattern_Syntax # Po [4] STENOGRAPHIC FULL STOP..CAPITULUM
+2E40 ; Pattern_Syntax # Pd DOUBLE HYPHEN
+2E41 ; Pattern_Syntax # Po REVERSED COMMA
+2E42 ; Pattern_Syntax # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK
+2E43..2E7F ; Pattern_Syntax # Cn [61] <reserved-2E43>..<reserved-2E7F>
3001..3003 ; Pattern_Syntax # Po [3] IDEOGRAPHIC COMMA..DITTO MARK
3008 ; Pattern_Syntax # Ps LEFT ANGLE BRACKET
3009 ; Pattern_Syntax # Pe RIGHT ANGLE BRACKET
@@ -1368,8 +1496,8 @@
301E..301F ; Pattern_Syntax # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
3020 ; Pattern_Syntax # So POSTAL MARK FACE
3030 ; Pattern_Syntax # Pd WAVY DASH
-FD3E ; Pattern_Syntax # Ps ORNATE LEFT PARENTHESIS
-FD3F ; Pattern_Syntax # Pe ORNATE RIGHT PARENTHESIS
+FD3E ; Pattern_Syntax # Pe ORNATE LEFT PARENTHESIS
+FD3F ; Pattern_Syntax # Ps ORNATE RIGHT PARENTHESIS
FE45..FE46 ; Pattern_Syntax # Po [2] SESAME DOT..WHITE SESAME DOT
# Total code points: 2760
--- a/jdk/make/data/unicodedata/Scripts.txt Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/make/data/unicodedata/Scripts.txt Wed Jul 15 11:05:51 2015 +0900
@@ -1,8 +1,8 @@
-# Scripts-6.2.0.txt
-# Date: 2012-06-04, 17:21:29 GMT [MD]
+# Scripts-7.0.0.txt
+# Date: 2014-05-15, 00:11:35 GMT [MD]
#
# Unicode Character Database
-# Copyright (c) 1991-2012 Unicode, Inc.
+# Copyright (c) 1991-2014 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@@ -83,8 +83,10 @@
0385 ; Common # Sk GREEK DIALYTIKA TONOS
0387 ; Common # Po GREEK ANO TELEIA
0589 ; Common # Po ARMENIAN FULL STOP
+0605 ; Common # Cf ARABIC NUMBER MARK ABOVE
060C ; Common # Po ARABIC COMMA
061B ; Common # Po ARABIC SEMICOLON
+061C ; Common # Cf ARABIC LETTER MARK
061F ; Common # Po ARABIC QUESTION MARK
0640 ; Common # Lm ARABIC TATWEEL
0660..0669 ; Common # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
@@ -136,7 +138,7 @@
2055..205E ; Common # Po [10] FLOWER PUNCTUATION MARK..VERTICAL FOUR DOTS
205F ; Common # Zs MEDIUM MATHEMATICAL SPACE
2060..2064 ; Common # Cf [5] WORD JOINER..INVISIBLE PLUS
-206A..206F ; Common # Cf [6] INHIBIT SYMMETRIC SWAPPING..NOMINAL DIGIT SHAPES
+2066..206F ; Common # Cf [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES
2070 ; Common # No SUPERSCRIPT ZERO
2074..2079 ; Common # No [6] SUPERSCRIPT FOUR..SUPERSCRIPT NINE
207A..207C ; Common # Sm [3] SUPERSCRIPT PLUS SIGN..SUPERSCRIPT EQUALS SIGN
@@ -146,7 +148,7 @@
208A..208C ; Common # Sm [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN
208D ; Common # Ps SUBSCRIPT LEFT PARENTHESIS
208E ; Common # Pe SUBSCRIPT RIGHT PARENTHESIS
-20A0..20BA ; Common # Sc [27] EURO-CURRENCY SIGN..TURKISH LIRA SIGN
+20A0..20BD ; Common # Sc [30] EURO-CURRENCY SIGN..RUBLE SIGN
2100..2101 ; Common # So [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT
2102 ; Common # L& DOUBLE-STRUCK CAPITAL C
2103..2106 ; Common # So [4] DEGREE CELSIUS..CADA UNA
@@ -200,7 +202,10 @@
21D5..21F3 ; Common # So [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW
21F4..22FF ; Common # Sm [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION BAG MEMBERSHIP
2300..2307 ; Common # So [8] DIAMETER SIGN..WAVY LINE
-2308..230B ; Common # Sm [4] LEFT CEILING..RIGHT FLOOR
+2308 ; Common # Ps LEFT CEILING
+2309 ; Common # Pe RIGHT CEILING
+230A ; Common # Ps LEFT FLOOR
+230B ; Common # Pe RIGHT FLOOR
230C..231F ; Common # So [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER
2320..2321 ; Common # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL
2322..2328 ; Common # So [7] FROWN..KEYBOARD
@@ -212,7 +217,7 @@
239B..23B3 ; Common # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM
23B4..23DB ; Common # So [40] TOP SQUARE BRACKET..FUSE
23DC..23E1 ; Common # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET
-23E2..23F3 ; Common # So [18] WHITE TRAPEZIUM..HOURGLASS WITH FLOWING SAND
+23E2..23FA ; Common # So [25] WHITE TRAPEZIUM..BLACK CIRCLE FOR RECORD
2400..2426 ; Common # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO
2440..244A ; Common # So [11] OCR HOOK..OCR DOUBLE BACKSLASH
2460..249B ; Common # No [60] CIRCLED DIGIT ONE..NUMBER TWENTY FULL STOP
@@ -226,8 +231,7 @@
25F8..25FF ; Common # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE
2600..266E ; Common # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN
266F ; Common # Sm MUSIC SHARP SIGN
-2670..26FF ; Common # So [144] WEST SYRIAC CROSS..WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE
-2701..2767 ; Common # So [103] UPPER BLADE SCISSORS..ROTATED FLORAL HEART BULLET
+2670..2767 ; Common # So [248] WEST SYRIAC CROSS..ROTATED FLORAL HEART BULLET
2768 ; Common # Ps MEDIUM LEFT PARENTHESIS ORNAMENT
2769 ; Common # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT
276A ; Common # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
@@ -295,7 +299,11 @@
2B30..2B44 ; Common # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET
2B45..2B46 ; Common # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW
2B47..2B4C ; Common # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR
-2B50..2B59 ; Common # So [10] WHITE MEDIUM STAR..HEAVY CIRCLED SALTIRE
+2B4D..2B73 ; Common # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR
+2B76..2B95 ; Common # So [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW
+2B98..2BB9 ; Common # So [34] THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD..UP ARROWHEAD IN A RECTANGLE BOX
+2BBD..2BC8 ; Common # So [12] BALLOT BOX WITH LIGHT X..BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED
+2BCA..2BD1 ; Common # So [8] TOP HALF BLACK CIRCLE..UNCERTAINTY SIGN
2E00..2E01 ; Common # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER
2E02 ; Common # Pi LEFT SUBSTITUTION BRACKET
2E03 ; Common # Pf RIGHT SUBSTITUTION BRACKET
@@ -329,6 +337,10 @@
2E2F ; Common # Lm VERTICAL TILDE
2E30..2E39 ; Common # Po [10] RING POINT..TOP HALF SECTION SIGN
2E3A..2E3B ; Common # Pd [2] TWO-EM DASH..THREE-EM DASH
+2E3C..2E3F ; Common # Po [4] STENOGRAPHIC FULL STOP..CAPITULUM
+2E40 ; Common # Pd DOUBLE HYPHEN
+2E41 ; Common # Po REVERSED COMMA
+2E42 ; Common # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK
2FF0..2FFB ; Common # So [12] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID
3000 ; Common # Zs IDEOGRAPHIC SPACE
3001..3003 ; Common # Po [3] IDEOGRAPHIC COMMA..DITTO MARK
@@ -392,9 +404,11 @@
A836..A837 ; Common # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
A838 ; Common # Sc NORTH INDIC RUPEE MARK
A839 ; Common # So NORTH INDIC QUANTITY MARK
-FD3E ; Common # Ps ORNATE LEFT PARENTHESIS
-FD3F ; Common # Pe ORNATE RIGHT PARENTHESIS
-FDFD ; Common # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
+A92E ; Common # Po KAYAH LI SIGN CWI
+A9CF ; Common # Lm JAVANESE PANGRANGKEP
+AB5B ; Common # Sk MODIFIER BREVE WITH INVERTED BREVE
+FD3E ; Common # Pe ORNATE LEFT PARENTHESIS
+FD3F ; Common # Ps ORNATE RIGHT PARENTHESIS
FE10..FE16 ; Common # Po [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK
FE17 ; Common # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET
FE18 ; Common # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET
@@ -487,6 +501,8 @@
10137..1013F ; Common # So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT
10190..1019B ; Common # So [12] ROMAN SEXTANS SIGN..ROMAN CENTURIAL SIGN
101D0..101FC ; Common # So [45] PHAISTOS DISC SIGN PEDESTRIAN..PHAISTOS DISC SIGN WAVY BAND
+102E1..102FB ; Common # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
+1BCA0..1BCA3 ; Common # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
1D000..1D0F5 ; Common # So [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO
1D100..1D126 ; Common # So [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2
1D129..1D164 ; Common # So [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
@@ -543,10 +559,10 @@
1F000..1F02B ; Common # So [44] MAHJONG TILE EAST WIND..MAHJONG TILE BACK
1F030..1F093 ; Common # So [100] DOMINO TILE HORIZONTAL BACK..DOMINO TILE VERTICAL-06-06
1F0A0..1F0AE ; Common # So [15] PLAYING CARD BACK..PLAYING CARD KING OF SPADES
-1F0B1..1F0BE ; Common # So [14] PLAYING CARD ACE OF HEARTS..PLAYING CARD KING OF HEARTS
+1F0B1..1F0BF ; Common # So [15] PLAYING CARD ACE OF HEARTS..PLAYING CARD RED JOKER
1F0C1..1F0CF ; Common # So [15] PLAYING CARD ACE OF DIAMONDS..PLAYING CARD BLACK JOKER
-1F0D1..1F0DF ; Common # So [15] PLAYING CARD ACE OF CLUBS..PLAYING CARD WHITE JOKER
-1F100..1F10A ; Common # No [11] DIGIT ZERO FULL STOP..DIGIT NINE COMMA
+1F0D1..1F0F5 ; Common # So [37] PLAYING CARD ACE OF CLUBS..PLAYING CARD TRUMP-21
+1F100..1F10C ; Common # No [13] DIGIT ZERO FULL STOP..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO
1F110..1F12E ; Common # So [31] PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED WZ
1F130..1F16B ; Common # So [60] SQUARED LATIN CAPITAL LETTER A..RAISED MD SIGN
1F170..1F19A ; Common # So [43] NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VS
@@ -555,28 +571,29 @@
1F210..1F23A ; Common # So [43] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-55B6
1F240..1F248 ; Common # So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557
1F250..1F251 ; Common # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
-1F300..1F320 ; Common # So [33] CYCLONE..SHOOTING STAR
-1F330..1F335 ; Common # So [6] CHESTNUT..CACTUS
-1F337..1F37C ; Common # So [70] TULIP..BABY BOTTLE
-1F380..1F393 ; Common # So [20] RIBBON..GRADUATION CAP
-1F3A0..1F3C4 ; Common # So [37] CAROUSEL HORSE..SURFER
-1F3C6..1F3CA ; Common # So [5] TROPHY..SWIMMER
-1F3E0..1F3F0 ; Common # So [17] HOUSE BUILDING..EUROPEAN CASTLE
-1F400..1F43E ; Common # So [63] RAT..PAW PRINTS
-1F440 ; Common # So EYES
-1F442..1F4F7 ; Common # So [182] EAR..CAMERA
-1F4F9..1F4FC ; Common # So [4] VIDEO CAMERA..VIDEOCASSETTE
-1F500..1F53D ; Common # So [62] TWISTED RIGHTWARDS ARROWS..DOWN-POINTING SMALL RED TRIANGLE
-1F540..1F543 ; Common # So [4] CIRCLED CROSS POMMEE..NOTCHED LEFT SEMICIRCLE WITH THREE DOTS
-1F550..1F567 ; Common # So [24] CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY
-1F5FB..1F640 ; Common # So [70] MOUNT FUJI..WEARY CAT FACE
-1F645..1F64F ; Common # So [11] FACE WITH NO GOOD GESTURE..PERSON WITH FOLDED HANDS
-1F680..1F6C5 ; Common # So [70] ROCKET..LEFT LUGGAGE
+1F300..1F32C ; Common # So [45] CYCLONE..WIND BLOWING FACE
+1F330..1F37D ; Common # So [78] CHESTNUT..FORK AND KNIFE WITH PLATE
+1F380..1F3CE ; Common # So [79] RIBBON..RACING CAR
+1F3D4..1F3F7 ; Common # So [36] SNOW CAPPED MOUNTAIN..LABEL
+1F400..1F4FE ; Common # So [255] RAT..PORTABLE STEREO
+1F500..1F54A ; Common # So [75] TWISTED RIGHTWARDS ARROWS..DOVE OF PEACE
+1F550..1F579 ; Common # So [42] CLOCK FACE ONE OCLOCK..JOYSTICK
+1F57B..1F5A3 ; Common # So [41] LEFT HAND TELEPHONE RECEIVER..BLACK DOWN POINTING BACKHAND INDEX
+1F5A5..1F642 ; Common # So [158] DESKTOP COMPUTER..SLIGHTLY SMILING FACE
+1F645..1F6CF ; Common # So [139] FACE WITH NO GOOD GESTURE..BED
+1F6E0..1F6EC ; Common # So [13] HAMMER AND WRENCH..AIRPLANE ARRIVING
+1F6F0..1F6F3 ; Common # So [4] SATELLITE..PASSENGER SHIP
1F700..1F773 ; Common # So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
+1F780..1F7D4 ; Common # So [85] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..HEAVY TWELVE POINTED PINWHEEL STAR
+1F800..1F80B ; Common # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
+1F810..1F847 ; Common # So [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW
+1F850..1F859 ; Common # So [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW
+1F860..1F887 ; Common # So [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW
+1F890..1F8AD ; Common # So [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS
E0001 ; Common # Cf LANGUAGE TAG
E0020..E007F ; Common # Cf [96] TAG SPACE..CANCEL TAG
-# Total code points: 6413
+# Total code points: 7129
# ================================================
@@ -618,16 +635,20 @@
A770 ; Latin # Lm MODIFIER LETTER US
A771..A787 ; Latin # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T
A78B..A78E ; Latin # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT
-A790..A793 ; Latin # L& [4] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER C WITH BAR
-A7A0..A7AA ; Latin # L& [11] LATIN CAPITAL LETTER G WITH OBLIQUE STROKE..LATIN CAPITAL LETTER H WITH HOOK
+A790..A7AD ; Latin # L& [30] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER L WITH BELT
+A7B0..A7B1 ; Latin # L& [2] LATIN CAPITAL LETTER TURNED K..LATIN CAPITAL LETTER TURNED T
+A7F7 ; Latin # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I
A7F8..A7F9 ; Latin # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
A7FA ; Latin # L& LATIN LETTER SMALL CAPITAL TURNED M
A7FB..A7FF ; Latin # Lo [5] LATIN EPIGRAPHIC LETTER REVERSED F..LATIN EPIGRAPHIC LETTER ARCHAIC M
+AB30..AB5A ; Latin # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
+AB5C..AB5F ; Latin # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
+AB64 ; Latin # L& LATIN SMALL LETTER INVERTED ALPHA
FB00..FB06 ; Latin # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST
FF21..FF3A ; Latin # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
FF41..FF5A ; Latin # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
-# Total code points: 1272
+# Total code points: 1338
# ================================================
@@ -636,6 +657,7 @@
0376..0377 ; Greek # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
037A ; Greek # Lm GREEK YPOGEGRAMMENI
037B..037D ; Greek # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL
+037F ; Greek # L& GREEK CAPITAL LETTER YOT
0384 ; Greek # Sk GREEK TONOS
0386 ; Greek # L& GREEK CAPITAL LETTER ALPHA WITH TONOS
0388..038A ; Greek # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS
@@ -675,15 +697,18 @@
1FF6..1FFC ; Greek # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
1FFD..1FFE ; Greek # Sk [2] GREEK OXIA..GREEK DASIA
2126 ; Greek # L& OHM SIGN
+AB65 ; Greek # L& GREEK LETTER SMALL CAPITAL OMEGA
10140..10174 ; Greek # Nl [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS
10175..10178 ; Greek # No [4] GREEK ONE HALF SIGN..GREEK THREE QUARTERS SIGN
10179..10189 ; Greek # So [17] GREEK YEAR SIGN..GREEK TRYBLION BASE SIGN
-1018A ; Greek # No GREEK ZERO SIGN
+1018A..1018B ; Greek # No [2] GREEK ZERO SIGN..GREEK ONE QUARTER SIGN
+1018C ; Greek # So GREEK SINUSOID SIGN
+101A0 ; Greek # So GREEK SYMBOL TAU RHO
1D200..1D241 ; Greek # So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
1D242..1D244 ; Greek # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
1D245 ; Greek # So GREEK MUSICAL LEIMMA
-# Total code points: 511
+# Total code points: 516
# ================================================
@@ -692,7 +717,7 @@
0483..0484 ; Cyrillic # Mn [2] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC PALATALIZATION
0487 ; Cyrillic # Mn COMBINING CYRILLIC POKRYTIE
0488..0489 ; Cyrillic # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN
-048A..0527 ; Cyrillic # L& [158] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER SHHA WITH DESCENDER
+048A..052F ; Cyrillic # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER
1D2B ; Cyrillic # L& CYRILLIC LETTER SMALL CAPITAL EL
1D78 ; Cyrillic # Lm MODIFIER LETTER CYRILLIC EN
2DE0..2DFF ; Cyrillic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS
@@ -704,10 +729,11 @@
A674..A67D ; Cyrillic # Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK
A67E ; Cyrillic # Po CYRILLIC KAVYKA
A67F ; Cyrillic # Lm CYRILLIC PAYEROK
-A680..A697 ; Cyrillic # L& [24] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER SHWE
+A680..A69B ; Cyrillic # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O
+A69C..A69D ; Cyrillic # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN
A69F ; Cyrillic # Mn COMBINING CYRILLIC LETTER IOTIFIED E
-# Total code points: 417
+# Total code points: 431
# ================================================
@@ -716,10 +742,11 @@
055A..055F ; Armenian # Po [6] ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION MARK
0561..0587 ; Armenian # L& [39] ARMENIAN SMALL LETTER AYB..ARMENIAN SMALL LIGATURE ECH YIWN
058A ; Armenian # Pd ARMENIAN HYPHEN
+058D..058E ; Armenian # So [2] RIGHT-FACING ARMENIAN ETERNITY SIGN..LEFT-FACING ARMENIAN ETERNITY SIGN
058F ; Armenian # Sc ARMENIAN DRAM SIGN
FB13..FB17 ; Armenian # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH
-# Total code points: 91
+# Total code points: 93
# ================================================
@@ -779,9 +806,8 @@
06FD..06FE ; Arabic # So [2] ARABIC SIGN SINDHI AMPERSAND..ARABIC SIGN SINDHI POSTPOSITION MEN
06FF ; Arabic # Lo ARABIC LETTER HEH WITH INVERTED V
0750..077F ; Arabic # Lo [48] ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS ABOVE
-08A0 ; Arabic # Lo ARABIC LETTER BEH WITH SMALL V BELOW
-08A2..08AC ; Arabic # Lo [11] ARABIC LETTER JEEM WITH TWO DOTS ABOVE..ARABIC LETTER ROHINGYA YEH
-08E4..08FE ; Arabic # Mn [27] ARABIC CURLY FATHA..ARABIC DAMMA WITH DOT
+08A0..08B2 ; Arabic # Lo [19] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER ZAIN WITH INVERTED V ABOVE
+08E4..08FF ; Arabic # Mn [28] ARABIC CURLY FATHA..ARABIC MARK SIDEWAYS NOON GHUNNA
FB50..FBB1 ; Arabic # Lo [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM
FBB2..FBC1 ; Arabic # Sk [16] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL SMALL TAH BELOW
FBD3..FD3D ; Arabic # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM
@@ -789,6 +815,7 @@
FD92..FDC7 ; Arabic # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM
FDF0..FDFB ; Arabic # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU
FDFC ; Arabic # Sc RIAL SIGN
+FDFD ; Arabic # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
FE70..FE74 ; Arabic # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM
FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM
10E60..10E7E ; Arabic # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS
@@ -827,7 +854,7 @@
1EEAB..1EEBB ; Arabic # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
1EEF0..1EEF1 ; Arabic # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL
-# Total code points: 1235
+# Total code points: 1244
# ================================================
@@ -870,17 +897,17 @@
0966..096F ; Devanagari # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
0970 ; Devanagari # Po DEVANAGARI ABBREVIATION SIGN
0971 ; Devanagari # Lm DEVANAGARI SIGN HIGH SPACING DOT
-0972..0977 ; Devanagari # Lo [6] DEVANAGARI LETTER CANDRA A..DEVANAGARI LETTER UUE
-0979..097F ; Devanagari # Lo [7] DEVANAGARI LETTER ZHA..DEVANAGARI LETTER BBA
+0972..097F ; Devanagari # Lo [14] DEVANAGARI LETTER CANDRA A..DEVANAGARI LETTER BBA
A8E0..A8F1 ; Devanagari # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA
A8F2..A8F7 ; Devanagari # Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA
A8F8..A8FA ; Devanagari # Po [3] DEVANAGARI SIGN PUSHPIKA..DEVANAGARI CARET
A8FB ; Devanagari # Lo DEVANAGARI HEADSTROKE
-# Total code points: 151
+# Total code points: 152
# ================================================
+0980 ; Bengali # Lo BENGALI ANJI
0981 ; Bengali # Mn BENGALI SIGN CANDRABINDU
0982..0983 ; Bengali # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA
0985..098C ; Bengali # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L
@@ -908,7 +935,7 @@
09FA ; Bengali # So BENGALI ISSHAR
09FB ; Bengali # Sc BENGALI GANDA MARK
-# Total code points: 92
+# Total code points: 93
# ================================================
@@ -1025,12 +1052,12 @@
# ================================================
+0C00 ; Telugu # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE
0C01..0C03 ; Telugu # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA
0C05..0C0C ; Telugu # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L
0C0E..0C10 ; Telugu # Lo [3] TELUGU LETTER E..TELUGU LETTER AI
0C12..0C28 ; Telugu # Lo [23] TELUGU LETTER O..TELUGU LETTER NA
-0C2A..0C33 ; Telugu # Lo [10] TELUGU LETTER PA..TELUGU LETTER LLA
-0C35..0C39 ; Telugu # Lo [5] TELUGU LETTER VA..TELUGU LETTER HA
+0C2A..0C39 ; Telugu # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA
0C3D ; Telugu # Lo TELUGU SIGN AVAGRAHA
0C3E..0C40 ; Telugu # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
0C41..0C44 ; Telugu # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR
@@ -1044,10 +1071,11 @@
0C78..0C7E ; Telugu # No [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR
0C7F ; Telugu # So TELUGU SIGN TUUMU
-# Total code points: 93
+# Total code points: 95
# ================================================
+0C81 ; Kannada # Mn KANNADA SIGN CANDRABINDU
0C82..0C83 ; Kannada # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA
0C85..0C8C ; Kannada # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L
0C8E..0C90 ; Kannada # Lo [3] KANNADA LETTER E..KANNADA LETTER AI
@@ -1070,10 +1098,11 @@
0CE6..0CEF ; Kannada # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
0CF1..0CF2 ; Kannada # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
-# Total code points: 86
+# Total code points: 87
# ================================================
+0D01 ; Malayalam # Mn MALAYALAM SIGN CANDRABINDU
0D02..0D03 ; Malayalam # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D05..0D0C ; Malayalam # Lo [8] MALAYALAM LETTER A..MALAYALAM LETTER VOCALIC L
0D0E..0D10 ; Malayalam # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI
@@ -1093,7 +1122,7 @@
0D79 ; Malayalam # So MALAYALAM DATE MARK
0D7A..0D7F ; Malayalam # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K
-# Total code points: 98
+# Total code points: 99
# ================================================
@@ -1108,10 +1137,12 @@
0DD2..0DD4 ; Sinhala # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA
0DD6 ; Sinhala # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA
0DD8..0DDF ; Sinhala # Mc [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA
+0DE6..0DEF ; Sinhala # Nd [10] SINHALA LITH DIGIT ZERO..SINHALA LITH DIGIT NINE
0DF2..0DF3 ; Sinhala # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA
0DF4 ; Sinhala # Po SINHALA PUNCTUATION KUNDDALIYA
+111E1..111F4 ; Sinhala # No [20] SINHALA ARCHAIC DIGIT ONE..SINHALA ARCHAIC NUMBER ONE THOUSAND
-# Total code points: 80
+# Total code points: 110
# ================================================
@@ -1234,14 +1265,23 @@
109A..109C ; Myanmar # Mc [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A
109D ; Myanmar # Mn MYANMAR VOWEL SIGN AITON AI
109E..109F ; Myanmar # So [2] MYANMAR SYMBOL SHAN ONE..MYANMAR SYMBOL SHAN EXCLAMATION
+A9E0..A9E4 ; Myanmar # Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA
+A9E5 ; Myanmar # Mn MYANMAR SIGN SHAN SAW
+A9E6 ; Myanmar # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION
+A9E7..A9EF ; Myanmar # Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA
+A9F0..A9F9 ; Myanmar # Nd [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE
+A9FA..A9FE ; Myanmar # Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA
AA60..AA6F ; Myanmar # Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA
AA70 ; Myanmar # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION
AA71..AA76 ; Myanmar # Lo [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM
AA77..AA79 ; Myanmar # So [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO
AA7A ; Myanmar # Lo MYANMAR LETTER AITON RA
AA7B ; Myanmar # Mc MYANMAR SIGN PAO KAREN TONE
+AA7C ; Myanmar # Mn MYANMAR SIGN TAI LAING TONE-2
+AA7D ; Myanmar # Mc MYANMAR SIGN TAI LAING TONE-5
+AA7E..AA7F ; Myanmar # Lo [2] MYANMAR LETTER SHWE PALAUNG CHA..MYANMAR LETTER SHWE PALAUNG SHA
-# Total code points: 188
+# Total code points: 223
# ================================================
@@ -1345,8 +1385,9 @@
16A0..16EA ; Runic # Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X
16EE..16F0 ; Runic # Nl [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL
+16F1..16F8 ; Runic # Lo [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC
-# Total code points: 78
+# Total code points: 86
# ================================================
@@ -1377,7 +1418,7 @@
1806 ; Mongolian # Pd MONGOLIAN TODO SOFT HYPHEN
1807..180A ; Mongolian # Po [4] MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER..MONGOLIAN NIRUGU
180B..180D ; Mongolian # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
-180E ; Mongolian # Zs MONGOLIAN VOWEL SEPARATOR
+180E ; Mongolian # Cf MONGOLIAN VOWEL SEPARATOR
1810..1819 ; Mongolian # Nd [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE
1820..1842 ; Mongolian # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI
1843 ; Mongolian # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN
@@ -1452,10 +1493,10 @@
# ================================================
-10300..1031E ; Old_Italic # Lo [31] OLD ITALIC LETTER A..OLD ITALIC LETTER UU
+10300..1031F ; Old_Italic # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS
10320..10323 ; Old_Italic # No [4] OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL FIFTY
-# Total code points: 35
+# Total code points: 36
# ================================================
@@ -1479,12 +1520,15 @@
064B..0655 ; Inherited # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
0670 ; Inherited # Mn ARABIC LETTER SUPERSCRIPT ALEF
0951..0952 ; Inherited # Mn [2] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI STRESS SIGN ANUDATTA
+1AB0..1ABD ; Inherited # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
+1ABE ; Inherited # Me COMBINING PARENTHESES OVERLAY
1CD0..1CD2 ; Inherited # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA
1CD4..1CE0 ; Inherited # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
1CE2..1CE8 ; Inherited # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
1CED ; Inherited # Mn VEDIC SIGN TIRYAK
1CF4 ; Inherited # Mn VEDIC TONE CANDRA ABOVE
-1DC0..1DE6 ; Inherited # Mn [39] COMBINING DOTTED GRAVE ACCENT..COMBINING LATIN SMALL LETTER Z
+1CF8..1CF9 ; Inherited # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
+1DC0..1DF5 ; Inherited # Mn [54] COMBINING DOTTED GRAVE ACCENT..COMBINING UP TACK ABOVE
1DFC..1DFF ; Inherited # Mn [4] COMBINING DOUBLE INVERTED BREVE BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
200C..200D ; Inherited # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
20D0..20DC ; Inherited # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
@@ -1495,15 +1539,16 @@
302A..302D ; Inherited # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
3099..309A ; Inherited # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
FE00..FE0F ; Inherited # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16
-FE20..FE26 ; Inherited # Mn [7] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON
+FE20..FE2D ; Inherited # Mn [14] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON BELOW
101FD ; Inherited # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE
+102E0 ; Inherited # Mn COPTIC EPACT THOUSANDS MARK
1D167..1D169 ; Inherited # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
1D17B..1D182 ; Inherited # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE
1D185..1D18B ; Inherited # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
1D1AA..1D1AD ; Inherited # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO
E0100..E01EF ; Inherited # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 523
+# Total code points: 563
# ================================================
@@ -1537,7 +1582,7 @@
# ================================================
-1900..191C ; Limbu # Lo [29] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER HA
+1900..191E ; Limbu # Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA
1920..1922 ; Limbu # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
1923..1926 ; Limbu # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU
1927..1928 ; Limbu # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O
@@ -1550,7 +1595,7 @@
1944..1945 ; Limbu # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK
1946..194F ; Limbu # Nd [10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE
-# Total code points: 66
+# Total code points: 68
# ================================================
@@ -1612,7 +1657,8 @@
1A00..1A16 ; Buginese # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA
1A17..1A18 ; Buginese # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U
-1A19..1A1B ; Buginese # Mc [3] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN AE
+1A19..1A1A ; Buginese # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O
+1A1B ; Buginese # Mn BUGINESE VOWEL SIGN AE
1A1E..1A1F ; Buginese # Po [2] BUGINESE PALLAWA..BUGINESE END OF SECTION
# Total code points: 30
@@ -1724,11 +1770,11 @@
# ================================================
-12000..1236E ; Cuneiform # Lo [879] CUNEIFORM SIGN A..CUNEIFORM SIGN ZUM
-12400..12462 ; Cuneiform # Nl [99] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN OLD ASSYRIAN ONE QUARTER
-12470..12473 ; Cuneiform # Po [4] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON
+12000..12398 ; Cuneiform # Lo [921] CUNEIFORM SIGN A..CUNEIFORM SIGN UM TIMES ME
+12400..1246E ; Cuneiform # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
+12470..12474 ; Cuneiform # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON
-# Total code points: 982
+# Total code points: 1037
# ================================================
@@ -1767,8 +1813,7 @@
1BA6..1BA7 ; Sundanese # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG
1BA8..1BA9 ; Sundanese # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG
1BAA ; Sundanese # Mc SUNDANESE SIGN PAMAAEH
-1BAB ; Sundanese # Mn SUNDANESE SIGN VIRAMA
-1BAC..1BAD ; Sundanese # Mc [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA
+1BAB..1BAD ; Sundanese # Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA
1BAE..1BAF ; Sundanese # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA
1BB0..1BB9 ; Sundanese # Nd [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE
1BBA..1BBF ; Sundanese # Lo [6] SUNDANESE AVAGRAHA..SUNDANESE LETTER FINAL M
@@ -1825,9 +1870,9 @@
A900..A909 ; Kayah_Li # Nd [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE
A90A..A925 ; Kayah_Li # Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO
A926..A92D ; Kayah_Li # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU
-A92E..A92F ; Kayah_Li # Po [2] KAYAH LI SIGN CWI..KAYAH LI SIGN SHYA
+A92F ; Kayah_Li # Po KAYAH LI SIGN SHYA
-# Total code points: 48
+# Total code points: 47
# ================================================
@@ -1974,11 +2019,10 @@
A9BC ; Javanese # Mn JAVANESE VOWEL SIGN PEPET
A9BD..A9C0 ; Javanese # Mc [4] JAVANESE CONSONANT SIGN KERET..JAVANESE PANGKON
A9C1..A9CD ; Javanese # Po [13] JAVANESE LEFT RERENGGAN..JAVANESE TURNED PADA PISELEH
-A9CF ; Javanese # Lm JAVANESE PANGRANGKEP
A9D0..A9D9 ; Javanese # Nd [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE
A9DE..A9DF ; Javanese # Po [2] JAVANESE PADA TIRTA TUMETES..JAVANESE PADA ISEN-ISEN
-# Total code points: 91
+# Total code points: 90
# ================================================
@@ -2080,8 +2124,9 @@
11047..1104D ; Brahmi # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS
11052..11065 ; Brahmi # No [20] BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND
11066..1106F ; Brahmi # Nd [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE
+1107F ; Brahmi # Mn BRAHMI NUMBER JOINER
-# Total code points: 108
+# Total code points: 109
# ================================================
@@ -2136,9 +2181,11 @@
111BF..111C0 ; Sharada # Mc [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA
111C1..111C4 ; Sharada # Lo [4] SHARADA SIGN AVAGRAHA..SHARADA OM
111C5..111C8 ; Sharada # Po [4] SHARADA DANDA..SHARADA SEPARATOR
+111CD ; Sharada # Po SHARADA SUTRA MARK
111D0..111D9 ; Sharada # Nd [10] SHARADA DIGIT ZERO..SHARADA DIGIT NINE
+111DA ; Sharada # Lo SHARADA EKAM
-# Total code points: 83
+# Total code points: 85
# ================================================
@@ -2161,4 +2208,244 @@
# Total code points: 66
+# ================================================
+
+10530..10563 ; Caucasian_Albanian # Lo [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW
+1056F ; Caucasian_Albanian # Po CAUCASIAN ALBANIAN CITATION MARK
+
+# Total code points: 53
+
+# ================================================
+
+16AD0..16AED ; Bassa_Vah # Lo [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I
+16AF0..16AF4 ; Bassa_Vah # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
+16AF5 ; Bassa_Vah # Po BASSA VAH FULL STOP
+
+# Total code points: 36
+
+# ================================================
+
+1BC00..1BC6A ; Duployan # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
+1BC70..1BC7C ; Duployan # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK
+1BC80..1BC88 ; Duployan # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL
+1BC90..1BC99 ; Duployan # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW
+1BC9C ; Duployan # So DUPLOYAN SIGN O WITH CROSS
+1BC9D..1BC9E ; Duployan # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK
+1BC9F ; Duployan # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
+
+# Total code points: 143
+
+# ================================================
+
+10500..10527 ; Elbasan # Lo [40] ELBASAN LETTER A..ELBASAN LETTER KHE
+
+# Total code points: 40
+
+# ================================================
+
+11301 ; Grantha # Mn GRANTHA SIGN CANDRABINDU
+11302..11303 ; Grantha # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA
+11305..1130C ; Grantha # Lo [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L
+1130F..11310 ; Grantha # Lo [2] GRANTHA LETTER EE..GRANTHA LETTER AI
+11313..11328 ; Grantha # Lo [22] GRANTHA LETTER OO..GRANTHA LETTER NA
+1132A..11330 ; Grantha # Lo [7] GRANTHA LETTER PA..GRANTHA LETTER RA
+11332..11333 ; Grantha # Lo [2] GRANTHA LETTER LA..GRANTHA LETTER LLA
+11335..11339 ; Grantha # Lo [5] GRANTHA LETTER VA..GRANTHA LETTER HA
+1133C ; Grantha # Mn GRANTHA SIGN NUKTA
+1133D ; Grantha # Lo GRANTHA SIGN AVAGRAHA
+1133E..1133F ; Grantha # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I
+11340 ; Grantha # Mn GRANTHA VOWEL SIGN II
+11341..11344 ; Grantha # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR
+11347..11348 ; Grantha # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI
+1134B..1134D ; Grantha # Mc [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA
+11357 ; Grantha # Mc GRANTHA AU LENGTH MARK
+1135D..11361 ; Grantha # Lo [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL
+11362..11363 ; Grantha # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL
+11366..1136C ; Grantha # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX
+11370..11374 ; Grantha # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA
+
+# Total code points: 83
+
+# ================================================
+
+16B00..16B2F ; Pahawh_Hmong # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU
+16B30..16B36 ; Pahawh_Hmong # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
+16B37..16B3B ; Pahawh_Hmong # Po [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM
+16B3C..16B3F ; Pahawh_Hmong # So [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB
+16B40..16B43 ; Pahawh_Hmong # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM
+16B44 ; Pahawh_Hmong # Po PAHAWH HMONG SIGN XAUS
+16B45 ; Pahawh_Hmong # So PAHAWH HMONG SIGN CIM TSOV ROG
+16B50..16B59 ; Pahawh_Hmong # Nd [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE
+16B5B..16B61 ; Pahawh_Hmong # No [7] PAHAWH HMONG NUMBER TENS..PAHAWH HMONG NUMBER TRILLIONS
+16B63..16B77 ; Pahawh_Hmong # Lo [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS
+16B7D..16B8F ; Pahawh_Hmong # Lo [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ
+
+# Total code points: 127
+
+# ================================================
+
+11200..11211 ; Khojki # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA
+11213..1122B ; Khojki # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA
+1122C..1122E ; Khojki # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II
+1122F..11231 ; Khojki # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI
+11232..11233 ; Khojki # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU
+11234 ; Khojki # Mn KHOJKI SIGN ANUSVARA
+11235 ; Khojki # Mc KHOJKI SIGN VIRAMA
+11236..11237 ; Khojki # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
+11238..1123D ; Khojki # Po [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN
+
+# Total code points: 61
+
+# ================================================
+
+10600..10736 ; Linear_A # Lo [311] LINEAR A SIGN AB001..LINEAR A SIGN A664
+10740..10755 ; Linear_A # Lo [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE
+10760..10767 ; Linear_A # Lo [8] LINEAR A SIGN A800..LINEAR A SIGN A807
+
+# Total code points: 341
+
+# ================================================
+
+11150..11172 ; Mahajani # Lo [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA
+11173 ; Mahajani # Mn MAHAJANI SIGN NUKTA
+11174..11175 ; Mahajani # Po [2] MAHAJANI ABBREVIATION SIGN..MAHAJANI SECTION MARK
+11176 ; Mahajani # Lo MAHAJANI LIGATURE SHRI
+
+# Total code points: 39
+
+# ================================================
+
+10AC0..10AC7 ; Manichaean # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW
+10AC8 ; Manichaean # So MANICHAEAN SIGN UD
+10AC9..10AE4 ; Manichaean # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW
+10AE5..10AE6 ; Manichaean # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
+10AEB..10AEF ; Manichaean # No [5] MANICHAEAN NUMBER ONE..MANICHAEAN NUMBER ONE HUNDRED
+10AF0..10AF6 ; Manichaean # Po [7] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION LINE FILLER
+
+# Total code points: 51
+
+# ================================================
+
+1E800..1E8C4 ; Mende_Kikakui # Lo [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON
+1E8C7..1E8CF ; Mende_Kikakui # No [9] MENDE KIKAKUI DIGIT ONE..MENDE KIKAKUI DIGIT NINE
+1E8D0..1E8D6 ; Mende_Kikakui # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
+
+# Total code points: 213
+
+# ================================================
+
+11600..1162F ; Modi # Lo [48] MODI LETTER A..MODI LETTER LLA
+11630..11632 ; Modi # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II
+11633..1163A ; Modi # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI
+1163B..1163C ; Modi # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU
+1163D ; Modi # Mn MODI SIGN ANUSVARA
+1163E ; Modi # Mc MODI SIGN VISARGA
+1163F..11640 ; Modi # Mn [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA
+11641..11643 ; Modi # Po [3] MODI DANDA..MODI ABBREVIATION SIGN
+11644 ; Modi # Lo MODI SIGN HUVA
+11650..11659 ; Modi # Nd [10] MODI DIGIT ZERO..MODI DIGIT NINE
+
+# Total code points: 79
+
+# ================================================
+
+16A40..16A5E ; Mro # Lo [31] MRO LETTER TA..MRO LETTER TEK
+16A60..16A69 ; Mro # Nd [10] MRO DIGIT ZERO..MRO DIGIT NINE
+16A6E..16A6F ; Mro # Po [2] MRO DANDA..MRO DOUBLE DANDA
+
+# Total code points: 43
+
+# ================================================
+
+10A80..10A9C ; Old_North_Arabian # Lo [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH
+10A9D..10A9F ; Old_North_Arabian # No [3] OLD NORTH ARABIAN NUMBER ONE..OLD NORTH ARABIAN NUMBER TWENTY
+
+# Total code points: 32
+
+# ================================================
+
+10880..1089E ; Nabataean # Lo [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW
+108A7..108AF ; Nabataean # No [9] NABATAEAN NUMBER ONE..NABATAEAN NUMBER ONE HUNDRED
+
+# Total code points: 40
+
+# ================================================
+
+10860..10876 ; Palmyrene # Lo [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW
+10877..10878 ; Palmyrene # So [2] PALMYRENE LEFT-POINTING FLEURON..PALMYRENE RIGHT-POINTING FLEURON
+10879..1087F ; Palmyrene # No [7] PALMYRENE NUMBER ONE..PALMYRENE NUMBER TWENTY
+
+# Total code points: 32
+
+# ================================================
+
+11AC0..11AF8 ; Pau_Cin_Hau # Lo [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL
+
+# Total code points: 57
+
+# ================================================
+
+10350..10375 ; Old_Permic # Lo [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA
+10376..1037A ; Old_Permic # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII
+
+# Total code points: 43
+
+# ================================================
+
+10B80..10B91 ; Psalter_Pahlavi # Lo [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW
+10B99..10B9C ; Psalter_Pahlavi # Po [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT
+10BA9..10BAF ; Psalter_Pahlavi # No [7] PSALTER PAHLAVI NUMBER ONE..PSALTER PAHLAVI NUMBER ONE HUNDRED
+
+# Total code points: 29
+
+# ================================================
+
+11580..115AE ; Siddham # Lo [47] SIDDHAM LETTER A..SIDDHAM LETTER HA
+115AF..115B1 ; Siddham # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II
+115B2..115B5 ; Siddham # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR
+115B8..115BB ; Siddham # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU
+115BC..115BD ; Siddham # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA
+115BE ; Siddham # Mc SIDDHAM SIGN VISARGA
+115BF..115C0 ; Siddham # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA
+115C1..115C9 ; Siddham # Po [9] SIDDHAM SIGN SIDDHAM..SIDDHAM END OF TEXT MARK
+
+# Total code points: 72
+
+# ================================================
+
+112B0..112DE ; Khudawadi # Lo [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA
+112DF ; Khudawadi # Mn KHUDAWADI SIGN ANUSVARA
+112E0..112E2 ; Khudawadi # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
+112E3..112EA ; Khudawadi # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
+112F0..112F9 ; Khudawadi # Nd [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE
+
+# Total code points: 69
+
+# ================================================
+
+11480..114AF ; Tirhuta # Lo [48] TIRHUTA ANJI..TIRHUTA LETTER HA
+114B0..114B2 ; Tirhuta # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II
+114B3..114B8 ; Tirhuta # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL
+114B9 ; Tirhuta # Mc TIRHUTA VOWEL SIGN E
+114BA ; Tirhuta # Mn TIRHUTA VOWEL SIGN SHORT E
+114BB..114BE ; Tirhuta # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU
+114BF..114C0 ; Tirhuta # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA
+114C1 ; Tirhuta # Mc TIRHUTA SIGN VISARGA
+114C2..114C3 ; Tirhuta # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA
+114C4..114C5 ; Tirhuta # Lo [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG
+114C6 ; Tirhuta # Po TIRHUTA ABBREVIATION SIGN
+114C7 ; Tirhuta # Lo TIRHUTA OM
+114D0..114D9 ; Tirhuta # Nd [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE
+
+# Total code points: 82
+
+# ================================================
+
+118A0..118DF ; Warang_Citi # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO
+118E0..118E9 ; Warang_Citi # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE
+118EA..118F2 ; Warang_Citi # No [9] WARANG CITI NUMBER TEN..WARANG CITI NUMBER NINETY
+118FF ; Warang_Citi # Lo WARANG CITI OM
+
+# Total code points: 84
+
# EOF
--- a/jdk/make/data/unicodedata/SpecialCasing.txt Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/make/data/unicodedata/SpecialCasing.txt Wed Jul 15 11:05:51 2015 +0900
@@ -1,18 +1,25 @@
-# SpecialCasing-6.2.0.txt
-# Date: 2012-05-23, 20:35:15 GMT [MD]
+# SpecialCasing-7.0.0.txt
+# Date: 2014-03-18, 07:18:02 GMT [MD]
#
# Unicode Character Database
-# Copyright (c) 1991-2012 Unicode, Inc.
+# Copyright (c) 1991-2014 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
-# Special Casing Properties
+# Special Casing
#
-# This file is a supplement to the UnicodeData file.
-# It contains additional information about the casing of Unicode characters.
-# (For compatibility, the UnicodeData.txt file only contains case mappings for
-# characters where they are 1-1, and independent of context and language.
-# For more information, see the discussion of Case Mappings in the Unicode Standard.
+# This file is a supplement to the UnicodeData.txt file. It does not define any
+# properties, but rather provides additional information about the casing of
+# Unicode characters, for situations when casing incurs a change in string length
+# or is dependent on context or locale. For compatibility, the UnicodeData.txt
+# file only contains simple case mappings for characters where they are one-to-one
+# and independent of context and language. The data in this file, combined with
+# the simple case mappings in UnicodeData.txt, defines the full case mappings
+# Lowercase_Mapping (lc), Titlecase_Mapping (tc), and Uppercase_Mapping (uc).
+#
+# Note that the preferred mechanism for defining tailored casing operations is
+# the Unicode Common Locale Data Repository (CLDR). For more information, see the
+# discussion of case mappings and case algorithms in the Unicode Standard.
#
# All code points not listed in this file that do not have a simple case mappings
# in UnicodeData.txt map to themselves.
@@ -21,16 +28,17 @@
# ================================================================================
# The entries in this file are in the following machine-readable format:
#
-# <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? # <comment>
+# <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
#
-# <code>, <lower>, <title>, and <upper> provide character values in hex. If there is more
-# than one character, they are separated by spaces. Other than as used to separate
-# elements, spaces are to be ignored.
+# <code>, <lower>, <title>, and <upper> provide the respective full case mappings
+# of <code>, expressed as character values in hex. If there is more than one character,
+# they are separated by spaces. Other than as used to separate elements, spaces are
+# to be ignored.
#
# The <condition_list> is optional. Where present, it consists of one or more language IDs
-# or contexts, separated by spaces. In these conditions:
+# or casing contexts, separated by spaces. In these conditions:
# - A condition list overrides the normal behavior if all of the listed conditions are true.
-# - The context is always the context of the characters in the original string,
+# - The casing context is always the context of the characters in the original string,
# NOT in the resulting string.
# - Case distinctions in the condition list are not significant.
# - Conditions preceded by "Not_" represent the negation of the condition.
@@ -38,18 +46,14 @@
#
# A language ID is defined by BCP 47, with '-' and '_' treated equivalently.
#
-# A context for a character C is defined by Section 3.13 Default Case
-# Operations, of The Unicode Standard, Version 5.0.
-# (This is identical to the context defined by Unicode 4.1.0,
-# as specified in http://www.unicode.org/versions/Unicode4.1.0/)
+# A casing context for a character is defined by Section 3.13 Default Case Algorithms
+# of The Unicode Standard.
#
# Parsers of this file must be prepared to deal with future additions to this format:
# * Additional contexts
# * Additional fields
# ================================================================================
-# @missing: 0000..10FFFF; <slc>; <stc>; <suc>;
-
# ================================================================================
# Unconditional mappings
# ================================================================================
@@ -114,7 +118,7 @@
# This process can be achieved by first transforming the text to NFC before casing.
# E.g. <alpha><iota_subscript><acute> is uppercased to <ALPHA><acute><IOTA>
-# The following cases are already in the UnicodeData file, so are only commented here.
+# The following cases are already in the UnicodeData.txt file, so are only commented here.
# 0345; 0345; 0345; 0399; # COMBINING GREEK YPOGEGRAMMENI
@@ -205,7 +209,7 @@
03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
-# Note: the following cases for non-final are already in the UnicodeData file.
+# Note: the following cases for non-final are already in the UnicodeData.txt file.
# 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA
# 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA
@@ -268,7 +272,7 @@
0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
-# Note: the following case is already in the UnicodeData file.
+# Note: the following case is already in the UnicodeData.txt file.
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
--- a/jdk/make/data/unicodedata/UnicodeData.txt Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/make/data/unicodedata/UnicodeData.txt Wed Jul 15 11:05:51 2015 +0900
@@ -602,12 +602,12 @@
0259;LATIN SMALL LETTER SCHWA;Ll;0;L;;;;;N;;;018F;;018F
025A;LATIN SMALL LETTER SCHWA WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER SCHWA HOOK;;;;
025B;LATIN SMALL LETTER OPEN E;Ll;0;L;;;;;N;LATIN SMALL LETTER EPSILON;;0190;;0190
-025C;LATIN SMALL LETTER REVERSED OPEN E;Ll;0;L;;;;;N;LATIN SMALL LETTER REVERSED EPSILON;;;;
+025C;LATIN SMALL LETTER REVERSED OPEN E;Ll;0;L;;;;;N;LATIN SMALL LETTER REVERSED EPSILON;;A7AB;;A7AB
025D;LATIN SMALL LETTER REVERSED OPEN E WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER REVERSED EPSILON HOOK;;;;
025E;LATIN SMALL LETTER CLOSED REVERSED OPEN E;Ll;0;L;;;;;N;LATIN SMALL LETTER CLOSED REVERSED EPSILON;;;;
025F;LATIN SMALL LETTER DOTLESS J WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER DOTLESS J BAR;;;;
0260;LATIN SMALL LETTER G WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER G HOOK;;0193;;0193
-0261;LATIN SMALL LETTER SCRIPT G;Ll;0;L;;;;;N;;;;;
+0261;LATIN SMALL LETTER SCRIPT G;Ll;0;L;;;;;N;;;A7AC;;A7AC
0262;LATIN LETTER SMALL CAPITAL G;Ll;0;L;;;;;N;;;;;
0263;LATIN SMALL LETTER GAMMA;Ll;0;L;;;;;N;;;0194;;0194
0264;LATIN SMALL LETTER RAMS HORN;Ll;0;L;;;;;N;LATIN SMALL LETTER BABY GAMMA;;;;
@@ -618,7 +618,7 @@
0269;LATIN SMALL LETTER IOTA;Ll;0;L;;;;;N;;;0196;;0196
026A;LATIN LETTER SMALL CAPITAL I;Ll;0;L;;;;;N;;;;;
026B;LATIN SMALL LETTER L WITH MIDDLE TILDE;Ll;0;L;;;;;N;;;2C62;;2C62
-026C;LATIN SMALL LETTER L WITH BELT;Ll;0;L;;;;;N;LATIN SMALL LETTER L BELT;;;;
+026C;LATIN SMALL LETTER L WITH BELT;Ll;0;L;;;;;N;LATIN SMALL LETTER L BELT;;A7AD;;A7AD
026D;LATIN SMALL LETTER L WITH RETROFLEX HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER L RETROFLEX HOOK;;;;
026E;LATIN SMALL LETTER LEZH;Ll;0;L;;;;;N;LATIN SMALL LETTER L YOGH;;;;
026F;LATIN SMALL LETTER TURNED M;Ll;0;L;;;;;N;;;019C;;019C
@@ -645,7 +645,7 @@
0284;LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER DOTLESS J BAR HOOK;;;;
0285;LATIN SMALL LETTER SQUAT REVERSED ESH;Ll;0;L;;;;;N;;;;;
0286;LATIN SMALL LETTER ESH WITH CURL;Ll;0;L;;;;;N;LATIN SMALL LETTER ESH CURL;;;;
-0287;LATIN SMALL LETTER TURNED T;Ll;0;L;;;;;N;;;;;
+0287;LATIN SMALL LETTER TURNED T;Ll;0;L;;;;;N;;;A7B1;;A7B1
0288;LATIN SMALL LETTER T WITH RETROFLEX HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER T RETROFLEX HOOK;;01AE;;01AE
0289;LATIN SMALL LETTER U BAR;Ll;0;L;;;;;N;;;0244;;0244
028A;LATIN SMALL LETTER UPSILON;Ll;0;L;;;;;N;;;01B1;;01B1
@@ -668,7 +668,7 @@
029B;LATIN LETTER SMALL CAPITAL G WITH HOOK;Ll;0;L;;;;;N;LATIN LETTER SMALL CAPITAL G HOOK;;;;
029C;LATIN LETTER SMALL CAPITAL H;Ll;0;L;;;;;N;;;;;
029D;LATIN SMALL LETTER J WITH CROSSED-TAIL;Ll;0;L;;;;;N;LATIN SMALL LETTER CROSSED-TAIL J;;;;
-029E;LATIN SMALL LETTER TURNED K;Ll;0;L;;;;;N;;;;;
+029E;LATIN SMALL LETTER TURNED K;Ll;0;L;;;;;N;;;A7B0;;A7B0
029F;LATIN LETTER SMALL CAPITAL L;Ll;0;L;;;;;N;;;;;
02A0;LATIN SMALL LETTER Q WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER Q HOOK;;;;
02A1;LATIN LETTER GLOTTAL STOP WITH STROKE;Ll;0;L;;;;;N;LATIN LETTER GLOTTAL STOP BAR;;;;
@@ -891,6 +891,7 @@
037C;GREEK SMALL DOTTED LUNATE SIGMA SYMBOL;Ll;0;L;;;;;N;;;03FE;;03FE
037D;GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL;Ll;0;L;;;;;N;;;03FF;;03FF
037E;GREEK QUESTION MARK;Po;0;ON;003B;;;;N;;;;;
+037F;GREEK CAPITAL LETTER YOT;Lu;0;L;;;;;N;;;;03F3;
0384;GREEK TONOS;Sk;0;ON;<compat> 0020 0301;;;;N;GREEK SPACING TONOS;;;;
0385;GREEK DIALYTIKA TONOS;Sk;0;ON;00A8 0301;;;;N;GREEK SPACING DIAERESIS TONOS;;;;
0386;GREEK CAPITAL LETTER ALPHA WITH TONOS;Lu;0;L;0391 0301;;;;N;GREEK CAPITAL LETTER ALPHA TONOS;;;03AC;
@@ -999,7 +1000,7 @@
03F0;GREEK KAPPA SYMBOL;Ll;0;L;<compat> 03BA;;;;N;GREEK SMALL LETTER SCRIPT KAPPA;;039A;;039A
03F1;GREEK RHO SYMBOL;Ll;0;L;<compat> 03C1;;;;N;GREEK SMALL LETTER TAILED RHO;;03A1;;03A1
03F2;GREEK LUNATE SIGMA SYMBOL;Ll;0;L;<compat> 03C2;;;;N;GREEK SMALL LETTER LUNATE SIGMA;;03F9;;03F9
-03F3;GREEK LETTER YOT;Ll;0;L;;;;;N;;;;;
+03F3;GREEK LETTER YOT;Ll;0;L;;;;;N;;;037F;;037F
03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8;
03F5;GREEK LUNATE EPSILON SYMBOL;Ll;0;L;<compat> 03B5;;;;N;;;0395;;0395
03F6;GREEK REVERSED LUNATE EPSILON SYMBOL;Sm;0;ON;;;;;N;;;;;
@@ -1308,6 +1309,14 @@
0525;CYRILLIC SMALL LETTER PE WITH DESCENDER;Ll;0;L;;;;;N;;;0524;;0524
0526;CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER;Lu;0;L;;;;;N;;;;0527;
0527;CYRILLIC SMALL LETTER SHHA WITH DESCENDER;Ll;0;L;;;;;N;;;0526;;0526
+0528;CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK;Lu;0;L;;;;;N;;;;0529;
+0529;CYRILLIC SMALL LETTER EN WITH LEFT HOOK;Ll;0;L;;;;;N;;;0528;;0528
+052A;CYRILLIC CAPITAL LETTER DZZHE;Lu;0;L;;;;;N;;;;052B;
+052B;CYRILLIC SMALL LETTER DZZHE;Ll;0;L;;;;;N;;;052A;;052A
+052C;CYRILLIC CAPITAL LETTER DCHE;Lu;0;L;;;;;N;;;;052D;
+052D;CYRILLIC SMALL LETTER DCHE;Ll;0;L;;;;;N;;;052C;;052C
+052E;CYRILLIC CAPITAL LETTER EL WITH DESCENDER;Lu;0;L;;;;;N;;;;052F;
+052F;CYRILLIC SMALL LETTER EL WITH DESCENDER;Ll;0;L;;;;;N;;;052E;;052E
0531;ARMENIAN CAPITAL LETTER AYB;Lu;0;L;;;;;N;;;;0561;
0532;ARMENIAN CAPITAL LETTER BEN;Lu;0;L;;;;;N;;;;0562;
0533;ARMENIAN CAPITAL LETTER GIM;Lu;0;L;;;;;N;;;;0563;
@@ -1394,6 +1403,8 @@
0587;ARMENIAN SMALL LIGATURE ECH YIWN;Ll;0;L;<compat> 0565 0582;;;;N;;;;;
0589;ARMENIAN FULL STOP;Po;0;L;;;;;N;ARMENIAN PERIOD;;;;
058A;ARMENIAN HYPHEN;Pd;0;ON;;;;;N;;;;;
+058D;RIGHT-FACING ARMENIAN ETERNITY SIGN;So;0;ON;;;;;N;;;;;
+058E;LEFT-FACING ARMENIAN ETERNITY SIGN;So;0;ON;;;;;N;;;;;
058F;ARMENIAN DRAM SIGN;Sc;0;ET;;;;;N;;;;;
0591;HEBREW ACCENT ETNAHTA;Mn;220;NSM;;;;;N;;;;;
0592;HEBREW ACCENT SEGOL;Mn;230;NSM;;;;;N;;;;;
@@ -1487,6 +1498,7 @@
0602;ARABIC FOOTNOTE MARKER;Cf;0;AN;;;;;N;;;;;
0603;ARABIC SIGN SAFHA;Cf;0;AN;;;;;N;;;;;
0604;ARABIC SIGN SAMVAT;Cf;0;AN;;;;;N;;;;;
+0605;ARABIC NUMBER MARK ABOVE;Cf;0;AN;;;;;N;;;;;
0606;ARABIC-INDIC CUBE ROOT;Sm;0;ON;;;;;N;;;;;
0607;ARABIC-INDIC FOURTH ROOT;Sm;0;ON;;;;;N;;;;;
0608;ARABIC RAY;Sm;0;AL;;;;;N;;;;;
@@ -1509,6 +1521,7 @@
0619;ARABIC SMALL DAMMA;Mn;31;NSM;;;;;N;;;;;
061A;ARABIC SMALL KASRA;Mn;32;NSM;;;;;N;;;;;
061B;ARABIC SEMICOLON;Po;0;AL;;;;;N;;;;;
+061C;ARABIC LETTER MARK;Cf;0;AL;;;;;N;;;;;
061E;ARABIC TRIPLE DOT PUNCTUATION MARK;Po;0;AL;;;;;N;;;;;
061F;ARABIC QUESTION MARK;Po;0;AL;;;;;N;;;;;
0620;ARABIC LETTER KASHMIRI YEH;Lo;0;AL;;;;;N;;;;;
@@ -2060,6 +2073,7 @@
085B;MANDAIC GEMINATION MARK;Mn;220;NSM;;;;;N;;;;;
085E;MANDAIC PUNCTUATION;Po;0;R;;;;;N;;;;;
08A0;ARABIC LETTER BEH WITH SMALL V BELOW;Lo;0;AL;;;;;N;;;;;
+08A1;ARABIC LETTER BEH WITH HAMZA ABOVE;Lo;0;AL;;;;;N;;;;;
08A2;ARABIC LETTER JEEM WITH TWO DOTS ABOVE;Lo;0;AL;;;;;N;;;;;
08A3;ARABIC LETTER TAH WITH TWO DOTS ABOVE;Lo;0;AL;;;;;N;;;;;
08A4;ARABIC LETTER FEH WITH DOT BELOW AND THREE DOTS ABOVE;Lo;0;AL;;;;;N;;;;;
@@ -2071,6 +2085,12 @@
08AA;ARABIC LETTER REH WITH LOOP;Lo;0;AL;;;;;N;;;;;
08AB;ARABIC LETTER WAW WITH DOT WITHIN;Lo;0;AL;;;;;N;;;;;
08AC;ARABIC LETTER ROHINGYA YEH;Lo;0;AL;;;;;N;;;;;
+08AD;ARABIC LETTER LOW ALEF;Lo;0;AL;;;;;N;;;;;
+08AE;ARABIC LETTER DAL WITH THREE DOTS BELOW;Lo;0;AL;;;;;N;;;;;
+08AF;ARABIC LETTER SAD WITH THREE DOTS BELOW;Lo;0;AL;;;;;N;;;;;
+08B0;ARABIC LETTER GAF WITH INVERTED STROKE;Lo;0;AL;;;;;N;;;;;
+08B1;ARABIC LETTER STRAIGHT WAW;Lo;0;AL;;;;;N;;;;;
+08B2;ARABIC LETTER ZAIN WITH INVERTED V ABOVE;Lo;0;AL;;;;;N;;;;;
08E4;ARABIC CURLY FATHA;Mn;230;NSM;;;;;N;;;;;
08E5;ARABIC CURLY DAMMA;Mn;230;NSM;;;;;N;;;;;
08E6;ARABIC CURLY KASRA;Mn;220;NSM;;;;;N;;;;;
@@ -2098,6 +2118,7 @@
08FC;ARABIC DOUBLE RIGHT ARROWHEAD ABOVE WITH DOT;Mn;230;NSM;;;;;N;;;;;
08FD;ARABIC RIGHT ARROWHEAD ABOVE WITH DOT;Mn;230;NSM;;;;;N;;;;;
08FE;ARABIC DAMMA WITH DOT;Mn;230;NSM;;;;;N;;;;;
+08FF;ARABIC MARK SIDEWAYS NOON GHUNNA;Mn;230;NSM;;;;;N;;;;;
0900;DEVANAGARI SIGN INVERTED CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
0901;DEVANAGARI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
0902;DEVANAGARI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;
@@ -2218,6 +2239,7 @@
0975;DEVANAGARI LETTER AW;Lo;0;L;;;;;N;;;;;
0976;DEVANAGARI LETTER UE;Lo;0;L;;;;;N;;;;;
0977;DEVANAGARI LETTER UUE;Lo;0;L;;;;;N;;;;;
+0978;DEVANAGARI LETTER MARWARI DDA;Lo;0;L;;;;;N;;;;;
0979;DEVANAGARI LETTER ZHA;Lo;0;L;;;;;N;;;;;
097A;DEVANAGARI LETTER HEAVY YA;Lo;0;L;;;;;N;;;;;
097B;DEVANAGARI LETTER GGA;Lo;0;L;;;;;N;;;;;
@@ -2225,6 +2247,7 @@
097D;DEVANAGARI LETTER GLOTTAL STOP;Lo;0;L;;;;;N;;;;;
097E;DEVANAGARI LETTER DDDA;Lo;0;L;;;;;N;;;;;
097F;DEVANAGARI LETTER BBA;Lo;0;L;;;;;N;;;;;
+0980;BENGALI ANJI;Lo;0;L;;;;;N;;;;;
0981;BENGALI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
0982;BENGALI SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;
0983;BENGALI SIGN VISARGA;Mc;0;L;;;;;N;;;;;
@@ -2642,6 +2665,7 @@
0BF8;TAMIL AS ABOVE SIGN;So;0;ON;;;;;N;;;;;
0BF9;TAMIL RUPEE SIGN;Sc;0;ET;;;;;N;;;;;
0BFA;TAMIL NUMBER SIGN;So;0;ON;;;;;N;;;;;
+0C00;TELUGU SIGN COMBINING CANDRABINDU ABOVE;Mn;0;NSM;;;;;N;;;;;
0C01;TELUGU SIGN CANDRABINDU;Mc;0;L;;;;;N;;;;;
0C02;TELUGU SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;
0C03;TELUGU SIGN VISARGA;Mc;0;L;;;;;N;;;;;
@@ -2689,6 +2713,7 @@
0C31;TELUGU LETTER RRA;Lo;0;L;;;;;N;;;;;
0C32;TELUGU LETTER LA;Lo;0;L;;;;;N;;;;;
0C33;TELUGU LETTER LLA;Lo;0;L;;;;;N;;;;;
+0C34;TELUGU LETTER LLLA;Lo;0;L;;;;;N;;;;;
0C35;TELUGU LETTER VA;Lo;0;L;;;;;N;;;;;
0C36;TELUGU LETTER SHA;Lo;0;L;;;;;N;;;;;
0C37;TELUGU LETTER SSA;Lo;0;L;;;;;N;;;;;
@@ -2735,6 +2760,7 @@
0C7D;TELUGU FRACTION DIGIT TWO FOR EVEN POWERS OF FOUR;No;0;ON;;;;2;N;;;;;
0C7E;TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR;No;0;ON;;;;3;N;;;;;
0C7F;TELUGU SIGN TUUMU;So;0;L;;;;;N;;;;;
+0C81;KANNADA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
0C82;KANNADA SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;
0C83;KANNADA SIGN VISARGA;Mc;0;L;;;;;N;;;;;
0C85;KANNADA LETTER A;Lo;0;L;;;;;N;;;;;
@@ -2821,6 +2847,7 @@
0CEF;KANNADA DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
0CF1;KANNADA SIGN JIHVAMULIYA;Lo;0;L;;;;;N;;;;;
0CF2;KANNADA SIGN UPADHMANIYA;Lo;0;L;;;;;N;;;;;
+0D01;MALAYALAM SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
0D02;MALAYALAM SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;
0D03;MALAYALAM SIGN VISARGA;Mc;0;L;;;;;N;;;;;
0D05;MALAYALAM LETTER A;Lo;0;L;;;;;N;;;;;
@@ -2996,6 +3023,16 @@
0DDD;SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA;Mc;0;L;0DDC 0DCA;;;;N;;;;;
0DDE;SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA;Mc;0;L;0DD9 0DDF;;;;N;;;;;
0DDF;SINHALA VOWEL SIGN GAYANUKITTA;Mc;0;L;;;;;N;;;;;
+0DE6;SINHALA LITH DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
+0DE7;SINHALA LITH DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
+0DE8;SINHALA LITH DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
+0DE9;SINHALA LITH DIGIT THREE;Nd;0;L;;3;3;3;N;;;;;
+0DEA;SINHALA LITH DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;;
+0DEB;SINHALA LITH DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;;
+0DEC;SINHALA LITH DIGIT SIX;Nd;0;L;;6;6;6;N;;;;;
+0DED;SINHALA LITH DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
+0DEE;SINHALA LITH DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
+0DEF;SINHALA LITH DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
0DF2;SINHALA VOWEL SIGN DIGA GAETTA-PILLA;Mc;0;L;;;;;N;;;;;
0DF3;SINHALA VOWEL SIGN DIGA GAYANUKITTA;Mc;0;L;;;;;N;;;;;
0DF4;SINHALA PUNCTUATION KUNDDALIYA;Po;0;L;;;;;N;;;;;
@@ -5087,6 +5124,14 @@
16EE;RUNIC ARLAUG SYMBOL;Nl;0;L;;;;17;N;;;;;
16EF;RUNIC TVIMADUR SYMBOL;Nl;0;L;;;;18;N;;;;;
16F0;RUNIC BELGTHOR SYMBOL;Nl;0;L;;;;19;N;;;;;
+16F1;RUNIC LETTER K;Lo;0;L;;;;;N;;;;;
+16F2;RUNIC LETTER SH;Lo;0;L;;;;;N;;;;;
+16F3;RUNIC LETTER OO;Lo;0;L;;;;;N;;;;;
+16F4;RUNIC LETTER FRANKS CASKET OS;Lo;0;L;;;;;N;;;;;
+16F5;RUNIC LETTER FRANKS CASKET IS;Lo;0;L;;;;;N;;;;;
+16F6;RUNIC LETTER FRANKS CASKET EH;Lo;0;L;;;;;N;;;;;
+16F7;RUNIC LETTER FRANKS CASKET AC;Lo;0;L;;;;;N;;;;;
+16F8;RUNIC LETTER FRANKS CASKET AESC;Lo;0;L;;;;;N;;;;;
1700;TAGALOG LETTER A;Lo;0;L;;;;;N;;;;;
1701;TAGALOG LETTER I;Lo;0;L;;;;;N;;;;;
1702;TAGALOG LETTER U;Lo;0;L;;;;;N;;;;;
@@ -5296,7 +5341,7 @@
180B;MONGOLIAN FREE VARIATION SELECTOR ONE;Mn;0;NSM;;;;;N;;;;;
180C;MONGOLIAN FREE VARIATION SELECTOR TWO;Mn;0;NSM;;;;;N;;;;;
180D;MONGOLIAN FREE VARIATION SELECTOR THREE;Mn;0;NSM;;;;;N;;;;;
-180E;MONGOLIAN VOWEL SEPARATOR;Zs;0;WS;;;;;N;;;;;
+180E;MONGOLIAN VOWEL SEPARATOR;Cf;0;BN;;;;;N;;;;;
1810;MONGOLIAN DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
1811;MONGOLIAN DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
1812;MONGOLIAN DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
@@ -5537,6 +5582,8 @@
191A;LIMBU LETTER SSA;Lo;0;L;;;;;N;;;;;
191B;LIMBU LETTER SA;Lo;0;L;;;;;N;;;;;
191C;LIMBU LETTER HA;Lo;0;L;;;;;N;;;;;
+191D;LIMBU LETTER GYAN;Lo;0;L;;;;;N;;;;;
+191E;LIMBU LETTER TRA;Lo;0;L;;;;;N;;;;;
1920;LIMBU VOWEL SIGN A;Mn;0;NSM;;;;;N;;;;;
1921;LIMBU VOWEL SIGN I;Mn;0;NSM;;;;;N;;;;;
1922;LIMBU VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;
@@ -5751,7 +5798,7 @@
1A18;BUGINESE VOWEL SIGN U;Mn;220;NSM;;;;;N;;;;;
1A19;BUGINESE VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
1A1A;BUGINESE VOWEL SIGN O;Mc;0;L;;;;;N;;;;;
-1A1B;BUGINESE VOWEL SIGN AE;Mc;0;L;;;;;N;;;;;
+1A1B;BUGINESE VOWEL SIGN AE;Mn;0;NSM;;;;;N;;;;;
1A1E;BUGINESE PALLAWA;Po;0;L;;;;;N;;;;;
1A1F;BUGINESE END OF SECTION;Po;0;L;;;;;N;;;;;
1A20;TAI THAM LETTER HIGH KA;Lo;0;L;;;;;N;;;;;
@@ -5881,6 +5928,21 @@
1AAB;TAI THAM SIGN SATKAANKUU;Po;0;L;;;;;N;;;;;
1AAC;TAI THAM SIGN HANG;Po;0;L;;;;;N;;;;;
1AAD;TAI THAM SIGN CAANG;Po;0;L;;;;;N;;;;;
+1AB0;COMBINING DOUBLED CIRCUMFLEX ACCENT;Mn;230;NSM;;;;;N;;;;;
+1AB1;COMBINING DIAERESIS-RING;Mn;230;NSM;;;;;N;;;;;
+1AB2;COMBINING INFINITY;Mn;230;NSM;;;;;N;;;;;
+1AB3;COMBINING DOWNWARDS ARROW;Mn;230;NSM;;;;;N;;;;;
+1AB4;COMBINING TRIPLE DOT;Mn;230;NSM;;;;;N;;;;;
+1AB5;COMBINING X-X BELOW;Mn;220;NSM;;;;;N;;;;;
+1AB6;COMBINING WIGGLY LINE BELOW;Mn;220;NSM;;;;;N;;;;;
+1AB7;COMBINING OPEN MARK BELOW;Mn;220;NSM;;;;;N;;;;;
+1AB8;COMBINING DOUBLE OPEN MARK BELOW;Mn;220;NSM;;;;;N;;;;;
+1AB9;COMBINING LIGHT CENTRALIZATION STROKE BELOW;Mn;220;NSM;;;;;N;;;;;
+1ABA;COMBINING STRONG CENTRALIZATION STROKE BELOW;Mn;220;NSM;;;;;N;;;;;
+1ABB;COMBINING PARENTHESES ABOVE;Mn;230;NSM;;;;;N;;;;;
+1ABC;COMBINING DOUBLE PARENTHESES ABOVE;Mn;230;NSM;;;;;N;;;;;
+1ABD;COMBINING PARENTHESES BELOW;Mn;220;NSM;;;;;N;;;;;
+1ABE;COMBINING PARENTHESES OVERLAY;Me;0;NSM;;;;;N;;;;;
1B00;BALINESE SIGN ULU RICEM;Mn;0;NSM;;;;;N;;;;;
1B01;BALINESE SIGN ULU CANDRA;Mn;0;NSM;;;;;N;;;;;
1B02;BALINESE SIGN CECEK;Mn;0;NSM;;;;;N;;;;;
@@ -6046,8 +6108,8 @@
1BA9;SUNDANESE VOWEL SIGN PANEULEUNG;Mn;0;NSM;;;;;N;;;;;
1BAA;SUNDANESE SIGN PAMAAEH;Mc;9;L;;;;;N;;;;;
1BAB;SUNDANESE SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
-1BAC;SUNDANESE CONSONANT SIGN PASANGAN MA;Mc;0;L;;;;;N;;;;;
-1BAD;SUNDANESE CONSONANT SIGN PASANGAN WA;Mc;0;L;;;;;N;;;;;
+1BAC;SUNDANESE CONSONANT SIGN PASANGAN MA;Mn;0;NSM;;;;;N;;;;;
+1BAD;SUNDANESE CONSONANT SIGN PASANGAN WA;Mn;0;NSM;;;;;N;;;;;
1BAE;SUNDANESE LETTER KHA;Lo;0;L;;;;;N;;;;;
1BAF;SUNDANESE LETTER SYA;Lo;0;L;;;;;N;;;;;
1BB0;SUNDANESE DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
@@ -6291,6 +6353,8 @@
1CF4;VEDIC TONE CANDRA ABOVE;Mn;230;NSM;;;;;N;;;;;
1CF5;VEDIC SIGN JIHVAMULIYA;Lo;0;L;;;;;N;;;;;
1CF6;VEDIC SIGN UPADHMANIYA;Lo;0;L;;;;;N;;;;;
+1CF8;VEDIC TONE RING ABOVE;Mn;230;NSM;;;;;N;;;;;
+1CF9;VEDIC TONE DOUBLE RING ABOVE;Mn;230;NSM;;;;;N;;;;;
1D00;LATIN LETTER SMALL CAPITAL A;Ll;0;L;;;;;N;;;;;
1D01;LATIN LETTER SMALL CAPITAL AE;Ll;0;L;;;;;N;;;;;
1D02;LATIN SMALL LETTER TURNED AE;Ll;0;L;;;;;N;;;;;
@@ -6522,6 +6586,21 @@
1DE4;COMBINING LATIN SMALL LETTER S;Mn;230;NSM;;;;;N;;;;;
1DE5;COMBINING LATIN SMALL LETTER LONG S;Mn;230;NSM;;;;;N;;;;;
1DE6;COMBINING LATIN SMALL LETTER Z;Mn;230;NSM;;;;;N;;;;;
+1DE7;COMBINING LATIN SMALL LETTER ALPHA;Mn;230;NSM;;;;;N;;;;;
+1DE8;COMBINING LATIN SMALL LETTER B;Mn;230;NSM;;;;;N;;;;;
+1DE9;COMBINING LATIN SMALL LETTER BETA;Mn;230;NSM;;;;;N;;;;;
+1DEA;COMBINING LATIN SMALL LETTER SCHWA;Mn;230;NSM;;;;;N;;;;;
+1DEB;COMBINING LATIN SMALL LETTER F;Mn;230;NSM;;;;;N;;;;;
+1DEC;COMBINING LATIN SMALL LETTER L WITH DOUBLE MIDDLE TILDE;Mn;230;NSM;;;;;N;;;;;
+1DED;COMBINING LATIN SMALL LETTER O WITH LIGHT CENTRALIZATION STROKE;Mn;230;NSM;;;;;N;;;;;
+1DEE;COMBINING LATIN SMALL LETTER P;Mn;230;NSM;;;;;N;;;;;
+1DEF;COMBINING LATIN SMALL LETTER ESH;Mn;230;NSM;;;;;N;;;;;
+1DF0;COMBINING LATIN SMALL LETTER U WITH LIGHT CENTRALIZATION STROKE;Mn;230;NSM;;;;;N;;;;;
+1DF1;COMBINING LATIN SMALL LETTER W;Mn;230;NSM;;;;;N;;;;;
+1DF2;COMBINING LATIN SMALL LETTER A WITH DIAERESIS;Mn;230;NSM;;;;;N;;;;;
+1DF3;COMBINING LATIN SMALL LETTER O WITH DIAERESIS;Mn;230;NSM;;;;;N;;;;;
+1DF4;COMBINING LATIN SMALL LETTER U WITH DIAERESIS;Mn;230;NSM;;;;;N;;;;;
+1DF5;COMBINING UP TACK ABOVE;Mn;230;NSM;;;;;N;;;;;
1DFC;COMBINING DOUBLE INVERTED BREVE BELOW;Mn;233;NSM;;;;;N;;;;;
1DFD;COMBINING ALMOST EQUAL TO BELOW;Mn;220;NSM;;;;;N;;;;;
1DFE;COMBINING LEFT ARROWHEAD ABOVE;Mn;230;NSM;;;;;N;;;;;
@@ -7116,6 +7195,10 @@
2062;INVISIBLE TIMES;Cf;0;BN;;;;;N;;;;;
2063;INVISIBLE SEPARATOR;Cf;0;BN;;;;;N;;;;;
2064;INVISIBLE PLUS;Cf;0;BN;;;;;N;;;;;
+2066;LEFT-TO-RIGHT ISOLATE;Cf;0;LRI;;;;;N;;;;;
+2067;RIGHT-TO-LEFT ISOLATE;Cf;0;RLI;;;;;N;;;;;
+2068;FIRST STRONG ISOLATE;Cf;0;FSI;;;;;N;;;;;
+2069;POP DIRECTIONAL ISOLATE;Cf;0;PDI;;;;;N;;;;;
206A;INHIBIT SYMMETRIC SWAPPING;Cf;0;BN;;;;;N;;;;;
206B;ACTIVATE SYMMETRIC SWAPPING;Cf;0;BN;;;;;N;;;;;
206C;INHIBIT ARABIC FORM SHAPING;Cf;0;BN;;;;;N;;;;;
@@ -7191,6 +7274,9 @@
20B8;TENGE SIGN;Sc;0;ET;;;;;N;;;;;
20B9;INDIAN RUPEE SIGN;Sc;0;ET;;;;;N;;;;;
20BA;TURKISH LIRA SIGN;Sc;0;ET;;;;;N;;;;;
+20BB;NORDIC MARK SIGN;Sc;0;ET;;;;;N;;;;;
+20BC;MANAT SIGN;Sc;0;ET;;;;;N;;;;;
+20BD;RUBLE SIGN;Sc;0;ET;;;;;N;;;;;
20D0;COMBINING LEFT HARPOON ABOVE;Mn;230;NSM;;;;;N;NON-SPACING LEFT HARPOON ABOVE;;;;
20D1;COMBINING RIGHT HARPOON ABOVE;Mn;230;NSM;;;;;N;NON-SPACING RIGHT HARPOON ABOVE;;;;
20D2;COMBINING LONG VERTICAL LINE OVERLAY;Mn;1;NSM;;;;;N;NON-SPACING LONG VERTICAL BAR OVERLAY;;;;
@@ -7738,10 +7824,10 @@
2305;PROJECTIVE;So;0;ON;;;;;N;;;;;
2306;PERSPECTIVE;So;0;ON;;;;;N;;;;;
2307;WAVY LINE;So;0;ON;;;;;N;;;;;
-2308;LEFT CEILING;Sm;0;ON;;;;;Y;;;;;
-2309;RIGHT CEILING;Sm;0;ON;;;;;Y;;;;;
-230A;LEFT FLOOR;Sm;0;ON;;;;;Y;;;;;
-230B;RIGHT FLOOR;Sm;0;ON;;;;;Y;;;;;
+2308;LEFT CEILING;Ps;0;ON;;;;;Y;;;;;
+2309;RIGHT CEILING;Pe;0;ON;;;;;Y;;;;;
+230A;LEFT FLOOR;Ps;0;ON;;;;;Y;;;;;
+230B;RIGHT FLOOR;Pe;0;ON;;;;;Y;;;;;
230C;BOTTOM RIGHT CROP;So;0;ON;;;;;N;;;;;
230D;BOTTOM LEFT CROP;So;0;ON;;;;;N;;;;;
230E;TOP RIGHT CROP;So;0;ON;;;;;N;;;;;
@@ -7974,6 +8060,13 @@
23F1;STOPWATCH;So;0;ON;;;;;N;;;;;
23F2;TIMER CLOCK;So;0;ON;;;;;N;;;;;
23F3;HOURGLASS WITH FLOWING SAND;So;0;ON;;;;;N;;;;;
+23F4;BLACK MEDIUM LEFT-POINTING TRIANGLE;So;0;ON;;;;;N;;;;;
+23F5;BLACK MEDIUM RIGHT-POINTING TRIANGLE;So;0;ON;;;;;N;;;;;
+23F6;BLACK MEDIUM UP-POINTING TRIANGLE;So;0;ON;;;;;N;;;;;
+23F7;BLACK MEDIUM DOWN-POINTING TRIANGLE;So;0;ON;;;;;N;;;;;
+23F8;DOUBLE VERTICAL BAR;So;0;ON;;;;;N;;;;;
+23F9;BLACK SQUARE FOR STOP;So;0;ON;;;;;N;;;;;
+23FA;BLACK CIRCLE FOR RECORD;So;0;ON;;;;;N;;;;;
2400;SYMBOL FOR NULL;So;0;ON;;;;;N;GRAPHIC FOR NULL;;;;
2401;SYMBOL FOR START OF HEADING;So;0;ON;;;;;N;GRAPHIC FOR START OF HEADING;;;;
2402;SYMBOL FOR START OF TEXT;So;0;ON;;;;;N;GRAPHIC FOR START OF TEXT;;;;
@@ -8696,6 +8789,7 @@
26FD;FUEL PUMP;So;0;ON;;;;;N;;;;;
26FE;CUP ON BLACK SQUARE;So;0;ON;;;;;N;;;;;
26FF;WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE;So;0;ON;;;;;N;;;;;
+2700;BLACK SAFETY SCISSORS;So;0;ON;;;;;N;;;;;
2701;UPPER BLADE SCISSORS;So;0;ON;;;;;N;;;;;
2702;BLACK SCISSORS;So;0;ON;;;;;N;;;;;
2703;LOWER BLADE SCISSORS;So;0;ON;;;;;N;;;;;
@@ -9796,6 +9890,9 @@
2B4A;LEFTWARDS ARROW ABOVE ALMOST EQUAL TO;Sm;0;ON;;;;;N;;;;;
2B4B;LEFTWARDS ARROW ABOVE REVERSE TILDE OPERATOR;Sm;0;ON;;;;;N;;;;;
2B4C;RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR;Sm;0;ON;;;;;N;;;;;
+2B4D;DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW;So;0;ON;;;;;N;;;;;
+2B4E;SHORT SLANTED NORTH ARROW;So;0;ON;;;;;N;;;;;
+2B4F;SHORT BACKSLANTED SOUTH ARROW;So;0;ON;;;;;N;;;;;
2B50;WHITE MEDIUM STAR;So;0;ON;;;;;N;;;;;
2B51;BLACK SMALL STAR;So;0;ON;;;;;N;;;;;
2B52;WHITE SMALL STAR;So;0;ON;;;;;N;;;;;
@@ -9806,6 +9903,118 @@
2B57;HEAVY CIRCLE WITH CIRCLE INSIDE;So;0;ON;;;;;N;;;;;
2B58;HEAVY CIRCLE;So;0;ON;;;;;N;;;;;
2B59;HEAVY CIRCLED SALTIRE;So;0;ON;;;;;N;;;;;
+2B5A;SLANTED NORTH ARROW WITH HOOKED HEAD;So;0;ON;;;;;N;;;;;
+2B5B;BACKSLANTED SOUTH ARROW WITH HOOKED TAIL;So;0;ON;;;;;N;;;;;
+2B5C;SLANTED NORTH ARROW WITH HORIZONTAL TAIL;So;0;ON;;;;;N;;;;;
+2B5D;BACKSLANTED SOUTH ARROW WITH HORIZONTAL TAIL;So;0;ON;;;;;N;;;;;
+2B5E;BENT ARROW POINTING DOWNWARDS THEN NORTH EAST;So;0;ON;;;;;N;;;;;
+2B5F;SHORT BENT ARROW POINTING DOWNWARDS THEN NORTH EAST;So;0;ON;;;;;N;;;;;
+2B60;LEFTWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;;
+2B61;UPWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;;
+2B62;RIGHTWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;;
+2B63;DOWNWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;;
+2B64;LEFT RIGHT TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;;
+2B65;UP DOWN TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;;
+2B66;NORTH WEST TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;;
+2B67;NORTH EAST TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;;
+2B68;SOUTH EAST TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;;
+2B69;SOUTH WEST TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;;
+2B6A;LEFTWARDS TRIANGLE-HEADED DASHED ARROW;So;0;ON;;;;;N;;;;;
+2B6B;UPWARDS TRIANGLE-HEADED DASHED ARROW;So;0;ON;;;;;N;;;;;
+2B6C;RIGHTWARDS TRIANGLE-HEADED DASHED ARROW;So;0;ON;;;;;N;;;;;
+2B6D;DOWNWARDS TRIANGLE-HEADED DASHED ARROW;So;0;ON;;;;;N;;;;;
+2B6E;CLOCKWISE TRIANGLE-HEADED OPEN CIRCLE ARROW;So;0;ON;;;;;N;;;;;
+2B6F;ANTICLOCKWISE TRIANGLE-HEADED OPEN CIRCLE ARROW;So;0;ON;;;;;N;;;;;
+2B70;LEFTWARDS TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;;
+2B71;UPWARDS TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;;
+2B72;RIGHTWARDS TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;;
+2B73;DOWNWARDS TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;;
+2B76;NORTH WEST TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;;
+2B77;NORTH EAST TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;;
+2B78;SOUTH EAST TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;;
+2B79;SOUTH WEST TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;;
+2B7A;LEFTWARDS TRIANGLE-HEADED ARROW WITH DOUBLE HORIZONTAL STROKE;So;0;ON;;;;;N;;;;;
+2B7B;UPWARDS TRIANGLE-HEADED ARROW WITH DOUBLE HORIZONTAL STROKE;So;0;ON;;;;;N;;;;;
+2B7C;RIGHTWARDS TRIANGLE-HEADED ARROW WITH DOUBLE HORIZONTAL STROKE;So;0;ON;;;;;N;;;;;
+2B7D;DOWNWARDS TRIANGLE-HEADED ARROW WITH DOUBLE HORIZONTAL STROKE;So;0;ON;;;;;N;;;;;
+2B7E;HORIZONTAL TAB KEY;So;0;ON;;;;;N;;;;;
+2B7F;VERTICAL TAB KEY;So;0;ON;;;;;N;;;;;
+2B80;LEFTWARDS TRIANGLE-HEADED ARROW OVER RIGHTWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;;
+2B81;UPWARDS TRIANGLE-HEADED ARROW LEFTWARDS OF DOWNWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;;
+2B82;RIGHTWARDS TRIANGLE-HEADED ARROW OVER LEFTWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;;
+2B83;DOWNWARDS TRIANGLE-HEADED ARROW LEFTWARDS OF UPWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;;
+2B84;LEFTWARDS TRIANGLE-HEADED PAIRED ARROWS;So;0;ON;;;;;N;;;;;
+2B85;UPWARDS TRIANGLE-HEADED PAIRED ARROWS;So;0;ON;;;;;N;;;;;
+2B86;RIGHTWARDS TRIANGLE-HEADED PAIRED ARROWS;So;0;ON;;;;;N;;;;;
+2B87;DOWNWARDS TRIANGLE-HEADED PAIRED ARROWS;So;0;ON;;;;;N;;;;;
+2B88;LEFTWARDS BLACK CIRCLED WHITE ARROW;So;0;ON;;;;;N;;;;;
+2B89;UPWARDS BLACK CIRCLED WHITE ARROW;So;0;ON;;;;;N;;;;;
+2B8A;RIGHTWARDS BLACK CIRCLED WHITE ARROW;So;0;ON;;;;;N;;;;;
+2B8B;DOWNWARDS BLACK CIRCLED WHITE ARROW;So;0;ON;;;;;N;;;;;
+2B8C;ANTICLOCKWISE TRIANGLE-HEADED RIGHT U-SHAPED ARROW;So;0;ON;;;;;N;;;;;
+2B8D;ANTICLOCKWISE TRIANGLE-HEADED BOTTOM U-SHAPED ARROW;So;0;ON;;;;;N;;;;;
+2B8E;ANTICLOCKWISE TRIANGLE-HEADED LEFT U-SHAPED ARROW;So;0;ON;;;;;N;;;;;
+2B8F;ANTICLOCKWISE TRIANGLE-HEADED TOP U-SHAPED ARROW;So;0;ON;;;;;N;;;;;
+2B90;RETURN LEFT;So;0;ON;;;;;N;;;;;
+2B91;RETURN RIGHT;So;0;ON;;;;;N;;;;;
+2B92;NEWLINE LEFT;So;0;ON;;;;;N;;;;;
+2B93;NEWLINE RIGHT;So;0;ON;;;;;N;;;;;
+2B94;FOUR CORNER ARROWS CIRCLING ANTICLOCKWISE;So;0;ON;;;;;N;;;;;
+2B95;RIGHTWARDS BLACK ARROW;So;0;ON;;;;;N;;;;;
+2B98;THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+2B99;THREE-D RIGHT-LIGHTED UPWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+2B9A;THREE-D TOP-LIGHTED RIGHTWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+2B9B;THREE-D LEFT-LIGHTED DOWNWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+2B9C;BLACK LEFTWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+2B9D;BLACK UPWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+2B9E;BLACK RIGHTWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+2B9F;BLACK DOWNWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+2BA0;DOWNWARDS TRIANGLE-HEADED ARROW WITH LONG TIP LEFTWARDS;So;0;ON;;;;;N;;;;;
+2BA1;DOWNWARDS TRIANGLE-HEADED ARROW WITH LONG TIP RIGHTWARDS;So;0;ON;;;;;N;;;;;
+2BA2;UPWARDS TRIANGLE-HEADED ARROW WITH LONG TIP LEFTWARDS;So;0;ON;;;;;N;;;;;
+2BA3;UPWARDS TRIANGLE-HEADED ARROW WITH LONG TIP RIGHTWARDS;So;0;ON;;;;;N;;;;;
+2BA4;LEFTWARDS TRIANGLE-HEADED ARROW WITH LONG TIP UPWARDS;So;0;ON;;;;;N;;;;;
+2BA5;RIGHTWARDS TRIANGLE-HEADED ARROW WITH LONG TIP UPWARDS;So;0;ON;;;;;N;;;;;
+2BA6;LEFTWARDS TRIANGLE-HEADED ARROW WITH LONG TIP DOWNWARDS;So;0;ON;;;;;N;;;;;
+2BA7;RIGHTWARDS TRIANGLE-HEADED ARROW WITH LONG TIP DOWNWARDS;So;0;ON;;;;;N;;;;;
+2BA8;BLACK CURVED DOWNWARDS AND LEFTWARDS ARROW;So;0;ON;;;;;N;;;;;
+2BA9;BLACK CURVED DOWNWARDS AND RIGHTWARDS ARROW;So;0;ON;;;;;N;;;;;
+2BAA;BLACK CURVED UPWARDS AND LEFTWARDS ARROW;So;0;ON;;;;;N;;;;;
+2BAB;BLACK CURVED UPWARDS AND RIGHTWARDS ARROW;So;0;ON;;;;;N;;;;;
+2BAC;BLACK CURVED LEFTWARDS AND UPWARDS ARROW;So;0;ON;;;;;N;;;;;
+2BAD;BLACK CURVED RIGHTWARDS AND UPWARDS ARROW;So;0;ON;;;;;N;;;;;
+2BAE;BLACK CURVED LEFTWARDS AND DOWNWARDS ARROW;So;0;ON;;;;;N;;;;;
+2BAF;BLACK CURVED RIGHTWARDS AND DOWNWARDS ARROW;So;0;ON;;;;;N;;;;;
+2BB0;RIBBON ARROW DOWN LEFT;So;0;ON;;;;;N;;;;;
+2BB1;RIBBON ARROW DOWN RIGHT;So;0;ON;;;;;N;;;;;
+2BB2;RIBBON ARROW UP LEFT;So;0;ON;;;;;N;;;;;
+2BB3;RIBBON ARROW UP RIGHT;So;0;ON;;;;;N;;;;;
+2BB4;RIBBON ARROW LEFT UP;So;0;ON;;;;;N;;;;;
+2BB5;RIBBON ARROW RIGHT UP;So;0;ON;;;;;N;;;;;
+2BB6;RIBBON ARROW LEFT DOWN;So;0;ON;;;;;N;;;;;
+2BB7;RIBBON ARROW RIGHT DOWN;So;0;ON;;;;;N;;;;;
+2BB8;UPWARDS WHITE ARROW FROM BAR WITH HORIZONTAL BAR;So;0;ON;;;;;N;;;;;
+2BB9;UP ARROWHEAD IN A RECTANGLE BOX;So;0;ON;;;;;N;;;;;
+2BBD;BALLOT BOX WITH LIGHT X;So;0;ON;;;;;N;;;;;
+2BBE;CIRCLED X;So;0;ON;;;;;N;;;;;
+2BBF;CIRCLED BOLD X;So;0;ON;;;;;N;;;;;
+2BC0;BLACK SQUARE CENTRED;So;0;ON;;;;;N;;;;;
+2BC1;BLACK DIAMOND CENTRED;So;0;ON;;;;;N;;;;;
+2BC2;TURNED BLACK PENTAGON;So;0;ON;;;;;N;;;;;
+2BC3;HORIZONTAL BLACK OCTAGON;So;0;ON;;;;;N;;;;;
+2BC4;BLACK OCTAGON;So;0;ON;;;;;N;;;;;
+2BC5;BLACK MEDIUM UP-POINTING TRIANGLE CENTRED;So;0;ON;;;;;N;;;;;
+2BC6;BLACK MEDIUM DOWN-POINTING TRIANGLE CENTRED;So;0;ON;;;;;N;;;;;
+2BC7;BLACK MEDIUM LEFT-POINTING TRIANGLE CENTRED;So;0;ON;;;;;N;;;;;
+2BC8;BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED;So;0;ON;;;;;N;;;;;
+2BCA;TOP HALF BLACK CIRCLE;So;0;ON;;;;;N;;;;;
+2BCB;BOTTOM HALF BLACK CIRCLE;So;0;ON;;;;;N;;;;;
+2BCC;LIGHT FOUR POINTED BLACK CUSP;So;0;ON;;;;;N;;;;;
+2BCD;ROTATED LIGHT FOUR POINTED BLACK CUSP;So;0;ON;;;;;N;;;;;
+2BCE;WHITE FOUR POINTED CUSP;So;0;ON;;;;;N;;;;;
+2BCF;ROTATED WHITE FOUR POINTED CUSP;So;0;ON;;;;;N;;;;;
+2BD0;SQUARE POSITION INDICATOR;So;0;ON;;;;;N;;;;;
+2BD1;UNCERTAINTY SIGN;So;0;ON;;;;;N;;;;;
2C00;GLAGOLITIC CAPITAL LETTER AZU;Lu;0;L;;;;;N;;;;2C30;
2C01;GLAGOLITIC CAPITAL LETTER BUKY;Lu;0;L;;;;;N;;;;2C31;
2C02;GLAGOLITIC CAPITAL LETTER VEDE;Lu;0;L;;;;;N;;;;2C32;
@@ -10325,6 +10534,13 @@
2E39;TOP HALF SECTION SIGN;Po;0;ON;;;;;N;;;;;
2E3A;TWO-EM DASH;Pd;0;ON;;;;;N;;;;;
2E3B;THREE-EM DASH;Pd;0;ON;;;;;N;;;;;
+2E3C;STENOGRAPHIC FULL STOP;Po;0;ON;;;;;N;;;;;
+2E3D;VERTICAL SIX DOTS;Po;0;ON;;;;;N;;;;;
+2E3E;WIGGLY VERTICAL LINE;Po;0;ON;;;;;N;;;;;
+2E3F;CAPITULUM;Po;0;ON;;;;;N;;;;;
+2E40;DOUBLE HYPHEN;Pd;0;ON;;;;;N;;;;;
+2E41;REVERSED COMMA;Po;0;ON;;;;;N;;;;;
+2E42;DOUBLE LOW-REVERSED-9 QUOTATION MARK;Ps;0;ON;;;;;N;;;;;
2E80;CJK RADICAL REPEAT;So;0;ON;;;;;N;;;;;
2E81;CJK RADICAL CLIFF;So;0;ON;;;;;N;;;;;
2E82;CJK RADICAL SECOND ONE;So;0;ON;;;;;N;;;;;
@@ -13383,6 +13599,12 @@
A695;CYRILLIC SMALL LETTER HWE;Ll;0;L;;;;;N;;;A694;;A694
A696;CYRILLIC CAPITAL LETTER SHWE;Lu;0;L;;;;;N;;;;A697;
A697;CYRILLIC SMALL LETTER SHWE;Ll;0;L;;;;;N;;;A696;;A696
+A698;CYRILLIC CAPITAL LETTER DOUBLE O;Lu;0;L;;;;;N;;;;A699;
+A699;CYRILLIC SMALL LETTER DOUBLE O;Ll;0;L;;;;;N;;;A698;;A698
+A69A;CYRILLIC CAPITAL LETTER CROSSED O;Lu;0;L;;;;;N;;;;A69B;
+A69B;CYRILLIC SMALL LETTER CROSSED O;Ll;0;L;;;;;N;;;A69A;;A69A
+A69C;MODIFIER LETTER CYRILLIC HARD SIGN;Lm;0;L;<super> 044A;;;;N;;;;;
+A69D;MODIFIER LETTER CYRILLIC SOFT SIGN;Lm;0;L;<super> 044C;;;;N;;;;;
A69F;COMBINING CYRILLIC LETTER IOTIFIED E;Mn;230;NSM;;;;;N;;;;;
A6A0;BAMUM LETTER A;Lo;0;L;;;;;N;;;;;
A6A1;BAMUM LETTER KA;Lo;0;L;;;;;N;;;;;
@@ -13619,6 +13841,18 @@
A791;LATIN SMALL LETTER N WITH DESCENDER;Ll;0;L;;;;;N;;;A790;;A790
A792;LATIN CAPITAL LETTER C WITH BAR;Lu;0;L;;;;;N;;;;A793;
A793;LATIN SMALL LETTER C WITH BAR;Ll;0;L;;;;;N;;;A792;;A792
+A794;LATIN SMALL LETTER C WITH PALATAL HOOK;Ll;0;L;;;;;N;;;;;
+A795;LATIN SMALL LETTER H WITH PALATAL HOOK;Ll;0;L;;;;;N;;;;;
+A796;LATIN CAPITAL LETTER B WITH FLOURISH;Lu;0;L;;;;;N;;;;A797;
+A797;LATIN SMALL LETTER B WITH FLOURISH;Ll;0;L;;;;;N;;;A796;;A796
+A798;LATIN CAPITAL LETTER F WITH STROKE;Lu;0;L;;;;;N;;;;A799;
+A799;LATIN SMALL LETTER F WITH STROKE;Ll;0;L;;;;;N;;;A798;;A798
+A79A;LATIN CAPITAL LETTER VOLAPUK AE;Lu;0;L;;;;;N;;;;A79B;
+A79B;LATIN SMALL LETTER VOLAPUK AE;Ll;0;L;;;;;N;;;A79A;;A79A
+A79C;LATIN CAPITAL LETTER VOLAPUK OE;Lu;0;L;;;;;N;;;;A79D;
+A79D;LATIN SMALL LETTER VOLAPUK OE;Ll;0;L;;;;;N;;;A79C;;A79C
+A79E;LATIN CAPITAL LETTER VOLAPUK UE;Lu;0;L;;;;;N;;;;A79F;
+A79F;LATIN SMALL LETTER VOLAPUK UE;Ll;0;L;;;;;N;;;A79E;;A79E
A7A0;LATIN CAPITAL LETTER G WITH OBLIQUE STROKE;Lu;0;L;;;;;N;;;;A7A1;
A7A1;LATIN SMALL LETTER G WITH OBLIQUE STROKE;Ll;0;L;;;;;N;;;A7A0;;A7A0
A7A2;LATIN CAPITAL LETTER K WITH OBLIQUE STROKE;Lu;0;L;;;;;N;;;;A7A3;
@@ -13630,6 +13864,12 @@
A7A8;LATIN CAPITAL LETTER S WITH OBLIQUE STROKE;Lu;0;L;;;;;N;;;;A7A9;
A7A9;LATIN SMALL LETTER S WITH OBLIQUE STROKE;Ll;0;L;;;;;N;;;A7A8;;A7A8
A7AA;LATIN CAPITAL LETTER H WITH HOOK;Lu;0;L;;;;;N;;;;0266;
+A7AB;LATIN CAPITAL LETTER REVERSED OPEN E;Lu;0;L;;;;;N;;;;025C;
+A7AC;LATIN CAPITAL LETTER SCRIPT G;Lu;0;L;;;;;N;;;;0261;
+A7AD;LATIN CAPITAL LETTER L WITH BELT;Lu;0;L;;;;;N;;;;026C;
+A7B0;LATIN CAPITAL LETTER TURNED K;Lu;0;L;;;;;N;;;;029E;
+A7B1;LATIN CAPITAL LETTER TURNED T;Lu;0;L;;;;;N;;;;0287;
+A7F7;LATIN EPIGRAPHIC LETTER SIDEWAYS I;Lo;0;L;;;;;N;;;;;
A7F8;MODIFIER LETTER CAPITAL H WITH STROKE;Lm;0;L;<super> 0126;;;;N;;;;;
A7F9;MODIFIER LETTER SMALL LIGATURE OE;Lm;0;L;<super> 0153;;;;N;;;;;
A7FA;LATIN LETTER SMALL CAPITAL TURNED M;Ll;0;L;;;;;N;;;;;
@@ -14062,6 +14302,37 @@
A9D9;JAVANESE DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
A9DE;JAVANESE PADA TIRTA TUMETES;Po;0;L;;;;;N;;;;;
A9DF;JAVANESE PADA ISEN-ISEN;Po;0;L;;;;;N;;;;;
+A9E0;MYANMAR LETTER SHAN GHA;Lo;0;L;;;;;N;;;;;
+A9E1;MYANMAR LETTER SHAN CHA;Lo;0;L;;;;;N;;;;;
+A9E2;MYANMAR LETTER SHAN JHA;Lo;0;L;;;;;N;;;;;
+A9E3;MYANMAR LETTER SHAN NNA;Lo;0;L;;;;;N;;;;;
+A9E4;MYANMAR LETTER SHAN BHA;Lo;0;L;;;;;N;;;;;
+A9E5;MYANMAR SIGN SHAN SAW;Mn;0;NSM;;;;;N;;;;;
+A9E6;MYANMAR MODIFIER LETTER SHAN REDUPLICATION;Lm;0;L;;;;;N;;;;;
+A9E7;MYANMAR LETTER TAI LAING NYA;Lo;0;L;;;;;N;;;;;
+A9E8;MYANMAR LETTER TAI LAING FA;Lo;0;L;;;;;N;;;;;
+A9E9;MYANMAR LETTER TAI LAING GA;Lo;0;L;;;;;N;;;;;
+A9EA;MYANMAR LETTER TAI LAING GHA;Lo;0;L;;;;;N;;;;;
+A9EB;MYANMAR LETTER TAI LAING JA;Lo;0;L;;;;;N;;;;;
+A9EC;MYANMAR LETTER TAI LAING JHA;Lo;0;L;;;;;N;;;;;
+A9ED;MYANMAR LETTER TAI LAING DDA;Lo;0;L;;;;;N;;;;;
+A9EE;MYANMAR LETTER TAI LAING DDHA;Lo;0;L;;;;;N;;;;;
+A9EF;MYANMAR LETTER TAI LAING NNA;Lo;0;L;;;;;N;;;;;
+A9F0;MYANMAR TAI LAING DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
+A9F1;MYANMAR TAI LAING DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
+A9F2;MYANMAR TAI LAING DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
+A9F3;MYANMAR TAI LAING DIGIT THREE;Nd;0;L;;3;3;3;N;;;;;
+A9F4;MYANMAR TAI LAING DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;;
+A9F5;MYANMAR TAI LAING DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;;
+A9F6;MYANMAR TAI LAING DIGIT SIX;Nd;0;L;;6;6;6;N;;;;;
+A9F7;MYANMAR TAI LAING DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
+A9F8;MYANMAR TAI LAING DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
+A9F9;MYANMAR TAI LAING DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
+A9FA;MYANMAR LETTER TAI LAING LLA;Lo;0;L;;;;;N;;;;;
+A9FB;MYANMAR LETTER TAI LAING DA;Lo;0;L;;;;;N;;;;;
+A9FC;MYANMAR LETTER TAI LAING DHA;Lo;0;L;;;;;N;;;;;
+A9FD;MYANMAR LETTER TAI LAING BA;Lo;0;L;;;;;N;;;;;
+A9FE;MYANMAR LETTER TAI LAING BHA;Lo;0;L;;;;;N;;;;;
AA00;CHAM LETTER A;Lo;0;L;;;;;N;;;;;
AA01;CHAM LETTER I;Lo;0;L;;;;;N;;;;;
AA02;CHAM LETTER U;Lo;0;L;;;;;N;;;;;
@@ -14173,6 +14444,10 @@
AA79;MYANMAR SYMBOL AITON TWO;So;0;L;;;;;N;;;;;
AA7A;MYANMAR LETTER AITON RA;Lo;0;L;;;;;N;;;;;
AA7B;MYANMAR SIGN PAO KAREN TONE;Mc;0;L;;;;;N;;;;;
+AA7C;MYANMAR SIGN TAI LAING TONE-2;Mn;0;NSM;;;;;N;;;;;
+AA7D;MYANMAR SIGN TAI LAING TONE-5;Mc;0;L;;;;;N;;;;;
+AA7E;MYANMAR LETTER SHWE PALAUNG CHA;Lo;0;L;;;;;N;;;;;
+AA7F;MYANMAR LETTER SHWE PALAUNG SHA;Lo;0;L;;;;;N;;;;;
AA80;TAI VIET LETTER LOW KO;Lo;0;L;;;;;N;;;;;
AA81;TAI VIET LETTER HIGH KO;Lo;0;L;;;;;N;;;;;
AA82;TAI VIET LETTER LOW KHO;Lo;0;L;;;;;N;;;;;
@@ -14300,6 +14575,56 @@
AB2C;ETHIOPIC SYLLABLE BBEE;Lo;0;L;;;;;N;;;;;
AB2D;ETHIOPIC SYLLABLE BBE;Lo;0;L;;;;;N;;;;;
AB2E;ETHIOPIC SYLLABLE BBO;Lo;0;L;;;;;N;;;;;
+AB30;LATIN SMALL LETTER BARRED ALPHA;Ll;0;L;;;;;N;;;;;
+AB31;LATIN SMALL LETTER A REVERSED-SCHWA;Ll;0;L;;;;;N;;;;;
+AB32;LATIN SMALL LETTER BLACKLETTER E;Ll;0;L;;;;;N;;;;;
+AB33;LATIN SMALL LETTER BARRED E;Ll;0;L;;;;;N;;;;;
+AB34;LATIN SMALL LETTER E WITH FLOURISH;Ll;0;L;;;;;N;;;;;
+AB35;LATIN SMALL LETTER LENIS F;Ll;0;L;;;;;N;;;;;
+AB36;LATIN SMALL LETTER SCRIPT G WITH CROSSED-TAIL;Ll;0;L;;;;;N;;;;;
+AB37;LATIN SMALL LETTER L WITH INVERTED LAZY S;Ll;0;L;;;;;N;;;;;
+AB38;LATIN SMALL LETTER L WITH DOUBLE MIDDLE TILDE;Ll;0;L;;;;;N;;;;;
+AB39;LATIN SMALL LETTER L WITH MIDDLE RING;Ll;0;L;;;;;N;;;;;
+AB3A;LATIN SMALL LETTER M WITH CROSSED-TAIL;Ll;0;L;;;;;N;;;;;
+AB3B;LATIN SMALL LETTER N WITH CROSSED-TAIL;Ll;0;L;;;;;N;;;;;
+AB3C;LATIN SMALL LETTER ENG WITH CROSSED-TAIL;Ll;0;L;;;;;N;;;;;
+AB3D;LATIN SMALL LETTER BLACKLETTER O;Ll;0;L;;;;;N;;;;;
+AB3E;LATIN SMALL LETTER BLACKLETTER O WITH STROKE;Ll;0;L;;;;;N;;;;;
+AB3F;LATIN SMALL LETTER OPEN O WITH STROKE;Ll;0;L;;;;;N;;;;;
+AB40;LATIN SMALL LETTER INVERTED OE;Ll;0;L;;;;;N;;;;;
+AB41;LATIN SMALL LETTER TURNED OE WITH STROKE;Ll;0;L;;;;;N;;;;;
+AB42;LATIN SMALL LETTER TURNED OE WITH HORIZONTAL STROKE;Ll;0;L;;;;;N;;;;;
+AB43;LATIN SMALL LETTER TURNED O OPEN-O;Ll;0;L;;;;;N;;;;;
+AB44;LATIN SMALL LETTER TURNED O OPEN-O WITH STROKE;Ll;0;L;;;;;N;;;;;
+AB45;LATIN SMALL LETTER STIRRUP R;Ll;0;L;;;;;N;;;;;
+AB46;LATIN LETTER SMALL CAPITAL R WITH RIGHT LEG;Ll;0;L;;;;;N;;;;;
+AB47;LATIN SMALL LETTER R WITHOUT HANDLE;Ll;0;L;;;;;N;;;;;
+AB48;LATIN SMALL LETTER DOUBLE R;Ll;0;L;;;;;N;;;;;
+AB49;LATIN SMALL LETTER R WITH CROSSED-TAIL;Ll;0;L;;;;;N;;;;;
+AB4A;LATIN SMALL LETTER DOUBLE R WITH CROSSED-TAIL;Ll;0;L;;;;;N;;;;;
+AB4B;LATIN SMALL LETTER SCRIPT R;Ll;0;L;;;;;N;;;;;
+AB4C;LATIN SMALL LETTER SCRIPT R WITH RING;Ll;0;L;;;;;N;;;;;
+AB4D;LATIN SMALL LETTER BASELINE ESH;Ll;0;L;;;;;N;;;;;
+AB4E;LATIN SMALL LETTER U WITH SHORT RIGHT LEG;Ll;0;L;;;;;N;;;;;
+AB4F;LATIN SMALL LETTER U BAR WITH SHORT RIGHT LEG;Ll;0;L;;;;;N;;;;;
+AB50;LATIN SMALL LETTER UI;Ll;0;L;;;;;N;;;;;
+AB51;LATIN SMALL LETTER TURNED UI;Ll;0;L;;;;;N;;;;;
+AB52;LATIN SMALL LETTER U WITH LEFT HOOK;Ll;0;L;;;;;N;;;;;
+AB53;LATIN SMALL LETTER CHI;Ll;0;L;;;;;N;;;;;
+AB54;LATIN SMALL LETTER CHI WITH LOW RIGHT RING;Ll;0;L;;;;;N;;;;;
+AB55;LATIN SMALL LETTER CHI WITH LOW LEFT SERIF;Ll;0;L;;;;;N;;;;;
+AB56;LATIN SMALL LETTER X WITH LOW RIGHT RING;Ll;0;L;;;;;N;;;;;
+AB57;LATIN SMALL LETTER X WITH LONG LEFT LEG;Ll;0;L;;;;;N;;;;;
+AB58;LATIN SMALL LETTER X WITH LONG LEFT LEG AND LOW RIGHT RING;Ll;0;L;;;;;N;;;;;
+AB59;LATIN SMALL LETTER X WITH LONG LEFT LEG WITH SERIF;Ll;0;L;;;;;N;;;;;
+AB5A;LATIN SMALL LETTER Y WITH SHORT RIGHT LEG;Ll;0;L;;;;;N;;;;;
+AB5B;MODIFIER BREVE WITH INVERTED BREVE;Sk;0;L;;;;;N;;;;;
+AB5C;MODIFIER LETTER SMALL HENG;Lm;0;L;<super> A727;;;;N;;;;;
+AB5D;MODIFIER LETTER SMALL L WITH INVERTED LAZY S;Lm;0;L;<super> AB37;;;;N;;;;;
+AB5E;MODIFIER LETTER SMALL L WITH MIDDLE TILDE;Lm;0;L;<super> 026B;;;;N;;;;;
+AB5F;MODIFIER LETTER SMALL U WITH LEFT HOOK;Lm;0;L;<super> AB52;;;;N;;;;;
+AB64;LATIN SMALL LETTER INVERTED ALPHA;Ll;0;L;;;;;N;;;;;
+AB65;GREEK LETTER SMALL CAPITAL OMEGA;Ll;0;L;;;;;N;;;;;
ABC0;MEETEI MAYEK LETTER KOK;Lo;0;L;;;;;N;;;;;
ABC1;MEETEI MAYEK LETTER SAM;Lo;0;L;;;;;N;;;;;
ABC2;MEETEI MAYEK LETTER LAI;Lo;0;L;;;;;N;;;;;
@@ -15445,8 +15770,8 @@
FD3B;ARABIC LIGATURE ZAH WITH MEEM MEDIAL FORM;Lo;0;AL;<medial> 0638 0645;;;;N;;;;;
FD3C;ARABIC LIGATURE ALEF WITH FATHATAN FINAL FORM;Lo;0;AL;<final> 0627 064B;;;;N;;;;;
FD3D;ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM;Lo;0;AL;<isolated> 0627 064B;;;;N;;;;;
-FD3E;ORNATE LEFT PARENTHESIS;Ps;0;ON;;;;;N;;;;;
-FD3F;ORNATE RIGHT PARENTHESIS;Pe;0;ON;;;;;N;;;;;
+FD3E;ORNATE LEFT PARENTHESIS;Pe;0;ON;;;;;N;;;;;
+FD3F;ORNATE RIGHT PARENTHESIS;Ps;0;ON;;;;;N;;;;;
FD50;ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM;Lo;0;AL;<initial> 062A 062C 0645;;;;N;;;;;
FD51;ARABIC LIGATURE TEH WITH HAH WITH JEEM FINAL FORM;Lo;0;AL;<final> 062A 062D 062C;;;;N;;;;;
FD52;ARABIC LIGATURE TEH WITH HAH WITH JEEM INITIAL FORM;Lo;0;AL;<initial> 062A 062D 062C;;;;N;;;;;
@@ -15612,6 +15937,13 @@
FE24;COMBINING MACRON LEFT HALF;Mn;230;NSM;;;;;N;;;;;
FE25;COMBINING MACRON RIGHT HALF;Mn;230;NSM;;;;;N;;;;;
FE26;COMBINING CONJOINING MACRON;Mn;230;NSM;;;;;N;;;;;
+FE27;COMBINING LIGATURE LEFT HALF BELOW;Mn;220;NSM;;;;;N;;;;;
+FE28;COMBINING LIGATURE RIGHT HALF BELOW;Mn;220;NSM;;;;;N;;;;;
+FE29;COMBINING TILDE LEFT HALF BELOW;Mn;220;NSM;;;;;N;;;;;
+FE2A;COMBINING TILDE RIGHT HALF BELOW;Mn;220;NSM;;;;;N;;;;;
+FE2B;COMBINING MACRON LEFT HALF BELOW;Mn;220;NSM;;;;;N;;;;;
+FE2C;COMBINING MACRON RIGHT HALF BELOW;Mn;220;NSM;;;;;N;;;;;
+FE2D;COMBINING CONJOINING MACRON BELOW;Mn;220;NSM;;;;;N;;;;;
FE30;PRESENTATION FORM FOR VERTICAL TWO DOT LEADER;Po;0;ON;<vertical> 2025;;;;N;GLYPH FOR VERTICAL TWO DOT LEADER;;;;
FE31;PRESENTATION FORM FOR VERTICAL EM DASH;Pd;0;ON;<vertical> 2014;;;;N;GLYPH FOR VERTICAL EM DASH;;;;
FE32;PRESENTATION FORM FOR VERTICAL EN DASH;Pd;0;ON;<vertical> 2013;;;;N;GLYPH FOR VERTICAL EN DASH;;;;
@@ -16384,6 +16716,8 @@
10188;GREEK GRAMMA SIGN;So;0;ON;;;;;N;;;;;
10189;GREEK TRYBLION BASE SIGN;So;0;ON;;;;;N;;;;;
1018A;GREEK ZERO SIGN;No;0;ON;;;;0;N;;;;;
+1018B;GREEK ONE QUARTER SIGN;No;0;ON;;;;1/4;N;;;;;
+1018C;GREEK SINUSOID SIGN;So;0;ON;;;;;N;;;;;
10190;ROMAN SEXTANS SIGN;So;0;ON;;;;;N;;;;;
10191;ROMAN UNCIA SIGN;So;0;ON;;;;;N;;;;;
10192;ROMAN SEMUNCIA SIGN;So;0;ON;;;;;N;;;;;
@@ -16396,6 +16730,7 @@
10199;ROMAN DUPONDIUS SIGN;So;0;ON;;;;;N;;;;;
1019A;ROMAN AS SIGN;So;0;ON;;;;;N;;;;;
1019B;ROMAN CENTURIAL SIGN;So;0;ON;;;;;N;;;;;
+101A0;GREEK SYMBOL TAU RHO;So;0;ON;;;;;N;;;;;
101D0;PHAISTOS DISC SIGN PEDESTRIAN;So;0;L;;;;;N;;;;;
101D1;PHAISTOS DISC SIGN PLUMED HEAD;So;0;L;;;;;N;;;;;
101D2;PHAISTOS DISC SIGN TATTOOED HEAD;So;0;L;;;;;N;;;;;
@@ -16520,6 +16855,34 @@
102CE;CARIAN LETTER LD2;Lo;0;L;;;;;N;;;;;
102CF;CARIAN LETTER E2;Lo;0;L;;;;;N;;;;;
102D0;CARIAN LETTER UUU3;Lo;0;L;;;;;N;;;;;
+102E0;COPTIC EPACT THOUSANDS MARK;Mn;220;NSM;;;;;N;;;;;
+102E1;COPTIC EPACT DIGIT ONE;No;0;EN;;;;1;N;;;;;
+102E2;COPTIC EPACT DIGIT TWO;No;0;EN;;;;2;N;;;;;
+102E3;COPTIC EPACT DIGIT THREE;No;0;EN;;;;3;N;;;;;
+102E4;COPTIC EPACT DIGIT FOUR;No;0;EN;;;;4;N;;;;;
+102E5;COPTIC EPACT DIGIT FIVE;No;0;EN;;;;5;N;;;;;
+102E6;COPTIC EPACT DIGIT SIX;No;0;EN;;;;6;N;;;;;
+102E7;COPTIC EPACT DIGIT SEVEN;No;0;EN;;;;7;N;;;;;
+102E8;COPTIC EPACT DIGIT EIGHT;No;0;EN;;;;8;N;;;;;
+102E9;COPTIC EPACT DIGIT NINE;No;0;EN;;;;9;N;;;;;
+102EA;COPTIC EPACT NUMBER TEN;No;0;EN;;;;10;N;;;;;
+102EB;COPTIC EPACT NUMBER TWENTY;No;0;EN;;;;20;N;;;;;
+102EC;COPTIC EPACT NUMBER THIRTY;No;0;EN;;;;30;N;;;;;
+102ED;COPTIC EPACT NUMBER FORTY;No;0;EN;;;;40;N;;;;;
+102EE;COPTIC EPACT NUMBER FIFTY;No;0;EN;;;;50;N;;;;;
+102EF;COPTIC EPACT NUMBER SIXTY;No;0;EN;;;;60;N;;;;;
+102F0;COPTIC EPACT NUMBER SEVENTY;No;0;EN;;;;70;N;;;;;
+102F1;COPTIC EPACT NUMBER EIGHTY;No;0;EN;;;;80;N;;;;;
+102F2;COPTIC EPACT NUMBER NINETY;No;0;EN;;;;90;N;;;;;
+102F3;COPTIC EPACT NUMBER ONE HUNDRED;No;0;EN;;;;100;N;;;;;
+102F4;COPTIC EPACT NUMBER TWO HUNDRED;No;0;EN;;;;200;N;;;;;
+102F5;COPTIC EPACT NUMBER THREE HUNDRED;No;0;EN;;;;300;N;;;;;
+102F6;COPTIC EPACT NUMBER FOUR HUNDRED;No;0;EN;;;;400;N;;;;;
+102F7;COPTIC EPACT NUMBER FIVE HUNDRED;No;0;EN;;;;500;N;;;;;
+102F8;COPTIC EPACT NUMBER SIX HUNDRED;No;0;EN;;;;600;N;;;;;
+102F9;COPTIC EPACT NUMBER SEVEN HUNDRED;No;0;EN;;;;700;N;;;;;
+102FA;COPTIC EPACT NUMBER EIGHT HUNDRED;No;0;EN;;;;800;N;;;;;
+102FB;COPTIC EPACT NUMBER NINE HUNDRED;No;0;EN;;;;900;N;;;;;
10300;OLD ITALIC LETTER A;Lo;0;L;;;;;N;;;;;
10301;OLD ITALIC LETTER BE;Lo;0;L;;;;;N;;;;;
10302;OLD ITALIC LETTER KE;Lo;0;L;;;;;N;;;;;
@@ -16551,6 +16914,7 @@
1031C;OLD ITALIC LETTER CHE;Lo;0;L;;;;;N;;;;;
1031D;OLD ITALIC LETTER II;Lo;0;L;;;;;N;;;;;
1031E;OLD ITALIC LETTER UU;Lo;0;L;;;;;N;;;;;
+1031F;OLD ITALIC LETTER ESS;Lo;0;L;;;;;N;;;;;
10320;OLD ITALIC NUMERAL ONE;No;0;L;;;;1;N;;;;;
10321;OLD ITALIC NUMERAL FIVE;No;0;L;;;;5;N;;;;;
10322;OLD ITALIC NUMERAL TEN;No;0;L;;;;10;N;;;;;
@@ -16582,6 +16946,49 @@
10348;GOTHIC LETTER HWAIR;Lo;0;L;;;;;N;;;;;
10349;GOTHIC LETTER OTHAL;Lo;0;L;;;;;N;;;;;
1034A;GOTHIC LETTER NINE HUNDRED;Nl;0;L;;;;900;N;;;;;
+10350;OLD PERMIC LETTER AN;Lo;0;L;;;;;N;;;;;
+10351;OLD PERMIC LETTER BUR;Lo;0;L;;;;;N;;;;;
+10352;OLD PERMIC LETTER GAI;Lo;0;L;;;;;N;;;;;
+10353;OLD PERMIC LETTER DOI;Lo;0;L;;;;;N;;;;;
+10354;OLD PERMIC LETTER E;Lo;0;L;;;;;N;;;;;
+10355;OLD PERMIC LETTER ZHOI;Lo;0;L;;;;;N;;;;;
+10356;OLD PERMIC LETTER DZHOI;Lo;0;L;;;;;N;;;;;
+10357;OLD PERMIC LETTER ZATA;Lo;0;L;;;;;N;;;;;
+10358;OLD PERMIC LETTER DZITA;Lo;0;L;;;;;N;;;;;
+10359;OLD PERMIC LETTER I;Lo;0;L;;;;;N;;;;;
+1035A;OLD PERMIC LETTER KOKE;Lo;0;L;;;;;N;;;;;
+1035B;OLD PERMIC LETTER LEI;Lo;0;L;;;;;N;;;;;
+1035C;OLD PERMIC LETTER MENOE;Lo;0;L;;;;;N;;;;;
+1035D;OLD PERMIC LETTER NENOE;Lo;0;L;;;;;N;;;;;
+1035E;OLD PERMIC LETTER VOOI;Lo;0;L;;;;;N;;;;;
+1035F;OLD PERMIC LETTER PEEI;Lo;0;L;;;;;N;;;;;
+10360;OLD PERMIC LETTER REI;Lo;0;L;;;;;N;;;;;
+10361;OLD PERMIC LETTER SII;Lo;0;L;;;;;N;;;;;
+10362;OLD PERMIC LETTER TAI;Lo;0;L;;;;;N;;;;;
+10363;OLD PERMIC LETTER U;Lo;0;L;;;;;N;;;;;
+10364;OLD PERMIC LETTER CHERY;Lo;0;L;;;;;N;;;;;
+10365;OLD PERMIC LETTER SHOOI;Lo;0;L;;;;;N;;;;;
+10366;OLD PERMIC LETTER SHCHOOI;Lo;0;L;;;;;N;;;;;
+10367;OLD PERMIC LETTER YRY;Lo;0;L;;;;;N;;;;;
+10368;OLD PERMIC LETTER YERU;Lo;0;L;;;;;N;;;;;
+10369;OLD PERMIC LETTER O;Lo;0;L;;;;;N;;;;;
+1036A;OLD PERMIC LETTER OO;Lo;0;L;;;;;N;;;;;
+1036B;OLD PERMIC LETTER EF;Lo;0;L;;;;;N;;;;;
+1036C;OLD PERMIC LETTER HA;Lo;0;L;;;;;N;;;;;
+1036D;OLD PERMIC LETTER TSIU;Lo;0;L;;;;;N;;;;;
+1036E;OLD PERMIC LETTER VER;Lo;0;L;;;;;N;;;;;
+1036F;OLD PERMIC LETTER YER;Lo;0;L;;;;;N;;;;;
+10370;OLD PERMIC LETTER YERI;Lo;0;L;;;;;N;;;;;
+10371;OLD PERMIC LETTER YAT;Lo;0;L;;;;;N;;;;;
+10372;OLD PERMIC LETTER IE;Lo;0;L;;;;;N;;;;;
+10373;OLD PERMIC LETTER YU;Lo;0;L;;;;;N;;;;;
+10374;OLD PERMIC LETTER YA;Lo;0;L;;;;;N;;;;;
+10375;OLD PERMIC LETTER IA;Lo;0;L;;;;;N;;;;;
+10376;COMBINING OLD PERMIC LETTER AN;Mn;230;NSM;;;;;N;;;;;
+10377;COMBINING OLD PERMIC LETTER DOI;Mn;230;NSM;;;;;N;;;;;
+10378;COMBINING OLD PERMIC LETTER ZATA;Mn;230;NSM;;;;;N;;;;;
+10379;COMBINING OLD PERMIC LETTER NENOE;Mn;230;NSM;;;;;N;;;;;
+1037A;COMBINING OLD PERMIC LETTER SII;Mn;230;NSM;;;;;N;;;;;
10380;UGARITIC LETTER ALPA;Lo;0;L;;;;;N;;;;;
10381;UGARITIC LETTER BETA;Lo;0;L;;;;;N;;;;;
10382;UGARITIC LETTER GAMLA;Lo;0;L;;;;;N;;;;;
@@ -16831,6 +17238,440 @@
104A7;OSMANYA DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
104A8;OSMANYA DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
104A9;OSMANYA DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
+10500;ELBASAN LETTER A;Lo;0;L;;;;;N;;;;;
+10501;ELBASAN LETTER BE;Lo;0;L;;;;;N;;;;;
+10502;ELBASAN LETTER CE;Lo;0;L;;;;;N;;;;;
+10503;ELBASAN LETTER CHE;Lo;0;L;;;;;N;;;;;
+10504;ELBASAN LETTER DE;Lo;0;L;;;;;N;;;;;
+10505;ELBASAN LETTER NDE;Lo;0;L;;;;;N;;;;;
+10506;ELBASAN LETTER DHE;Lo;0;L;;;;;N;;;;;
+10507;ELBASAN LETTER EI;Lo;0;L;;;;;N;;;;;
+10508;ELBASAN LETTER E;Lo;0;L;;;;;N;;;;;
+10509;ELBASAN LETTER FE;Lo;0;L;;;;;N;;;;;
+1050A;ELBASAN LETTER GE;Lo;0;L;;;;;N;;;;;
+1050B;ELBASAN LETTER GJE;Lo;0;L;;;;;N;;;;;
+1050C;ELBASAN LETTER HE;Lo;0;L;;;;;N;;;;;
+1050D;ELBASAN LETTER I;Lo;0;L;;;;;N;;;;;
+1050E;ELBASAN LETTER JE;Lo;0;L;;;;;N;;;;;
+1050F;ELBASAN LETTER KE;Lo;0;L;;;;;N;;;;;
+10510;ELBASAN LETTER LE;Lo;0;L;;;;;N;;;;;
+10511;ELBASAN LETTER LLE;Lo;0;L;;;;;N;;;;;
+10512;ELBASAN LETTER ME;Lo;0;L;;;;;N;;;;;
+10513;ELBASAN LETTER NE;Lo;0;L;;;;;N;;;;;
+10514;ELBASAN LETTER NA;Lo;0;L;;;;;N;;;;;
+10515;ELBASAN LETTER NJE;Lo;0;L;;;;;N;;;;;
+10516;ELBASAN LETTER O;Lo;0;L;;;;;N;;;;;
+10517;ELBASAN LETTER PE;Lo;0;L;;;;;N;;;;;
+10518;ELBASAN LETTER QE;Lo;0;L;;;;;N;;;;;
+10519;ELBASAN LETTER RE;Lo;0;L;;;;;N;;;;;
+1051A;ELBASAN LETTER RRE;Lo;0;L;;;;;N;;;;;
+1051B;ELBASAN LETTER SE;Lo;0;L;;;;;N;;;;;
+1051C;ELBASAN LETTER SHE;Lo;0;L;;;;;N;;;;;
+1051D;ELBASAN LETTER TE;Lo;0;L;;;;;N;;;;;
+1051E;ELBASAN LETTER THE;Lo;0;L;;;;;N;;;;;
+1051F;ELBASAN LETTER U;Lo;0;L;;;;;N;;;;;
+10520;ELBASAN LETTER VE;Lo;0;L;;;;;N;;;;;
+10521;ELBASAN LETTER XE;Lo;0;L;;;;;N;;;;;
+10522;ELBASAN LETTER Y;Lo;0;L;;;;;N;;;;;
+10523;ELBASAN LETTER ZE;Lo;0;L;;;;;N;;;;;
+10524;ELBASAN LETTER ZHE;Lo;0;L;;;;;N;;;;;
+10525;ELBASAN LETTER GHE;Lo;0;L;;;;;N;;;;;
+10526;ELBASAN LETTER GHAMMA;Lo;0;L;;;;;N;;;;;
+10527;ELBASAN LETTER KHE;Lo;0;L;;;;;N;;;;;
+10530;CAUCASIAN ALBANIAN LETTER ALT;Lo;0;L;;;;;N;;;;;
+10531;CAUCASIAN ALBANIAN LETTER BET;Lo;0;L;;;;;N;;;;;
+10532;CAUCASIAN ALBANIAN LETTER GIM;Lo;0;L;;;;;N;;;;;
+10533;CAUCASIAN ALBANIAN LETTER DAT;Lo;0;L;;;;;N;;;;;
+10534;CAUCASIAN ALBANIAN LETTER EB;Lo;0;L;;;;;N;;;;;
+10535;CAUCASIAN ALBANIAN LETTER ZARL;Lo;0;L;;;;;N;;;;;
+10536;CAUCASIAN ALBANIAN LETTER EYN;Lo;0;L;;;;;N;;;;;
+10537;CAUCASIAN ALBANIAN LETTER ZHIL;Lo;0;L;;;;;N;;;;;
+10538;CAUCASIAN ALBANIAN LETTER TAS;Lo;0;L;;;;;N;;;;;
+10539;CAUCASIAN ALBANIAN LETTER CHA;Lo;0;L;;;;;N;;;;;
+1053A;CAUCASIAN ALBANIAN LETTER YOWD;Lo;0;L;;;;;N;;;;;
+1053B;CAUCASIAN ALBANIAN LETTER ZHA;Lo;0;L;;;;;N;;;;;
+1053C;CAUCASIAN ALBANIAN LETTER IRB;Lo;0;L;;;;;N;;;;;
+1053D;CAUCASIAN ALBANIAN LETTER SHA;Lo;0;L;;;;;N;;;;;
+1053E;CAUCASIAN ALBANIAN LETTER LAN;Lo;0;L;;;;;N;;;;;
+1053F;CAUCASIAN ALBANIAN LETTER INYA;Lo;0;L;;;;;N;;;;;
+10540;CAUCASIAN ALBANIAN LETTER XEYN;Lo;0;L;;;;;N;;;;;
+10541;CAUCASIAN ALBANIAN LETTER DYAN;Lo;0;L;;;;;N;;;;;
+10542;CAUCASIAN ALBANIAN LETTER CAR;Lo;0;L;;;;;N;;;;;
+10543;CAUCASIAN ALBANIAN LETTER JHOX;Lo;0;L;;;;;N;;;;;
+10544;CAUCASIAN ALBANIAN LETTER KAR;Lo;0;L;;;;;N;;;;;
+10545;CAUCASIAN ALBANIAN LETTER LYIT;Lo;0;L;;;;;N;;;;;
+10546;CAUCASIAN ALBANIAN LETTER HEYT;Lo;0;L;;;;;N;;;;;
+10547;CAUCASIAN ALBANIAN LETTER QAY;Lo;0;L;;;;;N;;;;;
+10548;CAUCASIAN ALBANIAN LETTER AOR;Lo;0;L;;;;;N;;;;;
+10549;CAUCASIAN ALBANIAN LETTER CHOY;Lo;0;L;;;;;N;;;;;
+1054A;CAUCASIAN ALBANIAN LETTER CHI;Lo;0;L;;;;;N;;;;;
+1054B;CAUCASIAN ALBANIAN LETTER CYAY;Lo;0;L;;;;;N;;;;;
+1054C;CAUCASIAN ALBANIAN LETTER MAQ;Lo;0;L;;;;;N;;;;;
+1054D;CAUCASIAN ALBANIAN LETTER QAR;Lo;0;L;;;;;N;;;;;
+1054E;CAUCASIAN ALBANIAN LETTER NOWC;Lo;0;L;;;;;N;;;;;
+1054F;CAUCASIAN ALBANIAN LETTER DZYAY;Lo;0;L;;;;;N;;;;;
+10550;CAUCASIAN ALBANIAN LETTER SHAK;Lo;0;L;;;;;N;;;;;
+10551;CAUCASIAN ALBANIAN LETTER JAYN;Lo;0;L;;;;;N;;;;;
+10552;CAUCASIAN ALBANIAN LETTER ON;Lo;0;L;;;;;N;;;;;
+10553;CAUCASIAN ALBANIAN LETTER TYAY;Lo;0;L;;;;;N;;;;;
+10554;CAUCASIAN ALBANIAN LETTER FAM;Lo;0;L;;;;;N;;;;;
+10555;CAUCASIAN ALBANIAN LETTER DZAY;Lo;0;L;;;;;N;;;;;
+10556;CAUCASIAN ALBANIAN LETTER CHAT;Lo;0;L;;;;;N;;;;;
+10557;CAUCASIAN ALBANIAN LETTER PEN;Lo;0;L;;;;;N;;;;;
+10558;CAUCASIAN ALBANIAN LETTER GHEYS;Lo;0;L;;;;;N;;;;;
+10559;CAUCASIAN ALBANIAN LETTER RAT;Lo;0;L;;;;;N;;;;;
+1055A;CAUCASIAN ALBANIAN LETTER SEYK;Lo;0;L;;;;;N;;;;;
+1055B;CAUCASIAN ALBANIAN LETTER VEYZ;Lo;0;L;;;;;N;;;;;
+1055C;CAUCASIAN ALBANIAN LETTER TIWR;Lo;0;L;;;;;N;;;;;
+1055D;CAUCASIAN ALBANIAN LETTER SHOY;Lo;0;L;;;;;N;;;;;
+1055E;CAUCASIAN ALBANIAN LETTER IWN;Lo;0;L;;;;;N;;;;;
+1055F;CAUCASIAN ALBANIAN LETTER CYAW;Lo;0;L;;;;;N;;;;;
+10560;CAUCASIAN ALBANIAN LETTER CAYN;Lo;0;L;;;;;N;;;;;
+10561;CAUCASIAN ALBANIAN LETTER YAYD;Lo;0;L;;;;;N;;;;;
+10562;CAUCASIAN ALBANIAN LETTER PIWR;Lo;0;L;;;;;N;;;;;
+10563;CAUCASIAN ALBANIAN LETTER KIW;Lo;0;L;;;;;N;;;;;
+1056F;CAUCASIAN ALBANIAN CITATION MARK;Po;0;L;;;;;N;;;;;
+10600;LINEAR A SIGN AB001;Lo;0;L;;;;;N;;;;;
+10601;LINEAR A SIGN AB002;Lo;0;L;;;;;N;;;;;
+10602;LINEAR A SIGN AB003;Lo;0;L;;;;;N;;;;;
+10603;LINEAR A SIGN AB004;Lo;0;L;;;;;N;;;;;
+10604;LINEAR A SIGN AB005;Lo;0;L;;;;;N;;;;;
+10605;LINEAR A SIGN AB006;Lo;0;L;;;;;N;;;;;
+10606;LINEAR A SIGN AB007;Lo;0;L;;;;;N;;;;;
+10607;LINEAR A SIGN AB008;Lo;0;L;;;;;N;;;;;
+10608;LINEAR A SIGN AB009;Lo;0;L;;;;;N;;;;;
+10609;LINEAR A SIGN AB010;Lo;0;L;;;;;N;;;;;
+1060A;LINEAR A SIGN AB011;Lo;0;L;;;;;N;;;;;
+1060B;LINEAR A SIGN AB013;Lo;0;L;;;;;N;;;;;
+1060C;LINEAR A SIGN AB016;Lo;0;L;;;;;N;;;;;
+1060D;LINEAR A SIGN AB017;Lo;0;L;;;;;N;;;;;
+1060E;LINEAR A SIGN AB020;Lo;0;L;;;;;N;;;;;
+1060F;LINEAR A SIGN AB021;Lo;0;L;;;;;N;;;;;
+10610;LINEAR A SIGN AB021F;Lo;0;L;;;;;N;;;;;
+10611;LINEAR A SIGN AB021M;Lo;0;L;;;;;N;;;;;
+10612;LINEAR A SIGN AB022;Lo;0;L;;;;;N;;;;;
+10613;LINEAR A SIGN AB022F;Lo;0;L;;;;;N;;;;;
+10614;LINEAR A SIGN AB022M;Lo;0;L;;;;;N;;;;;
+10615;LINEAR A SIGN AB023;Lo;0;L;;;;;N;;;;;
+10616;LINEAR A SIGN AB023M;Lo;0;L;;;;;N;;;;;
+10617;LINEAR A SIGN AB024;Lo;0;L;;;;;N;;;;;
+10618;LINEAR A SIGN AB026;Lo;0;L;;;;;N;;;;;
+10619;LINEAR A SIGN AB027;Lo;0;L;;;;;N;;;;;
+1061A;LINEAR A SIGN AB028;Lo;0;L;;;;;N;;;;;
+1061B;LINEAR A SIGN A028B;Lo;0;L;;;;;N;;;;;
+1061C;LINEAR A SIGN AB029;Lo;0;L;;;;;N;;;;;
+1061D;LINEAR A SIGN AB030;Lo;0;L;;;;;N;;;;;
+1061E;LINEAR A SIGN AB031;Lo;0;L;;;;;N;;;;;
+1061F;LINEAR A SIGN AB034;Lo;0;L;;;;;N;;;;;
+10620;LINEAR A SIGN AB037;Lo;0;L;;;;;N;;;;;
+10621;LINEAR A SIGN AB038;Lo;0;L;;;;;N;;;;;
+10622;LINEAR A SIGN AB039;Lo;0;L;;;;;N;;;;;
+10623;LINEAR A SIGN AB040;Lo;0;L;;;;;N;;;;;
+10624;LINEAR A SIGN AB041;Lo;0;L;;;;;N;;;;;
+10625;LINEAR A SIGN AB044;Lo;0;L;;;;;N;;;;;
+10626;LINEAR A SIGN AB045;Lo;0;L;;;;;N;;;;;
+10627;LINEAR A SIGN AB046;Lo;0;L;;;;;N;;;;;
+10628;LINEAR A SIGN AB047;Lo;0;L;;;;;N;;;;;
+10629;LINEAR A SIGN AB048;Lo;0;L;;;;;N;;;;;
+1062A;LINEAR A SIGN AB049;Lo;0;L;;;;;N;;;;;
+1062B;LINEAR A SIGN AB050;Lo;0;L;;;;;N;;;;;
+1062C;LINEAR A SIGN AB051;Lo;0;L;;;;;N;;;;;
+1062D;LINEAR A SIGN AB053;Lo;0;L;;;;;N;;;;;
+1062E;LINEAR A SIGN AB054;Lo;0;L;;;;;N;;;;;
+1062F;LINEAR A SIGN AB055;Lo;0;L;;;;;N;;;;;
+10630;LINEAR A SIGN AB056;Lo;0;L;;;;;N;;;;;
+10631;LINEAR A SIGN AB057;Lo;0;L;;;;;N;;;;;
+10632;LINEAR A SIGN AB058;Lo;0;L;;;;;N;;;;;
+10633;LINEAR A SIGN AB059;Lo;0;L;;;;;N;;;;;
+10634;LINEAR A SIGN AB060;Lo;0;L;;;;;N;;;;;
+10635;LINEAR A SIGN AB061;Lo;0;L;;;;;N;;;;;
+10636;LINEAR A SIGN AB065;Lo;0;L;;;;;N;;;;;
+10637;LINEAR A SIGN AB066;Lo;0;L;;;;;N;;;;;
+10638;LINEAR A SIGN AB067;Lo;0;L;;;;;N;;;;;
+10639;LINEAR A SIGN AB069;Lo;0;L;;;;;N;;;;;
+1063A;LINEAR A SIGN AB070;Lo;0;L;;;;;N;;;;;
+1063B;LINEAR A SIGN AB073;Lo;0;L;;;;;N;;;;;
+1063C;LINEAR A SIGN AB074;Lo;0;L;;;;;N;;;;;
+1063D;LINEAR A SIGN AB076;Lo;0;L;;;;;N;;;;;
+1063E;LINEAR A SIGN AB077;Lo;0;L;;;;;N;;;;;
+1063F;LINEAR A SIGN AB078;Lo;0;L;;;;;N;;;;;
+10640;LINEAR A SIGN AB079;Lo;0;L;;;;;N;;;;;
+10641;LINEAR A SIGN AB080;Lo;0;L;;;;;N;;;;;
+10642;LINEAR A SIGN AB081;Lo;0;L;;;;;N;;;;;
+10643;LINEAR A SIGN AB082;Lo;0;L;;;;;N;;;;;
+10644;LINEAR A SIGN AB085;Lo;0;L;;;;;N;;;;;
+10645;LINEAR A SIGN AB086;Lo;0;L;;;;;N;;;;;
+10646;LINEAR A SIGN AB087;Lo;0;L;;;;;N;;;;;
+10647;LINEAR A SIGN A100-102;Lo;0;L;;;;;N;;;;;
+10648;LINEAR A SIGN AB118;Lo;0;L;;;;;N;;;;;
+10649;LINEAR A SIGN AB120;Lo;0;L;;;;;N;;;;;
+1064A;LINEAR A SIGN A120B;Lo;0;L;;;;;N;;;;;
+1064B;LINEAR A SIGN AB122;Lo;0;L;;;;;N;;;;;
+1064C;LINEAR A SIGN AB123;Lo;0;L;;;;;N;;;;;
+1064D;LINEAR A SIGN AB131A;Lo;0;L;;;;;N;;;;;
+1064E;LINEAR A SIGN AB131B;Lo;0;L;;;;;N;;;;;
+1064F;LINEAR A SIGN A131C;Lo;0;L;;;;;N;;;;;
+10650;LINEAR A SIGN AB164;Lo;0;L;;;;;N;;;;;
+10651;LINEAR A SIGN AB171;Lo;0;L;;;;;N;;;;;
+10652;LINEAR A SIGN AB180;Lo;0;L;;;;;N;;;;;
+10653;LINEAR A SIGN AB188;Lo;0;L;;;;;N;;;;;
+10654;LINEAR A SIGN AB191;Lo;0;L;;;;;N;;;;;
+10655;LINEAR A SIGN A301;Lo;0;L;;;;;N;;;;;
+10656;LINEAR A SIGN A302;Lo;0;L;;;;;N;;;;;
+10657;LINEAR A SIGN A303;Lo;0;L;;;;;N;;;;;
+10658;LINEAR A SIGN A304;Lo;0;L;;;;;N;;;;;
+10659;LINEAR A SIGN A305;Lo;0;L;;;;;N;;;;;
+1065A;LINEAR A SIGN A306;Lo;0;L;;;;;N;;;;;
+1065B;LINEAR A SIGN A307;Lo;0;L;;;;;N;;;;;
+1065C;LINEAR A SIGN A308;Lo;0;L;;;;;N;;;;;
+1065D;LINEAR A SIGN A309A;Lo;0;L;;;;;N;;;;;
+1065E;LINEAR A SIGN A309B;Lo;0;L;;;;;N;;;;;
+1065F;LINEAR A SIGN A309C;Lo;0;L;;;;;N;;;;;
+10660;LINEAR A SIGN A310;Lo;0;L;;;;;N;;;;;
+10661;LINEAR A SIGN A311;Lo;0;L;;;;;N;;;;;
+10662;LINEAR A SIGN A312;Lo;0;L;;;;;N;;;;;
+10663;LINEAR A SIGN A313A;Lo;0;L;;;;;N;;;;;
+10664;LINEAR A SIGN A313B;Lo;0;L;;;;;N;;;;;
+10665;LINEAR A SIGN A313C;Lo;0;L;;;;;N;;;;;
+10666;LINEAR A SIGN A314;Lo;0;L;;;;;N;;;;;
+10667;LINEAR A SIGN A315;Lo;0;L;;;;;N;;;;;
+10668;LINEAR A SIGN A316;Lo;0;L;;;;;N;;;;;
+10669;LINEAR A SIGN A317;Lo;0;L;;;;;N;;;;;
+1066A;LINEAR A SIGN A318;Lo;0;L;;;;;N;;;;;
+1066B;LINEAR A SIGN A319;Lo;0;L;;;;;N;;;;;
+1066C;LINEAR A SIGN A320;Lo;0;L;;;;;N;;;;;
+1066D;LINEAR A SIGN A321;Lo;0;L;;;;;N;;;;;
+1066E;LINEAR A SIGN A322;Lo;0;L;;;;;N;;;;;
+1066F;LINEAR A SIGN A323;Lo;0;L;;;;;N;;;;;
+10670;LINEAR A SIGN A324;Lo;0;L;;;;;N;;;;;
+10671;LINEAR A SIGN A325;Lo;0;L;;;;;N;;;;;
+10672;LINEAR A SIGN A326;Lo;0;L;;;;;N;;;;;
+10673;LINEAR A SIGN A327;Lo;0;L;;;;;N;;;;;
+10674;LINEAR A SIGN A328;Lo;0;L;;;;;N;;;;;
+10675;LINEAR A SIGN A329;Lo;0;L;;;;;N;;;;;
+10676;LINEAR A SIGN A330;Lo;0;L;;;;;N;;;;;
+10677;LINEAR A SIGN A331;Lo;0;L;;;;;N;;;;;
+10678;LINEAR A SIGN A332;Lo;0;L;;;;;N;;;;;
+10679;LINEAR A SIGN A333;Lo;0;L;;;;;N;;;;;
+1067A;LINEAR A SIGN A334;Lo;0;L;;;;;N;;;;;
+1067B;LINEAR A SIGN A335;Lo;0;L;;;;;N;;;;;
+1067C;LINEAR A SIGN A336;Lo;0;L;;;;;N;;;;;
+1067D;LINEAR A SIGN A337;Lo;0;L;;;;;N;;;;;
+1067E;LINEAR A SIGN A338;Lo;0;L;;;;;N;;;;;
+1067F;LINEAR A SIGN A339;Lo;0;L;;;;;N;;;;;
+10680;LINEAR A SIGN A340;Lo;0;L;;;;;N;;;;;
+10681;LINEAR A SIGN A341;Lo;0;L;;;;;N;;;;;
+10682;LINEAR A SIGN A342;Lo;0;L;;;;;N;;;;;
+10683;LINEAR A SIGN A343;Lo;0;L;;;;;N;;;;;
+10684;LINEAR A SIGN A344;Lo;0;L;;;;;N;;;;;
+10685;LINEAR A SIGN A345;Lo;0;L;;;;;N;;;;;
+10686;LINEAR A SIGN A346;Lo;0;L;;;;;N;;;;;
+10687;LINEAR A SIGN A347;Lo;0;L;;;;;N;;;;;
+10688;LINEAR A SIGN A348;Lo;0;L;;;;;N;;;;;
+10689;LINEAR A SIGN A349;Lo;0;L;;;;;N;;;;;
+1068A;LINEAR A SIGN A350;Lo;0;L;;;;;N;;;;;
+1068B;LINEAR A SIGN A351;Lo;0;L;;;;;N;;;;;
+1068C;LINEAR A SIGN A352;Lo;0;L;;;;;N;;;;;
+1068D;LINEAR A SIGN A353;Lo;0;L;;;;;N;;;;;
+1068E;LINEAR A SIGN A354;Lo;0;L;;;;;N;;;;;
+1068F;LINEAR A SIGN A355;Lo;0;L;;;;;N;;;;;
+10690;LINEAR A SIGN A356;Lo;0;L;;;;;N;;;;;
+10691;LINEAR A SIGN A357;Lo;0;L;;;;;N;;;;;
+10692;LINEAR A SIGN A358;Lo;0;L;;;;;N;;;;;
+10693;LINEAR A SIGN A359;Lo;0;L;;;;;N;;;;;
+10694;LINEAR A SIGN A360;Lo;0;L;;;;;N;;;;;
+10695;LINEAR A SIGN A361;Lo;0;L;;;;;N;;;;;
+10696;LINEAR A SIGN A362;Lo;0;L;;;;;N;;;;;
+10697;LINEAR A SIGN A363;Lo;0;L;;;;;N;;;;;
+10698;LINEAR A SIGN A364;Lo;0;L;;;;;N;;;;;
+10699;LINEAR A SIGN A365;Lo;0;L;;;;;N;;;;;
+1069A;LINEAR A SIGN A366;Lo;0;L;;;;;N;;;;;
+1069B;LINEAR A SIGN A367;Lo;0;L;;;;;N;;;;;
+1069C;LINEAR A SIGN A368;Lo;0;L;;;;;N;;;;;
+1069D;LINEAR A SIGN A369;Lo;0;L;;;;;N;;;;;
+1069E;LINEAR A SIGN A370;Lo;0;L;;;;;N;;;;;
+1069F;LINEAR A SIGN A371;Lo;0;L;;;;;N;;;;;
+106A0;LINEAR A SIGN A400-VAS;Lo;0;L;;;;;N;;;;;
+106A1;LINEAR A SIGN A401-VAS;Lo;0;L;;;;;N;;;;;
+106A2;LINEAR A SIGN A402-VAS;Lo;0;L;;;;;N;;;;;
+106A3;LINEAR A SIGN A403-VAS;Lo;0;L;;;;;N;;;;;
+106A4;LINEAR A SIGN A404-VAS;Lo;0;L;;;;;N;;;;;
+106A5;LINEAR A SIGN A405-VAS;Lo;0;L;;;;;N;;;;;
+106A6;LINEAR A SIGN A406-VAS;Lo;0;L;;;;;N;;;;;
+106A7;LINEAR A SIGN A407-VAS;Lo;0;L;;;;;N;;;;;
+106A8;LINEAR A SIGN A408-VAS;Lo;0;L;;;;;N;;;;;
+106A9;LINEAR A SIGN A409-VAS;Lo;0;L;;;;;N;;;;;
+106AA;LINEAR A SIGN A410-VAS;Lo;0;L;;;;;N;;;;;
+106AB;LINEAR A SIGN A411-VAS;Lo;0;L;;;;;N;;;;;
+106AC;LINEAR A SIGN A412-VAS;Lo;0;L;;;;;N;;;;;
+106AD;LINEAR A SIGN A413-VAS;Lo;0;L;;;;;N;;;;;
+106AE;LINEAR A SIGN A414-VAS;Lo;0;L;;;;;N;;;;;
+106AF;LINEAR A SIGN A415-VAS;Lo;0;L;;;;;N;;;;;
+106B0;LINEAR A SIGN A416-VAS;Lo;0;L;;;;;N;;;;;
+106B1;LINEAR A SIGN A417-VAS;Lo;0;L;;;;;N;;;;;
+106B2;LINEAR A SIGN A418-VAS;Lo;0;L;;;;;N;;;;;
+106B3;LINEAR A SIGN A501;Lo;0;L;;;;;N;;;;;
+106B4;LINEAR A SIGN A502;Lo;0;L;;;;;N;;;;;
+106B5;LINEAR A SIGN A503;Lo;0;L;;;;;N;;;;;
+106B6;LINEAR A SIGN A504;Lo;0;L;;;;;N;;;;;
+106B7;LINEAR A SIGN A505;Lo;0;L;;;;;N;;;;;
+106B8;LINEAR A SIGN A506;Lo;0;L;;;;;N;;;;;
+106B9;LINEAR A SIGN A508;Lo;0;L;;;;;N;;;;;
+106BA;LINEAR A SIGN A509;Lo;0;L;;;;;N;;;;;
+106BB;LINEAR A SIGN A510;Lo;0;L;;;;;N;;;;;
+106BC;LINEAR A SIGN A511;Lo;0;L;;;;;N;;;;;
+106BD;LINEAR A SIGN A512;Lo;0;L;;;;;N;;;;;
+106BE;LINEAR A SIGN A513;Lo;0;L;;;;;N;;;;;
+106BF;LINEAR A SIGN A515;Lo;0;L;;;;;N;;;;;
+106C0;LINEAR A SIGN A516;Lo;0;L;;;;;N;;;;;
+106C1;LINEAR A SIGN A520;Lo;0;L;;;;;N;;;;;
+106C2;LINEAR A SIGN A521;Lo;0;L;;;;;N;;;;;
+106C3;LINEAR A SIGN A523;Lo;0;L;;;;;N;;;;;
+106C4;LINEAR A SIGN A524;Lo;0;L;;;;;N;;;;;
+106C5;LINEAR A SIGN A525;Lo;0;L;;;;;N;;;;;
+106C6;LINEAR A SIGN A526;Lo;0;L;;;;;N;;;;;
+106C7;LINEAR A SIGN A527;Lo;0;L;;;;;N;;;;;
+106C8;LINEAR A SIGN A528;Lo;0;L;;;;;N;;;;;
+106C9;LINEAR A SIGN A529;Lo;0;L;;;;;N;;;;;
+106CA;LINEAR A SIGN A530;Lo;0;L;;;;;N;;;;;
+106CB;LINEAR A SIGN A531;Lo;0;L;;;;;N;;;;;
+106CC;LINEAR A SIGN A532;Lo;0;L;;;;;N;;;;;
+106CD;LINEAR A SIGN A534;Lo;0;L;;;;;N;;;;;
+106CE;LINEAR A SIGN A535;Lo;0;L;;;;;N;;;;;
+106CF;LINEAR A SIGN A536;Lo;0;L;;;;;N;;;;;
+106D0;LINEAR A SIGN A537;Lo;0;L;;;;;N;;;;;
+106D1;LINEAR A SIGN A538;Lo;0;L;;;;;N;;;;;
+106D2;LINEAR A SIGN A539;Lo;0;L;;;;;N;;;;;
+106D3;LINEAR A SIGN A540;Lo;0;L;;;;;N;;;;;
+106D4;LINEAR A SIGN A541;Lo;0;L;;;;;N;;;;;
+106D5;LINEAR A SIGN A542;Lo;0;L;;;;;N;;;;;
+106D6;LINEAR A SIGN A545;Lo;0;L;;;;;N;;;;;
+106D7;LINEAR A SIGN A547;Lo;0;L;;;;;N;;;;;
+106D8;LINEAR A SIGN A548;Lo;0;L;;;;;N;;;;;
+106D9;LINEAR A SIGN A549;Lo;0;L;;;;;N;;;;;
+106DA;LINEAR A SIGN A550;Lo;0;L;;;;;N;;;;;
+106DB;LINEAR A SIGN A551;Lo;0;L;;;;;N;;;;;
+106DC;LINEAR A SIGN A552;Lo;0;L;;;;;N;;;;;
+106DD;LINEAR A SIGN A553;Lo;0;L;;;;;N;;;;;
+106DE;LINEAR A SIGN A554;Lo;0;L;;;;;N;;;;;
+106DF;LINEAR A SIGN A555;Lo;0;L;;;;;N;;;;;
+106E0;LINEAR A SIGN A556;Lo;0;L;;;;;N;;;;;
+106E1;LINEAR A SIGN A557;Lo;0;L;;;;;N;;;;;
+106E2;LINEAR A SIGN A559;Lo;0;L;;;;;N;;;;;
+106E3;LINEAR A SIGN A563;Lo;0;L;;;;;N;;;;;
+106E4;LINEAR A SIGN A564;Lo;0;L;;;;;N;;;;;
+106E5;LINEAR A SIGN A565;Lo;0;L;;;;;N;;;;;
+106E6;LINEAR A SIGN A566;Lo;0;L;;;;;N;;;;;
+106E7;LINEAR A SIGN A568;Lo;0;L;;;;;N;;;;;
+106E8;LINEAR A SIGN A569;Lo;0;L;;;;;N;;;;;
+106E9;LINEAR A SIGN A570;Lo;0;L;;;;;N;;;;;
+106EA;LINEAR A SIGN A571;Lo;0;L;;;;;N;;;;;
+106EB;LINEAR A SIGN A572;Lo;0;L;;;;;N;;;;;
+106EC;LINEAR A SIGN A573;Lo;0;L;;;;;N;;;;;
+106ED;LINEAR A SIGN A574;Lo;0;L;;;;;N;;;;;
+106EE;LINEAR A SIGN A575;Lo;0;L;;;;;N;;;;;
+106EF;LINEAR A SIGN A576;Lo;0;L;;;;;N;;;;;
+106F0;LINEAR A SIGN A577;Lo;0;L;;;;;N;;;;;
+106F1;LINEAR A SIGN A578;Lo;0;L;;;;;N;;;;;
+106F2;LINEAR A SIGN A579;Lo;0;L;;;;;N;;;;;
+106F3;LINEAR A SIGN A580;Lo;0;L;;;;;N;;;;;
+106F4;LINEAR A SIGN A581;Lo;0;L;;;;;N;;;;;
+106F5;LINEAR A SIGN A582;Lo;0;L;;;;;N;;;;;
+106F6;LINEAR A SIGN A583;Lo;0;L;;;;;N;;;;;
+106F7;LINEAR A SIGN A584;Lo;0;L;;;;;N;;;;;
+106F8;LINEAR A SIGN A585;Lo;0;L;;;;;N;;;;;
+106F9;LINEAR A SIGN A586;Lo;0;L;;;;;N;;;;;
+106FA;LINEAR A SIGN A587;Lo;0;L;;;;;N;;;;;
+106FB;LINEAR A SIGN A588;Lo;0;L;;;;;N;;;;;
+106FC;LINEAR A SIGN A589;Lo;0;L;;;;;N;;;;;
+106FD;LINEAR A SIGN A591;Lo;0;L;;;;;N;;;;;
+106FE;LINEAR A SIGN A592;Lo;0;L;;;;;N;;;;;
+106FF;LINEAR A SIGN A594;Lo;0;L;;;;;N;;;;;
+10700;LINEAR A SIGN A595;Lo;0;L;;;;;N;;;;;
+10701;LINEAR A SIGN A596;Lo;0;L;;;;;N;;;;;
+10702;LINEAR A SIGN A598;Lo;0;L;;;;;N;;;;;
+10703;LINEAR A SIGN A600;Lo;0;L;;;;;N;;;;;
+10704;LINEAR A SIGN A601;Lo;0;L;;;;;N;;;;;
+10705;LINEAR A SIGN A602;Lo;0;L;;;;;N;;;;;
+10706;LINEAR A SIGN A603;Lo;0;L;;;;;N;;;;;
+10707;LINEAR A SIGN A604;Lo;0;L;;;;;N;;;;;
+10708;LINEAR A SIGN A606;Lo;0;L;;;;;N;;;;;
+10709;LINEAR A SIGN A608;Lo;0;L;;;;;N;;;;;
+1070A;LINEAR A SIGN A609;Lo;0;L;;;;;N;;;;;
+1070B;LINEAR A SIGN A610;Lo;0;L;;;;;N;;;;;
+1070C;LINEAR A SIGN A611;Lo;0;L;;;;;N;;;;;
+1070D;LINEAR A SIGN A612;Lo;0;L;;;;;N;;;;;
+1070E;LINEAR A SIGN A613;Lo;0;L;;;;;N;;;;;
+1070F;LINEAR A SIGN A614;Lo;0;L;;;;;N;;;;;
+10710;LINEAR A SIGN A615;Lo;0;L;;;;;N;;;;;
+10711;LINEAR A SIGN A616;Lo;0;L;;;;;N;;;;;
+10712;LINEAR A SIGN A617;Lo;0;L;;;;;N;;;;;
+10713;LINEAR A SIGN A618;Lo;0;L;;;;;N;;;;;
+10714;LINEAR A SIGN A619;Lo;0;L;;;;;N;;;;;
+10715;LINEAR A SIGN A620;Lo;0;L;;;;;N;;;;;
+10716;LINEAR A SIGN A621;Lo;0;L;;;;;N;;;;;
+10717;LINEAR A SIGN A622;Lo;0;L;;;;;N;;;;;
+10718;LINEAR A SIGN A623;Lo;0;L;;;;;N;;;;;
+10719;LINEAR A SIGN A624;Lo;0;L;;;;;N;;;;;
+1071A;LINEAR A SIGN A626;Lo;0;L;;;;;N;;;;;
+1071B;LINEAR A SIGN A627;Lo;0;L;;;;;N;;;;;
+1071C;LINEAR A SIGN A628;Lo;0;L;;;;;N;;;;;
+1071D;LINEAR A SIGN A629;Lo;0;L;;;;;N;;;;;
+1071E;LINEAR A SIGN A634;Lo;0;L;;;;;N;;;;;
+1071F;LINEAR A SIGN A637;Lo;0;L;;;;;N;;;;;
+10720;LINEAR A SIGN A638;Lo;0;L;;;;;N;;;;;
+10721;LINEAR A SIGN A640;Lo;0;L;;;;;N;;;;;
+10722;LINEAR A SIGN A642;Lo;0;L;;;;;N;;;;;
+10723;LINEAR A SIGN A643;Lo;0;L;;;;;N;;;;;
+10724;LINEAR A SIGN A644;Lo;0;L;;;;;N;;;;;
+10725;LINEAR A SIGN A645;Lo;0;L;;;;;N;;;;;
+10726;LINEAR A SIGN A646;Lo;0;L;;;;;N;;;;;
+10727;LINEAR A SIGN A648;Lo;0;L;;;;;N;;;;;
+10728;LINEAR A SIGN A649;Lo;0;L;;;;;N;;;;;
+10729;LINEAR A SIGN A651;Lo;0;L;;;;;N;;;;;
+1072A;LINEAR A SIGN A652;Lo;0;L;;;;;N;;;;;
+1072B;LINEAR A SIGN A653;Lo;0;L;;;;;N;;;;;
+1072C;LINEAR A SIGN A654;Lo;0;L;;;;;N;;;;;
+1072D;LINEAR A SIGN A655;Lo;0;L;;;;;N;;;;;
+1072E;LINEAR A SIGN A656;Lo;0;L;;;;;N;;;;;
+1072F;LINEAR A SIGN A657;Lo;0;L;;;;;N;;;;;
+10730;LINEAR A SIGN A658;Lo;0;L;;;;;N;;;;;
+10731;LINEAR A SIGN A659;Lo;0;L;;;;;N;;;;;
+10732;LINEAR A SIGN A660;Lo;0;L;;;;;N;;;;;
+10733;LINEAR A SIGN A661;Lo;0;L;;;;;N;;;;;
+10734;LINEAR A SIGN A662;Lo;0;L;;;;;N;;;;;
+10735;LINEAR A SIGN A663;Lo;0;L;;;;;N;;;;;
+10736;LINEAR A SIGN A664;Lo;0;L;;;;;N;;;;;
+10740;LINEAR A SIGN A701 A;Lo;0;L;;;;;N;;;;;
+10741;LINEAR A SIGN A702 B;Lo;0;L;;;;;N;;;;;
+10742;LINEAR A SIGN A703 D;Lo;0;L;;;;;N;;;;;
+10743;LINEAR A SIGN A704 E;Lo;0;L;;;;;N;;;;;
+10744;LINEAR A SIGN A705 F;Lo;0;L;;;;;N;;;;;
+10745;LINEAR A SIGN A706 H;Lo;0;L;;;;;N;;;;;
+10746;LINEAR A SIGN A707 J;Lo;0;L;;;;;N;;;;;
+10747;LINEAR A SIGN A708 K;Lo;0;L;;;;;N;;;;;
+10748;LINEAR A SIGN A709 L;Lo;0;L;;;;;N;;;;;
+10749;LINEAR A SIGN A709-2 L2;Lo;0;L;;;;;N;;;;;
+1074A;LINEAR A SIGN A709-3 L3;Lo;0;L;;;;;N;;;;;
+1074B;LINEAR A SIGN A709-4 L4;Lo;0;L;;;;;N;;;;;
+1074C;LINEAR A SIGN A709-6 L6;Lo;0;L;;;;;N;;;;;
+1074D;LINEAR A SIGN A710 W;Lo;0;L;;;;;N;;;;;
+1074E;LINEAR A SIGN A711 X;Lo;0;L;;;;;N;;;;;
+1074F;LINEAR A SIGN A712 Y;Lo;0;L;;;;;N;;;;;
+10750;LINEAR A SIGN A713 OMEGA;Lo;0;L;;;;;N;;;;;
+10751;LINEAR A SIGN A714 ABB;Lo;0;L;;;;;N;;;;;
+10752;LINEAR A SIGN A715 BB;Lo;0;L;;;;;N;;;;;
+10753;LINEAR A SIGN A717 DD;Lo;0;L;;;;;N;;;;;
+10754;LINEAR A SIGN A726 EYYY;Lo;0;L;;;;;N;;;;;
+10755;LINEAR A SIGN A732 JE;Lo;0;L;;;;;N;;;;;
+10760;LINEAR A SIGN A800;Lo;0;L;;;;;N;;;;;
+10761;LINEAR A SIGN A801;Lo;0;L;;;;;N;;;;;
+10762;LINEAR A SIGN A802;Lo;0;L;;;;;N;;;;;
+10763;LINEAR A SIGN A803;Lo;0;L;;;;;N;;;;;
+10764;LINEAR A SIGN A804;Lo;0;L;;;;;N;;;;;
+10765;LINEAR A SIGN A805;Lo;0;L;;;;;N;;;;;
+10766;LINEAR A SIGN A806;Lo;0;L;;;;;N;;;;;
+10767;LINEAR A SIGN A807;Lo;0;L;;;;;N;;;;;
10800;CYPRIOT SYLLABLE A;Lo;0;R;;;;;N;;;;;
10801;CYPRIOT SYLLABLE E;Lo;0;R;;;;;N;;;;;
10802;CYPRIOT SYLLABLE I;Lo;0;R;;;;;N;;;;;
@@ -16917,6 +17758,78 @@
1085D;IMPERIAL ARAMAIC NUMBER ONE HUNDRED;No;0;R;;;;100;N;;;;;
1085E;IMPERIAL ARAMAIC NUMBER ONE THOUSAND;No;0;R;;;;1000;N;;;;;
1085F;IMPERIAL ARAMAIC NUMBER TEN THOUSAND;No;0;R;;;;10000;N;;;;;
+10860;PALMYRENE LETTER ALEPH;Lo;0;R;;;;;N;;;;;
+10861;PALMYRENE LETTER BETH;Lo;0;R;;;;;N;;;;;
+10862;PALMYRENE LETTER GIMEL;Lo;0;R;;;;;N;;;;;
+10863;PALMYRENE LETTER DALETH;Lo;0;R;;;;;N;;;;;
+10864;PALMYRENE LETTER HE;Lo;0;R;;;;;N;;;;;
+10865;PALMYRENE LETTER WAW;Lo;0;R;;;;;N;;;;;
+10866;PALMYRENE LETTER ZAYIN;Lo;0;R;;;;;N;;;;;
+10867;PALMYRENE LETTER HETH;Lo;0;R;;;;;N;;;;;
+10868;PALMYRENE LETTER TETH;Lo;0;R;;;;;N;;;;;
+10869;PALMYRENE LETTER YODH;Lo;0;R;;;;;N;;;;;
+1086A;PALMYRENE LETTER KAPH;Lo;0;R;;;;;N;;;;;
+1086B;PALMYRENE LETTER LAMEDH;Lo;0;R;;;;;N;;;;;
+1086C;PALMYRENE LETTER MEM;Lo;0;R;;;;;N;;;;;
+1086D;PALMYRENE LETTER FINAL NUN;Lo;0;R;;;;;N;;;;;
+1086E;PALMYRENE LETTER NUN;Lo;0;R;;;;;N;;;;;
+1086F;PALMYRENE LETTER SAMEKH;Lo;0;R;;;;;N;;;;;
+10870;PALMYRENE LETTER AYIN;Lo;0;R;;;;;N;;;;;
+10871;PALMYRENE LETTER PE;Lo;0;R;;;;;N;;;;;
+10872;PALMYRENE LETTER SADHE;Lo;0;R;;;;;N;;;;;
+10873;PALMYRENE LETTER QOPH;Lo;0;R;;;;;N;;;;;
+10874;PALMYRENE LETTER RESH;Lo;0;R;;;;;N;;;;;
+10875;PALMYRENE LETTER SHIN;Lo;0;R;;;;;N;;;;;
+10876;PALMYRENE LETTER TAW;Lo;0;R;;;;;N;;;;;
+10877;PALMYRENE LEFT-POINTING FLEURON;So;0;R;;;;;N;;;;;
+10878;PALMYRENE RIGHT-POINTING FLEURON;So;0;R;;;;;N;;;;;
+10879;PALMYRENE NUMBER ONE;No;0;R;;;;1;N;;;;;
+1087A;PALMYRENE NUMBER TWO;No;0;R;;;;2;N;;;;;
+1087B;PALMYRENE NUMBER THREE;No;0;R;;;;3;N;;;;;
+1087C;PALMYRENE NUMBER FOUR;No;0;R;;;;4;N;;;;;
+1087D;PALMYRENE NUMBER FIVE;No;0;R;;;;5;N;;;;;
+1087E;PALMYRENE NUMBER TEN;No;0;R;;;;10;N;;;;;
+1087F;PALMYRENE NUMBER TWENTY;No;0;R;;;;20;N;;;;;
+10880;NABATAEAN LETTER FINAL ALEPH;Lo;0;R;;;;;N;;;;;
+10881;NABATAEAN LETTER ALEPH;Lo;0;R;;;;;N;;;;;
+10882;NABATAEAN LETTER FINAL BETH;Lo;0;R;;;;;N;;;;;
+10883;NABATAEAN LETTER BETH;Lo;0;R;;;;;N;;;;;
+10884;NABATAEAN LETTER GIMEL;Lo;0;R;;;;;N;;;;;
+10885;NABATAEAN LETTER DALETH;Lo;0;R;;;;;N;;;;;
+10886;NABATAEAN LETTER FINAL HE;Lo;0;R;;;;;N;;;;;
+10887;NABATAEAN LETTER HE;Lo;0;R;;;;;N;;;;;
+10888;NABATAEAN LETTER WAW;Lo;0;R;;;;;N;;;;;
+10889;NABATAEAN LETTER ZAYIN;Lo;0;R;;;;;N;;;;;
+1088A;NABATAEAN LETTER HETH;Lo;0;R;;;;;N;;;;;
+1088B;NABATAEAN LETTER TETH;Lo;0;R;;;;;N;;;;;
+1088C;NABATAEAN LETTER FINAL YODH;Lo;0;R;;;;;N;;;;;
+1088D;NABATAEAN LETTER YODH;Lo;0;R;;;;;N;;;;;
+1088E;NABATAEAN LETTER FINAL KAPH;Lo;0;R;;;;;N;;;;;
+1088F;NABATAEAN LETTER KAPH;Lo;0;R;;;;;N;;;;;
+10890;NABATAEAN LETTER FINAL LAMEDH;Lo;0;R;;;;;N;;;;;
+10891;NABATAEAN LETTER LAMEDH;Lo;0;R;;;;;N;;;;;
+10892;NABATAEAN LETTER FINAL MEM;Lo;0;R;;;;;N;;;;;
+10893;NABATAEAN LETTER MEM;Lo;0;R;;;;;N;;;;;
+10894;NABATAEAN LETTER FINAL NUN;Lo;0;R;;;;;N;;;;;
+10895;NABATAEAN LETTER NUN;Lo;0;R;;;;;N;;;;;
+10896;NABATAEAN LETTER SAMEKH;Lo;0;R;;;;;N;;;;;
+10897;NABATAEAN LETTER AYIN;Lo;0;R;;;;;N;;;;;
+10898;NABATAEAN LETTER PE;Lo;0;R;;;;;N;;;;;
+10899;NABATAEAN LETTER SADHE;Lo;0;R;;;;;N;;;;;
+1089A;NABATAEAN LETTER QOPH;Lo;0;R;;;;;N;;;;;
+1089B;NABATAEAN LETTER RESH;Lo;0;R;;;;;N;;;;;
+1089C;NABATAEAN LETTER FINAL SHIN;Lo;0;R;;;;;N;;;;;
+1089D;NABATAEAN LETTER SHIN;Lo;0;R;;;;;N;;;;;
+1089E;NABATAEAN LETTER TAW;Lo;0;R;;;;;N;;;;;
+108A7;NABATAEAN NUMBER ONE;No;0;R;;;;1;N;;;;;
+108A8;NABATAEAN NUMBER TWO;No;0;R;;;;2;N;;;;;
+108A9;NABATAEAN NUMBER THREE;No;0;R;;;;3;N;;;;;
+108AA;NABATAEAN NUMBER FOUR;No;0;R;;;;4;N;;;;;
+108AB;NABATAEAN CRUCIFORM NUMBER FOUR;No;0;R;;;;4;N;;;;;
+108AC;NABATAEAN NUMBER FIVE;No;0;R;;;;5;N;;;;;
+108AD;NABATAEAN NUMBER TEN;No;0;R;;;;10;N;;;;;
+108AE;NABATAEAN NUMBER TWENTY;No;0;R;;;;20;N;;;;;
+108AF;NABATAEAN NUMBER ONE HUNDRED;No;0;R;;;;100;N;;;;;
10900;PHOENICIAN LETTER ALF;Lo;0;R;;;;;N;;;;;
10901;PHOENICIAN LETTER BET;Lo;0;R;;;;;N;;;;;
10902;PHOENICIAN LETTER GAML;Lo;0;R;;;;;N;;;;;
@@ -17128,6 +18041,89 @@
10A7D;OLD SOUTH ARABIAN NUMBER ONE;No;0;R;;;;1;N;;;;;
10A7E;OLD SOUTH ARABIAN NUMBER FIFTY;No;0;R;;;;50;N;;;;;
10A7F;OLD SOUTH ARABIAN NUMERIC INDICATOR;Po;0;R;;;;;N;;;;;
+10A80;OLD NORTH ARABIAN LETTER HEH;Lo;0;R;;;;;N;;;;;
+10A81;OLD NORTH ARABIAN LETTER LAM;Lo;0;R;;;;;N;;;;;
+10A82;OLD NORTH ARABIAN LETTER HAH;Lo;0;R;;;;;N;;;;;
+10A83;OLD NORTH ARABIAN LETTER MEEM;Lo;0;R;;;;;N;;;;;
+10A84;OLD NORTH ARABIAN LETTER QAF;Lo;0;R;;;;;N;;;;;
+10A85;OLD NORTH ARABIAN LETTER WAW;Lo;0;R;;;;;N;;;;;
+10A86;OLD NORTH ARABIAN LETTER ES-2;Lo;0;R;;;;;N;;;;;
+10A87;OLD NORTH ARABIAN LETTER REH;Lo;0;R;;;;;N;;;;;
+10A88;OLD NORTH ARABIAN LETTER BEH;Lo;0;R;;;;;N;;;;;
+10A89;OLD NORTH ARABIAN LETTER TEH;Lo;0;R;;;;;N;;;;;
+10A8A;OLD NORTH ARABIAN LETTER ES-1;Lo;0;R;;;;;N;;;;;
+10A8B;OLD NORTH ARABIAN LETTER KAF;Lo;0;R;;;;;N;;;;;
+10A8C;OLD NORTH ARABIAN LETTER NOON;Lo;0;R;;;;;N;;;;;
+10A8D;OLD NORTH ARABIAN LETTER KHAH;Lo;0;R;;;;;N;;;;;
+10A8E;OLD NORTH ARABIAN LETTER SAD;Lo;0;R;;;;;N;;;;;
+10A8F;OLD NORTH ARABIAN LETTER ES-3;Lo;0;R;;;;;N;;;;;
+10A90;OLD NORTH ARABIAN LETTER FEH;Lo;0;R;;;;;N;;;;;
+10A91;OLD NORTH ARABIAN LETTER ALEF;Lo;0;R;;;;;N;;;;;
+10A92;OLD NORTH ARABIAN LETTER AIN;Lo;0;R;;;;;N;;;;;
+10A93;OLD NORTH ARABIAN LETTER DAD;Lo;0;R;;;;;N;;;;;
+10A94;OLD NORTH ARABIAN LETTER GEEM;Lo;0;R;;;;;N;;;;;
+10A95;OLD NORTH ARABIAN LETTER DAL;Lo;0;R;;;;;N;;;;;
+10A96;OLD NORTH ARABIAN LETTER GHAIN;Lo;0;R;;;;;N;;;;;
+10A97;OLD NORTH ARABIAN LETTER TAH;Lo;0;R;;;;;N;;;;;
+10A98;OLD NORTH ARABIAN LETTER ZAIN;Lo;0;R;;;;;N;;;;;
+10A99;OLD NORTH ARABIAN LETTER THAL;Lo;0;R;;;;;N;;;;;
+10A9A;OLD NORTH ARABIAN LETTER YEH;Lo;0;R;;;;;N;;;;;
+10A9B;OLD NORTH ARABIAN LETTER THEH;Lo;0;R;;;;;N;;;;;
+10A9C;OLD NORTH ARABIAN LETTER ZAH;Lo;0;R;;;;;N;;;;;
+10A9D;OLD NORTH ARABIAN NUMBER ONE;No;0;R;;;;1;N;;;;;
+10A9E;OLD NORTH ARABIAN NUMBER TEN;No;0;R;;;;10;N;;;;;
+10A9F;OLD NORTH ARABIAN NUMBER TWENTY;No;0;R;;;;20;N;;;;;
+10AC0;MANICHAEAN LETTER ALEPH;Lo;0;R;;;;;N;;;;;
+10AC1;MANICHAEAN LETTER BETH;Lo;0;R;;;;;N;;;;;
+10AC2;MANICHAEAN LETTER BHETH;Lo;0;R;;;;;N;;;;;
+10AC3;MANICHAEAN LETTER GIMEL;Lo;0;R;;;;;N;;;;;
+10AC4;MANICHAEAN LETTER GHIMEL;Lo;0;R;;;;;N;;;;;
+10AC5;MANICHAEAN LETTER DALETH;Lo;0;R;;;;;N;;;;;
+10AC6;MANICHAEAN LETTER HE;Lo;0;R;;;;;N;;;;;
+10AC7;MANICHAEAN LETTER WAW;Lo;0;R;;;;;N;;;;;
+10AC8;MANICHAEAN SIGN UD;So;0;R;;;;;N;;;;;
+10AC9;MANICHAEAN LETTER ZAYIN;Lo;0;R;;;;;N;;;;;
+10ACA;MANICHAEAN LETTER ZHAYIN;Lo;0;R;;;;;N;;;;;
+10ACB;MANICHAEAN LETTER JAYIN;Lo;0;R;;;;;N;;;;;
+10ACC;MANICHAEAN LETTER JHAYIN;Lo;0;R;;;;;N;;;;;
+10ACD;MANICHAEAN LETTER HETH;Lo;0;R;;;;;N;;;;;
+10ACE;MANICHAEAN LETTER TETH;Lo;0;R;;;;;N;;;;;
+10ACF;MANICHAEAN LETTER YODH;Lo;0;R;;;;;N;;;;;
+10AD0;MANICHAEAN LETTER KAPH;Lo;0;R;;;;;N;;;;;
+10AD1;MANICHAEAN LETTER XAPH;Lo;0;R;;;;;N;;;;;
+10AD2;MANICHAEAN LETTER KHAPH;Lo;0;R;;;;;N;;;;;
+10AD3;MANICHAEAN LETTER LAMEDH;Lo;0;R;;;;;N;;;;;
+10AD4;MANICHAEAN LETTER DHAMEDH;Lo;0;R;;;;;N;;;;;
+10AD5;MANICHAEAN LETTER THAMEDH;Lo;0;R;;;;;N;;;;;
+10AD6;MANICHAEAN LETTER MEM;Lo;0;R;;;;;N;;;;;
+10AD7;MANICHAEAN LETTER NUN;Lo;0;R;;;;;N;;;;;
+10AD8;MANICHAEAN LETTER SAMEKH;Lo;0;R;;;;;N;;;;;
+10AD9;MANICHAEAN LETTER AYIN;Lo;0;R;;;;;N;;;;;
+10ADA;MANICHAEAN LETTER AAYIN;Lo;0;R;;;;;N;;;;;
+10ADB;MANICHAEAN LETTER PE;Lo;0;R;;;;;N;;;;;
+10ADC;MANICHAEAN LETTER FE;Lo;0;R;;;;;N;;;;;
+10ADD;MANICHAEAN LETTER SADHE;Lo;0;R;;;;;N;;;;;
+10ADE;MANICHAEAN LETTER QOPH;Lo;0;R;;;;;N;;;;;
+10ADF;MANICHAEAN LETTER XOPH;Lo;0;R;;;;;N;;;;;
+10AE0;MANICHAEAN LETTER QHOPH;Lo;0;R;;;;;N;;;;;
+10AE1;MANICHAEAN LETTER RESH;Lo;0;R;;;;;N;;;;;
+10AE2;MANICHAEAN LETTER SHIN;Lo;0;R;;;;;N;;;;;
+10AE3;MANICHAEAN LETTER SSHIN;Lo;0;R;;;;;N;;;;;
+10AE4;MANICHAEAN LETTER TAW;Lo;0;R;;;;;N;;;;;
+10AE5;MANICHAEAN ABBREVIATION MARK ABOVE;Mn;230;NSM;;;;;N;;;;;
+10AE6;MANICHAEAN ABBREVIATION MARK BELOW;Mn;220;NSM;;;;;N;;;;;
+10AEB;MANICHAEAN NUMBER ONE;No;0;R;;;;1;N;;;;;
+10AEC;MANICHAEAN NUMBER FIVE;No;0;R;;;;5;N;;;;;
+10AED;MANICHAEAN NUMBER TEN;No;0;R;;;;10;N;;;;;
+10AEE;MANICHAEAN NUMBER TWENTY;No;0;R;;;;20;N;;;;;
+10AEF;MANICHAEAN NUMBER ONE HUNDRED;No;0;R;;;;100;N;;;;;
+10AF0;MANICHAEAN PUNCTUATION STAR;Po;0;R;;;;;N;;;;;
+10AF1;MANICHAEAN PUNCTUATION FLEURON;Po;0;R;;;;;N;;;;;
+10AF2;MANICHAEAN PUNCTUATION DOUBLE DOT WITHIN DOT;Po;0;R;;;;;N;;;;;
+10AF3;MANICHAEAN PUNCTUATION DOT WITHIN DOT;Po;0;R;;;;;N;;;;;
+10AF4;MANICHAEAN PUNCTUATION DOT;Po;0;R;;;;;N;;;;;
+10AF5;MANICHAEAN PUNCTUATION TWO DOTS;Po;0;R;;;;;N;;;;;
+10AF6;MANICHAEAN PUNCTUATION LINE FILLER;Po;0;R;;;;;N;;;;;
10B00;AVESTAN LETTER A;Lo;0;R;;;;;N;;;;;
10B01;AVESTAN LETTER AA;Lo;0;R;;;;;N;;;;;
10B02;AVESTAN LETTER AO;Lo;0;R;;;;;N;;;;;
@@ -17246,6 +18242,35 @@
10B7D;INSCRIPTIONAL PAHLAVI NUMBER TWENTY;No;0;R;;;;20;N;;;;;
10B7E;INSCRIPTIONAL PAHLAVI NUMBER ONE HUNDRED;No;0;R;;;;100;N;;;;;
10B7F;INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND;No;0;R;;;;1000;N;;;;;
+10B80;PSALTER PAHLAVI LETTER ALEPH;Lo;0;R;;;;;N;;;;;
+10B81;PSALTER PAHLAVI LETTER BETH;Lo;0;R;;;;;N;;;;;
+10B82;PSALTER PAHLAVI LETTER GIMEL;Lo;0;R;;;;;N;;;;;
+10B83;PSALTER PAHLAVI LETTER DALETH;Lo;0;R;;;;;N;;;;;
+10B84;PSALTER PAHLAVI LETTER HE;Lo;0;R;;;;;N;;;;;
+10B85;PSALTER PAHLAVI LETTER WAW-AYIN-RESH;Lo;0;R;;;;;N;;;;;
+10B86;PSALTER PAHLAVI LETTER ZAYIN;Lo;0;R;;;;;N;;;;;
+10B87;PSALTER PAHLAVI LETTER HETH;Lo;0;R;;;;;N;;;;;
+10B88;PSALTER PAHLAVI LETTER YODH;Lo;0;R;;;;;N;;;;;
+10B89;PSALTER PAHLAVI LETTER KAPH;Lo;0;R;;;;;N;;;;;
+10B8A;PSALTER PAHLAVI LETTER LAMEDH;Lo;0;R;;;;;N;;;;;
+10B8B;PSALTER PAHLAVI LETTER MEM-QOPH;Lo;0;R;;;;;N;;;;;
+10B8C;PSALTER PAHLAVI LETTER NUN;Lo;0;R;;;;;N;;;;;
+10B8D;PSALTER PAHLAVI LETTER SAMEKH;Lo;0;R;;;;;N;;;;;
+10B8E;PSALTER PAHLAVI LETTER PE;Lo;0;R;;;;;N;;;;;
+10B8F;PSALTER PAHLAVI LETTER SADHE;Lo;0;R;;;;;N;;;;;
+10B90;PSALTER PAHLAVI LETTER SHIN;Lo;0;R;;;;;N;;;;;
+10B91;PSALTER PAHLAVI LETTER TAW;Lo;0;R;;;;;N;;;;;
+10B99;PSALTER PAHLAVI SECTION MARK;Po;0;R;;;;;N;;;;;
+10B9A;PSALTER PAHLAVI TURNED SECTION MARK;Po;0;R;;;;;N;;;;;
+10B9B;PSALTER PAHLAVI FOUR DOTS WITH CROSS;Po;0;R;;;;;N;;;;;
+10B9C;PSALTER PAHLAVI FOUR DOTS WITH DOT;Po;0;R;;;;;N;;;;;
+10BA9;PSALTER PAHLAVI NUMBER ONE;No;0;R;;;;1;N;;;;;
+10BAA;PSALTER PAHLAVI NUMBER TWO;No;0;R;;;;2;N;;;;;
+10BAB;PSALTER PAHLAVI NUMBER THREE;No;0;R;;;;3;N;;;;;
+10BAC;PSALTER PAHLAVI NUMBER FOUR;No;0;R;;;;4;N;;;;;
+10BAD;PSALTER PAHLAVI NUMBER TEN;No;0;R;;;;10;N;;;;;
+10BAE;PSALTER PAHLAVI NUMBER TWENTY;No;0;R;;;;20;N;;;;;
+10BAF;PSALTER PAHLAVI NUMBER ONE HUNDRED;No;0;R;;;;100;N;;;;;
10C00;OLD TURKIC LETTER ORKHON A;Lo;0;R;;;;;N;;;;;
10C01;OLD TURKIC LETTER YENISEI A;Lo;0;R;;;;;N;;;;;
10C02;OLD TURKIC LETTER YENISEI AE;Lo;0;R;;;;;N;;;;;
@@ -17458,6 +18483,7 @@
1106D;BRAHMI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
1106E;BRAHMI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
1106F;BRAHMI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
+1107F;BRAHMI NUMBER JOINER;Mn;9;NSM;;;;;N;;;;;
11080;KAITHI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
11081;KAITHI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;
11082;KAITHI SIGN VISARGA;Mc;0;L;;;;;N;;;;;
@@ -17626,6 +18652,45 @@
11141;CHAKMA DANDA;Po;0;L;;;;;N;;;;;
11142;CHAKMA DOUBLE DANDA;Po;0;L;;;;;N;;;;;
11143;CHAKMA QUESTION MARK;Po;0;L;;;;;N;;;;;
+11150;MAHAJANI LETTER A;Lo;0;L;;;;;N;;;;;
+11151;MAHAJANI LETTER I;Lo;0;L;;;;;N;;;;;
+11152;MAHAJANI LETTER U;Lo;0;L;;;;;N;;;;;
+11153;MAHAJANI LETTER E;Lo;0;L;;;;;N;;;;;
+11154;MAHAJANI LETTER O;Lo;0;L;;;;;N;;;;;
+11155;MAHAJANI LETTER KA;Lo;0;L;;;;;N;;;;;
+11156;MAHAJANI LETTER KHA;Lo;0;L;;;;;N;;;;;
+11157;MAHAJANI LETTER GA;Lo;0;L;;;;;N;;;;;
+11158;MAHAJANI LETTER GHA;Lo;0;L;;;;;N;;;;;
+11159;MAHAJANI LETTER CA;Lo;0;L;;;;;N;;;;;
+1115A;MAHAJANI LETTER CHA;Lo;0;L;;;;;N;;;;;
+1115B;MAHAJANI LETTER JA;Lo;0;L;;;;;N;;;;;
+1115C;MAHAJANI LETTER JHA;Lo;0;L;;;;;N;;;;;
+1115D;MAHAJANI LETTER NYA;Lo;0;L;;;;;N;;;;;
+1115E;MAHAJANI LETTER TTA;Lo;0;L;;;;;N;;;;;
+1115F;MAHAJANI LETTER TTHA;Lo;0;L;;;;;N;;;;;
+11160;MAHAJANI LETTER DDA;Lo;0;L;;;;;N;;;;;
+11161;MAHAJANI LETTER DDHA;Lo;0;L;;;;;N;;;;;
+11162;MAHAJANI LETTER NNA;Lo;0;L;;;;;N;;;;;
+11163;MAHAJANI LETTER TA;Lo;0;L;;;;;N;;;;;
+11164;MAHAJANI LETTER THA;Lo;0;L;;;;;N;;;;;
+11165;MAHAJANI LETTER DA;Lo;0;L;;;;;N;;;;;
+11166;MAHAJANI LETTER DHA;Lo;0;L;;;;;N;;;;;
+11167;MAHAJANI LETTER NA;Lo;0;L;;;;;N;;;;;
+11168;MAHAJANI LETTER PA;Lo;0;L;;;;;N;;;;;
+11169;MAHAJANI LETTER PHA;Lo;0;L;;;;;N;;;;;
+1116A;MAHAJANI LETTER BA;Lo;0;L;;;;;N;;;;;
+1116B;MAHAJANI LETTER BHA;Lo;0;L;;;;;N;;;;;
+1116C;MAHAJANI LETTER MA;Lo;0;L;;;;;N;;;;;
+1116D;MAHAJANI LETTER RA;Lo;0;L;;;;;N;;;;;
+1116E;MAHAJANI LETTER LA;Lo;0;L;;;;;N;;;;;
+1116F;MAHAJANI LETTER VA;Lo;0;L;;;;;N;;;;;
+11170;MAHAJANI LETTER SA;Lo;0;L;;;;;N;;;;;
+11171;MAHAJANI LETTER HA;Lo;0;L;;;;;N;;;;;
+11172;MAHAJANI LETTER RRA;Lo;0;L;;;;;N;;;;;
+11173;MAHAJANI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;
+11174;MAHAJANI ABBREVIATION SIGN;Po;0;L;;;;;N;;;;;
+11175;MAHAJANI SECTION MARK;Po;0;L;;;;;N;;;;;
+11176;MAHAJANI LIGATURE SHRI;Lo;0;L;;;;;N;;;;;
11180;SHARADA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
11181;SHARADA SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;
11182;SHARADA SIGN VISARGA;Mc;0;L;;;;;N;;;;;
@@ -17699,6 +18764,7 @@
111C6;SHARADA DOUBLE DANDA;Po;0;L;;;;;N;;;;;
111C7;SHARADA ABBREVIATION SIGN;Po;0;L;;;;;N;;;;;
111C8;SHARADA SEPARATOR;Po;0;L;;;;;N;;;;;
+111CD;SHARADA SUTRA MARK;Po;0;L;;;;;N;;;;;
111D0;SHARADA DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
111D1;SHARADA DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
111D2;SHARADA DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
@@ -17709,6 +18775,473 @@
111D7;SHARADA DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
111D8;SHARADA DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
111D9;SHARADA DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
+111DA;SHARADA EKAM;Lo;0;L;;;;;N;;;;;
+111E1;SINHALA ARCHAIC DIGIT ONE;No;0;L;;;;1;N;;;;;
+111E2;SINHALA ARCHAIC DIGIT TWO;No;0;L;;;;2;N;;;;;
+111E3;SINHALA ARCHAIC DIGIT THREE;No;0;L;;;;3;N;;;;;
+111E4;SINHALA ARCHAIC DIGIT FOUR;No;0;L;;;;4;N;;;;;
+111E5;SINHALA ARCHAIC DIGIT FIVE;No;0;L;;;;5;N;;;;;
+111E6;SINHALA ARCHAIC DIGIT SIX;No;0;L;;;;6;N;;;;;
+111E7;SINHALA ARCHAIC DIGIT SEVEN;No;0;L;;;;7;N;;;;;
+111E8;SINHALA ARCHAIC DIGIT EIGHT;No;0;L;;;;8;N;;;;;
+111E9;SINHALA ARCHAIC DIGIT NINE;No;0;L;;;;9;N;;;;;
+111EA;SINHALA ARCHAIC NUMBER TEN;No;0;L;;;;10;N;;;;;
+111EB;SINHALA ARCHAIC NUMBER TWENTY;No;0;L;;;;20;N;;;;;
+111EC;SINHALA ARCHAIC NUMBER THIRTY;No;0;L;;;;30;N;;;;;
+111ED;SINHALA ARCHAIC NUMBER FORTY;No;0;L;;;;40;N;;;;;
+111EE;SINHALA ARCHAIC NUMBER FIFTY;No;0;L;;;;50;N;;;;;
+111EF;SINHALA ARCHAIC NUMBER SIXTY;No;0;L;;;;60;N;;;;;
+111F0;SINHALA ARCHAIC NUMBER SEVENTY;No;0;L;;;;70;N;;;;;
+111F1;SINHALA ARCHAIC NUMBER EIGHTY;No;0;L;;;;80;N;;;;;
+111F2;SINHALA ARCHAIC NUMBER NINETY;No;0;L;;;;90;N;;;;;
+111F3;SINHALA ARCHAIC NUMBER ONE HUNDRED;No;0;L;;;;100;N;;;;;
+111F4;SINHALA ARCHAIC NUMBER ONE THOUSAND;No;0;L;;;;1000;N;;;;;
+11200;KHOJKI LETTER A;Lo;0;L;;;;;N;;;;;
+11201;KHOJKI LETTER AA;Lo;0;L;;;;;N;;;;;
+11202;KHOJKI LETTER I;Lo;0;L;;;;;N;;;;;
+11203;KHOJKI LETTER U;Lo;0;L;;;;;N;;;;;
+11204;KHOJKI LETTER E;Lo;0;L;;;;;N;;;;;
+11205;KHOJKI LETTER AI;Lo;0;L;;;;;N;;;;;
+11206;KHOJKI LETTER O;Lo;0;L;;;;;N;;;;;
+11207;KHOJKI LETTER AU;Lo;0;L;;;;;N;;;;;
+11208;KHOJKI LETTER KA;Lo;0;L;;;;;N;;;;;
+11209;KHOJKI LETTER KHA;Lo;0;L;;;;;N;;;;;
+1120A;KHOJKI LETTER GA;Lo;0;L;;;;;N;;;;;
+1120B;KHOJKI LETTER GGA;Lo;0;L;;;;;N;;;;;
+1120C;KHOJKI LETTER GHA;Lo;0;L;;;;;N;;;;;
+1120D;KHOJKI LETTER NGA;Lo;0;L;;;;;N;;;;;
+1120E;KHOJKI LETTER CA;Lo;0;L;;;;;N;;;;;
+1120F;KHOJKI LETTER CHA;Lo;0;L;;;;;N;;;;;
+11210;KHOJKI LETTER JA;Lo;0;L;;;;;N;;;;;
+11211;KHOJKI LETTER JJA;Lo;0;L;;;;;N;;;;;
+11213;KHOJKI LETTER NYA;Lo;0;L;;;;;N;;;;;
+11214;KHOJKI LETTER TTA;Lo;0;L;;;;;N;;;;;
+11215;KHOJKI LETTER TTHA;Lo;0;L;;;;;N;;;;;
+11216;KHOJKI LETTER DDA;Lo;0;L;;;;;N;;;;;
+11217;KHOJKI LETTER DDHA;Lo;0;L;;;;;N;;;;;
+11218;KHOJKI LETTER NNA;Lo;0;L;;;;;N;;;;;
+11219;KHOJKI LETTER TA;Lo;0;L;;;;;N;;;;;
+1121A;KHOJKI LETTER THA;Lo;0;L;;;;;N;;;;;
+1121B;KHOJKI LETTER DA;Lo;0;L;;;;;N;;;;;
+1121C;KHOJKI LETTER DDDA;Lo;0;L;;;;;N;;;;;
+1121D;KHOJKI LETTER DHA;Lo;0;L;;;;;N;;;;;
+1121E;KHOJKI LETTER NA;Lo;0;L;;;;;N;;;;;
+1121F;KHOJKI LETTER PA;Lo;0;L;;;;;N;;;;;
+11220;KHOJKI LETTER PHA;Lo;0;L;;;;;N;;;;;
+11221;KHOJKI LETTER BA;Lo;0;L;;;;;N;;;;;
+11222;KHOJKI LETTER BBA;Lo;0;L;;;;;N;;;;;
+11223;KHOJKI LETTER BHA;Lo;0;L;;;;;N;;;;;
+11224;KHOJKI LETTER MA;Lo;0;L;;;;;N;;;;;
+11225;KHOJKI LETTER YA;Lo;0;L;;;;;N;;;;;
+11226;KHOJKI LETTER RA;Lo;0;L;;;;;N;;;;;
+11227;KHOJKI LETTER LA;Lo;0;L;;;;;N;;;;;
+11228;KHOJKI LETTER VA;Lo;0;L;;;;;N;;;;;
+11229;KHOJKI LETTER SA;Lo;0;L;;;;;N;;;;;
+1122A;KHOJKI LETTER HA;Lo;0;L;;;;;N;;;;;
+1122B;KHOJKI LETTER LLA;Lo;0;L;;;;;N;;;;;
+1122C;KHOJKI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
+1122D;KHOJKI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;
+1122E;KHOJKI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;
+1122F;KHOJKI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;
+11230;KHOJKI VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;
+11231;KHOJKI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;
+11232;KHOJKI VOWEL SIGN O;Mc;0;L;;;;;N;;;;;
+11233;KHOJKI VOWEL SIGN AU;Mc;0;L;;;;;N;;;;;
+11234;KHOJKI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;
+11235;KHOJKI SIGN VIRAMA;Mc;9;L;;;;;N;;;;;
+11236;KHOJKI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;
+11237;KHOJKI SIGN SHADDA;Mn;0;NSM;;;;;N;;;;;
+11238;KHOJKI DANDA;Po;0;L;;;;;N;;;;;
+11239;KHOJKI DOUBLE DANDA;Po;0;L;;;;;N;;;;;
+1123A;KHOJKI WORD SEPARATOR;Po;0;L;;;;;N;;;;;
+1123B;KHOJKI SECTION MARK;Po;0;L;;;;;N;;;;;
+1123C;KHOJKI DOUBLE SECTION MARK;Po;0;L;;;;;N;;;;;
+1123D;KHOJKI ABBREVIATION SIGN;Po;0;L;;;;;N;;;;;
+112B0;KHUDAWADI LETTER A;Lo;0;L;;;;;N;;;;;
+112B1;KHUDAWADI LETTER AA;Lo;0;L;;;;;N;;;;;
+112B2;KHUDAWADI LETTER I;Lo;0;L;;;;;N;;;;;
+112B3;KHUDAWADI LETTER II;Lo;0;L;;;;;N;;;;;
+112B4;KHUDAWADI LETTER U;Lo;0;L;;;;;N;;;;;
+112B5;KHUDAWADI LETTER UU;Lo;0;L;;;;;N;;;;;
+112B6;KHUDAWADI LETTER E;Lo;0;L;;;;;N;;;;;
+112B7;KHUDAWADI LETTER AI;Lo;0;L;;;;;N;;;;;
+112B8;KHUDAWADI LETTER O;Lo;0;L;;;;;N;;;;;
+112B9;KHUDAWADI LETTER AU;Lo;0;L;;;;;N;;;;;
+112BA;KHUDAWADI LETTER KA;Lo;0;L;;;;;N;;;;;
+112BB;KHUDAWADI LETTER KHA;Lo;0;L;;;;;N;;;;;
+112BC;KHUDAWADI LETTER GA;Lo;0;L;;;;;N;;;;;
+112BD;KHUDAWADI LETTER GGA;Lo;0;L;;;;;N;;;;;
+112BE;KHUDAWADI LETTER GHA;Lo;0;L;;;;;N;;;;;
+112BF;KHUDAWADI LETTER NGA;Lo;0;L;;;;;N;;;;;
+112C0;KHUDAWADI LETTER CA;Lo;0;L;;;;;N;;;;;
+112C1;KHUDAWADI LETTER CHA;Lo;0;L;;;;;N;;;;;
+112C2;KHUDAWADI LETTER JA;Lo;0;L;;;;;N;;;;;
+112C3;KHUDAWADI LETTER JJA;Lo;0;L;;;;;N;;;;;
+112C4;KHUDAWADI LETTER JHA;Lo;0;L;;;;;N;;;;;
+112C5;KHUDAWADI LETTER NYA;Lo;0;L;;;;;N;;;;;
+112C6;KHUDAWADI LETTER TTA;Lo;0;L;;;;;N;;;;;
+112C7;KHUDAWADI LETTER TTHA;Lo;0;L;;;;;N;;;;;
+112C8;KHUDAWADI LETTER DDA;Lo;0;L;;;;;N;;;;;
+112C9;KHUDAWADI LETTER DDDA;Lo;0;L;;;;;N;;;;;
+112CA;KHUDAWADI LETTER RRA;Lo;0;L;;;;;N;;;;;
+112CB;KHUDAWADI LETTER DDHA;Lo;0;L;;;;;N;;;;;
+112CC;KHUDAWADI LETTER NNA;Lo;0;L;;;;;N;;;;;
+112CD;KHUDAWADI LETTER TA;Lo;0;L;;;;;N;;;;;
+112CE;KHUDAWADI LETTER THA;Lo;0;L;;;;;N;;;;;
+112CF;KHUDAWADI LETTER DA;Lo;0;L;;;;;N;;;;;
+112D0;KHUDAWADI LETTER DHA;Lo;0;L;;;;;N;;;;;
+112D1;KHUDAWADI LETTER NA;Lo;0;L;;;;;N;;;;;
+112D2;KHUDAWADI LETTER PA;Lo;0;L;;;;;N;;;;;
+112D3;KHUDAWADI LETTER PHA;Lo;0;L;;;;;N;;;;;
+112D4;KHUDAWADI LETTER BA;Lo;0;L;;;;;N;;;;;
+112D5;KHUDAWADI LETTER BBA;Lo;0;L;;;;;N;;;;;
+112D6;KHUDAWADI LETTER BHA;Lo;0;L;;;;;N;;;;;
+112D7;KHUDAWADI LETTER MA;Lo;0;L;;;;;N;;;;;
+112D8;KHUDAWADI LETTER YA;Lo;0;L;;;;;N;;;;;
+112D9;KHUDAWADI LETTER RA;Lo;0;L;;;;;N;;;;;
+112DA;KHUDAWADI LETTER LA;Lo;0;L;;;;;N;;;;;
+112DB;KHUDAWADI LETTER VA;Lo;0;L;;;;;N;;;;;
+112DC;KHUDAWADI LETTER SHA;Lo;0;L;;;;;N;;;;;
+112DD;KHUDAWADI LETTER SA;Lo;0;L;;;;;N;;;;;
+112DE;KHUDAWADI LETTER HA;Lo;0;L;;;;;N;;;;;
+112DF;KHUDAWADI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;
+112E0;KHUDAWADI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
+112E1;KHUDAWADI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;
+112E2;KHUDAWADI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;
+112E3;KHUDAWADI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;
+112E4;KHUDAWADI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;
+112E5;KHUDAWADI VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;
+112E6;KHUDAWADI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;
+112E7;KHUDAWADI VOWEL SIGN O;Mn;0;NSM;;;;;N;;;;;
+112E8;KHUDAWADI VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;
+112E9;KHUDAWADI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;
+112EA;KHUDAWADI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
+112F0;KHUDAWADI DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
+112F1;KHUDAWADI DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
+112F2;KHUDAWADI DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
+112F3;KHUDAWADI DIGIT THREE;Nd;0;L;;3;3;3;N;;;;;
+112F4;KHUDAWADI DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;;
+112F5;KHUDAWADI DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;;
+112F6;KHUDAWADI DIGIT SIX;Nd;0;L;;6;6;6;N;;;;;
+112F7;KHUDAWADI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
+112F8;KHUDAWADI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
+112F9;KHUDAWADI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
+11301;GRANTHA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
+11302;GRANTHA SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;
+11303;GRANTHA SIGN VISARGA;Mc;0;L;;;;;N;;;;;
+11305;GRANTHA LETTER A;Lo;0;L;;;;;N;;;;;
+11306;GRANTHA LETTER AA;Lo;0;L;;;;;N;;;;;
+11307;GRANTHA LETTER I;Lo;0;L;;;;;N;;;;;
+11308;GRANTHA LETTER II;Lo;0;L;;;;;N;;;;;
+11309;GRANTHA LETTER U;Lo;0;L;;;;;N;;;;;
+1130A;GRANTHA LETTER UU;Lo;0;L;;;;;N;;;;;
+1130B;GRANTHA LETTER VOCALIC R;Lo;0;L;;;;;N;;;;;
+1130C;GRANTHA LETTER VOCALIC L;Lo;0;L;;;;;N;;;;;
+1130F;GRANTHA LETTER EE;Lo;0;L;;;;;N;;;;;
+11310;GRANTHA LETTER AI;Lo;0;L;;;;;N;;;;;
+11313;GRANTHA LETTER OO;Lo;0;L;;;;;N;;;;;
+11314;GRANTHA LETTER AU;Lo;0;L;;;;;N;;;;;
+11315;GRANTHA LETTER KA;Lo;0;L;;;;;N;;;;;
+11316;GRANTHA LETTER KHA;Lo;0;L;;;;;N;;;;;
+11317;GRANTHA LETTER GA;Lo;0;L;;;;;N;;;;;
+11318;GRANTHA LETTER GHA;Lo;0;L;;;;;N;;;;;
+11319;GRANTHA LETTER NGA;Lo;0;L;;;;;N;;;;;
+1131A;GRANTHA LETTER CA;Lo;0;L;;;;;N;;;;;
+1131B;GRANTHA LETTER CHA;Lo;0;L;;;;;N;;;;;
+1131C;GRANTHA LETTER JA;Lo;0;L;;;;;N;;;;;
+1131D;GRANTHA LETTER JHA;Lo;0;L;;;;;N;;;;;
+1131E;GRANTHA LETTER NYA;Lo;0;L;;;;;N;;;;;
+1131F;GRANTHA LETTER TTA;Lo;0;L;;;;;N;;;;;
+11320;GRANTHA LETTER TTHA;Lo;0;L;;;;;N;;;;;
+11321;GRANTHA LETTER DDA;Lo;0;L;;;;;N;;;;;
+11322;GRANTHA LETTER DDHA;Lo;0;L;;;;;N;;;;;
+11323;GRANTHA LETTER NNA;Lo;0;L;;;;;N;;;;;
+11324;GRANTHA LETTER TA;Lo;0;L;;;;;N;;;;;
+11325;GRANTHA LETTER THA;Lo;0;L;;;;;N;;;;;
+11326;GRANTHA LETTER DA;Lo;0;L;;;;;N;;;;;
+11327;GRANTHA LETTER DHA;Lo;0;L;;;;;N;;;;;
+11328;GRANTHA LETTER NA;Lo;0;L;;;;;N;;;;;
+1132A;GRANTHA LETTER PA;Lo;0;L;;;;;N;;;;;
+1132B;GRANTHA LETTER PHA;Lo;0;L;;;;;N;;;;;
+1132C;GRANTHA LETTER BA;Lo;0;L;;;;;N;;;;;
+1132D;GRANTHA LETTER BHA;Lo;0;L;;;;;N;;;;;
+1132E;GRANTHA LETTER MA;Lo;0;L;;;;;N;;;;;
+1132F;GRANTHA LETTER YA;Lo;0;L;;;;;N;;;;;
+11330;GRANTHA LETTER RA;Lo;0;L;;;;;N;;;;;
+11332;GRANTHA LETTER LA;Lo;0;L;;;;;N;;;;;
+11333;GRANTHA LETTER LLA;Lo;0;L;;;;;N;;;;;
+11335;GRANTHA LETTER VA;Lo;0;L;;;;;N;;;;;
+11336;GRANTHA LETTER SHA;Lo;0;L;;;;;N;;;;;
+11337;GRANTHA LETTER SSA;Lo;0;L;;;;;N;;;;;
+11338;GRANTHA LETTER SA;Lo;0;L;;;;;N;;;;;
+11339;GRANTHA LETTER HA;Lo;0;L;;;;;N;;;;;
+1133C;GRANTHA SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;
+1133D;GRANTHA SIGN AVAGRAHA;Lo;0;L;;;;;N;;;;;
+1133E;GRANTHA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
+1133F;GRANTHA VOWEL SIGN I;Mc;0;L;;;;;N;;;;;
+11340;GRANTHA VOWEL SIGN II;Mn;0;NSM;;;;;N;;;;;
+11341;GRANTHA VOWEL SIGN U;Mc;0;L;;;;;N;;;;;
+11342;GRANTHA VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;
+11343;GRANTHA VOWEL SIGN VOCALIC R;Mc;0;L;;;;;N;;;;;
+11344;GRANTHA VOWEL SIGN VOCALIC RR;Mc;0;L;;;;;N;;;;;
+11347;GRANTHA VOWEL SIGN EE;Mc;0;L;;;;;N;;;;;
+11348;GRANTHA VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;
+1134B;GRANTHA VOWEL SIGN OO;Mc;0;L;11347 1133E;;;;N;;;;;
+1134C;GRANTHA VOWEL SIGN AU;Mc;0;L;11347 11357;;;;N;;;;;
+1134D;GRANTHA SIGN VIRAMA;Mc;9;L;;;;;N;;;;;
+11357;GRANTHA AU LENGTH MARK;Mc;0;L;;;;;N;;;;;
+1135D;GRANTHA SIGN PLUTA;Lo;0;L;;;;;N;;;;;
+1135E;GRANTHA LETTER VEDIC ANUSVARA;Lo;0;L;;;;;N;;;;;
+1135F;GRANTHA LETTER VEDIC DOUBLE ANUSVARA;Lo;0;L;;;;;N;;;;;
+11360;GRANTHA LETTER VOCALIC RR;Lo;0;L;;;;;N;;;;;
+11361;GRANTHA LETTER VOCALIC LL;Lo;0;L;;;;;N;;;;;
+11362;GRANTHA VOWEL SIGN VOCALIC L;Mc;0;L;;;;;N;;;;;
+11363;GRANTHA VOWEL SIGN VOCALIC LL;Mc;0;L;;;;;N;;;;;
+11366;COMBINING GRANTHA DIGIT ZERO;Mn;230;NSM;;;;;N;;;;;
+11367;COMBINING GRANTHA DIGIT ONE;Mn;230;NSM;;;;;N;;;;;
+11368;COMBINING GRANTHA DIGIT TWO;Mn;230;NSM;;;;;N;;;;;
+11369;COMBINING GRANTHA DIGIT THREE;Mn;230;NSM;;;;;N;;;;;
+1136A;COMBINING GRANTHA DIGIT FOUR;Mn;230;NSM;;;;;N;;;;;
+1136B;COMBINING GRANTHA DIGIT FIVE;Mn;230;NSM;;;;;N;;;;;
+1136C;COMBINING GRANTHA DIGIT SIX;Mn;230;NSM;;;;;N;;;;;
+11370;COMBINING GRANTHA LETTER A;Mn;230;NSM;;;;;N;;;;;
+11371;COMBINING GRANTHA LETTER KA;Mn;230;NSM;;;;;N;;;;;
+11372;COMBINING GRANTHA LETTER NA;Mn;230;NSM;;;;;N;;;;;
+11373;COMBINING GRANTHA LETTER VI;Mn;230;NSM;;;;;N;;;;;
+11374;COMBINING GRANTHA LETTER PA;Mn;230;NSM;;;;;N;;;;;
+11480;TIRHUTA ANJI;Lo;0;L;;;;;N;;;;;
+11481;TIRHUTA LETTER A;Lo;0;L;;;;;N;;;;;
+11482;TIRHUTA LETTER AA;Lo;0;L;;;;;N;;;;;
+11483;TIRHUTA LETTER I;Lo;0;L;;;;;N;;;;;
+11484;TIRHUTA LETTER II;Lo;0;L;;;;;N;;;;;
+11485;TIRHUTA LETTER U;Lo;0;L;;;;;N;;;;;
+11486;TIRHUTA LETTER UU;Lo;0;L;;;;;N;;;;;
+11487;TIRHUTA LETTER VOCALIC R;Lo;0;L;;;;;N;;;;;
+11488;TIRHUTA LETTER VOCALIC RR;Lo;0;L;;;;;N;;;;;
+11489;TIRHUTA LETTER VOCALIC L;Lo;0;L;;;;;N;;;;;
+1148A;TIRHUTA LETTER VOCALIC LL;Lo;0;L;;;;;N;;;;;
+1148B;TIRHUTA LETTER E;Lo;0;L;;;;;N;;;;;
+1148C;TIRHUTA LETTER AI;Lo;0;L;;;;;N;;;;;
+1148D;TIRHUTA LETTER O;Lo;0;L;;;;;N;;;;;
+1148E;TIRHUTA LETTER AU;Lo;0;L;;;;;N;;;;;
+1148F;TIRHUTA LETTER KA;Lo;0;L;;;;;N;;;;;
+11490;TIRHUTA LETTER KHA;Lo;0;L;;;;;N;;;;;
+11491;TIRHUTA LETTER GA;Lo;0;L;;;;;N;;;;;
+11492;TIRHUTA LETTER GHA;Lo;0;L;;;;;N;;;;;
+11493;TIRHUTA LETTER NGA;Lo;0;L;;;;;N;;;;;
+11494;TIRHUTA LETTER CA;Lo;0;L;;;;;N;;;;;
+11495;TIRHUTA LETTER CHA;Lo;0;L;;;;;N;;;;;
+11496;TIRHUTA LETTER JA;Lo;0;L;;;;;N;;;;;
+11497;TIRHUTA LETTER JHA;Lo;0;L;;;;;N;;;;;
+11498;TIRHUTA LETTER NYA;Lo;0;L;;;;;N;;;;;
+11499;TIRHUTA LETTER TTA;Lo;0;L;;;;;N;;;;;
+1149A;TIRHUTA LETTER TTHA;Lo;0;L;;;;;N;;;;;
+1149B;TIRHUTA LETTER DDA;Lo;0;L;;;;;N;;;;;
+1149C;TIRHUTA LETTER DDHA;Lo;0;L;;;;;N;;;;;
+1149D;TIRHUTA LETTER NNA;Lo;0;L;;;;;N;;;;;
+1149E;TIRHUTA LETTER TA;Lo;0;L;;;;;N;;;;;
+1149F;TIRHUTA LETTER THA;Lo;0;L;;;;;N;;;;;
+114A0;TIRHUTA LETTER DA;Lo;0;L;;;;;N;;;;;
+114A1;TIRHUTA LETTER DHA;Lo;0;L;;;;;N;;;;;
+114A2;TIRHUTA LETTER NA;Lo;0;L;;;;;N;;;;;
+114A3;TIRHUTA LETTER PA;Lo;0;L;;;;;N;;;;;
+114A4;TIRHUTA LETTER PHA;Lo;0;L;;;;;N;;;;;
+114A5;TIRHUTA LETTER BA;Lo;0;L;;;;;N;;;;;
+114A6;TIRHUTA LETTER BHA;Lo;0;L;;;;;N;;;;;
+114A7;TIRHUTA LETTER MA;Lo;0;L;;;;;N;;;;;
+114A8;TIRHUTA LETTER YA;Lo;0;L;;;;;N;;;;;
+114A9;TIRHUTA LETTER RA;Lo;0;L;;;;;N;;;;;
+114AA;TIRHUTA LETTER LA;Lo;0;L;;;;;N;;;;;
+114AB;TIRHUTA LETTER VA;Lo;0;L;;;;;N;;;;;
+114AC;TIRHUTA LETTER SHA;Lo;0;L;;;;;N;;;;;
+114AD;TIRHUTA LETTER SSA;Lo;0;L;;;;;N;;;;;
+114AE;TIRHUTA LETTER SA;Lo;0;L;;;;;N;;;;;
+114AF;TIRHUTA LETTER HA;Lo;0;L;;;;;N;;;;;
+114B0;TIRHUTA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
+114B1;TIRHUTA VOWEL SIGN I;Mc;0;L;;;;;N;;;;;
+114B2;TIRHUTA VOWEL SIGN II;Mc;0;L;;;;;N;;;;;
+114B3;TIRHUTA VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;
+114B4;TIRHUTA VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;
+114B5;TIRHUTA VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;
+114B6;TIRHUTA VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;
+114B7;TIRHUTA VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;
+114B8;TIRHUTA VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;
+114B9;TIRHUTA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
+114BA;TIRHUTA VOWEL SIGN SHORT E;Mn;0;NSM;;;;;N;;;;;
+114BB;TIRHUTA VOWEL SIGN AI;Mc;0;L;114B9 114BA;;;;N;;;;;
+114BC;TIRHUTA VOWEL SIGN O;Mc;0;L;114B9 114B0;;;;N;;;;;
+114BD;TIRHUTA VOWEL SIGN SHORT O;Mc;0;L;;;;;N;;;;;
+114BE;TIRHUTA VOWEL SIGN AU;Mc;0;L;114B9 114BD;;;;N;;;;;
+114BF;TIRHUTA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
+114C0;TIRHUTA SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;
+114C1;TIRHUTA SIGN VISARGA;Mc;0;L;;;;;N;;;;;
+114C2;TIRHUTA SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
+114C3;TIRHUTA SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;
+114C4;TIRHUTA SIGN AVAGRAHA;Lo;0;L;;;;;N;;;;;
+114C5;TIRHUTA GVANG;Lo;0;L;;;;;N;;;;;
+114C6;TIRHUTA ABBREVIATION SIGN;Po;0;L;;;;;N;;;;;
+114C7;TIRHUTA OM;Lo;0;L;;;;;N;;;;;
+114D0;TIRHUTA DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
+114D1;TIRHUTA DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
+114D2;TIRHUTA DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
+114D3;TIRHUTA DIGIT THREE;Nd;0;L;;3;3;3;N;;;;;
+114D4;TIRHUTA DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;;
+114D5;TIRHUTA DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;;
+114D6;TIRHUTA DIGIT SIX;Nd;0;L;;6;6;6;N;;;;;
+114D7;TIRHUTA DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
+114D8;TIRHUTA DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
+114D9;TIRHUTA DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
+11580;SIDDHAM LETTER A;Lo;0;L;;;;;N;;;;;
+11581;SIDDHAM LETTER AA;Lo;0;L;;;;;N;;;;;
+11582;SIDDHAM LETTER I;Lo;0;L;;;;;N;;;;;
+11583;SIDDHAM LETTER II;Lo;0;L;;;;;N;;;;;
+11584;SIDDHAM LETTER U;Lo;0;L;;;;;N;;;;;
+11585;SIDDHAM LETTER UU;Lo;0;L;;;;;N;;;;;
+11586;SIDDHAM LETTER VOCALIC R;Lo;0;L;;;;;N;;;;;
+11587;SIDDHAM LETTER VOCALIC RR;Lo;0;L;;;;;N;;;;;
+11588;SIDDHAM LETTER VOCALIC L;Lo;0;L;;;;;N;;;;;
+11589;SIDDHAM LETTER VOCALIC LL;Lo;0;L;;;;;N;;;;;
+1158A;SIDDHAM LETTER E;Lo;0;L;;;;;N;;;;;
+1158B;SIDDHAM LETTER AI;Lo;0;L;;;;;N;;;;;
+1158C;SIDDHAM LETTER O;Lo;0;L;;;;;N;;;;;
+1158D;SIDDHAM LETTER AU;Lo;0;L;;;;;N;;;;;
+1158E;SIDDHAM LETTER KA;Lo;0;L;;;;;N;;;;;
+1158F;SIDDHAM LETTER KHA;Lo;0;L;;;;;N;;;;;
+11590;SIDDHAM LETTER GA;Lo;0;L;;;;;N;;;;;
+11591;SIDDHAM LETTER GHA;Lo;0;L;;;;;N;;;;;
+11592;SIDDHAM LETTER NGA;Lo;0;L;;;;;N;;;;;
+11593;SIDDHAM LETTER CA;Lo;0;L;;;;;N;;;;;
+11594;SIDDHAM LETTER CHA;Lo;0;L;;;;;N;;;;;
+11595;SIDDHAM LETTER JA;Lo;0;L;;;;;N;;;;;
+11596;SIDDHAM LETTER JHA;Lo;0;L;;;;;N;;;;;
+11597;SIDDHAM LETTER NYA;Lo;0;L;;;;;N;;;;;
+11598;SIDDHAM LETTER TTA;Lo;0;L;;;;;N;;;;;
+11599;SIDDHAM LETTER TTHA;Lo;0;L;;;;;N;;;;;
+1159A;SIDDHAM LETTER DDA;Lo;0;L;;;;;N;;;;;
+1159B;SIDDHAM LETTER DDHA;Lo;0;L;;;;;N;;;;;
+1159C;SIDDHAM LETTER NNA;Lo;0;L;;;;;N;;;;;
+1159D;SIDDHAM LETTER TA;Lo;0;L;;;;;N;;;;;
+1159E;SIDDHAM LETTER THA;Lo;0;L;;;;;N;;;;;
+1159F;SIDDHAM LETTER DA;Lo;0;L;;;;;N;;;;;
+115A0;SIDDHAM LETTER DHA;Lo;0;L;;;;;N;;;;;
+115A1;SIDDHAM LETTER NA;Lo;0;L;;;;;N;;;;;
+115A2;SIDDHAM LETTER PA;Lo;0;L;;;;;N;;;;;
+115A3;SIDDHAM LETTER PHA;Lo;0;L;;;;;N;;;;;
+115A4;SIDDHAM LETTER BA;Lo;0;L;;;;;N;;;;;
+115A5;SIDDHAM LETTER BHA;Lo;0;L;;;;;N;;;;;
+115A6;SIDDHAM LETTER MA;Lo;0;L;;;;;N;;;;;
+115A7;SIDDHAM LETTER YA;Lo;0;L;;;;;N;;;;;
+115A8;SIDDHAM LETTER RA;Lo;0;L;;;;;N;;;;;
+115A9;SIDDHAM LETTER LA;Lo;0;L;;;;;N;;;;;
+115AA;SIDDHAM LETTER VA;Lo;0;L;;;;;N;;;;;
+115AB;SIDDHAM LETTER SHA;Lo;0;L;;;;;N;;;;;
+115AC;SIDDHAM LETTER SSA;Lo;0;L;;;;;N;;;;;
+115AD;SIDDHAM LETTER SA;Lo;0;L;;;;;N;;;;;
+115AE;SIDDHAM LETTER HA;Lo;0;L;;;;;N;;;;;
+115AF;SIDDHAM VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
+115B0;SIDDHAM VOWEL SIGN I;Mc;0;L;;;;;N;;;;;
+115B1;SIDDHAM VOWEL SIGN II;Mc;0;L;;;;;N;;;;;
+115B2;SIDDHAM VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;
+115B3;SIDDHAM VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;
+115B4;SIDDHAM VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;
+115B5;SIDDHAM VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;
+115B8;SIDDHAM VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
+115B9;SIDDHAM VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;
+115BA;SIDDHAM VOWEL SIGN O;Mc;0;L;115B8 115AF;;;;N;;;;;
+115BB;SIDDHAM VOWEL SIGN AU;Mc;0;L;115B9 115AF;;;;N;;;;;
+115BC;SIDDHAM SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
+115BD;SIDDHAM SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;
+115BE;SIDDHAM SIGN VISARGA;Mc;0;L;;;;;N;;;;;
+115BF;SIDDHAM SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
+115C0;SIDDHAM SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;
+115C1;SIDDHAM SIGN SIDDHAM;Po;0;L;;;;;N;;;;;
+115C2;SIDDHAM DANDA;Po;0;L;;;;;N;;;;;
+115C3;SIDDHAM DOUBLE DANDA;Po;0;L;;;;;N;;;;;
+115C4;SIDDHAM SEPARATOR DOT;Po;0;L;;;;;N;;;;;
+115C5;SIDDHAM SEPARATOR BAR;Po;0;L;;;;;N;;;;;
+115C6;SIDDHAM REPETITION MARK-1;Po;0;L;;;;;N;;;;;
+115C7;SIDDHAM REPETITION MARK-2;Po;0;L;;;;;N;;;;;
+115C8;SIDDHAM REPETITION MARK-3;Po;0;L;;;;;N;;;;;
+115C9;SIDDHAM END OF TEXT MARK;Po;0;L;;;;;N;;;;;
+11600;MODI LETTER A;Lo;0;L;;;;;N;;;;;
+11601;MODI LETTER AA;Lo;0;L;;;;;N;;;;;
+11602;MODI LETTER I;Lo;0;L;;;;;N;;;;;
+11603;MODI LETTER II;Lo;0;L;;;;;N;;;;;
+11604;MODI LETTER U;Lo;0;L;;;;;N;;;;;
+11605;MODI LETTER UU;Lo;0;L;;;;;N;;;;;
+11606;MODI LETTER VOCALIC R;Lo;0;L;;;;;N;;;;;
+11607;MODI LETTER VOCALIC RR;Lo;0;L;;;;;N;;;;;
+11608;MODI LETTER VOCALIC L;Lo;0;L;;;;;N;;;;;
+11609;MODI LETTER VOCALIC LL;Lo;0;L;;;;;N;;;;;
+1160A;MODI LETTER E;Lo;0;L;;;;;N;;;;;
+1160B;MODI LETTER AI;Lo;0;L;;;;;N;;;;;
+1160C;MODI LETTER O;Lo;0;L;;;;;N;;;;;
+1160D;MODI LETTER AU;Lo;0;L;;;;;N;;;;;
+1160E;MODI LETTER KA;Lo;0;L;;;;;N;;;;;
+1160F;MODI LETTER KHA;Lo;0;L;;;;;N;;;;;
+11610;MODI LETTER GA;Lo;0;L;;;;;N;;;;;
+11611;MODI LETTER GHA;Lo;0;L;;;;;N;;;;;
+11612;MODI LETTER NGA;Lo;0;L;;;;;N;;;;;
+11613;MODI LETTER CA;Lo;0;L;;;;;N;;;;;
+11614;MODI LETTER CHA;Lo;0;L;;;;;N;;;;;
+11615;MODI LETTER JA;Lo;0;L;;;;;N;;;;;
+11616;MODI LETTER JHA;Lo;0;L;;;;;N;;;;;
+11617;MODI LETTER NYA;Lo;0;L;;;;;N;;;;;
+11618;MODI LETTER TTA;Lo;0;L;;;;;N;;;;;
+11619;MODI LETTER TTHA;Lo;0;L;;;;;N;;;;;
+1161A;MODI LETTER DDA;Lo;0;L;;;;;N;;;;;
+1161B;MODI LETTER DDHA;Lo;0;L;;;;;N;;;;;
+1161C;MODI LETTER NNA;Lo;0;L;;;;;N;;;;;
+1161D;MODI LETTER TA;Lo;0;L;;;;;N;;;;;
+1161E;MODI LETTER THA;Lo;0;L;;;;;N;;;;;
+1161F;MODI LETTER DA;Lo;0;L;;;;;N;;;;;
+11620;MODI LETTER DHA;Lo;0;L;;;;;N;;;;;
+11621;MODI LETTER NA;Lo;0;L;;;;;N;;;;;
+11622;MODI LETTER PA;Lo;0;L;;;;;N;;;;;
+11623;MODI LETTER PHA;Lo;0;L;;;;;N;;;;;
+11624;MODI LETTER BA;Lo;0;L;;;;;N;;;;;
+11625;MODI LETTER BHA;Lo;0;L;;;;;N;;;;;
+11626;MODI LETTER MA;Lo;0;L;;;;;N;;;;;
+11627;MODI LETTER YA;Lo;0;L;;;;;N;;;;;
+11628;MODI LETTER RA;Lo;0;L;;;;;N;;;;;
+11629;MODI LETTER LA;Lo;0;L;;;;;N;;;;;
+1162A;MODI LETTER VA;Lo;0;L;;;;;N;;;;;
+1162B;MODI LETTER SHA;Lo;0;L;;;;;N;;;;;
+1162C;MODI LETTER SSA;Lo;0;L;;;;;N;;;;;
+1162D;MODI LETTER SA;Lo;0;L;;;;;N;;;;;
+1162E;MODI LETTER HA;Lo;0;L;;;;;N;;;;;
+1162F;MODI LETTER LLA;Lo;0;L;;;;;N;;;;;
+11630;MODI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
+11631;MODI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;
+11632;MODI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;
+11633;MODI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;
+11634;MODI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;
+11635;MODI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;
+11636;MODI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;
+11637;MODI VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;
+11638;MODI VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;
+11639;MODI VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;
+1163A;MODI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;
+1163B;MODI VOWEL SIGN O;Mc;0;L;;;;;N;;;;;
+1163C;MODI VOWEL SIGN AU;Mc;0;L;;;;;N;;;;;
+1163D;MODI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;
+1163E;MODI SIGN VISARGA;Mc;0;L;;;;;N;;;;;
+1163F;MODI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;
+11640;MODI SIGN ARDHACANDRA;Mn;0;NSM;;;;;N;;;;;
+11641;MODI DANDA;Po;0;L;;;;;N;;;;;
+11642;MODI DOUBLE DANDA;Po;0;L;;;;;N;;;;;
+11643;MODI ABBREVIATION SIGN;Po;0;L;;;;;N;;;;;
+11644;MODI SIGN HUVA;Lo;0;L;;;;;N;;;;;
+11650;MODI DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
+11651;MODI DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
+11652;MODI DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
+11653;MODI DIGIT THREE;Nd;0;L;;3;3;3;N;;;;;
+11654;MODI DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;;
+11655;MODI DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;;
+11656;MODI DIGIT SIX;Nd;0;L;;6;6;6;N;;;;;
+11657;MODI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
+11658;MODI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
+11659;MODI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
11680;TAKRI LETTER A;Lo;0;L;;;;;N;;;;;
11681;TAKRI LETTER AA;Lo;0;L;;;;;N;;;;;
11682;TAKRI LETTER I;Lo;0;L;;;;;N;;;;;
@@ -17775,6 +19308,147 @@
116C7;TAKRI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
116C8;TAKRI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
116C9;TAKRI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
+118A0;WARANG CITI CAPITAL LETTER NGAA;Lu;0;L;;;;;N;;;;118C0;
+118A1;WARANG CITI CAPITAL LETTER A;Lu;0;L;;;;;N;;;;118C1;
+118A2;WARANG CITI CAPITAL LETTER WI;Lu;0;L;;;;;N;;;;118C2;
+118A3;WARANG CITI CAPITAL LETTER YU;Lu;0;L;;;;;N;;;;118C3;
+118A4;WARANG CITI CAPITAL LETTER YA;Lu;0;L;;;;;N;;;;118C4;
+118A5;WARANG CITI CAPITAL LETTER YO;Lu;0;L;;;;;N;;;;118C5;
+118A6;WARANG CITI CAPITAL LETTER II;Lu;0;L;;;;;N;;;;118C6;
+118A7;WARANG CITI CAPITAL LETTER UU;Lu;0;L;;;;;N;;;;118C7;
+118A8;WARANG CITI CAPITAL LETTER E;Lu;0;L;;;;;N;;;;118C8;
+118A9;WARANG CITI CAPITAL LETTER O;Lu;0;L;;;;;N;;;;118C9;
+118AA;WARANG CITI CAPITAL LETTER ANG;Lu;0;L;;;;;N;;;;118CA;
+118AB;WARANG CITI CAPITAL LETTER GA;Lu;0;L;;;;;N;;;;118CB;
+118AC;WARANG CITI CAPITAL LETTER KO;Lu;0;L;;;;;N;;;;118CC;
+118AD;WARANG CITI CAPITAL LETTER ENY;Lu;0;L;;;;;N;;;;118CD;
+118AE;WARANG CITI CAPITAL LETTER YUJ;Lu;0;L;;;;;N;;;;118CE;
+118AF;WARANG CITI CAPITAL LETTER UC;Lu;0;L;;;;;N;;;;118CF;
+118B0;WARANG CITI CAPITAL LETTER ENN;Lu;0;L;;;;;N;;;;118D0;
+118B1;WARANG CITI CAPITAL LETTER ODD;Lu;0;L;;;;;N;;;;118D1;
+118B2;WARANG CITI CAPITAL LETTER TTE;Lu;0;L;;;;;N;;;;118D2;
+118B3;WARANG CITI CAPITAL LETTER NUNG;Lu;0;L;;;;;N;;;;118D3;
+118B4;WARANG CITI CAPITAL LETTER DA;Lu;0;L;;;;;N;;;;118D4;
+118B5;WARANG CITI CAPITAL LETTER AT;Lu;0;L;;;;;N;;;;118D5;
+118B6;WARANG CITI CAPITAL LETTER AM;Lu;0;L;;;;;N;;;;118D6;
+118B7;WARANG CITI CAPITAL LETTER BU;Lu;0;L;;;;;N;;;;118D7;
+118B8;WARANG CITI CAPITAL LETTER PU;Lu;0;L;;;;;N;;;;118D8;
+118B9;WARANG CITI CAPITAL LETTER HIYO;Lu;0;L;;;;;N;;;;118D9;
+118BA;WARANG CITI CAPITAL LETTER HOLO;Lu;0;L;;;;;N;;;;118DA;
+118BB;WARANG CITI CAPITAL LETTER HORR;Lu;0;L;;;;;N;;;;118DB;
+118BC;WARANG CITI CAPITAL LETTER HAR;Lu;0;L;;;;;N;;;;118DC;
+118BD;WARANG CITI CAPITAL LETTER SSUU;Lu;0;L;;;;;N;;;;118DD;
+118BE;WARANG CITI CAPITAL LETTER SII;Lu;0;L;;;;;N;;;;118DE;
+118BF;WARANG CITI CAPITAL LETTER VIYO;Lu;0;L;;;;;N;;;;118DF;
+118C0;WARANG CITI SMALL LETTER NGAA;Ll;0;L;;;;;N;;;118A0;;118A0
+118C1;WARANG CITI SMALL LETTER A;Ll;0;L;;;;;N;;;118A1;;118A1
+118C2;WARANG CITI SMALL LETTER WI;Ll;0;L;;;;;N;;;118A2;;118A2
+118C3;WARANG CITI SMALL LETTER YU;Ll;0;L;;;;;N;;;118A3;;118A3
+118C4;WARANG CITI SMALL LETTER YA;Ll;0;L;;;;;N;;;118A4;;118A4
+118C5;WARANG CITI SMALL LETTER YO;Ll;0;L;;;;;N;;;118A5;;118A5
+118C6;WARANG CITI SMALL LETTER II;Ll;0;L;;;;;N;;;118A6;;118A6
+118C7;WARANG CITI SMALL LETTER UU;Ll;0;L;;;;;N;;;118A7;;118A7
+118C8;WARANG CITI SMALL LETTER E;Ll;0;L;;;;;N;;;118A8;;118A8
+118C9;WARANG CITI SMALL LETTER O;Ll;0;L;;;;;N;;;118A9;;118A9
+118CA;WARANG CITI SMALL LETTER ANG;Ll;0;L;;;;;N;;;118AA;;118AA
+118CB;WARANG CITI SMALL LETTER GA;Ll;0;L;;;;;N;;;118AB;;118AB
+118CC;WARANG CITI SMALL LETTER KO;Ll;0;L;;;;;N;;;118AC;;118AC
+118CD;WARANG CITI SMALL LETTER ENY;Ll;0;L;;;;;N;;;118AD;;118AD
+118CE;WARANG CITI SMALL LETTER YUJ;Ll;0;L;;;;;N;;;118AE;;118AE
+118CF;WARANG CITI SMALL LETTER UC;Ll;0;L;;;;;N;;;118AF;;118AF
+118D0;WARANG CITI SMALL LETTER ENN;Ll;0;L;;;;;N;;;118B0;;118B0
+118D1;WARANG CITI SMALL LETTER ODD;Ll;0;L;;;;;N;;;118B1;;118B1
+118D2;WARANG CITI SMALL LETTER TTE;Ll;0;L;;;;;N;;;118B2;;118B2
+118D3;WARANG CITI SMALL LETTER NUNG;Ll;0;L;;;;;N;;;118B3;;118B3
+118D4;WARANG CITI SMALL LETTER DA;Ll;0;L;;;;;N;;;118B4;;118B4
+118D5;WARANG CITI SMALL LETTER AT;Ll;0;L;;;;;N;;;118B5;;118B5
+118D6;WARANG CITI SMALL LETTER AM;Ll;0;L;;;;;N;;;118B6;;118B6
+118D7;WARANG CITI SMALL LETTER BU;Ll;0;L;;;;;N;;;118B7;;118B7
+118D8;WARANG CITI SMALL LETTER PU;Ll;0;L;;;;;N;;;118B8;;118B8
+118D9;WARANG CITI SMALL LETTER HIYO;Ll;0;L;;;;;N;;;118B9;;118B9
+118DA;WARANG CITI SMALL LETTER HOLO;Ll;0;L;;;;;N;;;118BA;;118BA
+118DB;WARANG CITI SMALL LETTER HORR;Ll;0;L;;;;;N;;;118BB;;118BB
+118DC;WARANG CITI SMALL LETTER HAR;Ll;0;L;;;;;N;;;118BC;;118BC
+118DD;WARANG CITI SMALL LETTER SSUU;Ll;0;L;;;;;N;;;118BD;;118BD
+118DE;WARANG CITI SMALL LETTER SII;Ll;0;L;;;;;N;;;118BE;;118BE
+118DF;WARANG CITI SMALL LETTER VIYO;Ll;0;L;;;;;N;;;118BF;;118BF
+118E0;WARANG CITI DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
+118E1;WARANG CITI DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
+118E2;WARANG CITI DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
+118E3;WARANG CITI DIGIT THREE;Nd;0;L;;3;3;3;N;;;;;
+118E4;WARANG CITI DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;;
+118E5;WARANG CITI DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;;
+118E6;WARANG CITI DIGIT SIX;Nd;0;L;;6;6;6;N;;;;;
+118E7;WARANG CITI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
+118E8;WARANG CITI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
+118E9;WARANG CITI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
+118EA;WARANG CITI NUMBER TEN;No;0;L;;;;10;N;;;;;
+118EB;WARANG CITI NUMBER TWENTY;No;0;L;;;;20;N;;;;;
+118EC;WARANG CITI NUMBER THIRTY;No;0;L;;;;30;N;;;;;
+118ED;WARANG CITI NUMBER FORTY;No;0;L;;;;40;N;;;;;
+118EE;WARANG CITI NUMBER FIFTY;No;0;L;;;;50;N;;;;;
+118EF;WARANG CITI NUMBER SIXTY;No;0;L;;;;60;N;;;;;
+118F0;WARANG CITI NUMBER SEVENTY;No;0;L;;;;70;N;;;;;
+118F1;WARANG CITI NUMBER EIGHTY;No;0;L;;;;80;N;;;;;
+118F2;WARANG CITI NUMBER NINETY;No;0;L;;;;90;N;;;;;
+118FF;WARANG CITI OM;Lo;0;L;;;;;N;;;;;
+11AC0;PAU CIN HAU LETTER PA;Lo;0;L;;;;;N;;;;;
+11AC1;PAU CIN HAU LETTER KA;Lo;0;L;;;;;N;;;;;
+11AC2;PAU CIN HAU LETTER LA;Lo;0;L;;;;;N;;;;;
+11AC3;PAU CIN HAU LETTER MA;Lo;0;L;;;;;N;;;;;
+11AC4;PAU CIN HAU LETTER DA;Lo;0;L;;;;;N;;;;;
+11AC5;PAU CIN HAU LETTER ZA;Lo;0;L;;;;;N;;;;;
+11AC6;PAU CIN HAU LETTER VA;Lo;0;L;;;;;N;;;;;
+11AC7;PAU CIN HAU LETTER NGA;Lo;0;L;;;;;N;;;;;
+11AC8;PAU CIN HAU LETTER HA;Lo;0;L;;;;;N;;;;;
+11AC9;PAU CIN HAU LETTER GA;Lo;0;L;;;;;N;;;;;
+11ACA;PAU CIN HAU LETTER KHA;Lo;0;L;;;;;N;;;;;
+11ACB;PAU CIN HAU LETTER SA;Lo;0;L;;;;;N;;;;;
+11ACC;PAU CIN HAU LETTER BA;Lo;0;L;;;;;N;;;;;
+11ACD;PAU CIN HAU LETTER CA;Lo;0;L;;;;;N;;;;;
+11ACE;PAU CIN HAU LETTER TA;Lo;0;L;;;;;N;;;;;
+11ACF;PAU CIN HAU LETTER THA;Lo;0;L;;;;;N;;;;;
+11AD0;PAU CIN HAU LETTER NA;Lo;0;L;;;;;N;;;;;
+11AD1;PAU CIN HAU LETTER PHA;Lo;0;L;;;;;N;;;;;
+11AD2;PAU CIN HAU LETTER RA;Lo;0;L;;;;;N;;;;;
+11AD3;PAU CIN HAU LETTER FA;Lo;0;L;;;;;N;;;;;
+11AD4;PAU CIN HAU LETTER CHA;Lo;0;L;;;;;N;;;;;
+11AD5;PAU CIN HAU LETTER A;Lo;0;L;;;;;N;;;;;
+11AD6;PAU CIN HAU LETTER E;Lo;0;L;;;;;N;;;;;
+11AD7;PAU CIN HAU LETTER I;Lo;0;L;;;;;N;;;;;
+11AD8;PAU CIN HAU LETTER O;Lo;0;L;;;;;N;;;;;
+11AD9;PAU CIN HAU LETTER U;Lo;0;L;;;;;N;;;;;
+11ADA;PAU CIN HAU LETTER UA;Lo;0;L;;;;;N;;;;;
+11ADB;PAU CIN HAU LETTER IA;Lo;0;L;;;;;N;;;;;
+11ADC;PAU CIN HAU LETTER FINAL P;Lo;0;L;;;;;N;;;;;
+11ADD;PAU CIN HAU LETTER FINAL K;Lo;0;L;;;;;N;;;;;
+11ADE;PAU CIN HAU LETTER FINAL T;Lo;0;L;;;;;N;;;;;
+11ADF;PAU CIN HAU LETTER FINAL M;Lo;0;L;;;;;N;;;;;
+11AE0;PAU CIN HAU LETTER FINAL N;Lo;0;L;;;;;N;;;;;
+11AE1;PAU CIN HAU LETTER FINAL L;Lo;0;L;;;;;N;;;;;
+11AE2;PAU CIN HAU LETTER FINAL W;Lo;0;L;;;;;N;;;;;
+11AE3;PAU CIN HAU LETTER FINAL NG;Lo;0;L;;;;;N;;;;;
+11AE4;PAU CIN HAU LETTER FINAL Y;Lo;0;L;;;;;N;;;;;
+11AE5;PAU CIN HAU RISING TONE LONG;Lo;0;L;;;;;N;;;;;
+11AE6;PAU CIN HAU RISING TONE;Lo;0;L;;;;;N;;;;;
+11AE7;PAU CIN HAU SANDHI GLOTTAL STOP;Lo;0;L;;;;;N;;;;;
+11AE8;PAU CIN HAU RISING TONE LONG FINAL;Lo;0;L;;;;;N;;;;;
+11AE9;PAU CIN HAU RISING TONE FINAL;Lo;0;L;;;;;N;;;;;
+11AEA;PAU CIN HAU SANDHI GLOTTAL STOP FINAL;Lo;0;L;;;;;N;;;;;
+11AEB;PAU CIN HAU SANDHI TONE LONG;Lo;0;L;;;;;N;;;;;
+11AEC;PAU CIN HAU SANDHI TONE;Lo;0;L;;;;;N;;;;;
+11AED;PAU CIN HAU SANDHI TONE LONG FINAL;Lo;0;L;;;;;N;;;;;
+11AEE;PAU CIN HAU SANDHI TONE FINAL;Lo;0;L;;;;;N;;;;;
+11AEF;PAU CIN HAU MID-LEVEL TONE;Lo;0;L;;;;;N;;;;;
+11AF0;PAU CIN HAU GLOTTAL STOP VARIANT;Lo;0;L;;;;;N;;;;;
+11AF1;PAU CIN HAU MID-LEVEL TONE LONG FINAL;Lo;0;L;;;;;N;;;;;
+11AF2;PAU CIN HAU MID-LEVEL TONE FINAL;Lo;0;L;;;;;N;;;;;
+11AF3;PAU CIN HAU LOW-FALLING TONE LONG;Lo;0;L;;;;;N;;;;;
+11AF4;PAU CIN HAU LOW-FALLING TONE;Lo;0;L;;;;;N;;;;;
+11AF5;PAU CIN HAU GLOTTAL STOP;Lo;0;L;;;;;N;;;;;
+11AF6;PAU CIN HAU LOW-FALLING TONE LONG FINAL;Lo;0;L;;;;;N;;;;;
+11AF7;PAU CIN HAU LOW-FALLING TONE FINAL;Lo;0;L;;;;;N;;;;;
+11AF8;PAU CIN HAU GLOTTAL STOP FINAL;Lo;0;L;;;;;N;;;;;
12000;CUNEIFORM SIGN A;Lo;0;L;;;;;N;;;;;
12001;CUNEIFORM SIGN A TIMES A;Lo;0;L;;;;;N;;;;;
12002;CUNEIFORM SIGN A TIMES BAD;Lo;0;L;;;;;N;;;;;
@@ -18654,6 +20328,48 @@
1236C;CUNEIFORM SIGN ZU5 TIMES A;Lo;0;L;;;;;N;;;;;
1236D;CUNEIFORM SIGN ZUBUR;Lo;0;L;;;;;N;;;;;
1236E;CUNEIFORM SIGN ZUM;Lo;0;L;;;;;N;;;;;
+1236F;CUNEIFORM SIGN KAP ELAMITE;Lo;0;L;;;;;N;;;;;
+12370;CUNEIFORM SIGN AB TIMES NUN;Lo;0;L;;;;;N;;;;;
+12371;CUNEIFORM SIGN AB2 TIMES A;Lo;0;L;;;;;N;;;;;
+12372;CUNEIFORM SIGN AMAR TIMES KUG;Lo;0;L;;;;;N;;;;;
+12373;CUNEIFORM SIGN DAG KISIM5 TIMES U2 PLUS MASH;Lo;0;L;;;;;N;;;;;
+12374;CUNEIFORM SIGN DAG3;Lo;0;L;;;;;N;;;;;
+12375;CUNEIFORM SIGN DISH PLUS SHU;Lo;0;L;;;;;N;;;;;
+12376;CUNEIFORM SIGN DUB TIMES SHE;Lo;0;L;;;;;N;;;;;
+12377;CUNEIFORM SIGN EZEN TIMES GUD;Lo;0;L;;;;;N;;;;;
+12378;CUNEIFORM SIGN EZEN TIMES SHE;Lo;0;L;;;;;N;;;;;
+12379;CUNEIFORM SIGN GA2 TIMES AN PLUS KAK PLUS A;Lo;0;L;;;;;N;;;;;
+1237A;CUNEIFORM SIGN GA2 TIMES ASH2;Lo;0;L;;;;;N;;;;;
+1237B;CUNEIFORM SIGN GE22;Lo;0;L;;;;;N;;;;;
+1237C;CUNEIFORM SIGN GIG;Lo;0;L;;;;;N;;;;;
+1237D;CUNEIFORM SIGN HUSH;Lo;0;L;;;;;N;;;;;
+1237E;CUNEIFORM SIGN KA TIMES ANSHE;Lo;0;L;;;;;N;;;;;
+1237F;CUNEIFORM SIGN KA TIMES ASH3;Lo;0;L;;;;;N;;;;;
+12380;CUNEIFORM SIGN KA TIMES GISH;Lo;0;L;;;;;N;;;;;
+12381;CUNEIFORM SIGN KA TIMES GUD;Lo;0;L;;;;;N;;;;;
+12382;CUNEIFORM SIGN KA TIMES HI TIMES ASH2;Lo;0;L;;;;;N;;;;;
+12383;CUNEIFORM SIGN KA TIMES LUM;Lo;0;L;;;;;N;;;;;
+12384;CUNEIFORM SIGN KA TIMES PA;Lo;0;L;;;;;N;;;;;
+12385;CUNEIFORM SIGN KA TIMES SHUL;Lo;0;L;;;;;N;;;;;
+12386;CUNEIFORM SIGN KA TIMES TU;Lo;0;L;;;;;N;;;;;
+12387;CUNEIFORM SIGN KA TIMES UR2;Lo;0;L;;;;;N;;;;;
+12388;CUNEIFORM SIGN LAGAB TIMES GI;Lo;0;L;;;;;N;;;;;
+12389;CUNEIFORM SIGN LU2 SHESHIG TIMES BAD;Lo;0;L;;;;;N;;;;;
+1238A;CUNEIFORM SIGN LU2 TIMES ESH2 PLUS LAL;Lo;0;L;;;;;N;;;;;
+1238B;CUNEIFORM SIGN LU2 TIMES SHU;Lo;0;L;;;;;N;;;;;
+1238C;CUNEIFORM SIGN MESH;Lo;0;L;;;;;N;;;;;
+1238D;CUNEIFORM SIGN MUSH3 TIMES ZA;Lo;0;L;;;;;N;;;;;
+1238E;CUNEIFORM SIGN NA4;Lo;0;L;;;;;N;;;;;
+1238F;CUNEIFORM SIGN NIN;Lo;0;L;;;;;N;;;;;
+12390;CUNEIFORM SIGN NIN9;Lo;0;L;;;;;N;;;;;
+12391;CUNEIFORM SIGN NINDA2 TIMES BAL;Lo;0;L;;;;;N;;;;;
+12392;CUNEIFORM SIGN NINDA2 TIMES GI;Lo;0;L;;;;;N;;;;;
+12393;CUNEIFORM SIGN NU11 ROTATED NINETY DEGREES;Lo;0;L;;;;;N;;;;;
+12394;CUNEIFORM SIGN PESH2 ASTERISK;Lo;0;L;;;;;N;;;;;
+12395;CUNEIFORM SIGN PIR2;Lo;0;L;;;;;N;;;;;
+12396;CUNEIFORM SIGN SAG TIMES IGI GUNU;Lo;0;L;;;;;N;;;;;
+12397;CUNEIFORM SIGN TI2;Lo;0;L;;;;;N;;;;;
+12398;CUNEIFORM SIGN UM TIMES ME;Lo;0;L;;;;;N;;;;;
12400;CUNEIFORM NUMERIC SIGN TWO ASH;Nl;0;L;;;;2;N;;;;;
12401;CUNEIFORM NUMERIC SIGN THREE ASH;Nl;0;L;;;;3;N;;;;;
12402;CUNEIFORM NUMERIC SIGN FOUR ASH;Nl;0;L;;;;4;N;;;;;
@@ -18740,8 +20456,8 @@
12453;CUNEIFORM NUMERIC SIGN FOUR BAN2 VARIANT FORM;Nl;0;L;;;;4;N;;;;;
12454;CUNEIFORM NUMERIC SIGN FIVE BAN2;Nl;0;L;;;;5;N;;;;;
12455;CUNEIFORM NUMERIC SIGN FIVE BAN2 VARIANT FORM;Nl;0;L;;;;5;N;;;;;
-12456;CUNEIFORM NUMERIC SIGN NIGIDAMIN;Nl;0;L;;;;-1;N;;;;;
-12457;CUNEIFORM NUMERIC SIGN NIGIDAESH;Nl;0;L;;;;-1;N;;;;;
+12456;CUNEIFORM NUMERIC SIGN NIGIDAMIN;Nl;0;L;;;;2;N;;;;;
+12457;CUNEIFORM NUMERIC SIGN NIGIDAESH;Nl;0;L;;;;3;N;;;;;
12458;CUNEIFORM NUMERIC SIGN ONE ESHE3;Nl;0;L;;;;1;N;;;;;
12459;CUNEIFORM NUMERIC SIGN TWO ESHE3;Nl;0;L;;;;2;N;;;;;
1245A;CUNEIFORM NUMERIC SIGN ONE THIRD DISH;Nl;0;L;;;;1/3;N;;;;;
@@ -18753,10 +20469,23 @@
12460;CUNEIFORM NUMERIC SIGN ONE QUARTER ASH;Nl;0;L;;;;1/4;N;;;;;
12461;CUNEIFORM NUMERIC SIGN OLD ASSYRIAN ONE SIXTH;Nl;0;L;;;;1/6;N;;;;;
12462;CUNEIFORM NUMERIC SIGN OLD ASSYRIAN ONE QUARTER;Nl;0;L;;;;1/4;N;;;;;
+12463;CUNEIFORM NUMERIC SIGN ONE QUARTER GUR;Nl;0;L;;;;1/4;N;;;;;
+12464;CUNEIFORM NUMERIC SIGN ONE HALF GUR;Nl;0;L;;;;1/2;N;;;;;
+12465;CUNEIFORM NUMERIC SIGN ELAMITE ONE THIRD;Nl;0;L;;;;1/3;N;;;;;
+12466;CUNEIFORM NUMERIC SIGN ELAMITE TWO THIRDS;Nl;0;L;;;;2/3;N;;;;;
+12467;CUNEIFORM NUMERIC SIGN ELAMITE FORTY;Nl;0;L;;;;40;N;;;;;
+12468;CUNEIFORM NUMERIC SIGN ELAMITE FIFTY;Nl;0;L;;;;50;N;;;;;
+12469;CUNEIFORM NUMERIC SIGN FOUR U VARIANT FORM;Nl;0;L;;;;4;N;;;;;
+1246A;CUNEIFORM NUMERIC SIGN FIVE U VARIANT FORM;Nl;0;L;;;;5;N;;;;;
+1246B;CUNEIFORM NUMERIC SIGN SIX U VARIANT FORM;Nl;0;L;;;;6;N;;;;;
+1246C;CUNEIFORM NUMERIC SIGN SEVEN U VARIANT FORM;Nl;0;L;;;;7;N;;;;;
+1246D;CUNEIFORM NUMERIC SIGN EIGHT U VARIANT FORM;Nl;0;L;;;;8;N;;;;;
+1246E;CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM;Nl;0;L;;;;9;N;;;;;
12470;CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER;Po;0;L;;;;;N;;;;;
12471;CUNEIFORM PUNCTUATION SIGN VERTICAL COLON;Po;0;L;;;;;N;;;;;
12472;CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON;Po;0;L;;;;;N;;;;;
12473;CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON;Po;0;L;;;;;N;;;;;
+12474;CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON;Po;0;L;;;;;N;;;;;
13000;EGYPTIAN HIEROGLYPH A001;Lo;0;L;;;;;N;;;;;
13001;EGYPTIAN HIEROGLYPH A002;Lo;0;L;;;;;N;;;;;
13002;EGYPTIAN HIEROGLYPH A003;Lo;0;L;;;;;N;;;;;
@@ -20397,6 +22126,212 @@
16A36;BAMUM LETTER PHASE-F KPA;Lo;0;L;;;;;N;;;;;
16A37;BAMUM LETTER PHASE-F SAMBA;Lo;0;L;;;;;N;;;;;
16A38;BAMUM LETTER PHASE-F VUEQ;Lo;0;L;;;;;N;;;;;
+16A40;MRO LETTER TA;Lo;0;L;;;;;N;;;;;
+16A41;MRO LETTER NGI;Lo;0;L;;;;;N;;;;;
+16A42;MRO LETTER YO;Lo;0;L;;;;;N;;;;;
+16A43;MRO LETTER MIM;Lo;0;L;;;;;N;;;;;
+16A44;MRO LETTER BA;Lo;0;L;;;;;N;;;;;
+16A45;MRO LETTER DA;Lo;0;L;;;;;N;;;;;
+16A46;MRO LETTER A;Lo;0;L;;;;;N;;;;;
+16A47;MRO LETTER PHI;Lo;0;L;;;;;N;;;;;
+16A48;MRO LETTER KHAI;Lo;0;L;;;;;N;;;;;
+16A49;MRO LETTER HAO;Lo;0;L;;;;;N;;;;;
+16A4A;MRO LETTER DAI;Lo;0;L;;;;;N;;;;;
+16A4B;MRO LETTER CHU;Lo;0;L;;;;;N;;;;;
+16A4C;MRO LETTER KEAAE;Lo;0;L;;;;;N;;;;;
+16A4D;MRO LETTER OL;Lo;0;L;;;;;N;;;;;
+16A4E;MRO LETTER MAEM;Lo;0;L;;;;;N;;;;;
+16A4F;MRO LETTER NIN;Lo;0;L;;;;;N;;;;;
+16A50;MRO LETTER PA;Lo;0;L;;;;;N;;;;;
+16A51;MRO LETTER OO;Lo;0;L;;;;;N;;;;;
+16A52;MRO LETTER O;Lo;0;L;;;;;N;;;;;
+16A53;MRO LETTER RO;Lo;0;L;;;;;N;;;;;
+16A54;MRO LETTER SHI;Lo;0;L;;;;;N;;;;;
+16A55;MRO LETTER THEA;Lo;0;L;;;;;N;;;;;
+16A56;MRO LETTER EA;Lo;0;L;;;;;N;;;;;
+16A57;MRO LETTER WA;Lo;0;L;;;;;N;;;;;
+16A58;MRO LETTER E;Lo;0;L;;;;;N;;;;;
+16A59;MRO LETTER KO;Lo;0;L;;;;;N;;;;;
+16A5A;MRO LETTER LAN;Lo;0;L;;;;;N;;;;;
+16A5B;MRO LETTER LA;Lo;0;L;;;;;N;;;;;
+16A5C;MRO LETTER HAI;Lo;0;L;;;;;N;;;;;
+16A5D;MRO LETTER RI;Lo;0;L;;;;;N;;;;;
+16A5E;MRO LETTER TEK;Lo;0;L;;;;;N;;;;;
+16A60;MRO DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
+16A61;MRO DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
+16A62;MRO DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
+16A63;MRO DIGIT THREE;Nd;0;L;;3;3;3;N;;;;;
+16A64;MRO DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;;
+16A65;MRO DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;;
+16A66;MRO DIGIT SIX;Nd;0;L;;6;6;6;N;;;;;
+16A67;MRO DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
+16A68;MRO DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
+16A69;MRO DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
+16A6E;MRO DANDA;Po;0;L;;;;;N;;;;;
+16A6F;MRO DOUBLE DANDA;Po;0;L;;;;;N;;;;;
+16AD0;BASSA VAH LETTER ENNI;Lo;0;L;;;;;N;;;;;
+16AD1;BASSA VAH LETTER KA;Lo;0;L;;;;;N;;;;;
+16AD2;BASSA VAH LETTER SE;Lo;0;L;;;;;N;;;;;
+16AD3;BASSA VAH LETTER FA;Lo;0;L;;;;;N;;;;;
+16AD4;BASSA VAH LETTER MBE;Lo;0;L;;;;;N;;;;;
+16AD5;BASSA VAH LETTER YIE;Lo;0;L;;;;;N;;;;;
+16AD6;BASSA VAH LETTER GAH;Lo;0;L;;;;;N;;;;;
+16AD7;BASSA VAH LETTER DHII;Lo;0;L;;;;;N;;;;;
+16AD8;BASSA VAH LETTER KPAH;Lo;0;L;;;;;N;;;;;
+16AD9;BASSA VAH LETTER JO;Lo;0;L;;;;;N;;;;;
+16ADA;BASSA VAH LETTER HWAH;Lo;0;L;;;;;N;;;;;
+16ADB;BASSA VAH LETTER WA;Lo;0;L;;;;;N;;;;;
+16ADC;BASSA VAH LETTER ZO;Lo;0;L;;;;;N;;;;;
+16ADD;BASSA VAH LETTER GBU;Lo;0;L;;;;;N;;;;;
+16ADE;BASSA VAH LETTER DO;Lo;0;L;;;;;N;;;;;
+16ADF;BASSA VAH LETTER CE;Lo;0;L;;;;;N;;;;;
+16AE0;BASSA VAH LETTER UWU;Lo;0;L;;;;;N;;;;;
+16AE1;BASSA VAH LETTER TO;Lo;0;L;;;;;N;;;;;
+16AE2;BASSA VAH LETTER BA;Lo;0;L;;;;;N;;;;;
+16AE3;BASSA VAH LETTER VU;Lo;0;L;;;;;N;;;;;
+16AE4;BASSA VAH LETTER YEIN;Lo;0;L;;;;;N;;;;;
+16AE5;BASSA VAH LETTER PA;Lo;0;L;;;;;N;;;;;
+16AE6;BASSA VAH LETTER WADDA;Lo;0;L;;;;;N;;;;;
+16AE7;BASSA VAH LETTER A;Lo;0;L;;;;;N;;;;;
+16AE8;BASSA VAH LETTER O;Lo;0;L;;;;;N;;;;;
+16AE9;BASSA VAH LETTER OO;Lo;0;L;;;;;N;;;;;
+16AEA;BASSA VAH LETTER U;Lo;0;L;;;;;N;;;;;
+16AEB;BASSA VAH LETTER EE;Lo;0;L;;;;;N;;;;;
+16AEC;BASSA VAH LETTER E;Lo;0;L;;;;;N;;;;;
+16AED;BASSA VAH LETTER I;Lo;0;L;;;;;N;;;;;
+16AF0;BASSA VAH COMBINING HIGH TONE;Mn;1;NSM;;;;;N;;;;;
+16AF1;BASSA VAH COMBINING LOW TONE;Mn;1;NSM;;;;;N;;;;;
+16AF2;BASSA VAH COMBINING MID TONE;Mn;1;NSM;;;;;N;;;;;
+16AF3;BASSA VAH COMBINING LOW-MID TONE;Mn;1;NSM;;;;;N;;;;;
+16AF4;BASSA VAH COMBINING HIGH-LOW TONE;Mn;1;NSM;;;;;N;;;;;
+16AF5;BASSA VAH FULL STOP;Po;0;L;;;;;N;;;;;
+16B00;PAHAWH HMONG VOWEL KEEB;Lo;0;L;;;;;N;;;;;
+16B01;PAHAWH HMONG VOWEL KEEV;Lo;0;L;;;;;N;;;;;
+16B02;PAHAWH HMONG VOWEL KIB;Lo;0;L;;;;;N;;;;;
+16B03;PAHAWH HMONG VOWEL KIV;Lo;0;L;;;;;N;;;;;
+16B04;PAHAWH HMONG VOWEL KAUB;Lo;0;L;;;;;N;;;;;
+16B05;PAHAWH HMONG VOWEL KAUV;Lo;0;L;;;;;N;;;;;
+16B06;PAHAWH HMONG VOWEL KUB;Lo;0;L;;;;;N;;;;;
+16B07;PAHAWH HMONG VOWEL KUV;Lo;0;L;;;;;N;;;;;
+16B08;PAHAWH HMONG VOWEL KEB;Lo;0;L;;;;;N;;;;;
+16B09;PAHAWH HMONG VOWEL KEV;Lo;0;L;;;;;N;;;;;
+16B0A;PAHAWH HMONG VOWEL KAIB;Lo;0;L;;;;;N;;;;;
+16B0B;PAHAWH HMONG VOWEL KAIV;Lo;0;L;;;;;N;;;;;
+16B0C;PAHAWH HMONG VOWEL KOOB;Lo;0;L;;;;;N;;;;;
+16B0D;PAHAWH HMONG VOWEL KOOV;Lo;0;L;;;;;N;;;;;
+16B0E;PAHAWH HMONG VOWEL KAWB;Lo;0;L;;;;;N;;;;;
+16B0F;PAHAWH HMONG VOWEL KAWV;Lo;0;L;;;;;N;;;;;
+16B10;PAHAWH HMONG VOWEL KUAB;Lo;0;L;;;;;N;;;;;
+16B11;PAHAWH HMONG VOWEL KUAV;Lo;0;L;;;;;N;;;;;
+16B12;PAHAWH HMONG VOWEL KOB;Lo;0;L;;;;;N;;;;;
+16B13;PAHAWH HMONG VOWEL KOV;Lo;0;L;;;;;N;;;;;
+16B14;PAHAWH HMONG VOWEL KIAB;Lo;0;L;;;;;N;;;;;
+16B15;PAHAWH HMONG VOWEL KIAV;Lo;0;L;;;;;N;;;;;
+16B16;PAHAWH HMONG VOWEL KAB;Lo;0;L;;;;;N;;;;;
+16B17;PAHAWH HMONG VOWEL KAV;Lo;0;L;;;;;N;;;;;
+16B18;PAHAWH HMONG VOWEL KWB;Lo;0;L;;;;;N;;;;;
+16B19;PAHAWH HMONG VOWEL KWV;Lo;0;L;;;;;N;;;;;
+16B1A;PAHAWH HMONG VOWEL KAAB;Lo;0;L;;;;;N;;;;;
+16B1B;PAHAWH HMONG VOWEL KAAV;Lo;0;L;;;;;N;;;;;
+16B1C;PAHAWH HMONG CONSONANT VAU;Lo;0;L;;;;;N;;;;;
+16B1D;PAHAWH HMONG CONSONANT NTSAU;Lo;0;L;;;;;N;;;;;
+16B1E;PAHAWH HMONG CONSONANT LAU;Lo;0;L;;;;;N;;;;;
+16B1F;PAHAWH HMONG CONSONANT HAU;Lo;0;L;;;;;N;;;;;
+16B20;PAHAWH HMONG CONSONANT NLAU;Lo;0;L;;;;;N;;;;;
+16B21;PAHAWH HMONG CONSONANT RAU;Lo;0;L;;;;;N;;;;;
+16B22;PAHAWH HMONG CONSONANT NKAU;Lo;0;L;;;;;N;;;;;
+16B23;PAHAWH HMONG CONSONANT QHAU;Lo;0;L;;;;;N;;;;;
+16B24;PAHAWH HMONG CONSONANT YAU;Lo;0;L;;;;;N;;;;;
+16B25;PAHAWH HMONG CONSONANT HLAU;Lo;0;L;;;;;N;;;;;
+16B26;PAHAWH HMONG CONSONANT MAU;Lo;0;L;;;;;N;;;;;
+16B27;PAHAWH HMONG CONSONANT CHAU;Lo;0;L;;;;;N;;;;;
+16B28;PAHAWH HMONG CONSONANT NCHAU;Lo;0;L;;;;;N;;;;;
+16B29;PAHAWH HMONG CONSONANT HNAU;Lo;0;L;;;;;N;;;;;
+16B2A;PAHAWH HMONG CONSONANT PLHAU;Lo;0;L;;;;;N;;;;;
+16B2B;PAHAWH HMONG CONSONANT NTHAU;Lo;0;L;;;;;N;;;;;
+16B2C;PAHAWH HMONG CONSONANT NAU;Lo;0;L;;;;;N;;;;;
+16B2D;PAHAWH HMONG CONSONANT AU;Lo;0;L;;;;;N;;;;;
+16B2E;PAHAWH HMONG CONSONANT XAU;Lo;0;L;;;;;N;;;;;
+16B2F;PAHAWH HMONG CONSONANT CAU;Lo;0;L;;;;;N;;;;;
+16B30;PAHAWH HMONG MARK CIM TUB;Mn;230;NSM;;;;;N;;;;;
+16B31;PAHAWH HMONG MARK CIM SO;Mn;230;NSM;;;;;N;;;;;
+16B32;PAHAWH HMONG MARK CIM KES;Mn;230;NSM;;;;;N;;;;;
+16B33;PAHAWH HMONG MARK CIM KHAV;Mn;230;NSM;;;;;N;;;;;
+16B34;PAHAWH HMONG MARK CIM SUAM;Mn;230;NSM;;;;;N;;;;;
+16B35;PAHAWH HMONG MARK CIM HOM;Mn;230;NSM;;;;;N;;;;;
+16B36;PAHAWH HMONG MARK CIM TAUM;Mn;230;NSM;;;;;N;;;;;
+16B37;PAHAWH HMONG SIGN VOS THOM;Po;0;L;;;;;N;;;;;
+16B38;PAHAWH HMONG SIGN VOS TSHAB CEEB;Po;0;L;;;;;N;;;;;
+16B39;PAHAWH HMONG SIGN CIM CHEEM;Po;0;L;;;;;N;;;;;
+16B3A;PAHAWH HMONG SIGN VOS THIAB;Po;0;L;;;;;N;;;;;
+16B3B;PAHAWH HMONG SIGN VOS FEEM;Po;0;L;;;;;N;;;;;
+16B3C;PAHAWH HMONG SIGN XYEEM NTXIV;So;0;L;;;;;N;;;;;
+16B3D;PAHAWH HMONG SIGN XYEEM RHO;So;0;L;;;;;N;;;;;
+16B3E;PAHAWH HMONG SIGN XYEEM TOV;So;0;L;;;;;N;;;;;
+16B3F;PAHAWH HMONG SIGN XYEEM FAIB;So;0;L;;;;;N;;;;;
+16B40;PAHAWH HMONG SIGN VOS SEEV;Lm;0;L;;;;;N;;;;;
+16B41;PAHAWH HMONG SIGN MEEJ SUAB;Lm;0;L;;;;;N;;;;;
+16B42;PAHAWH HMONG SIGN VOS NRUA;Lm;0;L;;;;;N;;;;;
+16B43;PAHAWH HMONG SIGN IB YAM;Lm;0;L;;;;;N;;;;;
+16B44;PAHAWH HMONG SIGN XAUS;Po;0;L;;;;;N;;;;;
+16B45;PAHAWH HMONG SIGN CIM TSOV ROG;So;0;L;;;;;N;;;;;
+16B50;PAHAWH HMONG DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
+16B51;PAHAWH HMONG DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
+16B52;PAHAWH HMONG DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
+16B53;PAHAWH HMONG DIGIT THREE;Nd;0;L;;3;3;3;N;;;;;
+16B54;PAHAWH HMONG DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;;
+16B55;PAHAWH HMONG DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;;
+16B56;PAHAWH HMONG DIGIT SIX;Nd;0;L;;6;6;6;N;;;;;
+16B57;PAHAWH HMONG DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
+16B58;PAHAWH HMONG DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
+16B59;PAHAWH HMONG DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
+16B5B;PAHAWH HMONG NUMBER TENS;No;0;L;;;;10;N;;;;;
+16B5C;PAHAWH HMONG NUMBER HUNDREDS;No;0;L;;;;100;N;;;;;
+16B5D;PAHAWH HMONG NUMBER TEN THOUSANDS;No;0;L;;;;10000;N;;;;;
+16B5E;PAHAWH HMONG NUMBER MILLIONS;No;0;L;;;;1000000;N;;;;;
+16B5F;PAHAWH HMONG NUMBER HUNDRED MILLIONS;No;0;L;;;;100000000;N;;;;;
+16B60;PAHAWH HMONG NUMBER TEN BILLIONS;No;0;L;;;;10000000000;N;;;;;
+16B61;PAHAWH HMONG NUMBER TRILLIONS;No;0;L;;;;1000000000000;N;;;;;
+16B63;PAHAWH HMONG SIGN VOS LUB;Lo;0;L;;;;;N;;;;;
+16B64;PAHAWH HMONG SIGN XYOO;Lo;0;L;;;;;N;;;;;
+16B65;PAHAWH HMONG SIGN HLI;Lo;0;L;;;;;N;;;;;
+16B66;PAHAWH HMONG SIGN THIRD-STAGE HLI;Lo;0;L;;;;;N;;;;;
+16B67;PAHAWH HMONG SIGN ZWJ THAJ;Lo;0;L;;;;;N;;;;;
+16B68;PAHAWH HMONG SIGN HNUB;Lo;0;L;;;;;N;;;;;
+16B69;PAHAWH HMONG SIGN NQIG;Lo;0;L;;;;;N;;;;;
+16B6A;PAHAWH HMONG SIGN XIAB;Lo;0;L;;;;;N;;;;;
+16B6B;PAHAWH HMONG SIGN NTUJ;Lo;0;L;;;;;N;;;;;
+16B6C;PAHAWH HMONG SIGN AV;Lo;0;L;;;;;N;;;;;
+16B6D;PAHAWH HMONG SIGN TXHEEJ CEEV;Lo;0;L;;;;;N;;;;;
+16B6E;PAHAWH HMONG SIGN MEEJ TSEEB;Lo;0;L;;;;;N;;;;;
+16B6F;PAHAWH HMONG SIGN TAU;Lo;0;L;;;;;N;;;;;
+16B70;PAHAWH HMONG SIGN LOS;Lo;0;L;;;;;N;;;;;
+16B71;PAHAWH HMONG SIGN MUS;Lo;0;L;;;;;N;;;;;
+16B72;PAHAWH HMONG SIGN CIM HAIS LUS NTOG NTOG;Lo;0;L;;;;;N;;;;;
+16B73;PAHAWH HMONG SIGN CIM CUAM TSHOOJ;Lo;0;L;;;;;N;;;;;
+16B74;PAHAWH HMONG SIGN CIM TXWV;Lo;0;L;;;;;N;;;;;
+16B75;PAHAWH HMONG SIGN CIM TXWV CHWV;Lo;0;L;;;;;N;;;;;
+16B76;PAHAWH HMONG SIGN CIM PUB DAWB;Lo;0;L;;;;;N;;;;;
+16B77;PAHAWH HMONG SIGN CIM NRES TOS;Lo;0;L;;;;;N;;;;;
+16B7D;PAHAWH HMONG CLAN SIGN TSHEEJ;Lo;0;L;;;;;N;;;;;
+16B7E;PAHAWH HMONG CLAN SIGN YEEG;Lo;0;L;;;;;N;;;;;
+16B7F;PAHAWH HMONG CLAN SIGN LIS;Lo;0;L;;;;;N;;;;;
+16B80;PAHAWH HMONG CLAN SIGN LAUJ;Lo;0;L;;;;;N;;;;;
+16B81;PAHAWH HMONG CLAN SIGN XYOOJ;Lo;0;L;;;;;N;;;;;
+16B82;PAHAWH HMONG CLAN SIGN KOO;Lo;0;L;;;;;N;;;;;
+16B83;PAHAWH HMONG CLAN SIGN HAWJ;Lo;0;L;;;;;N;;;;;
+16B84;PAHAWH HMONG CLAN SIGN MUAS;Lo;0;L;;;;;N;;;;;
+16B85;PAHAWH HMONG CLAN SIGN THOJ;Lo;0;L;;;;;N;;;;;
+16B86;PAHAWH HMONG CLAN SIGN TSAB;Lo;0;L;;;;;N;;;;;
+16B87;PAHAWH HMONG CLAN SIGN PHAB;Lo;0;L;;;;;N;;;;;
+16B88;PAHAWH HMONG CLAN SIGN KHAB;Lo;0;L;;;;;N;;;;;
+16B89;PAHAWH HMONG CLAN SIGN HAM;Lo;0;L;;;;;N;;;;;
+16B8A;PAHAWH HMONG CLAN SIGN VAJ;Lo;0;L;;;;;N;;;;;
+16B8B;PAHAWH HMONG CLAN SIGN FAJ;Lo;0;L;;;;;N;;;;;
+16B8C;PAHAWH HMONG CLAN SIGN YAJ;Lo;0;L;;;;;N;;;;;
+16B8D;PAHAWH HMONG CLAN SIGN TSWB;Lo;0;L;;;;;N;;;;;
+16B8E;PAHAWH HMONG CLAN SIGN KWM;Lo;0;L;;;;;N;;;;;
+16B8F;PAHAWH HMONG CLAN SIGN VWJ;Lo;0;L;;;;;N;;;;;
16F00;MIAO LETTER PA;Lo;0;L;;;;;N;;;;;
16F01;MIAO LETTER BA;Lo;0;L;;;;;N;;;;;
16F02;MIAO LETTER YI PA;Lo;0;L;;;;;N;;;;;
@@ -20532,6 +22467,153 @@
16F9F;MIAO LETTER REFORMED TONE-8;Lm;0;L;;;;;N;;;;;
1B000;KATAKANA LETTER ARCHAIC E;Lo;0;L;;;;;N;;;;;
1B001;HIRAGANA LETTER ARCHAIC YE;Lo;0;L;;;;;N;;;;;
+1BC00;DUPLOYAN LETTER H;Lo;0;L;;;;;N;;;;;
+1BC01;DUPLOYAN LETTER X;Lo;0;L;;;;;N;;;;;
+1BC02;DUPLOYAN LETTER P;Lo;0;L;;;;;N;;;;;
+1BC03;DUPLOYAN LETTER T;Lo;0;L;;;;;N;;;;;
+1BC04;DUPLOYAN LETTER F;Lo;0;L;;;;;N;;;;;
+1BC05;DUPLOYAN LETTER K;Lo;0;L;;;;;N;;;;;
+1BC06;DUPLOYAN LETTER L;Lo;0;L;;;;;N;;;;;
+1BC07;DUPLOYAN LETTER B;Lo;0;L;;;;;N;;;;;
+1BC08;DUPLOYAN LETTER D;Lo;0;L;;;;;N;;;;;
+1BC09;DUPLOYAN LETTER V;Lo;0;L;;;;;N;;;;;
+1BC0A;DUPLOYAN LETTER G;Lo;0;L;;;;;N;;;;;
+1BC0B;DUPLOYAN LETTER R;Lo;0;L;;;;;N;;;;;
+1BC0C;DUPLOYAN LETTER P N;Lo;0;L;;;;;N;;;;;
+1BC0D;DUPLOYAN LETTER D S;Lo;0;L;;;;;N;;;;;
+1BC0E;DUPLOYAN LETTER F N;Lo;0;L;;;;;N;;;;;
+1BC0F;DUPLOYAN LETTER K M;Lo;0;L;;;;;N;;;;;
+1BC10;DUPLOYAN LETTER R S;Lo;0;L;;;;;N;;;;;
+1BC11;DUPLOYAN LETTER TH;Lo;0;L;;;;;N;;;;;
+1BC12;DUPLOYAN LETTER SLOAN DH;Lo;0;L;;;;;N;;;;;
+1BC13;DUPLOYAN LETTER DH;Lo;0;L;;;;;N;;;;;
+1BC14;DUPLOYAN LETTER KK;Lo;0;L;;;;;N;;;;;
+1BC15;DUPLOYAN LETTER SLOAN J;Lo;0;L;;;;;N;;;;;
+1BC16;DUPLOYAN LETTER HL;Lo;0;L;;;;;N;;;;;
+1BC17;DUPLOYAN LETTER LH;Lo;0;L;;;;;N;;;;;
+1BC18;DUPLOYAN LETTER RH;Lo;0;L;;;;;N;;;;;
+1BC19;DUPLOYAN LETTER M;Lo;0;L;;;;;N;;;;;
+1BC1A;DUPLOYAN LETTER N;Lo;0;L;;;;;N;;;;;
+1BC1B;DUPLOYAN LETTER J;Lo;0;L;;;;;N;;;;;
+1BC1C;DUPLOYAN LETTER S;Lo;0;L;;;;;N;;;;;
+1BC1D;DUPLOYAN LETTER M N;Lo;0;L;;;;;N;;;;;
+1BC1E;DUPLOYAN LETTER N M;Lo;0;L;;;;;N;;;;;
+1BC1F;DUPLOYAN LETTER J M;Lo;0;L;;;;;N;;;;;
+1BC20;DUPLOYAN LETTER S J;Lo;0;L;;;;;N;;;;;
+1BC21;DUPLOYAN LETTER M WITH DOT;Lo;0;L;;;;;N;;;;;
+1BC22;DUPLOYAN LETTER N WITH DOT;Lo;0;L;;;;;N;;;;;
+1BC23;DUPLOYAN LETTER J WITH DOT;Lo;0;L;;;;;N;;;;;
+1BC24;DUPLOYAN LETTER J WITH DOTS INSIDE AND ABOVE;Lo;0;L;;;;;N;;;;;
+1BC25;DUPLOYAN LETTER S WITH DOT;Lo;0;L;;;;;N;;;;;
+1BC26;DUPLOYAN LETTER S WITH DOT BELOW;Lo;0;L;;;;;N;;;;;
+1BC27;DUPLOYAN LETTER M S;Lo;0;L;;;;;N;;;;;
+1BC28;DUPLOYAN LETTER N S;Lo;0;L;;;;;N;;;;;
+1BC29;DUPLOYAN LETTER J S;Lo;0;L;;;;;N;;;;;
+1BC2A;DUPLOYAN LETTER S S;Lo;0;L;;;;;N;;;;;
+1BC2B;DUPLOYAN LETTER M N S;Lo;0;L;;;;;N;;;;;
+1BC2C;DUPLOYAN LETTER N M S;Lo;0;L;;;;;N;;;;;
+1BC2D;DUPLOYAN LETTER J M S;Lo;0;L;;;;;N;;;;;
+1BC2E;DUPLOYAN LETTER S J S;Lo;0;L;;;;;N;;;;;
+1BC2F;DUPLOYAN LETTER J S WITH DOT;Lo;0;L;;;;;N;;;;;
+1BC30;DUPLOYAN LETTER J N;Lo;0;L;;;;;N;;;;;
+1BC31;DUPLOYAN LETTER J N S;Lo;0;L;;;;;N;;;;;
+1BC32;DUPLOYAN LETTER S T;Lo;0;L;;;;;N;;;;;
+1BC33;DUPLOYAN LETTER S T R;Lo;0;L;;;;;N;;;;;
+1BC34;DUPLOYAN LETTER S P;Lo;0;L;;;;;N;;;;;
+1BC35;DUPLOYAN LETTER S P R;Lo;0;L;;;;;N;;;;;
+1BC36;DUPLOYAN LETTER T S;Lo;0;L;;;;;N;;;;;
+1BC37;DUPLOYAN LETTER T R S;Lo;0;L;;;;;N;;;;;
+1BC38;DUPLOYAN LETTER W;Lo;0;L;;;;;N;;;;;
+1BC39;DUPLOYAN LETTER WH;Lo;0;L;;;;;N;;;;;
+1BC3A;DUPLOYAN LETTER W R;Lo;0;L;;;;;N;;;;;
+1BC3B;DUPLOYAN LETTER S N;Lo;0;L;;;;;N;;;;;
+1BC3C;DUPLOYAN LETTER S M;Lo;0;L;;;;;N;;;;;
+1BC3D;DUPLOYAN LETTER K R S;Lo;0;L;;;;;N;;;;;
+1BC3E;DUPLOYAN LETTER G R S;Lo;0;L;;;;;N;;;;;
+1BC3F;DUPLOYAN LETTER S K;Lo;0;L;;;;;N;;;;;
+1BC40;DUPLOYAN LETTER S K R;Lo;0;L;;;;;N;;;;;
+1BC41;DUPLOYAN LETTER A;Lo;0;L;;;;;N;;;;;
+1BC42;DUPLOYAN LETTER SLOAN OW;Lo;0;L;;;;;N;;;;;
+1BC43;DUPLOYAN LETTER OA;Lo;0;L;;;;;N;;;;;
+1BC44;DUPLOYAN LETTER O;Lo;0;L;;;;;N;;;;;
+1BC45;DUPLOYAN LETTER AOU;Lo;0;L;;;;;N;;;;;
+1BC46;DUPLOYAN LETTER I;Lo;0;L;;;;;N;;;;;
+1BC47;DUPLOYAN LETTER E;Lo;0;L;;;;;N;;;;;
+1BC48;DUPLOYAN LETTER IE;Lo;0;L;;;;;N;;;;;
+1BC49;DUPLOYAN LETTER SHORT I;Lo;0;L;;;;;N;;;;;
+1BC4A;DUPLOYAN LETTER UI;Lo;0;L;;;;;N;;;;;
+1BC4B;DUPLOYAN LETTER EE;Lo;0;L;;;;;N;;;;;
+1BC4C;DUPLOYAN LETTER SLOAN EH;Lo;0;L;;;;;N;;;;;
+1BC4D;DUPLOYAN LETTER ROMANIAN I;Lo;0;L;;;;;N;;;;;
+1BC4E;DUPLOYAN LETTER SLOAN EE;Lo;0;L;;;;;N;;;;;
+1BC4F;DUPLOYAN LETTER LONG I;Lo;0;L;;;;;N;;;;;
+1BC50;DUPLOYAN LETTER YE;Lo;0;L;;;;;N;;;;;
+1BC51;DUPLOYAN LETTER U;Lo;0;L;;;;;N;;;;;
+1BC52;DUPLOYAN LETTER EU;Lo;0;L;;;;;N;;;;;
+1BC53;DUPLOYAN LETTER XW;Lo;0;L;;;;;N;;;;;
+1BC54;DUPLOYAN LETTER U N;Lo;0;L;;;;;N;;;;;
+1BC55;DUPLOYAN LETTER LONG U;Lo;0;L;;;;;N;;;;;
+1BC56;DUPLOYAN LETTER ROMANIAN U;Lo;0;L;;;;;N;;;;;
+1BC57;DUPLOYAN LETTER UH;Lo;0;L;;;;;N;;;;;
+1BC58;DUPLOYAN LETTER SLOAN U;Lo;0;L;;;;;N;;;;;
+1BC59;DUPLOYAN LETTER OOH;Lo;0;L;;;;;N;;;;;
+1BC5A;DUPLOYAN LETTER OW;Lo;0;L;;;;;N;;;;;
+1BC5B;DUPLOYAN LETTER OU;Lo;0;L;;;;;N;;;;;
+1BC5C;DUPLOYAN LETTER WA;Lo;0;L;;;;;N;;;;;
+1BC5D;DUPLOYAN LETTER WO;Lo;0;L;;;;;N;;;;;
+1BC5E;DUPLOYAN LETTER WI;Lo;0;L;;;;;N;;;;;
+1BC5F;DUPLOYAN LETTER WEI;Lo;0;L;;;;;N;;;;;
+1BC60;DUPLOYAN LETTER WOW;Lo;0;L;;;;;N;;;;;
+1BC61;DUPLOYAN LETTER NASAL U;Lo;0;L;;;;;N;;;;;
+1BC62;DUPLOYAN LETTER NASAL O;Lo;0;L;;;;;N;;;;;
+1BC63;DUPLOYAN LETTER NASAL I;Lo;0;L;;;;;N;;;;;
+1BC64;DUPLOYAN LETTER NASAL A;Lo;0;L;;;;;N;;;;;
+1BC65;DUPLOYAN LETTER PERNIN AN;Lo;0;L;;;;;N;;;;;
+1BC66;DUPLOYAN LETTER PERNIN AM;Lo;0;L;;;;;N;;;;;
+1BC67;DUPLOYAN LETTER SLOAN EN;Lo;0;L;;;;;N;;;;;
+1BC68;DUPLOYAN LETTER SLOAN AN;Lo;0;L;;;;;N;;;;;
+1BC69;DUPLOYAN LETTER SLOAN ON;Lo;0;L;;;;;N;;;;;
+1BC6A;DUPLOYAN LETTER VOCALIC M;Lo;0;L;;;;;N;;;;;
+1BC70;DUPLOYAN AFFIX LEFT HORIZONTAL SECANT;Lo;0;L;;;;;N;;;;;
+1BC71;DUPLOYAN AFFIX MID HORIZONTAL SECANT;Lo;0;L;;;;;N;;;;;
+1BC72;DUPLOYAN AFFIX RIGHT HORIZONTAL SECANT;Lo;0;L;;;;;N;;;;;
+1BC73;DUPLOYAN AFFIX LOW VERTICAL SECANT;Lo;0;L;;;;;N;;;;;
+1BC74;DUPLOYAN AFFIX MID VERTICAL SECANT;Lo;0;L;;;;;N;;;;;
+1BC75;DUPLOYAN AFFIX HIGH VERTICAL SECANT;Lo;0;L;;;;;N;;;;;
+1BC76;DUPLOYAN AFFIX ATTACHED SECANT;Lo;0;L;;;;;N;;;;;
+1BC77;DUPLOYAN AFFIX ATTACHED LEFT-TO-RIGHT SECANT;Lo;0;L;;;;;N;;;;;
+1BC78;DUPLOYAN AFFIX ATTACHED TANGENT;Lo;0;L;;;;;N;;;;;
+1BC79;DUPLOYAN AFFIX ATTACHED TAIL;Lo;0;L;;;;;N;;;;;
+1BC7A;DUPLOYAN AFFIX ATTACHED E HOOK;Lo;0;L;;;;;N;;;;;
+1BC7B;DUPLOYAN AFFIX ATTACHED I HOOK;Lo;0;L;;;;;N;;;;;
+1BC7C;DUPLOYAN AFFIX ATTACHED TANGENT HOOK;Lo;0;L;;;;;N;;;;;
+1BC80;DUPLOYAN AFFIX HIGH ACUTE;Lo;0;L;;;;;N;;;;;
+1BC81;DUPLOYAN AFFIX HIGH TIGHT ACUTE;Lo;0;L;;;;;N;;;;;
+1BC82;DUPLOYAN AFFIX HIGH GRAVE;Lo;0;L;;;;;N;;;;;
+1BC83;DUPLOYAN AFFIX HIGH LONG GRAVE;Lo;0;L;;;;;N;;;;;
+1BC84;DUPLOYAN AFFIX HIGH DOT;Lo;0;L;;;;;N;;;;;
+1BC85;DUPLOYAN AFFIX HIGH CIRCLE;Lo;0;L;;;;;N;;;;;
+1BC86;DUPLOYAN AFFIX HIGH LINE;Lo;0;L;;;;;N;;;;;
+1BC87;DUPLOYAN AFFIX HIGH WAVE;Lo;0;L;;;;;N;;;;;
+1BC88;DUPLOYAN AFFIX HIGH VERTICAL;Lo;0;L;;;;;N;;;;;
+1BC90;DUPLOYAN AFFIX LOW ACUTE;Lo;0;L;;;;;N;;;;;
+1BC91;DUPLOYAN AFFIX LOW TIGHT ACUTE;Lo;0;L;;;;;N;;;;;
+1BC92;DUPLOYAN AFFIX LOW GRAVE;Lo;0;L;;;;;N;;;;;
+1BC93;DUPLOYAN AFFIX LOW LONG GRAVE;Lo;0;L;;;;;N;;;;;
+1BC94;DUPLOYAN AFFIX LOW DOT;Lo;0;L;;;;;N;;;;;
+1BC95;DUPLOYAN AFFIX LOW CIRCLE;Lo;0;L;;;;;N;;;;;
+1BC96;DUPLOYAN AFFIX LOW LINE;Lo;0;L;;;;;N;;;;;
+1BC97;DUPLOYAN AFFIX LOW WAVE;Lo;0;L;;;;;N;;;;;
+1BC98;DUPLOYAN AFFIX LOW VERTICAL;Lo;0;L;;;;;N;;;;;
+1BC99;DUPLOYAN AFFIX LOW ARROW;Lo;0;L;;;;;N;;;;;
+1BC9C;DUPLOYAN SIGN O WITH CROSS;So;0;L;;;;;N;;;;;
+1BC9D;DUPLOYAN THICK LETTER SELECTOR;Mn;0;NSM;;;;;N;;;;;
+1BC9E;DUPLOYAN DOUBLE MARK;Mn;1;NSM;;;;;N;;;;;
+1BC9F;DUPLOYAN PUNCTUATION CHINOOK FULL STOP;Po;0;L;;;;;N;;;;;
+1BCA0;SHORTHAND FORMAT LETTER OVERLAP;Cf;0;BN;;;;;N;;;;;
+1BCA1;SHORTHAND FORMAT CONTINUING OVERLAP;Cf;0;BN;;;;;N;;;;;
+1BCA2;SHORTHAND FORMAT DOWN STEP;Cf;0;BN;;;;;N;;;;;
+1BCA3;SHORTHAND FORMAT UP STEP;Cf;0;BN;;;;;N;;;;;
1D000;BYZANTINE MUSICAL SYMBOL PSILI;So;0;L;;;;;N;;;;;
1D001;BYZANTINE MUSICAL SYMBOL DASEIA;So;0;L;;;;;N;;;;;
1D002;BYZANTINE MUSICAL SYMBOL PERISPOMENI;So;0;L;;;;;N;;;;;
@@ -22169,6 +24251,219 @@
1D7FD;MATHEMATICAL MONOSPACE DIGIT SEVEN;Nd;0;EN;<font> 0037;7;7;7;N;;;;;
1D7FE;MATHEMATICAL MONOSPACE DIGIT EIGHT;Nd;0;EN;<font> 0038;8;8;8;N;;;;;
1D7FF;MATHEMATICAL MONOSPACE DIGIT NINE;Nd;0;EN;<font> 0039;9;9;9;N;;;;;
+1E800;MENDE KIKAKUI SYLLABLE M001 KI;Lo;0;R;;;;;N;;;;;
+1E801;MENDE KIKAKUI SYLLABLE M002 KA;Lo;0;R;;;;;N;;;;;
+1E802;MENDE KIKAKUI SYLLABLE M003 KU;Lo;0;R;;;;;N;;;;;
+1E803;MENDE KIKAKUI SYLLABLE M065 KEE;Lo;0;R;;;;;N;;;;;
+1E804;MENDE KIKAKUI SYLLABLE M095 KE;Lo;0;R;;;;;N;;;;;
+1E805;MENDE KIKAKUI SYLLABLE M076 KOO;Lo;0;R;;;;;N;;;;;
+1E806;MENDE KIKAKUI SYLLABLE M048 KO;Lo;0;R;;;;;N;;;;;
+1E807;MENDE KIKAKUI SYLLABLE M179 KUA;Lo;0;R;;;;;N;;;;;
+1E808;MENDE KIKAKUI SYLLABLE M004 WI;Lo;0;R;;;;;N;;;;;
+1E809;MENDE KIKAKUI SYLLABLE M005 WA;Lo;0;R;;;;;N;;;;;
+1E80A;MENDE KIKAKUI SYLLABLE M006 WU;Lo;0;R;;;;;N;;;;;
+1E80B;MENDE KIKAKUI SYLLABLE M126 WEE;Lo;0;R;;;;;N;;;;;
+1E80C;MENDE KIKAKUI SYLLABLE M118 WE;Lo;0;R;;;;;N;;;;;
+1E80D;MENDE KIKAKUI SYLLABLE M114 WOO;Lo;0;R;;;;;N;;;;;
+1E80E;MENDE KIKAKUI SYLLABLE M045 WO;Lo;0;R;;;;;N;;;;;
+1E80F;MENDE KIKAKUI SYLLABLE M194 WUI;Lo;0;R;;;;;N;;;;;
+1E810;MENDE KIKAKUI SYLLABLE M143 WEI;Lo;0;R;;;;;N;;;;;
+1E811;MENDE KIKAKUI SYLLABLE M061 WVI;Lo;0;R;;;;;N;;;;;
+1E812;MENDE KIKAKUI SYLLABLE M049 WVA;Lo;0;R;;;;;N;;;;;
+1E813;MENDE KIKAKUI SYLLABLE M139 WVE;Lo;0;R;;;;;N;;;;;
+1E814;MENDE KIKAKUI SYLLABLE M007 MIN;Lo;0;R;;;;;N;;;;;
+1E815;MENDE KIKAKUI SYLLABLE M008 MAN;Lo;0;R;;;;;N;;;;;
+1E816;MENDE KIKAKUI SYLLABLE M009 MUN;Lo;0;R;;;;;N;;;;;
+1E817;MENDE KIKAKUI SYLLABLE M059 MEN;Lo;0;R;;;;;N;;;;;
+1E818;MENDE KIKAKUI SYLLABLE M094 MON;Lo;0;R;;;;;N;;;;;
+1E819;MENDE KIKAKUI SYLLABLE M154 MUAN;Lo;0;R;;;;;N;;;;;
+1E81A;MENDE KIKAKUI SYLLABLE M189 MUEN;Lo;0;R;;;;;N;;;;;
+1E81B;MENDE KIKAKUI SYLLABLE M010 BI;Lo;0;R;;;;;N;;;;;
+1E81C;MENDE KIKAKUI SYLLABLE M011 BA;Lo;0;R;;;;;N;;;;;
+1E81D;MENDE KIKAKUI SYLLABLE M012 BU;Lo;0;R;;;;;N;;;;;
+1E81E;MENDE KIKAKUI SYLLABLE M150 BEE;Lo;0;R;;;;;N;;;;;
+1E81F;MENDE KIKAKUI SYLLABLE M097 BE;Lo;0;R;;;;;N;;;;;
+1E820;MENDE KIKAKUI SYLLABLE M103 BOO;Lo;0;R;;;;;N;;;;;
+1E821;MENDE KIKAKUI SYLLABLE M138 BO;Lo;0;R;;;;;N;;;;;
+1E822;MENDE KIKAKUI SYLLABLE M013 I;Lo;0;R;;;;;N;;;;;
+1E823;MENDE KIKAKUI SYLLABLE M014 A;Lo;0;R;;;;;N;;;;;
+1E824;MENDE KIKAKUI SYLLABLE M015 U;Lo;0;R;;;;;N;;;;;
+1E825;MENDE KIKAKUI SYLLABLE M163 EE;Lo;0;R;;;;;N;;;;;
+1E826;MENDE KIKAKUI SYLLABLE M100 E;Lo;0;R;;;;;N;;;;;
+1E827;MENDE KIKAKUI SYLLABLE M165 OO;Lo;0;R;;;;;N;;;;;
+1E828;MENDE KIKAKUI SYLLABLE M147 O;Lo;0;R;;;;;N;;;;;
+1E829;MENDE KIKAKUI SYLLABLE M137 EI;Lo;0;R;;;;;N;;;;;
+1E82A;MENDE KIKAKUI SYLLABLE M131 IN;Lo;0;R;;;;;N;;;;;
+1E82B;MENDE KIKAKUI SYLLABLE M135 IN;Lo;0;R;;;;;N;;;;;
+1E82C;MENDE KIKAKUI SYLLABLE M195 AN;Lo;0;R;;;;;N;;;;;
+1E82D;MENDE KIKAKUI SYLLABLE M178 EN;Lo;0;R;;;;;N;;;;;
+1E82E;MENDE KIKAKUI SYLLABLE M019 SI;Lo;0;R;;;;;N;;;;;
+1E82F;MENDE KIKAKUI SYLLABLE M020 SA;Lo;0;R;;;;;N;;;;;
+1E830;MENDE KIKAKUI SYLLABLE M021 SU;Lo;0;R;;;;;N;;;;;
+1E831;MENDE KIKAKUI SYLLABLE M162 SEE;Lo;0;R;;;;;N;;;;;
+1E832;MENDE KIKAKUI SYLLABLE M116 SE;Lo;0;R;;;;;N;;;;;
+1E833;MENDE KIKAKUI SYLLABLE M136 SOO;Lo;0;R;;;;;N;;;;;
+1E834;MENDE KIKAKUI SYLLABLE M079 SO;Lo;0;R;;;;;N;;;;;
+1E835;MENDE KIKAKUI SYLLABLE M196 SIA;Lo;0;R;;;;;N;;;;;
+1E836;MENDE KIKAKUI SYLLABLE M025 LI;Lo;0;R;;;;;N;;;;;
+1E837;MENDE KIKAKUI SYLLABLE M026 LA;Lo;0;R;;;;;N;;;;;
+1E838;MENDE KIKAKUI SYLLABLE M027 LU;Lo;0;R;;;;;N;;;;;
+1E839;MENDE KIKAKUI SYLLABLE M084 LEE;Lo;0;R;;;;;N;;;;;
+1E83A;MENDE KIKAKUI SYLLABLE M073 LE;Lo;0;R;;;;;N;;;;;
+1E83B;MENDE KIKAKUI SYLLABLE M054 LOO;Lo;0;R;;;;;N;;;;;
+1E83C;MENDE KIKAKUI SYLLABLE M153 LO;Lo;0;R;;;;;N;;;;;
+1E83D;MENDE KIKAKUI SYLLABLE M110 LONG LE;Lo;0;R;;;;;N;;;;;
+1E83E;MENDE KIKAKUI SYLLABLE M016 DI;Lo;0;R;;;;;N;;;;;
+1E83F;MENDE KIKAKUI SYLLABLE M017 DA;Lo;0;R;;;;;N;;;;;
+1E840;MENDE KIKAKUI SYLLABLE M018 DU;Lo;0;R;;;;;N;;;;;
+1E841;MENDE KIKAKUI SYLLABLE M089 DEE;Lo;0;R;;;;;N;;;;;
+1E842;MENDE KIKAKUI SYLLABLE M180 DOO;Lo;0;R;;;;;N;;;;;
+1E843;MENDE KIKAKUI SYLLABLE M181 DO;Lo;0;R;;;;;N;;;;;
+1E844;MENDE KIKAKUI SYLLABLE M022 TI;Lo;0;R;;;;;N;;;;;
+1E845;MENDE KIKAKUI SYLLABLE M023 TA;Lo;0;R;;;;;N;;;;;
+1E846;MENDE KIKAKUI SYLLABLE M024 TU;Lo;0;R;;;;;N;;;;;
+1E847;MENDE KIKAKUI SYLLABLE M091 TEE;Lo;0;R;;;;;N;;;;;
+1E848;MENDE KIKAKUI SYLLABLE M055 TE;Lo;0;R;;;;;N;;;;;
+1E849;MENDE KIKAKUI SYLLABLE M104 TOO;Lo;0;R;;;;;N;;;;;
+1E84A;MENDE KIKAKUI SYLLABLE M069 TO;Lo;0;R;;;;;N;;;;;
+1E84B;MENDE KIKAKUI SYLLABLE M028 JI;Lo;0;R;;;;;N;;;;;
+1E84C;MENDE KIKAKUI SYLLABLE M029 JA;Lo;0;R;;;;;N;;;;;
+1E84D;MENDE KIKAKUI SYLLABLE M030 JU;Lo;0;R;;;;;N;;;;;
+1E84E;MENDE KIKAKUI SYLLABLE M157 JEE;Lo;0;R;;;;;N;;;;;
+1E84F;MENDE KIKAKUI SYLLABLE M113 JE;Lo;0;R;;;;;N;;;;;
+1E850;MENDE KIKAKUI SYLLABLE M160 JOO;Lo;0;R;;;;;N;;;;;
+1E851;MENDE KIKAKUI SYLLABLE M063 JO;Lo;0;R;;;;;N;;;;;
+1E852;MENDE KIKAKUI SYLLABLE M175 LONG JO;Lo;0;R;;;;;N;;;;;
+1E853;MENDE KIKAKUI SYLLABLE M031 YI;Lo;0;R;;;;;N;;;;;
+1E854;MENDE KIKAKUI SYLLABLE M032 YA;Lo;0;R;;;;;N;;;;;
+1E855;MENDE KIKAKUI SYLLABLE M033 YU;Lo;0;R;;;;;N;;;;;
+1E856;MENDE KIKAKUI SYLLABLE M109 YEE;Lo;0;R;;;;;N;;;;;
+1E857;MENDE KIKAKUI SYLLABLE M080 YE;Lo;0;R;;;;;N;;;;;
+1E858;MENDE KIKAKUI SYLLABLE M141 YOO;Lo;0;R;;;;;N;;;;;
+1E859;MENDE KIKAKUI SYLLABLE M121 YO;Lo;0;R;;;;;N;;;;;
+1E85A;MENDE KIKAKUI SYLLABLE M034 FI;Lo;0;R;;;;;N;;;;;
+1E85B;MENDE KIKAKUI SYLLABLE M035 FA;Lo;0;R;;;;;N;;;;;
+1E85C;MENDE KIKAKUI SYLLABLE M036 FU;Lo;0;R;;;;;N;;;;;
+1E85D;MENDE KIKAKUI SYLLABLE M078 FEE;Lo;0;R;;;;;N;;;;;
+1E85E;MENDE KIKAKUI SYLLABLE M075 FE;Lo;0;R;;;;;N;;;;;
+1E85F;MENDE KIKAKUI SYLLABLE M133 FOO;Lo;0;R;;;;;N;;;;;
+1E860;MENDE KIKAKUI SYLLABLE M088 FO;Lo;0;R;;;;;N;;;;;
+1E861;MENDE KIKAKUI SYLLABLE M197 FUA;Lo;0;R;;;;;N;;;;;
+1E862;MENDE KIKAKUI SYLLABLE M101 FAN;Lo;0;R;;;;;N;;;;;
+1E863;MENDE KIKAKUI SYLLABLE M037 NIN;Lo;0;R;;;;;N;;;;;
+1E864;MENDE KIKAKUI SYLLABLE M038 NAN;Lo;0;R;;;;;N;;;;;
+1E865;MENDE KIKAKUI SYLLABLE M039 NUN;Lo;0;R;;;;;N;;;;;
+1E866;MENDE KIKAKUI SYLLABLE M117 NEN;Lo;0;R;;;;;N;;;;;
+1E867;MENDE KIKAKUI SYLLABLE M169 NON;Lo;0;R;;;;;N;;;;;
+1E868;MENDE KIKAKUI SYLLABLE M176 HI;Lo;0;R;;;;;N;;;;;
+1E869;MENDE KIKAKUI SYLLABLE M041 HA;Lo;0;R;;;;;N;;;;;
+1E86A;MENDE KIKAKUI SYLLABLE M186 HU;Lo;0;R;;;;;N;;;;;
+1E86B;MENDE KIKAKUI SYLLABLE M040 HEE;Lo;0;R;;;;;N;;;;;
+1E86C;MENDE KIKAKUI SYLLABLE M096 HE;Lo;0;R;;;;;N;;;;;
+1E86D;MENDE KIKAKUI SYLLABLE M042 HOO;Lo;0;R;;;;;N;;;;;
+1E86E;MENDE KIKAKUI SYLLABLE M140 HO;Lo;0;R;;;;;N;;;;;
+1E86F;MENDE KIKAKUI SYLLABLE M083 HEEI;Lo;0;R;;;;;N;;;;;
+1E870;MENDE KIKAKUI SYLLABLE M128 HOOU;Lo;0;R;;;;;N;;;;;
+1E871;MENDE KIKAKUI SYLLABLE M053 HIN;Lo;0;R;;;;;N;;;;;
+1E872;MENDE KIKAKUI SYLLABLE M130 HAN;Lo;0;R;;;;;N;;;;;
+1E873;MENDE KIKAKUI SYLLABLE M087 HUN;Lo;0;R;;;;;N;;;;;
+1E874;MENDE KIKAKUI SYLLABLE M052 HEN;Lo;0;R;;;;;N;;;;;
+1E875;MENDE KIKAKUI SYLLABLE M193 HON;Lo;0;R;;;;;N;;;;;
+1E876;MENDE KIKAKUI SYLLABLE M046 HUAN;Lo;0;R;;;;;N;;;;;
+1E877;MENDE KIKAKUI SYLLABLE M090 NGGI;Lo;0;R;;;;;N;;;;;
+1E878;MENDE KIKAKUI SYLLABLE M043 NGGA;Lo;0;R;;;;;N;;;;;
+1E879;MENDE KIKAKUI SYLLABLE M082 NGGU;Lo;0;R;;;;;N;;;;;
+1E87A;MENDE KIKAKUI SYLLABLE M115 NGGEE;Lo;0;R;;;;;N;;;;;
+1E87B;MENDE KIKAKUI SYLLABLE M146 NGGE;Lo;0;R;;;;;N;;;;;
+1E87C;MENDE KIKAKUI SYLLABLE M156 NGGOO;Lo;0;R;;;;;N;;;;;
+1E87D;MENDE KIKAKUI SYLLABLE M120 NGGO;Lo;0;R;;;;;N;;;;;
+1E87E;MENDE KIKAKUI SYLLABLE M159 NGGAA;Lo;0;R;;;;;N;;;;;
+1E87F;MENDE KIKAKUI SYLLABLE M127 NGGUA;Lo;0;R;;;;;N;;;;;
+1E880;MENDE KIKAKUI SYLLABLE M086 LONG NGGE;Lo;0;R;;;;;N;;;;;
+1E881;MENDE KIKAKUI SYLLABLE M106 LONG NGGOO;Lo;0;R;;;;;N;;;;;
+1E882;MENDE KIKAKUI SYLLABLE M183 LONG NGGO;Lo;0;R;;;;;N;;;;;
+1E883;MENDE KIKAKUI SYLLABLE M155 GI;Lo;0;R;;;;;N;;;;;
+1E884;MENDE KIKAKUI SYLLABLE M111 GA;Lo;0;R;;;;;N;;;;;
+1E885;MENDE KIKAKUI SYLLABLE M168 GU;Lo;0;R;;;;;N;;;;;
+1E886;MENDE KIKAKUI SYLLABLE M190 GEE;Lo;0;R;;;;;N;;;;;
+1E887;MENDE KIKAKUI SYLLABLE M166 GUEI;Lo;0;R;;;;;N;;;;;
+1E888;MENDE KIKAKUI SYLLABLE M167 GUAN;Lo;0;R;;;;;N;;;;;
+1E889;MENDE KIKAKUI SYLLABLE M184 NGEN;Lo;0;R;;;;;N;;;;;
+1E88A;MENDE KIKAKUI SYLLABLE M057 NGON;Lo;0;R;;;;;N;;;;;
+1E88B;MENDE KIKAKUI SYLLABLE M177 NGUAN;Lo;0;R;;;;;N;;;;;
+1E88C;MENDE KIKAKUI SYLLABLE M068 PI;Lo;0;R;;;;;N;;;;;
+1E88D;MENDE KIKAKUI SYLLABLE M099 PA;Lo;0;R;;;;;N;;;;;
+1E88E;MENDE KIKAKUI SYLLABLE M050 PU;Lo;0;R;;;;;N;;;;;
+1E88F;MENDE KIKAKUI SYLLABLE M081 PEE;Lo;0;R;;;;;N;;;;;
+1E890;MENDE KIKAKUI SYLLABLE M051 PE;Lo;0;R;;;;;N;;;;;
+1E891;MENDE KIKAKUI SYLLABLE M102 POO;Lo;0;R;;;;;N;;;;;
+1E892;MENDE KIKAKUI SYLLABLE M066 PO;Lo;0;R;;;;;N;;;;;
+1E893;MENDE KIKAKUI SYLLABLE M145 MBI;Lo;0;R;;;;;N;;;;;
+1E894;MENDE KIKAKUI SYLLABLE M062 MBA;Lo;0;R;;;;;N;;;;;
+1E895;MENDE KIKAKUI SYLLABLE M122 MBU;Lo;0;R;;;;;N;;;;;
+1E896;MENDE KIKAKUI SYLLABLE M047 MBEE;Lo;0;R;;;;;N;;;;;
+1E897;MENDE KIKAKUI SYLLABLE M188 MBEE;Lo;0;R;;;;;N;;;;;
+1E898;MENDE KIKAKUI SYLLABLE M072 MBE;Lo;0;R;;;;;N;;;;;
+1E899;MENDE KIKAKUI SYLLABLE M172 MBOO;Lo;0;R;;;;;N;;;;;
+1E89A;MENDE KIKAKUI SYLLABLE M174 MBO;Lo;0;R;;;;;N;;;;;
+1E89B;MENDE KIKAKUI SYLLABLE M187 MBUU;Lo;0;R;;;;;N;;;;;
+1E89C;MENDE KIKAKUI SYLLABLE M161 LONG MBE;Lo;0;R;;;;;N;;;;;
+1E89D;MENDE KIKAKUI SYLLABLE M105 LONG MBOO;Lo;0;R;;;;;N;;;;;
+1E89E;MENDE KIKAKUI SYLLABLE M142 LONG MBO;Lo;0;R;;;;;N;;;;;
+1E89F;MENDE KIKAKUI SYLLABLE M132 KPI;Lo;0;R;;;;;N;;;;;
+1E8A0;MENDE KIKAKUI SYLLABLE M092 KPA;Lo;0;R;;;;;N;;;;;
+1E8A1;MENDE KIKAKUI SYLLABLE M074 KPU;Lo;0;R;;;;;N;;;;;
+1E8A2;MENDE KIKAKUI SYLLABLE M044 KPEE;Lo;0;R;;;;;N;;;;;
+1E8A3;MENDE KIKAKUI SYLLABLE M108 KPE;Lo;0;R;;;;;N;;;;;
+1E8A4;MENDE KIKAKUI SYLLABLE M112 KPOO;Lo;0;R;;;;;N;;;;;
+1E8A5;MENDE KIKAKUI SYLLABLE M158 KPO;Lo;0;R;;;;;N;;;;;
+1E8A6;MENDE KIKAKUI SYLLABLE M124 GBI;Lo;0;R;;;;;N;;;;;
+1E8A7;MENDE KIKAKUI SYLLABLE M056 GBA;Lo;0;R;;;;;N;;;;;
+1E8A8;MENDE KIKAKUI SYLLABLE M148 GBU;Lo;0;R;;;;;N;;;;;
+1E8A9;MENDE KIKAKUI SYLLABLE M093 GBEE;Lo;0;R;;;;;N;;;;;
+1E8AA;MENDE KIKAKUI SYLLABLE M107 GBE;Lo;0;R;;;;;N;;;;;
+1E8AB;MENDE KIKAKUI SYLLABLE M071 GBOO;Lo;0;R;;;;;N;;;;;
+1E8AC;MENDE KIKAKUI SYLLABLE M070 GBO;Lo;0;R;;;;;N;;;;;
+1E8AD;MENDE KIKAKUI SYLLABLE M171 RA;Lo;0;R;;;;;N;;;;;
+1E8AE;MENDE KIKAKUI SYLLABLE M123 NDI;Lo;0;R;;;;;N;;;;;
+1E8AF;MENDE KIKAKUI SYLLABLE M129 NDA;Lo;0;R;;;;;N;;;;;
+1E8B0;MENDE KIKAKUI SYLLABLE M125 NDU;Lo;0;R;;;;;N;;;;;
+1E8B1;MENDE KIKAKUI SYLLABLE M191 NDEE;Lo;0;R;;;;;N;;;;;
+1E8B2;MENDE KIKAKUI SYLLABLE M119 NDE;Lo;0;R;;;;;N;;;;;
+1E8B3;MENDE KIKAKUI SYLLABLE M067 NDOO;Lo;0;R;;;;;N;;;;;
+1E8B4;MENDE KIKAKUI SYLLABLE M064 NDO;Lo;0;R;;;;;N;;;;;
+1E8B5;MENDE KIKAKUI SYLLABLE M152 NJA;Lo;0;R;;;;;N;;;;;
+1E8B6;MENDE KIKAKUI SYLLABLE M192 NJU;Lo;0;R;;;;;N;;;;;
+1E8B7;MENDE KIKAKUI SYLLABLE M149 NJEE;Lo;0;R;;;;;N;;;;;
+1E8B8;MENDE KIKAKUI SYLLABLE M134 NJOO;Lo;0;R;;;;;N;;;;;
+1E8B9;MENDE KIKAKUI SYLLABLE M182 VI;Lo;0;R;;;;;N;;;;;
+1E8BA;MENDE KIKAKUI SYLLABLE M185 VA;Lo;0;R;;;;;N;;;;;
+1E8BB;MENDE KIKAKUI SYLLABLE M151 VU;Lo;0;R;;;;;N;;;;;
+1E8BC;MENDE KIKAKUI SYLLABLE M173 VEE;Lo;0;R;;;;;N;;;;;
+1E8BD;MENDE KIKAKUI SYLLABLE M085 VE;Lo;0;R;;;;;N;;;;;
+1E8BE;MENDE KIKAKUI SYLLABLE M144 VOO;Lo;0;R;;;;;N;;;;;
+1E8BF;MENDE KIKAKUI SYLLABLE M077 VO;Lo;0;R;;;;;N;;;;;
+1E8C0;MENDE KIKAKUI SYLLABLE M164 NYIN;Lo;0;R;;;;;N;;;;;
+1E8C1;MENDE KIKAKUI SYLLABLE M058 NYAN;Lo;0;R;;;;;N;;;;;
+1E8C2;MENDE KIKAKUI SYLLABLE M170 NYUN;Lo;0;R;;;;;N;;;;;
+1E8C3;MENDE KIKAKUI SYLLABLE M098 NYEN;Lo;0;R;;;;;N;;;;;
+1E8C4;MENDE KIKAKUI SYLLABLE M060 NYON;Lo;0;R;;;;;N;;;;;
+1E8C7;MENDE KIKAKUI DIGIT ONE;No;0;R;;;;1;N;;;;;
+1E8C8;MENDE KIKAKUI DIGIT TWO;No;0;R;;;;2;N;;;;;
+1E8C9;MENDE KIKAKUI DIGIT THREE;No;0;R;;;;3;N;;;;;
+1E8CA;MENDE KIKAKUI DIGIT FOUR;No;0;R;;;;4;N;;;;;
+1E8CB;MENDE KIKAKUI DIGIT FIVE;No;0;R;;;;5;N;;;;;
+1E8CC;MENDE KIKAKUI DIGIT SIX;No;0;R;;;;6;N;;;;;
+1E8CD;MENDE KIKAKUI DIGIT SEVEN;No;0;R;;;;7;N;;;;;
+1E8CE;MENDE KIKAKUI DIGIT EIGHT;No;0;R;;;;8;N;;;;;
+1E8CF;MENDE KIKAKUI DIGIT NINE;No;0;R;;;;9;N;;;;;
+1E8D0;MENDE KIKAKUI COMBINING NUMBER TEENS;Mn;220;NSM;;;;;N;;;;;
+1E8D1;MENDE KIKAKUI COMBINING NUMBER TENS;Mn;220;NSM;;;;;N;;;;;
+1E8D2;MENDE KIKAKUI COMBINING NUMBER HUNDREDS;Mn;220;NSM;;;;;N;;;;;
+1E8D3;MENDE KIKAKUI COMBINING NUMBER THOUSANDS;Mn;220;NSM;;;;;N;;;;;
+1E8D4;MENDE KIKAKUI COMBINING NUMBER TEN THOUSANDS;Mn;220;NSM;;;;;N;;;;;
+1E8D5;MENDE KIKAKUI COMBINING NUMBER HUNDRED THOUSANDS;Mn;220;NSM;;;;;N;;;;;
+1E8D6;MENDE KIKAKUI COMBINING NUMBER MILLIONS;Mn;220;NSM;;;;;N;;;;;
1EE00;ARABIC MATHEMATICAL ALEF;Lo;0;AL;<font> 0627;;;;N;;;;;
1EE01;ARABIC MATHEMATICAL BEH;Lo;0;AL;<font> 0628;;;;N;;;;;
1EE02;ARABIC MATHEMATICAL JEEM;Lo;0;AL;<font> 062C;;;;N;;;;;
@@ -22485,6 +24780,7 @@
1F0BC;PLAYING CARD KNIGHT OF HEARTS;So;0;ON;;;;;N;;;;;
1F0BD;PLAYING CARD QUEEN OF HEARTS;So;0;ON;;;;;N;;;;;
1F0BE;PLAYING CARD KING OF HEARTS;So;0;ON;;;;;N;;;;;
+1F0BF;PLAYING CARD RED JOKER;So;0;ON;;;;;N;;;;;
1F0C1;PLAYING CARD ACE OF DIAMONDS;So;0;ON;;;;;N;;;;;
1F0C2;PLAYING CARD TWO OF DIAMONDS;So;0;ON;;;;;N;;;;;
1F0C3;PLAYING CARD THREE OF DIAMONDS;So;0;ON;;;;;N;;;;;
@@ -22515,6 +24811,28 @@
1F0DD;PLAYING CARD QUEEN OF CLUBS;So;0;ON;;;;;N;;;;;
1F0DE;PLAYING CARD KING OF CLUBS;So;0;ON;;;;;N;;;;;
1F0DF;PLAYING CARD WHITE JOKER;So;0;ON;;;;;N;;;;;
+1F0E0;PLAYING CARD FOOL;So;0;ON;;;;;N;;;;;
+1F0E1;PLAYING CARD TRUMP-1;So;0;ON;;;;;N;;;;;
+1F0E2;PLAYING CARD TRUMP-2;So;0;ON;;;;;N;;;;;
+1F0E3;PLAYING CARD TRUMP-3;So;0;ON;;;;;N;;;;;
+1F0E4;PLAYING CARD TRUMP-4;So;0;ON;;;;;N;;;;;
+1F0E5;PLAYING CARD TRUMP-5;So;0;ON;;;;;N;;;;;
+1F0E6;PLAYING CARD TRUMP-6;So;0;ON;;;;;N;;;;;
+1F0E7;PLAYING CARD TRUMP-7;So;0;ON;;;;;N;;;;;
+1F0E8;PLAYING CARD TRUMP-8;So;0;ON;;;;;N;;;;;
+1F0E9;PLAYING CARD TRUMP-9;So;0;ON;;;;;N;;;;;
+1F0EA;PLAYING CARD TRUMP-10;So;0;ON;;;;;N;;;;;
+1F0EB;PLAYING CARD TRUMP-11;So;0;ON;;;;;N;;;;;
+1F0EC;PLAYING CARD TRUMP-12;So;0;ON;;;;;N;;;;;
+1F0ED;PLAYING CARD TRUMP-13;So;0;ON;;;;;N;;;;;
+1F0EE;PLAYING CARD TRUMP-14;So;0;ON;;;;;N;;;;;
+1F0EF;PLAYING CARD TRUMP-15;So;0;ON;;;;;N;;;;;
+1F0F0;PLAYING CARD TRUMP-16;So;0;ON;;;;;N;;;;;
+1F0F1;PLAYING CARD TRUMP-17;So;0;ON;;;;;N;;;;;
+1F0F2;PLAYING CARD TRUMP-18;So;0;ON;;;;;N;;;;;
+1F0F3;PLAYING CARD TRUMP-19;So;0;ON;;;;;N;;;;;
+1F0F4;PLAYING CARD TRUMP-20;So;0;ON;;;;;N;;;;;
+1F0F5;PLAYING CARD TRUMP-21;So;0;ON;;;;;N;;;;;
1F100;DIGIT ZERO FULL STOP;No;0;EN;<compat> 0030 002E;;0;0;N;;;;;
1F101;DIGIT ZERO COMMA;No;0;EN;<compat> 0030 002C;;0;0;N;;;;;
1F102;DIGIT ONE COMMA;No;0;EN;<compat> 0031 002C;;1;1;N;;;;;
@@ -22526,6 +24844,8 @@
1F108;DIGIT SEVEN COMMA;No;0;EN;<compat> 0037 002C;;7;7;N;;;;;
1F109;DIGIT EIGHT COMMA;No;0;EN;<compat> 0038 002C;;8;8;N;;;;;
1F10A;DIGIT NINE COMMA;No;0;EN;<compat> 0039 002C;;9;9;N;;;;;
+1F10B;DINGBAT CIRCLED SANS-SERIF DIGIT ZERO;No;0;ON;;;;0;N;;;;;
+1F10C;DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO;No;0;ON;;;;0;N;;;;;
1F110;PARENTHESIZED LATIN CAPITAL LETTER A;So;0;L;<compat> 0028 0041 0029;;;;N;;;;;
1F111;PARENTHESIZED LATIN CAPITAL LETTER B;So;0;L;<compat> 0028 0042 0029;;;;N;;;;;
1F112;PARENTHESIZED LATIN CAPITAL LETTER C;So;0;L;<compat> 0028 0043 0029;;;;N;;;;;
@@ -22776,12 +25096,25 @@
1F31E;SUN WITH FACE;So;0;ON;;;;;N;;;;;
1F31F;GLOWING STAR;So;0;ON;;;;;N;;;;;
1F320;SHOOTING STAR;So;0;ON;;;;;N;;;;;
+1F321;THERMOMETER;So;0;ON;;;;;N;;;;;
+1F322;BLACK DROPLET;So;0;ON;;;;;N;;;;;
+1F323;WHITE SUN;So;0;ON;;;;;N;;;;;
+1F324;WHITE SUN WITH SMALL CLOUD;So;0;ON;;;;;N;;;;;
+1F325;WHITE SUN BEHIND CLOUD;So;0;ON;;;;;N;;;;;
+1F326;WHITE SUN BEHIND CLOUD WITH RAIN;So;0;ON;;;;;N;;;;;
+1F327;CLOUD WITH RAIN;So;0;ON;;;;;N;;;;;
+1F328;CLOUD WITH SNOW;So;0;ON;;;;;N;;;;;
+1F329;CLOUD WITH LIGHTNING;So;0;ON;;;;;N;;;;;
+1F32A;CLOUD WITH TORNADO;So;0;ON;;;;;N;;;;;
+1F32B;FOG;So;0;ON;;;;;N;;;;;
+1F32C;WIND BLOWING FACE;So;0;ON;;;;;N;;;;;
1F330;CHESTNUT;So;0;ON;;;;;N;;;;;
1F331;SEEDLING;So;0;ON;;;;;N;;;;;
1F332;EVERGREEN TREE;So;0;ON;;;;;N;;;;;
1F333;DECIDUOUS TREE;So;0;ON;;;;;N;;;;;
1F334;PALM TREE;So;0;ON;;;;;N;;;;;
1F335;CACTUS;So;0;ON;;;;;N;;;;;
+1F336;HOT PEPPER;So;0;ON;;;;;N;;;;;
1F337;TULIP;So;0;ON;;;;;N;;;;;
1F338;CHERRY BLOSSOM;So;0;ON;;;;;N;;;;;
1F339;ROSE;So;0;ON;;;;;N;;;;;
@@ -22852,6 +25185,7 @@
1F37A;BEER MUG;So;0;ON;;;;;N;;;;;
1F37B;CLINKING BEER MUGS;So;0;ON;;;;;N;;;;;
1F37C;BABY BOTTLE;So;0;ON;;;;;N;;;;;
+1F37D;FORK AND KNIFE WITH PLATE;So;0;ON;;;;;N;;;;;
1F380;RIBBON;So;0;ON;;;;;N;;;;;
1F381;WRAPPED PRESENT;So;0;ON;;;;;N;;;;;
1F382;BIRTHDAY CAKE;So;0;ON;;;;;N;;;;;
@@ -22872,6 +25206,18 @@
1F391;MOON VIEWING CEREMONY;So;0;ON;;;;;N;;;;;
1F392;SCHOOL SATCHEL;So;0;ON;;;;;N;;;;;
1F393;GRADUATION CAP;So;0;ON;;;;;N;;;;;
+1F394;HEART WITH TIP ON THE LEFT;So;0;ON;;;;;N;;;;;
+1F395;BOUQUET OF FLOWERS;So;0;ON;;;;;N;;;;;
+1F396;MILITARY MEDAL;So;0;ON;;;;;N;;;;;
+1F397;REMINDER RIBBON;So;0;ON;;;;;N;;;;;
+1F398;MUSICAL KEYBOARD WITH JACKS;So;0;ON;;;;;N;;;;;
+1F399;STUDIO MICROPHONE;So;0;ON;;;;;N;;;;;
+1F39A;LEVEL SLIDER;So;0;ON;;;;;N;;;;;
+1F39B;CONTROL KNOBS;So;0;ON;;;;;N;;;;;
+1F39C;BEAMED ASCENDING MUSICAL NOTES;So;0;ON;;;;;N;;;;;
+1F39D;BEAMED DESCENDING MUSICAL NOTES;So;0;ON;;;;;N;;;;;
+1F39E;FILM FRAMES;So;0;ON;;;;;N;;;;;
+1F39F;ADMISSION TICKETS;So;0;ON;;;;;N;;;;;
1F3A0;CAROUSEL HORSE;So;0;ON;;;;;N;;;;;
1F3A1;FERRIS WHEEL;So;0;ON;;;;;N;;;;;
1F3A2;ROLLER COASTER;So;0;ON;;;;;N;;;;;
@@ -22909,11 +25255,28 @@
1F3C2;SNOWBOARDER;So;0;ON;;;;;N;;;;;
1F3C3;RUNNER;So;0;ON;;;;;N;;;;;
1F3C4;SURFER;So;0;ON;;;;;N;;;;;
+1F3C5;SPORTS MEDAL;So;0;ON;;;;;N;;;;;
1F3C6;TROPHY;So;0;ON;;;;;N;;;;;
1F3C7;HORSE RACING;So;0;ON;;;;;N;;;;;
1F3C8;AMERICAN FOOTBALL;So;0;ON;;;;;N;;;;;
1F3C9;RUGBY FOOTBALL;So;0;ON;;;;;N;;;;;
1F3CA;SWIMMER;So;0;ON;;;;;N;;;;;
+1F3CB;WEIGHT LIFTER;So;0;ON;;;;;N;;;;;
+1F3CC;GOLFER;So;0;ON;;;;;N;;;;;
+1F3CD;RACING MOTORCYCLE;So;0;ON;;;;;N;;;;;
+1F3CE;RACING CAR;So;0;ON;;;;;N;;;;;
+1F3D4;SNOW CAPPED MOUNTAIN;So;0;ON;;;;;N;;;;;
+1F3D5;CAMPING;So;0;ON;;;;;N;;;;;
+1F3D6;BEACH WITH UMBRELLA;So;0;ON;;;;;N;;;;;
+1F3D7;BUILDING CONSTRUCTION;So;0;ON;;;;;N;;;;;
+1F3D8;HOUSE BUILDINGS;So;0;ON;;;;;N;;;;;
+1F3D9;CITYSCAPE;So;0;ON;;;;;N;;;;;
+1F3DA;DERELICT HOUSE BUILDING;So;0;ON;;;;;N;;;;;
+1F3DB;CLASSICAL BUILDING;So;0;ON;;;;;N;;;;;
+1F3DC;DESERT;So;0;ON;;;;;N;;;;;
+1F3DD;DESERT ISLAND;So;0;ON;;;;;N;;;;;
+1F3DE;NATIONAL PARK;So;0;ON;;;;;N;;;;;
+1F3DF;STADIUM;So;0;ON;;;;;N;;;;;
1F3E0;HOUSE BUILDING;So;0;ON;;;;;N;;;;;
1F3E1;HOUSE WITH GARDEN;So;0;ON;;;;;N;;;;;
1F3E2;OFFICE BUILDING;So;0;ON;;;;;N;;;;;
@@ -22931,6 +25294,13 @@
1F3EE;IZAKAYA LANTERN;So;0;ON;;;;;N;;;;;
1F3EF;JAPANESE CASTLE;So;0;ON;;;;;N;;;;;
1F3F0;EUROPEAN CASTLE;So;0;ON;;;;;N;;;;;
+1F3F1;WHITE PENNANT;So;0;ON;;;;;N;;;;;
+1F3F2;BLACK PENNANT;So;0;ON;;;;;N;;;;;
+1F3F3;WAVING WHITE FLAG;So;0;ON;;;;;N;;;;;
+1F3F4;WAVING BLACK FLAG;So;0;ON;;;;;N;;;;;
+1F3F5;ROSETTE;So;0;ON;;;;;N;;;;;
+1F3F6;BLACK ROSETTE;So;0;ON;;;;;N;;;;;
+1F3F7;LABEL;So;0;ON;;;;;N;;;;;
1F400;RAT;So;0;ON;;;;;N;;;;;
1F401;MOUSE;So;0;ON;;;;;N;;;;;
1F402;OX;So;0;ON;;;;;N;;;;;
@@ -22994,7 +25364,9 @@
1F43C;PANDA FACE;So;0;ON;;;;;N;;;;;
1F43D;PIG NOSE;So;0;ON;;;;;N;;;;;
1F43E;PAW PRINTS;So;0;ON;;;;;N;;;;;
+1F43F;CHIPMUNK;So;0;ON;;;;;N;;;;;
1F440;EYES;So;0;ON;;;;;N;;;;;
+1F441;EYE;So;0;ON;;;;;N;;;;;
1F442;EAR;So;0;ON;;;;;N;;;;;
1F443;NOSE;So;0;ON;;;;;N;;;;;
1F444;MOUTH;So;0;ON;;;;;N;;;;;
@@ -23177,10 +25549,13 @@
1F4F5;NO MOBILE PHONES;So;0;ON;;;;;N;;;;;
1F4F6;ANTENNA WITH BARS;So;0;ON;;;;;N;;;;;
1F4F7;CAMERA;So;0;ON;;;;;N;;;;;
+1F4F8;CAMERA WITH FLASH;So;0;ON;;;;;N;;;;;
1F4F9;VIDEO CAMERA;So;0;ON;;;;;N;;;;;
1F4FA;TELEVISION;So;0;ON;;;;;N;;;;;
1F4FB;RADIO;So;0;ON;;;;;N;;;;;
1F4FC;VIDEOCASSETTE;So;0;ON;;;;;N;;;;;
+1F4FD;FILM PROJECTOR;So;0;ON;;;;;N;;;;;
+1F4FE;PORTABLE STEREO;So;0;ON;;;;;N;;;;;
1F500;TWISTED RIGHTWARDS ARROWS;So;0;ON;;;;;N;;;;;
1F501;CLOCKWISE RIGHTWARDS AND LEFTWARDS OPEN CIRCLE ARROWS;So;0;ON;;;;;N;;;;;
1F502;CLOCKWISE RIGHTWARDS AND LEFTWARDS OPEN CIRCLE ARROWS WITH CIRCLED ONE OVERLAY;So;0;ON;;;;;N;;;;;
@@ -23243,10 +25618,19 @@
1F53B;DOWN-POINTING RED TRIANGLE;So;0;ON;;;;;N;;;;;
1F53C;UP-POINTING SMALL RED TRIANGLE;So;0;ON;;;;;N;;;;;
1F53D;DOWN-POINTING SMALL RED TRIANGLE;So;0;ON;;;;;N;;;;;
+1F53E;LOWER RIGHT SHADOWED WHITE CIRCLE;So;0;ON;;;;;N;;;;;
+1F53F;UPPER RIGHT SHADOWED WHITE CIRCLE;So;0;ON;;;;;N;;;;;
1F540;CIRCLED CROSS POMMEE;So;0;ON;;;;;N;;;;;
1F541;CROSS POMMEE WITH HALF-CIRCLE BELOW;So;0;ON;;;;;N;;;;;
1F542;CROSS POMMEE;So;0;ON;;;;;N;;;;;
1F543;NOTCHED LEFT SEMICIRCLE WITH THREE DOTS;So;0;ON;;;;;N;;;;;
+1F544;NOTCHED RIGHT SEMICIRCLE WITH THREE DOTS;So;0;ON;;;;;N;;;;;
+1F545;SYMBOL FOR MARKS CHAPTER;So;0;ON;;;;;N;;;;;
+1F546;WHITE LATIN CROSS;So;0;ON;;;;;N;;;;;
+1F547;HEAVY LATIN CROSS;So;0;ON;;;;;N;;;;;
+1F548;CELTIC CROSS;So;0;ON;;;;;N;;;;;
+1F549;OM SYMBOL;So;0;ON;;;;;N;;;;;
+1F54A;DOVE OF PEACE;So;0;ON;;;;;N;;;;;
1F550;CLOCK FACE ONE OCLOCK;So;0;ON;;;;;N;;;;;
1F551;CLOCK FACE TWO OCLOCK;So;0;ON;;;;;N;;;;;
1F552;CLOCK FACE THREE OCLOCK;So;0;ON;;;;;N;;;;;
@@ -23271,6 +25655,151 @@
1F565;CLOCK FACE TEN-THIRTY;So;0;ON;;;;;N;;;;;
1F566;CLOCK FACE ELEVEN-THIRTY;So;0;ON;;;;;N;;;;;
1F567;CLOCK FACE TWELVE-THIRTY;So;0;ON;;;;;N;;;;;
+1F568;RIGHT SPEAKER;So;0;ON;;;;;N;;;;;
+1F569;RIGHT SPEAKER WITH ONE SOUND WAVE;So;0;ON;;;;;N;;;;;
+1F56A;RIGHT SPEAKER WITH THREE SOUND WAVES;So;0;ON;;;;;N;;;;;
+1F56B;BULLHORN;So;0;ON;;;;;N;;;;;
+1F56C;BULLHORN WITH SOUND WAVES;So;0;ON;;;;;N;;;;;
+1F56D;RINGING BELL;So;0;ON;;;;;N;;;;;
+1F56E;BOOK;So;0;ON;;;;;N;;;;;
+1F56F;CANDLE;So;0;ON;;;;;N;;;;;
+1F570;MANTELPIECE CLOCK;So;0;ON;;;;;N;;;;;
+1F571;BLACK SKULL AND CROSSBONES;So;0;ON;;;;;N;;;;;
+1F572;NO PIRACY;So;0;ON;;;;;N;;;;;
+1F573;HOLE;So;0;ON;;;;;N;;;;;
+1F574;MAN IN BUSINESS SUIT LEVITATING;So;0;ON;;;;;N;;;;;
+1F575;SLEUTH OR SPY;So;0;ON;;;;;N;;;;;
+1F576;DARK SUNGLASSES;So;0;ON;;;;;N;;;;;
+1F577;SPIDER;So;0;ON;;;;;N;;;;;
+1F578;SPIDER WEB;So;0;ON;;;;;N;;;;;
+1F579;JOYSTICK;So;0;ON;;;;;N;;;;;
+1F57B;LEFT HAND TELEPHONE RECEIVER;So;0;ON;;;;;N;;;;;
+1F57C;TELEPHONE RECEIVER WITH PAGE;So;0;ON;;;;;N;;;;;
+1F57D;RIGHT HAND TELEPHONE RECEIVER;So;0;ON;;;;;N;;;;;
+1F57E;WHITE TOUCHTONE TELEPHONE;So;0;ON;;;;;N;;;;;
+1F57F;BLACK TOUCHTONE TELEPHONE;So;0;ON;;;;;N;;;;;
+1F580;TELEPHONE ON TOP OF MODEM;So;0;ON;;;;;N;;;;;
+1F581;CLAMSHELL MOBILE PHONE;So;0;ON;;;;;N;;;;;
+1F582;BACK OF ENVELOPE;So;0;ON;;;;;N;;;;;
+1F583;STAMPED ENVELOPE;So;0;ON;;;;;N;;;;;
+1F584;ENVELOPE WITH LIGHTNING;So;0;ON;;;;;N;;;;;
+1F585;FLYING ENVELOPE;So;0;ON;;;;;N;;;;;
+1F586;PEN OVER STAMPED ENVELOPE;So;0;ON;;;;;N;;;;;
+1F587;LINKED PAPERCLIPS;So;0;ON;;;;;N;;;;;
+1F588;BLACK PUSHPIN;So;0;ON;;;;;N;;;;;
+1F589;LOWER LEFT PENCIL;So;0;ON;;;;;N;;;;;
+1F58A;LOWER LEFT BALLPOINT PEN;So;0;ON;;;;;N;;;;;
+1F58B;LOWER LEFT FOUNTAIN PEN;So;0;ON;;;;;N;;;;;
+1F58C;LOWER LEFT PAINTBRUSH;So;0;ON;;;;;N;;;;;
+1F58D;LOWER LEFT CRAYON;So;0;ON;;;;;N;;;;;
+1F58E;LEFT WRITING HAND;So;0;ON;;;;;N;;;;;
+1F58F;TURNED OK HAND SIGN;So;0;ON;;;;;N;;;;;
+1F590;RAISED HAND WITH FINGERS SPLAYED;So;0;ON;;;;;N;;;;;
+1F591;REVERSED RAISED HAND WITH FINGERS SPLAYED;So;0;ON;;;;;N;;;;;
+1F592;REVERSED THUMBS UP SIGN;So;0;ON;;;;;N;;;;;
+1F593;REVERSED THUMBS DOWN SIGN;So;0;ON;;;;;N;;;;;
+1F594;REVERSED VICTORY HAND;So;0;ON;;;;;N;;;;;
+1F595;REVERSED HAND WITH MIDDLE FINGER EXTENDED;So;0;ON;;;;;N;;;;;
+1F596;RAISED HAND WITH PART BETWEEN MIDDLE AND RING FINGERS;So;0;ON;;;;;N;;;;;
+1F597;WHITE DOWN POINTING LEFT HAND INDEX;So;0;ON;;;;;N;;;;;
+1F598;SIDEWAYS WHITE LEFT POINTING INDEX;So;0;ON;;;;;N;;;;;
+1F599;SIDEWAYS WHITE RIGHT POINTING INDEX;So;0;ON;;;;;N;;;;;
+1F59A;SIDEWAYS BLACK LEFT POINTING INDEX;So;0;ON;;;;;N;;;;;
+1F59B;SIDEWAYS BLACK RIGHT POINTING INDEX;So;0;ON;;;;;N;;;;;
+1F59C;BLACK LEFT POINTING BACKHAND INDEX;So;0;ON;;;;;N;;;;;
+1F59D;BLACK RIGHT POINTING BACKHAND INDEX;So;0;ON;;;;;N;;;;;
+1F59E;SIDEWAYS WHITE UP POINTING INDEX;So;0;ON;;;;;N;;;;;
+1F59F;SIDEWAYS WHITE DOWN POINTING INDEX;So;0;ON;;;;;N;;;;;
+1F5A0;SIDEWAYS BLACK UP POINTING INDEX;So;0;ON;;;;;N;;;;;
+1F5A1;SIDEWAYS BLACK DOWN POINTING INDEX;So;0;ON;;;;;N;;;;;
+1F5A2;BLACK UP POINTING BACKHAND INDEX;So;0;ON;;;;;N;;;;;
+1F5A3;BLACK DOWN POINTING BACKHAND INDEX;So;0;ON;;;;;N;;;;;
+1F5A5;DESKTOP COMPUTER;So;0;ON;;;;;N;;;;;
+1F5A6;KEYBOARD AND MOUSE;So;0;ON;;;;;N;;;;;
+1F5A7;THREE NETWORKED COMPUTERS;So;0;ON;;;;;N;;;;;
+1F5A8;PRINTER;So;0;ON;;;;;N;;;;;
+1F5A9;POCKET CALCULATOR;So;0;ON;;;;;N;;;;;
+1F5AA;BLACK HARD SHELL FLOPPY DISK;So;0;ON;;;;;N;;;;;
+1F5AB;WHITE HARD SHELL FLOPPY DISK;So;0;ON;;;;;N;;;;;
+1F5AC;SOFT SHELL FLOPPY DISK;So;0;ON;;;;;N;;;;;
+1F5AD;TAPE CARTRIDGE;So;0;ON;;;;;N;;;;;
+1F5AE;WIRED KEYBOARD;So;0;ON;;;;;N;;;;;
+1F5AF;ONE BUTTON MOUSE;So;0;ON;;;;;N;;;;;
+1F5B0;TWO BUTTON MOUSE;So;0;ON;;;;;N;;;;;
+1F5B1;THREE BUTTON MOUSE;So;0;ON;;;;;N;;;;;
+1F5B2;TRACKBALL;So;0;ON;;;;;N;;;;;
+1F5B3;OLD PERSONAL COMPUTER;So;0;ON;;;;;N;;;;;
+1F5B4;HARD DISK;So;0;ON;;;;;N;;;;;
+1F5B5;SCREEN;So;0;ON;;;;;N;;;;;
+1F5B6;PRINTER ICON;So;0;ON;;;;;N;;;;;
+1F5B7;FAX ICON;So;0;ON;;;;;N;;;;;
+1F5B8;OPTICAL DISC ICON;So;0;ON;;;;;N;;;;;
+1F5B9;DOCUMENT WITH TEXT;So;0;ON;;;;;N;;;;;
+1F5BA;DOCUMENT WITH TEXT AND PICTURE;So;0;ON;;;;;N;;;;;
+1F5BB;DOCUMENT WITH PICTURE;So;0;ON;;;;;N;;;;;
+1F5BC;FRAME WITH PICTURE;So;0;ON;;;;;N;;;;;
+1F5BD;FRAME WITH TILES;So;0;ON;;;;;N;;;;;
+1F5BE;FRAME WITH AN X;So;0;ON;;;;;N;;;;;
+1F5BF;BLACK FOLDER;So;0;ON;;;;;N;;;;;
+1F5C0;FOLDER;So;0;ON;;;;;N;;;;;
+1F5C1;OPEN FOLDER;So;0;ON;;;;;N;;;;;
+1F5C2;CARD INDEX DIVIDERS;So;0;ON;;;;;N;;;;;
+1F5C3;CARD FILE BOX;So;0;ON;;;;;N;;;;;
+1F5C4;FILE CABINET;So;0;ON;;;;;N;;;;;
+1F5C5;EMPTY NOTE;So;0;ON;;;;;N;;;;;
+1F5C6;EMPTY NOTE PAGE;So;0;ON;;;;;N;;;;;
+1F5C7;EMPTY NOTE PAD;So;0;ON;;;;;N;;;;;
+1F5C8;NOTE;So;0;ON;;;;;N;;;;;
+1F5C9;NOTE PAGE;So;0;ON;;;;;N;;;;;
+1F5CA;NOTE PAD;So;0;ON;;;;;N;;;;;
+1F5CB;EMPTY DOCUMENT;So;0;ON;;;;;N;;;;;
+1F5CC;EMPTY PAGE;So;0;ON;;;;;N;;;;;
+1F5CD;EMPTY PAGES;So;0;ON;;;;;N;;;;;
+1F5CE;DOCUMENT;So;0;ON;;;;;N;;;;;
+1F5CF;PAGE;So;0;ON;;;;;N;;;;;
+1F5D0;PAGES;So;0;ON;;;;;N;;;;;
+1F5D1;WASTEBASKET;So;0;ON;;;;;N;;;;;
+1F5D2;SPIRAL NOTE PAD;So;0;ON;;;;;N;;;;;
+1F5D3;SPIRAL CALENDAR PAD;So;0;ON;;;;;N;;;;;
+1F5D4;DESKTOP WINDOW;So;0;ON;;;;;N;;;;;
+1F5D5;MINIMIZE;So;0;ON;;;;;N;;;;;
+1F5D6;MAXIMIZE;So;0;ON;;;;;N;;;;;
+1F5D7;OVERLAP;So;0;ON;;;;;N;;;;;
+1F5D8;CLOCKWISE RIGHT AND LEFT SEMICIRCLE ARROWS;So;0;ON;;;;;N;;;;;
+1F5D9;CANCELLATION X;So;0;ON;;;;;N;;;;;
+1F5DA;INCREASE FONT SIZE SYMBOL;So;0;ON;;;;;N;;;;;
+1F5DB;DECREASE FONT SIZE SYMBOL;So;0;ON;;;;;N;;;;;
+1F5DC;COMPRESSION;So;0;ON;;;;;N;;;;;
+1F5DD;OLD KEY;So;0;ON;;;;;N;;;;;
+1F5DE;ROLLED-UP NEWSPAPER;So;0;ON;;;;;N;;;;;
+1F5DF;PAGE WITH CIRCLED TEXT;So;0;ON;;;;;N;;;;;
+1F5E0;STOCK CHART;So;0;ON;;;;;N;;;;;
+1F5E1;DAGGER KNIFE;So;0;ON;;;;;N;;;;;
+1F5E2;LIPS;So;0;ON;;;;;N;;;;;
+1F5E3;SPEAKING HEAD IN SILHOUETTE;So;0;ON;;;;;N;;;;;
+1F5E4;THREE RAYS ABOVE;So;0;ON;;;;;N;;;;;
+1F5E5;THREE RAYS BELOW;So;0;ON;;;;;N;;;;;
+1F5E6;THREE RAYS LEFT;So;0;ON;;;;;N;;;;;
+1F5E7;THREE RAYS RIGHT;So;0;ON;;;;;N;;;;;
+1F5E8;LEFT SPEECH BUBBLE;So;0;ON;;;;;N;;;;;
+1F5E9;RIGHT SPEECH BUBBLE;So;0;ON;;;;;N;;;;;
+1F5EA;TWO SPEECH BUBBLES;So;0;ON;;;;;N;;;;;
+1F5EB;THREE SPEECH BUBBLES;So;0;ON;;;;;N;;;;;
+1F5EC;LEFT THOUGHT BUBBLE;So;0;ON;;;;;N;;;;;
+1F5ED;RIGHT THOUGHT BUBBLE;So;0;ON;;;;;N;;;;;
+1F5EE;LEFT ANGER BUBBLE;So;0;ON;;;;;N;;;;;
+1F5EF;RIGHT ANGER BUBBLE;So;0;ON;;;;;N;;;;;
+1F5F0;MOOD BUBBLE;So;0;ON;;;;;N;;;;;
+1F5F1;LIGHTNING MOOD BUBBLE;So;0;ON;;;;;N;;;;;
+1F5F2;LIGHTNING MOOD;So;0;ON;;;;;N;;;;;
+1F5F3;BALLOT BOX WITH BALLOT;So;0;ON;;;;;N;;;;;
+1F5F4;BALLOT SCRIPT X;So;0;ON;;;;;N;;;;;
+1F5F5;BALLOT BOX WITH SCRIPT X;So;0;ON;;;;;N;;;;;
+1F5F6;BALLOT BOLD SCRIPT X;So;0;ON;;;;;N;;;;;
+1F5F7;BALLOT BOX WITH BOLD SCRIPT X;So;0;ON;;;;;N;;;;;
+1F5F8;LIGHT CHECK MARK;So;0;ON;;;;;N;;;;;
+1F5F9;BALLOT BOX WITH BOLD CHECK;So;0;ON;;;;;N;;;;;
+1F5FA;WORLD MAP;So;0;ON;;;;;N;;;;;
1F5FB;MOUNT FUJI;So;0;ON;;;;;N;;;;;
1F5FC;TOKYO TOWER;So;0;ON;;;;;N;;;;;
1F5FD;STATUE OF LIBERTY;So;0;ON;;;;;N;;;;;
@@ -23341,6 +25870,8 @@
1F63E;POUTING CAT FACE;So;0;ON;;;;;N;;;;;
1F63F;CRYING CAT FACE;So;0;ON;;;;;N;;;;;
1F640;WEARY CAT FACE;So;0;ON;;;;;N;;;;;
+1F641;SLIGHTLY FROWNING FACE;So;0;ON;;;;;N;;;;;
+1F642;SLIGHTLY SMILING FACE;So;0;ON;;;;;N;;;;;
1F645;FACE WITH NO GOOD GESTURE;So;0;ON;;;;;N;;;;;
1F646;FACE WITH OK GESTURE;So;0;ON;;;;;N;;;;;
1F647;PERSON BOWING DEEPLY;So;0;ON;;;;;N;;;;;
@@ -23352,6 +25883,54 @@
1F64D;PERSON FROWNING;So;0;ON;;;;;N;;;;;
1F64E;PERSON WITH POUTING FACE;So;0;ON;;;;;N;;;;;
1F64F;PERSON WITH FOLDED HANDS;So;0;ON;;;;;N;;;;;
+1F650;NORTH WEST POINTING LEAF;So;0;ON;;;;;N;;;;;
+1F651;SOUTH WEST POINTING LEAF;So;0;ON;;;;;N;;;;;
+1F652;NORTH EAST POINTING LEAF;So;0;ON;;;;;N;;;;;
+1F653;SOUTH EAST POINTING LEAF;So;0;ON;;;;;N;;;;;
+1F654;TURNED NORTH WEST POINTING LEAF;So;0;ON;;;;;N;;;;;
+1F655;TURNED SOUTH WEST POINTING LEAF;So;0;ON;;;;;N;;;;;
+1F656;TURNED NORTH EAST POINTING LEAF;So;0;ON;;;;;N;;;;;
+1F657;TURNED SOUTH EAST POINTING LEAF;So;0;ON;;;;;N;;;;;
+1F658;NORTH WEST POINTING VINE LEAF;So;0;ON;;;;;N;;;;;
+1F659;SOUTH WEST POINTING VINE LEAF;So;0;ON;;;;;N;;;;;
+1F65A;NORTH EAST POINTING VINE LEAF;So;0;ON;;;;;N;;;;;
+1F65B;SOUTH EAST POINTING VINE LEAF;So;0;ON;;;;;N;;;;;
+1F65C;HEAVY NORTH WEST POINTING VINE LEAF;So;0;ON;;;;;N;;;;;
+1F65D;HEAVY SOUTH WEST POINTING VINE LEAF;So;0;ON;;;;;N;;;;;
+1F65E;HEAVY NORTH EAST POINTING VINE LEAF;So;0;ON;;;;;N;;;;;
+1F65F;HEAVY SOUTH EAST POINTING VINE LEAF;So;0;ON;;;;;N;;;;;
+1F660;NORTH WEST POINTING BUD;So;0;ON;;;;;N;;;;;
+1F661;SOUTH WEST POINTING BUD;So;0;ON;;;;;N;;;;;
+1F662;NORTH EAST POINTING BUD;So;0;ON;;;;;N;;;;;
+1F663;SOUTH EAST POINTING BUD;So;0;ON;;;;;N;;;;;
+1F664;HEAVY NORTH WEST POINTING BUD;So;0;ON;;;;;N;;;;;
+1F665;HEAVY SOUTH WEST POINTING BUD;So;0;ON;;;;;N;;;;;
+1F666;HEAVY NORTH EAST POINTING BUD;So;0;ON;;;;;N;;;;;
+1F667;HEAVY SOUTH EAST POINTING BUD;So;0;ON;;;;;N;;;;;
+1F668;HOLLOW QUILT SQUARE ORNAMENT;So;0;ON;;;;;N;;;;;
+1F669;HOLLOW QUILT SQUARE ORNAMENT IN BLACK SQUARE;So;0;ON;;;;;N;;;;;
+1F66A;SOLID QUILT SQUARE ORNAMENT;So;0;ON;;;;;N;;;;;
+1F66B;SOLID QUILT SQUARE ORNAMENT IN BLACK SQUARE;So;0;ON;;;;;N;;;;;
+1F66C;LEFTWARDS ROCKET;So;0;ON;;;;;N;;;;;
+1F66D;UPWARDS ROCKET;So;0;ON;;;;;N;;;;;
+1F66E;RIGHTWARDS ROCKET;So;0;ON;;;;;N;;;;;
+1F66F;DOWNWARDS ROCKET;So;0;ON;;;;;N;;;;;
+1F670;SCRIPT LIGATURE ET ORNAMENT;So;0;ON;;;;;N;;;;;
+1F671;HEAVY SCRIPT LIGATURE ET ORNAMENT;So;0;ON;;;;;N;;;;;
+1F672;LIGATURE OPEN ET ORNAMENT;So;0;ON;;;;;N;;;;;
+1F673;HEAVY LIGATURE OPEN ET ORNAMENT;So;0;ON;;;;;N;;;;;
+1F674;HEAVY AMPERSAND ORNAMENT;So;0;ON;;;;;N;;;;;
+1F675;SWASH AMPERSAND ORNAMENT;So;0;ON;;;;;N;;;;;
+1F676;SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT;So;0;ON;;;;;N;;;;;
+1F677;SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT;So;0;ON;;;;;N;;;;;
+1F678;SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT;So;0;ON;;;;;N;;;;;
+1F679;HEAVY INTERROBANG ORNAMENT;So;0;ON;;;;;N;;;;;
+1F67A;SANS-SERIF INTERROBANG ORNAMENT;So;0;ON;;;;;N;;;;;
+1F67B;HEAVY SANS-SERIF INTERROBANG ORNAMENT;So;0;ON;;;;;N;;;;;
+1F67C;VERY HEAVY SOLIDUS;So;0;ON;;;;;N;;;;;
+1F67D;VERY HEAVY REVERSE SOLIDUS;So;0;ON;;;;;N;;;;;
+1F67E;CHECKER BOARD;So;0;ON;;;;;N;;;;;
+1F67F;REVERSE CHECKER BOARD;So;0;ON;;;;;N;;;;;
1F680;ROCKET;So;0;ON;;;;;N;;;;;
1F681;HELICOPTER;So;0;ON;;;;;N;;;;;
1F682;STEAM LOCOMOTIVE;So;0;ON;;;;;N;;;;;
@@ -23422,6 +26001,33 @@
1F6C3;CUSTOMS;So;0;ON;;;;;N;;;;;
1F6C4;BAGGAGE CLAIM;So;0;ON;;;;;N;;;;;
1F6C5;LEFT LUGGAGE;So;0;ON;;;;;N;;;;;
+1F6C6;TRIANGLE WITH ROUNDED CORNERS;So;0;ON;;;;;N;;;;;
+1F6C7;PROHIBITED SIGN;So;0;ON;;;;;N;;;;;
+1F6C8;CIRCLED INFORMATION SOURCE;So;0;ON;;;;;N;;;;;
+1F6C9;BOYS SYMBOL;So;0;ON;;;;;N;;;;;
+1F6CA;GIRLS SYMBOL;So;0;ON;;;;;N;;;;;
+1F6CB;COUCH AND LAMP;So;0;ON;;;;;N;;;;;
+1F6CC;SLEEPING ACCOMMODATION;So;0;ON;;;;;N;;;;;
+1F6CD;SHOPPING BAGS;So;0;ON;;;;;N;;;;;
+1F6CE;BELLHOP BELL;So;0;ON;;;;;N;;;;;
+1F6CF;BED;So;0;ON;;;;;N;;;;;
+1F6E0;HAMMER AND WRENCH;So;0;ON;;;;;N;;;;;
+1F6E1;SHIELD;So;0;ON;;;;;N;;;;;
+1F6E2;OIL DRUM;So;0;ON;;;;;N;;;;;
+1F6E3;MOTORWAY;So;0;ON;;;;;N;;;;;
+1F6E4;RAILWAY TRACK;So;0;ON;;;;;N;;;;;
+1F6E5;MOTOR BOAT;So;0;ON;;;;;N;;;;;
+1F6E6;UP-POINTING MILITARY AIRPLANE;So;0;ON;;;;;N;;;;;
+1F6E7;UP-POINTING AIRPLANE;So;0;ON;;;;;N;;;;;
+1F6E8;UP-POINTING SMALL AIRPLANE;So;0;ON;;;;;N;;;;;
+1F6E9;SMALL AIRPLANE;So;0;ON;;;;;N;;;;;
+1F6EA;NORTHEAST-POINTING AIRPLANE;So;0;ON;;;;;N;;;;;
+1F6EB;AIRPLANE DEPARTURE;So;0;ON;;;;;N;;;;;
+1F6EC;AIRPLANE ARRIVING;So;0;ON;;;;;N;;;;;
+1F6F0;SATELLITE;So;0;ON;;;;;N;;;;;
+1F6F1;ONCOMING FIRE ENGINE;So;0;ON;;;;;N;;;;;
+1F6F2;DIESEL LOCOMOTIVE;So;0;ON;;;;;N;;;;;
+1F6F3;PASSENGER SHIP;So;0;ON;;;;;N;;;;;
1F700;ALCHEMICAL SYMBOL FOR QUINTESSENCE;So;0;ON;;;;;N;;;;;
1F701;ALCHEMICAL SYMBOL FOR AIR;So;0;ON;;;;;N;;;;;
1F702;ALCHEMICAL SYMBOL FOR FIRE;So;0;ON;;;;;N;;;;;
@@ -23538,6 +26144,239 @@
1F771;ALCHEMICAL SYMBOL FOR MONTH;So;0;ON;;;;;N;;;;;
1F772;ALCHEMICAL SYMBOL FOR HALF DRAM;So;0;ON;;;;;N;;;;;
1F773;ALCHEMICAL SYMBOL FOR HALF OUNCE;So;0;ON;;;;;N;;;;;
+1F780;BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE;So;0;ON;;;;;N;;;;;
+1F781;BLACK UP-POINTING ISOSCELES RIGHT TRIANGLE;So;0;ON;;;;;N;;;;;
+1F782;BLACK RIGHT-POINTING ISOSCELES RIGHT TRIANGLE;So;0;ON;;;;;N;;;;;
+1F783;BLACK DOWN-POINTING ISOSCELES RIGHT TRIANGLE;So;0;ON;;;;;N;;;;;
+1F784;BLACK SLIGHTLY SMALL CIRCLE;So;0;ON;;;;;N;;;;;
+1F785;MEDIUM BOLD WHITE CIRCLE;So;0;ON;;;;;N;;;;;
+1F786;BOLD WHITE CIRCLE;So;0;ON;;;;;N;;;;;
+1F787;HEAVY WHITE CIRCLE;So;0;ON;;;;;N;;;;;
+1F788;VERY HEAVY WHITE CIRCLE;So;0;ON;;;;;N;;;;;
+1F789;EXTREMELY HEAVY WHITE CIRCLE;So;0;ON;;;;;N;;;;;
+1F78A;WHITE CIRCLE CONTAINING BLACK SMALL CIRCLE;So;0;ON;;;;;N;;;;;
+1F78B;ROUND TARGET;So;0;ON;;;;;N;;;;;
+1F78C;BLACK TINY SQUARE;So;0;ON;;;;;N;;;;;
+1F78D;BLACK SLIGHTLY SMALL SQUARE;So;0;ON;;;;;N;;;;;
+1F78E;LIGHT WHITE SQUARE;So;0;ON;;;;;N;;;;;
+1F78F;MEDIUM WHITE SQUARE;So;0;ON;;;;;N;;;;;
+1F790;BOLD WHITE SQUARE;So;0;ON;;;;;N;;;;;
+1F791;HEAVY WHITE SQUARE;So;0;ON;;;;;N;;;;;
+1F792;VERY HEAVY WHITE SQUARE;So;0;ON;;;;;N;;;;;
+1F793;EXTREMELY HEAVY WHITE SQUARE;So;0;ON;;;;;N;;;;;
+1F794;WHITE SQUARE CONTAINING BLACK VERY SMALL SQUARE;So;0;ON;;;;;N;;;;;
+1F795;WHITE SQUARE CONTAINING BLACK MEDIUM SQUARE;So;0;ON;;;;;N;;;;;
+1F796;SQUARE TARGET;So;0;ON;;;;;N;;;;;
+1F797;BLACK TINY DIAMOND;So;0;ON;;;;;N;;;;;
+1F798;BLACK VERY SMALL DIAMOND;So;0;ON;;;;;N;;;;;
+1F799;BLACK MEDIUM SMALL DIAMOND;So;0;ON;;;;;N;;;;;
+1F79A;WHITE DIAMOND CONTAINING BLACK VERY SMALL DIAMOND;So;0;ON;;;;;N;;;;;
+1F79B;WHITE DIAMOND CONTAINING BLACK MEDIUM DIAMOND;So;0;ON;;;;;N;;;;;
+1F79C;DIAMOND TARGET;So;0;ON;;;;;N;;;;;
+1F79D;BLACK TINY LOZENGE;So;0;ON;;;;;N;;;;;
+1F79E;BLACK VERY SMALL LOZENGE;So;0;ON;;;;;N;;;;;
+1F79F;BLACK MEDIUM SMALL LOZENGE;So;0;ON;;;;;N;;;;;
+1F7A0;WHITE LOZENGE CONTAINING BLACK SMALL LOZENGE;So;0;ON;;;;;N;;;;;
+1F7A1;THIN GREEK CROSS;So;0;ON;;;;;N;;;;;
+1F7A2;LIGHT GREEK CROSS;So;0;ON;;;;;N;;;;;
+1F7A3;MEDIUM GREEK CROSS;So;0;ON;;;;;N;;;;;
+1F7A4;BOLD GREEK CROSS;So;0;ON;;;;;N;;;;;
+1F7A5;VERY BOLD GREEK CROSS;So;0;ON;;;;;N;;;;;
+1F7A6;VERY HEAVY GREEK CROSS;So;0;ON;;;;;N;;;;;
+1F7A7;EXTREMELY HEAVY GREEK CROSS;So;0;ON;;;;;N;;;;;
+1F7A8;THIN SALTIRE;So;0;ON;;;;;N;;;;;
+1F7A9;LIGHT SALTIRE;So;0;ON;;;;;N;;;;;
+1F7AA;MEDIUM SALTIRE;So;0;ON;;;;;N;;;;;
+1F7AB;BOLD SALTIRE;So;0;ON;;;;;N;;;;;
+1F7AC;HEAVY SALTIRE;So;0;ON;;;;;N;;;;;
+1F7AD;VERY HEAVY SALTIRE;So;0;ON;;;;;N;;;;;
+1F7AE;EXTREMELY HEAVY SALTIRE;So;0;ON;;;;;N;;;;;
+1F7AF;LIGHT FIVE SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7B0;MEDIUM FIVE SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7B1;BOLD FIVE SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7B2;HEAVY FIVE SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7B3;VERY HEAVY FIVE SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7B4;EXTREMELY HEAVY FIVE SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7B5;LIGHT SIX SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7B6;MEDIUM SIX SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7B7;BOLD SIX SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7B8;HEAVY SIX SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7B9;VERY HEAVY SIX SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7BA;EXTREMELY HEAVY SIX SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7BB;LIGHT EIGHT SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7BC;MEDIUM EIGHT SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7BD;BOLD EIGHT SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7BE;HEAVY EIGHT SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7BF;VERY HEAVY EIGHT SPOKED ASTERISK;So;0;ON;;;;;N;;;;;
+1F7C0;LIGHT THREE POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7C1;MEDIUM THREE POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7C2;THREE POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7C3;MEDIUM THREE POINTED PINWHEEL STAR;So;0;ON;;;;;N;;;;;
+1F7C4;LIGHT FOUR POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7C5;MEDIUM FOUR POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7C6;FOUR POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7C7;MEDIUM FOUR POINTED PINWHEEL STAR;So;0;ON;;;;;N;;;;;
+1F7C8;REVERSE LIGHT FOUR POINTED PINWHEEL STAR;So;0;ON;;;;;N;;;;;
+1F7C9;LIGHT FIVE POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7CA;HEAVY FIVE POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7CB;MEDIUM SIX POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7CC;HEAVY SIX POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7CD;SIX POINTED PINWHEEL STAR;So;0;ON;;;;;N;;;;;
+1F7CE;MEDIUM EIGHT POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7CF;HEAVY EIGHT POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7D0;VERY HEAVY EIGHT POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7D1;HEAVY EIGHT POINTED PINWHEEL STAR;So;0;ON;;;;;N;;;;;
+1F7D2;LIGHT TWELVE POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7D3;HEAVY TWELVE POINTED BLACK STAR;So;0;ON;;;;;N;;;;;
+1F7D4;HEAVY TWELVE POINTED PINWHEEL STAR;So;0;ON;;;;;N;;;;;
+1F800;LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F801;UPWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F802;RIGHTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F803;DOWNWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F804;LEFTWARDS ARROW WITH MEDIUM TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F805;UPWARDS ARROW WITH MEDIUM TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F806;RIGHTWARDS ARROW WITH MEDIUM TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F807;DOWNWARDS ARROW WITH MEDIUM TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F808;LEFTWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F809;UPWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F80A;RIGHTWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F80B;DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F810;LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F811;UPWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F812;RIGHTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F813;DOWNWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F814;LEFTWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F815;UPWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F816;RIGHTWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F817;DOWNWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F818;HEAVY LEFTWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F819;HEAVY UPWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F81A;HEAVY RIGHTWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F81B;HEAVY DOWNWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F81C;HEAVY LEFTWARDS ARROW WITH LARGE EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F81D;HEAVY UPWARDS ARROW WITH LARGE EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F81E;HEAVY RIGHTWARDS ARROW WITH LARGE EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F81F;HEAVY DOWNWARDS ARROW WITH LARGE EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F820;LEFTWARDS TRIANGLE-HEADED ARROW WITH NARROW SHAFT;So;0;ON;;;;;N;;;;;
+1F821;UPWARDS TRIANGLE-HEADED ARROW WITH NARROW SHAFT;So;0;ON;;;;;N;;;;;
+1F822;RIGHTWARDS TRIANGLE-HEADED ARROW WITH NARROW SHAFT;So;0;ON;;;;;N;;;;;
+1F823;DOWNWARDS TRIANGLE-HEADED ARROW WITH NARROW SHAFT;So;0;ON;;;;;N;;;;;
+1F824;LEFTWARDS TRIANGLE-HEADED ARROW WITH MEDIUM SHAFT;So;0;ON;;;;;N;;;;;
+1F825;UPWARDS TRIANGLE-HEADED ARROW WITH MEDIUM SHAFT;So;0;ON;;;;;N;;;;;
+1F826;RIGHTWARDS TRIANGLE-HEADED ARROW WITH MEDIUM SHAFT;So;0;ON;;;;;N;;;;;
+1F827;DOWNWARDS TRIANGLE-HEADED ARROW WITH MEDIUM SHAFT;So;0;ON;;;;;N;;;;;
+1F828;LEFTWARDS TRIANGLE-HEADED ARROW WITH BOLD SHAFT;So;0;ON;;;;;N;;;;;
+1F829;UPWARDS TRIANGLE-HEADED ARROW WITH BOLD SHAFT;So;0;ON;;;;;N;;;;;
+1F82A;RIGHTWARDS TRIANGLE-HEADED ARROW WITH BOLD SHAFT;So;0;ON;;;;;N;;;;;
+1F82B;DOWNWARDS TRIANGLE-HEADED ARROW WITH BOLD SHAFT;So;0;ON;;;;;N;;;;;
+1F82C;LEFTWARDS TRIANGLE-HEADED ARROW WITH HEAVY SHAFT;So;0;ON;;;;;N;;;;;
+1F82D;UPWARDS TRIANGLE-HEADED ARROW WITH HEAVY SHAFT;So;0;ON;;;;;N;;;;;
+1F82E;RIGHTWARDS TRIANGLE-HEADED ARROW WITH HEAVY SHAFT;So;0;ON;;;;;N;;;;;
+1F82F;DOWNWARDS TRIANGLE-HEADED ARROW WITH HEAVY SHAFT;So;0;ON;;;;;N;;;;;
+1F830;LEFTWARDS TRIANGLE-HEADED ARROW WITH VERY HEAVY SHAFT;So;0;ON;;;;;N;;;;;
+1F831;UPWARDS TRIANGLE-HEADED ARROW WITH VERY HEAVY SHAFT;So;0;ON;;;;;N;;;;;
+1F832;RIGHTWARDS TRIANGLE-HEADED ARROW WITH VERY HEAVY SHAFT;So;0;ON;;;;;N;;;;;
+1F833;DOWNWARDS TRIANGLE-HEADED ARROW WITH VERY HEAVY SHAFT;So;0;ON;;;;;N;;;;;
+1F834;LEFTWARDS FINGER-POST ARROW;So;0;ON;;;;;N;;;;;
+1F835;UPWARDS FINGER-POST ARROW;So;0;ON;;;;;N;;;;;
+1F836;RIGHTWARDS FINGER-POST ARROW;So;0;ON;;;;;N;;;;;
+1F837;DOWNWARDS FINGER-POST ARROW;So;0;ON;;;;;N;;;;;
+1F838;LEFTWARDS SQUARED ARROW;So;0;ON;;;;;N;;;;;
+1F839;UPWARDS SQUARED ARROW;So;0;ON;;;;;N;;;;;
+1F83A;RIGHTWARDS SQUARED ARROW;So;0;ON;;;;;N;;;;;
+1F83B;DOWNWARDS SQUARED ARROW;So;0;ON;;;;;N;;;;;
+1F83C;LEFTWARDS COMPRESSED ARROW;So;0;ON;;;;;N;;;;;
+1F83D;UPWARDS COMPRESSED ARROW;So;0;ON;;;;;N;;;;;
+1F83E;RIGHTWARDS COMPRESSED ARROW;So;0;ON;;;;;N;;;;;
+1F83F;DOWNWARDS COMPRESSED ARROW;So;0;ON;;;;;N;;;;;
+1F840;LEFTWARDS HEAVY COMPRESSED ARROW;So;0;ON;;;;;N;;;;;
+1F841;UPWARDS HEAVY COMPRESSED ARROW;So;0;ON;;;;;N;;;;;
+1F842;RIGHTWARDS HEAVY COMPRESSED ARROW;So;0;ON;;;;;N;;;;;
+1F843;DOWNWARDS HEAVY COMPRESSED ARROW;So;0;ON;;;;;N;;;;;
+1F844;LEFTWARDS HEAVY ARROW;So;0;ON;;;;;N;;;;;
+1F845;UPWARDS HEAVY ARROW;So;0;ON;;;;;N;;;;;
+1F846;RIGHTWARDS HEAVY ARROW;So;0;ON;;;;;N;;;;;
+1F847;DOWNWARDS HEAVY ARROW;So;0;ON;;;;;N;;;;;
+1F850;LEFTWARDS SANS-SERIF ARROW;So;0;ON;;;;;N;;;;;
+1F851;UPWARDS SANS-SERIF ARROW;So;0;ON;;;;;N;;;;;
+1F852;RIGHTWARDS SANS-SERIF ARROW;So;0;ON;;;;;N;;;;;
+1F853;DOWNWARDS SANS-SERIF ARROW;So;0;ON;;;;;N;;;;;
+1F854;NORTH WEST SANS-SERIF ARROW;So;0;ON;;;;;N;;;;;
+1F855;NORTH EAST SANS-SERIF ARROW;So;0;ON;;;;;N;;;;;
+1F856;SOUTH EAST SANS-SERIF ARROW;So;0;ON;;;;;N;;;;;
+1F857;SOUTH WEST SANS-SERIF ARROW;So;0;ON;;;;;N;;;;;
+1F858;LEFT RIGHT SANS-SERIF ARROW;So;0;ON;;;;;N;;;;;
+1F859;UP DOWN SANS-SERIF ARROW;So;0;ON;;;;;N;;;;;
+1F860;WIDE-HEADED LEFTWARDS LIGHT BARB ARROW;So;0;ON;;;;;N;;;;;
+1F861;WIDE-HEADED UPWARDS LIGHT BARB ARROW;So;0;ON;;;;;N;;;;;
+1F862;WIDE-HEADED RIGHTWARDS LIGHT BARB ARROW;So;0;ON;;;;;N;;;;;
+1F863;WIDE-HEADED DOWNWARDS LIGHT BARB ARROW;So;0;ON;;;;;N;;;;;
+1F864;WIDE-HEADED NORTH WEST LIGHT BARB ARROW;So;0;ON;;;;;N;;;;;
+1F865;WIDE-HEADED NORTH EAST LIGHT BARB ARROW;So;0;ON;;;;;N;;;;;
+1F866;WIDE-HEADED SOUTH EAST LIGHT BARB ARROW;So;0;ON;;;;;N;;;;;
+1F867;WIDE-HEADED SOUTH WEST LIGHT BARB ARROW;So;0;ON;;;;;N;;;;;
+1F868;WIDE-HEADED LEFTWARDS BARB ARROW;So;0;ON;;;;;N;;;;;
+1F869;WIDE-HEADED UPWARDS BARB ARROW;So;0;ON;;;;;N;;;;;
+1F86A;WIDE-HEADED RIGHTWARDS BARB ARROW;So;0;ON;;;;;N;;;;;
+1F86B;WIDE-HEADED DOWNWARDS BARB ARROW;So;0;ON;;;;;N;;;;;
+1F86C;WIDE-HEADED NORTH WEST BARB ARROW;So;0;ON;;;;;N;;;;;
+1F86D;WIDE-HEADED NORTH EAST BARB ARROW;So;0;ON;;;;;N;;;;;
+1F86E;WIDE-HEADED SOUTH EAST BARB ARROW;So;0;ON;;;;;N;;;;;
+1F86F;WIDE-HEADED SOUTH WEST BARB ARROW;So;0;ON;;;;;N;;;;;
+1F870;WIDE-HEADED LEFTWARDS MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;;
+1F871;WIDE-HEADED UPWARDS MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;;
+1F872;WIDE-HEADED RIGHTWARDS MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;;
+1F873;WIDE-HEADED DOWNWARDS MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;;
+1F874;WIDE-HEADED NORTH WEST MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;;
+1F875;WIDE-HEADED NORTH EAST MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;;
+1F876;WIDE-HEADED SOUTH EAST MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;;
+1F877;WIDE-HEADED SOUTH WEST MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;;
+1F878;WIDE-HEADED LEFTWARDS HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F879;WIDE-HEADED UPWARDS HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F87A;WIDE-HEADED RIGHTWARDS HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F87B;WIDE-HEADED DOWNWARDS HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F87C;WIDE-HEADED NORTH WEST HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F87D;WIDE-HEADED NORTH EAST HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F87E;WIDE-HEADED SOUTH EAST HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F87F;WIDE-HEADED SOUTH WEST HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F880;WIDE-HEADED LEFTWARDS VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F881;WIDE-HEADED UPWARDS VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F882;WIDE-HEADED RIGHTWARDS VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F883;WIDE-HEADED DOWNWARDS VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F884;WIDE-HEADED NORTH WEST VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F885;WIDE-HEADED NORTH EAST VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F886;WIDE-HEADED SOUTH EAST VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F887;WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;;
+1F890;LEFTWARDS TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F891;UPWARDS TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F892;RIGHTWARDS TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F893;DOWNWARDS TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F894;LEFTWARDS WHITE ARROW WITHIN TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F895;UPWARDS WHITE ARROW WITHIN TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F896;RIGHTWARDS WHITE ARROW WITHIN TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F897;DOWNWARDS WHITE ARROW WITHIN TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;;
+1F898;LEFTWARDS ARROW WITH NOTCHED TAIL;So;0;ON;;;;;N;;;;;
+1F899;UPWARDS ARROW WITH NOTCHED TAIL;So;0;ON;;;;;N;;;;;
+1F89A;RIGHTWARDS ARROW WITH NOTCHED TAIL;So;0;ON;;;;;N;;;;;
+1F89B;DOWNWARDS ARROW WITH NOTCHED TAIL;So;0;ON;;;;;N;;;;;
+1F89C;HEAVY ARROW SHAFT WIDTH ONE;So;0;ON;;;;;N;;;;;
+1F89D;HEAVY ARROW SHAFT WIDTH TWO THIRDS;So;0;ON;;;;;N;;;;;
+1F89E;HEAVY ARROW SHAFT WIDTH ONE HALF;So;0;ON;;;;;N;;;;;
+1F89F;HEAVY ARROW SHAFT WIDTH ONE THIRD;So;0;ON;;;;;N;;;;;
+1F8A0;LEFTWARDS BOTTOM-SHADED WHITE ARROW;So;0;ON;;;;;N;;;;;
+1F8A1;RIGHTWARDS BOTTOM SHADED WHITE ARROW;So;0;ON;;;;;N;;;;;
+1F8A2;LEFTWARDS TOP SHADED WHITE ARROW;So;0;ON;;;;;N;;;;;
+1F8A3;RIGHTWARDS TOP SHADED WHITE ARROW;So;0;ON;;;;;N;;;;;
+1F8A4;LEFTWARDS LEFT-SHADED WHITE ARROW;So;0;ON;;;;;N;;;;;
+1F8A5;RIGHTWARDS RIGHT-SHADED WHITE ARROW;So;0;ON;;;;;N;;;;;
+1F8A6;LEFTWARDS RIGHT-SHADED WHITE ARROW;So;0;ON;;;;;N;;;;;
+1F8A7;RIGHTWARDS LEFT-SHADED WHITE ARROW;So;0;ON;;;;;N;;;;;
+1F8A8;LEFTWARDS BACK-TILTED SHADOWED WHITE ARROW;So;0;ON;;;;;N;;;;;
+1F8A9;RIGHTWARDS BACK-TILTED SHADOWED WHITE ARROW;So;0;ON;;;;;N;;;;;
+1F8AA;LEFTWARDS FRONT-TILTED SHADOWED WHITE ARROW;So;0;ON;;;;;N;;;;;
+1F8AB;RIGHTWARDS FRONT-TILTED SHADOWED WHITE ARROW;So;0;ON;;;;;N;;;;;
+1F8AC;WHITE ARROW SHAFT WIDTH ONE;So;0;ON;;;;;N;;;;;
+1F8AD;WHITE ARROW SHAFT WIDTH TWO THIRDS;So;0;ON;;;;;N;;;;;
20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
--- a/jdk/make/data/unicodedata/VERSION Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/make/data/unicodedata/VERSION Wed Jul 15 11:05:51 2015 +0900
@@ -1,1 +1,1 @@
-6.2.0
+7.0.0
--- a/jdk/make/src/classes/build/tools/generatecharacter/GenerateCharacter.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/make/src/classes/build/tools/generatecharacter/GenerateCharacter.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -906,6 +906,14 @@
return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE);
if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG]))
return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS);
+ if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE][UnicodeSpec.LONG]))
+ return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE);
+ if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE][UnicodeSpec.LONG]))
+ return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE);
+ if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_FIRST_STRONG_ISOLATE][UnicodeSpec.LONG]))
+ return Integer.toString(UnicodeSpec.DIRECTIONALITY_FIRST_STRONG_ISOLATE);
+ if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE][UnicodeSpec.LONG]))
+ return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE);
FAIL("Unknown text substitution marker " + commandMarker + x);
return commandMarker + x;
}
--- a/jdk/make/src/classes/build/tools/generatecharacter/UnicodeSpec.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/make/src/classes/build/tools/generatecharacter/UnicodeSpec.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -121,7 +121,7 @@
String[] tokens = null;
try {
- tokens = tokenSeparator.split(s, REQUIRED_FIELDS);
+ tokens = tokenSeparator.split(s, REQUIRED_FIELDS);
spec = new UnicodeSpec();
spec.setCodePoint(parseCodePoint(tokens[FIELD_VALUE]));
spec.setName(parseName(tokens[FIELD_NAME]));
@@ -672,7 +672,8 @@
* Bidirectional categories
*/
public static final byte
- DIRECTIONALITY_UNDEFINED = -1,
+ DIRECTIONALITY_UNDEFINED = -1,
+
// Strong category
DIRECTIONALITY_LEFT_TO_RIGHT = 0, // L
DIRECTIONALITY_RIGHT_TO_LEFT = 1, // R
@@ -689,15 +690,19 @@
DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10, // B
DIRECTIONALITY_SEGMENT_SEPARATOR = 11, // S
DIRECTIONALITY_WHITESPACE = 12, // WS
- DIRECTIONALITY_OTHER_NEUTRALS = 13, // ON
-
+ DIRECTIONALITY_OTHER_NEUTRALS = 13, // ON
+ // Explicit Formatting category
DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14, // LRE
DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15, // LRO
DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16, // RLE
DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17, // RLO
DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18, // PDF
+ DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE = 19, // LRI
+ DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE = 20, // RLI
+ DIRECTIONALITY_FIRST_STRONG_ISOLATE = 21, // FSI
+ DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE = 22, // PDI
- DIRECTIONALITY_CATEGORY_COUNT = 19; // sentinel value
+ DIRECTIONALITY_CATEGORY_COUNT = 23; // sentinel value
// If changes are made to the above bidi category assignments, this
// list of bidi category names must be changed to keep their order in synch.
@@ -722,7 +727,10 @@
{"RLE", "DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING"},
{"RLO", "DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE"},
{"PDF", "DIRECTIONALITY_POP_DIRECTIONAL_FORMAT"},
-
+ {"LRI", "DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE"},
+ {"RLI", "DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE"},
+ {"FSI", "DIRECTIONALITY_FIRST_STRONG_ISOLATE"},
+ {"PDI", "DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE"},
};
// Unicode specification lines have fields in this order.
--- a/jdk/src/java.base/share/classes/java/lang/Character.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/java/lang/Character.java Wed Jul 15 11:05:51 2015 +0900
@@ -42,7 +42,7 @@
* a character's category (lowercase letter, digit, etc.) and for converting
* characters from uppercase to lowercase and vice versa.
* <p>
- * Character information is based on the Unicode Standard, version 6.2.0.
+ * Character information is based on the Unicode Standard, version 7.0.0.
* <p>
* The methods and data of class {@code Character} are defined by
* the information in the <i>UnicodeData</i> file that is part of the
@@ -492,6 +492,30 @@
public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;
/**
+ * Explicit formatting bidirectional character type "LRI" in the Unicode specification.
+ * @since 1.9
+ */
+ public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE = 19;
+
+ /**
+ * Explicit formatting bidirectional character type "RLI" in the Unicode specification.
+ * @since 1.9
+ */
+ public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE = 20;
+
+ /**
+ * Explicit formatting bidirectional character type "FSI" in the Unicode specification.
+ * @since 1.9
+ */
+ public static final byte DIRECTIONALITY_FIRST_STRONG_ISOLATE = 21;
+
+ /**
+ * Explicit formatting bidirectional character type "PDI" in the Unicode specification.
+ * @since 1.9
+ */
+ public static final byte DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE = 22;
+
+ /**
* The minimum value of a
* <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit">
* Unicode high-surrogate code unit</a>
@@ -2563,6 +2587,269 @@
"ARABIC MATHEMATICAL ALPHABETIC SYMBOLS",
"ARABICMATHEMATICALALPHABETICSYMBOLS");
+ /**
+ * Constant for the "Combining Diacritical Marks Extended" Unicode
+ * character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS_EXTENDED =
+ new UnicodeBlock("COMBINING_DIACRITICAL_MARKS_EXTENDED",
+ "COMBINING DIACRITICAL MARKS EXTENDED",
+ "COMBININGDIACRITICALMARKSEXTENDED");
+
+ /**
+ * Constant for the "Myanmar Extended-B" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock MYANMAR_EXTENDED_B =
+ new UnicodeBlock("MYANMAR_EXTENDED_B",
+ "MYANMAR EXTENDED-B",
+ "MYANMAREXTENDED-B");
+
+ /**
+ * Constant for the "Latin Extended-E" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock LATIN_EXTENDED_E =
+ new UnicodeBlock("LATIN_EXTENDED_E",
+ "LATIN EXTENDED-E",
+ "LATINEXTENDED-E");
+
+ /**
+ * Constant for the "Coptic Epact Numbers" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock COPTIC_EPACT_NUMBERS =
+ new UnicodeBlock("COPTIC_EPACT_NUMBERS",
+ "COPTIC EPACT NUMBERS",
+ "COPTICEPACTNUMBERS");
+
+ /**
+ * Constant for the "Old Permic" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock OLD_PERMIC =
+ new UnicodeBlock("OLD_PERMIC",
+ "OLD PERMIC",
+ "OLDPERMIC");
+
+ /**
+ * Constant for the "Elbasan" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock ELBASAN =
+ new UnicodeBlock("ELBASAN");
+
+ /**
+ * Constant for the "Caucasian Albanian" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock CAUCASIAN_ALBANIAN =
+ new UnicodeBlock("CAUCASIAN_ALBANIAN",
+ "CAUCASIAN ALBANIAN",
+ "CAUCASIANALBANIAN");
+
+ /**
+ * Constant for the "Linear A" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock LINEAR_A =
+ new UnicodeBlock("LINEAR_A",
+ "LINEAR A",
+ "LINEARA");
+
+ /**
+ * Constant for the "Palmyrene" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock PALMYRENE =
+ new UnicodeBlock("PALMYRENE");
+
+ /**
+ * Constant for the "Nabataean" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock NABATAEAN =
+ new UnicodeBlock("NABATAEAN");
+
+ /**
+ * Constant for the "Old North Arabian" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock OLD_NORTH_ARABIAN =
+ new UnicodeBlock("OLD_NORTH_ARABIAN",
+ "OLD NORTH ARABIAN",
+ "OLDNORTHARABIAN");
+
+ /**
+ * Constant for the "Manichaean" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock MANICHAEAN =
+ new UnicodeBlock("MANICHAEAN");
+
+ /**
+ * Constant for the "Psalter Pahlavi" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock PSALTER_PAHLAVI =
+ new UnicodeBlock("PSALTER_PAHLAVI",
+ "PSALTER PAHLAVI",
+ "PSALTERPAHLAVI");
+
+ /**
+ * Constant for the "Mahajani" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock MAHAJANI =
+ new UnicodeBlock("MAHAJANI");
+
+ /**
+ * Constant for the "Sinhala Archaic Numbers" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock SINHALA_ARCHAIC_NUMBERS =
+ new UnicodeBlock("SINHALA_ARCHAIC_NUMBERS",
+ "SINHALA ARCHAIC NUMBERS",
+ "SINHALAARCHAICNUMBERS");
+
+ /**
+ * Constant for the "Khojki" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock KHOJKI =
+ new UnicodeBlock("KHOJKI");
+
+ /**
+ * Constant for the "Khudawadi" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock KHUDAWADI =
+ new UnicodeBlock("KHUDAWADI");
+
+ /**
+ * Constant for the "Grantha" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock GRANTHA =
+ new UnicodeBlock("GRANTHA");
+
+ /**
+ * Constant for the "Tirhuta" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock TIRHUTA =
+ new UnicodeBlock("TIRHUTA");
+
+ /**
+ * Constant for the "Siddham" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock SIDDHAM =
+ new UnicodeBlock("SIDDHAM");
+
+ /**
+ * Constant for the "Modi" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock MODI =
+ new UnicodeBlock("MODI");
+
+ /**
+ * Constant for the "Warang Citi" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock WARANG_CITI =
+ new UnicodeBlock("WARANG_CITI",
+ "WARANG CITI",
+ "WARANGCITI");
+
+ /**
+ * Constant for the "Pau Cin Hau" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock PAU_CIN_HAU =
+ new UnicodeBlock("PAU_CIN_HAU",
+ "PAU CIN HAU",
+ "PAUCINHAU");
+
+ /**
+ * Constant for the "Mro" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock MRO =
+ new UnicodeBlock("MRO");
+
+ /**
+ * Constant for the "Bassa Vah" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock BASSA_VAH =
+ new UnicodeBlock("BASSA_VAH",
+ "BASSA VAH",
+ "BASSAVAH");
+
+ /**
+ * Constant for the "Pahawh Hmong" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock PAHAWH_HMONG =
+ new UnicodeBlock("PAHAWH_HMONG",
+ "PAHAWH HMONG",
+ "PAHAWHHMONG");
+
+ /**
+ * Constant for the "Duployan" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock DUPLOYAN =
+ new UnicodeBlock("DUPLOYAN");
+
+ /**
+ * Constant for the "Shorthand Format Controls" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock SHORTHAND_FORMAT_CONTROLS =
+ new UnicodeBlock("SHORTHAND_FORMAT_CONTROLS",
+ "SHORTHAND FORMAT CONTROLS",
+ "SHORTHANDFORMATCONTROLS");
+
+ /**
+ * Constant for the "Mende Kikakui" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock MENDE_KIKAKUI =
+ new UnicodeBlock("MENDE_KIKAKUI",
+ "MENDE KIKAKUI",
+ "MENDEKIKAKUI");
+
+ /**
+ * Constant for the "Ornamental Dingbats" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock ORNAMENTAL_DINGBATS =
+ new UnicodeBlock("ORNAMENTAL_DINGBATS",
+ "ORNAMENTAL DINGBATS",
+ "ORNAMENTALDINGBATS");
+
+ /**
+ * Constant for the "Geometric Shapes Extended" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock GEOMETRIC_SHAPES_EXTENDED =
+ new UnicodeBlock("GEOMETRIC_SHAPES_EXTENDED",
+ "GEOMETRIC SHAPES EXTENDED",
+ "GEOMETRICSHAPESEXTENDED");
+
+ /**
+ * Constant for the "Supplemental Arrows-C" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock SUPPLEMENTAL_ARROWS_C =
+ new UnicodeBlock("SUPPLEMENTAL_ARROWS_C",
+ "SUPPLEMENTAL ARROWS-C",
+ "SUPPLEMENTALARROWS-C");
+
private static final int blockStarts[] = {
0x0000, // 0000..007F; Basic Latin
0x0080, // 0080..00FF; Latin-1 Supplement
@@ -2620,7 +2907,7 @@
0x19E0, // 19E0..19FF; Khmer Symbols
0x1A00, // 1A00..1A1F; Buginese
0x1A20, // 1A20..1AAF; Tai Tham
- 0x1AB0, // unassigned
+ 0x1AB0, // 1AB0..1AFF; Combining Diacritical Marks Extended
0x1B00, // 1B00..1B7F; Balinese
0x1B80, // 1B80..1BBF; Sundanese
0x1BC0, // 1BC0..1BFF; Batak
@@ -2701,13 +2988,14 @@
0xA930, // A930..A95F; Rejang
0xA960, // A960..A97F; Hangul Jamo Extended-A
0xA980, // A980..A9DF; Javanese
- 0xA9E0, // unassigned
+ 0xA9E0, // A9E0..A9FF; Myanmar Extended-B
0xAA00, // AA00..AA5F; Cham
0xAA60, // AA60..AA7F; Myanmar Extended-A
0xAA80, // AA80..AADF; Tai Viet
0xAAE0, // AAE0..AAFF; Meetei Mayek Extensions
0xAB00, // AB00..AB2F; Ethiopic Extended-A
- 0xAB30, // unassigned
+ 0xAB30, // AB30..AB6F; Latin Extended-E
+ 0xAB70, // unassigned
0xABC0, // ABC0..ABFF; Meetei Mayek
0xAC00, // AC00..D7AF; Hangul Syllables
0xD7B0, // D7B0..D7FF; Hangul Jamo Extended-B
@@ -2735,10 +3023,10 @@
0x10200, // unassigned
0x10280, // 10280..1029F; Lycian
0x102A0, // 102A0..102DF; Carian
- 0x102E0, // unassigned
+ 0x102E0, // 102E0..102FF; Coptic Epact Numbers
0x10300, // 10300..1032F; Old Italic
0x10330, // 10330..1034F; Gothic
- 0x10350, // unassigned
+ 0x10350, // 10350..1037F; Old Permic
0x10380, // 10380..1039F; Ugaritic
0x103A0, // 103A0..103DF; Old Persian
0x103E0, // unassigned
@@ -2746,9 +3034,16 @@
0x10450, // 10450..1047F; Shavian
0x10480, // 10480..104AF; Osmanya
0x104B0, // unassigned
+ 0x10500, // 10500..1052F; Elbasan
+ 0x10530, // 10530..1056F; Caucasian Albanian
+ 0x10570, // unassigned
+ 0x10600, // 10600..1077F; Linear A
+ 0x10780, // unassigned
0x10800, // 10800..1083F; Cypriot Syllabary
0x10840, // 10840..1085F; Imperial Aramaic
- 0x10860, // unassigned
+ 0x10860, // 10860..1087F; Palmyrene
+ 0x10880, // 10880..108AF; Nabataean
+ 0x108B0, // unassigned
0x10900, // 10900..1091F; Phoenician
0x10920, // 10920..1093F; Lydian
0x10940, // unassigned
@@ -2756,11 +3051,14 @@
0x109A0, // 109A0..109FF; Meroitic Cursive
0x10A00, // 10A00..10A5F; Kharoshthi
0x10A60, // 10A60..10A7F; Old South Arabian
- 0x10A80, // unassigned
+ 0x10A80, // 10A80..10A9F; Old North Arabian
+ 0x10AA0, // unassigned
+ 0x10AC0, // 10AC0..10AFF; Manichaean
0x10B00, // 10B00..10B3F; Avestan
0x10B40, // 10B40..10B5F; Inscriptional Parthian
0x10B60, // 10B60..10B7F; Inscriptional Pahlavi
- 0x10B80, // unassigned
+ 0x10B80, // 10B80..10BAF; Psalter Pahlavi
+ 0x10BB0, // unassigned
0x10C00, // 10C00..10C4F; Old Turkic
0x10C50, // unassigned
0x10E60, // 10E60..10E7F; Rumi Numeral Symbols
@@ -2769,22 +3067,43 @@
0x11080, // 11080..110CF; Kaithi
0x110D0, // 110D0..110FF; Sora Sompeng
0x11100, // 11100..1114F; Chakma
- 0x11150, // unassigned
+ 0x11150, // 11150..1117F; Mahajani
0x11180, // 11180..111DF; Sharada
- 0x111E0, // unassigned
+ 0x111E0, // 111E0..111FF; Sinhala Archaic Numbers
+ 0x11200, // 11200..1124F; Khojki
+ 0x11250, // unassigned
+ 0x112B0, // 112B0..112FF; Khudawadi
+ 0x11300, // 11300..1137F; Grantha
+ 0x11380, // unassigned
+ 0x11480, // 11480..114DF; Tirhuta
+ 0x114E0, // unassigned
+ 0x11580, // 11580..115FF; Siddham
+ 0x11600, // 11600..1165F; Modi
+ 0x11660, // unassigned
0x11680, // 11680..116CF; Takri
0x116D0, // unassigned
+ 0x118A0, // 118A0..118FF; Warang Citi
+ 0x11900, // unassigned
+ 0x11AC0, // 11AC0..11AFF; Pau Cin Hau
+ 0x11B00, // unassigned
0x12000, // 12000..123FF; Cuneiform
0x12400, // 12400..1247F; Cuneiform Numbers and Punctuation
0x12480, // unassigned
0x13000, // 13000..1342F; Egyptian Hieroglyphs
0x13430, // unassigned
0x16800, // 16800..16A3F; Bamum Supplement
- 0x16A40, // unassigned
+ 0x16A40, // 16A40..16A6F; Mro
+ 0x16A70, // unassigned
+ 0x16AD0, // 16AD0..16AFF; Bassa Vah
+ 0x16B00, // 16B00..16B8F; Pahawh Hmong
+ 0x16B90, // unassigned
0x16F00, // 16F00..16F9F; Miao
0x16FA0, // unassigned
0x1B000, // 1B000..1B0FF; Kana Supplement
0x1B100, // unassigned
+ 0x1BC00, // 1BC00..1BC9F; Duployan
+ 0x1BCA0, // 1BCA0..1BCAF; Shorthand Format Controls
+ 0x1BCB0, // unassigned
0x1D000, // 1D000..1D0FF; Byzantine Musical Symbols
0x1D100, // 1D100..1D1FF; Musical Symbols
0x1D200, // 1D200..1D24F; Ancient Greek Musical Notation
@@ -2794,6 +3113,8 @@
0x1D380, // unassigned
0x1D400, // 1D400..1D7FF; Mathematical Alphanumeric Symbols
0x1D800, // unassigned
+ 0x1E800, // 1E800..1E8DF; Mende Kikakui
+ 0x1E8E0, // unassigned
0x1EE00, // 1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols
0x1EF00, // unassigned
0x1F000, // 1F000..1F02F; Mahjong Tiles
@@ -2803,10 +3124,12 @@
0x1F200, // 1F200..1F2FF; Enclosed Ideographic Supplement
0x1F300, // 1F300..1F5FF; Miscellaneous Symbols And Pictographs
0x1F600, // 1F600..1F64F; Emoticons
- 0x1F650, // unassigned
+ 0x1F650, // 1F650..1F67F; Ornamental Dingbats
0x1F680, // 1F680..1F6FF; Transport And Map Symbols
0x1F700, // 1F700..1F77F; Alchemical Symbols
- 0x1F780, // unassigned
+ 0x1F780, // 1F780..1F7FF; Geometric Shapes Extended
+ 0x1F800, // 1F800..1F8FF; Supplemental Arrows-C
+ 0x1F900, // unassigned
0x20000, // 20000..2A6DF; CJK Unified Ideographs Extension B
0x2A6E0, // unassigned
0x2A700, // 2A700..2B73F; CJK Unified Ideographs Extension C
@@ -2879,7 +3202,7 @@
KHMER_SYMBOLS,
BUGINESE,
TAI_THAM,
- null,
+ COMBINING_DIACRITICAL_MARKS_EXTENDED,
BALINESE,
SUNDANESE,
BATAK,
@@ -2960,12 +3283,13 @@
REJANG,
HANGUL_JAMO_EXTENDED_A,
JAVANESE,
- null,
+ MYANMAR_EXTENDED_B,
CHAM,
MYANMAR_EXTENDED_A,
TAI_VIET,
MEETEI_MAYEK_EXTENSIONS,
ETHIOPIC_EXTENDED_A,
+ LATIN_EXTENDED_E,
null,
MEETEI_MAYEK,
HANGUL_SYLLABLES,
@@ -2994,10 +3318,10 @@
null,
LYCIAN,
CARIAN,
- null,
+ COPTIC_EPACT_NUMBERS,
OLD_ITALIC,
GOTHIC,
- null,
+ OLD_PERMIC,
UGARITIC,
OLD_PERSIAN,
null,
@@ -3005,8 +3329,15 @@
SHAVIAN,
OSMANYA,
null,
+ ELBASAN,
+ CAUCASIAN_ALBANIAN,
+ null,
+ LINEAR_A,
+ null,
CYPRIOT_SYLLABARY,
IMPERIAL_ARAMAIC,
+ PALMYRENE,
+ NABATAEAN,
null,
PHOENICIAN,
LYDIAN,
@@ -3015,10 +3346,13 @@
MEROITIC_CURSIVE,
KHAROSHTHI,
OLD_SOUTH_ARABIAN,
+ OLD_NORTH_ARABIAN,
null,
+ MANICHAEAN,
AVESTAN,
INSCRIPTIONAL_PARTHIAN,
INSCRIPTIONAL_PAHLAVI,
+ PSALTER_PAHLAVI,
null,
OLD_TURKIC,
null,
@@ -3028,22 +3362,43 @@
KAITHI,
SORA_SOMPENG,
CHAKMA,
+ MAHAJANI,
+ SHARADA,
+ SINHALA_ARCHAIC_NUMBERS,
+ KHOJKI,
null,
- SHARADA,
+ KHUDAWADI,
+ GRANTHA,
+ null,
+ TIRHUTA,
+ null,
+ SIDDHAM,
+ MODI,
null,
TAKRI,
null,
+ WARANG_CITI,
+ null,
+ PAU_CIN_HAU,
+ null,
CUNEIFORM,
CUNEIFORM_NUMBERS_AND_PUNCTUATION,
null,
EGYPTIAN_HIEROGLYPHS,
null,
BAMUM_SUPPLEMENT,
+ MRO,
+ null,
+ BASSA_VAH,
+ PAHAWH_HMONG,
null,
MIAO,
null,
KANA_SUPPLEMENT,
null,
+ DUPLOYAN,
+ SHORTHAND_FORMAT_CONTROLS,
+ null,
BYZANTINE_MUSICAL_SYMBOLS,
MUSICAL_SYMBOLS,
ANCIENT_GREEK_MUSICAL_NOTATION,
@@ -3053,6 +3408,8 @@
null,
MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
null,
+ MENDE_KIKAKUI,
+ null,
ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS,
null,
MAHJONG_TILES,
@@ -3062,9 +3419,11 @@
ENCLOSED_IDEOGRAPHIC_SUPPLEMENT,
MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS,
EMOTICONS,
- null,
+ ORNAMENTAL_DINGBATS,
TRANSPORT_AND_MAP_SYMBOLS,
ALCHEMICAL_SYMBOLS,
+ GEOMETRIC_SHAPES_EXTENDED,
+ SUPPLEMENTAL_ARROWS_C,
null,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
null,
@@ -3677,40 +4036,185 @@
/**
* Unicode script "Meroitic Hieroglyphs".
+ * @since 1.8
*/
MEROITIC_HIEROGLYPHS,
/**
* Unicode script "Meroitic Cursive".
+ * @since 1.8
*/
MEROITIC_CURSIVE,
/**
* Unicode script "Sora Sompeng".
+ * @since 1.8
*/
SORA_SOMPENG,
/**
* Unicode script "Chakma".
+ * @since 1.8
*/
CHAKMA,
/**
* Unicode script "Sharada".
+ * @since 1.8
*/
SHARADA,
/**
* Unicode script "Takri".
+ * @since 1.8
*/
TAKRI,
/**
* Unicode script "Miao".
+ * @since 1.8
*/
MIAO,
/**
+ * Unicode script "Caucasian Albanian".
+ * @since 1.9
+ */
+ CAUCASIAN_ALBANIAN,
+
+ /**
+ * Unicode script "Bassa Vah".
+ * @since 1.9
+ */
+ BASSA_VAH,
+
+ /**
+ * Unicode script "Duployan".
+ * @since 1.9
+ */
+ DUPLOYAN,
+
+ /**
+ * Unicode script "Elbasan".
+ * @since 1.9
+ */
+ ELBASAN,
+
+ /**
+ * Unicode script "Grantha".
+ * @since 1.9
+ */
+ GRANTHA,
+
+ /**
+ * Unicode script "Pahawh Hmong".
+ * @since 1.9
+ */
+ PAHAWH_HMONG,
+
+ /**
+ * Unicode script "Khojki".
+ * @since 1.9
+ */
+ KHOJKI,
+
+ /**
+ * Unicode script "Linear A".
+ * @since 1.9
+ */
+ LINEAR_A,
+
+ /**
+ * Unicode script "Mahajani".
+ * @since 1.9
+ */
+ MAHAJANI,
+
+ /**
+ * Unicode script "Manichaean".
+ * @since 1.9
+ */
+ MANICHAEAN,
+
+ /**
+ * Unicode script "Mende Kikakui".
+ * @since 1.9
+ */
+ MENDE_KIKAKUI,
+
+ /**
+ * Unicode script "Modi".
+ * @since 1.9
+ */
+ MODI,
+
+ /**
+ * Unicode script "Mro".
+ * @since 1.9
+ */
+ MRO,
+
+ /**
+ * Unicode script "Old North Arabian".
+ * @since 1.9
+ */
+ OLD_NORTH_ARABIAN,
+
+ /**
+ * Unicode script "Nabataean".
+ * @since 1.9
+ */
+ NABATAEAN,
+
+ /**
+ * Unicode script "Palmyrene".
+ * @since 1.9
+ */
+ PALMYRENE,
+
+ /**
+ * Unicode script "Pau Cin Hau".
+ * @since 1.9
+ */
+ PAU_CIN_HAU,
+
+ /**
+ * Unicode script "Old Permic".
+ * @since 1.9
+ */
+ OLD_PERMIC,
+
+ /**
+ * Unicode script "Psalter Pahlavi".
+ * @since 1.9
+ */
+ PSALTER_PAHLAVI,
+
+ /**
+ * Unicode script "Siddham".
+ * @since 1.9
+ */
+ SIDDHAM,
+
+ /**
+ * Unicode script "Khudawadi".
+ * @since 1.9
+ */
+ KHUDAWADI,
+
+ /**
+ * Unicode script "Tirhuta".
+ * @since 1.9
+ */
+ TIRHUTA,
+
+ /**
+ * Unicode script "Warang Citi".
+ * @since 1.9
+ */
+ WARANG_CITI,
+
+ /**
* Unicode script "Unknown".
*/
UNKNOWN;
@@ -3721,14 +4225,14 @@
0x005B, // 005B..0060; COMMON
0x0061, // 0061..007A; LATIN
0x007B, // 007B..00A9; COMMON
- 0x00AA, // 00AA..00AA; LATIN
+ 0x00AA, // 00AA ; LATIN
0x00AB, // 00AB..00B9; COMMON
- 0x00BA, // 00BA..00BA; LATIN
+ 0x00BA, // 00BA ; LATIN
0x00BB, // 00BB..00BF; COMMON
0x00C0, // 00C0..00D6; LATIN
- 0x00D7, // 00D7..00D7; COMMON
+ 0x00D7, // 00D7 ; COMMON
0x00D8, // 00D8..00F6; LATIN
- 0x00F7, // 00F7..00F7; COMMON
+ 0x00F7, // 00F7 ; COMMON
0x00F8, // 00F8..02B8; LATIN
0x02B9, // 02B9..02DF; COMMON
0x02E0, // 02E0..02E4; LATIN
@@ -3737,284 +4241,1178 @@
0x02EC, // 02EC..02FF; COMMON
0x0300, // 0300..036F; INHERITED
0x0370, // 0370..0373; GREEK
- 0x0374, // 0374..0374; COMMON
- 0x0375, // 0375..037D; GREEK
- 0x037E, // 037E..0383; COMMON
- 0x0384, // 0384..0384; GREEK
- 0x0385, // 0385..0385; COMMON
- 0x0386, // 0386..0386; GREEK
- 0x0387, // 0387..0387; COMMON
- 0x0388, // 0388..03E1; GREEK
+ 0x0374, // 0374 ; COMMON
+ 0x0375, // 0375..0377; GREEK
+ 0x0378, // 0378..0379; UNKNOWN
+ 0x037A, // 037A..037D; GREEK
+ 0x037E, // 037E ; COMMON
+ 0x037F, // 037F ; GREEK
+ 0x0380, // 0380..0383; UNKNOWN
+ 0x0384, // 0384 ; GREEK
+ 0x0385, // 0385 ; COMMON
+ 0x0386, // 0386 ; GREEK
+ 0x0387, // 0387 ; COMMON
+ 0x0388, // 0388..038A; GREEK
+ 0x038B, // 038B ; UNKNOWN
+ 0x038C, // 038C ; GREEK
+ 0x038D, // 038D ; UNKNOWN
+ 0x038E, // 038E..03A1; GREEK
+ 0x03A2, // 03A2 ; UNKNOWN
+ 0x03A3, // 03A3..03E1; GREEK
0x03E2, // 03E2..03EF; COPTIC
0x03F0, // 03F0..03FF; GREEK
0x0400, // 0400..0484; CYRILLIC
0x0485, // 0485..0486; INHERITED
- 0x0487, // 0487..0530; CYRILLIC
- 0x0531, // 0531..0588; ARMENIAN
- 0x0589, // 0589..0589; COMMON
- 0x058A, // 058A..0590; ARMENIAN
- 0x0591, // 0591..05FF; HEBREW
- 0x0600, // 0600..060B; ARABIC
- 0x060C, // 060C..060C; COMMON
+ 0x0487, // 0487..052F; CYRILLIC
+ 0x0530, // 0530 ; UNKNOWN
+ 0x0531, // 0531..0556; ARMENIAN
+ 0x0557, // 0557..0558; UNKNOWN
+ 0x0559, // 0559..055F; ARMENIAN
+ 0x0560, // 0560 ; UNKNOWN
+ 0x0561, // 0561..0587; ARMENIAN
+ 0x0588, // 0588 ; UNKNOWN
+ 0x0589, // 0589 ; COMMON
+ 0x058A, // 058A ; ARMENIAN
+ 0x058B, // 058B..058C; UNKNOWN
+ 0x058D, // 058D..058F; ARMENIAN
+ 0x0590, // 0590 ; UNKNOWN
+ 0x0591, // 0591..05C7; HEBREW
+ 0x05C8, // 05C8..05CF; UNKNOWN
+ 0x05D0, // 05D0..05EA; HEBREW
+ 0x05EB, // 05EB..05EF; UNKNOWN
+ 0x05F0, // 05F0..05F4; HEBREW
+ 0x05F5, // 05F5..05FF; UNKNOWN
+ 0x0600, // 0600..0604; ARABIC
+ 0x0605, // 0605 ; COMMON
+ 0x0606, // 0606..060B; ARABIC
+ 0x060C, // 060C ; COMMON
0x060D, // 060D..061A; ARABIC
- 0x061B, // 061B..061D; COMMON
- 0x061E, // 061E..061E; ARABIC
- 0x061F, // 061F..061F; COMMON
+ 0x061B, // 061B..061C; COMMON
+ 0x061D, // 061D ; UNKNOWN
+ 0x061E, // 061E ; ARABIC
+ 0x061F, // 061F ; COMMON
0x0620, // 0620..063F; ARABIC
- 0x0640, // 0640..0640; COMMON
+ 0x0640, // 0640 ; COMMON
0x0641, // 0641..064A; ARABIC
0x064B, // 064B..0655; INHERITED
0x0656, // 0656..065F; ARABIC
0x0660, // 0660..0669; COMMON
0x066A, // 066A..066F; ARABIC
- 0x0670, // 0670..0670; INHERITED
+ 0x0670, // 0670 ; INHERITED
0x0671, // 0671..06DC; ARABIC
- 0x06DD, // 06DD..06DD; COMMON
+ 0x06DD, // 06DD ; COMMON
0x06DE, // 06DE..06FF; ARABIC
- 0x0700, // 0700..074F; SYRIAC
+ 0x0700, // 0700..070D; SYRIAC
+ 0x070E, // 070E ; UNKNOWN
+ 0x070F, // 070F..074A; SYRIAC
+ 0x074B, // 074B..074C; UNKNOWN
+ 0x074D, // 074D..074F; SYRIAC
0x0750, // 0750..077F; ARABIC
- 0x0780, // 0780..07BF; THAANA
- 0x07C0, // 07C0..07FF; NKO
- 0x0800, // 0800..083F; SAMARITAN
- 0x0840, // 0840..089F; MANDAIC
- 0x08A0, // 08A0..08FF; ARABIC
+ 0x0780, // 0780..07B1; THAANA
+ 0x07B2, // 07B2..07BF; UNKNOWN
+ 0x07C0, // 07C0..07FA; NKO
+ 0x07FB, // 07FB..07FF; UNKNOWN
+ 0x0800, // 0800..082D; SAMARITAN
+ 0x082E, // 082E..082F; UNKNOWN
+ 0x0830, // 0830..083E; SAMARITAN
+ 0x083F, // 083F ; UNKNOWN
+ 0x0840, // 0840..085B; MANDAIC
+ 0x085C, // 085C..085D; UNKNOWN
+ 0x085E, // 085E ; MANDAIC
+ 0x085F, // 085F..089F; UNKNOWN
+ 0x08A0, // 08A0..08B2; ARABIC
+ 0x08B3, // 08B3..08E3; UNKNOWN
+ 0x08E4, // 08E4..08FF; ARABIC
0x0900, // 0900..0950; DEVANAGARI
0x0951, // 0951..0952; INHERITED
0x0953, // 0953..0963; DEVANAGARI
0x0964, // 0964..0965; COMMON
- 0x0966, // 0966..0980; DEVANAGARI
- 0x0981, // 0981..0A00; BENGALI
- 0x0A01, // 0A01..0A80; GURMUKHI
- 0x0A81, // 0A81..0B00; GUJARATI
- 0x0B01, // 0B01..0B81; ORIYA
- 0x0B82, // 0B82..0C00; TAMIL
- 0x0C01, // 0C01..0C81; TELUGU
- 0x0C82, // 0C82..0CF0; KANNADA
- 0x0D02, // 0D02..0D81; MALAYALAM
- 0x0D82, // 0D82..0E00; SINHALA
- 0x0E01, // 0E01..0E3E; THAI
- 0x0E3F, // 0E3F..0E3F; COMMON
- 0x0E40, // 0E40..0E80; THAI
- 0x0E81, // 0E81..0EFF; LAO
- 0x0F00, // 0F00..0FD4; TIBETAN
+ 0x0966, // 0966..097F; DEVANAGARI
+ 0x0980, // 0980..0983; BENGALI
+ 0x0984, // 0984 ; UNKNOWN
+ 0x0985, // 0985..098C; BENGALI
+ 0x098D, // 098D..098E; UNKNOWN
+ 0x098F, // 098F..0990; BENGALI
+ 0x0991, // 0991..0992; UNKNOWN
+ 0x0993, // 0993..09A8; BENGALI
+ 0x09A9, // 09A9 ; UNKNOWN
+ 0x09AA, // 09AA..09B0; BENGALI
+ 0x09B1, // 09B1 ; UNKNOWN
+ 0x09B2, // 09B2 ; BENGALI
+ 0x09B3, // 09B3..09B5; UNKNOWN
+ 0x09B6, // 09B6..09B9; BENGALI
+ 0x09BA, // 09BA..09BB; UNKNOWN
+ 0x09BC, // 09BC..09C4; BENGALI
+ 0x09C5, // 09C5..09C6; UNKNOWN
+ 0x09C7, // 09C7..09C8; BENGALI
+ 0x09C9, // 09C9..09CA; UNKNOWN
+ 0x09CB, // 09CB..09CE; BENGALI
+ 0x09CF, // 09CF..09D6; UNKNOWN
+ 0x09D7, // 09D7 ; BENGALI
+ 0x09D8, // 09D8..09DB; UNKNOWN
+ 0x09DC, // 09DC..09DD; BENGALI
+ 0x09DE, // 09DE ; UNKNOWN
+ 0x09DF, // 09DF..09E3; BENGALI
+ 0x09E4, // 09E4..09E5; UNKNOWN
+ 0x09E6, // 09E6..09FB; BENGALI
+ 0x09FC, // 09FC..0A00; UNKNOWN
+ 0x0A01, // 0A01..0A03; GURMUKHI
+ 0x0A04, // 0A04 ; UNKNOWN
+ 0x0A05, // 0A05..0A0A; GURMUKHI
+ 0x0A0B, // 0A0B..0A0E; UNKNOWN
+ 0x0A0F, // 0A0F..0A10; GURMUKHI
+ 0x0A11, // 0A11..0A12; UNKNOWN
+ 0x0A13, // 0A13..0A28; GURMUKHI
+ 0x0A29, // 0A29 ; UNKNOWN
+ 0x0A2A, // 0A2A..0A30; GURMUKHI
+ 0x0A31, // 0A31 ; UNKNOWN
+ 0x0A32, // 0A32..0A33; GURMUKHI
+ 0x0A34, // 0A34 ; UNKNOWN
+ 0x0A35, // 0A35..0A36; GURMUKHI
+ 0x0A37, // 0A37 ; UNKNOWN
+ 0x0A38, // 0A38..0A39; GURMUKHI
+ 0x0A3A, // 0A3A..0A3B; UNKNOWN
+ 0x0A3C, // 0A3C ; GURMUKHI
+ 0x0A3D, // 0A3D ; UNKNOWN
+ 0x0A3E, // 0A3E..0A42; GURMUKHI
+ 0x0A43, // 0A43..0A46; UNKNOWN
+ 0x0A47, // 0A47..0A48; GURMUKHI
+ 0x0A49, // 0A49..0A4A; UNKNOWN
+ 0x0A4B, // 0A4B..0A4D; GURMUKHI
+ 0x0A4E, // 0A4E..0A50; UNKNOWN
+ 0x0A51, // 0A51 ; GURMUKHI
+ 0x0A52, // 0A52..0A58; UNKNOWN
+ 0x0A59, // 0A59..0A5C; GURMUKHI
+ 0x0A5D, // 0A5D ; UNKNOWN
+ 0x0A5E, // 0A5E ; GURMUKHI
+ 0x0A5F, // 0A5F..0A65; UNKNOWN
+ 0x0A66, // 0A66..0A75; GURMUKHI
+ 0x0A76, // 0A76..0A80; UNKNOWN
+ 0x0A81, // 0A81..0A83; GUJARATI
+ 0x0A84, // 0A84 ; UNKNOWN
+ 0x0A85, // 0A85..0A8D; GUJARATI
+ 0x0A8E, // 0A8E ; UNKNOWN
+ 0x0A8F, // 0A8F..0A91; GUJARATI
+ 0x0A92, // 0A92 ; UNKNOWN
+ 0x0A93, // 0A93..0AA8; GUJARATI
+ 0x0AA9, // 0AA9 ; UNKNOWN
+ 0x0AAA, // 0AAA..0AB0; GUJARATI
+ 0x0AB1, // 0AB1 ; UNKNOWN
+ 0x0AB2, // 0AB2..0AB3; GUJARATI
+ 0x0AB4, // 0AB4 ; UNKNOWN
+ 0x0AB5, // 0AB5..0AB9; GUJARATI
+ 0x0ABA, // 0ABA..0ABB; UNKNOWN
+ 0x0ABC, // 0ABC..0AC5; GUJARATI
+ 0x0AC6, // 0AC6 ; UNKNOWN
+ 0x0AC7, // 0AC7..0AC9; GUJARATI
+ 0x0ACA, // 0ACA ; UNKNOWN
+ 0x0ACB, // 0ACB..0ACD; GUJARATI
+ 0x0ACE, // 0ACE..0ACF; UNKNOWN
+ 0x0AD0, // 0AD0 ; GUJARATI
+ 0x0AD1, // 0AD1..0ADF; UNKNOWN
+ 0x0AE0, // 0AE0..0AE3; GUJARATI
+ 0x0AE4, // 0AE4..0AE5; UNKNOWN
+ 0x0AE6, // 0AE6..0AF1; GUJARATI
+ 0x0AF2, // 0AF2..0B00; UNKNOWN
+ 0x0B01, // 0B01..0B03; ORIYA
+ 0x0B04, // 0B04 ; UNKNOWN
+ 0x0B05, // 0B05..0B0C; ORIYA
+ 0x0B0D, // 0B0D..0B0E; UNKNOWN
+ 0x0B0F, // 0B0F..0B10; ORIYA
+ 0x0B11, // 0B11..0B12; UNKNOWN
+ 0x0B13, // 0B13..0B28; ORIYA
+ 0x0B29, // 0B29 ; UNKNOWN
+ 0x0B2A, // 0B2A..0B30; ORIYA
+ 0x0B31, // 0B31 ; UNKNOWN
+ 0x0B32, // 0B32..0B33; ORIYA
+ 0x0B34, // 0B34 ; UNKNOWN
+ 0x0B35, // 0B35..0B39; ORIYA
+ 0x0B3A, // 0B3A..0B3B; UNKNOWN
+ 0x0B3C, // 0B3C..0B44; ORIYA
+ 0x0B45, // 0B45..0B46; UNKNOWN
+ 0x0B47, // 0B47..0B48; ORIYA
+ 0x0B49, // 0B49..0B4A; UNKNOWN
+ 0x0B4B, // 0B4B..0B4D; ORIYA
+ 0x0B4E, // 0B4E..0B55; UNKNOWN
+ 0x0B56, // 0B56..0B57; ORIYA
+ 0x0B58, // 0B58..0B5B; UNKNOWN
+ 0x0B5C, // 0B5C..0B5D; ORIYA
+ 0x0B5E, // 0B5E ; UNKNOWN
+ 0x0B5F, // 0B5F..0B63; ORIYA
+ 0x0B64, // 0B64..0B65; UNKNOWN
+ 0x0B66, // 0B66..0B77; ORIYA
+ 0x0B78, // 0B78..0B81; UNKNOWN
+ 0x0B82, // 0B82..0B83; TAMIL
+ 0x0B84, // 0B84 ; UNKNOWN
+ 0x0B85, // 0B85..0B8A; TAMIL
+ 0x0B8B, // 0B8B..0B8D; UNKNOWN
+ 0x0B8E, // 0B8E..0B90; TAMIL
+ 0x0B91, // 0B91 ; UNKNOWN
+ 0x0B92, // 0B92..0B95; TAMIL
+ 0x0B96, // 0B96..0B98; UNKNOWN
+ 0x0B99, // 0B99..0B9A; TAMIL
+ 0x0B9B, // 0B9B ; UNKNOWN
+ 0x0B9C, // 0B9C ; TAMIL
+ 0x0B9D, // 0B9D ; UNKNOWN
+ 0x0B9E, // 0B9E..0B9F; TAMIL
+ 0x0BA0, // 0BA0..0BA2; UNKNOWN
+ 0x0BA3, // 0BA3..0BA4; TAMIL
+ 0x0BA5, // 0BA5..0BA7; UNKNOWN
+ 0x0BA8, // 0BA8..0BAA; TAMIL
+ 0x0BAB, // 0BAB..0BAD; UNKNOWN
+ 0x0BAE, // 0BAE..0BB9; TAMIL
+ 0x0BBA, // 0BBA..0BBD; UNKNOWN
+ 0x0BBE, // 0BBE..0BC2; TAMIL
+ 0x0BC3, // 0BC3..0BC5; UNKNOWN
+ 0x0BC6, // 0BC6..0BC8; TAMIL
+ 0x0BC9, // 0BC9 ; UNKNOWN
+ 0x0BCA, // 0BCA..0BCD; TAMIL
+ 0x0BCE, // 0BCE..0BCF; UNKNOWN
+ 0x0BD0, // 0BD0 ; TAMIL
+ 0x0BD1, // 0BD1..0BD6; UNKNOWN
+ 0x0BD7, // 0BD7 ; TAMIL
+ 0x0BD8, // 0BD8..0BE5; UNKNOWN
+ 0x0BE6, // 0BE6..0BFA; TAMIL
+ 0x0BFB, // 0BFB..0BFF; UNKNOWN
+ 0x0C00, // 0C00..0C03; TELUGU
+ 0x0C04, // 0C04 ; UNKNOWN
+ 0x0C05, // 0C05..0C0C; TELUGU
+ 0x0C0D, // 0C0D ; UNKNOWN
+ 0x0C0E, // 0C0E..0C10; TELUGU
+ 0x0C11, // 0C11 ; UNKNOWN
+ 0x0C12, // 0C12..0C28; TELUGU
+ 0x0C29, // 0C29 ; UNKNOWN
+ 0x0C2A, // 0C2A..0C39; TELUGU
+ 0x0C3A, // 0C3A..0C3C; UNKNOWN
+ 0x0C3D, // 0C3D..0C44; TELUGU
+ 0x0C45, // 0C45 ; UNKNOWN
+ 0x0C46, // 0C46..0C48; TELUGU
+ 0x0C49, // 0C49 ; UNKNOWN
+ 0x0C4A, // 0C4A..0C4D; TELUGU
+ 0x0C4E, // 0C4E..0C54; UNKNOWN
+ 0x0C55, // 0C55..0C56; TELUGU
+ 0x0C57, // 0C57 ; UNKNOWN
+ 0x0C58, // 0C58..0C59; TELUGU
+ 0x0C5A, // 0C5A..0C5F; UNKNOWN
+ 0x0C60, // 0C60..0C63; TELUGU
+ 0x0C64, // 0C64..0C65; UNKNOWN
+ 0x0C66, // 0C66..0C6F; TELUGU
+ 0x0C70, // 0C70..0C77; UNKNOWN
+ 0x0C78, // 0C78..0C7F; TELUGU
+ 0x0C80, // 0C80 ; UNKNOWN
+ 0x0C81, // 0C81..0C83; KANNADA
+ 0x0C84, // 0C84 ; UNKNOWN
+ 0x0C85, // 0C85..0C8C; KANNADA
+ 0x0C8D, // 0C8D ; UNKNOWN
+ 0x0C8E, // 0C8E..0C90; KANNADA
+ 0x0C91, // 0C91 ; UNKNOWN
+ 0x0C92, // 0C92..0CA8; KANNADA
+ 0x0CA9, // 0CA9 ; UNKNOWN
+ 0x0CAA, // 0CAA..0CB3; KANNADA
+ 0x0CB4, // 0CB4 ; UNKNOWN
+ 0x0CB5, // 0CB5..0CB9; KANNADA
+ 0x0CBA, // 0CBA..0CBB; UNKNOWN
+ 0x0CBC, // 0CBC..0CC4; KANNADA
+ 0x0CC5, // 0CC5 ; UNKNOWN
+ 0x0CC6, // 0CC6..0CC8; KANNADA
+ 0x0CC9, // 0CC9 ; UNKNOWN
+ 0x0CCA, // 0CCA..0CCD; KANNADA
+ 0x0CCE, // 0CCE..0CD4; UNKNOWN
+ 0x0CD5, // 0CD5..0CD6; KANNADA
+ 0x0CD7, // 0CD7..0CDD; UNKNOWN
+ 0x0CDE, // 0CDE ; KANNADA
+ 0x0CDF, // 0CDF ; UNKNOWN
+ 0x0CE0, // 0CE0..0CE3; KANNADA
+ 0x0CE4, // 0CE4..0CE5; UNKNOWN
+ 0x0CE6, // 0CE6..0CEF; KANNADA
+ 0x0CF0, // 0CF0 ; UNKNOWN
+ 0x0CF1, // 0CF1..0CF2; KANNADA
+ 0x0CF3, // 0CF3..0D00; UNKNOWN
+ 0x0D01, // 0D01..0D03; MALAYALAM
+ 0x0D04, // 0D04 ; UNKNOWN
+ 0x0D05, // 0D05..0D0C; MALAYALAM
+ 0x0D0D, // 0D0D ; UNKNOWN
+ 0x0D0E, // 0D0E..0D10; MALAYALAM
+ 0x0D11, // 0D11 ; UNKNOWN
+ 0x0D12, // 0D12..0D3A; MALAYALAM
+ 0x0D3B, // 0D3B..0D3C; UNKNOWN
+ 0x0D3D, // 0D3D..0D44; MALAYALAM
+ 0x0D45, // 0D45 ; UNKNOWN
+ 0x0D46, // 0D46..0D48; MALAYALAM
+ 0x0D49, // 0D49 ; UNKNOWN
+ 0x0D4A, // 0D4A..0D4E; MALAYALAM
+ 0x0D4F, // 0D4F..0D56; UNKNOWN
+ 0x0D57, // 0D57 ; MALAYALAM
+ 0x0D58, // 0D58..0D5F; UNKNOWN
+ 0x0D60, // 0D60..0D63; MALAYALAM
+ 0x0D64, // 0D64..0D65; UNKNOWN
+ 0x0D66, // 0D66..0D75; MALAYALAM
+ 0x0D76, // 0D76..0D78; UNKNOWN
+ 0x0D79, // 0D79..0D7F; MALAYALAM
+ 0x0D80, // 0D80..0D81; UNKNOWN
+ 0x0D82, // 0D82..0D83; SINHALA
+ 0x0D84, // 0D84 ; UNKNOWN
+ 0x0D85, // 0D85..0D96; SINHALA
+ 0x0D97, // 0D97..0D99; UNKNOWN
+ 0x0D9A, // 0D9A..0DB1; SINHALA
+ 0x0DB2, // 0DB2 ; UNKNOWN
+ 0x0DB3, // 0DB3..0DBB; SINHALA
+ 0x0DBC, // 0DBC ; UNKNOWN
+ 0x0DBD, // 0DBD ; SINHALA
+ 0x0DBE, // 0DBE..0DBF; UNKNOWN
+ 0x0DC0, // 0DC0..0DC6; SINHALA
+ 0x0DC7, // 0DC7..0DC9; UNKNOWN
+ 0x0DCA, // 0DCA ; SINHALA
+ 0x0DCB, // 0DCB..0DCE; UNKNOWN
+ 0x0DCF, // 0DCF..0DD4; SINHALA
+ 0x0DD5, // 0DD5 ; UNKNOWN
+ 0x0DD6, // 0DD6 ; SINHALA
+ 0x0DD7, // 0DD7 ; UNKNOWN
+ 0x0DD8, // 0DD8..0DDF; SINHALA
+ 0x0DE0, // 0DE0..0DE5; UNKNOWN
+ 0x0DE6, // 0DE6..0DEF; SINHALA
+ 0x0DF0, // 0DF0..0DF1; UNKNOWN
+ 0x0DF2, // 0DF2..0DF4; SINHALA
+ 0x0DF5, // 0DF5..0E00; UNKNOWN
+ 0x0E01, // 0E01..0E3A; THAI
+ 0x0E3B, // 0E3B..0E3E; UNKNOWN
+ 0x0E3F, // 0E3F ; COMMON
+ 0x0E40, // 0E40..0E5B; THAI
+ 0x0E5C, // 0E5C..0E80; UNKNOWN
+ 0x0E81, // 0E81..0E82; LAO
+ 0x0E83, // 0E83 ; UNKNOWN
+ 0x0E84, // 0E84 ; LAO
+ 0x0E85, // 0E85..0E86; UNKNOWN
+ 0x0E87, // 0E87..0E88; LAO
+ 0x0E89, // 0E89 ; UNKNOWN
+ 0x0E8A, // 0E8A ; LAO
+ 0x0E8B, // 0E8B..0E8C; UNKNOWN
+ 0x0E8D, // 0E8D ; LAO
+ 0x0E8E, // 0E8E..0E93; UNKNOWN
+ 0x0E94, // 0E94..0E97; LAO
+ 0x0E98, // 0E98 ; UNKNOWN
+ 0x0E99, // 0E99..0E9F; LAO
+ 0x0EA0, // 0EA0 ; UNKNOWN
+ 0x0EA1, // 0EA1..0EA3; LAO
+ 0x0EA4, // 0EA4 ; UNKNOWN
+ 0x0EA5, // 0EA5 ; LAO
+ 0x0EA6, // 0EA6 ; UNKNOWN
+ 0x0EA7, // 0EA7 ; LAO
+ 0x0EA8, // 0EA8..0EA9; UNKNOWN
+ 0x0EAA, // 0EAA..0EAB; LAO
+ 0x0EAC, // 0EAC ; UNKNOWN
+ 0x0EAD, // 0EAD..0EB9; LAO
+ 0x0EBA, // 0EBA ; UNKNOWN
+ 0x0EBB, // 0EBB..0EBD; LAO
+ 0x0EBE, // 0EBE..0EBF; UNKNOWN
+ 0x0EC0, // 0EC0..0EC4; LAO
+ 0x0EC5, // 0EC5 ; UNKNOWN
+ 0x0EC6, // 0EC6 ; LAO
+ 0x0EC7, // 0EC7 ; UNKNOWN
+ 0x0EC8, // 0EC8..0ECD; LAO
+ 0x0ECE, // 0ECE..0ECF; UNKNOWN
+ 0x0ED0, // 0ED0..0ED9; LAO
+ 0x0EDA, // 0EDA..0EDB; UNKNOWN
+ 0x0EDC, // 0EDC..0EDF; LAO
+ 0x0EE0, // 0EE0..0EFF; UNKNOWN
+ 0x0F00, // 0F00..0F47; TIBETAN
+ 0x0F48, // 0F48 ; UNKNOWN
+ 0x0F49, // 0F49..0F6C; TIBETAN
+ 0x0F6D, // 0F6D..0F70; UNKNOWN
+ 0x0F71, // 0F71..0F97; TIBETAN
+ 0x0F98, // 0F98 ; UNKNOWN
+ 0x0F99, // 0F99..0FBC; TIBETAN
+ 0x0FBD, // 0FBD ; UNKNOWN
+ 0x0FBE, // 0FBE..0FCC; TIBETAN
+ 0x0FCD, // 0FCD ; UNKNOWN
+ 0x0FCE, // 0FCE..0FD4; TIBETAN
0x0FD5, // 0FD5..0FD8; COMMON
- 0x0FD9, // 0FD9..0FFF; TIBETAN
+ 0x0FD9, // 0FD9..0FDA; TIBETAN
+ 0x0FDB, // 0FDB..0FFF; UNKNOWN
0x1000, // 1000..109F; MYANMAR
- 0x10A0, // 10A0..10FA; GEORGIAN
- 0x10FB, // 10FB..10FB; COMMON
+ 0x10A0, // 10A0..10C5; GEORGIAN
+ 0x10C6, // 10C6 ; UNKNOWN
+ 0x10C7, // 10C7 ; GEORGIAN
+ 0x10C8, // 10C8..10CC; UNKNOWN
+ 0x10CD, // 10CD ; GEORGIAN
+ 0x10CE, // 10CE..10CF; UNKNOWN
+ 0x10D0, // 10D0..10FA; GEORGIAN
+ 0x10FB, // 10FB ; COMMON
0x10FC, // 10FC..10FF; GEORGIAN
0x1100, // 1100..11FF; HANGUL
- 0x1200, // 1200..139F; ETHIOPIC
- 0x13A0, // 13A0..13FF; CHEROKEE
+ 0x1200, // 1200..1248; ETHIOPIC
+ 0x1249, // 1249 ; UNKNOWN
+ 0x124A, // 124A..124D; ETHIOPIC
+ 0x124E, // 124E..124F; UNKNOWN
+ 0x1250, // 1250..1256; ETHIOPIC
+ 0x1257, // 1257 ; UNKNOWN
+ 0x1258, // 1258 ; ETHIOPIC
+ 0x1259, // 1259 ; UNKNOWN
+ 0x125A, // 125A..125D; ETHIOPIC
+ 0x125E, // 125E..125F; UNKNOWN
+ 0x1260, // 1260..1288; ETHIOPIC
+ 0x1289, // 1289 ; UNKNOWN
+ 0x128A, // 128A..128D; ETHIOPIC
+ 0x128E, // 128E..128F; UNKNOWN
+ 0x1290, // 1290..12B0; ETHIOPIC
+ 0x12B1, // 12B1 ; UNKNOWN
+ 0x12B2, // 12B2..12B5; ETHIOPIC
+ 0x12B6, // 12B6..12B7; UNKNOWN
+ 0x12B8, // 12B8..12BE; ETHIOPIC
+ 0x12BF, // 12BF ; UNKNOWN
+ 0x12C0, // 12C0 ; ETHIOPIC
+ 0x12C1, // 12C1 ; UNKNOWN
+ 0x12C2, // 12C2..12C5; ETHIOPIC
+ 0x12C6, // 12C6..12C7; UNKNOWN
+ 0x12C8, // 12C8..12D6; ETHIOPIC
+ 0x12D7, // 12D7 ; UNKNOWN
+ 0x12D8, // 12D8..1310; ETHIOPIC
+ 0x1311, // 1311 ; UNKNOWN
+ 0x1312, // 1312..1315; ETHIOPIC
+ 0x1316, // 1316..1317; UNKNOWN
+ 0x1318, // 1318..135A; ETHIOPIC
+ 0x135B, // 135B..135C; UNKNOWN
+ 0x135D, // 135D..137C; ETHIOPIC
+ 0x137D, // 137D..137F; UNKNOWN
+ 0x1380, // 1380..1399; ETHIOPIC
+ 0x139A, // 139A..139F; UNKNOWN
+ 0x13A0, // 13A0..13F4; CHEROKEE
+ 0x13F5, // 13F5..13FF; UNKNOWN
0x1400, // 1400..167F; CANADIAN_ABORIGINAL
- 0x1680, // 1680..169F; OGHAM
+ 0x1680, // 1680..169C; OGHAM
+ 0x169D, // 169D..169F; UNKNOWN
0x16A0, // 16A0..16EA; RUNIC
0x16EB, // 16EB..16ED; COMMON
- 0x16EE, // 16EE..16FF; RUNIC
- 0x1700, // 1700..171F; TAGALOG
+ 0x16EE, // 16EE..16F8; RUNIC
+ 0x16F9, // 16F9..16FF; UNKNOWN
+ 0x1700, // 1700..170C; TAGALOG
+ 0x170D, // 170D ; UNKNOWN
+ 0x170E, // 170E..1714; TAGALOG
+ 0x1715, // 1715..171F; UNKNOWN
0x1720, // 1720..1734; HANUNOO
- 0x1735, // 1735..173F; COMMON
- 0x1740, // 1740..175F; BUHID
- 0x1760, // 1760..177F; TAGBANWA
- 0x1780, // 1780..17FF; KHMER
+ 0x1735, // 1735..1736; COMMON
+ 0x1737, // 1737..173F; UNKNOWN
+ 0x1740, // 1740..1753; BUHID
+ 0x1754, // 1754..175F; UNKNOWN
+ 0x1760, // 1760..176C; TAGBANWA
+ 0x176D, // 176D ; UNKNOWN
+ 0x176E, // 176E..1770; TAGBANWA
+ 0x1771, // 1771 ; UNKNOWN
+ 0x1772, // 1772..1773; TAGBANWA
+ 0x1774, // 1774..177F; UNKNOWN
+ 0x1780, // 1780..17DD; KHMER
+ 0x17DE, // 17DE..17DF; UNKNOWN
+ 0x17E0, // 17E0..17E9; KHMER
+ 0x17EA, // 17EA..17EF; UNKNOWN
+ 0x17F0, // 17F0..17F9; KHMER
+ 0x17FA, // 17FA..17FF; UNKNOWN
0x1800, // 1800..1801; MONGOLIAN
0x1802, // 1802..1803; COMMON
- 0x1804, // 1804..1804; MONGOLIAN
- 0x1805, // 1805..1805; COMMON
- 0x1806, // 1806..18AF; MONGOLIAN
- 0x18B0, // 18B0..18FF; CANADIAN_ABORIGINAL
- 0x1900, // 1900..194F; LIMBU
- 0x1950, // 1950..197F; TAI_LE
- 0x1980, // 1980..19DF; NEW_TAI_LUE
+ 0x1804, // 1804 ; MONGOLIAN
+ 0x1805, // 1805 ; COMMON
+ 0x1806, // 1806..180E; MONGOLIAN
+ 0x180F, // 180F ; UNKNOWN
+ 0x1810, // 1810..1819; MONGOLIAN
+ 0x181A, // 181A..181F; UNKNOWN
+ 0x1820, // 1820..1877; MONGOLIAN
+ 0x1878, // 1878..187F; UNKNOWN
+ 0x1880, // 1880..18AA; MONGOLIAN
+ 0x18AB, // 18AB..18AF; UNKNOWN
+ 0x18B0, // 18B0..18F5; CANADIAN_ABORIGINAL
+ 0x18F6, // 18F6..18FF; UNKNOWN
+ 0x1900, // 1900..191E; LIMBU
+ 0x191F, // 191F ; UNKNOWN
+ 0x1920, // 1920..192B; LIMBU
+ 0x192C, // 192C..192F; UNKNOWN
+ 0x1930, // 1930..193B; LIMBU
+ 0x193C, // 193C..193F; UNKNOWN
+ 0x1940, // 1940 ; LIMBU
+ 0x1941, // 1941..1943; UNKNOWN
+ 0x1944, // 1944..194F; LIMBU
+ 0x1950, // 1950..196D; TAI_LE
+ 0x196E, // 196E..196F; UNKNOWN
+ 0x1970, // 1970..1974; TAI_LE
+ 0x1975, // 1975..197F; UNKNOWN
+ 0x1980, // 1980..19AB; NEW_TAI_LUE
+ 0x19AC, // 19AC..19AF; UNKNOWN
+ 0x19B0, // 19B0..19C9; NEW_TAI_LUE
+ 0x19CA, // 19CA..19CF; UNKNOWN
+ 0x19D0, // 19D0..19DA; NEW_TAI_LUE
+ 0x19DB, // 19DB..19DD; UNKNOWN
+ 0x19DE, // 19DE..19DF; NEW_TAI_LUE
0x19E0, // 19E0..19FF; KHMER
- 0x1A00, // 1A00..1A1F; BUGINESE
- 0x1A20, // 1A20..1AFF; TAI_THAM
- 0x1B00, // 1B00..1B7F; BALINESE
+ 0x1A00, // 1A00..1A1B; BUGINESE
+ 0x1A1C, // 1A1C..1A1D; UNKNOWN
+ 0x1A1E, // 1A1E..1A1F; BUGINESE
+ 0x1A20, // 1A20..1A5E; TAI_THAM
+ 0x1A5F, // 1A5F ; UNKNOWN
+ 0x1A60, // 1A60..1A7C; TAI_THAM
+ 0x1A7D, // 1A7D..1A7E; UNKNOWN
+ 0x1A7F, // 1A7F..1A89; TAI_THAM
+ 0x1A8A, // 1A8A..1A8F; UNKNOWN
+ 0x1A90, // 1A90..1A99; TAI_THAM
+ 0x1A9A, // 1A9A..1A9F; UNKNOWN
+ 0x1AA0, // 1AA0..1AAD; TAI_THAM
+ 0x1AAE, // 1AAE..1AAF; UNKNOWN
+ 0x1AB0, // 1AB0..1ABE; INHERITED
+ 0x1ABF, // 1ABF..1AFF; UNKNOWN
+ 0x1B00, // 1B00..1B4B; BALINESE
+ 0x1B4C, // 1B4C..1B4F; UNKNOWN
+ 0x1B50, // 1B50..1B7C; BALINESE
+ 0x1B7D, // 1B7D..1B7F; UNKNOWN
0x1B80, // 1B80..1BBF; SUNDANESE
- 0x1BC0, // 1BC0..1BFF; BATAK
- 0x1C00, // 1C00..1C4F; LEPCHA
- 0x1C50, // 1C50..1CBF; OL_CHIKI
- 0x1CC0, // 1CC0..1CCF; SUNDANESE
+ 0x1BC0, // 1BC0..1BF3; BATAK
+ 0x1BF4, // 1BF4..1BFB; UNKNOWN
+ 0x1BFC, // 1BFC..1BFF; BATAK
+ 0x1C00, // 1C00..1C37; LEPCHA
+ 0x1C38, // 1C38..1C3A; UNKNOWN
+ 0x1C3B, // 1C3B..1C49; LEPCHA
+ 0x1C4A, // 1C4A..1C4C; UNKNOWN
+ 0x1C4D, // 1C4D..1C4F; LEPCHA
+ 0x1C50, // 1C50..1C7F; OL_CHIKI
+ 0x1C80, // 1C80..1CBF; UNKNOWN
+ 0x1CC0, // 1CC0..1CC7; SUNDANESE
+ 0x1CC8, // 1CC8..1CCF; UNKNOWN
0x1CD0, // 1CD0..1CD2; INHERITED
- 0x1CD3, // 1CD3..1CD3; COMMON
+ 0x1CD3, // 1CD3 ; COMMON
0x1CD4, // 1CD4..1CE0; INHERITED
- 0x1CE1, // 1CE1..1CE1; COMMON
+ 0x1CE1, // 1CE1 ; COMMON
0x1CE2, // 1CE2..1CE8; INHERITED
0x1CE9, // 1CE9..1CEC; COMMON
- 0x1CED, // 1CED..1CED; INHERITED
+ 0x1CED, // 1CED ; INHERITED
0x1CEE, // 1CEE..1CF3; COMMON
- 0x1CF4, // 1CF4..1CF4; INHERITED
- 0x1CF5, // 1CF5..1CFF; COMMON
+ 0x1CF4, // 1CF4 ; INHERITED
+ 0x1CF5, // 1CF5..1CF6; COMMON
+ 0x1CF7, // 1CF7 ; UNKNOWN
+ 0x1CF8, // 1CF8..1CF9; INHERITED
+ 0x1CFA, // 1CFA..1CFF; UNKNOWN
0x1D00, // 1D00..1D25; LATIN
0x1D26, // 1D26..1D2A; GREEK
- 0x1D2B, // 1D2B..1D2B; CYRILLIC
+ 0x1D2B, // 1D2B ; CYRILLIC
0x1D2C, // 1D2C..1D5C; LATIN
0x1D5D, // 1D5D..1D61; GREEK
0x1D62, // 1D62..1D65; LATIN
0x1D66, // 1D66..1D6A; GREEK
0x1D6B, // 1D6B..1D77; LATIN
- 0x1D78, // 1D78..1D78; CYRILLIC
+ 0x1D78, // 1D78 ; CYRILLIC
0x1D79, // 1D79..1DBE; LATIN
- 0x1DBF, // 1DBF..1DBF; GREEK
- 0x1DC0, // 1DC0..1DFF; INHERITED
+ 0x1DBF, // 1DBF ; GREEK
+ 0x1DC0, // 1DC0..1DF5; INHERITED
+ 0x1DF6, // 1DF6..1DFB; UNKNOWN
+ 0x1DFC, // 1DFC..1DFF; INHERITED
0x1E00, // 1E00..1EFF; LATIN
- 0x1F00, // 1F00..1FFF; GREEK
+ 0x1F00, // 1F00..1F15; GREEK
+ 0x1F16, // 1F16..1F17; UNKNOWN
+ 0x1F18, // 1F18..1F1D; GREEK
+ 0x1F1E, // 1F1E..1F1F; UNKNOWN
+ 0x1F20, // 1F20..1F45; GREEK
+ 0x1F46, // 1F46..1F47; UNKNOWN
+ 0x1F48, // 1F48..1F4D; GREEK
+ 0x1F4E, // 1F4E..1F4F; UNKNOWN
+ 0x1F50, // 1F50..1F57; GREEK
+ 0x1F58, // 1F58 ; UNKNOWN
+ 0x1F59, // 1F59 ; GREEK
+ 0x1F5A, // 1F5A ; UNKNOWN
+ 0x1F5B, // 1F5B ; GREEK
+ 0x1F5C, // 1F5C ; UNKNOWN
+ 0x1F5D, // 1F5D ; GREEK
+ 0x1F5E, // 1F5E ; UNKNOWN
+ 0x1F5F, // 1F5F..1F7D; GREEK
+ 0x1F7E, // 1F7E..1F7F; UNKNOWN
+ 0x1F80, // 1F80..1FB4; GREEK
+ 0x1FB5, // 1FB5 ; UNKNOWN
+ 0x1FB6, // 1FB6..1FC4; GREEK
+ 0x1FC5, // 1FC5 ; UNKNOWN
+ 0x1FC6, // 1FC6..1FD3; GREEK
+ 0x1FD4, // 1FD4..1FD5; UNKNOWN
+ 0x1FD6, // 1FD6..1FDB; GREEK
+ 0x1FDC, // 1FDC ; UNKNOWN
+ 0x1FDD, // 1FDD..1FEF; GREEK
+ 0x1FF0, // 1FF0..1FF1; UNKNOWN
+ 0x1FF2, // 1FF2..1FF4; GREEK
+ 0x1FF5, // 1FF5 ; UNKNOWN
+ 0x1FF6, // 1FF6..1FFE; GREEK
+ 0x1FFF, // 1FFF ; UNKNOWN
0x2000, // 2000..200B; COMMON
0x200C, // 200C..200D; INHERITED
- 0x200E, // 200E..2070; COMMON
- 0x2071, // 2071..2073; LATIN
+ 0x200E, // 200E..2064; COMMON
+ 0x2065, // 2065 ; UNKNOWN
+ 0x2066, // 2066..2070; COMMON
+ 0x2071, // 2071 ; LATIN
+ 0x2072, // 2072..2073; UNKNOWN
0x2074, // 2074..207E; COMMON
- 0x207F, // 207F..207F; LATIN
- 0x2080, // 2080..208F; COMMON
- 0x2090, // 2090..209F; LATIN
- 0x20A0, // 20A0..20CF; COMMON
- 0x20D0, // 20D0..20FF; INHERITED
+ 0x207F, // 207F ; LATIN
+ 0x2080, // 2080..208E; COMMON
+ 0x208F, // 208F ; UNKNOWN
+ 0x2090, // 2090..209C; LATIN
+ 0x209D, // 209D..209F; UNKNOWN
+ 0x20A0, // 20A0..20BD; COMMON
+ 0x20BE, // 20BE..20CF; UNKNOWN
+ 0x20D0, // 20D0..20F0; INHERITED
+ 0x20F1, // 20F1..20FF; UNKNOWN
0x2100, // 2100..2125; COMMON
- 0x2126, // 2126..2126; GREEK
+ 0x2126, // 2126 ; GREEK
0x2127, // 2127..2129; COMMON
0x212A, // 212A..212B; LATIN
0x212C, // 212C..2131; COMMON
- 0x2132, // 2132..2132; LATIN
+ 0x2132, // 2132 ; LATIN
0x2133, // 2133..214D; COMMON
- 0x214E, // 214E..214E; LATIN
+ 0x214E, // 214E ; LATIN
0x214F, // 214F..215F; COMMON
0x2160, // 2160..2188; LATIN
- 0x2189, // 2189..27FF; COMMON
+ 0x2189, // 2189 ; COMMON
+ 0x218A, // 218A..218F; UNKNOWN
+ 0x2190, // 2190..23FA; COMMON
+ 0x23FB, // 23FB..23FF; UNKNOWN
+ 0x2400, // 2400..2426; COMMON
+ 0x2427, // 2427..243F; UNKNOWN
+ 0x2440, // 2440..244A; COMMON
+ 0x244B, // 244B..245F; UNKNOWN
+ 0x2460, // 2460..27FF; COMMON
0x2800, // 2800..28FF; BRAILLE
- 0x2900, // 2900..2BFF; COMMON
- 0x2C00, // 2C00..2C5F; GLAGOLITIC
+ 0x2900, // 2900..2B73; COMMON
+ 0x2B74, // 2B74..2B75; UNKNOWN
+ 0x2B76, // 2B76..2B95; COMMON
+ 0x2B96, // 2B96..2B97; UNKNOWN
+ 0x2B98, // 2B98..2BB9; COMMON
+ 0x2BBA, // 2BBA..2BBC; UNKNOWN
+ 0x2BBD, // 2BBD..2BC8; COMMON
+ 0x2BC9, // 2BC9 ; UNKNOWN
+ 0x2BCA, // 2BCA..2BD1; COMMON
+ 0x2BD2, // 2BD2..2BFF; UNKNOWN
+ 0x2C00, // 2C00..2C2E; GLAGOLITIC
+ 0x2C2F, // 2C2F ; UNKNOWN
+ 0x2C30, // 2C30..2C5E; GLAGOLITIC
+ 0x2C5F, // 2C5F ; UNKNOWN
0x2C60, // 2C60..2C7F; LATIN
- 0x2C80, // 2C80..2CFF; COPTIC
- 0x2D00, // 2D00..2D2F; GEORGIAN
- 0x2D30, // 2D30..2D7F; TIFINAGH
- 0x2D80, // 2D80..2DDF; ETHIOPIC
+ 0x2C80, // 2C80..2CF3; COPTIC
+ 0x2CF4, // 2CF4..2CF8; UNKNOWN
+ 0x2CF9, // 2CF9..2CFF; COPTIC
+ 0x2D00, // 2D00..2D25; GEORGIAN
+ 0x2D26, // 2D26 ; UNKNOWN
+ 0x2D27, // 2D27 ; GEORGIAN
+ 0x2D28, // 2D28..2D2C; UNKNOWN
+ 0x2D2D, // 2D2D ; GEORGIAN
+ 0x2D2E, // 2D2E..2D2F; UNKNOWN
+ 0x2D30, // 2D30..2D67; TIFINAGH
+ 0x2D68, // 2D68..2D6E; UNKNOWN
+ 0x2D6F, // 2D6F..2D70; TIFINAGH
+ 0x2D71, // 2D71..2D7E; UNKNOWN
+ 0x2D7F, // 2D7F ; TIFINAGH
+ 0x2D80, // 2D80..2D96; ETHIOPIC
+ 0x2D97, // 2D97..2D9F; UNKNOWN
+ 0x2DA0, // 2DA0..2DA6; ETHIOPIC
+ 0x2DA7, // 2DA7 ; UNKNOWN
+ 0x2DA8, // 2DA8..2DAE; ETHIOPIC
+ 0x2DAF, // 2DAF ; UNKNOWN
+ 0x2DB0, // 2DB0..2DB6; ETHIOPIC
+ 0x2DB7, // 2DB7 ; UNKNOWN
+ 0x2DB8, // 2DB8..2DBE; ETHIOPIC
+ 0x2DBF, // 2DBF ; UNKNOWN
+ 0x2DC0, // 2DC0..2DC6; ETHIOPIC
+ 0x2DC7, // 2DC7 ; UNKNOWN
+ 0x2DC8, // 2DC8..2DCE; ETHIOPIC
+ 0x2DCF, // 2DCF ; UNKNOWN
+ 0x2DD0, // 2DD0..2DD6; ETHIOPIC
+ 0x2DD7, // 2DD7 ; UNKNOWN
+ 0x2DD8, // 2DD8..2DDE; ETHIOPIC
+ 0x2DDF, // 2DDF ; UNKNOWN
0x2DE0, // 2DE0..2DFF; CYRILLIC
- 0x2E00, // 2E00..2E7F; COMMON
- 0x2E80, // 2E80..2FEF; HAN
- 0x2FF0, // 2FF0..3004; COMMON
- 0x3005, // 3005..3005; HAN
- 0x3006, // 3006..3006; COMMON
- 0x3007, // 3007..3007; HAN
+ 0x2E00, // 2E00..2E42; COMMON
+ 0x2E43, // 2E43..2E7F; UNKNOWN
+ 0x2E80, // 2E80..2E99; HAN
+ 0x2E9A, // 2E9A ; UNKNOWN
+ 0x2E9B, // 2E9B..2EF3; HAN
+ 0x2EF4, // 2EF4..2EFF; UNKNOWN
+ 0x2F00, // 2F00..2FD5; HAN
+ 0x2FD6, // 2FD6..2FEF; UNKNOWN
+ 0x2FF0, // 2FF0..2FFB; COMMON
+ 0x2FFC, // 2FFC..2FFF; UNKNOWN
+ 0x3000, // 3000..3004; COMMON
+ 0x3005, // 3005 ; HAN
+ 0x3006, // 3006 ; COMMON
+ 0x3007, // 3007 ; HAN
0x3008, // 3008..3020; COMMON
0x3021, // 3021..3029; HAN
0x302A, // 302A..302D; INHERITED
0x302E, // 302E..302F; HANGUL
0x3030, // 3030..3037; COMMON
0x3038, // 3038..303B; HAN
- 0x303C, // 303C..3040; COMMON
- 0x3041, // 3041..3098; HIRAGANA
+ 0x303C, // 303C..303F; COMMON
+ 0x3040, // 3040 ; UNKNOWN
+ 0x3041, // 3041..3096; HIRAGANA
+ 0x3097, // 3097..3098; UNKNOWN
0x3099, // 3099..309A; INHERITED
0x309B, // 309B..309C; COMMON
0x309D, // 309D..309F; HIRAGANA
- 0x30A0, // 30A0..30A0; COMMON
+ 0x30A0, // 30A0 ; COMMON
0x30A1, // 30A1..30FA; KATAKANA
0x30FB, // 30FB..30FC; COMMON
- 0x30FD, // 30FD..3104; KATAKANA
- 0x3105, // 3105..3130; BOPOMOFO
- 0x3131, // 3131..318F; HANGUL
+ 0x30FD, // 30FD..30FF; KATAKANA
+ 0x3100, // 3100..3104; UNKNOWN
+ 0x3105, // 3105..312D; BOPOMOFO
+ 0x312E, // 312E..3130; UNKNOWN
+ 0x3131, // 3131..318E; HANGUL
+ 0x318F, // 318F ; UNKNOWN
0x3190, // 3190..319F; COMMON
- 0x31A0, // 31A0..31BF; BOPOMOFO
- 0x31C0, // 31C0..31EF; COMMON
+ 0x31A0, // 31A0..31BA; BOPOMOFO
+ 0x31BB, // 31BB..31BF; UNKNOWN
+ 0x31C0, // 31C0..31E3; COMMON
+ 0x31E4, // 31E4..31EF; UNKNOWN
0x31F0, // 31F0..31FF; KATAKANA
- 0x3200, // 3200..321F; HANGUL
+ 0x3200, // 3200..321E; HANGUL
+ 0x321F, // 321F ; UNKNOWN
0x3220, // 3220..325F; COMMON
0x3260, // 3260..327E; HANGUL
0x327F, // 327F..32CF; COMMON
- 0x32D0, // 32D0..3357; KATAKANA
+ 0x32D0, // 32D0..32FE; KATAKANA
+ 0x32FF, // 32FF ; UNKNOWN
+ 0x3300, // 3300..3357; KATAKANA
0x3358, // 3358..33FF; COMMON
- 0x3400, // 3400..4DBF; HAN
+ 0x3400, // 3400..4DB5; HAN
+ 0x4DB6, // 4DB6..4DBF; UNKNOWN
0x4DC0, // 4DC0..4DFF; COMMON
- 0x4E00, // 4E00..9FFF; HAN
- 0xA000, // A000..A4CF; YI
+ 0x4E00, // 4E00..9FCC; HAN
+ 0x9FCD, // 9FCD..9FFF; UNKNOWN
+ 0xA000, // A000..A48C; YI
+ 0xA48D, // A48D..A48F; UNKNOWN
+ 0xA490, // A490..A4C6; YI
+ 0xA4C7, // A4C7..A4CF; UNKNOWN
0xA4D0, // A4D0..A4FF; LISU
- 0xA500, // A500..A63F; VAI
- 0xA640, // A640..A69F; CYRILLIC
- 0xA6A0, // A6A0..A6FF; BAMUM
+ 0xA500, // A500..A62B; VAI
+ 0xA62C, // A62C..A63F; UNKNOWN
+ 0xA640, // A640..A69D; CYRILLIC
+ 0xA69E, // A69E ; UNKNOWN
+ 0xA69F, // A69F ; CYRILLIC
+ 0xA6A0, // A6A0..A6F7; BAMUM
+ 0xA6F8, // A6F8..A6FF; UNKNOWN
0xA700, // A700..A721; COMMON
0xA722, // A722..A787; LATIN
0xA788, // A788..A78A; COMMON
- 0xA78B, // A78B..A7FF; LATIN
- 0xA800, // A800..A82F; SYLOTI_NAGRI
- 0xA830, // A830..A83F; COMMON
- 0xA840, // A840..A87F; PHAGS_PA
- 0xA880, // A880..A8DF; SAURASHTRA
- 0xA8E0, // A8E0..A8FF; DEVANAGARI
- 0xA900, // A900..A92F; KAYAH_LI
- 0xA930, // A930..A95F; REJANG
- 0xA960, // A960..A97F; HANGUL
- 0xA980, // A980..A9FF; JAVANESE
- 0xAA00, // AA00..AA5F; CHAM
+ 0xA78B, // A78B..A78E; LATIN
+ 0xA78F, // A78F ; UNKNOWN
+ 0xA790, // A790..A7AD; LATIN
+ 0xA7AE, // A7AE..A7AF; UNKNOWN
+ 0xA7B0, // A7B0..A7B1; LATIN
+ 0xA7B2, // A7B2..A7F6; UNKNOWN
+ 0xA7F7, // A7F7..A7FF; LATIN
+ 0xA800, // A800..A82B; SYLOTI_NAGRI
+ 0xA82C, // A82C..A82F; UNKNOWN
+ 0xA830, // A830..A839; COMMON
+ 0xA83A, // A83A..A83F; UNKNOWN
+ 0xA840, // A840..A877; PHAGS_PA
+ 0xA878, // A878..A87F; UNKNOWN
+ 0xA880, // A880..A8C4; SAURASHTRA
+ 0xA8C5, // A8C5..A8CD; UNKNOWN
+ 0xA8CE, // A8CE..A8D9; SAURASHTRA
+ 0xA8DA, // A8DA..A8DF; UNKNOWN
+ 0xA8E0, // A8E0..A8FB; DEVANAGARI
+ 0xA8FC, // A8FC..A8FF; UNKNOWN
+ 0xA900, // A900..A92D; KAYAH_LI
+ 0xA92E, // A92E ; COMMON
+ 0xA92F, // A92F ; KAYAH_LI
+ 0xA930, // A930..A953; REJANG
+ 0xA954, // A954..A95E; UNKNOWN
+ 0xA95F, // A95F ; REJANG
+ 0xA960, // A960..A97C; HANGUL
+ 0xA97D, // A97D..A97F; UNKNOWN
+ 0xA980, // A980..A9CD; JAVANESE
+ 0xA9CE, // A9CE ; UNKNOWN
+ 0xA9CF, // A9CF ; COMMON
+ 0xA9D0, // A9D0..A9D9; JAVANESE
+ 0xA9DA, // A9DA..A9DD; UNKNOWN
+ 0xA9DE, // A9DE..A9DF; JAVANESE
+ 0xA9E0, // A9E0..A9FE; MYANMAR
+ 0xA9FF, // A9FF ; UNKNOWN
+ 0xAA00, // AA00..AA36; CHAM
+ 0xAA37, // AA37..AA3F; UNKNOWN
+ 0xAA40, // AA40..AA4D; CHAM
+ 0xAA4E, // AA4E..AA4F; UNKNOWN
+ 0xAA50, // AA50..AA59; CHAM
+ 0xAA5A, // AA5A..AA5B; UNKNOWN
+ 0xAA5C, // AA5C..AA5F; CHAM
0xAA60, // AA60..AA7F; MYANMAR
- 0xAA80, // AA80..AADF; TAI_VIET
- 0xAAE0, // AAE0..AB00; MEETEI_MAYEK
- 0xAB01, // AB01..ABBF; ETHIOPIC
- 0xABC0, // ABC0..ABFF; MEETEI_MAYEK
- 0xAC00, // AC00..D7FB; HANGUL
+ 0xAA80, // AA80..AAC2; TAI_VIET
+ 0xAAC3, // AAC3..AADA; UNKNOWN
+ 0xAADB, // AADB..AADF; TAI_VIET
+ 0xAAE0, // AAE0..AAF6; MEETEI_MAYEK
+ 0xAAF7, // AAF7..AB00; UNKNOWN
+ 0xAB01, // AB01..AB06; ETHIOPIC
+ 0xAB07, // AB07..AB08; UNKNOWN
+ 0xAB09, // AB09..AB0E; ETHIOPIC
+ 0xAB0F, // AB0F..AB10; UNKNOWN
+ 0xAB11, // AB11..AB16; ETHIOPIC
+ 0xAB17, // AB17..AB1F; UNKNOWN
+ 0xAB20, // AB20..AB26; ETHIOPIC
+ 0xAB27, // AB27 ; UNKNOWN
+ 0xAB28, // AB28..AB2E; ETHIOPIC
+ 0xAB2F, // AB2F ; UNKNOWN
+ 0xAB30, // AB30..AB5A; LATIN
+ 0xAB5B, // AB5B ; COMMON
+ 0xAB5C, // AB5C..AB5F; LATIN
+ 0xAB60, // AB60..AB63; UNKNOWN
+ 0xAB64, // AB64 ; LATIN
+ 0xAB65, // AB65 ; GREEK
+ 0xAB66, // AB66..ABBF; UNKNOWN
+ 0xABC0, // ABC0..ABED; MEETEI_MAYEK
+ 0xABEE, // ABEE..ABEF; UNKNOWN
+ 0xABF0, // ABF0..ABF9; MEETEI_MAYEK
+ 0xABFA, // ABFA..ABFF; UNKNOWN
+ 0xAC00, // AC00..D7A3; HANGUL
+ 0xD7A4, // D7A4..D7AF; UNKNOWN
+ 0xD7B0, // D7B0..D7C6; HANGUL
+ 0xD7C7, // D7C7..D7CA; UNKNOWN
+ 0xD7CB, // D7CB..D7FB; HANGUL
0xD7FC, // D7FC..F8FF; UNKNOWN
- 0xF900, // F900..FAFF; HAN
- 0xFB00, // FB00..FB12; LATIN
- 0xFB13, // FB13..FB1C; ARMENIAN
- 0xFB1D, // FB1D..FB4F; HEBREW
- 0xFB50, // FB50..FD3D; ARABIC
- 0xFD3E, // FD3E..FD4F; COMMON
- 0xFD50, // FD50..FDFC; ARABIC
- 0xFDFD, // FDFD..FDFF; COMMON
+ 0xF900, // F900..FA6D; HAN
+ 0xFA6E, // FA6E..FA6F; UNKNOWN
+ 0xFA70, // FA70..FAD9; HAN
+ 0xFADA, // FADA..FAFF; UNKNOWN
+ 0xFB00, // FB00..FB06; LATIN
+ 0xFB07, // FB07..FB12; UNKNOWN
+ 0xFB13, // FB13..FB17; ARMENIAN
+ 0xFB18, // FB18..FB1C; UNKNOWN
+ 0xFB1D, // FB1D..FB36; HEBREW
+ 0xFB37, // FB37 ; UNKNOWN
+ 0xFB38, // FB38..FB3C; HEBREW
+ 0xFB3D, // FB3D ; UNKNOWN
+ 0xFB3E, // FB3E ; HEBREW
+ 0xFB3F, // FB3F ; UNKNOWN
+ 0xFB40, // FB40..FB41; HEBREW
+ 0xFB42, // FB42 ; UNKNOWN
+ 0xFB43, // FB43..FB44; HEBREW
+ 0xFB45, // FB45 ; UNKNOWN
+ 0xFB46, // FB46..FB4F; HEBREW
+ 0xFB50, // FB50..FBC1; ARABIC
+ 0xFBC2, // FBC2..FBD2; UNKNOWN
+ 0xFBD3, // FBD3..FD3D; ARABIC
+ 0xFD3E, // FD3E..FD3F; COMMON
+ 0xFD40, // FD40..FD4F; UNKNOWN
+ 0xFD50, // FD50..FD8F; ARABIC
+ 0xFD90, // FD90..FD91; UNKNOWN
+ 0xFD92, // FD92..FDC7; ARABIC
+ 0xFDC8, // FDC8..FDEF; UNKNOWN
+ 0xFDF0, // FDF0..FDFD; ARABIC
+ 0xFDFE, // FDFE..FDFF; UNKNOWN
0xFE00, // FE00..FE0F; INHERITED
- 0xFE10, // FE10..FE1F; COMMON
- 0xFE20, // FE20..FE2F; INHERITED
- 0xFE30, // FE30..FE6F; COMMON
- 0xFE70, // FE70..FEFE; ARABIC
- 0xFEFF, // FEFF..FF20; COMMON
+ 0xFE10, // FE10..FE19; COMMON
+ 0xFE1A, // FE1A..FE1F; UNKNOWN
+ 0xFE20, // FE20..FE2D; INHERITED
+ 0xFE2E, // FE2E..FE2F; UNKNOWN
+ 0xFE30, // FE30..FE52; COMMON
+ 0xFE53, // FE53 ; UNKNOWN
+ 0xFE54, // FE54..FE66; COMMON
+ 0xFE67, // FE67 ; UNKNOWN
+ 0xFE68, // FE68..FE6B; COMMON
+ 0xFE6C, // FE6C..FE6F; UNKNOWN
+ 0xFE70, // FE70..FE74; ARABIC
+ 0xFE75, // FE75 ; UNKNOWN
+ 0xFE76, // FE76..FEFC; ARABIC
+ 0xFEFD, // FEFD..FEFE; UNKNOWN
+ 0xFEFF, // FEFF ; COMMON
+ 0xFF00, // FF00 ; UNKNOWN
+ 0xFF01, // FF01..FF20; COMMON
0xFF21, // FF21..FF3A; LATIN
0xFF3B, // FF3B..FF40; COMMON
0xFF41, // FF41..FF5A; LATIN
0xFF5B, // FF5B..FF65; COMMON
0xFF66, // FF66..FF6F; KATAKANA
- 0xFF70, // FF70..FF70; COMMON
+ 0xFF70, // FF70 ; COMMON
0xFF71, // FF71..FF9D; KATAKANA
0xFF9E, // FF9E..FF9F; COMMON
- 0xFFA0, // FFA0..FFDF; HANGUL
- 0xFFE0, // FFE0..FFFF; COMMON
- 0x10000, // 10000..100FF; LINEAR_B
- 0x10100, // 10100..1013F; COMMON
- 0x10140, // 10140..1018F; GREEK
- 0x10190, // 10190..101FC; COMMON
- 0x101FD, // 101FD..1027F; INHERITED
- 0x10280, // 10280..1029F; LYCIAN
- 0x102A0, // 102A0..102FF; CARIAN
- 0x10300, // 10300..1032F; OLD_ITALIC
- 0x10330, // 10330..1037F; GOTHIC
- 0x10380, // 10380..1039F; UGARITIC
- 0x103A0, // 103A0..103FF; OLD_PERSIAN
+ 0xFFA0, // FFA0..FFBE; HANGUL
+ 0xFFBF, // FFBF..FFC1; UNKNOWN
+ 0xFFC2, // FFC2..FFC7; HANGUL
+ 0xFFC8, // FFC8..FFC9; UNKNOWN
+ 0xFFCA, // FFCA..FFCF; HANGUL
+ 0xFFD0, // FFD0..FFD1; UNKNOWN
+ 0xFFD2, // FFD2..FFD7; HANGUL
+ 0xFFD8, // FFD8..FFD9; UNKNOWN
+ 0xFFDA, // FFDA..FFDC; HANGUL
+ 0xFFDD, // FFDD..FFDF; UNKNOWN
+ 0xFFE0, // FFE0..FFE6; COMMON
+ 0xFFE7, // FFE7 ; UNKNOWN
+ 0xFFE8, // FFE8..FFEE; COMMON
+ 0xFFEF, // FFEF..FFF8; UNKNOWN
+ 0xFFF9, // FFF9..FFFD; COMMON
+ 0xFFFE, // FFFE..FFFF; UNKNOWN
+ 0x10000, // 10000..1000B; LINEAR_B
+ 0x1000C, // 1000C ; UNKNOWN
+ 0x1000D, // 1000D..10026; LINEAR_B
+ 0x10027, // 10027 ; UNKNOWN
+ 0x10028, // 10028..1003A; LINEAR_B
+ 0x1003B, // 1003B ; UNKNOWN
+ 0x1003C, // 1003C..1003D; LINEAR_B
+ 0x1003E, // 1003E ; UNKNOWN
+ 0x1003F, // 1003F..1004D; LINEAR_B
+ 0x1004E, // 1004E..1004F; UNKNOWN
+ 0x10050, // 10050..1005D; LINEAR_B
+ 0x1005E, // 1005E..1007F; UNKNOWN
+ 0x10080, // 10080..100FA; LINEAR_B
+ 0x100FB, // 100FB..100FF; UNKNOWN
+ 0x10100, // 10100..10102; COMMON
+ 0x10103, // 10103..10106; UNKNOWN
+ 0x10107, // 10107..10133; COMMON
+ 0x10134, // 10134..10136; UNKNOWN
+ 0x10137, // 10137..1013F; COMMON
+ 0x10140, // 10140..1018C; GREEK
+ 0x1018D, // 1018D..1018F; UNKNOWN
+ 0x10190, // 10190..1019B; COMMON
+ 0x1019C, // 1019C..1019F; UNKNOWN
+ 0x101A0, // 101A0 ; GREEK
+ 0x101A1, // 101A1..101CF; UNKNOWN
+ 0x101D0, // 101D0..101FC; COMMON
+ 0x101FD, // 101FD ; INHERITED
+ 0x101FE, // 101FE..1027F; UNKNOWN
+ 0x10280, // 10280..1029C; LYCIAN
+ 0x1029D, // 1029D..1029F; UNKNOWN
+ 0x102A0, // 102A0..102D0; CARIAN
+ 0x102D1, // 102D1..102DF; UNKNOWN
+ 0x102E0, // 102E0 ; INHERITED
+ 0x102E1, // 102E1..102FB; COMMON
+ 0x102FC, // 102FC..102FF; UNKNOWN
+ 0x10300, // 10300..10323; OLD_ITALIC
+ 0x10324, // 10324..1032F; UNKNOWN
+ 0x10330, // 10330..1034A; GOTHIC
+ 0x1034B, // 1034B..1034F; UNKNOWN
+ 0x10350, // 10350..1037A; OLD_PERMIC
+ 0x1037B, // 1037B..1037F; UNKNOWN
+ 0x10380, // 10380..1039D; UGARITIC
+ 0x1039E, // 1039E ; UNKNOWN
+ 0x1039F, // 1039F ; UGARITIC
+ 0x103A0, // 103A0..103C3; OLD_PERSIAN
+ 0x103C4, // 103C4..103C7; UNKNOWN
+ 0x103C8, // 103C8..103D5; OLD_PERSIAN
+ 0x103D6, // 103D6..103FF; UNKNOWN
0x10400, // 10400..1044F; DESERET
0x10450, // 10450..1047F; SHAVIAN
- 0x10480, // 10480..107FF; OSMANYA
- 0x10800, // 10800..1083F; CYPRIOT
- 0x10840, // 10840..108FF; IMPERIAL_ARAMAIC
- 0x10900, // 10900..1091F; PHOENICIAN
- 0x10920, // 10920..1097F; LYDIAN
+ 0x10480, // 10480..1049D; OSMANYA
+ 0x1049E, // 1049E..1049F; UNKNOWN
+ 0x104A0, // 104A0..104A9; OSMANYA
+ 0x104AA, // 104AA..104FF; UNKNOWN
+ 0x10500, // 10500..10527; ELBASAN
+ 0x10528, // 10528..1052F; UNKNOWN
+ 0x10530, // 10530..10563; CAUCASIAN_ALBANIAN
+ 0x10564, // 10564..1056E; UNKNOWN
+ 0x1056F, // 1056F ; CAUCASIAN_ALBANIAN
+ 0x10570, // 10570..105FF; UNKNOWN
+ 0x10600, // 10600..10736; LINEAR_A
+ 0x10737, // 10737..1073F; UNKNOWN
+ 0x10740, // 10740..10755; LINEAR_A
+ 0x10756, // 10756..1075F; UNKNOWN
+ 0x10760, // 10760..10767; LINEAR_A
+ 0x10768, // 10768..107FF; UNKNOWN
+ 0x10800, // 10800..10805; CYPRIOT
+ 0x10806, // 10806..10807; UNKNOWN
+ 0x10808, // 10808 ; CYPRIOT
+ 0x10809, // 10809 ; UNKNOWN
+ 0x1080A, // 1080A..10835; CYPRIOT
+ 0x10836, // 10836 ; UNKNOWN
+ 0x10837, // 10837..10838; CYPRIOT
+ 0x10839, // 10839..1083B; UNKNOWN
+ 0x1083C, // 1083C ; CYPRIOT
+ 0x1083D, // 1083D..1083E; UNKNOWN
+ 0x1083F, // 1083F ; CYPRIOT
+ 0x10840, // 10840..10855; IMPERIAL_ARAMAIC
+ 0x10856, // 10856 ; UNKNOWN
+ 0x10857, // 10857..1085F; IMPERIAL_ARAMAIC
+ 0x10860, // 10860..1087F; PALMYRENE
+ 0x10880, // 10880..1089E; NABATAEAN
+ 0x1089F, // 1089F..108A6; UNKNOWN
+ 0x108A7, // 108A7..108AF; NABATAEAN
+ 0x108B0, // 108B0..108FF; UNKNOWN
+ 0x10900, // 10900..1091B; PHOENICIAN
+ 0x1091C, // 1091C..1091E; UNKNOWN
+ 0x1091F, // 1091F ; PHOENICIAN
+ 0x10920, // 10920..10939; LYDIAN
+ 0x1093A, // 1093A..1093E; UNKNOWN
+ 0x1093F, // 1093F ; LYDIAN
+ 0x10940, // 10940..1097F; UNKNOWN
0x10980, // 10980..1099F; MEROITIC_HIEROGLYPHS
- 0x109A0, // 109A0..109FF; MEROITIC_CURSIVE
- 0x10A00, // 10A00..10A5F; KHAROSHTHI
- 0x10A60, // 10A60..10AFF; OLD_SOUTH_ARABIAN
- 0x10B00, // 10B00..10B3F; AVESTAN
- 0x10B40, // 10B40..10B5F; INSCRIPTIONAL_PARTHIAN
- 0x10B60, // 10B60..10BFF; INSCRIPTIONAL_PAHLAVI
- 0x10C00, // 10C00..10E5F; OLD_TURKIC
- 0x10E60, // 10E60..10FFF; ARABIC
- 0x11000, // 11000..1107F; BRAHMI
- 0x11080, // 11080..110CF; KAITHI
- 0x110D0, // 110D0..110FF; SORA_SOMPENG
- 0x11100, // 11100..1117F; CHAKMA
- 0x11180, // 11180..1167F; SHARADA
- 0x11680, // 11680..116CF; TAKRI
- 0x12000, // 12000..12FFF; CUNEIFORM
- 0x13000, // 13000..167FF; EGYPTIAN_HIEROGLYPHS
+ 0x109A0, // 109A0..109B7; MEROITIC_CURSIVE
+ 0x109B8, // 109B8..109BD; UNKNOWN
+ 0x109BE, // 109BE..109BF; MEROITIC_CURSIVE
+ 0x109C0, // 109C0..109FF; UNKNOWN
+ 0x10A00, // 10A00..10A03; KHAROSHTHI
+ 0x10A04, // 10A04 ; UNKNOWN
+ 0x10A05, // 10A05..10A06; KHAROSHTHI
+ 0x10A07, // 10A07..10A0B; UNKNOWN
+ 0x10A0C, // 10A0C..10A13; KHAROSHTHI
+ 0x10A14, // 10A14 ; UNKNOWN
+ 0x10A15, // 10A15..10A17; KHAROSHTHI
+ 0x10A18, // 10A18 ; UNKNOWN
+ 0x10A19, // 10A19..10A33; KHAROSHTHI
+ 0x10A34, // 10A34..10A37; UNKNOWN
+ 0x10A38, // 10A38..10A3A; KHAROSHTHI
+ 0x10A3B, // 10A3B..10A3E; UNKNOWN
+ 0x10A3F, // 10A3F..10A47; KHAROSHTHI
+ 0x10A48, // 10A48..10A4F; UNKNOWN
+ 0x10A50, // 10A50..10A58; KHAROSHTHI
+ 0x10A59, // 10A59..10A5F; UNKNOWN
+ 0x10A60, // 10A60..10A7F; OLD_SOUTH_ARABIAN
+ 0x10A80, // 10A80..10A9F; OLD_NORTH_ARABIAN
+ 0x10AA0, // 10AA0..10ABF; UNKNOWN
+ 0x10AC0, // 10AC0..10AE6; MANICHAEAN
+ 0x10AE7, // 10AE7..10AEA; UNKNOWN
+ 0x10AEB, // 10AEB..10AF6; MANICHAEAN
+ 0x10AF7, // 10AF7..10AFF; UNKNOWN
+ 0x10B00, // 10B00..10B35; AVESTAN
+ 0x10B36, // 10B36..10B38; UNKNOWN
+ 0x10B39, // 10B39..10B3F; AVESTAN
+ 0x10B40, // 10B40..10B55; INSCRIPTIONAL_PARTHIAN
+ 0x10B56, // 10B56..10B57; UNKNOWN
+ 0x10B58, // 10B58..10B5F; INSCRIPTIONAL_PARTHIAN
+ 0x10B60, // 10B60..10B72; INSCRIPTIONAL_PAHLAVI
+ 0x10B73, // 10B73..10B77; UNKNOWN
+ 0x10B78, // 10B78..10B7F; INSCRIPTIONAL_PAHLAVI
+ 0x10B80, // 10B80..10B91; PSALTER_PAHLAVI
+ 0x10B92, // 10B92..10B98; UNKNOWN
+ 0x10B99, // 10B99..10B9C; PSALTER_PAHLAVI
+ 0x10B9D, // 10B9D..10BA8; UNKNOWN
+ 0x10BA9, // 10BA9..10BAF; PSALTER_PAHLAVI
+ 0x10BB0, // 10BB0..10BFF; UNKNOWN
+ 0x10C00, // 10C00..10C48; OLD_TURKIC
+ 0x10C49, // 10C49..10E5F; UNKNOWN
+ 0x10E60, // 10E60..10E7E; ARABIC
+ 0x10E7F, // 10E7F..10FFF; UNKNOWN
+ 0x11000, // 11000..1104D; BRAHMI
+ 0x1104E, // 1104E..11051; UNKNOWN
+ 0x11052, // 11052..1106F; BRAHMI
+ 0x11070, // 11070..1107E; UNKNOWN
+ 0x1107F, // 1107F ; BRAHMI
+ 0x11080, // 11080..110C1; KAITHI
+ 0x110C2, // 110C2..110CF; UNKNOWN
+ 0x110D0, // 110D0..110E8; SORA_SOMPENG
+ 0x110E9, // 110E9..110EF; UNKNOWN
+ 0x110F0, // 110F0..110F9; SORA_SOMPENG
+ 0x110FA, // 110FA..110FF; UNKNOWN
+ 0x11100, // 11100..11134; CHAKMA
+ 0x11135, // 11135 ; UNKNOWN
+ 0x11136, // 11136..11143; CHAKMA
+ 0x11144, // 11144..1114F; UNKNOWN
+ 0x11150, // 11150..11176; MAHAJANI
+ 0x11177, // 11177..1117F; UNKNOWN
+ 0x11180, // 11180..111C8; SHARADA
+ 0x111C9, // 111C9..111CC; UNKNOWN
+ 0x111CD, // 111CD ; SHARADA
+ 0x111CE, // 111CE..111CF; UNKNOWN
+ 0x111D0, // 111D0..111DA; SHARADA
+ 0x111DB, // 111DB..111E0; UNKNOWN
+ 0x111E1, // 111E1..111F4; SINHALA
+ 0x111F5, // 111F5..111FF; UNKNOWN
+ 0x11200, // 11200..11211; KHOJKI
+ 0x11212, // 11212 ; UNKNOWN
+ 0x11213, // 11213..1123D; KHOJKI
+ 0x1123E, // 1123E..112AF; UNKNOWN
+ 0x112B0, // 112B0..112EA; KHUDAWADI
+ 0x112EB, // 112EB..112EF; UNKNOWN
+ 0x112F0, // 112F0..112F9; KHUDAWADI
+ 0x112FA, // 112FA..11300; UNKNOWN
+ 0x11301, // 11301..11303; GRANTHA
+ 0x11304, // 11304 ; UNKNOWN
+ 0x11305, // 11305..1130C; GRANTHA
+ 0x1130D, // 1130D..1130E; UNKNOWN
+ 0x1130F, // 1130F..11310; GRANTHA
+ 0x11311, // 11311..11312; UNKNOWN
+ 0x11313, // 11313..11328; GRANTHA
+ 0x11329, // 11329 ; UNKNOWN
+ 0x1132A, // 1132A..11330; GRANTHA
+ 0x11331, // 11331 ; UNKNOWN
+ 0x11332, // 11332..11333; GRANTHA
+ 0x11334, // 11334 ; UNKNOWN
+ 0x11335, // 11335..11339; GRANTHA
+ 0x1133A, // 1133A..1133B; UNKNOWN
+ 0x1133C, // 1133C..11344; GRANTHA
+ 0x11345, // 11345..11346; UNKNOWN
+ 0x11347, // 11347..11348; GRANTHA
+ 0x11349, // 11349..1134A; UNKNOWN
+ 0x1134B, // 1134B..1134D; GRANTHA
+ 0x1134E, // 1134E..11356; UNKNOWN
+ 0x11357, // 11357 ; GRANTHA
+ 0x11358, // 11358..1135C; UNKNOWN
+ 0x1135D, // 1135D..11363; GRANTHA
+ 0x11364, // 11364..11365; UNKNOWN
+ 0x11366, // 11366..1136C; GRANTHA
+ 0x1136D, // 1136D..1136F; UNKNOWN
+ 0x11370, // 11370..11374; GRANTHA
+ 0x11375, // 11375..1147F; UNKNOWN
+ 0x11480, // 11480..114C7; TIRHUTA
+ 0x114C8, // 114C8..114CF; UNKNOWN
+ 0x114D0, // 114D0..114D9; TIRHUTA
+ 0x114DA, // 114DA..1157F; UNKNOWN
+ 0x11580, // 11580..115B5; SIDDHAM
+ 0x115B6, // 115B6..115B7; UNKNOWN
+ 0x115B8, // 115B8..115C9; SIDDHAM
+ 0x115CA, // 115CA..115FF; UNKNOWN
+ 0x11600, // 11600..11644; MODI
+ 0x11645, // 11645..1164F; UNKNOWN
+ 0x11650, // 11650..11659; MODI
+ 0x1165A, // 1165A..1167F; UNKNOWN
+ 0x11680, // 11680..116B7; TAKRI
+ 0x116B8, // 116B8..116BF; UNKNOWN
+ 0x116C0, // 116C0..116C9; TAKRI
+ 0x116CA, // 116CA..1189F; UNKNOWN
+ 0x118A0, // 118A0..118F2; WARANG_CITI
+ 0x118F3, // 118F3..118FE; UNKNOWN
+ 0x118FF, // 118FF ; WARANG_CITI
+ 0x11900, // 11900..11ABF; UNKNOWN
+ 0x11AC0, // 11AC0..11AF8; PAU_CIN_HAU
+ 0x11AF9, // 11AF9..11FFF; UNKNOWN
+ 0x12000, // 12000..12398; CUNEIFORM
+ 0x12399, // 12399..123FF; UNKNOWN
+ 0x12400, // 12400..1246E; CUNEIFORM
+ 0x1246F, // 1246F ; UNKNOWN
+ 0x12470, // 12470..12474; CUNEIFORM
+ 0x12475, // 12475..12FFF; UNKNOWN
+ 0x13000, // 13000..1342E; EGYPTIAN_HIEROGLYPHS
+ 0x1342F, // 1342F..167FF; UNKNOWN
0x16800, // 16800..16A38; BAMUM
- 0x16F00, // 16F00..16F9F; MIAO
- 0x1B000, // 1B000..1B000; KATAKANA
- 0x1B001, // 1B001..1CFFF; HIRAGANA
- 0x1D000, // 1D000..1D166; COMMON
+ 0x16A39, // 16A39..16A3F; UNKNOWN
+ 0x16A40, // 16A40..16A5E; MRO
+ 0x16A5F, // 16A5F ; UNKNOWN
+ 0x16A60, // 16A60..16A69; MRO
+ 0x16A6A, // 16A6A..16A6D; UNKNOWN
+ 0x16A6E, // 16A6E..16A6F; MRO
+ 0x16A70, // 16A70..16ACF; UNKNOWN
+ 0x16AD0, // 16AD0..16AED; BASSA_VAH
+ 0x16AEE, // 16AEE..16AEF; UNKNOWN
+ 0x16AF0, // 16AF0..16AF5; BASSA_VAH
+ 0x16AF6, // 16AF6..16AFF; UNKNOWN
+ 0x16B00, // 16B00..16B45; PAHAWH_HMONG
+ 0x16B46, // 16B46..16B4F; UNKNOWN
+ 0x16B50, // 16B50..16B59; PAHAWH_HMONG
+ 0x16B5A, // 16B5A ; UNKNOWN
+ 0x16B5B, // 16B5B..16B61; PAHAWH_HMONG
+ 0x16B62, // 16B62 ; UNKNOWN
+ 0x16B63, // 16B63..16B77; PAHAWH_HMONG
+ 0x16B78, // 16B78..16B7C; UNKNOWN
+ 0x16B7D, // 16B7D..16B8F; PAHAWH_HMONG
+ 0x16B90, // 16B90..16EFF; UNKNOWN
+ 0x16F00, // 16F00..16F44; MIAO
+ 0x16F45, // 16F45..16F4F; UNKNOWN
+ 0x16F50, // 16F50..16F7E; MIAO
+ 0x16F7F, // 16F7F..16F8E; UNKNOWN
+ 0x16F8F, // 16F8F..16F9F; MIAO
+ 0x16FA0, // 16FA0..1AFFF; UNKNOWN
+ 0x1B000, // 1B000 ; KATAKANA
+ 0x1B001, // 1B001 ; HIRAGANA
+ 0x1B002, // 1B002..1BBFF; UNKNOWN
+ 0x1BC00, // 1BC00..1BC6A; DUPLOYAN
+ 0x1BC6B, // 1BC6B..1BC6F; UNKNOWN
+ 0x1BC70, // 1BC70..1BC7C; DUPLOYAN
+ 0x1BC7D, // 1BC7D..1BC7F; UNKNOWN
+ 0x1BC80, // 1BC80..1BC88; DUPLOYAN
+ 0x1BC89, // 1BC89..1BC8F; UNKNOWN
+ 0x1BC90, // 1BC90..1BC99; DUPLOYAN
+ 0x1BC9A, // 1BC9A..1BC9B; UNKNOWN
+ 0x1BC9C, // 1BC9C..1BC9F; DUPLOYAN
+ 0x1BCA0, // 1BCA0..1BCA3; COMMON
+ 0x1BCA4, // 1BCA4..1CFFF; UNKNOWN
+ 0x1D000, // 1D000..1D0F5; COMMON
+ 0x1D0F6, // 1D0F6..1D0FF; UNKNOWN
+ 0x1D100, // 1D100..1D126; COMMON
+ 0x1D127, // 1D127..1D128; UNKNOWN
+ 0x1D129, // 1D129..1D166; COMMON
0x1D167, // 1D167..1D169; INHERITED
0x1D16A, // 1D16A..1D17A; COMMON
0x1D17B, // 1D17B..1D182; INHERITED
@@ -4022,354 +5420,1635 @@
0x1D185, // 1D185..1D18B; INHERITED
0x1D18C, // 1D18C..1D1A9; COMMON
0x1D1AA, // 1D1AA..1D1AD; INHERITED
- 0x1D1AE, // 1D1AE..1D1FF; COMMON
- 0x1D200, // 1D200..1D2FF; GREEK
- 0x1D300, // 1D300..1EDFF; COMMON
- 0x1EE00, // 1EE00..1EFFF; ARABIC
- 0x1F000, // 1F000..1F1FF; COMMON
- 0x1F200, // 1F200..1F200; HIRAGANA
- 0x1F201, // 1F210..1FFFF; COMMON
- 0x20000, // 20000..E0000; HAN
- 0xE0001, // E0001..E00FF; COMMON
+ 0x1D1AE, // 1D1AE..1D1DD; COMMON
+ 0x1D1DE, // 1D1DE..1D1FF; UNKNOWN
+ 0x1D200, // 1D200..1D245; GREEK
+ 0x1D246, // 1D246..1D2FF; UNKNOWN
+ 0x1D300, // 1D300..1D356; COMMON
+ 0x1D357, // 1D357..1D35F; UNKNOWN
+ 0x1D360, // 1D360..1D371; COMMON
+ 0x1D372, // 1D372..1D3FF; UNKNOWN
+ 0x1D400, // 1D400..1D454; COMMON
+ 0x1D455, // 1D455 ; UNKNOWN
+ 0x1D456, // 1D456..1D49C; COMMON
+ 0x1D49D, // 1D49D ; UNKNOWN
+ 0x1D49E, // 1D49E..1D49F; COMMON
+ 0x1D4A0, // 1D4A0..1D4A1; UNKNOWN
+ 0x1D4A2, // 1D4A2 ; COMMON
+ 0x1D4A3, // 1D4A3..1D4A4; UNKNOWN
+ 0x1D4A5, // 1D4A5..1D4A6; COMMON
+ 0x1D4A7, // 1D4A7..1D4A8; UNKNOWN
+ 0x1D4A9, // 1D4A9..1D4AC; COMMON
+ 0x1D4AD, // 1D4AD ; UNKNOWN
+ 0x1D4AE, // 1D4AE..1D4B9; COMMON
+ 0x1D4BA, // 1D4BA ; UNKNOWN
+ 0x1D4BB, // 1D4BB ; COMMON
+ 0x1D4BC, // 1D4BC ; UNKNOWN
+ 0x1D4BD, // 1D4BD..1D4C3; COMMON
+ 0x1D4C4, // 1D4C4 ; UNKNOWN
+ 0x1D4C5, // 1D4C5..1D505; COMMON
+ 0x1D506, // 1D506 ; UNKNOWN
+ 0x1D507, // 1D507..1D50A; COMMON
+ 0x1D50B, // 1D50B..1D50C; UNKNOWN
+ 0x1D50D, // 1D50D..1D514; COMMON
+ 0x1D515, // 1D515 ; UNKNOWN
+ 0x1D516, // 1D516..1D51C; COMMON
+ 0x1D51D, // 1D51D ; UNKNOWN
+ 0x1D51E, // 1D51E..1D539; COMMON
+ 0x1D53A, // 1D53A ; UNKNOWN
+ 0x1D53B, // 1D53B..1D53E; COMMON
+ 0x1D53F, // 1D53F ; UNKNOWN
+ 0x1D540, // 1D540..1D544; COMMON
+ 0x1D545, // 1D545 ; UNKNOWN
+ 0x1D546, // 1D546 ; COMMON
+ 0x1D547, // 1D547..1D549; UNKNOWN
+ 0x1D54A, // 1D54A..1D550; COMMON
+ 0x1D551, // 1D551 ; UNKNOWN
+ 0x1D552, // 1D552..1D6A5; COMMON
+ 0x1D6A6, // 1D6A6..1D6A7; UNKNOWN
+ 0x1D6A8, // 1D6A8..1D7CB; COMMON
+ 0x1D7CC, // 1D7CC..1D7CD; UNKNOWN
+ 0x1D7CE, // 1D7CE..1D7FF; COMMON
+ 0x1D800, // 1D800..1E7FF; UNKNOWN
+ 0x1E800, // 1E800..1E8C4; MENDE_KIKAKUI
+ 0x1E8C5, // 1E8C5..1E8C6; UNKNOWN
+ 0x1E8C7, // 1E8C7..1E8D6; MENDE_KIKAKUI
+ 0x1E8D7, // 1E8D7..1EDFF; UNKNOWN
+ 0x1EE00, // 1EE00..1EE03; ARABIC
+ 0x1EE04, // 1EE04 ; UNKNOWN
+ 0x1EE05, // 1EE05..1EE1F; ARABIC
+ 0x1EE20, // 1EE20 ; UNKNOWN
+ 0x1EE21, // 1EE21..1EE22; ARABIC
+ 0x1EE23, // 1EE23 ; UNKNOWN
+ 0x1EE24, // 1EE24 ; ARABIC
+ 0x1EE25, // 1EE25..1EE26; UNKNOWN
+ 0x1EE27, // 1EE27 ; ARABIC
+ 0x1EE28, // 1EE28 ; UNKNOWN
+ 0x1EE29, // 1EE29..1EE32; ARABIC
+ 0x1EE33, // 1EE33 ; UNKNOWN
+ 0x1EE34, // 1EE34..1EE37; ARABIC
+ 0x1EE38, // 1EE38 ; UNKNOWN
+ 0x1EE39, // 1EE39 ; ARABIC
+ 0x1EE3A, // 1EE3A ; UNKNOWN
+ 0x1EE3B, // 1EE3B ; ARABIC
+ 0x1EE3C, // 1EE3C..1EE41; UNKNOWN
+ 0x1EE42, // 1EE42 ; ARABIC
+ 0x1EE43, // 1EE43..1EE46; UNKNOWN
+ 0x1EE47, // 1EE47 ; ARABIC
+ 0x1EE48, // 1EE48 ; UNKNOWN
+ 0x1EE49, // 1EE49 ; ARABIC
+ 0x1EE4A, // 1EE4A ; UNKNOWN
+ 0x1EE4B, // 1EE4B ; ARABIC
+ 0x1EE4C, // 1EE4C ; UNKNOWN
+ 0x1EE4D, // 1EE4D..1EE4F; ARABIC
+ 0x1EE50, // 1EE50 ; UNKNOWN
+ 0x1EE51, // 1EE51..1EE52; ARABIC
+ 0x1EE53, // 1EE53 ; UNKNOWN
+ 0x1EE54, // 1EE54 ; ARABIC
+ 0x1EE55, // 1EE55..1EE56; UNKNOWN
+ 0x1EE57, // 1EE57 ; ARABIC
+ 0x1EE58, // 1EE58 ; UNKNOWN
+ 0x1EE59, // 1EE59 ; ARABIC
+ 0x1EE5A, // 1EE5A ; UNKNOWN
+ 0x1EE5B, // 1EE5B ; ARABIC
+ 0x1EE5C, // 1EE5C ; UNKNOWN
+ 0x1EE5D, // 1EE5D ; ARABIC
+ 0x1EE5E, // 1EE5E ; UNKNOWN
+ 0x1EE5F, // 1EE5F ; ARABIC
+ 0x1EE60, // 1EE60 ; UNKNOWN
+ 0x1EE61, // 1EE61..1EE62; ARABIC
+ 0x1EE63, // 1EE63 ; UNKNOWN
+ 0x1EE64, // 1EE64 ; ARABIC
+ 0x1EE65, // 1EE65..1EE66; UNKNOWN
+ 0x1EE67, // 1EE67..1EE6A; ARABIC
+ 0x1EE6B, // 1EE6B ; UNKNOWN
+ 0x1EE6C, // 1EE6C..1EE72; ARABIC
+ 0x1EE73, // 1EE73 ; UNKNOWN
+ 0x1EE74, // 1EE74..1EE77; ARABIC
+ 0x1EE78, // 1EE78 ; UNKNOWN
+ 0x1EE79, // 1EE79..1EE7C; ARABIC
+ 0x1EE7D, // 1EE7D ; UNKNOWN
+ 0x1EE7E, // 1EE7E ; ARABIC
+ 0x1EE7F, // 1EE7F ; UNKNOWN
+ 0x1EE80, // 1EE80..1EE89; ARABIC
+ 0x1EE8A, // 1EE8A ; UNKNOWN
+ 0x1EE8B, // 1EE8B..1EE9B; ARABIC
+ 0x1EE9C, // 1EE9C..1EEA0; UNKNOWN
+ 0x1EEA1, // 1EEA1..1EEA3; ARABIC
+ 0x1EEA4, // 1EEA4 ; UNKNOWN
+ 0x1EEA5, // 1EEA5..1EEA9; ARABIC
+ 0x1EEAA, // 1EEAA ; UNKNOWN
+ 0x1EEAB, // 1EEAB..1EEBB; ARABIC
+ 0x1EEBC, // 1EEBC..1EEEF; UNKNOWN
+ 0x1EEF0, // 1EEF0..1EEF1; ARABIC
+ 0x1EEF2, // 1EEF2..1EFFF; UNKNOWN
+ 0x1F000, // 1F000..1F02B; COMMON
+ 0x1F02C, // 1F02C..1F02F; UNKNOWN
+ 0x1F030, // 1F030..1F093; COMMON
+ 0x1F094, // 1F094..1F09F; UNKNOWN
+ 0x1F0A0, // 1F0A0..1F0AE; COMMON
+ 0x1F0AF, // 1F0AF..1F0B0; UNKNOWN
+ 0x1F0B1, // 1F0B1..1F0BF; COMMON
+ 0x1F0C0, // 1F0C0 ; UNKNOWN
+ 0x1F0C1, // 1F0C1..1F0CF; COMMON
+ 0x1F0D0, // 1F0D0 ; UNKNOWN
+ 0x1F0D1, // 1F0D1..1F0F5; COMMON
+ 0x1F0F6, // 1F0F6..1F0FF; UNKNOWN
+ 0x1F100, // 1F100..1F10C; COMMON
+ 0x1F10D, // 1F10D..1F10F; UNKNOWN
+ 0x1F110, // 1F110..1F12E; COMMON
+ 0x1F12F, // 1F12F ; UNKNOWN
+ 0x1F130, // 1F130..1F16B; COMMON
+ 0x1F16C, // 1F16C..1F16F; UNKNOWN
+ 0x1F170, // 1F170..1F19A; COMMON
+ 0x1F19B, // 1F19B..1F1E5; UNKNOWN
+ 0x1F1E6, // 1F1E6..1F1FF; COMMON
+ 0x1F200, // 1F200 ; HIRAGANA
+ 0x1F201, // 1F201..1F202; COMMON
+ 0x1F203, // 1F203..1F20F; UNKNOWN
+ 0x1F210, // 1F210..1F23A; COMMON
+ 0x1F23B, // 1F23B..1F23F; UNKNOWN
+ 0x1F240, // 1F240..1F248; COMMON
+ 0x1F249, // 1F249..1F24F; UNKNOWN
+ 0x1F250, // 1F250..1F251; COMMON
+ 0x1F252, // 1F252..1F2FF; UNKNOWN
+ 0x1F300, // 1F300..1F32C; COMMON
+ 0x1F32D, // 1F32D..1F32F; UNKNOWN
+ 0x1F330, // 1F330..1F37D; COMMON
+ 0x1F37E, // 1F37E..1F37F; UNKNOWN
+ 0x1F380, // 1F380..1F3CE; COMMON
+ 0x1F3CF, // 1F3CF..1F3D3; UNKNOWN
+ 0x1F3D4, // 1F3D4..1F3F7; COMMON
+ 0x1F3F8, // 1F3F8..1F3FF; UNKNOWN
+ 0x1F400, // 1F400..1F4FE; COMMON
+ 0x1F4FF, // 1F4FF ; UNKNOWN
+ 0x1F500, // 1F500..1F54A; COMMON
+ 0x1F54B, // 1F54B..1F54F; UNKNOWN
+ 0x1F550, // 1F550..1F579; COMMON
+ 0x1F57A, // 1F57A ; UNKNOWN
+ 0x1F57B, // 1F57B..1F5A3; COMMON
+ 0x1F5A4, // 1F5A4 ; UNKNOWN
+ 0x1F5A5, // 1F5A5..1F642; COMMON
+ 0x1F643, // 1F643..1F644; UNKNOWN
+ 0x1F645, // 1F645..1F6CF; COMMON
+ 0x1F6D0, // 1F6D0..1F6DF; UNKNOWN
+ 0x1F6E0, // 1F6E0..1F6EC; COMMON
+ 0x1F6ED, // 1F6ED..1F6EF; UNKNOWN
+ 0x1F6F0, // 1F6F0..1F6F3; COMMON
+ 0x1F6F4, // 1F6F4..1F6FF; UNKNOWN
+ 0x1F700, // 1F700..1F773; COMMON
+ 0x1F774, // 1F774..1F77F; UNKNOWN
+ 0x1F780, // 1F780..1F7D4; COMMON
+ 0x1F7D5, // 1F7D5..1F7FF; UNKNOWN
+ 0x1F800, // 1F800..1F80B; COMMON
+ 0x1F80C, // 1F80C..1F80F; UNKNOWN
+ 0x1F810, // 1F810..1F847; COMMON
+ 0x1F848, // 1F848..1F84F; UNKNOWN
+ 0x1F850, // 1F850..1F859; COMMON
+ 0x1F85A, // 1F85A..1F85F; UNKNOWN
+ 0x1F860, // 1F860..1F887; COMMON
+ 0x1F888, // 1F888..1F88F; UNKNOWN
+ 0x1F890, // 1F890..1F8AD; COMMON
+ 0x1F8AE, // 1F8AE..1FFFF; UNKNOWN
+ 0x20000, // 20000..2A6D6; HAN
+ 0x2A6D7, // 2A6D7..2A6FF; UNKNOWN
+ 0x2A700, // 2A700..2B734; HAN
+ 0x2B735, // 2B735..2B73F; UNKNOWN
+ 0x2B740, // 2B740..2B81D; HAN
+ 0x2B81E, // 2B81E..2F7FF; UNKNOWN
+ 0x2F800, // 2F800..2FA1D; HAN
+ 0x2FA1E, // 2FA1E..E0000; UNKNOWN
+ 0xE0001, // E0001 ; COMMON
+ 0xE0002, // E0002..E001F; UNKNOWN
+ 0xE0020, // E0020..E007F; COMMON
+ 0xE0080, // E0080..E00FF; UNKNOWN
0xE0100, // E0100..E01EF; INHERITED
0xE01F0 // E01F0..10FFFF; UNKNOWN
-
};
private static final UnicodeScript[] scripts = {
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- BOPOMOFO,
- COMMON,
- INHERITED,
- GREEK,
- COMMON,
- GREEK,
- COMMON,
- GREEK,
- COMMON,
- GREEK,
- COMMON,
- GREEK,
- COPTIC,
- GREEK,
- CYRILLIC,
- INHERITED,
- CYRILLIC,
- ARMENIAN,
- COMMON,
- ARMENIAN,
- HEBREW,
- ARABIC,
- COMMON,
- ARABIC,
- COMMON,
- ARABIC,
- COMMON,
- ARABIC,
- COMMON,
- ARABIC,
- INHERITED,
- ARABIC,
- COMMON,
- ARABIC,
- INHERITED,
- ARABIC,
- COMMON,
- ARABIC,
- SYRIAC,
- ARABIC,
- THAANA,
- NKO,
- SAMARITAN,
- MANDAIC,
- ARABIC,
- DEVANAGARI,
- INHERITED,
- DEVANAGARI,
- COMMON,
- DEVANAGARI,
- BENGALI,
- GURMUKHI,
- GUJARATI,
- ORIYA,
- TAMIL,
- TELUGU,
- KANNADA,
- MALAYALAM,
- SINHALA,
- THAI,
- COMMON,
- THAI,
- LAO,
- TIBETAN,
- COMMON,
- TIBETAN,
- MYANMAR,
- GEORGIAN,
- COMMON,
- GEORGIAN,
- HANGUL,
- ETHIOPIC,
- CHEROKEE,
- CANADIAN_ABORIGINAL,
- OGHAM,
- RUNIC,
- COMMON,
- RUNIC,
- TAGALOG,
- HANUNOO,
- COMMON,
- BUHID,
- TAGBANWA,
- KHMER,
- MONGOLIAN,
- COMMON,
- MONGOLIAN,
- COMMON,
- MONGOLIAN,
- CANADIAN_ABORIGINAL,
- LIMBU,
- TAI_LE,
- NEW_TAI_LUE,
- KHMER,
- BUGINESE,
- TAI_THAM,
- BALINESE,
- SUNDANESE,
- BATAK,
- LEPCHA,
- OL_CHIKI,
- SUNDANESE,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- LATIN,
- GREEK,
- CYRILLIC,
- LATIN,
- GREEK,
- LATIN,
- GREEK,
- LATIN,
- CYRILLIC,
- LATIN,
- GREEK,
- INHERITED,
- LATIN,
- GREEK,
- COMMON,
- INHERITED,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- INHERITED,
- COMMON,
- GREEK,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- BRAILLE,
- COMMON,
- GLAGOLITIC,
- LATIN,
- COPTIC,
- GEORGIAN,
- TIFINAGH,
- ETHIOPIC,
- CYRILLIC,
- COMMON,
- HAN,
- COMMON,
- HAN,
- COMMON,
- HAN,
- COMMON,
- HAN,
- INHERITED,
- HANGUL,
- COMMON,
- HAN,
- COMMON,
- HIRAGANA,
- INHERITED,
- COMMON,
- HIRAGANA,
- COMMON,
- KATAKANA,
- COMMON,
- KATAKANA,
- BOPOMOFO,
- HANGUL,
- COMMON,
- BOPOMOFO,
- COMMON,
- KATAKANA,
- HANGUL,
- COMMON,
- HANGUL,
- COMMON,
- KATAKANA,
- COMMON,
- HAN,
- COMMON,
- HAN,
- YI,
- LISU,
- VAI,
- CYRILLIC,
- BAMUM,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- SYLOTI_NAGRI,
- COMMON,
- PHAGS_PA,
- SAURASHTRA,
- DEVANAGARI,
- KAYAH_LI,
- REJANG,
- HANGUL,
- JAVANESE,
- CHAM,
- MYANMAR,
- TAI_VIET,
- MEETEI_MAYEK,
- ETHIOPIC,
- MEETEI_MAYEK,
- HANGUL,
- UNKNOWN ,
- HAN,
- LATIN,
- ARMENIAN,
- HEBREW,
- ARABIC,
- COMMON,
- ARABIC,
- COMMON,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- ARABIC,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- KATAKANA,
- COMMON,
- KATAKANA,
- COMMON,
- HANGUL,
- COMMON,
- LINEAR_B,
- COMMON,
- GREEK,
- COMMON,
- INHERITED,
- LYCIAN,
- CARIAN,
- OLD_ITALIC,
- GOTHIC,
- UGARITIC,
- OLD_PERSIAN,
- DESERET,
- SHAVIAN,
- OSMANYA,
- CYPRIOT,
- IMPERIAL_ARAMAIC,
- PHOENICIAN,
- LYDIAN,
- MEROITIC_HIEROGLYPHS,
- MEROITIC_CURSIVE,
- KHAROSHTHI,
- OLD_SOUTH_ARABIAN,
- AVESTAN,
- INSCRIPTIONAL_PARTHIAN,
- INSCRIPTIONAL_PAHLAVI,
- OLD_TURKIC,
- ARABIC,
- BRAHMI,
- KAITHI,
- SORA_SOMPENG,
- CHAKMA,
- SHARADA,
- TAKRI,
- CUNEIFORM,
- EGYPTIAN_HIEROGLYPHS,
- BAMUM,
- MIAO,
- KATAKANA,
- HIRAGANA,
- COMMON,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- GREEK,
- COMMON,
- ARABIC,
- COMMON,
- HIRAGANA,
- COMMON,
- HAN,
- COMMON,
- INHERITED,
- UNKNOWN
+ COMMON, // 0000..0040
+ LATIN, // 0041..005A
+ COMMON, // 005B..0060
+ LATIN, // 0061..007A
+ COMMON, // 007B..00A9
+ LATIN, // 00AA
+ COMMON, // 00AB..00B9
+ LATIN, // 00BA
+ COMMON, // 00BB..00BF
+ LATIN, // 00C0..00D6
+ COMMON, // 00D7
+ LATIN, // 00D8..00F6
+ COMMON, // 00F7
+ LATIN, // 00F8..02B8
+ COMMON, // 02B9..02DF
+ LATIN, // 02E0..02E4
+ COMMON, // 02E5..02E9
+ BOPOMOFO, // 02EA..02EB
+ COMMON, // 02EC..02FF
+ INHERITED, // 0300..036F
+ GREEK, // 0370..0373
+ COMMON, // 0374
+ GREEK, // 0375..0377
+ UNKNOWN, // 0378..0379
+ GREEK, // 037A..037D
+ COMMON, // 037E
+ GREEK, // 037F
+ UNKNOWN, // 0380..0383
+ GREEK, // 0384
+ COMMON, // 0385
+ GREEK, // 0386
+ COMMON, // 0387
+ GREEK, // 0388..038A
+ UNKNOWN, // 038B
+ GREEK, // 038C
+ UNKNOWN, // 038D
+ GREEK, // 038E..03A1
+ UNKNOWN, // 03A2
+ GREEK, // 03A3..03E1
+ COPTIC, // 03E2..03EF
+ GREEK, // 03F0..03FF
+ CYRILLIC, // 0400..0484
+ INHERITED, // 0485..0486
+ CYRILLIC, // 0487..052F
+ UNKNOWN, // 0530
+ ARMENIAN, // 0531..0556
+ UNKNOWN, // 0557..0558
+ ARMENIAN, // 0559..055F
+ UNKNOWN, // 0560
+ ARMENIAN, // 0561..0587
+ UNKNOWN, // 0588
+ COMMON, // 0589
+ ARMENIAN, // 058A
+ UNKNOWN, // 058B..058C
+ ARMENIAN, // 058D..058F
+ UNKNOWN, // 0590
+ HEBREW, // 0591..05C7
+ UNKNOWN, // 05C8..05CF
+ HEBREW, // 05D0..05EA
+ UNKNOWN, // 05EB..05EF
+ HEBREW, // 05F0..05F4
+ UNKNOWN, // 05F5..05FF
+ ARABIC, // 0600..0604
+ COMMON, // 0605
+ ARABIC, // 0606..060B
+ COMMON, // 060C
+ ARABIC, // 060D..061A
+ COMMON, // 061B..061C
+ UNKNOWN, // 061D
+ ARABIC, // 061E
+ COMMON, // 061F
+ ARABIC, // 0620..063F
+ COMMON, // 0640
+ ARABIC, // 0641..064A
+ INHERITED, // 064B..0655
+ ARABIC, // 0656..065F
+ COMMON, // 0660..0669
+ ARABIC, // 066A..066F
+ INHERITED, // 0670
+ ARABIC, // 0671..06DC
+ COMMON, // 06DD
+ ARABIC, // 06DE..06FF
+ SYRIAC, // 0700..070D
+ UNKNOWN, // 070E
+ SYRIAC, // 070F..074A
+ UNKNOWN, // 074B..074C
+ SYRIAC, // 074D..074F
+ ARABIC, // 0750..077F
+ THAANA, // 0780..07B1
+ UNKNOWN, // 07B2..07BF
+ NKO, // 07C0..07FA
+ UNKNOWN, // 07FB..07FF
+ SAMARITAN, // 0800..082D
+ UNKNOWN, // 082E..082F
+ SAMARITAN, // 0830..083E
+ UNKNOWN, // 083F
+ MANDAIC, // 0840..085B
+ UNKNOWN, // 085C..085D
+ MANDAIC, // 085E
+ UNKNOWN, // 085F..089F
+ ARABIC, // 08A0..08B2
+ UNKNOWN, // 08B3..08E3
+ ARABIC, // 08E4..08FF
+ DEVANAGARI, // 0900..0950
+ INHERITED, // 0951..0952
+ DEVANAGARI, // 0953..0963
+ COMMON, // 0964..0965
+ DEVANAGARI, // 0966..097F
+ BENGALI, // 0980..0983
+ UNKNOWN, // 0984
+ BENGALI, // 0985..098C
+ UNKNOWN, // 098D..098E
+ BENGALI, // 098F..0990
+ UNKNOWN, // 0991..0992
+ BENGALI, // 0993..09A8
+ UNKNOWN, // 09A9
+ BENGALI, // 09AA..09B0
+ UNKNOWN, // 09B1
+ BENGALI, // 09B2
+ UNKNOWN, // 09B3..09B5
+ BENGALI, // 09B6..09B9
+ UNKNOWN, // 09BA..09BB
+ BENGALI, // 09BC..09C4
+ UNKNOWN, // 09C5..09C6
+ BENGALI, // 09C7..09C8
+ UNKNOWN, // 09C9..09CA
+ BENGALI, // 09CB..09CE
+ UNKNOWN, // 09CF..09D6
+ BENGALI, // 09D7
+ UNKNOWN, // 09D8..09DB
+ BENGALI, // 09DC..09DD
+ UNKNOWN, // 09DE
+ BENGALI, // 09DF..09E3
+ UNKNOWN, // 09E4..09E5
+ BENGALI, // 09E6..09FB
+ UNKNOWN, // 09FC..0A00
+ GURMUKHI, // 0A01..0A03
+ UNKNOWN, // 0A04
+ GURMUKHI, // 0A05..0A0A
+ UNKNOWN, // 0A0B..0A0E
+ GURMUKHI, // 0A0F..0A10
+ UNKNOWN, // 0A11..0A12
+ GURMUKHI, // 0A13..0A28
+ UNKNOWN, // 0A29
+ GURMUKHI, // 0A2A..0A30
+ UNKNOWN, // 0A31
+ GURMUKHI, // 0A32..0A33
+ UNKNOWN, // 0A34
+ GURMUKHI, // 0A35..0A36
+ UNKNOWN, // 0A37
+ GURMUKHI, // 0A38..0A39
+ UNKNOWN, // 0A3A..0A3B
+ GURMUKHI, // 0A3C
+ UNKNOWN, // 0A3D
+ GURMUKHI, // 0A3E..0A42
+ UNKNOWN, // 0A43..0A46
+ GURMUKHI, // 0A47..0A48
+ UNKNOWN, // 0A49..0A4A
+ GURMUKHI, // 0A4B..0A4D
+ UNKNOWN, // 0A4E..0A50
+ GURMUKHI, // 0A51
+ UNKNOWN, // 0A52..0A58
+ GURMUKHI, // 0A59..0A5C
+ UNKNOWN, // 0A5D
+ GURMUKHI, // 0A5E
+ UNKNOWN, // 0A5F..0A65
+ GURMUKHI, // 0A66..0A75
+ UNKNOWN, // 0A76..0A80
+ GUJARATI, // 0A81..0A83
+ UNKNOWN, // 0A84
+ GUJARATI, // 0A85..0A8D
+ UNKNOWN, // 0A8E
+ GUJARATI, // 0A8F..0A91
+ UNKNOWN, // 0A92
+ GUJARATI, // 0A93..0AA8
+ UNKNOWN, // 0AA9
+ GUJARATI, // 0AAA..0AB0
+ UNKNOWN, // 0AB1
+ GUJARATI, // 0AB2..0AB3
+ UNKNOWN, // 0AB4
+ GUJARATI, // 0AB5..0AB9
+ UNKNOWN, // 0ABA..0ABB
+ GUJARATI, // 0ABC..0AC5
+ UNKNOWN, // 0AC6
+ GUJARATI, // 0AC7..0AC9
+ UNKNOWN, // 0ACA
+ GUJARATI, // 0ACB..0ACD
+ UNKNOWN, // 0ACE..0ACF
+ GUJARATI, // 0AD0
+ UNKNOWN, // 0AD1..0ADF
+ GUJARATI, // 0AE0..0AE3
+ UNKNOWN, // 0AE4..0AE5
+ GUJARATI, // 0AE6..0AF1
+ UNKNOWN, // 0AF2..0B00
+ ORIYA, // 0B01..0B03
+ UNKNOWN, // 0B04
+ ORIYA, // 0B05..0B0C
+ UNKNOWN, // 0B0D..0B0E
+ ORIYA, // 0B0F..0B10
+ UNKNOWN, // 0B11..0B12
+ ORIYA, // 0B13..0B28
+ UNKNOWN, // 0B29
+ ORIYA, // 0B2A..0B30
+ UNKNOWN, // 0B31
+ ORIYA, // 0B32..0B33
+ UNKNOWN, // 0B34
+ ORIYA, // 0B35..0B39
+ UNKNOWN, // 0B3A..0B3B
+ ORIYA, // 0B3C..0B44
+ UNKNOWN, // 0B45..0B46
+ ORIYA, // 0B47..0B48
+ UNKNOWN, // 0B49..0B4A
+ ORIYA, // 0B4B..0B4D
+ UNKNOWN, // 0B4E..0B55
+ ORIYA, // 0B56..0B57
+ UNKNOWN, // 0B58..0B5B
+ ORIYA, // 0B5C..0B5D
+ UNKNOWN, // 0B5E
+ ORIYA, // 0B5F..0B63
+ UNKNOWN, // 0B64..0B65
+ ORIYA, // 0B66..0B77
+ UNKNOWN, // 0B78..0B81
+ TAMIL, // 0B82..0B83
+ UNKNOWN, // 0B84
+ TAMIL, // 0B85..0B8A
+ UNKNOWN, // 0B8B..0B8D
+ TAMIL, // 0B8E..0B90
+ UNKNOWN, // 0B91
+ TAMIL, // 0B92..0B95
+ UNKNOWN, // 0B96..0B98
+ TAMIL, // 0B99..0B9A
+ UNKNOWN, // 0B9B
+ TAMIL, // 0B9C
+ UNKNOWN, // 0B9D
+ TAMIL, // 0B9E..0B9F
+ UNKNOWN, // 0BA0..0BA2
+ TAMIL, // 0BA3..0BA4
+ UNKNOWN, // 0BA5..0BA7
+ TAMIL, // 0BA8..0BAA
+ UNKNOWN, // 0BAB..0BAD
+ TAMIL, // 0BAE..0BB9
+ UNKNOWN, // 0BBA..0BBD
+ TAMIL, // 0BBE..0BC2
+ UNKNOWN, // 0BC3..0BC5
+ TAMIL, // 0BC6..0BC8
+ UNKNOWN, // 0BC9
+ TAMIL, // 0BCA..0BCD
+ UNKNOWN, // 0BCE..0BCF
+ TAMIL, // 0BD0
+ UNKNOWN, // 0BD1..0BD6
+ TAMIL, // 0BD7
+ UNKNOWN, // 0BD8..0BE5
+ TAMIL, // 0BE6..0BFA
+ UNKNOWN, // 0BFB..0BFF
+ TELUGU, // 0C00..0C03
+ UNKNOWN, // 0C04
+ TELUGU, // 0C05..0C0C
+ UNKNOWN, // 0C0D
+ TELUGU, // 0C0E..0C10
+ UNKNOWN, // 0C11
+ TELUGU, // 0C12..0C28
+ UNKNOWN, // 0C29
+ TELUGU, // 0C2A..0C39
+ UNKNOWN, // 0C3A..0C3C
+ TELUGU, // 0C3D..0C44
+ UNKNOWN, // 0C45
+ TELUGU, // 0C46..0C48
+ UNKNOWN, // 0C49
+ TELUGU, // 0C4A..0C4D
+ UNKNOWN, // 0C4E..0C54
+ TELUGU, // 0C55..0C56
+ UNKNOWN, // 0C57
+ TELUGU, // 0C58..0C59
+ UNKNOWN, // 0C5A..0C5F
+ TELUGU, // 0C60..0C63
+ UNKNOWN, // 0C64..0C65
+ TELUGU, // 0C66..0C6F
+ UNKNOWN, // 0C70..0C77
+ TELUGU, // 0C78..0C7F
+ UNKNOWN, // 0C80
+ KANNADA, // 0C81..0C83
+ UNKNOWN, // 0C84
+ KANNADA, // 0C85..0C8C
+ UNKNOWN, // 0C8D
+ KANNADA, // 0C8E..0C90
+ UNKNOWN, // 0C91
+ KANNADA, // 0C92..0CA8
+ UNKNOWN, // 0CA9
+ KANNADA, // 0CAA..0CB3
+ UNKNOWN, // 0CB4
+ KANNADA, // 0CB5..0CB9
+ UNKNOWN, // 0CBA..0CBB
+ KANNADA, // 0CBC..0CC4
+ UNKNOWN, // 0CC5
+ KANNADA, // 0CC6..0CC8
+ UNKNOWN, // 0CC9
+ KANNADA, // 0CCA..0CCD
+ UNKNOWN, // 0CCE..0CD4
+ KANNADA, // 0CD5..0CD6
+ UNKNOWN, // 0CD7..0CDD
+ KANNADA, // 0CDE
+ UNKNOWN, // 0CDF
+ KANNADA, // 0CE0..0CE3
+ UNKNOWN, // 0CE4..0CE5
+ KANNADA, // 0CE6..0CEF
+ UNKNOWN, // 0CF0
+ KANNADA, // 0CF1..0CF2
+ UNKNOWN, // 0CF3..0D00
+ MALAYALAM, // 0D01..0D03
+ UNKNOWN, // 0D04
+ MALAYALAM, // 0D05..0D0C
+ UNKNOWN, // 0D0D
+ MALAYALAM, // 0D0E..0D10
+ UNKNOWN, // 0D11
+ MALAYALAM, // 0D12..0D3A
+ UNKNOWN, // 0D3B..0D3C
+ MALAYALAM, // 0D3D..0D44
+ UNKNOWN, // 0D45
+ MALAYALAM, // 0D46..0D48
+ UNKNOWN, // 0D49
+ MALAYALAM, // 0D4A..0D4E
+ UNKNOWN, // 0D4F..0D56
+ MALAYALAM, // 0D57
+ UNKNOWN, // 0D58..0D5F
+ MALAYALAM, // 0D60..0D63
+ UNKNOWN, // 0D64..0D65
+ MALAYALAM, // 0D66..0D75
+ UNKNOWN, // 0D76..0D78
+ MALAYALAM, // 0D79..0D7F
+ UNKNOWN, // 0D80..0D81
+ SINHALA, // 0D82..0D83
+ UNKNOWN, // 0D84
+ SINHALA, // 0D85..0D96
+ UNKNOWN, // 0D97..0D99
+ SINHALA, // 0D9A..0DB1
+ UNKNOWN, // 0DB2
+ SINHALA, // 0DB3..0DBB
+ UNKNOWN, // 0DBC
+ SINHALA, // 0DBD
+ UNKNOWN, // 0DBE..0DBF
+ SINHALA, // 0DC0..0DC6
+ UNKNOWN, // 0DC7..0DC9
+ SINHALA, // 0DCA
+ UNKNOWN, // 0DCB..0DCE
+ SINHALA, // 0DCF..0DD4
+ UNKNOWN, // 0DD5
+ SINHALA, // 0DD6
+ UNKNOWN, // 0DD7
+ SINHALA, // 0DD8..0DDF
+ UNKNOWN, // 0DE0..0DE5
+ SINHALA, // 0DE6..0DEF
+ UNKNOWN, // 0DF0..0DF1
+ SINHALA, // 0DF2..0DF4
+ UNKNOWN, // 0DF5..0E00
+ THAI, // 0E01..0E3A
+ UNKNOWN, // 0E3B..0E3E
+ COMMON, // 0E3F
+ THAI, // 0E40..0E5B
+ UNKNOWN, // 0E5C..0E80
+ LAO, // 0E81..0E82
+ UNKNOWN, // 0E83
+ LAO, // 0E84
+ UNKNOWN, // 0E85..0E86
+ LAO, // 0E87..0E88
+ UNKNOWN, // 0E89
+ LAO, // 0E8A
+ UNKNOWN, // 0E8B..0E8C
+ LAO, // 0E8D
+ UNKNOWN, // 0E8E..0E93
+ LAO, // 0E94..0E97
+ UNKNOWN, // 0E98
+ LAO, // 0E99..0E9F
+ UNKNOWN, // 0EA0
+ LAO, // 0EA1..0EA3
+ UNKNOWN, // 0EA4
+ LAO, // 0EA5
+ UNKNOWN, // 0EA6
+ LAO, // 0EA7
+ UNKNOWN, // 0EA8..0EA9
+ LAO, // 0EAA..0EAB
+ UNKNOWN, // 0EAC
+ LAO, // 0EAD..0EB9
+ UNKNOWN, // 0EBA
+ LAO, // 0EBB..0EBD
+ UNKNOWN, // 0EBE..0EBF
+ LAO, // 0EC0..0EC4
+ UNKNOWN, // 0EC5
+ LAO, // 0EC6
+ UNKNOWN, // 0EC7
+ LAO, // 0EC8..0ECD
+ UNKNOWN, // 0ECE..0ECF
+ LAO, // 0ED0..0ED9
+ UNKNOWN, // 0EDA..0EDB
+ LAO, // 0EDC..0EDF
+ UNKNOWN, // 0EE0..0EFF
+ TIBETAN, // 0F00..0F47
+ UNKNOWN, // 0F48
+ TIBETAN, // 0F49..0F6C
+ UNKNOWN, // 0F6D..0F70
+ TIBETAN, // 0F71..0F97
+ UNKNOWN, // 0F98
+ TIBETAN, // 0F99..0FBC
+ UNKNOWN, // 0FBD
+ TIBETAN, // 0FBE..0FCC
+ UNKNOWN, // 0FCD
+ TIBETAN, // 0FCE..0FD4
+ COMMON, // 0FD5..0FD8
+ TIBETAN, // 0FD9..0FDA
+ UNKNOWN, // 0FDB..0FFF
+ MYANMAR, // 1000..109F
+ GEORGIAN, // 10A0..10C5
+ UNKNOWN, // 10C6
+ GEORGIAN, // 10C7
+ UNKNOWN, // 10C8..10CC
+ GEORGIAN, // 10CD
+ UNKNOWN, // 10CE..10CF
+ GEORGIAN, // 10D0..10FA
+ COMMON, // 10FB
+ GEORGIAN, // 10FC..10FF
+ HANGUL, // 1100..11FF
+ ETHIOPIC, // 1200..1248
+ UNKNOWN, // 1249
+ ETHIOPIC, // 124A..124D
+ UNKNOWN, // 124E..124F
+ ETHIOPIC, // 1250..1256
+ UNKNOWN, // 1257
+ ETHIOPIC, // 1258
+ UNKNOWN, // 1259
+ ETHIOPIC, // 125A..125D
+ UNKNOWN, // 125E..125F
+ ETHIOPIC, // 1260..1288
+ UNKNOWN, // 1289
+ ETHIOPIC, // 128A..128D
+ UNKNOWN, // 128E..128F
+ ETHIOPIC, // 1290..12B0
+ UNKNOWN, // 12B1
+ ETHIOPIC, // 12B2..12B5
+ UNKNOWN, // 12B6..12B7
+ ETHIOPIC, // 12B8..12BE
+ UNKNOWN, // 12BF
+ ETHIOPIC, // 12C0
+ UNKNOWN, // 12C1
+ ETHIOPIC, // 12C2..12C5
+ UNKNOWN, // 12C6..12C7
+ ETHIOPIC, // 12C8..12D6
+ UNKNOWN, // 12D7
+ ETHIOPIC, // 12D8..1310
+ UNKNOWN, // 1311
+ ETHIOPIC, // 1312..1315
+ UNKNOWN, // 1316..1317
+ ETHIOPIC, // 1318..135A
+ UNKNOWN, // 135B..135C
+ ETHIOPIC, // 135D..137C
+ UNKNOWN, // 137D..137F
+ ETHIOPIC, // 1380..1399
+ UNKNOWN, // 139A..139F
+ CHEROKEE, // 13A0..13F4
+ UNKNOWN, // 13F5..13FF
+ CANADIAN_ABORIGINAL, // 1400..167F
+ OGHAM, // 1680..169C
+ UNKNOWN, // 169D..169F
+ RUNIC, // 16A0..16EA
+ COMMON, // 16EB..16ED
+ RUNIC, // 16EE..16F8
+ UNKNOWN, // 16F9..16FF
+ TAGALOG, // 1700..170C
+ UNKNOWN, // 170D
+ TAGALOG, // 170E..1714
+ UNKNOWN, // 1715..171F
+ HANUNOO, // 1720..1734
+ COMMON, // 1735..1736
+ UNKNOWN, // 1737..173F
+ BUHID, // 1740..1753
+ UNKNOWN, // 1754..175F
+ TAGBANWA, // 1760..176C
+ UNKNOWN, // 176D
+ TAGBANWA, // 176E..1770
+ UNKNOWN, // 1771
+ TAGBANWA, // 1772..1773
+ UNKNOWN, // 1774..177F
+ KHMER, // 1780..17DD
+ UNKNOWN, // 17DE..17DF
+ KHMER, // 17E0..17E9
+ UNKNOWN, // 17EA..17EF
+ KHMER, // 17F0..17F9
+ UNKNOWN, // 17FA..17FF
+ MONGOLIAN, // 1800..1801
+ COMMON, // 1802..1803
+ MONGOLIAN, // 1804
+ COMMON, // 1805
+ MONGOLIAN, // 1806..180E
+ UNKNOWN, // 180F
+ MONGOLIAN, // 1810..1819
+ UNKNOWN, // 181A..181F
+ MONGOLIAN, // 1820..1877
+ UNKNOWN, // 1878..187F
+ MONGOLIAN, // 1880..18AA
+ UNKNOWN, // 18AB..18AF
+ CANADIAN_ABORIGINAL, // 18B0..18F5
+ UNKNOWN, // 18F6..18FF
+ LIMBU, // 1900..191E
+ UNKNOWN, // 191F
+ LIMBU, // 1920..192B
+ UNKNOWN, // 192C..192F
+ LIMBU, // 1930..193B
+ UNKNOWN, // 193C..193F
+ LIMBU, // 1940
+ UNKNOWN, // 1941..1943
+ LIMBU, // 1944..194F
+ TAI_LE, // 1950..196D
+ UNKNOWN, // 196E..196F
+ TAI_LE, // 1970..1974
+ UNKNOWN, // 1975..197F
+ NEW_TAI_LUE, // 1980..19AB
+ UNKNOWN, // 19AC..19AF
+ NEW_TAI_LUE, // 19B0..19C9
+ UNKNOWN, // 19CA..19CF
+ NEW_TAI_LUE, // 19D0..19DA
+ UNKNOWN, // 19DB..19DD
+ NEW_TAI_LUE, // 19DE..19DF
+ KHMER, // 19E0..19FF
+ BUGINESE, // 1A00..1A1B
+ UNKNOWN, // 1A1C..1A1D
+ BUGINESE, // 1A1E..1A1F
+ TAI_THAM, // 1A20..1A5E
+ UNKNOWN, // 1A5F
+ TAI_THAM, // 1A60..1A7C
+ UNKNOWN, // 1A7D..1A7E
+ TAI_THAM, // 1A7F..1A89
+ UNKNOWN, // 1A8A..1A8F
+ TAI_THAM, // 1A90..1A99
+ UNKNOWN, // 1A9A..1A9F
+ TAI_THAM, // 1AA0..1AAD
+ UNKNOWN, // 1AAE..1AAF
+ INHERITED, // 1AB0..1ABE
+ UNKNOWN, // 1ABF..1AFF
+ BALINESE, // 1B00..1B4B
+ UNKNOWN, // 1B4C..1B4F
+ BALINESE, // 1B50..1B7C
+ UNKNOWN, // 1B7D..1B7F
+ SUNDANESE, // 1B80..1BBF
+ BATAK, // 1BC0..1BF3
+ UNKNOWN, // 1BF4..1BFB
+ BATAK, // 1BFC..1BFF
+ LEPCHA, // 1C00..1C37
+ UNKNOWN, // 1C38..1C3A
+ LEPCHA, // 1C3B..1C49
+ UNKNOWN, // 1C4A..1C4C
+ LEPCHA, // 1C4D..1C4F
+ OL_CHIKI, // 1C50..1C7F
+ UNKNOWN, // 1C80..1CBF
+ SUNDANESE, // 1CC0..1CC7
+ UNKNOWN, // 1CC8..1CCF
+ INHERITED, // 1CD0..1CD2
+ COMMON, // 1CD3
+ INHERITED, // 1CD4..1CE0
+ COMMON, // 1CE1
+ INHERITED, // 1CE2..1CE8
+ COMMON, // 1CE9..1CEC
+ INHERITED, // 1CED
+ COMMON, // 1CEE..1CF3
+ INHERITED, // 1CF4
+ COMMON, // 1CF5..1CF6
+ UNKNOWN, // 1CF7
+ INHERITED, // 1CF8..1CF9
+ UNKNOWN, // 1CFA..1CFF
+ LATIN, // 1D00..1D25
+ GREEK, // 1D26..1D2A
+ CYRILLIC, // 1D2B
+ LATIN, // 1D2C..1D5C
+ GREEK, // 1D5D..1D61
+ LATIN, // 1D62..1D65
+ GREEK, // 1D66..1D6A
+ LATIN, // 1D6B..1D77
+ CYRILLIC, // 1D78
+ LATIN, // 1D79..1DBE
+ GREEK, // 1DBF
+ INHERITED, // 1DC0..1DF5
+ UNKNOWN, // 1DF6..1DFB
+ INHERITED, // 1DFC..1DFF
+ LATIN, // 1E00..1EFF
+ GREEK, // 1F00..1F15
+ UNKNOWN, // 1F16..1F17
+ GREEK, // 1F18..1F1D
+ UNKNOWN, // 1F1E..1F1F
+ GREEK, // 1F20..1F45
+ UNKNOWN, // 1F46..1F47
+ GREEK, // 1F48..1F4D
+ UNKNOWN, // 1F4E..1F4F
+ GREEK, // 1F50..1F57
+ UNKNOWN, // 1F58
+ GREEK, // 1F59
+ UNKNOWN, // 1F5A
+ GREEK, // 1F5B
+ UNKNOWN, // 1F5C
+ GREEK, // 1F5D
+ UNKNOWN, // 1F5E
+ GREEK, // 1F5F..1F7D
+ UNKNOWN, // 1F7E..1F7F
+ GREEK, // 1F80..1FB4
+ UNKNOWN, // 1FB5
+ GREEK, // 1FB6..1FC4
+ UNKNOWN, // 1FC5
+ GREEK, // 1FC6..1FD3
+ UNKNOWN, // 1FD4..1FD5
+ GREEK, // 1FD6..1FDB
+ UNKNOWN, // 1FDC
+ GREEK, // 1FDD..1FEF
+ UNKNOWN, // 1FF0..1FF1
+ GREEK, // 1FF2..1FF4
+ UNKNOWN, // 1FF5
+ GREEK, // 1FF6..1FFE
+ UNKNOWN, // 1FFF
+ COMMON, // 2000..200B
+ INHERITED, // 200C..200D
+ COMMON, // 200E..2064
+ UNKNOWN, // 2065
+ COMMON, // 2066..2070
+ LATIN, // 2071
+ UNKNOWN, // 2072..2073
+ COMMON, // 2074..207E
+ LATIN, // 207F
+ COMMON, // 2080..208E
+ UNKNOWN, // 208F
+ LATIN, // 2090..209C
+ UNKNOWN, // 209D..209F
+ COMMON, // 20A0..20BD
+ UNKNOWN, // 20BE..20CF
+ INHERITED, // 20D0..20F0
+ UNKNOWN, // 20F1..20FF
+ COMMON, // 2100..2125
+ GREEK, // 2126
+ COMMON, // 2127..2129
+ LATIN, // 212A..212B
+ COMMON, // 212C..2131
+ LATIN, // 2132
+ COMMON, // 2133..214D
+ LATIN, // 214E
+ COMMON, // 214F..215F
+ LATIN, // 2160..2188
+ COMMON, // 2189
+ UNKNOWN, // 218A..218F
+ COMMON, // 2190..23FA
+ UNKNOWN, // 23FB..23FF
+ COMMON, // 2400..2426
+ UNKNOWN, // 2427..243F
+ COMMON, // 2440..244A
+ UNKNOWN, // 244B..245F
+ COMMON, // 2460..27FF
+ BRAILLE, // 2800..28FF
+ COMMON, // 2900..2B73
+ UNKNOWN, // 2B74..2B75
+ COMMON, // 2B76..2B95
+ UNKNOWN, // 2B96..2B97
+ COMMON, // 2B98..2BB9
+ UNKNOWN, // 2BBA..2BBC
+ COMMON, // 2BBD..2BC8
+ UNKNOWN, // 2BC9
+ COMMON, // 2BCA..2BD1
+ UNKNOWN, // 2BD2..2BFF
+ GLAGOLITIC, // 2C00..2C2E
+ UNKNOWN, // 2C2F
+ GLAGOLITIC, // 2C30..2C5E
+ UNKNOWN, // 2C5F
+ LATIN, // 2C60..2C7F
+ COPTIC, // 2C80..2CF3
+ UNKNOWN, // 2CF4..2CF8
+ COPTIC, // 2CF9..2CFF
+ GEORGIAN, // 2D00..2D25
+ UNKNOWN, // 2D26
+ GEORGIAN, // 2D27
+ UNKNOWN, // 2D28..2D2C
+ GEORGIAN, // 2D2D
+ UNKNOWN, // 2D2E..2D2F
+ TIFINAGH, // 2D30..2D67
+ UNKNOWN, // 2D68..2D6E
+ TIFINAGH, // 2D6F..2D70
+ UNKNOWN, // 2D71..2D7E
+ TIFINAGH, // 2D7F
+ ETHIOPIC, // 2D80..2D96
+ UNKNOWN, // 2D97..2D9F
+ ETHIOPIC, // 2DA0..2DA6
+ UNKNOWN, // 2DA7
+ ETHIOPIC, // 2DA8..2DAE
+ UNKNOWN, // 2DAF
+ ETHIOPIC, // 2DB0..2DB6
+ UNKNOWN, // 2DB7
+ ETHIOPIC, // 2DB8..2DBE
+ UNKNOWN, // 2DBF
+ ETHIOPIC, // 2DC0..2DC6
+ UNKNOWN, // 2DC7
+ ETHIOPIC, // 2DC8..2DCE
+ UNKNOWN, // 2DCF
+ ETHIOPIC, // 2DD0..2DD6
+ UNKNOWN, // 2DD7
+ ETHIOPIC, // 2DD8..2DDE
+ UNKNOWN, // 2DDF
+ CYRILLIC, // 2DE0..2DFF
+ COMMON, // 2E00..2E42
+ UNKNOWN, // 2E43..2E7F
+ HAN, // 2E80..2E99
+ UNKNOWN, // 2E9A
+ HAN, // 2E9B..2EF3
+ UNKNOWN, // 2EF4..2EFF
+ HAN, // 2F00..2FD5
+ UNKNOWN, // 2FD6..2FEF
+ COMMON, // 2FF0..2FFB
+ UNKNOWN, // 2FFC..2FFF
+ COMMON, // 3000..3004
+ HAN, // 3005
+ COMMON, // 3006
+ HAN, // 3007
+ COMMON, // 3008..3020
+ HAN, // 3021..3029
+ INHERITED, // 302A..302D
+ HANGUL, // 302E..302F
+ COMMON, // 3030..3037
+ HAN, // 3038..303B
+ COMMON, // 303C..303F
+ UNKNOWN, // 3040
+ HIRAGANA, // 3041..3096
+ UNKNOWN, // 3097..3098
+ INHERITED, // 3099..309A
+ COMMON, // 309B..309C
+ HIRAGANA, // 309D..309F
+ COMMON, // 30A0
+ KATAKANA, // 30A1..30FA
+ COMMON, // 30FB..30FC
+ KATAKANA, // 30FD..30FF
+ UNKNOWN, // 3100..3104
+ BOPOMOFO, // 3105..312D
+ UNKNOWN, // 312E..3130
+ HANGUL, // 3131..318E
+ UNKNOWN, // 318F
+ COMMON, // 3190..319F
+ BOPOMOFO, // 31A0..31BA
+ UNKNOWN, // 31BB..31BF
+ COMMON, // 31C0..31E3
+ UNKNOWN, // 31E4..31EF
+ KATAKANA, // 31F0..31FF
+ HANGUL, // 3200..321E
+ UNKNOWN, // 321F
+ COMMON, // 3220..325F
+ HANGUL, // 3260..327E
+ COMMON, // 327F..32CF
+ KATAKANA, // 32D0..32FE
+ UNKNOWN, // 32FF
+ KATAKANA, // 3300..3357
+ COMMON, // 3358..33FF
+ HAN, // 3400..4DB5
+ UNKNOWN, // 4DB6..4DBF
+ COMMON, // 4DC0..4DFF
+ HAN, // 4E00..9FCC
+ UNKNOWN, // 9FCD..9FFF
+ YI, // A000..A48C
+ UNKNOWN, // A48D..A48F
+ YI, // A490..A4C6
+ UNKNOWN, // A4C7..A4CF
+ LISU, // A4D0..A4FF
+ VAI, // A500..A62B
+ UNKNOWN, // A62C..A63F
+ CYRILLIC, // A640..A69D
+ UNKNOWN, // A69E
+ CYRILLIC, // A69F
+ BAMUM, // A6A0..A6F7
+ UNKNOWN, // A6F8..A6FF
+ COMMON, // A700..A721
+ LATIN, // A722..A787
+ COMMON, // A788..A78A
+ LATIN, // A78B..A78E
+ UNKNOWN, // A78F
+ LATIN, // A790..A7AD
+ UNKNOWN, // A7AE..A7AF
+ LATIN, // A7B0..A7B1
+ UNKNOWN, // A7B2..A7F6
+ LATIN, // A7F7..A7FF
+ SYLOTI_NAGRI, // A800..A82B
+ UNKNOWN, // A82C..A82F
+ COMMON, // A830..A839
+ UNKNOWN, // A83A..A83F
+ PHAGS_PA, // A840..A877
+ UNKNOWN, // A878..A87F
+ SAURASHTRA, // A880..A8C4
+ UNKNOWN, // A8C5..A8CD
+ SAURASHTRA, // A8CE..A8D9
+ UNKNOWN, // A8DA..A8DF
+ DEVANAGARI, // A8E0..A8FB
+ UNKNOWN, // A8FC..A8FF
+ KAYAH_LI, // A900..A92D
+ COMMON, // A92E
+ KAYAH_LI, // A92F
+ REJANG, // A930..A953
+ UNKNOWN, // A954..A95E
+ REJANG, // A95F
+ HANGUL, // A960..A97C
+ UNKNOWN, // A97D..A97F
+ JAVANESE, // A980..A9CD
+ UNKNOWN, // A9CE
+ COMMON, // A9CF
+ JAVANESE, // A9D0..A9D9
+ UNKNOWN, // A9DA..A9DD
+ JAVANESE, // A9DE..A9DF
+ MYANMAR, // A9E0..A9FE
+ UNKNOWN, // A9FF
+ CHAM, // AA00..AA36
+ UNKNOWN, // AA37..AA3F
+ CHAM, // AA40..AA4D
+ UNKNOWN, // AA4E..AA4F
+ CHAM, // AA50..AA59
+ UNKNOWN, // AA5A..AA5B
+ CHAM, // AA5C..AA5F
+ MYANMAR, // AA60..AA7F
+ TAI_VIET, // AA80..AAC2
+ UNKNOWN, // AAC3..AADA
+ TAI_VIET, // AADB..AADF
+ MEETEI_MAYEK, // AAE0..AAF6
+ UNKNOWN, // AAF7..AB00
+ ETHIOPIC, // AB01..AB06
+ UNKNOWN, // AB07..AB08
+ ETHIOPIC, // AB09..AB0E
+ UNKNOWN, // AB0F..AB10
+ ETHIOPIC, // AB11..AB16
+ UNKNOWN, // AB17..AB1F
+ ETHIOPIC, // AB20..AB26
+ UNKNOWN, // AB27
+ ETHIOPIC, // AB28..AB2E
+ UNKNOWN, // AB2F
+ LATIN, // AB30..AB5A
+ COMMON, // AB5B
+ LATIN, // AB5C..AB5F
+ UNKNOWN, // AB60..AB63
+ LATIN, // AB64
+ GREEK, // AB65
+ UNKNOWN, // AB66..ABBF
+ MEETEI_MAYEK, // ABC0..ABED
+ UNKNOWN, // ABEE..ABEF
+ MEETEI_MAYEK, // ABF0..ABF9
+ UNKNOWN, // ABFA..ABFF
+ HANGUL, // AC00..D7A3
+ UNKNOWN, // D7A4..D7AF
+ HANGUL, // D7B0..D7C6
+ UNKNOWN, // D7C7..D7CA
+ HANGUL, // D7CB..D7FB
+ UNKNOWN, // D7FC..F8FF
+ HAN, // F900..FA6D
+ UNKNOWN, // FA6E..FA6F
+ HAN, // FA70..FAD9
+ UNKNOWN, // FADA..FAFF
+ LATIN, // FB00..FB06
+ UNKNOWN, // FB07..FB12
+ ARMENIAN, // FB13..FB17
+ UNKNOWN, // FB18..FB1C
+ HEBREW, // FB1D..FB36
+ UNKNOWN, // FB37
+ HEBREW, // FB38..FB3C
+ UNKNOWN, // FB3D
+ HEBREW, // FB3E
+ UNKNOWN, // FB3F
+ HEBREW, // FB40..FB41
+ UNKNOWN, // FB42
+ HEBREW, // FB43..FB44
+ UNKNOWN, // FB45
+ HEBREW, // FB46..FB4F
+ ARABIC, // FB50..FBC1
+ UNKNOWN, // FBC2..FBD2
+ ARABIC, // FBD3..FD3D
+ COMMON, // FD3E..FD3F
+ UNKNOWN, // FD40..FD4F
+ ARABIC, // FD50..FD8F
+ UNKNOWN, // FD90..FD91
+ ARABIC, // FD92..FDC7
+ UNKNOWN, // FDC8..FDEF
+ ARABIC, // FDF0..FDFD
+ UNKNOWN, // FDFE..FDFF
+ INHERITED, // FE00..FE0F
+ COMMON, // FE10..FE19
+ UNKNOWN, // FE1A..FE1F
+ INHERITED, // FE20..FE2D
+ UNKNOWN, // FE2E..FE2F
+ COMMON, // FE30..FE52
+ UNKNOWN, // FE53
+ COMMON, // FE54..FE66
+ UNKNOWN, // FE67
+ COMMON, // FE68..FE6B
+ UNKNOWN, // FE6C..FE6F
+ ARABIC, // FE70..FE74
+ UNKNOWN, // FE75
+ ARABIC, // FE76..FEFC
+ UNKNOWN, // FEFD..FEFE
+ COMMON, // FEFF
+ UNKNOWN, // FF00
+ COMMON, // FF01..FF20
+ LATIN, // FF21..FF3A
+ COMMON, // FF3B..FF40
+ LATIN, // FF41..FF5A
+ COMMON, // FF5B..FF65
+ KATAKANA, // FF66..FF6F
+ COMMON, // FF70
+ KATAKANA, // FF71..FF9D
+ COMMON, // FF9E..FF9F
+ HANGUL, // FFA0..FFBE
+ UNKNOWN, // FFBF..FFC1
+ HANGUL, // FFC2..FFC7
+ UNKNOWN, // FFC8..FFC9
+ HANGUL, // FFCA..FFCF
+ UNKNOWN, // FFD0..FFD1
+ HANGUL, // FFD2..FFD7
+ UNKNOWN, // FFD8..FFD9
+ HANGUL, // FFDA..FFDC
+ UNKNOWN, // FFDD..FFDF
+ COMMON, // FFE0..FFE6
+ UNKNOWN, // FFE7
+ COMMON, // FFE8..FFEE
+ UNKNOWN, // FFEF..FFF8
+ COMMON, // FFF9..FFFD
+ UNKNOWN, // FFFE..FFFF
+ LINEAR_B, // 10000..1000B
+ UNKNOWN, // 1000C
+ LINEAR_B, // 1000D..10026
+ UNKNOWN, // 10027
+ LINEAR_B, // 10028..1003A
+ UNKNOWN, // 1003B
+ LINEAR_B, // 1003C..1003D
+ UNKNOWN, // 1003E
+ LINEAR_B, // 1003F..1004D
+ UNKNOWN, // 1004E..1004F
+ LINEAR_B, // 10050..1005D
+ UNKNOWN, // 1005E..1007F
+ LINEAR_B, // 10080..100FA
+ UNKNOWN, // 100FB..100FF
+ COMMON, // 10100..10102
+ UNKNOWN, // 10103..10106
+ COMMON, // 10107..10133
+ UNKNOWN, // 10134..10136
+ COMMON, // 10137..1013F
+ GREEK, // 10140..1018C
+ UNKNOWN, // 1018D..1018F
+ COMMON, // 10190..1019B
+ UNKNOWN, // 1019C..1019F
+ GREEK, // 101A0
+ UNKNOWN, // 101A1..101CF
+ COMMON, // 101D0..101FC
+ INHERITED, // 101FD
+ UNKNOWN, // 101FE..1027F
+ LYCIAN, // 10280..1029C
+ UNKNOWN, // 1029D..1029F
+ CARIAN, // 102A0..102D0
+ UNKNOWN, // 102D1..102DF
+ INHERITED, // 102E0
+ COMMON, // 102E1..102FB
+ UNKNOWN, // 102FC..102FF
+ OLD_ITALIC, // 10300..10323
+ UNKNOWN, // 10324..1032F
+ GOTHIC, // 10330..1034A
+ UNKNOWN, // 1034B..1034F
+ OLD_PERMIC, // 10350..1037A
+ UNKNOWN, // 1037B..1037F
+ UGARITIC, // 10380..1039D
+ UNKNOWN, // 1039E
+ UGARITIC, // 1039F
+ OLD_PERSIAN, // 103A0..103C3
+ UNKNOWN, // 103C4..103C7
+ OLD_PERSIAN, // 103C8..103D5
+ UNKNOWN, // 103D6..103FF
+ DESERET, // 10400..1044F
+ SHAVIAN, // 10450..1047F
+ OSMANYA, // 10480..1049D
+ UNKNOWN, // 1049E..1049F
+ OSMANYA, // 104A0..104A9
+ UNKNOWN, // 104AA..104FF
+ ELBASAN, // 10500..10527
+ UNKNOWN, // 10528..1052F
+ CAUCASIAN_ALBANIAN, // 10530..10563
+ UNKNOWN, // 10564..1056E
+ CAUCASIAN_ALBANIAN, // 1056F
+ UNKNOWN, // 10570..105FF
+ LINEAR_A, // 10600..10736
+ UNKNOWN, // 10737..1073F
+ LINEAR_A, // 10740..10755
+ UNKNOWN, // 10756..1075F
+ LINEAR_A, // 10760..10767
+ UNKNOWN, // 10768..107FF
+ CYPRIOT, // 10800..10805
+ UNKNOWN, // 10806..10807
+ CYPRIOT, // 10808
+ UNKNOWN, // 10809
+ CYPRIOT, // 1080A..10835
+ UNKNOWN, // 10836
+ CYPRIOT, // 10837..10838
+ UNKNOWN, // 10839..1083B
+ CYPRIOT, // 1083C
+ UNKNOWN, // 1083D..1083E
+ CYPRIOT, // 1083F
+ IMPERIAL_ARAMAIC, // 10840..10855
+ UNKNOWN, // 10856
+ IMPERIAL_ARAMAIC, // 10857..1085F
+ PALMYRENE, // 10860..1087F
+ NABATAEAN, // 10880..1089E
+ UNKNOWN, // 1089F..108A6
+ NABATAEAN, // 108A7..108AF
+ UNKNOWN, // 108B0..108FF
+ PHOENICIAN, // 10900..1091B
+ UNKNOWN, // 1091C..1091E
+ PHOENICIAN, // 1091F
+ LYDIAN, // 10920..10939
+ UNKNOWN, // 1093A..1093E
+ LYDIAN, // 1093F
+ UNKNOWN, // 10940..1097F
+ MEROITIC_HIEROGLYPHS, // 10980..1099F
+ MEROITIC_CURSIVE, // 109A0..109B7
+ UNKNOWN, // 109B8..109BD
+ MEROITIC_CURSIVE, // 109BE..109BF
+ UNKNOWN, // 109C0..109FF
+ KHAROSHTHI, // 10A00..10A03
+ UNKNOWN, // 10A04
+ KHAROSHTHI, // 10A05..10A06
+ UNKNOWN, // 10A07..10A0B
+ KHAROSHTHI, // 10A0C..10A13
+ UNKNOWN, // 10A14
+ KHAROSHTHI, // 10A15..10A17
+ UNKNOWN, // 10A18
+ KHAROSHTHI, // 10A19..10A33
+ UNKNOWN, // 10A34..10A37
+ KHAROSHTHI, // 10A38..10A3A
+ UNKNOWN, // 10A3B..10A3E
+ KHAROSHTHI, // 10A3F..10A47
+ UNKNOWN, // 10A48..10A4F
+ KHAROSHTHI, // 10A50..10A58
+ UNKNOWN, // 10A59..10A5F
+ OLD_SOUTH_ARABIAN, // 10A60..10A7F
+ OLD_NORTH_ARABIAN, // 10A80..10A9F
+ UNKNOWN, // 10AA0..10ABF
+ MANICHAEAN, // 10AC0..10AE6
+ UNKNOWN, // 10AE7..10AEA
+ MANICHAEAN, // 10AEB..10AF6
+ UNKNOWN, // 10AF7..10AFF
+ AVESTAN, // 10B00..10B35
+ UNKNOWN, // 10B36..10B38
+ AVESTAN, // 10B39..10B3F
+ INSCRIPTIONAL_PARTHIAN, // 10B40..10B55
+ UNKNOWN, // 10B56..10B57
+ INSCRIPTIONAL_PARTHIAN, // 10B58..10B5F
+ INSCRIPTIONAL_PAHLAVI, // 10B60..10B72
+ UNKNOWN, // 10B73..10B77
+ INSCRIPTIONAL_PAHLAVI, // 10B78..10B7F
+ PSALTER_PAHLAVI, // 10B80..10B91
+ UNKNOWN, // 10B92..10B98
+ PSALTER_PAHLAVI, // 10B99..10B9C
+ UNKNOWN, // 10B9D..10BA8
+ PSALTER_PAHLAVI, // 10BA9..10BAF
+ UNKNOWN, // 10BB0..10BFF
+ OLD_TURKIC, // 10C00..10C48
+ UNKNOWN, // 10C49..10E5F
+ ARABIC, // 10E60..10E7E
+ UNKNOWN, // 10E7F..10FFF
+ BRAHMI, // 11000..1104D
+ UNKNOWN, // 1104E..11051
+ BRAHMI, // 11052..1106F
+ UNKNOWN, // 11070..1107E
+ BRAHMI, // 1107F
+ KAITHI, // 11080..110C1
+ UNKNOWN, // 110C2..110CF
+ SORA_SOMPENG, // 110D0..110E8
+ UNKNOWN, // 110E9..110EF
+ SORA_SOMPENG, // 110F0..110F9
+ UNKNOWN, // 110FA..110FF
+ CHAKMA, // 11100..11134
+ UNKNOWN, // 11135
+ CHAKMA, // 11136..11143
+ UNKNOWN, // 11144..1114F
+ MAHAJANI, // 11150..11176
+ UNKNOWN, // 11177..1117F
+ SHARADA, // 11180..111C8
+ UNKNOWN, // 111C9..111CC
+ SHARADA, // 111CD
+ UNKNOWN, // 111CE..111CF
+ SHARADA, // 111D0..111DA
+ UNKNOWN, // 111DB..111E0
+ SINHALA, // 111E1..111F4
+ UNKNOWN, // 111F5..111FF
+ KHOJKI, // 11200..11211
+ UNKNOWN, // 11212
+ KHOJKI, // 11213..1123D
+ UNKNOWN, // 1123E..112AF
+ KHUDAWADI, // 112B0..112EA
+ UNKNOWN, // 112EB..112EF
+ KHUDAWADI, // 112F0..112F9
+ UNKNOWN, // 112FA..11300
+ GRANTHA, // 11301..11303
+ UNKNOWN, // 11304
+ GRANTHA, // 11305..1130C
+ UNKNOWN, // 1130D..1130E
+ GRANTHA, // 1130F..11310
+ UNKNOWN, // 11311..11312
+ GRANTHA, // 11313..11328
+ UNKNOWN, // 11329
+ GRANTHA, // 1132A..11330
+ UNKNOWN, // 11331
+ GRANTHA, // 11332..11333
+ UNKNOWN, // 11334
+ GRANTHA, // 11335..11339
+ UNKNOWN, // 1133A..1133B
+ GRANTHA, // 1133C..11344
+ UNKNOWN, // 11345..11346
+ GRANTHA, // 11347..11348
+ UNKNOWN, // 11349..1134A
+ GRANTHA, // 1134B..1134D
+ UNKNOWN, // 1134E..11356
+ GRANTHA, // 11357
+ UNKNOWN, // 11358..1135C
+ GRANTHA, // 1135D..11363
+ UNKNOWN, // 11364..11365
+ GRANTHA, // 11366..1136C
+ UNKNOWN, // 1136D..1136F
+ GRANTHA, // 11370..11374
+ UNKNOWN, // 11375..1147F
+ TIRHUTA, // 11480..114C7
+ UNKNOWN, // 114C8..114CF
+ TIRHUTA, // 114D0..114D9
+ UNKNOWN, // 114DA..1157F
+ SIDDHAM, // 11580..115B5
+ UNKNOWN, // 115B6..115B7
+ SIDDHAM, // 115B8..115C9
+ UNKNOWN, // 115CA..115FF
+ MODI, // 11600..11644
+ UNKNOWN, // 11645..1164F
+ MODI, // 11650..11659
+ UNKNOWN, // 1165A..1167F
+ TAKRI, // 11680..116B7
+ UNKNOWN, // 116B8..116BF
+ TAKRI, // 116C0..116C9
+ UNKNOWN, // 116CA..1189F
+ WARANG_CITI, // 118A0..118F2
+ UNKNOWN, // 118F3..118FE
+ WARANG_CITI, // 118FF
+ UNKNOWN, // 11900..11ABF
+ PAU_CIN_HAU, // 11AC0..11AF8
+ UNKNOWN, // 11AF9..11FFF
+ CUNEIFORM, // 12000..12398
+ UNKNOWN, // 12399..123FF
+ CUNEIFORM, // 12400..1246E
+ UNKNOWN, // 1246F
+ CUNEIFORM, // 12470..12474
+ UNKNOWN, // 12475..12FFF
+ EGYPTIAN_HIEROGLYPHS, // 13000..1342E
+ UNKNOWN, // 1342F..167FF
+ BAMUM, // 16800..16A38
+ UNKNOWN, // 16A39..16A3F
+ MRO, // 16A40..16A5E
+ UNKNOWN, // 16A5F
+ MRO, // 16A60..16A69
+ UNKNOWN, // 16A6A..16A6D
+ MRO, // 16A6E..16A6F
+ UNKNOWN, // 16A70..16ACF
+ BASSA_VAH, // 16AD0..16AED
+ UNKNOWN, // 16AEE..16AEF
+ BASSA_VAH, // 16AF0..16AF5
+ UNKNOWN, // 16AF6..16AFF
+ PAHAWH_HMONG, // 16B00..16B45
+ UNKNOWN, // 16B46..16B4F
+ PAHAWH_HMONG, // 16B50..16B59
+ UNKNOWN, // 16B5A
+ PAHAWH_HMONG, // 16B5B..16B61
+ UNKNOWN, // 16B62
+ PAHAWH_HMONG, // 16B63..16B77
+ UNKNOWN, // 16B78..16B7C
+ PAHAWH_HMONG, // 16B7D..16B8F
+ UNKNOWN, // 16B90..16EFF
+ MIAO, // 16F00..16F44
+ UNKNOWN, // 16F45..16F4F
+ MIAO, // 16F50..16F7E
+ UNKNOWN, // 16F7F..16F8E
+ MIAO, // 16F8F..16F9F
+ UNKNOWN, // 16FA0..1AFFF
+ KATAKANA, // 1B000
+ HIRAGANA, // 1B001
+ UNKNOWN, // 1B002..1BBFF
+ DUPLOYAN, // 1BC00..1BC6A
+ UNKNOWN, // 1BC6B..1BC6F
+ DUPLOYAN, // 1BC70..1BC7C
+ UNKNOWN, // 1BC7D..1BC7F
+ DUPLOYAN, // 1BC80..1BC88
+ UNKNOWN, // 1BC89..1BC8F
+ DUPLOYAN, // 1BC90..1BC99
+ UNKNOWN, // 1BC9A..1BC9B
+ DUPLOYAN, // 1BC9C..1BC9F
+ COMMON, // 1BCA0..1BCA3
+ UNKNOWN, // 1BCA4..1CFFF
+ COMMON, // 1D000..1D0F5
+ UNKNOWN, // 1D0F6..1D0FF
+ COMMON, // 1D100..1D126
+ UNKNOWN, // 1D127..1D128
+ COMMON, // 1D129..1D166
+ INHERITED, // 1D167..1D169
+ COMMON, // 1D16A..1D17A
+ INHERITED, // 1D17B..1D182
+ COMMON, // 1D183..1D184
+ INHERITED, // 1D185..1D18B
+ COMMON, // 1D18C..1D1A9
+ INHERITED, // 1D1AA..1D1AD
+ COMMON, // 1D1AE..1D1DD
+ UNKNOWN, // 1D1DE..1D1FF
+ GREEK, // 1D200..1D245
+ UNKNOWN, // 1D246..1D2FF
+ COMMON, // 1D300..1D356
+ UNKNOWN, // 1D357..1D35F
+ COMMON, // 1D360..1D371
+ UNKNOWN, // 1D372..1D3FF
+ COMMON, // 1D400..1D454
+ UNKNOWN, // 1D455
+ COMMON, // 1D456..1D49C
+ UNKNOWN, // 1D49D
+ COMMON, // 1D49E..1D49F
+ UNKNOWN, // 1D4A0..1D4A1
+ COMMON, // 1D4A2
+ UNKNOWN, // 1D4A3..1D4A4
+ COMMON, // 1D4A5..1D4A6
+ UNKNOWN, // 1D4A7..1D4A8
+ COMMON, // 1D4A9..1D4AC
+ UNKNOWN, // 1D4AD
+ COMMON, // 1D4AE..1D4B9
+ UNKNOWN, // 1D4BA
+ COMMON, // 1D4BB
+ UNKNOWN, // 1D4BC
+ COMMON, // 1D4BD..1D4C3
+ UNKNOWN, // 1D4C4
+ COMMON, // 1D4C5..1D505
+ UNKNOWN, // 1D506
+ COMMON, // 1D507..1D50A
+ UNKNOWN, // 1D50B..1D50C
+ COMMON, // 1D50D..1D514
+ UNKNOWN, // 1D515
+ COMMON, // 1D516..1D51C
+ UNKNOWN, // 1D51D
+ COMMON, // 1D51E..1D539
+ UNKNOWN, // 1D53A
+ COMMON, // 1D53B..1D53E
+ UNKNOWN, // 1D53F
+ COMMON, // 1D540..1D544
+ UNKNOWN, // 1D545
+ COMMON, // 1D546
+ UNKNOWN, // 1D547..1D549
+ COMMON, // 1D54A..1D550
+ UNKNOWN, // 1D551
+ COMMON, // 1D552..1D6A5
+ UNKNOWN, // 1D6A6..1D6A7
+ COMMON, // 1D6A8..1D7CB
+ UNKNOWN, // 1D7CC..1D7CD
+ COMMON, // 1D7CE..1D7FF
+ UNKNOWN, // 1D800..1E7FF
+ MENDE_KIKAKUI, // 1E800..1E8C4
+ UNKNOWN, // 1E8C5..1E8C6
+ MENDE_KIKAKUI, // 1E8C7..1E8D6
+ UNKNOWN, // 1E8D7..1EDFF
+ ARABIC, // 1EE00..1EE03
+ UNKNOWN, // 1EE04
+ ARABIC, // 1EE05..1EE1F
+ UNKNOWN, // 1EE20
+ ARABIC, // 1EE21..1EE22
+ UNKNOWN, // 1EE23
+ ARABIC, // 1EE24
+ UNKNOWN, // 1EE25..1EE26
+ ARABIC, // 1EE27
+ UNKNOWN, // 1EE28
+ ARABIC, // 1EE29..1EE32
+ UNKNOWN, // 1EE33
+ ARABIC, // 1EE34..1EE37
+ UNKNOWN, // 1EE38
+ ARABIC, // 1EE39
+ UNKNOWN, // 1EE3A
+ ARABIC, // 1EE3B
+ UNKNOWN, // 1EE3C..1EE41
+ ARABIC, // 1EE42
+ UNKNOWN, // 1EE43..1EE46
+ ARABIC, // 1EE47
+ UNKNOWN, // 1EE48
+ ARABIC, // 1EE49
+ UNKNOWN, // 1EE4A
+ ARABIC, // 1EE4B
+ UNKNOWN, // 1EE4C
+ ARABIC, // 1EE4D..1EE4F
+ UNKNOWN, // 1EE50
+ ARABIC, // 1EE51..1EE52
+ UNKNOWN, // 1EE53
+ ARABIC, // 1EE54
+ UNKNOWN, // 1EE55..1EE56
+ ARABIC, // 1EE57
+ UNKNOWN, // 1EE58
+ ARABIC, // 1EE59
+ UNKNOWN, // 1EE5A
+ ARABIC, // 1EE5B
+ UNKNOWN, // 1EE5C
+ ARABIC, // 1EE5D
+ UNKNOWN, // 1EE5E
+ ARABIC, // 1EE5F
+ UNKNOWN, // 1EE60
+ ARABIC, // 1EE61..1EE62
+ UNKNOWN, // 1EE63
+ ARABIC, // 1EE64
+ UNKNOWN, // 1EE65..1EE66
+ ARABIC, // 1EE67..1EE6A
+ UNKNOWN, // 1EE6B
+ ARABIC, // 1EE6C..1EE72
+ UNKNOWN, // 1EE73
+ ARABIC, // 1EE74..1EE77
+ UNKNOWN, // 1EE78
+ ARABIC, // 1EE79..1EE7C
+ UNKNOWN, // 1EE7D
+ ARABIC, // 1EE7E
+ UNKNOWN, // 1EE7F
+ ARABIC, // 1EE80..1EE89
+ UNKNOWN, // 1EE8A
+ ARABIC, // 1EE8B..1EE9B
+ UNKNOWN, // 1EE9C..1EEA0
+ ARABIC, // 1EEA1..1EEA3
+ UNKNOWN, // 1EEA4
+ ARABIC, // 1EEA5..1EEA9
+ UNKNOWN, // 1EEAA
+ ARABIC, // 1EEAB..1EEBB
+ UNKNOWN, // 1EEBC..1EEEF
+ ARABIC, // 1EEF0..1EEF1
+ UNKNOWN, // 1EEF2..1EFFF
+ COMMON, // 1F000..1F02B
+ UNKNOWN, // 1F02C..1F02F
+ COMMON, // 1F030..1F093
+ UNKNOWN, // 1F094..1F09F
+ COMMON, // 1F0A0..1F0AE
+ UNKNOWN, // 1F0AF..1F0B0
+ COMMON, // 1F0B1..1F0BF
+ UNKNOWN, // 1F0C0
+ COMMON, // 1F0C1..1F0CF
+ UNKNOWN, // 1F0D0
+ COMMON, // 1F0D1..1F0F5
+ UNKNOWN, // 1F0F6..1F0FF
+ COMMON, // 1F100..1F10C
+ UNKNOWN, // 1F10D..1F10F
+ COMMON, // 1F110..1F12E
+ UNKNOWN, // 1F12F
+ COMMON, // 1F130..1F16B
+ UNKNOWN, // 1F16C..1F16F
+ COMMON, // 1F170..1F19A
+ UNKNOWN, // 1F19B..1F1E5
+ COMMON, // 1F1E6..1F1FF
+ HIRAGANA, // 1F200
+ COMMON, // 1F201..1F202
+ UNKNOWN, // 1F203..1F20F
+ COMMON, // 1F210..1F23A
+ UNKNOWN, // 1F23B..1F23F
+ COMMON, // 1F240..1F248
+ UNKNOWN, // 1F249..1F24F
+ COMMON, // 1F250..1F251
+ UNKNOWN, // 1F252..1F2FF
+ COMMON, // 1F300..1F32C
+ UNKNOWN, // 1F32D..1F32F
+ COMMON, // 1F330..1F37D
+ UNKNOWN, // 1F37E..1F37F
+ COMMON, // 1F380..1F3CE
+ UNKNOWN, // 1F3CF..1F3D3
+ COMMON, // 1F3D4..1F3F7
+ UNKNOWN, // 1F3F8..1F3FF
+ COMMON, // 1F400..1F4FE
+ UNKNOWN, // 1F4FF
+ COMMON, // 1F500..1F54A
+ UNKNOWN, // 1F54B..1F54F
+ COMMON, // 1F550..1F579
+ UNKNOWN, // 1F57A
+ COMMON, // 1F57B..1F5A3
+ UNKNOWN, // 1F5A4
+ COMMON, // 1F5A5..1F642
+ UNKNOWN, // 1F643..1F644
+ COMMON, // 1F645..1F6CF
+ UNKNOWN, // 1F6D0..1F6DF
+ COMMON, // 1F6E0..1F6EC
+ UNKNOWN, // 1F6ED..1F6EF
+ COMMON, // 1F6F0..1F6F3
+ UNKNOWN, // 1F6F4..1F6FF
+ COMMON, // 1F700..1F773
+ UNKNOWN, // 1F774..1F77F
+ COMMON, // 1F780..1F7D4
+ UNKNOWN, // 1F7D5..1F7FF
+ COMMON, // 1F800..1F80B
+ UNKNOWN, // 1F80C..1F80F
+ COMMON, // 1F810..1F847
+ UNKNOWN, // 1F848..1F84F
+ COMMON, // 1F850..1F859
+ UNKNOWN, // 1F85A..1F85F
+ COMMON, // 1F860..1F887
+ UNKNOWN, // 1F888..1F88F
+ COMMON, // 1F890..1F8AD
+ UNKNOWN, // 1F8AE..1FFFF
+ HAN, // 20000..2A6D6
+ UNKNOWN, // 2A6D7..2A6FF
+ HAN, // 2A700..2B734
+ UNKNOWN, // 2B735..2B73F
+ HAN, // 2B740..2B81D
+ UNKNOWN, // 2B81E..2F7FF
+ HAN, // 2F800..2FA1D
+ UNKNOWN, // 2FA1E..E0000
+ COMMON, // E0001
+ UNKNOWN, // E0002..E001F
+ COMMON, // E0020..E007F
+ UNKNOWN, // E0080..E00FF
+ INHERITED, // E0100..E01EF
+ UNKNOWN // E01F0..10FFFF
};
private static HashMap<String, Character.UnicodeScript> aliases;
static {
aliases = new HashMap<>(128);
+ aliases.put("AGHB", CAUCASIAN_ALBANIAN);
aliases.put("ARAB", ARABIC);
aliases.put("ARMI", IMPERIAL_ARAMAIC);
aliases.put("ARMN", ARMENIAN);
aliases.put("AVST", AVESTAN);
aliases.put("BALI", BALINESE);
aliases.put("BAMU", BAMUM);
+ aliases.put("BASS", BASSA_VAH);
aliases.put("BATK", BATAK);
aliases.put("BENG", BENGALI);
aliases.put("BOPO", BOPOMOFO);
+ aliases.put("BRAH", BRAHMI);
aliases.put("BRAI", BRAILLE);
- aliases.put("BRAH", BRAHMI);
aliases.put("BUGI", BUGINESE);
aliases.put("BUHD", BUHID);
aliases.put("CAKM", CHAKMA);
@@ -4382,11 +7061,14 @@
aliases.put("CYRL", CYRILLIC);
aliases.put("DEVA", DEVANAGARI);
aliases.put("DSRT", DESERET);
+ aliases.put("DUPL", DUPLOYAN);
aliases.put("EGYP", EGYPTIAN_HIEROGLYPHS);
+ aliases.put("ELBA", ELBASAN);
aliases.put("ETHI", ETHIOPIC);
aliases.put("GEOR", GEORGIAN);
aliases.put("GLAG", GLAGOLITIC);
aliases.put("GOTH", GOTHIC);
+ aliases.put("GRAN", GRANTHA);
aliases.put("GREK", GREEK);
aliases.put("GUJR", GUJARATI);
aliases.put("GURU", GURMUKHI);
@@ -4395,6 +7077,7 @@
aliases.put("HANO", HANUNOO);
aliases.put("HEBR", HEBREW);
aliases.put("HIRA", HIRAGANA);
+ aliases.put("HMNG", PAHAWH_HMONG);
// it appears we don't have the KATAKANA_OR_HIRAGANA
//aliases.put("HRKT", KATAKANA_OR_HIRAGANA);
aliases.put("ITAL", OLD_ITALIC);
@@ -4403,6 +7086,7 @@
aliases.put("KANA", KATAKANA);
aliases.put("KHAR", KHAROSHTHI);
aliases.put("KHMR", KHMER);
+ aliases.put("KHOJ", KHOJKI);
aliases.put("KNDA", KANNADA);
aliases.put("KTHI", KAITHI);
aliases.put("LANA", TAI_THAM);
@@ -4410,27 +7094,39 @@
aliases.put("LATN", LATIN);
aliases.put("LEPC", LEPCHA);
aliases.put("LIMB", LIMBU);
+ aliases.put("LINA", LINEAR_A);
aliases.put("LINB", LINEAR_B);
aliases.put("LISU", LISU);
aliases.put("LYCI", LYCIAN);
aliases.put("LYDI", LYDIAN);
+ aliases.put("MAHJ", MAHAJANI);
aliases.put("MAND", MANDAIC);
+ aliases.put("MANI", MANICHAEAN);
+ aliases.put("MEND", MENDE_KIKAKUI);
aliases.put("MERC", MEROITIC_CURSIVE);
aliases.put("MERO", MEROITIC_HIEROGLYPHS);
aliases.put("MLYM", MALAYALAM);
+ aliases.put("MODI", MODI);
aliases.put("MONG", MONGOLIAN);
+ aliases.put("MROO", MRO);
aliases.put("MTEI", MEETEI_MAYEK);
aliases.put("MYMR", MYANMAR);
+ aliases.put("NARB", OLD_NORTH_ARABIAN);
+ aliases.put("NBAT", NABATAEAN);
aliases.put("NKOO", NKO);
aliases.put("OGAM", OGHAM);
aliases.put("OLCK", OL_CHIKI);
aliases.put("ORKH", OLD_TURKIC);
aliases.put("ORYA", ORIYA);
aliases.put("OSMA", OSMANYA);
+ aliases.put("PALM", PALMYRENE);
+ aliases.put("PAUC", PAU_CIN_HAU);
+ aliases.put("PERM", OLD_PERMIC);
aliases.put("PHAG", PHAGS_PA);
+ aliases.put("PHLI", INSCRIPTIONAL_PAHLAVI);
+ aliases.put("PHLP", PSALTER_PAHLAVI);
+ aliases.put("PHNX", PHOENICIAN);
aliases.put("PLRD", MIAO);
- aliases.put("PHLI", INSCRIPTIONAL_PAHLAVI);
- aliases.put("PHNX", PHOENICIAN);
aliases.put("PRTI", INSCRIPTIONAL_PARTHIAN);
aliases.put("RJNG", REJANG);
aliases.put("RUNR", RUNIC);
@@ -4439,14 +7135,16 @@
aliases.put("SAUR", SAURASHTRA);
aliases.put("SHAW", SHAVIAN);
aliases.put("SHRD", SHARADA);
+ aliases.put("SIDD", SIDDHAM);
+ aliases.put("SIND", KHUDAWADI);
aliases.put("SINH", SINHALA);
aliases.put("SORA", SORA_SOMPENG);
aliases.put("SUND", SUNDANESE);
aliases.put("SYLO", SYLOTI_NAGRI);
aliases.put("SYRC", SYRIAC);
aliases.put("TAGB", TAGBANWA);
+ aliases.put("TAKR", TAKRI);
aliases.put("TALE", TAI_LE);
- aliases.put("TAKR", TAKRI);
aliases.put("TALU", NEW_TAI_LUE);
aliases.put("TAML", TAMIL);
aliases.put("TAVT", TAI_VIET);
@@ -4456,8 +7154,10 @@
aliases.put("THAA", THAANA);
aliases.put("THAI", THAI);
aliases.put("TIBT", TIBETAN);
+ aliases.put("TIRH", TIRHUTA);
aliases.put("UGAR", UGARITIC);
aliases.put("VAII", VAI);
+ aliases.put("WARA", WARANG_CITI);
aliases.put("XPEO", OLD_PERSIAN);
aliases.put("XSUX", CUNEIFORM);
aliases.put("YIII", YI);
@@ -6598,8 +9298,9 @@
*
* @param ch the character to be converted.
* @return the numeric value of the character, as a nonnegative {@code int}
- * value; -2 if the character has a numeric value that is not a
- * nonnegative integer; -1 if the character has no numeric value.
+ * value; -2 if the character has a numeric value but the value
+ * cannot be represented as a nonnegative {@code int} value;
+ * -1 if the character has no numeric value.
* @see Character#forDigit(int, int)
* @see Character#isDigit(char)
* @since 1.1
@@ -6631,8 +9332,9 @@
*
* @param codePoint the character (Unicode code point) to be converted.
* @return the numeric value of the character, as a nonnegative {@code int}
- * value; -2 if the character has a numeric value that is not a
- * nonnegative integer; -1 if the character has no numeric value.
+ * value; -2 if the character has a numeric value but the value
+ * cannot be represented as a nonnegative {@code int} value;
+ * -1 if the character has no numeric value.
* @see Character#forDigit(int, int)
* @see Character#isDigit(int)
* @since 1.5
@@ -7002,6 +9704,10 @@
* @see Character#DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
* @see Character#DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
* @see Character#DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
+ * @see Character#DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE
+ * @see Character#DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE
+ * @see Character#DIRECTIONALITY_FIRST_STRONG_ISOLATE
+ * @see Character#DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE
* @since 1.4
*/
public static byte getDirectionality(char ch) {
@@ -7039,6 +9745,10 @@
* @see Character#DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING
* @see Character#DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE
* @see Character#DIRECTIONALITY_POP_DIRECTIONAL_FORMAT DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
+ * @see Character#DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE
+ * @see Character#DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE
+ * @see Character#DIRECTIONALITY_FIRST_STRONG_ISOLATE DIRECTIONALITY_FIRST_STRONG_ISOLATE
+ * @see Character#DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE
* @since 1.5
*/
public static byte getDirectionality(int codePoint) {
--- a/jdk/src/java.base/share/classes/java/text/Bidi.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/java/text/Bidi.java Wed Jul 15 11:05:51 2015 +0900
@@ -185,7 +185,7 @@
AttributedString astr = new AttributedString("");
Bidi newBidi = new Bidi(astr.getIterator());
- return bidiBase.setLine(this, bidiBase, newBidi, newBidi.bidiBase,lineStart, lineLimit);
+ return bidiBase.setLine(this, bidiBase, newBidi, newBidi.bidiBase, lineStart, lineLimit);
}
/**
--- a/jdk/src/java.base/share/classes/java/text/Normalizer.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/java/text/Normalizer.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -38,7 +38,6 @@
package java.text;
import sun.text.normalizer.NormalizerBase;
-import sun.text.normalizer.NormalizerImpl;
/**
* This class provides the method <code>normalize</code> which transforms Unicode
--- a/jdk/src/java.base/share/classes/sun/net/idn/StringPrep.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/net/idn/StringPrep.java Wed Jul 15 11:05:51 2015 +0900
@@ -50,7 +50,6 @@
import sun.text.Normalizer;
import sun.text.normalizer.CharTrie;
import sun.text.normalizer.Trie;
-import sun.text.normalizer.NormalizerImpl;
import sun.text.normalizer.VersionInfo;
import sun.text.normalizer.UCharacter;
import sun.text.normalizer.UCharacterIterator;
@@ -227,7 +226,7 @@
checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
- VersionInfo normUniVer = NormalizerImpl.getUnicodeVersion();
+ VersionInfo normUniVer = UCharacter.getUnicodeVersion();
if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
@@ -354,7 +353,7 @@
Normalizer.normalize(
src.toString(),
java.text.Normalizer.Form.NFKC,
- Normalizer.UNICODE_3_2|NormalizerImpl.BEFORE_PRI_29));
+ Normalizer.UNICODE_3_2));
}
/*
boolean isLabelSeparator(int ch){
--- a/jdk/src/java.base/share/classes/sun/text/ComposedCharIter.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/ComposedCharIter.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2001, 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -43,7 +43,7 @@
private static int decompNum;
static {
- int maxNum = 2000; //TBD: Unicode 4.0 only has 1926 canoDecomp...
+ int maxNum = 2100;
chars = new int[maxNum];
decomps = new String[maxNum];
decompNum = NormalizerImpl.getDecompose(chars, decomps);
--- a/jdk/src/java.base/share/classes/sun/text/Normalizer.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/Normalizer.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -26,7 +26,7 @@
package sun.text;
import sun.text.normalizer.NormalizerBase;
-import sun.text.normalizer.NormalizerImpl;
+import sun.text.normalizer.UCharacter;
/**
* This Normalizer is for Unicode 3.2 support for IDNA only.
@@ -93,6 +93,6 @@
* @return combining class of the given character
*/
public static final int getCombiningClass(int ch) {
- return NormalizerImpl.getCombiningClass(ch);
+ return UCharacter.getCombiningClass(ch);
}
}
--- a/jdk/src/java.base/share/classes/sun/text/bidi/BidiBase.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/bidi/BidiBase.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2009, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,17 +22,13 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
+*******************************************************************************
+* Copyright (C) 2001-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+*/
/* FOOD FOR THOUGHT: currently the reordering modes are a mixture of
* algorithm for direct BiDi, algorithm for inverse Bidi and the bizarre
@@ -52,12 +48,10 @@
package sun.text.bidi;
-import java.io.IOException;
import java.lang.reflect.Array;
import java.text.AttributedCharacterIterator;
import java.text.Bidi;
import java.util.Arrays;
-import java.util.MissingResourceException;
import sun.misc.JavaAWTFontAccess;
import sun.misc.SharedSecrets;
import sun.text.normalizer.UBiDiProps;
@@ -68,10 +62,9 @@
*
* <h2>Bidi algorithm for ICU</h2>
*
- * This is an implementation of the Unicode Bidirectional algorithm. The
+ * This is an implementation of the Unicode Bidirectional Algorithm. The
* algorithm is defined in the <a
- * href="http://www.unicode.org/unicode/reports/tr9/">Unicode Standard Annex #9</a>,
- * version 13, also described in The Unicode Standard, Version 4.0 .
+ * href="http://www.unicode.org/unicode/reports/tr9/">Unicode Standard Annex #9</a>.
* <p>
*
* Note: Libraries that perform a bidirectional algorithm and reorder strings
@@ -106,6 +99,7 @@
* <li>{@link #LTR}
* <li>{@link #RTL}
* <li>{@link #MIXED}
+ * <li>{@link #NEUTRAL}
* </ul>
*
* <h3>Basic concept: levels</h3>
@@ -167,6 +161,7 @@
*
* <h3>Basic concept: Reordering Options</h3>
* Reordering options can be applied during Bidi text transformations.
+ *
* <p><b>See Also:</b>
* <ul>
* <li>{@link #setReorderingOptions}
@@ -456,19 +451,134 @@
* }</pre>
*/
+/*
+ * General implementation notes:
+ *
+ * Throughout the implementation, there are comments like (W2) that refer to
+ * rules of the BiDi algorithm, in this example to the second rule of the
+ * resolution of weak types.
+ *
+ * For handling surrogate pairs, where two UChar's form one "abstract" (or UTF-32)
+ * character according to UTF-16, the second UChar gets the directional property of
+ * the entire character assigned, while the first one gets a BN, a boundary
+ * neutral, type, which is ignored by most of the algorithm according to
+ * rule (X9) and the implementation suggestions of the BiDi algorithm.
+ *
+ * Later, adjustWSLevels() will set the level for each BN to that of the
+ * following character (UChar), which results in surrogate pairs getting the
+ * same level on each of their surrogates.
+ *
+ * In a UTF-8 implementation, the same thing could be done: the last byte of
+ * a multi-byte sequence would get the "real" property, while all previous
+ * bytes of that sequence would get BN.
+ *
+ * It is not possible to assign all those parts of a character the same real
+ * property because this would fail in the resolution of weak types with rules
+ * that look at immediately surrounding types.
+ *
+ * As a related topic, this implementation does not remove Boundary Neutral
+ * types from the input, but ignores them wherever this is relevant.
+ * For example, the loop for the resolution of the weak types reads
+ * types until it finds a non-BN.
+ * Also, explicit embedding codes are neither changed into BN nor removed.
+ * They are only treated the same way real BNs are.
+ * As stated before, adjustWSLevels() takes care of them at the end.
+ * For the purpose of conformance, the levels of all these codes
+ * do not matter.
+ *
+ * Note that this implementation modifies the dirProps
+ * after the initial setup, when applying X5c (replace FSI by LRI or RLI),
+ * X6, N0 (replace paired brackets by L or R).
+ *
+ * In this implementation, the resolution of weak types (W1 to W6),
+ * neutrals (N1 and N2), and the assignment of the resolved level (In)
+ * are all done in one single loop, in resolveImplicitLevels().
+ * Changes of dirProp values are done on the fly, without writing
+ * them back to the dirProps array.
+ *
+ *
+ * This implementation contains code that allows bypassing steps of the
+ * algorithm that are not needed on the specific paragraph
+ * in order to speed up the most common cases considerably,
+ * like text that is entirely LTR, or RTL text without numbers.
+ *
+ * Most of this is done by setting a bit for each directional property
+ * in a flags variable and later checking for whether there are
+ * any LTR characters or any RTL characters, or both, whether
+ * there are any explicit embedding codes, etc.
+ *
+ * If the (Xn) steps are performed, then the flags are re-evaluated,
+ * because they will then not contain the embedding codes any more
+ * and will be adjusted for override codes, so that subsequently
+ * more bypassing may be possible than what the initial flags suggested.
+ *
+ * If the text is not mixed-directional, then the
+ * algorithm steps for the weak type resolution are not performed,
+ * and all levels are set to the paragraph level.
+ *
+ * If there are no explicit embedding codes, then the (Xn) steps
+ * are not performed.
+ *
+ * If embedding levels are supplied as a parameter, then all
+ * explicit embedding codes are ignored, and the (Xn) steps
+ * are not performed.
+ *
+ * White Space types could get the level of the run they belong to,
+ * and are checked with a test of (flags&MASK_EMBEDDING) to
+ * decide whether the paragraph direction should be taken into
+ * account in the flags variable.
+ *
+ * If there are no White Space types in the paragraph, then
+ * (L1) is not necessary in adjustWSLevels().
+ */
+
public class BidiBase {
- class Point {
+ static class Point {
int pos; /* position in text */
int flag; /* flag for LRM/RLM, before/after */
}
- class InsertPoints {
+ static class InsertPoints {
int size;
int confirmed;
Point[] points = new Point[0];
}
+ static class Opening {
+ int position; /* position of opening bracket */
+ int match; /* matching char or -position of closing bracket */
+ int contextPos; /* position of last strong char found before opening */
+ short flags; /* bits for L or R/AL found within the pair */
+ byte contextDir; /* L or R according to last strong char before opening */
+ }
+
+ static class IsoRun {
+ int contextPos; /* position of char determining context */
+ short start; /* index of first opening entry for this run */
+ short limit; /* index after last opening entry for this run */
+ byte level; /* level of this run */
+ byte lastStrong; /* bidi class of last strong char found in this run */
+ byte lastBase; /* bidi class of last base char found in this run */
+ byte contextDir; /* L or R to use as context for following openings */
+ }
+
+ static class BracketData {
+ Opening[] openings = new Opening[SIMPLE_PARAS_COUNT];
+ int isoRunLast; /* index of last used entry */
+ /* array of nested isolated sequence entries; can never exceed MAX_EXPLICIT_LEVEL
+ + 1 for index 0, + 1 for before the first isolated sequence */
+ IsoRun[] isoRuns = new IsoRun[MAX_EXPLICIT_LEVEL+2];
+ boolean isNumbersSpecial; /*reordering mode for NUMBERS_SPECIAL */
+ }
+
+ static class Isolate {
+ int startON;
+ int start1;
+ short stateImp;
+ short state;
+ }
+
/** Paragraph level setting<p>
*
* Constant indicating that the base direction depends on the first strong
@@ -482,7 +592,7 @@
* is assumed to be visual LTR, and the text after reordering is required
* to be the corresponding logical string with appropriate contextual
* direction. The direction of the result string will be RTL if either
- * the righmost or leftmost strong character of the source text is RTL
+ * the rightmost or leftmost strong character of the source text is RTL
* or Arabic Letter, the direction will be LTR otherwise.<p>
*
* If reordering option <code>OPTION_INSERT_MARKS</code> is set, an RLM may
@@ -493,7 +603,7 @@
* @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL
* @stable ICU 3.8
*/
- public static final byte INTERNAL_LEVEL_DEFAULT_LTR = (byte)0x7e;
+ public static final byte LEVEL_DEFAULT_LTR = (byte)0x7e;
/** Paragraph level setting<p>
*
@@ -508,7 +618,7 @@
* is assumed to be visual LTR, and the text after reordering is required
* to be the corresponding logical string with appropriate contextual
* direction. The direction of the result string will be RTL if either
- * the righmost or leftmost strong character of the source text is RTL
+ * the rightmost or leftmost strong character of the source text is RTL
* or Arabic Letter, or if the text contains no strong character;
* the direction will be LTR otherwise.<p>
*
@@ -520,21 +630,21 @@
* @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL
* @stable ICU 3.8
*/
- public static final byte INTERNAL_LEVEL_DEFAULT_RTL = (byte)0x7f;
+ public static final byte LEVEL_DEFAULT_RTL = (byte)0x7f;
/**
* Maximum explicit embedding level.
* (The maximum resolved level can be up to <code>MAX_EXPLICIT_LEVEL+1</code>).
* @stable ICU 3.8
*/
- public static final byte MAX_EXPLICIT_LEVEL = 61;
+ public static final byte MAX_EXPLICIT_LEVEL = 125;
/**
* Bit flag for level input.
* Overrides directional properties.
* @stable ICU 3.8
*/
- public static final byte INTERNAL_LEVEL_OVERRIDE = (byte)0x80;
+ public static final byte LEVEL_OVERRIDE = (byte)0x80;
/**
* Special value which can be returned by the mapping methods when a
@@ -555,13 +665,53 @@
public static final int MAP_NOWHERE = -1;
/**
+ * Left-to-right text.
+ * <ul>
+ * <li>As return value for <code>getDirection()</code>, it means
+ * that the source string contains no right-to-left characters, or
+ * that the source string is empty and the paragraph level is even.
+ * <li>As return value for <code>getBaseDirection()</code>, it
+ * means that the first strong character of the source string has
+ * a left-to-right direction.
+ * </ul>
+ * @stable ICU 3.8
+ */
+ public static final byte LTR = 0;
+
+ /**
+ * Right-to-left text.
+ * <ul>
+ * <li>As return value for <code>getDirection()</code>, it means
+ * that the source string contains no left-to-right characters, or
+ * that the source string is empty and the paragraph level is odd.
+ * <li>As return value for <code>getBaseDirection()</code>, it
+ * means that the first strong character of the source string has
+ * a right-to-left direction.
+ * </ul>
+ * @stable ICU 3.8
+ */
+ public static final byte RTL = 1;
+
+ /**
* Mixed-directional text.
+ * <p>As return value for <code>getDirection()</code>, it means
+ * that the source string contains both left-to-right and
+ * right-to-left characters.
* @stable ICU 3.8
*/
public static final byte MIXED = 2;
/**
* option bit for writeReordered():
+ * keep combining characters after their base characters in RTL runs
+ *
+ * @see #writeReordered
+ * @stable ICU 3.8
+ */
+ public static final short KEEP_BASE_COMBINING = 1;
+
+ /**
+ * option bit for writeReordered():
* replace characters with the "mirrored" property in RTL runs
* by their mirror-image mappings
*
@@ -570,6 +720,50 @@
*/
public static final short DO_MIRRORING = 2;
+ /**
+ * option bit for writeReordered():
+ * surround the run with LRMs if necessary;
+ * this is part of the approximate "inverse Bidi" algorithm
+ *
+ * <p>This option does not imply corresponding adjustment of the index
+ * mappings.</p>
+ *
+ * @see #setInverse
+ * @see #writeReordered
+ * @stable ICU 3.8
+ */
+ public static final short INSERT_LRM_FOR_NUMERIC = 4;
+
+ /**
+ * option bit for writeReordered():
+ * remove Bidi control characters
+ * (this does not affect INSERT_LRM_FOR_NUMERIC)
+ *
+ * <p>This option does not imply corresponding adjustment of the index
+ * mappings.</p>
+ *
+ * @see #writeReordered
+ * @see #INSERT_LRM_FOR_NUMERIC
+ * @stable ICU 3.8
+ */
+ public static final short REMOVE_BIDI_CONTROLS = 8;
+
+ /**
+ * option bit for writeReordered():
+ * write the output in reverse order
+ *
+ * <p>This has the same effect as calling <code>writeReordered()</code>
+ * first without this option, and then calling
+ * <code>writeReverse()</code> without mirroring.
+ * Doing this in the same step is faster and avoids a temporary buffer.
+ * An example for using this option is output to a character terminal that
+ * is designed for RTL scripts and stores text in reverse order.</p>
+ *
+ * @see #writeReordered
+ * @stable ICU 3.8
+ */
+ public static final short OUTPUT_REVERSE = 16;
+
/** Reordering mode: Regular Logical to Visual Bidi algorithm according to Unicode.
* @see #setReorderingMode
* @stable ICU 3.8
@@ -600,7 +794,7 @@
* @see #setReorderingMode
* @stable ICU 3.8
*/
- private static final short REORDER_RUNS_ONLY = 3;
+ static final short REORDER_RUNS_ONLY = 3;
/** Reordering mode: Visual to Logical algorithm which handles numbers
* like L (same algorithm as selected by <code>setInverse(true)</code>.
@@ -608,21 +802,21 @@
* @see #setReorderingMode
* @stable ICU 3.8
*/
- private static final short REORDER_INVERSE_NUMBERS_AS_L = 4;
+ static final short REORDER_INVERSE_NUMBERS_AS_L = 4;
/** Reordering mode: Visual to Logical algorithm equivalent to the regular
* Logical to Visual algorithm.
* @see #setReorderingMode
* @stable ICU 3.8
*/
- private static final short REORDER_INVERSE_LIKE_DIRECT = 5;
+ static final short REORDER_INVERSE_LIKE_DIRECT = 5;
/** Reordering mode: Inverse Bidi (Visual to Logical) algorithm for the
* <code>REORDER_NUMBERS_SPECIAL</code> Bidi algorithm.
* @see #setReorderingMode
* @stable ICU 3.8
*/
- private static final short REORDER_INVERSE_FOR_NUMBERS_SPECIAL = 6;
+ static final short REORDER_INVERSE_FOR_NUMBERS_SPECIAL = 6;
/* Reordering mode values must be ordered so that all the regular logical to
* visual modes come first, and all inverse Bidi modes come last.
@@ -682,7 +876,7 @@
* @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL
* @stable ICU 3.8
*/
- private static final int OPTION_INSERT_MARKS = 1;
+ static final int OPTION_INSERT_MARKS = 1;
/**
* Option bit for <code>setReorderingOptions</code>:
@@ -704,7 +898,7 @@
* @see #REMOVE_BIDI_CONTROLS
* @stable ICU 3.8
*/
- private static final int OPTION_REMOVE_CONTROLS = 2;
+ static final int OPTION_REMOVE_CONTROLS = 2;
/**
* Option bit for <code>setReorderingOptions</code>:
@@ -741,8 +935,7 @@
* part of the text.</p>
*
* <p>When the <code>OPTION_STREAMING</code> option is used, it is
- * recommended to call <code>orderParagraphsLTR()</code> with argument
- * <code>orderParagraphsLTR</code> set to <code>true</code> before calling
+ * recommended to call <code>orderParagraphsLTR(true)</code> before calling
* <code>setPara()</code> so that later paragraphs may be concatenated to
* previous paragraphs on the right.
* </p>
@@ -750,7 +943,6 @@
* @see #setReorderingMode
* @see #setReorderingOptions
* @see #getProcessedLength
- * @see #orderParagraphsLTR
* @stable ICU 3.8
*/
private static final int OPTION_STREAMING = 4;
@@ -760,7 +952,7 @@
* is easier with the same names for the Bidi types in the code as there.
* See UCharacterDirection
*/
- private static final byte L = 0;
+ /* private */ static final byte L = 0;
private static final byte R = 1;
private static final byte EN = 2;
private static final byte ES = 3;
@@ -779,8 +971,55 @@
private static final byte PDF = 16;
private static final byte NSM = 17;
private static final byte BN = 18;
-
- private static final int MASK_R_AL = (1 << R | 1 << AL);
+ private static final byte FSI = 19;
+ private static final byte LRI = 20;
+ private static final byte RLI = 21;
+ private static final byte PDI = 22;
+ private static final byte ENL = PDI + 1; /* EN after W7 */
+ private static final byte ENR = ENL + 1; /* EN not subject to W7 */
+
+ // Number of directional types
+ private static final int CHAR_DIRECTION_COUNT = 23;
+
+ /**
+ * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
+ * Used in UAX #9: Unicode Bidirectional Algorithm
+ * (http://www.unicode.org/reports/tr9/)
+ * Returns UCharacter.BidiPairedBracketType values.
+ * @stable ICU 52
+ */
+ public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;
+
+ /**
+ * Bidi Paired Bracket Type constants.
+ *
+ * @see #BIDI_PAIRED_BRACKET_TYPE
+ * @stable ICU 52
+ */
+ public static interface BidiPairedBracketType {
+ /**
+ * Not a paired bracket.
+ * @stable ICU 52
+ */
+ public static final int NONE = 0;
+ /**
+ * Open paired bracket.
+ * @stable ICU 52
+ */
+ public static final int OPEN = 1;
+ /**
+ * Close paired bracket.
+ * @stable ICU 52
+ */
+ public static final int CLOSE = 2;
+ /**
+ * @stable ICU 52
+ */
+ public static final int COUNT = 3;
+ }
+
+ /* number of paras entries allocated initially */
+ static final int SIMPLE_PARAS_COUNT = 10;
private static final char CR = '\r';
private static final char LF = '\n';
@@ -790,12 +1029,22 @@
static final int RLM_BEFORE = 4;
static final int RLM_AFTER = 8;
+ /* flags for Opening.flags */
+ static final byte FOUND_L = (byte)DirPropFlag(L);
+ static final byte FOUND_R = (byte)DirPropFlag(R);
+
+ /*
+ * The following bit is used for the directional isolate status.
+ * Stack entries corresponding to isolate sequences are greater than ISOLATE.
+ */
+ static final int ISOLATE = 0x0100;
+
/*
* reference to parent paragraph object (reference to self if this object is
* a paragraph object); set to null in a newly opened object; set to a
* real value after a successful execution of setPara or setLine
*/
- BidiBase paraBidi;
+ BidiBase paraBidi;
final UBiDiProps bdp;
@@ -828,6 +1077,15 @@
byte[] dirProps;
byte[] levels;
+ /* are we performing an approximation of the "inverse Bidi" algorithm? */
+ boolean isInverse;
+
+ /* are we using the basic algorithm or its variation? */
+ int reorderingMode;
+
+ /* bitmask for reordering options */
+ int reorderingOptions;
+
/* must block separators receive level 0? */
boolean orderParagraphsLTR;
@@ -855,14 +1113,10 @@
/* implicitly at the paraLevel (rule (L1)) - levels may not reflect that */
int trailingWSStart;
- /* fields for paragraph handling */
- int paraCount; /* set in getDirProps() */
- int[] parasMemory = new int[1];
- int[] paras; /* limits of paragraphs, filled in
- ResolveExplicitLevels() or CheckExplicitLevels() */
-
- /* for single paragraph text, we only need a tiny array of paras (no allocation) */
- int[] simpleParas = {0};
+ /* fields for paragraph handling, set in getDirProps() */
+ int paraCount;
+ int[] paras_limit = new int[SIMPLE_PARAS_COUNT];
+ byte[] paras_level = new byte[SIMPLE_PARAS_COUNT];
/* fields for line reordering */
int runCount; /* ==-1: runs not set up yet */
@@ -872,9 +1126,18 @@
/* for non-mixed text, we only need a tiny array of runs (no allocation) */
BidiRun[] simpleRuns = {new BidiRun()};
+ /* fields for managing isolate sequences */
+ Isolate[] isolates;
+
+ /* maximum or current nesting depth of isolate sequences */
+ /* Within resolveExplicitLevels() and checkExplicitLevels(), this is the maximal
+ nesting encountered.
+ Within resolveImplicitLevels(), this is the index of the current isolates
+ stack entry. */
+ int isolateCount;
+
/* mapping of runs in logical order to visual order */
int[] logicalToVisualRunsMap;
-
/* flag to indicate that the map has been updated */
boolean isGoodLogicalToVisualRunsMap;
@@ -894,23 +1157,8 @@
return (1 << dir);
}
- /*
- * The following bit is ORed to the property of characters in paragraphs
- * with contextual RTL direction when paraLevel is contextual.
- */
- static final byte CONTEXT_RTL_SHIFT = 6;
- static final byte CONTEXT_RTL = (byte)(1<<CONTEXT_RTL_SHIFT); // 0x40
- static byte NoContextRTL(byte dir)
- {
- return (byte)(dir & ~CONTEXT_RTL);
- }
-
- /*
- * The following is a variant of DirProp.DirPropFlag() which ignores the
- * CONTEXT_RTL bit.
- */
- static int DirPropFlagNC(byte dir) {
- return (1<<(dir & ~CONTEXT_RTL));
+ boolean testDirPropFlagAt(int flag, int index) {
+ return ((DirPropFlag(dirProps[index]) & flag) != 0);
}
static final int DirPropFlagMultiRuns = DirPropFlag((byte)31);
@@ -923,40 +1171,38 @@
static final int DirPropFlagLR(byte level) { return DirPropFlagLR[level & 1]; }
static final int DirPropFlagE(byte level) { return DirPropFlagE[level & 1]; }
static final int DirPropFlagO(byte level) { return DirPropFlagO[level & 1]; }
-
- /*
- * are there any characters that are LTR?
- */
+ static final byte DirFromStrong(byte strong) { return strong == L ? L : R; }
+ static final byte NoOverride(byte level) { return (byte)(level & ~LEVEL_OVERRIDE); }
+
+ /* are there any characters that are LTR or RTL? */
static final int MASK_LTR =
- DirPropFlag(L)|DirPropFlag(EN)|DirPropFlag(AN)|DirPropFlag(LRE)|DirPropFlag(LRO);
-
- /*
- * are there any characters that are RTL?
- */
- static final int MASK_RTL = DirPropFlag(R)|DirPropFlag(AL)|DirPropFlag(RLE)|DirPropFlag(RLO);
+ DirPropFlag(L)|DirPropFlag(EN)|DirPropFlag(ENL)|DirPropFlag(ENR)|DirPropFlag(AN)|DirPropFlag(LRE)|DirPropFlag(LRO)|DirPropFlag(LRI);
+ static final int MASK_RTL = DirPropFlag(R)|DirPropFlag(AL)|DirPropFlag(RLE)|DirPropFlag(RLO)|DirPropFlag(RLI);
+
+ static final int MASK_R_AL = DirPropFlag(R)|DirPropFlag(AL);
/* explicit embedding codes */
- private static final int MASK_LRX = DirPropFlag(LRE)|DirPropFlag(LRO);
- private static final int MASK_RLX = DirPropFlag(RLE)|DirPropFlag(RLO);
- private static final int MASK_EXPLICIT = MASK_LRX|MASK_RLX|DirPropFlag(PDF);
+ private static final int MASK_EXPLICIT = DirPropFlag(LRE)|DirPropFlag(LRO)|DirPropFlag(RLE)|DirPropFlag(RLO)|DirPropFlag(PDF);
private static final int MASK_BN_EXPLICIT = DirPropFlag(BN)|MASK_EXPLICIT;
+ /* explicit isolate codes */
+ private static final int MASK_ISO = DirPropFlag(LRI)|DirPropFlag(RLI)|DirPropFlag(FSI)|DirPropFlag(PDI);
+
/* paragraph and segment separators */
private static final int MASK_B_S = DirPropFlag(B)|DirPropFlag(S);
/* all types that are counted as White Space or Neutral in some steps */
- static final int MASK_WS = MASK_B_S|DirPropFlag(WS)|MASK_BN_EXPLICIT;
- private static final int MASK_N = DirPropFlag(ON)|MASK_WS;
+ static final int MASK_WS = MASK_B_S|DirPropFlag(WS)|MASK_BN_EXPLICIT|MASK_ISO;
/* types that are neutrals or could becomes neutrals in (Wn) */
- private static final int MASK_POSSIBLE_N = DirPropFlag(CS)|DirPropFlag(ES)|DirPropFlag(ET)|MASK_N;
+ private static final int MASK_POSSIBLE_N = DirPropFlag(ON)|DirPropFlag(CS)|DirPropFlag(ES)|DirPropFlag(ET)|MASK_WS;
/*
* These types may be changed to "e",
* the embedding type (L or R) of the run,
* in the Bidi algorithm (N2)
*/
- static final int MASK_EMBEDDING = DirPropFlag(NSM)|MASK_POSSIBLE_N;
+ private static final int MASK_EMBEDDING = DirPropFlag(NSM)|MASK_POSSIBLE_N;
/*
* the dirProp's L and R are defined to 0 and 1 values in UCharacterDirection.java
@@ -968,30 +1214,25 @@
private static boolean IsDefaultLevel(byte level)
{
- return ((level & INTERNAL_LEVEL_DEFAULT_LTR) == INTERNAL_LEVEL_DEFAULT_LTR);
- }
-
- byte GetParaLevelAt(int index)
- {
- return (defaultParaLevel != 0) ?
- (byte)(dirProps[index]>>CONTEXT_RTL_SHIFT) : paraLevel;
+ return ((level & LEVEL_DEFAULT_LTR) == LEVEL_DEFAULT_LTR);
}
static boolean IsBidiControlChar(int c)
{
/* check for range 0x200c to 0x200f (ZWNJ, ZWJ, LRM, RLM) or
0x202a to 0x202e (LRE, RLE, PDF, LRO, RLO) */
- return (((c & 0xfffffffc) == 0x200c) || ((c >= 0x202a) && (c <= 0x202e)));
+ return (((c & 0xfffffffc) == 0x200c) || ((c >= 0x202a) && (c <= 0x202e))
+ || ((c >= 0x2066) && (c <= 0x2069)));
}
- public void verifyValidPara()
+ void verifyValidPara()
{
- if (this != this.paraBidi) {
- throw new IllegalStateException("");
+ if (!(this == this.paraBidi)) {
+ throw new IllegalStateException();
}
}
- public void verifyValidParaOrLine()
+ void verifyValidParaOrLine()
{
BidiBase para = this.paraBidi;
/* verify Para */
@@ -1004,7 +1245,7 @@
}
}
- public void verifyRange(int index, int start, int limit)
+ void verifyRange(int index, int start, int limit)
{
if (index < start || index >= limit) {
throw new IllegalArgumentException("Value " + index +
@@ -1012,14 +1253,6 @@
}
}
- public void verifyIndex(int index, int start, int limit)
- {
- if (index < start || index >= limit) {
- throw new ArrayIndexOutOfBoundsException("Index " + index +
- " is out of range " + start + " to " + limit);
- }
- }
-
/**
* Allocate a <code>Bidi</code> object with preallocated memory
* for internal structures.
@@ -1051,7 +1284,7 @@
* @stable ICU 3.8
*/
public BidiBase(int maxLength, int maxRunCount)
- {
+ {
/* check the argument values */
if (maxLength < 0 || maxRunCount < 0) {
throw new IllegalArgumentException();
@@ -1075,12 +1308,7 @@
direction = 0;
*/
/* get Bidi properties */
- try {
- bdp = UBiDiProps.getSingleton();
- }
- catch (IOException e) {
- throw new MissingResourceException(e.getMessage(), "(BidiProps)", "");
- }
+ bdp = UBiDiProps.INSTANCE;
/* allocate memory for arrays as requested */
if (maxLength > 0) {
@@ -1180,18 +1408,68 @@
getLevelsMemory(true, len);
}
- private void getInitialParasMemory(int len)
- {
- Object array = getMemory("Paras", parasMemory, Integer.TYPE, true, len);
- parasMemory = (int[]) array;
- }
-
private void getInitialRunsMemory(int len)
{
getRunsMemory(true, len);
}
-/* perform (P2)..(P3) ------------------------------------------------------- */
+ /**
+ * Is this <code>Bidi</code> object set to perform the inverse Bidi
+ * algorithm?
+ * <p>Note: calling this method after setting the reordering mode with
+ * <code>setReorderingMode</code> will return <code>true</code> if the
+ * reordering mode was set to
+ * <code>REORDER_INVERSE_NUMBERS_AS_L</code>, <code>false</code>
+ * for all other values.</p>
+ *
+ * @return <code>true</code> if the <code>Bidi</code> object is set to
+ * perform the inverse Bidi algorithm by handling numbers as L.
+ *
+ * @see #setInverse
+ * @see #setReorderingMode
+ * @see #REORDER_INVERSE_NUMBERS_AS_L
+ * @stable ICU 3.8
+ */
+ public boolean isInverse() {
+ return isInverse; /* plain read of the isInverse field; nothing is recomputed here */
+ }
+
+ /* perform (P2)..(P3) ------------------------------------------------------- */
+
+ /*
+ * Check that there are enough entries in the arrays paras_limit and paras_level to hold paraCount paragraphs; if not, both are reallocated at twice paraCount and the old entries copied over.
+ */
+ private void checkParaCount() {
+ int[] saveLimits;
+ byte[] saveLevels;
+ int count = paraCount;
+ if (count <= paras_level.length) /* already large enough: nothing to do */
+ return;
+ int oldLength = paras_level.length; /* also used below as the copy length for paras_limit */
+ saveLimits = paras_limit;
+ saveLevels = paras_level;
+ try {
+ paras_limit = new int[count * 2];
+ paras_level = new byte[count * 2];
+ } catch (Exception e) { /* only Exceptions land here (e.g. NegativeArraySizeException if count * 2 overflows); an OutOfMemoryError from the allocation is an Error and is not caught by this clause */
+ throw new OutOfMemoryError("Failed to allocate memory for paras");
+ }
+ System.arraycopy(saveLimits, 0, paras_limit, 0, oldLength); /* NOTE(review): assumes paras_limit held at least oldLength entries, i.e. both arrays always have the same length - confirm */
+ System.arraycopy(saveLevels, 0, paras_level, 0, oldLength);
+ }
+
+ /*
+ * Get the directional properties for the text, calculate the flags bit-set, and
+ * determine the paragraph level if necessary (in paras_level[i]).
+ * FSI initiators are also resolved and their dirProp replaced with LRI or RLI.
+ * When encountering an FSI, it is initially replaced with an LRI, which is the
+ * default. Only if a strong R or AL is found within its scope will the LRI be
+ * replaced by an RLI. The four constants below are the values of the scan state used by getDirProps().
+ */
+ static final int NOT_SEEKING_STRONG = 0; /* 0: not contextual paraLevel, not after FSI */
+ static final int SEEKING_STRONG_FOR_PARA = 1; /* 1: looking for first strong char in para */
+ static final int SEEKING_STRONG_FOR_FSI = 2; /* 2: looking for first strong after FSI */
+ static final int LOOKING_FOR_PDI = 3; /* 3: found strong after FSI, looking for PDI; also entered directly on LRI or RLI */
private void getDirProps()
{
@@ -1199,32 +1477,44 @@
flags = 0; /* collect all directionalities in the text */
int uchar;
byte dirProp;
- byte paraDirDefault = 0; /* initialize to avoid compiler warnings */
+ byte defaultParaLevel = 0; /* initialize to avoid compiler warnings */
boolean isDefaultLevel = IsDefaultLevel(paraLevel);
/* for inverse Bidi, the default para level is set to RTL if there is a
strong R or AL character at either end of the text */
+ boolean isDefaultLevelInverse=isDefaultLevel &&
+ (reorderingMode == REORDER_INVERSE_LIKE_DIRECT ||
+ reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL);
lastArabicPos = -1;
- controlCount = 0;
-
- final int NOT_CONTEXTUAL = 0; /* 0: not contextual paraLevel */
- final int LOOKING_FOR_STRONG = 1; /* 1: looking for first strong char */
- final int FOUND_STRONG_CHAR = 2; /* 2: found first strong char */
-
- int state;
- int paraStart = 0; /* index of first char in paragraph */
- byte paraDir; /* == CONTEXT_RTL within paragraphs
- starting with strong R char */
- byte lastStrongDir=0; /* for default level & inverse Bidi */
- int lastStrongLTR=0; /* for STREAMING option */
+ int controlCount = 0;
+ boolean removeBidiControls = (reorderingOptions & OPTION_REMOVE_CONTROLS) != 0;
+
+ byte state;
+ byte lastStrong = ON; /* for default level & inverse Bidi */
+ /* The following stacks are used to manage isolate sequences. Those
+ sequences may be nested, but obviously never more deeply than the
+ maximum explicit embedding level.
+ lastStack is the index of the last used entry in the stack. A value of -1
+ means that there is no open isolate sequence.
+ lastStack is reset to -1 on paragraph boundaries. */
+ /* The following stack contains the position of the initiator of
+ each open isolate sequence */
+ int[] isolateStartStack= new int[MAX_EXPLICIT_LEVEL+1];
+ /* The following stack contains the last known state before
+ encountering the initiator of an isolate sequence */
+ byte[] previousStateStack = new byte[MAX_EXPLICIT_LEVEL+1];
+ int stackLast=-1;
+
+ if ((reorderingOptions & OPTION_STREAMING) != 0)
+ length = 0;
+ defaultParaLevel = (byte)(paraLevel & 1);
if (isDefaultLevel) {
- paraDirDefault = ((paraLevel & 1) != 0) ? CONTEXT_RTL : 0;
- paraDir = paraDirDefault;
- lastStrongDir = paraDirDefault;
- state = LOOKING_FOR_STRONG;
+ paras_level[0] = defaultParaLevel;
+ lastStrong = defaultParaLevel;
+ state = SEEKING_STRONG_FOR_PARA;
} else {
- state = NOT_CONTEXTUAL;
- paraDir = 0;
+ paras_level[0] = paraLevel;
+ state = NOT_SEEKING_STRONG;
}
/* count paragraphs and determine the paragraph level (P2..P3) */
/*
@@ -1236,90 +1526,509 @@
for (i = 0; i < originalLength; /* i is incremented in the loop */) {
i0 = i; /* index of first code unit */
uchar = UTF16.charAt(text, 0, originalLength, i);
- i += Character.charCount(uchar);
+ i += UTF16.getCharCount(uchar);
i1 = i - 1; /* index of last code unit, gets the directional property */
- dirProp = (byte)bdp.getClass(uchar);
-
+ dirProp = (byte)getCustomizedClass(uchar);
flags |= DirPropFlag(dirProp);
- dirProps[i1] = (byte)(dirProp | paraDir);
+ dirProps[i1] = dirProp;
if (i1 > i0) { /* set previous code units' properties to BN */
flags |= DirPropFlag(BN);
do {
- dirProps[--i1] = (byte)(BN | paraDir);
+ dirProps[--i1] = BN;
} while (i1 > i0);
}
- if (state == LOOKING_FOR_STRONG) {
- if (dirProp == L) {
- state = FOUND_STRONG_CHAR;
- if (paraDir != 0) {
- paraDir = 0;
- for (i1 = paraStart; i1 < i; i1++) {
- dirProps[i1] &= ~CONTEXT_RTL;
- }
- }
- continue;
- }
- if (dirProp == R || dirProp == AL) {
- state = FOUND_STRONG_CHAR;
- if (paraDir == 0) {
- paraDir = CONTEXT_RTL;
- for (i1 = paraStart; i1 < i; i1++) {
- dirProps[i1] |= CONTEXT_RTL;
- }
- }
- continue;
- }
+ if (removeBidiControls && IsBidiControlChar(uchar)) {
+ controlCount++;
}
if (dirProp == L) {
- lastStrongDir = 0;
- lastStrongLTR = i; /* i is index to next character */
- }
- else if (dirProp == R) {
- lastStrongDir = CONTEXT_RTL;
- }
- else if (dirProp == AL) {
- lastStrongDir = CONTEXT_RTL;
- lastArabicPos = i-1;
+ if (state == SEEKING_STRONG_FOR_PARA) {
+ paras_level[paraCount - 1] = 0;
+ state = NOT_SEEKING_STRONG;
+ }
+ else if (state == SEEKING_STRONG_FOR_FSI) {
+ if (stackLast <= MAX_EXPLICIT_LEVEL) {
+ /* no need for next statement, already set by default */
+ /* dirProps[isolateStartStack[stackLast]] = LRI; */
+ flags |= DirPropFlag(LRI);
+ }
+ state = LOOKING_FOR_PDI;
+ }
+ lastStrong = L;
+ continue;
}
- else if (dirProp == B) {
- if (i < originalLength) { /* B not last char in text */
- if (!((uchar == (int)CR) && (text[i] == (int)LF))) {
- paraCount++;
+ if (dirProp == R || dirProp == AL) {
+ if (state == SEEKING_STRONG_FOR_PARA) {
+ paras_level[paraCount - 1] = 1;
+ state = NOT_SEEKING_STRONG;
+ }
+ else if (state == SEEKING_STRONG_FOR_FSI) {
+ if (stackLast <= MAX_EXPLICIT_LEVEL) {
+ dirProps[isolateStartStack[stackLast]] = RLI;
+ flags |= DirPropFlag(RLI);
}
- if (isDefaultLevel) {
- state=LOOKING_FOR_STRONG;
- paraStart = i; /* i is index to next character */
- paraDir = paraDirDefault;
- lastStrongDir = paraDirDefault;
+ state = LOOKING_FOR_PDI;
+ }
+ lastStrong = R;
+ if (dirProp == AL)
+ lastArabicPos = i - 1;
+ continue;
+ }
+ if (dirProp >= FSI && dirProp <= RLI) { /* FSI, LRI or RLI */
+ stackLast++;
+ if (stackLast <= MAX_EXPLICIT_LEVEL) {
+ isolateStartStack[stackLast] = i - 1;
+ previousStateStack[stackLast] = state;
+ }
+ if (dirProp == FSI) {
+ dirProps[i-1] = LRI; /* default if no strong char */
+ state = SEEKING_STRONG_FOR_FSI;
+ }
+ else
+ state = LOOKING_FOR_PDI;
+ continue;
+ }
+ if (dirProp == PDI) {
+ if (state == SEEKING_STRONG_FOR_FSI) {
+ if (stackLast <= MAX_EXPLICIT_LEVEL) {
+ /* no need for next statement, already set by default */
+ /* dirProps[isolateStartStack[stackLast]] = LRI; */
+ flags |= DirPropFlag(LRI);
}
}
+ if (stackLast >= 0) {
+ if (stackLast <= MAX_EXPLICIT_LEVEL)
+ state = previousStateStack[stackLast];
+ stackLast--;
+ }
+ continue;
}
+ if (dirProp == B) {
+ if (i < originalLength && uchar == CR && text[i] == LF) /* do nothing on the CR */
+ continue;
+ paras_limit[paraCount - 1] = i;
+ if (isDefaultLevelInverse && lastStrong == R)
+ paras_level[paraCount - 1] = 1;
+ if ((reorderingOptions & OPTION_STREAMING) != 0) {
+ /* When streaming, we only process whole paragraphs
+ thus some updates are only done on paragraph boundaries */
+ length = i; /* i is index to next character */
+ this.controlCount = controlCount;
+ }
+ if (i < originalLength) { /* B not last char in text */
+ paraCount++;
+ checkParaCount(); /* check that there is enough memory for a new para entry */
+ if (isDefaultLevel) {
+ paras_level[paraCount - 1] = defaultParaLevel;
+ state = SEEKING_STRONG_FOR_PARA;
+ lastStrong = defaultParaLevel;
+ } else {
+ paras_level[paraCount - 1] = paraLevel;
+ state = NOT_SEEKING_STRONG;
+ }
+ stackLast = -1;
+ }
+ continue;
+ }
+ }
+ /* Ignore still open isolate sequences with overflow */
+ if (stackLast > MAX_EXPLICIT_LEVEL) {
+ stackLast = MAX_EXPLICIT_LEVEL;
+ state=SEEKING_STRONG_FOR_FSI; /* to be on the safe side */
+ }
+ /* Resolve direction of still unresolved open FSI sequences */
+ while (stackLast >= 0) {
+ if (state == SEEKING_STRONG_FOR_FSI) {
+ /* no need for next statement, already set by default */
+ /* dirProps[isolateStartStack[stackLast]] = LRI; */
+ flags |= DirPropFlag(LRI);
+ break;
+ }
+ state = previousStateStack[stackLast];
+ stackLast--;
+ }
+ /* When streaming, ignore text after the last paragraph separator */
+ if ((reorderingOptions & OPTION_STREAMING) != 0) {
+ if (length < originalLength)
+ paraCount--;
+ } else {
+ paras_limit[paraCount - 1] = originalLength;
+ this.controlCount = controlCount;
+ }
+ /* For inverse bidi, default para direction is RTL if there is
+ a strong R or AL at either end of the paragraph */
+ if (isDefaultLevelInverse && lastStrong == R) {
+ paras_level[paraCount - 1] = 1;
}
if (isDefaultLevel) {
- paraLevel = GetParaLevelAt(0);
+ paraLevel = paras_level[0];
}
-
- /* The following line does nothing new for contextual paraLevel, but is
- needed for absolute paraLevel. */
- flags |= DirPropFlagLR(paraLevel);
+ /* The following is needed to resolve the text direction for default level
+ paragraphs containing no strong character */
+ for (i = 0; i < paraCount; i++)
+ flags |= DirPropFlagLR(paras_level[i]);
if (orderParagraphsLTR && (flags & DirPropFlag(B)) != 0) {
flags |= DirPropFlag(L);
}
}
+ /* determine the paragraph level at position index: paraLevel when defaultParaLevel is 0 or pindex lies before paras_limit[0], otherwise paras_level[] of the first paragraph whose limit is beyond pindex (the last paragraph if there is none) */
+ byte GetParaLevelAt(int pindex)
+ {
+ if (defaultParaLevel == 0 || pindex < paras_limit[0]) /* fast path: no per-paragraph lookup needed */
+ return paraLevel;
+ int i;
+ for (i = 1; i < paraCount; i++) /* paragraph 0 was ruled out above, so start at 1 */
+ if (pindex < paras_limit[i])
+ break;
+ if (i >= paraCount) /* pindex is at or past the last limit: clamp to the last paragraph */
+ i = paraCount - 1;
+ return paras_level[i];
+ }
+
+ /* Functions for handling paired brackets ----------------------------------- */
+
+ /* In the isoRuns array, the first entry is used for text outside of any
+ isolate sequence. Higher entries are used for each more deeply nested
+ isolate sequence. isoRunLast is the index of the last used entry. The
+ openings array is used to note the data of opening brackets not yet
+ matched by a closing bracket, or matched but still susceptible to change
+ level.
+ Each isoRun entry contains the index of the first and
+ one-after-last openings entries for pending opening brackets it
+ contains. The next openings entry to use is the one-after-last of the
+ most deeply nested isoRun entry.
+ isoRun entries also contain their current embedding level and the last
+ encountered strong character, since these will be needed to resolve
+ the level of paired brackets. */
+
+ private void bracketInit(BracketData bd) { /* reset bd: isolate run 0 seeded from the paragraph level at index 0, a fresh openings array, and the numbers-special flag */
+ bd.isoRunLast = 0;
+ bd.isoRuns[0] = new IsoRun(); /* entry 0 covers text outside of any isolate sequence */
+ bd.isoRuns[0].start = 0;
+ bd.isoRuns[0].limit = 0; /* start == limit: no pending opening brackets yet */
+ bd.isoRuns[0].level = GetParaLevelAt(0);
+ bd.isoRuns[0].lastStrong = bd.isoRuns[0].lastBase = bd.isoRuns[0].contextDir = (byte)(GetParaLevelAt(0) & 1); /* low bit of the level, used as the initial direction */
+ bd.isoRuns[0].contextPos = 0;
+ bd.openings = new Opening[SIMPLE_PARAS_COUNT]; /* entries stay null until bracketAddOpening fills them; NOTE(review): the initial size reuses SIMPLE_PARAS_COUNT - presumably just a convenient small constant, confirm */
+ bd.isNumbersSpecial = reorderingMode == REORDER_NUMBERS_SPECIAL ||
+ reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL;
+ }
+
+ /* paragraph boundary: fall back to isolate run 0, reset its limit to 0 and re-seed its level and directions from the given level */
+ private void bracketProcessB(BracketData bd, byte level) {
+ bd.isoRunLast = 0; /* abandon any isolate runs still open */
+ bd.isoRuns[0].limit = 0; /* start is not touched here; NOTE(review): presumably still 0 from bracketInit - confirm */
+ bd.isoRuns[0].level = level;
+ bd.isoRuns[0].lastStrong = bd.isoRuns[0].lastBase = bd.isoRuns[0].contextDir = (byte)(level & 1); /* low bit of the level, used as the direction */
+ bd.isoRuns[0].contextPos = 0;
+ }
+
+ /* LRE, LRO, RLE, RLO, PDF: restart the current isolate run at embeddingLevel, with lastCcPos as the new context position; does nothing when dirProps[lastCcPos] is an isolate code */
+ private void bracketProcessBoundary(BracketData bd, int lastCcPos,
+ byte contextLevel, byte embeddingLevel) {
+ IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+ if ((DirPropFlag(dirProps[lastCcPos]) & MASK_ISO) != 0) /* after an isolate */
+ return;
+ if (NoOverride(embeddingLevel) > NoOverride(contextLevel)) /* not a PDF */
+ contextLevel = embeddingLevel; /* only the local copy changes; it supplies the direction bit below */
+ pLastIsoRun.limit = pLastIsoRun.start; /* limit == start: no pending openings remain for this run */
+ pLastIsoRun.level = embeddingLevel;
+ pLastIsoRun.lastStrong = pLastIsoRun.lastBase = pLastIsoRun.contextDir = (byte)(contextLevel & 1);
+ pLastIsoRun.contextPos = lastCcPos;
+ }
+
+ /* LRI or RLI: push a new isolate run at the given level; its openings range starts empty at the enclosing run's limit */
+ private void bracketProcessLRI_RLI(BracketData bd, byte level) {
+ IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+ short lastLimit;
+ pLastIsoRun.lastBase = ON; /* set on the enclosing run before it is left */
+ lastLimit = pLastIsoRun.limit;
+ bd.isoRunLast++; /* NOTE(review): no bounds check against bd.isoRuns.length here - presumably the caller limits isolate nesting, confirm */
+ pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+ if (pLastIsoRun == null) /* entries are created on first use and reused afterwards */
+ pLastIsoRun = bd.isoRuns[bd.isoRunLast] = new IsoRun();
+ pLastIsoRun.start = pLastIsoRun.limit = lastLimit; /* new openings go right after those of the enclosing run */
+ pLastIsoRun.level = level;
+ pLastIsoRun.lastStrong = pLastIsoRun.lastBase = pLastIsoRun.contextDir = (byte)(level & 1);
+ pLastIsoRun.contextPos = 0;
+ }
+
+ /* PDI: pop back to the enclosing isolate run and mark its last base character as ON */
+ private void bracketProcessPDI(BracketData bd) {
+ IsoRun pLastIsoRun;
+ bd.isoRunLast--; /* NOTE(review): no underflow check here - presumably only called for a PDI that closes an open isolate, confirm */
+ pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+ pLastIsoRun.lastBase = ON;
+ }
+
+ /* newly found opening bracket: create an openings entry at the current run's limit recording the expected closing char (match), the bracket's text position and the run's current context, then advance the limit */
+ private void bracketAddOpening(BracketData bd, char match, int position) {
+ IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+ Opening pOpening;
+ if (pLastIsoRun.limit >= bd.openings.length) { /* no available new entry */
+ Opening[] saveOpenings = bd.openings;
+ int count;
+ try {
+ count = bd.openings.length;
+ bd.openings = new Opening[count * 2]; /* grow by doubling */
+ } catch (Exception e) { /* only Exceptions land here (e.g. NegativeArraySizeException on overflow); an OutOfMemoryError from the allocation is an Error and is not caught by this clause */
+ throw new OutOfMemoryError("Failed to allocate memory for openings");
+ }
+ System.arraycopy(saveOpenings, 0, bd.openings, 0, count); /* existing Opening objects are carried over by reference */
+ }
+ pOpening = bd.openings[pLastIsoRun.limit];
+ if (pOpening == null) /* slots are allocated on first use and recycled afterwards */
+ pOpening = bd.openings[pLastIsoRun.limit]= new Opening();
+ pOpening.position = position;
+ pOpening.match = match;
+ pOpening.contextDir = pLastIsoRun.contextDir;
+ pOpening.contextPos = pLastIsoRun.contextPos;
+ pOpening.flags = 0; /* no strong direction seen inside this bracket yet */
+ pLastIsoRun.limit++;
+ }
+
+ /* change N0c1 to N0c2 when a preceding bracket is assigned the embedding level: scans the openings after openingIndex and re-types affected unstable pairs in dirProps to newProp */
+ private void fixN0c(BracketData bd, int openingIndex, int newPropPosition, byte newProp) {
+ /* This function calls itself recursively */
+ IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+ Opening qOpening;
+ int k, openingPosition, closingPosition;
+ for (k = openingIndex+1; k < pLastIsoRun.limit; k++) { /* only entries recorded after openingIndex in the current isolate run */
+ qOpening = bd.openings[k];
+ if (qOpening.match >= 0) /* not an N0c match */
+ continue;
+ if (newPropPosition < qOpening.contextPos) /* the changed position precedes this entry's recorded context: stop scanning */
+ break;
+ if (newPropPosition >= qOpening.position) /* the changed position is not before this opening bracket: skip it */
+ continue;
+ if (newProp == qOpening.contextDir) /* context direction is unchanged: stop scanning */
+ break;
+ openingPosition = qOpening.position;
+ dirProps[openingPosition] = newProp;
+ closingPosition = -(qOpening.match); /* unstable pairs keep the negated closing position in match (set in bracketProcessClosing) */
+ dirProps[closingPosition] = newProp;
+ qOpening.match = 0; /* prevent further changes */
+ fixN0c(bd, k, openingPosition, newProp); /* repeat the fix-up with each re-typed bracket as the changed position */
+ fixN0c(bd, k, closingPosition, newProp);
+ }
+ }
+
+ /* process closing bracket at text index position matching openings[openIdx]; return L or R if N0b or N0c (both brackets' dirProps are set to it), ON if N0d (dirProps left untouched) */
+ private byte bracketProcessClosing(BracketData bd, int openIdx, int position) {
+ IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+ Opening pOpening, qOpening;
+ byte direction;
+ boolean stable;
+ byte newProp;
+ pOpening = bd.openings[openIdx];
+ direction = (byte)(pLastIsoRun.level & 1); /* low bit of the run's level: 0 or 1, compared below against FOUND_L / FOUND_R and also used as the new dirProp */
+ stable = true; /* assume stable until proved otherwise */
+
+ /* The stable flag is set when brackets are paired and their
+ level is resolved and cannot be changed by what will be
+ found later in the source string.
+ An unstable match can occur only when applying N0c, where
+ the resolved level depends on the preceding context, and
+ this context may be affected by text occurring later.
+ Example: RTL paragraph containing: abc[(latin) HEBREW]
+ When the closing parenthesis is encountered, it appears
+ that N0c1 must be applied since 'abc' sets an opposite
+ direction context and both parentheses receive level 2.
+ However, when the closing square bracket is processed,
+ N0b applies because of 'HEBREW' being included within the
+ brackets, thus the square brackets are treated like R and
+ receive level 1. However, this changes the preceding
+ context of the opening parenthesis, and it now appears
+ that N0c2 must be applied to the parentheses rather than
+ N0c1. */
+
+ if ((direction == 0 && (pOpening.flags & FOUND_L) > 0) ||
+ (direction == 1 && (pOpening.flags & FOUND_R) > 0)) { /* N0b */
+ newProp = direction;
+ }
+ else if ((pOpening.flags & (FOUND_L | FOUND_R)) != 0) { /* N0c */
+ /* it is stable if there is no preceding text or in
+ conditions too complicated and not worth checking */
+ stable = (openIdx == pLastIsoRun.start); /* i.e. this is the first pending opening of the run */
+ if (direction != pOpening.contextDir)
+ newProp = pOpening.contextDir; /* N0c1 */
+ else
+ newProp = direction; /* N0c2 */
+ } else {
+ /* forget this and any brackets nested within this pair */
+ pLastIsoRun.limit = (short)openIdx;
+ return ON; /* N0d */
+ }
+ dirProps[pOpening.position] = newProp; /* both brackets of the pair take the resolved type */
+ dirProps[position] = newProp;
+ /* Update nested N0c pairs that may be affected */
+ fixN0c(bd, openIdx, pOpening.position, newProp);
+ if (stable) {
+ pLastIsoRun.limit = (short)openIdx; /* forget any brackets nested within this pair */
+ /* remove lower located synonyms if any */
+ while (pLastIsoRun.limit > pLastIsoRun.start &&
+ bd.openings[pLastIsoRun.limit - 1].position == pOpening.position) /* synonym entries share the opening's text position */
+ pLastIsoRun.limit--;
+ } else {
+ int k;
+ pOpening.match = -position; /* a negative match marks an unstable pair and keeps the closing position for fixN0c */
+ /* neutralize lower located synonyms if any */
+ k = openIdx - 1;
+ while (k >= pLastIsoRun.start &&
+ bd.openings[k].position == pOpening.position)
+ bd.openings[k--].match = 0; /* match 0: entry can no longer be matched or changed */
+ /* neutralize any unmatched opening between the current pair;
+ this will also neutralize higher located synonyms if any */
+ for (k = openIdx + 1; k < pLastIsoRun.limit; k++) {
+ qOpening =bd.openings[k];
+ if (qOpening.position >= position) /* at or beyond the closing bracket: outside the pair */
+ break;
+ if (qOpening.match > 0) /* still unmatched; negative (unstable) entries are left alone */
+ qOpening.match = 0;
+ }
+ }
+ return newProp;
+ }
+
+ /* handle strong characters, digits and candidates for closing brackets: examines the character at text index position, updates the current isolate run's context, and may rewrite dirProps[position] and clear LEVEL_OVERRIDE bits in levels[] */
+ private void bracketProcessChar(BracketData bd, int position) {
+ IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+ byte dirProp, newProp;
+ byte level;
+ dirProp = dirProps[position];
+ if (dirProp == ON) {
+ char c, match;
+ int idx;
+ /* First see if it is a matching closing bracket. Hopefully, this is
+ more efficient than checking if it is a closing bracket at all */
+ c = text[position];
+ for (idx = pLastIsoRun.limit - 1; idx >= pLastIsoRun.start; idx--) { /* most recently added opening first */
+ if (bd.openings[idx].match != c)
+ continue;
+ /* We have a match */
+ newProp = bracketProcessClosing(bd, idx, position);
+ if(newProp == ON) { /* N0d */
+ c = 0; /* prevent handling as an opening */
+ break;
+ }
+ pLastIsoRun.lastBase = ON;
+ pLastIsoRun.contextDir = newProp; /* the resolved pair becomes the context for what follows */
+ pLastIsoRun.contextPos = position;
+ level = levels[position];
+ if ((level & LEVEL_OVERRIDE) != 0) { /* X4, X5 */
+ short flag;
+ int i;
+ newProp = (byte)(level & 1);
+ pLastIsoRun.lastStrong = newProp;
+ flag = (short)DirPropFlag(newProp);
+ for (i = pLastIsoRun.start; i < idx; i++) /* openings recorded before the matched one */
+ bd.openings[i].flags |= flag;
+ /* matching brackets are not overridden by LRO/RLO */
+ levels[position] &= ~LEVEL_OVERRIDE;
+ }
+ /* matching brackets are not overridden by LRO/RLO */
+ levels[bd.openings[idx].position] &= ~LEVEL_OVERRIDE;
+ return;
+ }
+ /* We get here only if the ON character is not a matching closing
+ bracket or it is a case of N0d */
+ /* Now see if it is an opening bracket */
+ if (c != 0) {
+ match = (char)UCharacter.getBidiPairedBracket(c); /* get the matching char */
+ } else {
+ match = 0; /* c was zeroed for N0d, so match == c and the opening test below is skipped */
+ }
+ if (match != c && /* has a matching char */
+ UCharacter.getIntPropertyValue(c, BIDI_PAIRED_BRACKET_TYPE) ==
+ /* opening bracket */ BidiPairedBracketType.OPEN) {
+ /* special case: process synonyms
+ create an opening entry for each synonym */
+ if (match == 0x232A) { /* RIGHT-POINTING ANGLE BRACKET */
+ bracketAddOpening(bd, (char)0x3009, position);
+ }
+ else if (match == 0x3009) { /* RIGHT ANGLE BRACKET */
+ bracketAddOpening(bd, (char)0x232A, position);
+ }
+ bracketAddOpening(bd, match, position); /* the synonym entry, if any, was added first and so sits at the lower index */
+ }
+ }
+ level = levels[position];
+ if ((level & LEVEL_OVERRIDE) != 0) { /* X4, X5 */
+ newProp = (byte)(level & 1);
+ if (dirProp != S && dirProp != WS && dirProp != ON) /* S, WS and ON keep their own type under an override */
+ dirProps[position] = newProp;
+ pLastIsoRun.lastBase = newProp;
+ pLastIsoRun.lastStrong = newProp;
+ pLastIsoRun.contextDir = newProp;
+ pLastIsoRun.contextPos = position;
+ }
+ else if (dirProp <= R || dirProp == AL) { /* strong type; NOTE(review): relies on L and R being the two lowest dirProp values - confirm */
+ newProp = DirFromStrong(dirProp);
+ pLastIsoRun.lastBase = dirProp;
+ pLastIsoRun.lastStrong = dirProp; /* keeps AL distinct from R; the EN branch below tests for AL */
+ pLastIsoRun.contextDir = newProp;
+ pLastIsoRun.contextPos = position;
+ }
+ else if(dirProp == EN) {
+ pLastIsoRun.lastBase = EN;
+ if (pLastIsoRun.lastStrong == L) {
+ newProp = L; /* W7 */
+ if (!bd.isNumbersSpecial)
+ dirProps[position] = ENL;
+ pLastIsoRun.contextDir = L;
+ pLastIsoRun.contextPos = position;
+ }
+ else {
+ newProp = R; /* N0 */
+ if (pLastIsoRun.lastStrong == AL)
+ dirProps[position] = AN; /* W2 */
+ else
+ dirProps[position] = ENR;
+ pLastIsoRun.contextDir = R;
+ pLastIsoRun.contextPos = position;
+ }
+ }
+ else if (dirProp == AN) {
+ newProp = R; /* N0 */
+ pLastIsoRun.lastBase = AN;
+ pLastIsoRun.contextDir = R;
+ pLastIsoRun.contextPos = position;
+ }
+ else if (dirProp == NSM) {
+ /* if the last real char was ON, change NSM to ON so that it
+ will stay ON even if the last real char is a bracket which
+ may be changed to L or R */
+ newProp = pLastIsoRun.lastBase;
+ if (newProp == ON)
+ dirProps[position] = newProp;
+ }
+ else {
+ newProp = dirProp;
+ pLastIsoRun.lastBase = dirProp;
+ }
+ if (newProp <= R || newProp == AL) { /* a strong direction was seen: flag it on the pending openings */
+ int i;
+ short flag = (short)DirPropFlag(DirFromStrong(newProp));
+ for (i = pLastIsoRun.start; i < pLastIsoRun.limit; i++)
+ if (position > bd.openings[i].position) /* only brackets opened before this character */
+ bd.openings[i].flags |= flag;
+ }
+ }
+
/* perform (X1)..(X9) ------------------------------------------------------- */
/* determine if the text is mixed-directional or single-directional */
private byte directionFromFlags() {
+
/* if the text contains AN and neutrals, then some neutrals may become RTL */
if (!((flags & MASK_RTL) != 0 ||
((flags & DirPropFlag(AN)) != 0 &&
(flags & MASK_POSSIBLE_N) != 0))) {
- return Bidi.DIRECTION_LEFT_TO_RIGHT;
+ return LTR;
} else if ((flags & MASK_LTR) == 0) {
- return Bidi.DIRECTION_RIGHT_TO_LEFT;
+ return RTL;
} else {
return MIXED;
}
@@ -1330,16 +2039,16 @@
* Recalculate the flags to have them reflect the real properties
* after taking the explicit embeddings into account.
*
- * The Bidi algorithm is designed to result in the same behavior whether embedding
+ * The BiDi algorithm is designed to result in the same behavior whether embedding
* levels are externally specified (from "styled text", supposedly the preferred
- * method) or set by explicit embedding codes (LRx, RLx, PDF) in the plain text.
- * That is why (X9) instructs to remove all explicit codes (and BN).
- * However, in a real implementation, this removal of these codes and their index
+ * method) or set by explicit embedding codes (LRx, RLx, PDF, FSI, PDI) in the plain text.
+ * That is why (X9) instructs to remove all not-isolate explicit codes (and BN).
+ * However, in a real implementation, the removal of these codes and their index
* positions in the plain text is undesirable since it would result in
* reallocated, reindexed text.
* Instead, this implementation leaves the codes in there and just ignores them
* in the subsequent processing.
- * In order to get the same reordering behavior, positions with a BN or an
+ * In order to get the same reordering behavior, positions with a BN or a not-isolate
* explicit embedding code just get the same level assigned as the last "real"
* character.
*
@@ -1351,185 +2060,281 @@
* This limits the scope of the implicit rules in effectively
* the same way as the run limits.
*
- * Instead, this implementation does not modify these codes.
+ * Instead, this implementation does not modify these codes, except for
+ * paired brackets whose properties (ON) may be replaced by L or R.
* On one hand, the paragraph has to be scanned for same-level-runs, but
* on the other hand, this saves another loop to reset these codes,
* or saves making and modifying a copy of dirProps[].
*
*
- * Note that (Pn) and (Xn) changed significantly from version 4 of the Bidi algorithm.
+ * Note that (Pn) and (Xn) changed significantly from version 4 of the BiDi algorithm.
*
*
* Handling the stack of explicit levels (Xn):
*
- * With the Bidi stack of explicit levels,
- * as pushed with each LRE, RLE, LRO, and RLO and popped with each PDF,
- * the explicit level must never exceed MAX_EXPLICIT_LEVEL==61.
+ * With the BiDi stack of explicit levels, as pushed with each
+ * LRE, RLE, LRO, RLO, LRI, RLI and FSI and popped with each PDF and PDI,
+ * the explicit level must never exceed MAX_EXPLICIT_LEVEL.
*
* In order to have a correct push-pop semantics even in the case of overflows,
- * there are two overflow counters:
- * - countOver60 is incremented with each LRx at level 60
- * - from level 60, one RLx increases the level to 61
- * - countOver61 is incremented with each LRx and RLx at level 61
- *
- * Popping levels with PDF must work in the opposite order so that level 61
- * is correct at the correct point. Underflows (too many PDFs) must be checked.
+ * overflow counters and a valid isolate counter are used as described in UAX#9
+ * section 3.3.2 "Explicit Levels and Directions".
*
* This implementation assumes that MAX_EXPLICIT_LEVEL is odd.
+ *
+ * Returns the direction
+ *
*/
private byte resolveExplicitLevels() {
int i = 0;
byte dirProp;
byte level = GetParaLevelAt(0);
-
byte dirct;
- int paraIndex = 0;
+ isolateCount = 0;
/* determine if the text is mixed-directional or single-directional */
dirct = directionFromFlags();
- /* we may not need to resolve any explicit levels, but for multiple
- paragraphs we want to loop on all chars to set the para boundaries */
- if ((dirct != MIXED) && (paraCount == 1)) {
+ /* we may not need to resolve any explicit levels */
+ if (dirct != MIXED) {
/* not mixed directionality: levels don't matter - trailingWSStart will be 0 */
- } else if ((paraCount == 1) &&
- ((flags & MASK_EXPLICIT) == 0)) {
- /* mixed, but all characters are at the same embedding level */
- /* or we are in "inverse Bidi" */
- /* and we don't have contextual multiple paragraphs with some B char */
+ return dirct;
+ }
+
+ if (reorderingMode > REORDER_LAST_LOGICAL_TO_VISUAL) {
+ /* inverse BiDi: mixed, but all characters are at the same embedding level */
/* set all levels to the paragraph level */
- for (i = 0; i < length; ++i) {
- levels[i] = level;
+ int paraIndex, start, limit;
+ for (paraIndex = 0; paraIndex < paraCount; paraIndex++) {
+ if (paraIndex == 0)
+ start = 0;
+ else
+ start = paras_limit[paraIndex - 1];
+ limit = paras_limit[paraIndex];
+ level = paras_level[paraIndex];
+ for (i = start; i < limit; i++)
+ levels[i] =level;
}
- } else {
- /* continue to perform (Xn) */
-
- /* (X1) level is set for all codes, embeddingLevel keeps track of the push/pop operations */
- /* both variables may carry the LEVEL_OVERRIDE flag to indicate the override status */
- byte embeddingLevel = level;
- byte newLevel;
- byte stackTop = 0;
-
- byte[] stack = new byte[MAX_EXPLICIT_LEVEL]; /* we never push anything >=MAX_EXPLICIT_LEVEL */
- int countOver60 = 0;
- int countOver61 = 0; /* count overflows of explicit levels */
-
- /* recalculate the flags */
- flags = 0;
-
- for (i = 0; i < length; ++i) {
- dirProp = NoContextRTL(dirProps[i]);
- switch(dirProp) {
- case LRE:
- case LRO:
- /* (X3, X5) */
- newLevel = (byte)((embeddingLevel+2) & ~(INTERNAL_LEVEL_OVERRIDE | 1)); /* least greater even level */
- if (newLevel <= MAX_EXPLICIT_LEVEL) {
- stack[stackTop] = embeddingLevel;
- ++stackTop;
- embeddingLevel = newLevel;
- if (dirProp == LRO) {
- embeddingLevel |= INTERNAL_LEVEL_OVERRIDE;
+ return dirct; /* no bracket matching for inverse BiDi */
+ }
+ if ((flags & (MASK_EXPLICIT | MASK_ISO)) == 0) {
+ /* no embeddings, set all levels to the paragraph level */
+ /* we still have to perform bracket matching */
+ int paraIndex, start, limit;
+ BracketData bracketData = new BracketData();
+ bracketInit(bracketData);
+ for (paraIndex = 0; paraIndex < paraCount; paraIndex++) {
+ if (paraIndex == 0)
+ start = 0;
+ else
+ start = paras_limit[paraIndex-1];
+ limit = paras_limit[paraIndex];
+ level = paras_level[paraIndex];
+ for (i = start; i < limit; i++) {
+ levels[i] = level;
+ dirProp = dirProps[i];
+ if (dirProp == BN)
+ continue;
+ if (dirProp == B) {
+ if ((i + 1) < length) {
+ if (text[i] == CR && text[i + 1] == LF)
+ continue; /* skip CR when followed by LF */
+ bracketProcessB(bracketData, level);
}
- /* we don't need to set LEVEL_OVERRIDE off for LRE
- since this has already been done for newLevel which is
- the source for embeddingLevel.
- */
- } else if ((embeddingLevel & ~INTERNAL_LEVEL_OVERRIDE) == MAX_EXPLICIT_LEVEL) {
- ++countOver61;
- } else /* (embeddingLevel & ~INTERNAL_LEVEL_OVERRIDE) == MAX_EXPLICIT_LEVEL-1 */ {
- ++countOver60;
+ continue;
}
- flags |= DirPropFlag(BN);
- break;
- case RLE:
- case RLO:
- /* (X2, X4) */
- newLevel=(byte)(((embeddingLevel & ~INTERNAL_LEVEL_OVERRIDE) + 1) | 1); /* least greater odd level */
- if (newLevel<=MAX_EXPLICIT_LEVEL) {
- stack[stackTop] = embeddingLevel;
- ++stackTop;
- embeddingLevel = newLevel;
- if (dirProp == RLO) {
- embeddingLevel |= INTERNAL_LEVEL_OVERRIDE;
- }
- /* we don't need to set LEVEL_OVERRIDE off for RLE
- since this has already been done for newLevel which is
- the source for embeddingLevel.
- */
- } else {
- ++countOver61;
- }
- flags |= DirPropFlag(BN);
- break;
- case PDF:
- /* (X7) */
- /* handle all the overflow cases first */
- if (countOver61 > 0) {
- --countOver61;
- } else if (countOver60 > 0 && (embeddingLevel & ~INTERNAL_LEVEL_OVERRIDE) != MAX_EXPLICIT_LEVEL) {
- /* handle LRx overflows from level 60 */
- --countOver60;
- } else if (stackTop > 0) {
- /* this is the pop operation; it also pops level 61 while countOver60>0 */
- --stackTop;
- embeddingLevel = stack[stackTop];
- /* } else { (underflow) */
- }
- flags |= DirPropFlag(BN);
- break;
- case B:
- stackTop = 0;
- countOver60 = 0;
- countOver61 = 0;
- level = GetParaLevelAt(i);
- if ((i + 1) < length) {
- embeddingLevel = GetParaLevelAt(i+1);
- if (!((text[i] == CR) && (text[i + 1] == LF))) {
- paras[paraIndex++] = i+1;
- }
- }
- flags |= DirPropFlag(B);
- break;
- case BN:
- /* BN, LRE, RLE, and PDF are supposed to be removed (X9) */
- /* they will get their levels set correctly in adjustWSLevels() */
- flags |= DirPropFlag(BN);
- break;
- default:
- /* all other types get the "real" level */
- if (level != embeddingLevel) {
- level = embeddingLevel;
- if ((level & INTERNAL_LEVEL_OVERRIDE) != 0) {
- flags |= DirPropFlagO(level) | DirPropFlagMultiRuns;
- } else {
- flags |= DirPropFlagE(level) | DirPropFlagMultiRuns;
- }
- }
- if ((level & INTERNAL_LEVEL_OVERRIDE) == 0) {
- flags |= DirPropFlag(dirProp);
- }
+ bracketProcessChar(bracketData, i);
+ }
+ }
+ return dirct;
+ }
+ /* continue to perform (Xn) */
+
+ /* (X1) level is set for all codes, embeddingLevel keeps track of the push/pop operations */
+ /* both variables may carry the LEVEL_OVERRIDE flag to indicate the override status */
+ byte embeddingLevel = level, newLevel;
+ byte previousLevel = level; /* previous level for regular (not CC) characters */
+ int lastCcPos = 0; /* index of last effective LRx,RLx, PDx */
+
+ /* The following stack remembers the embedding level and the ISOLATE flag of level runs.
+ stackLast points to its current entry. */
+ short[] stack = new short[MAX_EXPLICIT_LEVEL + 2]; /* we never push anything >= MAX_EXPLICIT_LEVEL
+ but we need one more entry as base */
+ int stackLast = 0;
+ int overflowIsolateCount = 0;
+ int overflowEmbeddingCount = 0;
+ int validIsolateCount = 0;
+ BracketData bracketData = new BracketData();
+ bracketInit(bracketData);
+ stack[0] = level; /* initialize base entry to para level, no override, no isolate */
+
+ /* recalculate the flags */
+ flags = 0;
+
+ for (i = 0; i < length; i++) {
+ dirProp = dirProps[i];
+ switch (dirProp) {
+ case LRE:
+ case RLE:
+ case LRO:
+ case RLO:
+ /* (X2, X3, X4, X5) */
+ flags |= DirPropFlag(BN);
+ levels[i] = previousLevel;
+ if (dirProp == LRE || dirProp == LRO) {
+ /* least greater even level */
+ newLevel = (byte)((embeddingLevel+2) & ~(LEVEL_OVERRIDE | 1));
+ } else {
+ /* least greater odd level */
+ newLevel = (byte)((NoOverride(embeddingLevel) + 1) | 1);
+ }
+ if (newLevel <= MAX_EXPLICIT_LEVEL && overflowIsolateCount == 0 &&
+ overflowEmbeddingCount == 0) {
+ lastCcPos = i;
+ embeddingLevel = newLevel;
+ if (dirProp == LRO || dirProp == RLO)
+ embeddingLevel |= LEVEL_OVERRIDE;
+ stackLast++;
+ stack[stackLast] = embeddingLevel;
+ /* we don't need to set LEVEL_OVERRIDE off for LRE and RLE
+ since this has already been done for newLevel which is
+ the source for embeddingLevel.
+ */
+ } else {
+ if (overflowIsolateCount == 0)
+ overflowEmbeddingCount++;
+ }
+ break;
+ case PDF:
+ /* (X7) */
+ flags |= DirPropFlag(BN);
+ levels[i] = previousLevel;
+ /* handle all the overflow cases first */
+ if (overflowIsolateCount > 0) {
break;
}
-
- /*
- * We need to set reasonable levels even on BN codes and
- * explicit codes because we will later look at same-level runs (X10).
- */
- levels[i] = level;
- }
- if ((flags & MASK_EMBEDDING) != 0) {
- flags |= DirPropFlagLR(paraLevel);
+ if (overflowEmbeddingCount > 0) {
+ overflowEmbeddingCount--;
+ break;
+ }
+ if (stackLast > 0 && stack[stackLast] < ISOLATE) { /* not an isolate entry */
+ lastCcPos = i;
+ stackLast--;
+ embeddingLevel = (byte)stack[stackLast];
+ }
+ break;
+ case LRI:
+ case RLI:
+ flags |= DirPropFlag(ON) | DirPropFlagLR(embeddingLevel);
+ levels[i] = NoOverride(embeddingLevel);
+ if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) {
+ bracketProcessBoundary(bracketData, lastCcPos,
+ previousLevel, embeddingLevel);
+ flags |= DirPropFlagMultiRuns;
+ }
+ previousLevel = embeddingLevel;
+ /* (X5a, X5b) */
+ if (dirProp == LRI)
+ /* least greater even level */
+ newLevel=(byte)((embeddingLevel+2)&~(LEVEL_OVERRIDE|1));
+ else
+ /* least greater odd level */
+ newLevel=(byte)((NoOverride(embeddingLevel)+1)|1);
+ if (newLevel <= MAX_EXPLICIT_LEVEL && overflowIsolateCount == 0
+ && overflowEmbeddingCount == 0) {
+ flags |= DirPropFlag(dirProp);
+ lastCcPos = i;
+ validIsolateCount++;
+ if (validIsolateCount > isolateCount)
+ isolateCount = validIsolateCount;
+ embeddingLevel = newLevel;
+ /* we can increment stackLast without checking because newLevel
+ will exceed MAX_EXPLICIT_LEVEL before stackLast overflows */
+ stackLast++;
+ stack[stackLast] = (short)(embeddingLevel + ISOLATE);
+ bracketProcessLRI_RLI(bracketData, embeddingLevel);
+ } else {
+ /* make it WS so that it is handled by adjustWSLevels() */
+ dirProps[i] = WS;
+ overflowIsolateCount++;
+ }
+ break;
+ case PDI:
+ if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) {
+ bracketProcessBoundary(bracketData, lastCcPos,
+ previousLevel, embeddingLevel);
+ flags |= DirPropFlagMultiRuns;
+ }
+ /* (X6a) */
+ if (overflowIsolateCount > 0) {
+ overflowIsolateCount--;
+ /* make it WS so that it is handled by adjustWSLevels() */
+ dirProps[i] = WS;
+ }
+ else if (validIsolateCount > 0) {
+ flags |= DirPropFlag(PDI);
+ lastCcPos = i;
+ overflowEmbeddingCount = 0;
+ while (stack[stackLast] < ISOLATE) /* pop embedding entries */
+ stackLast--; /* until the last isolate entry */
+ stackLast--; /* pop also the last isolate entry */
+ validIsolateCount--;
+ bracketProcessPDI(bracketData);
+ } else
+ /* make it WS so that it is handled by adjustWSLevels() */
+ dirProps[i] = WS;
+ embeddingLevel = (byte)(stack[stackLast] & ~ISOLATE);
+ flags |= DirPropFlag(ON) | DirPropFlagLR(embeddingLevel);
+ previousLevel = embeddingLevel;
+ levels[i] = NoOverride(embeddingLevel);
+ break;
+ case B:
+ flags |= DirPropFlag(B);
+ levels[i] = GetParaLevelAt(i);
+ if ((i + 1) < length) {
+ if (text[i] == CR && text[i + 1] == LF)
+ break; /* skip CR when followed by LF */
+ overflowEmbeddingCount = overflowIsolateCount = 0;
+ validIsolateCount = 0;
+ stackLast = 0;
+ previousLevel = embeddingLevel = GetParaLevelAt(i + 1);
+ stack[0] = embeddingLevel; /* initialize base entry to para level, no override, no isolate */
+ bracketProcessB(bracketData, embeddingLevel);
+ }
+ break;
+ case BN:
+ /* BN, LRE, RLE, and PDF are supposed to be removed (X9) */
+ /* they will get their levels set correctly in adjustWSLevels() */
+ levels[i] = previousLevel;
+ flags |= DirPropFlag(BN);
+ break;
+ default:
+ /* all other types are normal characters and get the "real" level */
+ if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) {
+ bracketProcessBoundary(bracketData, lastCcPos,
+ previousLevel, embeddingLevel);
+ flags |= DirPropFlagMultiRuns;
+ if ((embeddingLevel & LEVEL_OVERRIDE) != 0)
+ flags |= DirPropFlagO(embeddingLevel);
+ else
+ flags |= DirPropFlagE(embeddingLevel);
+ }
+ previousLevel = embeddingLevel;
+ levels[i] = embeddingLevel;
+ bracketProcessChar(bracketData, i);
+ /* the dirProp may have been changed in bracketProcessChar() */
+ flags |= DirPropFlag(dirProps[i]);
+ break;
}
- if (orderParagraphsLTR && (flags & DirPropFlag(B)) != 0) {
- flags |= DirPropFlag(L);
- }
-
- /* subsequently, ignore the explicit codes and BN (X9) */
-
- /* again, determine if the text is mixed-directional or single-directional */
- dirct = directionFromFlags();
+ }
+ if ((flags & MASK_EMBEDDING) != 0) {
+ flags |= DirPropFlagLR(paraLevel);
}
+ if (orderParagraphsLTR && (flags & DirPropFlag(B)) != 0) {
+ flags |= DirPropFlag(L);
+ }
+ /* again, determine if the text is mixed-directional or single-directional */
+ dirct = directionFromFlags();
return dirct;
}
@@ -1547,49 +2352,57 @@
private byte checkExplicitLevels() {
byte dirProp;
int i;
+ int isolateCount = 0;
+
this.flags = 0; /* collect all directionalities in the text */
byte level;
- int paraIndex = 0;
+ this.isolateCount = 0;
for (i = 0; i < length; ++i) {
if (levels[i] == 0) {
- levels[i] = paraLevel;
+ levels[i] = paraLevel;
}
+
+ // for backward compatibility
if (MAX_EXPLICIT_LEVEL < (levels[i]&0x7f)) {
- if ((levels[i] & INTERNAL_LEVEL_OVERRIDE) != 0) {
- levels[i] = (byte)(paraLevel|INTERNAL_LEVEL_OVERRIDE);
+ if ((levels[i] & LEVEL_OVERRIDE) != 0) {
+ levels[i] = (byte)(paraLevel|LEVEL_OVERRIDE);
} else {
levels[i] = paraLevel;
}
}
+
level = levels[i];
- dirProp = NoContextRTL(dirProps[i]);
- if ((level & INTERNAL_LEVEL_OVERRIDE) != 0) {
+ dirProp = dirProps[i];
+ if (dirProp == LRI || dirProp == RLI) {
+ isolateCount++;
+ if (isolateCount > this.isolateCount)
+ this.isolateCount = isolateCount;
+ }
+ else if (dirProp == PDI) {
+ isolateCount--;
+ } else if (dirProp == B) {
+ isolateCount = 0;
+ }
+ if ((level & LEVEL_OVERRIDE) != 0) {
/* keep the override flag in levels[i] but adjust the flags */
- level &= ~INTERNAL_LEVEL_OVERRIDE; /* make the range check below simpler */
+ level &= ~LEVEL_OVERRIDE; /* make the range check below simpler */
flags |= DirPropFlagO(level);
} else {
/* set the flags */
flags |= DirPropFlagE(level) | DirPropFlag(dirProp);
}
-
if ((level < GetParaLevelAt(i) &&
!((0 == level) && (dirProp == B))) ||
- (MAX_EXPLICIT_LEVEL <level)) {
+ (MAX_EXPLICIT_LEVEL < level)) {
/* level out of bounds */
throw new IllegalArgumentException("level " + level +
- " out of bounds at index " + i);
- }
- if ((dirProp == B) && ((i + 1) < length)) {
- if (!((text[i] == CR) && (text[i + 1] == LF))) {
- paras[paraIndex++] = i + 1;
- }
+ " out of bounds at " + i);
}
}
- if ((flags&MASK_EMBEDDING) != 0) {
+ if ((flags & MASK_EMBEDDING) != 0) {
flags |= DirPropFlagLR(paraLevel);
}
-
/* determine if the text is mixed-directional or single-directional */
return directionFromFlags();
}
@@ -1610,7 +2423,7 @@
/*********************************************************************/
/* Definitions and type for properties state tables */
/*********************************************************************/
- private static final int IMPTABPROPS_COLUMNS = 14;
+ private static final int IMPTABPROPS_COLUMNS = 16;
private static final int IMPTABPROPS_RES = IMPTABPROPS_COLUMNS - 1;
private static short GetStateProps(short cell) {
return (short)(cell & 0x1f);
@@ -1621,8 +2434,8 @@
private static final short groupProp[] = /* dirProp regrouped */
{
- /* L R EN ES ET AN CS B S WS ON LRE LRO AL RLE RLO PDF NSM BN */
- 0, 1, 2, 7, 8, 3, 9, 6, 5, 4, 4, 10, 10, 12, 10, 10, 10, 11, 10
+ /* L R EN ES ET AN CS B S WS ON LRE LRO AL RLE RLO PDF NSM BN FSI LRI RLI PDI ENL ENR */
+ 0, 1, 2, 7, 8, 3, 9, 6, 5, 4, 4, 10, 10, 12, 10, 10, 10, 11, 10, 4, 4, 4, 4, 13, 14
};
private static final short _L = 0;
private static final short _R = 1;
@@ -1637,7 +2450,7 @@
/* PROPERTIES STATE TABLE */
/* */
/* In table impTabProps, */
- /* - the ON column regroups ON and WS */
+ /* - the ON column regroups ON, WS, FSI, RLI, LRI and PDI */
/* - the BN column regroups BN, LRE, RLE, LRO, RLO, PDF */
/* - the Res column is the reduced property assigned to a run */
/* */
@@ -1668,25 +2481,31 @@
/* */
private static final short impTabProps[][] =
{
-/* L, R, EN, AN, ON, S, B, ES, ET, CS, BN, NSM, AL, Res */
-/* 0 Init */ { 1, 2, 4, 5, 7, 15, 17, 7, 9, 7, 0, 7, 3, _ON },
-/* 1 L */ { 1, 32+2, 32+4, 32+5, 32+7, 32+15, 32+17, 32+7, 32+9, 32+7, 1, 1, 32+3, _L },
-/* 2 R */ { 32+1, 2, 32+4, 32+5, 32+7, 32+15, 32+17, 32+7, 32+9, 32+7, 2, 2, 32+3, _R },
-/* 3 AL */ { 32+1, 32+2, 32+6, 32+6, 32+8, 32+16, 32+17, 32+8, 32+8, 32+8, 3, 3, 3, _R },
-/* 4 EN */ { 32+1, 32+2, 4, 32+5, 32+7, 32+15, 32+17, 64+10, 11, 64+10, 4, 4, 32+3, _EN },
-/* 5 AN */ { 32+1, 32+2, 32+4, 5, 32+7, 32+15, 32+17, 32+7, 32+9, 64+12, 5, 5, 32+3, _AN },
-/* 6 AL:EN/AN */ { 32+1, 32+2, 6, 6, 32+8, 32+16, 32+17, 32+8, 32+8, 64+13, 6, 6, 32+3, _AN },
-/* 7 ON */ { 32+1, 32+2, 32+4, 32+5, 7, 32+15, 32+17, 7, 64+14, 7, 7, 7, 32+3, _ON },
-/* 8 AL:ON */ { 32+1, 32+2, 32+6, 32+6, 8, 32+16, 32+17, 8, 8, 8, 8, 8, 32+3, _ON },
-/* 9 ET */ { 32+1, 32+2, 4, 32+5, 7, 32+15, 32+17, 7, 9, 7, 9, 9, 32+3, _ON },
-/*10 EN+ES/CS */ { 96+1, 96+2, 4, 96+5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 10, 128+7, 96+3, _EN },
-/*11 EN+ET */ { 32+1, 32+2, 4, 32+5, 32+7, 32+15, 32+17, 32+7, 11, 32+7, 11, 11, 32+3, _EN },
-/*12 AN+CS */ { 96+1, 96+2, 96+4, 5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 12, 128+7, 96+3, _AN },
-/*13 AL:EN/AN+CS */ { 96+1, 96+2, 6, 6, 128+8, 96+16, 96+17, 128+8, 128+8, 128+8, 13, 128+8, 96+3, _AN },
-/*14 ON+ET */ { 32+1, 32+2, 128+4, 32+5, 7, 32+15, 32+17, 7, 14, 7, 14, 14, 32+3, _ON },
-/*15 S */ { 32+1, 32+2, 32+4, 32+5, 32+7, 15, 32+17, 32+7, 32+9, 32+7, 15, 32+7, 32+3, _S },
-/*16 AL:S */ { 32+1, 32+2, 32+6, 32+6, 32+8, 16, 32+17, 32+8, 32+8, 32+8, 16, 32+8, 32+3, _S },
-/*17 B */ { 32+1, 32+2, 32+4, 32+5, 32+7, 32+15, 17, 32+7, 32+9, 32+7, 17, 32+7, 32+3, _B }
+/* L, R, EN, AN, ON, S, B, ES, ET, CS, BN, NSM, AL, ENL, ENR, Res */
+/* 0 Init */ { 1, 2, 4, 5, 7, 15, 17, 7, 9, 7, 0, 7, 3, 18, 21, _ON },
+/* 1 L */ { 1, 32+2, 32+4, 32+5, 32+7, 32+15, 32+17, 32+7, 32+9, 32+7, 1, 1, 32+3, 32+18, 32+21, _L },
+/* 2 R */ { 32+1, 2, 32+4, 32+5, 32+7, 32+15, 32+17, 32+7, 32+9, 32+7, 2, 2, 32+3, 32+18, 32+21, _R },
+/* 3 AL */ { 32+1, 32+2, 32+6, 32+6, 32+8, 32+16, 32+17, 32+8, 32+8, 32+8, 3, 3, 3, 32+18, 32+21, _R },
+/* 4 EN */ { 32+1, 32+2, 4, 32+5, 32+7, 32+15, 32+17, 64+10, 11, 64+10, 4, 4, 32+3, 18, 21, _EN },
+/* 5 AN */ { 32+1, 32+2, 32+4, 5, 32+7, 32+15, 32+17, 32+7, 32+9, 64+12, 5, 5, 32+3, 32+18, 32+21, _AN },
+/* 6 AL:EN/AN */ { 32+1, 32+2, 6, 6, 32+8, 32+16, 32+17, 32+8, 32+8, 64+13, 6, 6, 32+3, 18, 21, _AN },
+/* 7 ON */ { 32+1, 32+2, 32+4, 32+5, 7, 32+15, 32+17, 7, 64+14, 7, 7, 7, 32+3, 32+18, 32+21, _ON },
+/* 8 AL:ON */ { 32+1, 32+2, 32+6, 32+6, 8, 32+16, 32+17, 8, 8, 8, 8, 8, 32+3, 32+18, 32+21, _ON },
+/* 9 ET */ { 32+1, 32+2, 4, 32+5, 7, 32+15, 32+17, 7, 9, 7, 9, 9, 32+3, 18, 21, _ON },
+/*10 EN+ES/CS */ { 96+1, 96+2, 4, 96+5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 10, 128+7, 96+3, 18, 21, _EN },
+/*11 EN+ET */ { 32+1, 32+2, 4, 32+5, 32+7, 32+15, 32+17, 32+7, 11, 32+7, 11, 11, 32+3, 18, 21, _EN },
+/*12 AN+CS */ { 96+1, 96+2, 96+4, 5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 12, 128+7, 96+3, 96+18, 96+21, _AN },
+/*13 AL:EN/AN+CS */ { 96+1, 96+2, 6, 6, 128+8, 96+16, 96+17, 128+8, 128+8, 128+8, 13, 128+8, 96+3, 18, 21, _AN },
+/*14 ON+ET */ { 32+1, 32+2, 128+4, 32+5, 7, 32+15, 32+17, 7, 14, 7, 14, 14, 32+3,128+18,128+21, _ON },
+/*15 S */ { 32+1, 32+2, 32+4, 32+5, 32+7, 15, 32+17, 32+7, 32+9, 32+7, 15, 32+7, 32+3, 32+18, 32+21, _S },
+/*16 AL:S */ { 32+1, 32+2, 32+6, 32+6, 32+8, 16, 32+17, 32+8, 32+8, 32+8, 16, 32+8, 32+3, 32+18, 32+21, _S },
+/*17 B */ { 32+1, 32+2, 32+4, 32+5, 32+7, 32+15, 17, 32+7, 32+9, 32+7, 17, 32+7, 32+3, 32+18, 32+21, _B },
+/*18 ENL */ { 32+1, 32+2, 18, 32+5, 32+7, 32+15, 32+17, 64+19, 20, 64+19, 18, 18, 32+3, 18, 21, _L },
+/*19 ENL+ES/CS */ { 96+1, 96+2, 18, 96+5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 19, 128+7, 96+3, 18, 21, _L },
+/*20 ENL+ET */ { 32+1, 32+2, 18, 32+5, 32+7, 32+15, 32+17, 32+7, 20, 32+7, 20, 20, 32+3, 18, 21, _L },
+/*21 ENR */ { 32+1, 32+2, 21, 32+5, 32+7, 32+15, 32+17, 64+22, 23, 64+22, 21, 21, 32+3, 18, 21, _AN },
+/*22 ENR+ES/CS */ { 96+1, 96+2, 21, 96+5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 22, 128+7, 96+3, 18, 21, _AN },
+/*23 ENR+ET */ { 32+1, 32+2, 21, 32+5, 32+7, 32+15, 32+17, 32+7, 23, 32+7, 23, 23, 32+3, 18, 21, _AN }
};
/*********************************************************************/
@@ -1760,7 +2579,7 @@
/* */
private static final byte impTabL_DEFAULT[][] = /* Even paragraph level */
- /* In this table, conditional sequences receive the higher possible level
+ /* In this table, conditional sequences receive the lower possible level
until proven otherwise.
*/
{
@@ -1769,8 +2588,8 @@
/* 1 : R */ { 0, 1, 3, 3, 0x14, 0x14, 0, 1 },
/* 2 : AN */ { 0, 1, 0, 2, 0x15, 0x15, 0, 2 },
/* 3 : R+EN/AN */ { 0, 1, 3, 3, 0x14, 0x14, 0, 2 },
- /* 4 : R+ON */ { 0x20, 1, 3, 3, 4, 4, 0x20, 1 },
- /* 5 : AN+ON */ { 0x20, 1, 0x20, 2, 5, 5, 0x20, 1 }
+ /* 4 : R+ON */ { 0, 0x21, 0x33, 0x33, 4, 4, 0, 0 },
+ /* 5 : AN+ON */ { 0, 0x21, 0, 0x32, 5, 5, 0, 0 }
};
private static final byte impTabR_DEFAULT[][] = /* Odd paragraph level */
@@ -1787,20 +2606,20 @@
/* 5 : L+AN+ON */ { 1, 0, 1, 3, 5, 5, 0, 0 }
};
- private static final short[] impAct0 = {0,1,2,3,4,5,6};
+ private static final short[] impAct0 = {0,1,2,3,4};
private static final ImpTabPair impTab_DEFAULT = new ImpTabPair(
impTabL_DEFAULT, impTabR_DEFAULT, impAct0, impAct0);
private static final byte impTabL_NUMBERS_SPECIAL[][] = { /* Even paragraph level */
- /* In this table, conditional sequences receive the higher possible
+ /* In this table, conditional sequences receive the lower possible
level until proven otherwise.
*/
/* L, R, EN, AN, ON, S, B, Res */
- /* 0 : init */ { 0, 2, 1, 1, 0, 0, 0, 0 },
- /* 1 : L+EN/AN */ { 0, 2, 1, 1, 0, 0, 0, 2 },
- /* 2 : R */ { 0, 2, 4, 4, 0x13, 0, 0, 1 },
- /* 3 : R+ON */ { 0x20, 2, 4, 4, 3, 3, 0x20, 1 },
+ /* 0 : init */ { 0, 2, 0x11, 0x11, 0, 0, 0, 0 },
+ /* 1 : L+EN/AN */ { 0, 0x42, 1, 1, 0, 0, 0, 0 },
+ /* 2 : R */ { 0, 2, 4, 4, 0x13, 0x13, 0, 1 },
+ /* 3 : R+ON */ { 0, 0x22, 0x34, 0x34, 3, 3, 0, 0 },
/* 4 : R+EN/AN */ { 0, 2, 4, 4, 0x13, 0x13, 0, 2 }
};
private static final ImpTabPair impTab_NUMBERS_SPECIAL = new ImpTabPair(
@@ -1874,7 +2693,7 @@
/* 5 : L+AN+ON */ { 0x21, 0x30, 6, 4, 5, 5, 0x30, 2 },
/* 6 : L+ON+EN */ { 0x21, 0x30, 6, 4, 3, 3, 0x30, 1 }
};
- private static final short[] impAct1 = {0,1,11,12};
+ private static final short[] impAct1 = {0,1,13,14};
private static final ImpTabPair impTab_INVERSE_LIKE_DIRECT = new ImpTabPair(
impTabL_DEFAULT, impTabR_INVERSE_LIKE_DIRECT, impAct0, impAct1);
@@ -1898,15 +2717,16 @@
/* 0 : init */ { 0x13, 0, 1, 1, 0, 0, 0, 0 },
/* 1 : R+EN/AN */ { 0x23, 0, 1, 1, 2, 0x40, 0, 1 },
/* 2 : R+EN/AN+ON */ { 0x23, 0, 1, 1, 2, 0x40, 0, 0 },
- /* 3 : L */ { 3 , 0, 3, 0x36, 0x14, 0x40, 0, 1 },
+ /* 3 : L */ { 3, 0, 3, 0x36, 0x14, 0x40, 0, 1 },
/* 4 : L+ON */ { 0x53, 0x40, 5, 0x36, 4, 0x40, 0x40, 0 },
/* 5 : L+ON+EN */ { 0x53, 0x40, 5, 0x36, 4, 0x40, 0x40, 1 },
/* 6 : L+AN */ { 0x53, 0x40, 6, 6, 4, 0x40, 0x40, 3 }
};
- private static final short impAct2[] = {0,1,7,8,9,10};
+ private static final short[] impAct2 = {0,1,2,5,6,7,8};
+ private static final short[] impAct3 = {0,1,9,10,11,12};
private static final ImpTabPair impTab_INVERSE_LIKE_DIRECT_WITH_MARKS =
new ImpTabPair(impTabL_INVERSE_LIKE_DIRECT_WITH_MARKS,
- impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS, impAct0, impAct2);
+ impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS, impAct2, impAct3);
private static final ImpTabPair impTab_INVERSE_FOR_NUMBERS_SPECIAL = new ImpTabPair(
impTabL_NUMBERS_SPECIAL, impTabR_INVERSE_LIKE_DIRECT, impAct0, impAct1);
@@ -1923,14 +2743,15 @@
};
private static final ImpTabPair impTab_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS = new
ImpTabPair(impTabL_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS,
- impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS, impAct0, impAct2);
-
- private class LevState {
+ impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS, impAct2, impAct3);
+
+ private static class LevState {
byte[][] impTab; /* level table pointer */
short[] impAct; /* action map array */
int startON; /* start of ON sequence */
int startL2EN; /* start of level 2 sequence */
int lastStrongRTL; /* index of last found R or AL */
+ int runStart; /* start position of the run */
short state; /* current state */
byte runLevel; /* run level before implicit solving */
}
@@ -1962,6 +2783,22 @@
insertPoints.size++;
}
+ private void setLevelsOutsideIsolates(int start, int limit, byte level)
+ {
+ byte dirProp;
+ int isolateCount = 0, k;
+ for (k = start; k < limit; k++) {
+ dirProp = dirProps[k];
+ if (dirProp == PDI)
+ isolateCount--;
+ if (isolateCount == 0) {
+ levels[k] = level;
+ }
+ if (dirProp == LRI || dirProp == RLI)
+ isolateCount++;
+ }
+ }
+
/* perform rules (Wn), (Nn), and (In) on a run of the text ------------------ */
/*
@@ -2003,7 +2840,17 @@
start = levState.startON;
break;
- case 3: /* L or S after possible relevant EN/AN */
+ case 3: /* EN/AN after R+ON */
+ level = (byte)(levState.runLevel + 1);
+ setLevelsOutsideIsolates(levState.startON, start0, level);
+ break;
+
+ case 4: /* EN/AN before R for NUMBERS_SPECIAL */
+ level = (byte)(levState.runLevel + 2);
+ setLevelsOutsideIsolates(levState.startON, start0, level);
+ break;
+
+ case 5: /* L or S after possible relevant EN/AN */
/* check if we had EN after R/AL */
if (levState.startL2EN >= 0) {
addPoint(levState.startL2EN, LRM_BEFORE);
@@ -2039,7 +2886,7 @@
}
break;
- case 4: /* R/AL after possible relevant EN/AN */
+ case 6: /* R/AL after possible relevant EN/AN */
/* just clean up */
if (insertPoints.points.length > 0)
/* remove all non confirmed insert points */
@@ -2049,12 +2896,15 @@
levState.lastStrongRTL = limit - 1;
break;
- case 5: /* EN/AN after R/AL + possible cont */
+ case 7: /* EN/AN after R/AL + possible cont */
/* check for real AN */
- if ((_prop == _AN) && (NoContextRTL(dirProps[start0]) == AN)) {
+
+ if ((_prop == _AN) && (dirProps[start0] == AN) &&
+ (reorderingMode != REORDER_INVERSE_FOR_NUMBERS_SPECIAL))
+ {
/* real AN */
if (levState.startL2EN == -1) { /* if no relevant EN already found */
- /* just note the righmost digit as a strong RTL */
+ /* just note the rightmost digit as a strong RTL */
levState.lastStrongRTL = limit - 1;
break;
}
@@ -2072,12 +2922,12 @@
}
break;
- case 6: /* note location of latest R/AL */
+ case 8: /* note location of latest R/AL */
levState.lastStrongRTL = limit - 1;
levState.startON = -1;
break;
- case 7: /* L after R+ON/EN/AN */
+ case 9: /* L after R+ON/EN/AN */
/* include possible adjacent number on the left */
for (k = start0-1; k >= 0 && ((levels[k] & 1) == 0); k--) {
}
@@ -2088,14 +2938,14 @@
levState.startON = start0;
break;
- case 8: /* AN after L */
+ case 10: /* AN after L */
/* AN numbers between L text on both sides may be trouble. */
/* tentatively bracket with LRMs; will be confirmed if followed by L */
addPoint(start0, LRM_BEFORE); /* add LRM before */
addPoint(start0, LRM_AFTER); /* add LRM after */
break;
- case 9: /* R after L+ON/EN/AN */
+ case 11: /* R after L+ON/EN/AN */
/* false alert, infirm LRMs around previous AN */
insertPoints.size=insertPoints.confirmed;
if (_prop == _S) { /* add RLM before S */
@@ -2104,7 +2954,7 @@
}
break;
- case 10: /* L after L+ON/AN */
+ case 12: /* L after L+ON/AN */
level = (byte)(levState.runLevel + addLevel);
for (k=levState.startON; k < start0; k++) {
if (levels[k] < level) {
@@ -2115,7 +2965,7 @@
levState.startON = start0;
break;
- case 11: /* L after L+ON+EN/AN/ON */
+ case 13: /* L after L+ON+EN/AN/ON */
level = levState.runLevel;
for (k = start0-1; k >= levState.startON; k--) {
if (levels[k] == level+3) {
@@ -2134,7 +2984,7 @@
}
break;
- case 12: /* R after L+ON+EN/AN/ON */
+ case 14: /* R after L+ON+EN/AN/ON */
level = (byte)(levState.runLevel+1);
for (k = start0-1; k >= levState.startON; k--) {
if (levels[k] > level) {
@@ -2149,22 +2999,27 @@
}
if ((addLevel) != 0 || (start < start0)) {
level = (byte)(levState.runLevel + addLevel);
- for (k = start; k < limit; k++) {
- levels[k] = level;
+ if (start >= levState.runStart) {
+ for (k = start; k < limit; k++) {
+ levels[k] = level;
+ }
+ } else {
+ setLevelsOutsideIsolates(start, limit, level);
}
}
}
private void resolveImplicitLevels(int start, int limit, short sor, short eor)
{
+ byte dirProp;
LevState levState = new LevState();
int i, start1, start2;
short oldStateImp, stateImp, actionImp;
short gprop, resProp, cell;
+ boolean inverseRTL;
short nextStrongProp = R;
int nextStrongPos = -1;
-
/* check for RTL inverse Bidi mode */
/* FOOD FOR THOUGHT: in case of RTL inverse Bidi, it would make sense to
* loop on the text characters from end to start.
@@ -2172,29 +3027,78 @@
* actions) and different levels state tables (maybe very similar to the
* LTR corresponding ones.
*/
- /* initialize for levels state table */
+ inverseRTL=((start<lastArabicPos) && ((GetParaLevelAt(start) & 1)>0) &&
+ (reorderingMode == REORDER_INVERSE_LIKE_DIRECT ||
+ reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL));
+ /* initialize for property and levels state table */
levState.startL2EN = -1; /* used for INVERSE_LIKE_DIRECT_WITH_MARKS */
levState.lastStrongRTL = -1; /* used for INVERSE_LIKE_DIRECT_WITH_MARKS */
- levState.state = 0;
+ levState.runStart = start;
levState.runLevel = levels[start];
levState.impTab = impTabPair.imptab[levState.runLevel & 1];
levState.impAct = impTabPair.impact[levState.runLevel & 1];
- processPropertySeq(levState, sor, start, start);
- /* initialize for property state table */
- if (dirProps[start] == NSM) {
- stateImp = (short)(1 + sor);
+
+ /* The isolates[] entries contain enough information to
+ resume the bidi algorithm in the same state as it was
+ when it was interrupted by an isolate sequence. */
+ if (dirProps[start] == PDI) {
+ levState.startON = isolates[isolateCount].startON;
+ start1 = isolates[isolateCount].start1;
+ stateImp = isolates[isolateCount].stateImp;
+ levState.state = isolates[isolateCount].state;
+ isolateCount--;
} else {
- stateImp = 0;
+ levState.startON = -1;
+ start1 = start;
+ if (dirProps[start] == NSM)
+ stateImp = (short)(1 + sor);
+ else
+ stateImp = 0;
+ levState.state = 0;
+ processPropertySeq(levState, sor, start, start);
}
- start1 = start;
- start2 = 0;
+ start2 = start; /* to make the Java compiler happy */
for (i = start; i <= limit; i++) {
if (i >= limit) {
+ int k;
+ for (k = limit - 1;
+ k > start &&
+ (DirPropFlag(dirProps[k]) & MASK_BN_EXPLICIT) != 0;
+ k--);
+ dirProp = dirProps[k];
+ if (dirProp == LRI || dirProp == RLI)
+ break; /* no forced closing for sequence ending with LRI/RLI */
gprop = eor;
} else {
- short prop, prop1;
- prop = NoContextRTL(dirProps[i]);
+ byte prop, prop1;
+ prop = dirProps[i];
+ if (prop == B)
+ isolateCount = -1; /* current isolates stack entry == none */
+ if (inverseRTL) {
+ if (prop == AL) {
+ /* AL before EN does not make it AN */
+ prop = R;
+ } else if (prop == EN) {
+ if (nextStrongPos <= i) {
+ /* look for next strong char (L/R/AL) */
+ int j;
+ nextStrongProp = R; /* set default */
+ nextStrongPos = limit;
+ for (j = i+1; j < limit; j++) {
+ prop1 = dirProps[j];
+ if (prop1 == L || prop1 == R || prop1 == AL) {
+ nextStrongProp = prop1;
+ nextStrongPos = j;
+ break;
+ }
+ }
+ }
+ if (nextStrongProp == AL) {
+ prop = AN;
+ }
+ }
+ }
gprop = groupProp[prop];
}
oldStateImp = stateImp;
@@ -2230,8 +3134,24 @@
}
}
}
- /* flush possible pending sequence, e.g. ON */
- processPropertySeq(levState, eor, limit, limit);
+
+ /* look for the last char that is not a BN or LRE/RLE/LRO/RLO/PDF */
+ for (i = limit - 1;
+ i > start &&
+ (DirPropFlag(dirProps[i]) & MASK_BN_EXPLICIT) != 0;
+ i--);
+ dirProp = dirProps[i];
+ if ((dirProp == LRI || dirProp == RLI) && limit < length) {
+ isolateCount++;
+ if (isolates[isolateCount] == null)
+ isolates[isolateCount] = new Isolate();
+ isolates[isolateCount].stateImp = stateImp;
+ isolates[isolateCount].state = levState.state;
+ isolates[isolateCount].start1 = start1;
+ isolates[isolateCount].startON = levState.startON;
+ }
+ else
+ processPropertySeq(levState, eor, limit, limit);
}
/* perform (L1) and (X9) ---------------------------------------------------- */
@@ -2250,7 +3170,7 @@
i = trailingWSStart;
while (i > 0) {
/* reset a sequence of WS/BN before eop and B/S to the paragraph paraLevel */
- while (i > 0 && ((flag = DirPropFlagNC(dirProps[--i])) & MASK_WS) != 0) {
+ while (i > 0 && ((flag = DirPropFlag(dirProps[--i])) & MASK_WS) != 0) {
if (orderParagraphsLTR && (flag & DirPropFlag(B)) != 0) {
levels[i] = 0;
} else {
@@ -2261,7 +3181,7 @@
/* reset BN to the next character's paraLevel until B/S, which restarts above loop */
/* here, i+1 is guaranteed to be <length */
while (i > 0) {
- flag = DirPropFlagNC(dirProps[--i]);
+ flag = DirPropFlag(dirProps[--i]);
if ((flag & MASK_BN_EXPLICIT) != 0) {
levels[i] = levels[i + 1];
} else if (orderParagraphsLTR && (flag & DirPropFlag(B)) != 0) {
@@ -2276,6 +3196,10 @@
}
}
+ private void setParaSuccess() {
+ paraBidi = this; /* mark successful setPara */
+ }
+
private int Bidi_Min(int x, int y) {
return x < y ? x : y;
}
@@ -2284,6 +3208,159 @@
return x >= 0 ? x : -x;
}
+ void setParaRunsOnly(char[] parmText, byte parmParaLevel) {
+ int[] visualMap;
+ String visualText;
+ int saveLength, saveTrailingWSStart;
+ byte[] saveLevels;
+ byte saveDirection;
+ int i, j, visualStart, logicalStart,
+ oldRunCount, runLength, addedRuns, insertRemove,
+ start, limit, step, indexOddBit, logicalPos,
+ index, index1;
+ int saveOptions;
+
+ reorderingMode = REORDER_DEFAULT;
+ int parmLength = parmText.length;
+ if (parmLength == 0) {
+ setPara(parmText, parmParaLevel, null);
+ reorderingMode = REORDER_RUNS_ONLY;
+ return;
+ }
+ /* obtain memory for mapping table and visual text */
+ saveOptions = reorderingOptions;
+ if ((saveOptions & OPTION_INSERT_MARKS) > 0) {
+ reorderingOptions &= ~OPTION_INSERT_MARKS;
+ reorderingOptions |= OPTION_REMOVE_CONTROLS;
+ }
+ parmParaLevel &= 1; /* accept only 0 or 1 */
+ setPara(parmText, parmParaLevel, null);
+ /* we cannot access directly levels since it is not yet set if
+ * direction is not MIXED
+ */
+ saveLevels = new byte[this.length];
+ System.arraycopy(getLevels(), 0, saveLevels, 0, this.length);
+ saveTrailingWSStart = trailingWSStart;
+
+ /* FOOD FOR THOUGHT: instead of writing the visual text, we could use
+ * the visual map and the dirProps array to drive the second call
+ * to setPara (but must make provision for possible removal of
+ * Bidi controls). Alternatively, only use the dirProps array via
+ * customized classifier callback.
+ */
+ visualText = writeReordered(DO_MIRRORING);
+ visualMap = getVisualMap();
+ this.reorderingOptions = saveOptions;
+ saveLength = this.length;
+ saveDirection=this.direction;
+
+ this.reorderingMode = REORDER_INVERSE_LIKE_DIRECT;
+ parmParaLevel ^= 1;
+ setPara(visualText, parmParaLevel, null);
+ BidiLine.getRuns(this);
+ /* check if some runs must be split, count how many splits */
+ addedRuns = 0;
+ oldRunCount = this.runCount;
+ visualStart = 0;
+ for (i = 0; i < oldRunCount; i++, visualStart += runLength) {
+ runLength = runs[i].limit - visualStart;
+ if (runLength < 2) {
+ continue;
+ }
+ logicalStart = runs[i].start;
+ for (j = logicalStart+1; j < logicalStart+runLength; j++) {
+ index = visualMap[j];
+ index1 = visualMap[j-1];
+ if ((Bidi_Abs(index-index1)!=1) || (saveLevels[index]!=saveLevels[index1])) {
+ addedRuns++;
+ }
+ }
+ }
+ if (addedRuns > 0) {
+ getRunsMemory(oldRunCount + addedRuns);
+ if (runCount == 1) {
+ /* because we switch from UBiDi.simpleRuns to UBiDi.runs */
+ runsMemory[0] = runs[0];
+ } else {
+ System.arraycopy(runs, 0, runsMemory, 0, runCount);
+ }
+ runs = runsMemory;
+ runCount += addedRuns;
+ for (i = oldRunCount; i < runCount; i++) {
+ if (runs[i] == null) {
+ runs[i] = new BidiRun(0, 0, (byte)0);
+ }
+ }
+ }
+ /* split runs which are not consecutive in source text */
+ int newI;
+ for (i = oldRunCount-1; i >= 0; i--) {
+ newI = i + addedRuns;
+ runLength = i==0 ? runs[0].limit :
+ runs[i].limit - runs[i-1].limit;
+ logicalStart = runs[i].start;
+ indexOddBit = runs[i].level & 1;
+ if (runLength < 2) {
+ if (addedRuns > 0) {
+ runs[newI].copyFrom(runs[i]);
+ }
+ logicalPos = visualMap[logicalStart];
+ runs[newI].start = logicalPos;
+ runs[newI].level = (byte)(saveLevels[logicalPos] ^ indexOddBit);
+ continue;
+ }
+ if (indexOddBit > 0) {
+ start = logicalStart;
+ limit = logicalStart + runLength - 1;
+ step = 1;
+ } else {
+ start = logicalStart + runLength - 1;
+ limit = logicalStart;
+ step = -1;
+ }
+ for (j = start; j != limit; j += step) {
+ index = visualMap[j];
+ index1 = visualMap[j+step];
+ if ((Bidi_Abs(index-index1)!=1) || (saveLevels[index]!=saveLevels[index1])) {
+ logicalPos = Bidi_Min(visualMap[start], index);
+ runs[newI].start = logicalPos;
+ runs[newI].level = (byte)(saveLevels[logicalPos] ^ indexOddBit);
+ runs[newI].limit = runs[i].limit;
+ runs[i].limit -= Bidi_Abs(j - start) + 1;
+ insertRemove = runs[i].insertRemove & (LRM_AFTER|RLM_AFTER);
+ runs[newI].insertRemove = insertRemove;
+ runs[i].insertRemove &= ~insertRemove;
+ start = j + step;
+ addedRuns--;
+ newI--;
+ }
+ }
+ if (addedRuns > 0) {
+ runs[newI].copyFrom(runs[i]);
+ }
+ logicalPos = Bidi_Min(visualMap[start], visualMap[limit]);
+ runs[newI].start = logicalPos;
+ runs[newI].level = (byte)(saveLevels[logicalPos] ^ indexOddBit);
+ }
+
+ cleanup1:
+ /* restore initial paraLevel */
+ this.paraLevel ^= 1;
+ cleanup2:
+ /* restore real text */
+ this.text = parmText;
+ this.length = saveLength;
+ this.originalLength = parmLength;
+ this.direction=saveDirection;
+ this.levels = saveLevels;
+ this.trailingWSStart = saveTrailingWSStart;
+ if (runCount > 1) {
+ this.direction = MIXED;
+ }
+ cleanup3:
+ this.reorderingMode = REORDER_RUNS_ONLY;
+ }
+
/**
* Perform the Unicode Bidi algorithm. It is defined in the
* <a href="http://www.unicode.org/unicode/reports/tr9/">Unicode Standard Annex #9</a>,
@@ -2386,7 +3463,7 @@
* For example, in pure LTR text with numbers the numbers would get
* a resolved level of 2 higher than the surrounding text according to
* the algorithm. This implementation may set all resolved levels to
- * the same value in such a case.<p>
+ * the same value in such a case.
*
* The text can be composed of multiple paragraphs. Occurrence of a block
* separator in the text terminates a paragraph, and whatever comes next starts
@@ -2421,9 +3498,9 @@
* (same index) character if the level has the
* <code>LEVEL_OVERRIDE</code> bit set.<br><br>
* Except for that bit, it must be
- * {@code paraLevel<=embeddingLevels[]<=MAX_EXPLICIT_LEVEL},
+ * {@code paraLevel<=embeddingLevels[]<=MAX_EXPLICIT_LEVEL},
* with one exception: a level of zero may be specified for a
- * paragraph separator even if {@code paraLevel > 0} when multiple
+ * paragraph separator even if <code>paraLevel>0</code> when multiple
* paragraphs are submitted in the same call to <code>setPara()</code>.<br><br>
* <strong>Caution: </strong>A reference to this array, not a copy
* of the levels, will be stored in the <code>Bidi</code> object;
@@ -2444,22 +3521,28 @@
* @see #MAX_EXPLICIT_LEVEL
* @stable ICU 3.8
*/
- public void setPara(char[] chars, byte paraLevel, byte[] embeddingLevels)
+ void setPara(char[] chars, byte paraLevel, byte[] embeddingLevels)
{
/* check the argument values */
- if (paraLevel < INTERNAL_LEVEL_DEFAULT_LTR) {
+ if (paraLevel < LEVEL_DEFAULT_LTR) {
verifyRange(paraLevel, 0, MAX_EXPLICIT_LEVEL + 1);
}
if (chars == null) {
chars = new char[0];
}
+ /* special treatment for RUNS_ONLY mode */
+ if (reorderingMode == REORDER_RUNS_ONLY) {
+ setParaRunsOnly(chars, paraLevel);
+ return;
+ }
+
/* initialize the Bidi object */
this.paraBidi = null; /* mark unfinished setPara */
this.text = chars;
this.length = this.originalLength = this.resultLength = text.length;
this.paraLevel = paraLevel;
- this.direction = Bidi.DIRECTION_LEFT_TO_RIGHT;
+ this.direction = (byte)(paraLevel & 1);
this.paraCount = 1;
/* Allocate zero-length arrays instead of setting to null here; then
@@ -2475,11 +3558,7 @@
/*
* Save the original paraLevel if contextual; otherwise, set to 0.
*/
- if (IsDefaultLevel(paraLevel)) {
- defaultParaLevel = paraLevel;
- } else {
- defaultParaLevel = 0;
- }
+ defaultParaLevel = IsDefaultLevel(paraLevel) ? paraLevel : 0;
if (length == 0) {
/*
@@ -2491,17 +3570,10 @@
this.paraLevel &= 1;
defaultParaLevel = 0;
}
- if ((this.paraLevel & 1) != 0) {
- flags = DirPropFlag(R);
- direction = Bidi.DIRECTION_RIGHT_TO_LEFT;
- } else {
- flags = DirPropFlag(L);
- direction = Bidi.DIRECTION_LEFT_TO_RIGHT;
- }
-
+ flags = DirPropFlagLR(paraLevel);
runCount = 0;
paraCount = 0;
- paraBidi = this; /* mark successful setPara */
+ setParaSuccess();
return;
}
@@ -2515,21 +3587,9 @@
getDirPropsMemory(length);
dirProps = dirPropsMemory;
getDirProps();
-
/* the processed length may have changed if OPTION_STREAMING is set */
trailingWSStart = length; /* the levels[] will reflect the WS run */
- /* allocate paras memory */
- if (paraCount > 1) {
- getInitialParasMemory(paraCount);
- paras = parasMemory;
- paras[paraCount - 1] = length;
- } else {
- /* initialize paras for single paragraph */
- paras = simpleParas;
- simpleParas[0] = length;
- }
-
/* are explicit levels specified? */
if (embeddingLevels == null) {
/* no: determine explicit levels according to the (Xn) rules */
@@ -2542,28 +3602,62 @@
direction = checkExplicitLevels();
}
+ /* allocate isolate memory */
+ if (isolateCount > 0) {
+ if (isolates == null || isolates.length < isolateCount)
+ isolates = new Isolate[isolateCount + 3]; /* keep some reserve */
+ }
+ isolateCount = -1; /* current isolates stack entry == none */
+
/*
* The steps after (X9) in the Bidi algorithm are performed only if
* the paragraph text has mixed directionality!
*/
switch (direction) {
- case Bidi.DIRECTION_LEFT_TO_RIGHT:
- /* make sure paraLevel is even */
- paraLevel = (byte)((paraLevel + 1) & ~1);
-
+ case LTR:
/* all levels are implicitly at paraLevel (important for getLevels()) */
trailingWSStart = 0;
break;
- case Bidi.DIRECTION_RIGHT_TO_LEFT:
- /* make sure paraLevel is odd */
- paraLevel |= 1;
-
+ case RTL:
/* all levels are implicitly at paraLevel (important for getLevels()) */
trailingWSStart = 0;
break;
default:
- this.impTabPair = impTab_DEFAULT;
-
+ /*
+ * Choose the right implicit state table
+ */
+ switch(reorderingMode) {
+ case REORDER_DEFAULT:
+ this.impTabPair = impTab_DEFAULT;
+ break;
+ case REORDER_NUMBERS_SPECIAL:
+ this.impTabPair = impTab_NUMBERS_SPECIAL;
+ break;
+ case REORDER_GROUP_NUMBERS_WITH_R:
+ this.impTabPair = impTab_GROUP_NUMBERS_WITH_R;
+ break;
+ case REORDER_RUNS_ONLY:
+ /* we should never get here */
+ throw new InternalError("Internal ICU error in setPara");
+ /* break; */
+ case REORDER_INVERSE_NUMBERS_AS_L:
+ this.impTabPair = impTab_INVERSE_NUMBERS_AS_L;
+ break;
+ case REORDER_INVERSE_LIKE_DIRECT:
+ if ((reorderingOptions & OPTION_INSERT_MARKS) != 0) {
+ this.impTabPair = impTab_INVERSE_LIKE_DIRECT_WITH_MARKS;
+ } else {
+ this.impTabPair = impTab_INVERSE_LIKE_DIRECT;
+ }
+ break;
+ case REORDER_INVERSE_FOR_NUMBERS_SPECIAL:
+ if ((reorderingOptions & OPTION_INSERT_MARKS) != 0) {
+ this.impTabPair = impTab_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS;
+ } else {
+ this.impTabPair = impTab_INVERSE_FOR_NUMBERS_SPECIAL;
+ }
+ break;
+ }
/*
* If there are no external levels specified and there
* are no significant explicit level codes in the text,
@@ -2601,7 +3695,7 @@
/* the values for this run's start are the same as for the previous run's end */
start = limit;
level = nextLevel;
- if ((start > 0) && (NoContextRTL(dirProps[start - 1]) == B)) {
+ if ((start > 0) && (dirProps[start - 1] == B)) {
/* except if this is a new paragraph, then set sor = para level */
sor = GetLRFromLevel(GetParaLevelAt(start));
} else {
@@ -2609,7 +3703,9 @@
}
/* search for the limit of this run */
- while (++limit < length && levels[limit] == level) {}
+ while ((++limit < length) &&
+ ((levels[limit] == level) ||
+ ((DirPropFlag(dirProps[limit]) & MASK_BN_EXPLICIT) != 0))) {}
/* get the correct level of the next run */
if (limit < length) {
@@ -2619,7 +3715,7 @@
}
/* determine eor from max(level, nextLevel); sor is last run's eor */
- if ((level & ~INTERNAL_LEVEL_OVERRIDE) < (nextLevel & ~INTERNAL_LEVEL_OVERRIDE)) {
+ if (NoOverride(level) < NoOverride(nextLevel)) {
eor = GetLRFromLevel(nextLevel);
} else {
eor = GetLRFromLevel(level);
@@ -2627,12 +3723,12 @@
/* if the run consists of overridden directional types, then there
are no implicit types to be resolved */
- if ((level & INTERNAL_LEVEL_OVERRIDE) == 0) {
+ if ((level & LEVEL_OVERRIDE) == 0) {
resolveImplicitLevels(start, limit, sor, eor);
} else {
/* remove the LEVEL_OVERRIDE flags */
do {
- levels[start++] &= ~INTERNAL_LEVEL_OVERRIDE;
+ levels[start++] &= ~LEVEL_OVERRIDE;
} while (start < limit);
}
} while (limit < length);
@@ -2644,8 +3740,46 @@
break;
}
- resultLength += insertPoints.size;
- paraBidi = this; /* mark successful setPara */
+ /* add RLM for inverse Bidi with contextual orientation resolving
+ * to RTL which would not round-trip otherwise
+ */
+ if ((defaultParaLevel > 0) &&
+ ((reorderingOptions & OPTION_INSERT_MARKS) != 0) &&
+ ((reorderingMode == REORDER_INVERSE_LIKE_DIRECT) ||
+ (reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL))) {
+ int start, last;
+ byte level;
+ byte dirProp;
+ for (int i = 0; i < paraCount; i++) {
+ last = paras_limit[i] - 1;
+ level = paras_level[i];
+ if (level == 0)
+ continue; /* LTR paragraph */
+ start = i == 0 ? 0 : paras_limit[i - 1];
+ for (int j = last; j >= start; j--) {
+ dirProp = dirProps[j];
+ if (dirProp == L) {
+ if (j < last) {
+ while (dirProps[last] == B) {
+ last--;
+ }
+ }
+ addPoint(last, RLM_BEFORE);
+ break;
+ }
+ if ((DirPropFlag(dirProp) & MASK_R_AL) != 0) {
+ break;
+ }
+ }
+ }
+ }
+
+ if ((reorderingOptions & OPTION_REMOVE_CONTROLS) != 0) {
+ resultLength -= controlCount;
+ } else {
+ resultLength += insertPoints.size;
+ }
+ setParaSuccess();
}
/**
@@ -2682,7 +3816,7 @@
* For example, in pure LTR text with numbers the numbers would get
* a resolved level of 2 higher than the surrounding text according to
* the algorithm. This implementation may set all resolved levels to
- * the same value in such a case.
+ * the same value in such a case.
*
* @param paragraph a paragraph of text with optional character and
* paragraph attribute information
@@ -2693,13 +3827,14 @@
byte paraLvl;
char ch = paragraph.first();
Boolean runDirection =
- (Boolean) paragraph.getAttribute(TextAttributeConstants.RUN_DIRECTION);
+ (Boolean) paragraph.getAttribute(TextAttributeConstants.RUN_DIRECTION);
Object shaper = paragraph.getAttribute(TextAttributeConstants.NUMERIC_SHAPING);
+
if (runDirection == null) {
- paraLvl = INTERNAL_LEVEL_DEFAULT_LTR;
+ paraLvl = LEVEL_DEFAULT_LTR;
} else {
paraLvl = (runDirection.equals(TextAttributeConstants.RUN_DIRECTION_LTR)) ?
- (byte)Bidi.DIRECTION_LEFT_TO_RIGHT : (byte)Bidi.DIRECTION_RIGHT_TO_LEFT;
+ LTR : RTL;
}
byte[] lvls = null;
@@ -2717,7 +3852,7 @@
/* no-op */
} else if (level < 0) {
lvls = embeddingLevels;
- embeddingLevels[i] = (byte)((0 - level) | INTERNAL_LEVEL_OVERRIDE);
+ embeddingLevels[i] = (byte)((0 - level) | LEVEL_OVERRIDE);
} else {
lvls = embeddingLevels;
embeddingLevels[i] = level;
@@ -2751,7 +3886,7 @@
* @see #setPara
* @stable ICU 3.8
*/
- private void orderParagraphsLTR(boolean ordarParaLTR) {
+ public void orderParagraphsLTR(boolean ordarParaLTR) {
orderParagraphsLTR = ordarParaLTR;
}
@@ -2771,7 +3906,7 @@
* @see #MIXED
* @stable ICU 3.8
*/
- private byte getDirection()
+ public byte getDirection()
{
verifyValidParaOrLine();
return direction;
@@ -2819,31 +3954,25 @@
}
/**
- * Get the index of a paragraph, given a position within the text.
- *
- * @param charIndex is the index of a character within the text, in the
- * range <code>[0..getProcessedLength()-1]</code>.
+ * Retrieves the Bidi class for a given code point.
+ * <p>The class is read from <code>bdp.getClass(c)</code>; a result of
+ * <code>CHAR_DIRECTION_COUNT</code> or greater is replaced by
+ * <code>ON</code>. No <code>BidiClassifier</code> is consulted here.</p>
*
- * @return The index of the paragraph containing the specified position,
- * starting from 0.
+ * @param c The code point to get a Bidi class for.
*
- * @throws IllegalStateException if this call is not preceded by a successful
- * call to <code>setPara</code> or <code>setLine</code>
- * @throws IllegalArgumentException if charIndex is not within the legal range
+ * @return The Bidi class for the character <code>c</code> that is in effect
+ * for this <code>Bidi</code> instance.
*
- * @see com.ibm.icu.text.BidiRun
- * @see #getProcessedLength
* @stable ICU 3.8
*/
- public int getParagraphIndex(int charIndex)
- {
- verifyValidParaOrLine();
- BidiBase bidi = paraBidi; /* get Para object if Line object */
- verifyRange(charIndex, 0, bidi.length);
- int paraIndex;
- for (paraIndex = 0; charIndex >= bidi.paras[paraIndex]; paraIndex++) {
- }
- return paraIndex;
+ public int getCustomizedClass(int c) {
+ int dir;
+
+ dir = bdp.getClass(c);
+ if (dir >= CHAR_DIRECTION_COUNT)
+ dir = ON;
+ return dir;
}
/**
@@ -2891,7 +4020,7 @@
verifyRange(start, 0, limit);
verifyRange(limit, 0, length+1);
- return BidiLine.setLine(bidi, this, newBidi, newBidiBase, start, limit);
+ return BidiLine.setLine(this, newBidi, newBidiBase, start, limit);
}
/**
@@ -2911,9 +4040,11 @@
*/
public byte getLevelAt(int charIndex)
{
+ // for backward compatibility
if (charIndex < 0 || charIndex >= length) {
return (byte)getBaseLevel();
}
+
verifyValidParaOrLine();
verifyRange(charIndex, 0, length);
return BidiLine.getLevelAt(this, charIndex);
@@ -2932,7 +4063,7 @@
* call to <code>setPara</code> or <code>setLine</code>
* @stable ICU 3.8
*/
- private byte[] getLevels()
+ byte[] getLevels()
{
verifyValidParaOrLine();
if (length <= 0) {
@@ -2963,6 +4094,78 @@
}
/**
+ *
+ * Get a <code>BidiRun</code> object according to its index. BidiRun methods
+ * may be used to retrieve the run's logical start, length and level,
+ * which can be even for an LTR run or odd for an RTL run.
+ * In an RTL run, the character at the logical start is
+ * visually on the right of the displayed run.
+ * The length is the number of characters in the run.<p>
+ * <code>countRuns()</code> is normally called
+ * before the runs are retrieved.
+ *
+ * <p>
+ * Example:
+ * <pre>
+ * Bidi bidi = new Bidi();
+ * String text = "abc 123 DEFG xyz";
+ * bidi.setPara(text, Bidi.RTL, null);
+ * int i, count=bidi.countRuns(), logicalStart, visualIndex=0, length;
+ * BidiRun run;
+ * for (i = 0; i < count; ++i) {
+ * run = bidi.getVisualRun(i);
+ * logicalStart = run.getStart();
+ * length = run.getLength();
+ * if (Bidi.LTR == run.getEmbeddingLevel()) {
+ * do { // LTR
+ * show_char(text.charAt(logicalStart++), visualIndex++);
+ * } while (--length > 0);
+ * } else {
+ * logicalStart += length; // logicalLimit
+ * do { // RTL
+ * show_char(text.charAt(--logicalStart), visualIndex++);
+ * } while (--length > 0);
+ * }
+ * }
+ * </pre>
+ * <p>
+ * Note that in right-to-left runs, code like this places
+ * second surrogates before first ones (which is generally a bad idea)
+ * and combining characters before base characters.
+ * <p>
+ * Use of <code>{@link #writeReordered}</code>, optionally with the
+ * <code>{@link #KEEP_BASE_COMBINING}</code> option, can be considered in
+ * order to avoid these issues.
+ *
+ * @param runIndex is the number of the run in visual order, in the
+ * range <code>[0..countRuns()-1]</code>.
+ *
+ * @return a BidiRun object containing the details of the run. The
+ * directionality of the run is
+ * <code>LTR==0</code> or <code>RTL==1</code>,
+ * never <code>MIXED</code>.
+ *
+ * @throws IllegalStateException if this call is not preceded by a successful
+ * call to <code>setPara</code> or <code>setLine</code>
+ * @throws IllegalArgumentException if <code>runIndex</code> is not in
+ * the range {@code 0<=runIndex<countRuns()}
+ *
+ * @see #countRuns()
+ * @see com.ibm.icu.text.BidiRun
+ * @see com.ibm.icu.text.BidiRun#getStart()
+ * @see com.ibm.icu.text.BidiRun#getLength()
+ * @see com.ibm.icu.text.BidiRun#getEmbeddingLevel()
+ * @stable ICU 3.8
+ */
+ BidiRun getVisualRun(int runIndex)
+ {
+ verifyValidParaOrLine();
+ BidiLine.getRuns(this);
+ verifyRange(runIndex, 0, runCount);
+ return BidiLine.getVisualRun(this, runIndex);
+ }
+
+ /**
* Get a visual-to-logical index map (array) for the characters in the
* <code>Bidi</code> (paragraph or line) object.
* <p>
@@ -3031,19 +4234,10 @@
* Constant indicating that the base direction depends on the first strong
* directional character in the text according to the Unicode Bidirectional
* Algorithm. If no strong directional character is present, the base
- * direction is left-to-right.
- * @stable ICU 3.8
- */
- private static final int INTERNAL_DIRECTION_DEFAULT_LEFT_TO_RIGHT = 0x7e;
-
- /**
- * Constant indicating that the base direction depends on the first strong
- * directional character in the text according to the Unicode Bidirectional
- * Algorithm. If no strong directional character is present, the base
* direction is right-to-left.
* @stable ICU 3.8
*/
- private static final int INTERMAL_DIRECTION_DEFAULT_RIGHT_TO_LEFT = 0x7f;
+ public static final int DIRECTION_DEFAULT_RIGHT_TO_LEFT = LEVEL_DEFAULT_RTL;
/**
* Create Bidi from the given text, embedding, and direction information.
@@ -3080,27 +4274,27 @@
* @stable ICU 3.8
*/
public BidiBase(char[] text,
- int textStart,
- byte[] embeddings,
- int embStart,
- int paragraphLength,
- int flags)
- {
+ int textStart,
+ byte[] embeddings,
+ int embStart,
+ int paragraphLength,
+ int flags)
+ {
this(0, 0);
byte paraLvl;
switch (flags) {
case Bidi.DIRECTION_LEFT_TO_RIGHT:
default:
- paraLvl = Bidi.DIRECTION_LEFT_TO_RIGHT;
+ paraLvl = LTR;
break;
case Bidi.DIRECTION_RIGHT_TO_LEFT:
- paraLvl = Bidi.DIRECTION_RIGHT_TO_LEFT;
+ paraLvl = RTL;
break;
case Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT:
- paraLvl = INTERNAL_LEVEL_DEFAULT_LTR;
+ paraLvl = LEVEL_DEFAULT_LTR;
break;
case Bidi.DIRECTION_DEFAULT_RIGHT_TO_LEFT:
- paraLvl = INTERNAL_LEVEL_DEFAULT_RTL;
+ paraLvl = LEVEL_DEFAULT_RTL;
break;
}
byte[] paraEmbeddings;
@@ -3112,7 +4306,7 @@
for (int i = 0; i < paragraphLength; i++) {
lev = embeddings[i + embStart];
if (lev < 0) {
- lev = (byte)((- lev) | INTERNAL_LEVEL_OVERRIDE);
+ lev = (byte)((- lev) | LEVEL_OVERRIDE);
} else if (lev == 0) {
lev = paraLvl;
if (paraLvl > MAX_EXPLICIT_LEVEL) {
@@ -3122,13 +4316,10 @@
paraEmbeddings[i] = lev;
}
}
- if (textStart == 0 && embStart == 0 && paragraphLength == text.length) {
- setPara(text, paraLvl, paraEmbeddings);
- } else {
- char[] paraText = new char[paragraphLength];
- System.arraycopy(text, textStart, paraText, 0, paragraphLength);
- setPara(paraText, paraLvl, paraEmbeddings);
- }
+
+ char[] paraText = new char[paragraphLength];
+ System.arraycopy(text, textStart, paraText, 0, paragraphLength);
+ setPara(paraText, paraLvl, paraEmbeddings);
}
/**
@@ -3148,7 +4339,7 @@
}
/**
- * Return true if the line is all left-to-right text and the base direction
+ * Return true if the line is all left-to-right text and the base direction
* is left-to-right.
*
* @return true if the line is all left-to-right text and the base direction
@@ -3160,7 +4351,7 @@
*/
public boolean isLeftToRight()
{
- return (getDirection() == Bidi.DIRECTION_LEFT_TO_RIGHT && (paraLevel & 1) == 0);
+ return (getDirection() == LTR && (paraLevel & 1) == 0);
}
/**
@@ -3176,7 +4367,7 @@
*/
public boolean isRightToLeft()
{
- return (getDirection() == Bidi.DIRECTION_RIGHT_TO_LEFT && (paraLevel & 1) == 1);
+ return (getDirection() == RTL && (paraLevel & 1) == 1);
}
/**
@@ -3191,7 +4382,7 @@
*/
public boolean baseIsLeftToRight()
{
- return (getParaLevel() == Bidi.DIRECTION_LEFT_TO_RIGHT);
+ return (getParaLevel() == LTR);
}
/**
@@ -3212,8 +4403,8 @@
/**
* Compute the logical to visual run mapping
*/
- private void getLogicalToVisualRunsMap()
- {
+ void getLogicalToVisualRunsMap()
+ {
if (isGoodLogicalToVisualRunsMap) {
return;
}
@@ -3231,9 +4422,8 @@
for (i = 0; i < count; i++) {
logicalToVisualRunsMap[i] = (int)(keys[i] & 0x00000000FFFFFFFF);
}
- keys = null;
isGoodLogicalToVisualRunsMap = true;
- }
+ }
/**
* Return the level of the nth logical run in this line.
@@ -3252,9 +4442,12 @@
{
verifyValidParaOrLine();
BidiLine.getRuns(this);
+
+ // for backward compatibility
if (run < 0 || run >= runCount) {
return getParaLevel();
}
+
getLogicalToVisualRunsMap();
return runs[logicalToVisualRunsMap[run]].level;
}
@@ -3277,12 +4470,14 @@
{
verifyValidParaOrLine();
BidiLine.getRuns(this);
+
+ // for backward compatibility
if (runCount == 1) {
return 0;
} else if (run == runCount) {
return length;
}
- verifyIndex(run, 0, runCount);
+
getLogicalToVisualRunsMap();
return runs[logicalToVisualRunsMap[run]].start;
}
@@ -3306,10 +4501,12 @@
{
verifyValidParaOrLine();
BidiLine.getRuns(this);
+
+ // for backward compatibility
if (runCount == 1) {
return length;
}
- verifyIndex(run, 0, runCount);
+
getLogicalToVisualRunsMap();
int idx = logicalToVisualRunsMap[run];
int len = idx == 0 ? runs[idx].limit :
@@ -3336,7 +4533,7 @@
int start,
int limit)
{
- final int RTLMask = (1 << Bidi.DIRECTION_RIGHT_TO_LEFT |
+ final int RTLMask = (1 << R |
1 << AL |
1 << RLE |
1 << RLO |
@@ -3346,6 +4543,7 @@
throw new IllegalArgumentException("Value start " + start +
" is out of range 0 to " + limit);
}
+
for (int i = start; i < limit; ++i) {
if (Character.isHighSurrogate(text[i]) && i < (limit-1) &&
Character.isLowSurrogate(text[i+1])) {
@@ -3356,6 +4554,7 @@
return true;
}
}
+
return false;
}
@@ -3382,8 +4581,9 @@
int objectStart,
int count)
{
+ // for backward compatibility
if (0 > levelStart || levels.length <= levelStart) {
- throw new IllegalArgumentException("Value levelStart " +
+ throw new IllegalArgumentException("Value levelStart " +
levelStart + " is out of range 0 to " +
(levels.length-1));
}
@@ -3397,6 +4597,7 @@
levelStart + " is out of range 0 to " +
(objects.length - objectStart));
}
+
byte[] reorderLevels = new byte[count];
System.arraycopy(levels, levelStart, reorderLevels, 0, count);
int[] indexMap = reorderVisual(reorderLevels);
@@ -3408,6 +4609,74 @@
}
/**
+ * Take a <code>Bidi</code> object containing the reordering
+ * information for a piece of text (one or more paragraphs) set by
+ * <code>setPara()</code> or for a line of text set by <code>setLine()</code>
+ * and return a string containing the reordered text.
+ *
+ * <p>The text may have been aliased (only a reference was stored
+ * without copying the contents), thus it must not have been modified
+ * since the <code>setPara()</code> call.</p>
+ *
+ * This method preserves the integrity of characters with multiple
+ * code units and (optionally) combining characters.
+ * Characters in RTL runs can be replaced by mirror-image characters
+ * in the returned string. Note that "real" mirroring has to be done in a
+ * rendering engine by glyph selection and that for many "mirrored"
+ * characters there are no Unicode characters as mirror-image equivalents.
+ * There are also options to insert or remove Bidi control
+ * characters; see the descriptions of the return value and the
+ * <code>options</code> parameter, and of the option bit flags.
+ *
+ * @param options A bit set of options for the reordering that control
+ * how the reordered text is written.
+ * The options include mirroring the characters on a code
+ * point basis and inserting LRM characters, which is used
+ * especially for transforming visually stored text
+ * to logically stored text (although this is still an
+ * imperfect implementation of an "inverse Bidi" algorithm
+ * because it uses the "forward Bidi" algorithm at its core).
+ * The available options are:
+ * <code>DO_MIRRORING</code>,
+ * <code>INSERT_LRM_FOR_NUMERIC</code>,
+ * <code>KEEP_BASE_COMBINING</code>,
+ * <code>OUTPUT_REVERSE</code>,
+ * <code>REMOVE_BIDI_CONTROLS</code>,
+ * <code>OPTION_STREAMING</code>
+ *
+ * @return The reordered text.
+ * If the <code>INSERT_LRM_FOR_NUMERIC</code> option is set, then
+ * the length of the returned string could be as large as
+ * <code>getLength()+2*countRuns()</code>.<br>
+ * If the <code>REMOVE_BIDI_CONTROLS</code> option is set, then the
+ * length of the returned string may be less than
+ * <code>getLength()</code>.<br>
+ * If none of these options is set, then the length of the returned
+ * string will be exactly <code>getProcessedLength()</code>.
+ *
+ * @throws IllegalStateException if this call is not preceded by a successful
+ * call to <code>setPara</code> or <code>setLine</code>
+ *
+ * @see #DO_MIRRORING
+ * @see #INSERT_LRM_FOR_NUMERIC
+ * @see #KEEP_BASE_COMBINING
+ * @see #OUTPUT_REVERSE
+ * @see #REMOVE_BIDI_CONTROLS
+ * @see #OPTION_STREAMING
+ * @see #getProcessedLength
+ * @stable ICU 3.8
+ */
+ public String writeReordered(int options)
+ {
+ verifyValidParaOrLine();
+ if (length == 0) {
+ /* nothing to do */
+ return "";
+ }
+ return BidiWriter.writeReordered(this, options);
+ }
+
+ /**
* Display the bidi internal state, used in debugging.
*/
public String toString() {
@@ -3507,4 +4776,5 @@
}
}
}
+
}
--- a/jdk/src/java.base/share/classes/sun/text/bidi/BidiLine.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/bidi/BidiLine.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,17 +22,13 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
+*******************************************************************************
+* Copyright (C) 2001-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+*/
/* Written by Simon Montagu, Matitiahu Allouche
* (ported from C code written by Markus W. Scherer)
*/
@@ -42,7 +38,7 @@
import java.text.Bidi;
import java.util.Arrays;
-public final class BidiLine {
+final class BidiLine {
/*
* General remarks about the functions in this file:
@@ -122,13 +118,13 @@
level of B chars from 0 to paraLevel in getLevels when
orderParagraphsLTR==TRUE
*/
- if (BidiBase.NoContextRTL(dirProps[start - 1]) == BidiBase.B) {
+ if (dirProps[start - 1] == BidiBase.B) {
bidiBase.trailingWSStart = start; /* currently == bidiBase.length */
return;
}
/* go backwards across all WS, BN, explicit codes */
while (start > 0 &&
- (BidiBase.DirPropFlagNC(dirProps[start - 1]) & BidiBase.MASK_WS) != 0) {
+ (BidiBase.DirPropFlag(dirProps[start - 1]) & BidiBase.MASK_WS) != 0) {
--start;
}
@@ -140,13 +136,11 @@
bidiBase.trailingWSStart=start;
}
- public static Bidi setLine(Bidi bidi, BidiBase paraBidi,
- Bidi newBidi, BidiBase newBidiBase,
- int start, int limit) {
+ static Bidi setLine(BidiBase paraBidi,
+ Bidi newBidi, BidiBase lineBidi,
+ int start, int limit) {
int length;
- BidiBase lineBidi = newBidiBase;
-
/* set the values in lineBidi from its paraBidi parent */
/* class members are already initialized to 0 */
// lineBidi.paraBidi = null; /* mark unfinished setLine */
@@ -161,6 +155,8 @@
lineBidi.paraLevel = paraBidi.GetParaLevelAt(start);
lineBidi.paraCount = paraBidi.paraCount;
lineBidi.runs = new BidiRun[0];
+ lineBidi.reorderingMode = paraBidi.reorderingMode;
+ lineBidi.reorderingOptions = paraBidi.reorderingOptions;
if (paraBidi.controlCount > 0) {
int j;
for (j = start; j < limit; j++) {
@@ -206,7 +202,7 @@
setTrailingWSStart(lineBidi);
trailingWSStart = lineBidi.trailingWSStart;
- /* recalculate lineBidi.direction */
+ /* recalculate lineBidi.direction */
if (trailingWSStart == 0) {
/* all levels are at paraLevel */
lineBidi.direction = (byte)(lineBidi.paraLevel & 1);
@@ -260,7 +256,8 @@
}
}
- newBidiBase.paraBidi = paraBidi; /* mark successful setLine */
+ lineBidi.paraBidi = paraBidi; /* mark successful setLine */
+
return newBidi;
}
@@ -303,30 +300,19 @@
return bidiBase.levels;
}
- static BidiRun getLogicalRun(BidiBase bidiBase, int logicalPosition)
- {
- /* this is done based on runs rather than on levels since levels have
- a special interpretation when REORDER_RUNS_ONLY
- */
- BidiRun newRun = new BidiRun(), iRun;
- getRuns(bidiBase);
- int runCount = bidiBase.runCount;
- int visualStart = 0, logicalLimit = 0;
- iRun = bidiBase.runs[0];
+ static BidiRun getVisualRun(BidiBase bidiBase, int runIndex) {
+ int start = bidiBase.runs[runIndex].start;
+ int limit;
+ byte level = bidiBase.runs[runIndex].level;
- for (int i = 0; i < runCount; i++) {
- iRun = bidiBase.runs[i];
- logicalLimit = iRun.start + iRun.limit - visualStart;
- if ((logicalPosition >= iRun.start) &&
- (logicalPosition < logicalLimit)) {
- break;
- }
- visualStart = iRun.limit;
+ if (runIndex > 0) {
+ limit = start +
+ bidiBase.runs[runIndex].limit -
+ bidiBase.runs[runIndex - 1].limit;
+ } else {
+ limit = start + bidiBase.runs[0].limit;
}
- newRun.start = iRun.start;
- newRun.limit = logicalLimit;
- newRun.level = iRun.level;
- return newRun;
+ return new BidiRun(start, limit, level);
}
/* in trivial cases there is only one trivial run; called by getRuns() */
@@ -502,7 +488,7 @@
int length = bidiBase.length, limit;
byte[] levels = bidiBase.levels;
int i, runCount;
- byte level = BidiBase.INTERNAL_LEVEL_DEFAULT_LTR; /* initialize with no valid level */
+ byte level = -1; /* initialize with no valid level */
/*
* If there are WS characters at the end of the line
* and the run preceding them has a level different from
@@ -651,7 +637,7 @@
maxLevel = 0;
for (start = levels.length; start>0; ) {
level = levels[--start];
- if (level > BidiBase.MAX_EXPLICIT_LEVEL + 1) {
+ if (level < 0 || level > (BidiBase.MAX_EXPLICIT_LEVEL + 1)) {
return null;
}
if (level < minLevel) {
--- a/jdk/src/java.base/share/classes/sun/text/bidi/BidiRun.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/bidi/BidiRun.java Wed Jul 15 11:05:51 2015 +0900
@@ -55,7 +55,7 @@
*
* @see com.ibm.icu.text.Bidi
*/
-public class BidiRun {
+class BidiRun {
int start; /* first logical position of the run */
int limit; /* last visual position of the run +1 */
@@ -106,7 +106,7 @@
/**
* Get level of run
*/
- public byte getEmbeddingLevel()
+ byte getEmbeddingLevel()
{
return level;
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/text/bidi/BidiWriter.java Wed Jul 15 11:05:51 2015 +0900
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+*******************************************************************************
+* Copyright (C) 2001-2010, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+*/
+/* Written by Simon Montagu, Matitiahu Allouche
+ * (ported from C code written by Markus W. Scherer)
+ */
+
+package sun.text.bidi;
+
+import sun.text.normalizer.UCharacter;
+import sun.text.normalizer.UTF16;
+
+final class BidiWriter {
+
+ /** Bidi control code points */
+ static final char LRM_CHAR = 0x200e;
+ static final char RLM_CHAR = 0x200f;
+ static final int MASK_R_AL = (1 << UCharacter.RIGHT_TO_LEFT |
+ 1 << UCharacter.RIGHT_TO_LEFT_ARABIC);
+
+    private static boolean IsCombining(int type) { /* non-spacing, spacing-combining or enclosing mark? */
+        final int marks = (1 << UCharacter.NON_SPACING_MARK)
+                        | (1 << UCharacter.COMBINING_SPACING_MARK)
+                        | (1 << UCharacter.ENCLOSING_MARK);
+        return ((1 << type) & marks) != 0;
+    }
+
+    /*
+     * Copies one run in forward order, applying REMOVE_BIDI_CONTROLS and
+     * DO_MIRRORING as requested; an empty src yields an empty result.
+     * When OUTPUT_REVERSE is set on writeReordered(), RTL runs are
+     * semantically written in reverse and later reversed again, so we
+     * write them in forward order to begin with. If such a run is to be
+     * mirrored, it must be mirrored here because the implicit second
+     * reversal must not do it; hence the mirroring in what looks like LTR output.
+     */
+    private static String doWriteForward(String src, int options) {
+        /* optimize for several combinations of options */
+        switch (options & (BidiBase.REMOVE_BIDI_CONTROLS | BidiBase.DO_MIRRORING)) {
+        case 0: {
+            /* simply return the LTR run */
+            return src;
+        }
+        case BidiBase.DO_MIRRORING: {
+            StringBuffer dest = new StringBuffer(src.length());
+
+            /* do mirroring, one code point at a time */
+            int i = 0;
+            int c;
+
+            while (i < src.length()) {
+                c = UTF16.charAt(src, i);
+                i += UTF16.getCharCount(c);
+                UTF16.append(dest, UCharacter.getMirror(c));
+            }
+            return dest.toString();
+        }
+        case BidiBase.REMOVE_BIDI_CONTROLS: {
+            StringBuilder dest = new StringBuilder(src.length());
+
+            /* copy the LTR run and remove any Bidi control characters */
+            int i = 0;
+            char c;
+            while (i < src.length()) {
+                c = src.charAt(i++);
+                if (!BidiBase.IsBidiControlChar(c)) {
+                    dest.append(c);
+                }
+            }
+            return dest.toString();
+        }
+        default: {
+            StringBuffer dest = new StringBuffer(src.length());
+
+            /* remove Bidi control characters and do mirroring */
+            int i = 0;
+            int c;
+            while (i < src.length()) {
+                c = UTF16.charAt(src, i);
+                i += UTF16.getCharCount(c);
+                if (!BidiBase.IsBidiControlChar(c)) {
+                    UTF16.append(dest, UCharacter.getMirror(c));
+                }
+            }
+            return dest.toString();
+        }
+        } /* end of switch */
+    }
+
+    private static String doWriteForward(char[] text, int start, int limit, int options) {
+        final String run = new String(text, start, limit - start); /* chars [start, limit) */
+        return doWriteForward(run, options);
+    }
+
+    static String writeReverse(String src, int options) {
+        /*
+         * RTL run - an empty src produces an empty result.
+         *
+         * RTL runs need to be copied to the destination in reverse order
+         * of code points, not code units, to keep Unicode characters intact.
+         *
+         * The general strategy for this is to read the source text
+         * in backward order, collect all code units for a code point
+         * (and optionally following combining characters, see below),
+         * and copy all these code units in ascending order
+         * to the destination for this run.
+         *
+         * Several options request whether combining characters
+         * should be kept after their base characters,
+         * whether Bidi control characters should be removed, and
+         * whether characters should be replaced by their mirror-image
+         * equivalent Unicode characters.
+         */
+        StringBuffer dest = new StringBuffer(src.length());
+
+        /* optimize for several combinations of options */
+        switch (options &
+                (BidiBase.REMOVE_BIDI_CONTROLS |
+                 BidiBase.DO_MIRRORING |
+                 BidiBase.KEEP_BASE_COMBINING)) {
+
+        case 0:
+            /*
+             * With none of the "complicated" options set, the destination
+             * run will have the same length as the source run,
+             * and there is no mirroring and no keeping combining characters
+             * with their base characters.
+             *
+             * XXX: or dest = UTF16.reverse(new StringBuffer(src));
+             */
+
+            int srcLength = src.length();
+
+            /* preserve character integrity */
+            while (srcLength > 0) {
+                /* i is always after the last code unit known to need to be kept
+                 *  in this segment */
+                int i = srcLength;
+
+                /* collect code units for one base character */
+                srcLength -= UTF16.getCharCount(UTF16.charAt(src,
+                                                             srcLength - 1));
+
+                /* copy this base character */
+                dest.append(src.substring(srcLength, i));
+            }
+            break;
+
+        case BidiBase.KEEP_BASE_COMBINING:
+            /*
+             * Here, too, the destination
+             * run will have the same length as the source run,
+             * and there is no mirroring.
+             * We do need to keep combining characters with their base
+             * characters.
+             */
+            srcLength = src.length();
+
+            /* preserve character integrity */
+            while (srcLength > 0) {
+                /* i is always after the last code unit known to need to be kept
+                 *  in this segment */
+                int c;
+                int i = srcLength;
+
+                /* collect code units and combining marks for one base
+                 * character (the loop body runs only while srcLength > 0) */
+                do {
+                    c = UTF16.charAt(src, srcLength - 1);
+                    srcLength -= UTF16.getCharCount(c);
+                } while (srcLength > 0 && IsCombining(UCharacter.getType(c)));
+
+                /* copy this "user character" */
+                dest.append(src.substring(srcLength, i));
+            }
+            break;
+
+        default:
+            /*
+             * With several "complicated" options set, this is the most
+             * general and the slowest copying of an RTL run.
+             * We will do mirroring, remove Bidi controls, and
+             * keep combining characters with their base characters
+             * as requested.
+             */
+            srcLength = src.length();
+
+            /* preserve character integrity */
+            while (srcLength > 0) {
+                /* i is always after the last code unit known to need to be kept
+                 *  in this segment */
+                int i = srcLength;
+
+                /* collect code units for one base character */
+                int c = UTF16.charAt(src, srcLength - 1);
+                srcLength -= UTF16.getCharCount(c);
+                if ((options & BidiBase.KEEP_BASE_COMBINING) != 0) {
+                    /* collect combining marks for this base character */
+                    while (srcLength > 0 && IsCombining(UCharacter.getType(c))) {
+                        c = UTF16.charAt(src, srcLength - 1);
+                        srcLength -= UTF16.getCharCount(c);
+                    }
+                }
+
+                if ((options & BidiBase.REMOVE_BIDI_CONTROLS) != 0 &&
+                    BidiBase.IsBidiControlChar(c)) {
+                    /* do not copy this Bidi control character */
+                    continue;
+                }
+
+                /* copy this "user character" */
+                int j = srcLength;
+                if ((options & BidiBase.DO_MIRRORING) != 0) {
+                    /* mirror only the base character */
+                    c = UCharacter.getMirror(c);
+                    UTF16.append(dest, c);
+                    j += UTF16.getCharCount(c);
+                }
+                dest.append(src.substring(j, i));
+            }
+            break;
+        } /* end of switch */
+
+        return dest.toString();
+    }
+
+    static String doWriteReverse(char[] text, int start, int limit, int options) {
+        return writeReverse(String.valueOf(text, start, limit - start), options); /* chars [start, limit) */
+    }
+
+ static String writeReordered(BidiBase bidi, int options) { /* returns the text of bidi, copied visual run by visual run */
+ int run, runCount;
+ StringBuilder dest;
+ char[] text = bidi.text;
+ runCount = bidi.countRuns();
+
+ /*
+ * Option "insert marks" implies BidiBase.INSERT_LRM_FOR_NUMERIC if the
+ * reordering mode (checked below) is appropriate.
+ */
+ if ((bidi.reorderingOptions & BidiBase.OPTION_INSERT_MARKS) != 0) {
+ options |= BidiBase.INSERT_LRM_FOR_NUMERIC;
+ options &= ~BidiBase.REMOVE_BIDI_CONTROLS;
+ }
+ /*
+ * Option "remove controls" implies BidiBase.REMOVE_BIDI_CONTROLS
+ * and cancels BidiBase.INSERT_LRM_FOR_NUMERIC.
+ */
+ if ((bidi.reorderingOptions & BidiBase.OPTION_REMOVE_CONTROLS) != 0) {
+ options |= BidiBase.REMOVE_BIDI_CONTROLS;
+ options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC;
+ }
+ /*
+ * If we do not perform the "inverse Bidi" algorithm, then we
+ * don't need to insert any LRMs, and don't need to test for it.
+ */
+ if ((bidi.reorderingMode != BidiBase.REORDER_INVERSE_NUMBERS_AS_L) &&
+ (bidi.reorderingMode != BidiBase.REORDER_INVERSE_LIKE_DIRECT) &&
+ (bidi.reorderingMode != BidiBase.REORDER_INVERSE_FOR_NUMBERS_SPECIAL) &&
+ (bidi.reorderingMode != BidiBase.REORDER_RUNS_ONLY)) {
+ options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC;
+ }
+ dest = new StringBuilder((options & BidiBase.INSERT_LRM_FOR_NUMERIC) != 0 ?
+ bidi.length * 2 : bidi.length);
+ /*
+ * Iterate through all visual runs and copy the run text segments to
+ * the destination, according to the options.
+ *
+ * The tests for where to insert LRMs ignore the fact that there may be
+ * BN codes or non-BMP code points at the beginning and end of a run;
+ * they may insert LRMs unnecessarily but the tests are faster this way
+ * (this would have to be improved for UTF-8).
+ */
+ if ((options & BidiBase.OUTPUT_REVERSE) == 0) {
+ /* forward output */
+ if ((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) {
+ /* do not insert Bidi controls */
+ for (run = 0; run < runCount; ++run) {
+ BidiRun bidiRun = bidi.getVisualRun(run);
+ if (bidiRun.isEvenRun()) {
+ dest.append(doWriteForward(text, bidiRun.start,
+ bidiRun.limit,
+ options & ~BidiBase.DO_MIRRORING));
+ } else {
+ dest.append(doWriteReverse(text, bidiRun.start,
+ bidiRun.limit, options));
+ }
+ }
+ } else {
+ /* insert Bidi controls for "inverse Bidi" */
+ byte[] dirProps = bidi.dirProps;
+ char uc;
+ int markFlag;
+
+ for (run = 0; run < runCount; ++run) {
+ BidiRun bidiRun = bidi.getVisualRun(run);
+ markFlag=0; /* dead store: overwritten by the next statement */
+ /* start from the marks recorded for this run in insertRemove */
+ markFlag = bidi.runs[run].insertRemove;
+ if (markFlag < 0) { /* negative value: a bidi controls count, not mark flags */
+ markFlag = 0;
+ }
+ if (bidiRun.isEvenRun()) {
+ if (bidi.isInverse() &&
+ dirProps[bidiRun.start] != BidiBase.L) {
+ markFlag |= BidiBase.LRM_BEFORE;
+ }
+ if ((markFlag & BidiBase.LRM_BEFORE) != 0) {
+ uc = LRM_CHAR;
+ } else if ((markFlag & BidiBase.RLM_BEFORE) != 0) {
+ uc = RLM_CHAR;
+ } else {
+ uc = 0;
+ }
+ if (uc != 0) {
+ dest.append(uc);
+ }
+ dest.append(doWriteForward(text,
+ bidiRun.start, bidiRun.limit,
+ options & ~BidiBase.DO_MIRRORING));
+
+ if (bidi.isInverse() &&
+ dirProps[bidiRun.limit - 1] != BidiBase.L) {
+ markFlag |= BidiBase.LRM_AFTER;
+ }
+ if ((markFlag & BidiBase.LRM_AFTER) != 0) {
+ uc = LRM_CHAR;
+ } else if ((markFlag & BidiBase.RLM_AFTER) != 0) {
+ uc = RLM_CHAR;
+ } else {
+ uc = 0;
+ }
+ if (uc != 0) {
+ dest.append(uc);
+ }
+ } else { /* RTL run */
+ if (bidi.isInverse() &&
+ !bidi.testDirPropFlagAt(MASK_R_AL,
+ bidiRun.limit - 1)) {
+ markFlag |= BidiBase.RLM_BEFORE;
+ }
+ if ((markFlag & BidiBase.LRM_BEFORE) != 0) {
+ uc = LRM_CHAR;
+ } else if ((markFlag & BidiBase.RLM_BEFORE) != 0) {
+ uc = RLM_CHAR;
+ } else {
+ uc = 0;
+ }
+ if (uc != 0) {
+ dest.append(uc);
+ }
+ dest.append(doWriteReverse(text, bidiRun.start,
+ bidiRun.limit, options));
+
+ if(bidi.isInverse() &&
+ (MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) {
+ markFlag |= BidiBase.RLM_AFTER;
+ }
+ if ((markFlag & BidiBase.LRM_AFTER) != 0) {
+ uc = LRM_CHAR;
+ } else if ((markFlag & BidiBase.RLM_AFTER) != 0) {
+ uc = RLM_CHAR;
+ } else {
+ uc = 0;
+ }
+ if (uc != 0) {
+ dest.append(uc);
+ }
+ }
+ }
+ }
+ } else {
+ /* reverse output */
+ if((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) {
+ /* do not insert Bidi controls */
+ for(run = runCount; --run >= 0; ) {
+ BidiRun bidiRun = bidi.getVisualRun(run);
+ if (bidiRun.isEvenRun()) {
+ dest.append(doWriteReverse(text,
+ bidiRun.start, bidiRun.limit,
+ options & ~BidiBase.DO_MIRRORING));
+ } else {
+ dest.append(doWriteForward(text, bidiRun.start,
+ bidiRun.limit, options));
+ }
+ }
+ } else {
+ /* insert Bidi controls for "inverse Bidi" */
+
+ byte[] dirProps = bidi.dirProps;
+
+ for (run = runCount; --run >= 0; ) {
+ /* reverse output */
+ BidiRun bidiRun = bidi.getVisualRun(run);
+ if (bidiRun.isEvenRun()) {
+ if (dirProps[bidiRun.limit - 1] != BidiBase.L) {
+ dest.append(LRM_CHAR);
+ }
+
+ dest.append(doWriteReverse(text, bidiRun.start,
+ bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
+
+ if (dirProps[bidiRun.start] != BidiBase.L) {
+ dest.append(LRM_CHAR);
+ }
+ } else {
+ if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) {
+ dest.append(RLM_CHAR);
+ }
+
+ dest.append(doWriteForward(text, bidiRun.start,
+ bidiRun.limit, options));
+
+ if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.limit - 1])) == 0) {
+ dest.append(RLM_CHAR);
+ }
+ }
+ }
+ }
+ }
+
+ return dest.toString();
+ }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/BMPSet.java Wed Jul 15 11:05:51 2015 +0900
@@ -0,0 +1,526 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ ******************************************************************************
+ *
+ * Copyright (C) 2009-2014, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *
+ ******************************************************************************
+ */
+
+package sun.text.normalizer;
+
+import sun.text.normalizer.UnicodeSet.SpanCondition;
+
+/**
+ * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
+ *
+ * Latin-1: Look up bytes.
+ * 2-byte characters: Bits organized vertically.
+ * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
+ * Supplementary characters: Call contains() on the parent set.
+ */
+final class BMPSet {
+
+ /**
+ * One boolean ('true' or 'false') per Latin-1 character.
+ */
+ private boolean[] latin1Contains;
+
+ /**
+ * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
+ * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
+ * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
+ *
+ * NOTE(review): the bits for 0..7F are not set to contains(FFFD) by any code in this class; initBits() fills
+ * this table only for U+0100..U+07FF, and contains()/span() answer the Latin-1 range from latin1Contains.
+ */
+ private int[] table7FF;
+
+ /**
+ * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
+ * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
+ * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
+ * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
+ * and set.contains(c) must be called.
+ *
+ * NOTE(review): nothing in this class sets the bits for 0..7FF or overrides D800..DFFF with contains(FFFD);
+ * initBits() fills this table from U+0800 up, straight from the inversion list.
+ */
+ private int[] bmpBlockBits;
+
+ /**
+ * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
+ * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
+ * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
+ */
+ private int[] list4kStarts;
+
+ /**
+ * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
+ * supplementary code points. The list is terminated with list[listLength-1]=0x110000.
+ */
+ private final int[] list;
+ private final int listLength; // length used; list may be longer to minimize reallocs
+
+    public BMPSet(final int[] parentList, int parentListLength) { /* builds the lookup tables for parentList */
+        list = parentList;
+        listLength = parentListLength;
+        latin1Contains = new boolean[0x100];
+        table7FF = new int[64];
+        bmpBlockBits = new int[64];
+        list4kStarts = new int[18];
+
+        /*
+         * Record the list indexes that bound the binary searches for U+0800, U+1000, U+2000, .., U+F000,
+         * U+10000. U+0800 is the first 3-byte-UTF-8 code point; lower code points are looked up in the bit
+         * tables. The last pair of indexes is for finding supplementary code points.
+         */
+        final int lastIndex = listLength - 1;
+        list4kStarts[0] = findCodePoint(0x800, 0, lastIndex);
+        for (int block = 1; block <= 0x10; ++block) {
+            list4kStarts[block] = findCodePoint(block << 12, list4kStarts[block - 1], lastIndex);
+        }
+        list4kStarts[0x11] = lastIndex;
+
+        initBits();
+    }
+
+    public boolean contains(int c) { /* membership test using the precomputed tables */
+        if (c <= 0xff) {
+            return latin1Contains[c];
+        }
+        if (c <= 0x7ff) {
+            final int leadBit = 1 << (c >> 6);
+            return (table7FF[c & 0x3f] & leadBit) != 0;
+        }
+        if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
+            final int lead = c >> 12;
+            final int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
+            // 0 or 1: all 64 code points with the same bits 15..6 are uniformly out of / in the set.
+            // Anything else: mixed block, look the code point up in its 4k block of code points.
+            return (twoBits <= 1)
+                ? (twoBits != 0)
+                : containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
+        }
+        if (c <= 0x10ffff) {
+            // surrogate or supplementary code point
+            return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
+        }
+        // Out-of-range code points get false, consistent with long-standing
+        // behavior of UnicodeSet.contains(c).
+        return false;
+    }
+
+ /**
+ * Span the initial substring for which each character c has spanCondition==contains(c): NOT_CONTAINED spans
+ * characters outside the set, every other SpanCondition value spans characters inside it.
+ *
+ * @param start The start index
+ * @param outCount If not null: Receives the number of code points in the span.
+ * @return the limit (exclusive end) of the span
+ *
+ * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
+ * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
+ * as usual in ICU.
+ */
+ public final int span(CharSequence s, int start, SpanCondition spanCondition,
+ OutputInt outCount) {
+ char c, c2;
+ int i = start;
+ int limit = s.length();
+ int numSupplementary = 0;
+ if (SpanCondition.NOT_CONTAINED != spanCondition) {
+ // span
+ while (i < limit) {
+ c = s.charAt(i);
+ if (c <= 0xff) {
+ if (!latin1Contains[c]) {
+ break;
+ }
+ } else if (c <= 0x7ff) {
+ if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
+ break;
+ }
+ } else if (c < 0xd800 ||
+ c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
+ int lead = c >> 12;
+ int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
+ if (twoBits <= 1) {
+ // All 64 code points with the same bits 15..6
+ // are either in the set or not.
+ if (twoBits == 0) {
+ break;
+ }
+ } else {
+ // Look up the code point in its 4k block of code points.
+ if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
+ break;
+ }
+ }
+ } else {
+ // surrogate pair
+ int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
+ if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
+ break;
+ }
+ ++numSupplementary;
+ ++i;
+ }
+ ++i;
+ }
+ } else {
+ // span not
+ while (i < limit) {
+ c = s.charAt(i);
+ if (c <= 0xff) {
+ if (latin1Contains[c]) {
+ break;
+ }
+ } else if (c <= 0x7ff) {
+ if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
+ break;
+ }
+ } else if (c < 0xd800 ||
+ c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
+ int lead = c >> 12;
+ int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
+ if (twoBits <= 1) {
+ // All 64 code points with the same bits 15..6
+ // are either in the set or not.
+ if (twoBits != 0) {
+ break;
+ }
+ } else {
+ // Look up the code point in its 4k block of code points.
+ if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
+ break;
+ }
+ }
+ } else {
+ // surrogate pair
+ int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
+ if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
+ break;
+ }
+ ++numSupplementary;
+ ++i;
+ }
+ ++i;
+ }
+ }
+ if (outCount != null) {
+ int spanLength = i - start;
+ outCount.value = spanLength - numSupplementary; // number of code points
+ }
+ return i;
+ }
+
+    /**
+     * Symmetrical with span(): spans the trailing substring of s[0, limit) whose characters c all satisfy
+     * spanCondition==contains(c) (NOT_CONTAINED spans non-members, any other value spans members). Requires
+     * 0 < limit <= s.length(). A trail surrogate is combined with a preceding lead surrogate; any other
+     * code unit, including U+E000..U+FFFF after an unpaired lead surrogate, is looked up by itself.
+     * @return The string index which starts the span (i.e. inclusive).
+     */
+    public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
+        char c, c2;
+
+        if (SpanCondition.NOT_CONTAINED != spanCondition) {
+            // span
+            for (;;) {
+                c = s.charAt(--limit);
+                if (c <= 0xff) {
+                    if (!latin1Contains[c]) {
+                        break;
+                    }
+                } else if (c <= 0x7ff) {
+                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
+                        break;
+                    }
+                } else if (c < 0xdc00 || c >= 0xe000 || // c is not a trail surrogate, or it has no lead before it
+                           0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
+                    int lead = c >> 12;
+                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
+                    if (twoBits <= 1) {
+                        // All 64 code points with the same bits 15..6
+                        // are either in the set or not.
+                        if (twoBits == 0) {
+                            break;
+                        }
+                    } else {
+                        // Look up the code point in its 4k block of code points.
+                        if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
+                            break;
+                        }
+                    }
+                } else {
+                    // surrogate pair: c2 is the lead, c the trail
+                    int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
+                    if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
+                        break;
+                    }
+                    --limit;
+                }
+                if (0 == limit) {
+                    return 0;
+                }
+            }
+        } else {
+            // span not
+            for (;;) {
+                c = s.charAt(--limit);
+                if (c <= 0xff) {
+                    if (latin1Contains[c]) {
+                        break;
+                    }
+                } else if (c <= 0x7ff) {
+                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
+                        break;
+                    }
+                } else if (c < 0xdc00 || c >= 0xe000 || // c is not a trail surrogate, or it has no lead before it
+                           0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
+                    int lead = c >> 12;
+                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
+                    if (twoBits <= 1) {
+                        // All 64 code points with the same bits 15..6
+                        // are either in the set or not.
+                        if (twoBits != 0) {
+                            break;
+                        }
+                    } else {
+                        // Look up the code point in its 4k block of code points.
+                        if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
+                            break;
+                        }
+                    }
+                } else {
+                    // surrogate pair: c2 is the lead, c the trail
+                    int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
+                    if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
+                        break;
+                    }
+                    --limit;
+                }
+                if (0 == limit) {
+                    return 0;
+                }
+            }
+        }
+        return limit + 1; // limit indexes the code unit that ended the span
+    }
+
+ /**
+ * Sets the bits for values v in [start, limit) in "vertical" organization: bit (v >> 6) of table[v & 0x3f]. start<limit<=0x800
+ */
+ private static void set32x64Bits(int[] table, int start, int limit) {
+ assert (64 == table.length);
+ int lead = start >> 6; // Named for UTF-8 2-byte lead byte with upper 5 bits.
+ int trail = start & 0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.
+
+ // Set one bit indicating an all-one block.
+ int bits = 1 << lead;
+ if ((start + 1) == limit) { // Single-character shortcut.
+ table[trail] |= bits;
+ return;
+ }
+
+ int limitLead = limit >> 6;
+ int limitTrail = limit & 0x3f;
+
+ if (lead == limitLead) {
+ // Partial vertical bit column.
+ while (trail < limitTrail) {
+ table[trail++] |= bits;
+ }
+ } else {
+ // Partial vertical bit column,
+ // followed by a bit rectangle,
+ // followed by another partial vertical bit column.
+ if (trail > 0) {
+ do {
+ table[trail++] |= bits;
+ } while (trail < 64);
+ ++lead;
+ }
+ if (lead < limitLead) {
+ bits = ~((1 << lead) - 1);
+ if (limitLead < 0x20) {
+ bits &= (1 << limitLead) - 1;
+ }
+ for (trail = 0; trail < 64; ++trail) {
+ table[trail] |= bits;
+ }
+ }
+ // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
+ // In that case, bits=1<<limitLead == 1<<0 == 1
+ // (because Java << uses only the lower 5 bits of the shift operand)
+ // but the bits value is not used because trail<limitTrail is already false.
+ bits = 1 << limitLead;
+ for (trail = 0; trail < limitTrail; ++trail) {
+ table[trail] |= bits;
+ }
+ }
+ }
+
+ private void initBits() { // fills latin1Contains, table7FF and bmpBlockBits from the [start, limit) pairs in list
+ int start, limit;
+ int listIndex = 0;
+
+ // Set latin1Contains[].
+ do {
+ start = list[listIndex++];
+ if (listIndex < listLength) {
+ limit = list[listIndex++];
+ } else {
+ limit = 0x110000; // no limit entry left in the list: the range runs to the end of the code space
+ }
+ if (start >= 0x100) {
+ break;
+ }
+ do {
+ latin1Contains[start++] = true;
+ } while (start < limit && start < 0x100);
+ } while (limit <= 0x100);
+
+ // Set table7FF[]. On entry start >= 0x100, so only U+0100..U+07FF is recorded here.
+ while (start < 0x800) {
+ set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
+ if (limit > 0x800) {
+ start = 0x800;
+ break;
+ }
+
+ start = list[listIndex++];
+ if (listIndex < listLength) {
+ limit = list[listIndex++];
+ } else {
+ limit = 0x110000;
+ }
+ }
+
+ // Set bmpBlockBits[].
+ int minStart = 0x800;
+ while (start < 0x10000) {
+ if (limit > 0x10000) {
+ limit = 0x10000;
+ }
+
+ if (start < minStart) {
+ start = minStart;
+ }
+ if (start < limit) { // Else: Another range entirely in a known mixed-value block.
+ if (0 != (start & 0x3f)) {
+ // Mixed-value block of 64 code points.
+ start >>= 6;
+ bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
+ start = (start + 1) << 6; // Round up to the next block boundary.
+ minStart = start; // Ignore further ranges in this block.
+ }
+ if (start < limit) {
+ if (start < (limit & ~0x3f)) {
+ // Multiple all-ones blocks of 64 code points each.
+ set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
+ }
+
+ if (0 != (limit & 0x3f)) {
+ // Mixed-value block of 64 code points.
+ limit >>= 6;
+ bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
+ limit = (limit + 1) << 6; // Round up to the next block boundary.
+ minStart = limit; // Ignore further ranges in this block.
+ }
+ }
+ }
+
+ if (limit == 0x10000) {
+ break;
+ }
+
+ start = list[listIndex++];
+ if (listIndex < listLength) {
+ limit = list[listIndex++];
+ } else {
+ limit = 0x110000;
+ }
+ }
+ }
+
+    /**
+     * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
+     * points in a certain range.
+     *
+     * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
+     * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
+     *
+     * @param c
+     *            a character in a subrange of MIN_VALUE..MAX_VALUE
+     * @param lo
+     *            The lowest index to be returned.
+     * @param hi
+     *            The highest index to be returned.
+     * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i]
+     */
+    private int findCodePoint(int c, int lo, int hi) {
+        /* Examples:
+                                           findCodePoint(c)
+             set              list[]           c=0 1 3 4 7 8
+             ===              ==============   =============
+             []               [110000]           0 0 0 0 0 0
+             [\u0000-\u0003]  [0, 4, 110000]     1 1 1 2 2 2
+             [\u0004-\u0007]  [4, 8, 110000]     0 0 0 1 1 2
+             [:Any:]          [0, 110000]        1 1 1 1 1 1
+         */
+
+        // Return the smallest i such that c < list[i]. Assume
+        // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
+        if (c < list[lo]) {
+            return lo;
+        }
+        // High runner test. c is often after the last range, so an
+        // initial check for this condition pays off.
+        if (lo >= hi || c >= list[hi - 1]) {
+            return hi;
+        }
+        // Binary search; invariant: list[lo] <= c < list[hi].
+        int mid = (lo + hi) >>> 1;
+        while (mid != lo) {
+            if (c < list[mid]) {
+                hi = mid;
+            } else {
+                lo = mid;
+            }
+            mid = (lo + hi) >>> 1;
+        }
+        return hi; // Found!
+    }
+
+    private final boolean containsSlow(int c, int lo, int hi) {
+        return (findCodePoint(c, lo, hi) & 1) == 1; // true when the index found in list is odd
+    }
+}
+
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/CharTrie.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/CharTrie.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,22 +22,18 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
+ ******************************************************************************
+ * Copyright (C) 1996-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ ******************************************************************************
*/
package sun.text.normalizer;
+import java.io.DataInputStream;
import java.io.InputStream;
-import java.io.DataInputStream;
import java.io.IOException;
/**
@@ -73,120 +69,17 @@
throw new IllegalArgumentException(
"Data given does not belong to a char trie.");
}
- m_friendAgent_ = new FriendAgent();
- }
-
- /**
- * Make a dummy CharTrie.
- * A dummy trie is an empty runtime trie, used when a real data trie cannot
- * be loaded.
- *
- * The trie always returns the initialValue,
- * or the leadUnitValue for lead surrogate code points.
- * The Latin-1 part is always set up to be linear.
- *
- * @param initialValue the initial value that is set for all code points
- * @param leadUnitValue the value for lead surrogate code _units_ that do not
- * have associated supplementary data
- * @param dataManipulate object which provides methods to parse the char data
- */
- public CharTrie(int initialValue, int leadUnitValue, DataManipulate dataManipulate) {
- super(new char[BMP_INDEX_LENGTH+SURROGATE_BLOCK_COUNT], HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_, dataManipulate);
-
- int dataLength, latin1Length, i, limit;
- char block;
-
- /* calculate the actual size of the dummy trie data */
-
- /* max(Latin-1, block 0) */
- dataLength=latin1Length= INDEX_STAGE_1_SHIFT_<=8 ? 256 : DATA_BLOCK_LENGTH;
- if(leadUnitValue!=initialValue) {
- dataLength+=DATA_BLOCK_LENGTH;
- }
- m_data_=new char[dataLength];
- m_dataLength_=dataLength;
-
- m_initialValue_=(char)initialValue;
-
- /* fill the index and data arrays */
-
- /* indexes are preset to 0 (block 0) */
-
- /* Latin-1 data */
- for(i=0; i<latin1Length; ++i) {
- m_data_[i]=(char)initialValue;
- }
-
- if(leadUnitValue!=initialValue) {
- /* indexes for lead surrogate code units to the block after Latin-1 */
- block=(char)(latin1Length>>INDEX_STAGE_2_SHIFT_);
- i=0xd800>>INDEX_STAGE_1_SHIFT_;
- limit=0xdc00>>INDEX_STAGE_1_SHIFT_;
- for(; i<limit; ++i) {
- m_index_[i]=block;
- }
-
- /* data for lead surrogate code units */
- limit=latin1Length+DATA_BLOCK_LENGTH;
- for(i=latin1Length; i<limit; ++i) {
- m_data_[i]=(char)leadUnitValue;
- }
- }
-
- m_friendAgent_ = new FriendAgent();
- }
-
- /**
- * Java friend implementation
- */
- public class FriendAgent
- {
- /**
- * Gives out the index array of the trie
- * @return index array of trie
- */
- public char[] getPrivateIndex()
- {
- return m_index_;
- }
- /**
- * Gives out the data array of the trie
- * @return data array of trie
- */
- public char[] getPrivateData()
- {
- return m_data_;
- }
- /**
- * Gives out the data offset in the trie
- * @return data offset in the trie
- */
- public int getPrivateInitialValue()
- {
- return m_initialValue_;
- }
}
// public methods --------------------------------------------------
/**
- * Java friend implementation
- * To store the index and data array into the argument.
- * @param friend java friend UCharacterProperty object to store the array
+ * Gets the value associated with the codepoint.
+ * If no value is associated with the codepoint, a default value will be
+ * returned.
+ * @param ch codepoint
+ * @return offset to data
*/
- public void putIndexData(UCharacterProperty friend)
- {
- friend.setIndexData(m_friendAgent_);
- }
-
- /**
- * Gets the value associated with the codepoint.
- * If no value is associated with the codepoint, a default value will be
- * returned.
- * @param ch codepoint
- * @return offset to data
- * @draft 2.1
- */
public final char getCodePointValue(int ch)
{
int offset;
@@ -215,52 +108,12 @@
* This method does not guarantee correct results for trail surrogates.
* @param ch lead surrogate character
* @return data value
- * @draft 2.1
*/
public final char getLeadValue(char ch)
{
return m_data_[getLeadOffset(ch)];
}
- /**
- * Get the value associated with a pair of surrogates.
- * @param lead a lead surrogate
- * @param trail a trail surrogate
- * @draft 2.1
- */
- public final char getSurrogateValue(char lead, char trail)
- {
- int offset = getSurrogateOffset(lead, trail);
- if (offset > 0) {
- return m_data_[offset];
- }
- return m_initialValue_;
- }
-
- /**
- * <p>Get a value from a folding offset (from the value of a lead surrogate)
- * and a trail surrogate.</p>
- * <p>If the
- * @param leadvalue value associated with the lead surrogate which contains
- * the folding offset
- * @param trail surrogate
- * @return trie data value associated with the trail character
- * @draft 2.1
- */
- public final char getTrailValue(int leadvalue, char trail)
- {
- if (m_dataManipulate_ == null) {
- throw new NullPointerException(
- "The field DataManipulate in this Trie is null");
- }
- int offset = m_dataManipulate_.getFoldingOffset(leadvalue);
- if (offset > 0) {
- return m_data_[getRawOffset(offset,
- (char)(trail & SURROGATE_MASK_))];
- }
- return m_initialValue_;
- }
-
// protected methods -----------------------------------------------
/**
@@ -309,41 +162,14 @@
return -1;
}
- /**
- * Gets the value at the argument index.
- * For use internally in TrieIterator.
- * @param index value at index will be retrieved
- * @return 32 bit value
- * @see com.ibm.icu.impl.TrieIterator
- * @draft 2.1
- */
- protected final int getValue(int index)
- {
- return m_data_[index];
- }
-
- /**
- * Gets the default initial value
- * @return 32 bit value
- * @draft 2.1
- */
- protected final int getInitialValue()
- {
- return m_initialValue_;
- }
-
// private data members --------------------------------------------
/**
- * Default value
- */
+ * Default value
+ */
private char m_initialValue_;
/**
- * Array of char data
- */
+ * Array of char data
+ */
private char m_data_[];
- /**
- * Agent for friends
- */
- private FriendAgent m_friendAgent_;
}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/CharacterIteratorWrapper.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/CharacterIteratorWrapper.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -45,7 +45,7 @@
* @author ram
*/
-public class CharacterIteratorWrapper extends UCharacterIterator {
+class CharacterIteratorWrapper extends UCharacterIterator {
private CharacterIterator iterator;
@@ -111,7 +111,6 @@
iterator.setIndex(index);
}
- //// for StringPrep
/**
* @see UCharacterIterator#getText(char[])
*/
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/FilteredNormalizer2.java Wed Jul 15 11:05:51 2015 +0900
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+*******************************************************************************
+* Copyright (C) 2009-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+*/
+package sun.text.normalizer;
+
+import java.io.IOException;
+
+/**
+ * Normalization filtered by a UnicodeSet.
+ * Normalizes portions of the text contained in the filter set and leaves
+ * portions not contained in the filter set unchanged.
+ * Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE).
+ * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
+ * This class implements all of (and only) the Normalizer2 API.
+ * An instance of this class is unmodifiable/immutable.
+ * @stable ICU 4.4
+ * @author Markus W. Scherer
+ */
+class FilteredNormalizer2 extends Normalizer2 {
+
+    /**
+     * Constructs a filtered normalizer wrapping any Normalizer2 instance
+     * and a filter set.
+     * Both are aliased (stored by reference, not copied) and must not be
+     * modified while this object is used.
+     * The filter set should be frozen; otherwise the performance will suffer greatly.
+     * @param n2 wrapped Normalizer2 instance
+     * @param filterSet UnicodeSet which determines the characters to be normalized
+     * @stable ICU 4.4
+     */
+    public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {
+        norm2 = n2;
+        set = filterSet;
+    }
+
+    /**
+     * {@inheritDoc}
+     * @stable ICU 4.4
+     */
+    @Override
+    public StringBuilder normalize(CharSequence src, StringBuilder dest) {
+        if (dest == src) {
+            throw new IllegalArgumentException();
+        }
+        dest.setLength(0);
+        normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
+        return dest;
+    }
+
+    /**
+     * {@inheritDoc}
+     * @stable ICU 4.6
+     */
+    @Override
+    public Appendable normalize(CharSequence src, Appendable dest) {
+        if (dest == src) {
+            throw new IllegalArgumentException();
+        }
+        return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
+    }
+
+    /**
+     * {@inheritDoc}
+     * @stable ICU 4.4
+     */
+    @Override
+    public StringBuilder normalizeSecondAndAppend(
+            StringBuilder first, CharSequence second) {
+        return normalizeSecondAndAppend(first, second, true);
+    }
+
+    /**
+     * {@inheritDoc}
+     * @stable ICU 4.4
+     */
+    @Override
+    public StringBuilder append(StringBuilder first, CharSequence second) {
+        return normalizeSecondAndAppend(first, second, false);
+    }
+
+    /**
+     * {@inheritDoc}
+     * @stable ICU 4.6
+     */
+    @Override
+    public String getDecomposition(int c) {
+        return set.contains(c) ? norm2.getDecomposition(c) : null;
+    }
+
+    /**
+     * {@inheritDoc}
+     * @stable ICU 49
+     */
+    @Override
+    public int getCombiningClass(int c) {
+        return set.contains(c) ? norm2.getCombiningClass(c) : 0;
+    }
+
+    /**
+     * {@inheritDoc}
+     * @stable ICU 4.4
+     */
+    @Override
+    public boolean isNormalized(CharSequence s) {
+        UnicodeSet.SpanCondition spanCondition = UnicodeSet.SpanCondition.SIMPLE;
+        for (int prevSpanLimit = 0; prevSpanLimit < s.length();) {
+            int spanLimit = set.span(s, prevSpanLimit, spanCondition);
+            if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
+                spanCondition = UnicodeSet.SpanCondition.SIMPLE;
+            } else {
+                if (!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) {
+                    return false;
+                }
+                spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
+            }
+            prevSpanLimit = spanLimit;
+        }
+        return true;
+    }
+
+    /**
+     * {@inheritDoc}
+     * @stable ICU 4.4
+     */
+    @Override
+    public int spanQuickCheckYes(CharSequence s) {
+        UnicodeSet.SpanCondition spanCondition = UnicodeSet.SpanCondition.SIMPLE;
+        for (int prevSpanLimit = 0; prevSpanLimit < s.length();) {
+            int spanLimit = set.span(s, prevSpanLimit, spanCondition);
+            if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
+                spanCondition = UnicodeSet.SpanCondition.SIMPLE;
+            } else {
+                int yesLimit =
+                    prevSpanLimit +
+                    norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit));
+                if (yesLimit < spanLimit) {
+                    return yesLimit;
+                }
+                spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
+            }
+            prevSpanLimit = spanLimit;
+        }
+        return s.length();
+    }
+
+    /**
+     * {@inheritDoc}
+     * @stable ICU 4.4
+     */
+    @Override
+    public boolean hasBoundaryBefore(int c) {
+        return !set.contains(c) || norm2.hasBoundaryBefore(c);
+    }
+
+    // Internal: No argument checking, and appends to dest.
+    // Pass as input spanCondition the one that is likely to yield a non-zero
+    // span length at the start of src.
+    // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
+    // UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src
+    // and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after
+    // an in-filter prefix.
+    private Appendable normalize(CharSequence src, Appendable dest,
+                                 UnicodeSet.SpanCondition spanCondition) {
+        // One scratch buffer for all in-filter spans; it is not re-created per iteration.
+        StringBuilder tempDest = new StringBuilder();
+        try {
+            for (int prevSpanLimit = 0; prevSpanLimit < src.length();) {
+                int spanLimit = set.span(src, prevSpanLimit, spanCondition);
+                int spanLength = spanLimit - prevSpanLimit;
+                if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
+                    if (spanLength != 0) {
+                        dest.append(src, prevSpanLimit, spanLimit);
+                    }
+                    spanCondition = UnicodeSet.SpanCondition.SIMPLE;
+                } else {
+                    if (spanLength != 0) {
+                        // Not norm2.normalizeSecondAndAppend() because we do not want
+                        // to modify the non-filter part of dest.
+                        dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest));
+                    }
+                    spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
+                }
+                prevSpanLimit = spanLimit;
+            }
+        } catch (IOException e) {
+            throw new InternalError(e.toString(), e);
+        }
+        return dest;
+    }
+
+    private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second,
+                                                   boolean doNormalize) {
+        if (first == second) {
+            throw new IllegalArgumentException();
+        }
+        if (first.length() == 0) {
+            if (doNormalize) {
+                return normalize(second, first);
+            } else {
+                return first.append(second);
+            }
+        }
+        // merge the in-filter suffix of the first string with the in-filter prefix of the second
+        int prefixLimit = set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
+        if (prefixLimit != 0) {
+            CharSequence prefix = second.subSequence(0, prefixLimit);
+            int suffixStart = set.spanBack(first, Integer.MAX_VALUE, UnicodeSet.SpanCondition.SIMPLE);
+            if (suffixStart == 0) {
+                if (doNormalize) {
+                    norm2.normalizeSecondAndAppend(first, prefix);
+                } else {
+                    norm2.append(first, prefix);
+                }
+            } else {
+                StringBuilder middle = new StringBuilder(
+                        first.subSequence(suffixStart, first.length()));
+                if (doNormalize) {
+                    norm2.normalizeSecondAndAppend(middle, prefix);
+                } else {
+                    norm2.append(middle, prefix);
+                }
+                first.delete(suffixStart, first.length()).append(middle);
+            }
+        }
+        if (prefixLimit < second.length()) {
+            CharSequence rest = second.subSequence(prefixLimit, second.length());
+            if (doNormalize) {
+                normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
+            } else {
+                first.append(rest);
+            }
+        }
+        return first;
+    }
+
+    private final Normalizer2 norm2;   // wrapped normalizer, applied only to in-filter spans
+    private final UnicodeSet set;      // filter set deciding which spans get normalized
+}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/ICUBinary.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/ICUBinary.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -25,25 +25,38 @@
/*
*******************************************************************************
- * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 1996-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
-import java.io.InputStream;
+import java.io.BufferedInputStream;
import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+import java.nio.file.FileSystems;
import java.util.Arrays;
+import java.security.AccessController;
+import java.security.PrivilegedAction;
-public final class ICUBinary
-{
+public final class ICUBinary {
+
+ /** {@link Authenticate} implementation that accepts data only when version[0] is 1. */
+ private static final class IsAcceptable implements Authenticate {
+     @Override public boolean isDataVersionAcceptable(byte[] version) {
+         return version[0] == 1;
+     }
+ }
+
// public inner interface ------------------------------------------------
/**
@@ -63,53 +76,44 @@
// public methods --------------------------------------------------------
/**
- * <p>ICU data header reader method.
- * Takes a ICU generated big-endian input stream, parse the ICU standard
- * file header and authenticates them.
- * <p>Header format:
- * <ul>
- * <li> Header size (char)
- * <li> Magic number 1 (byte)
- * <li> Magic number 2 (byte)
- * <li> Rest of the header size (char)
- * <li> Reserved word (char)
- * <li> Big endian indicator (byte)
- * <li> Character set family indicator (byte)
- * <li> Size of a char (byte) for c++ and c use
- * <li> Reserved byte (byte)
- * <li> Data format identifier (4 bytes), each ICU data has its own
- * identifier to distinguish them. [0] major [1] minor
- * [2] milli [3] micro
- * <li> Data version (4 bytes), the change version of the ICU data
- * [0] major [1] minor [2] milli [3] micro
- * <li> Unicode version (4 bytes) this ICU is based on.
- * </ul>
- *
- * <p>
- * Example of use:<br>
- * <pre>
- * try {
- * FileInputStream input = new FileInputStream(filename);
- * If (Utility.readICUDataHeader(input, dataformat, dataversion,
- * unicode) {
- * System.out.println("Verified file header, this is a ICU data file");
- * }
- * } catch (IOException e) {
- * System.out.println("This is not a ICU data file");
- * }
- * </pre>
- *
- * @param inputStream input stream that contains the ICU data header
- * @param dataFormatIDExpected Data format expected. An array of 4 bytes
- * information about the data format.
- * E.g. data format ID 1.2.3.4. will became an array of
- * {1, 2, 3, 4}
- * @param authenticate user defined extra data authentication. This value
- * can be null, if no extra authentication is needed.
- * @exception IOException thrown if there is a read error or
- * when header authentication fails.
- * @draft 2.1
- */
+ * Loads an ICU binary data file and returns its complete contents as a ByteBuffer.
+ * The resource stream is read until end-of-stream; the backing array grows as needed.
+ * @param itemPath Relative ICU data item path, for example "root.res" or "coll/ucadata.icu".
+ * @return The data in a ByteBuffer wrapping a freshly allocated array (position 0).
+ * @throws UncheckedIOException if the item cannot be found or cannot be read
+ */
+ public static ByteBuffer getRequiredData(String itemPath) {
+     try (InputStream is = AccessController.doPrivileged(new PrivilegedAction<InputStream>() {
+             public InputStream run() { return ICUBinary.class.getResourceAsStream(itemPath); }
+         })) {
+         if (is == null) {
+             throw new IOException("ICU data file not found: " + itemPath);
+         }
+         byte[] data = new byte[120000];   // initial capacity only; doubled whenever it fills up
+         int length = 0;
+         for (int n; (n = is.read(data, length, data.length - length)) >= 0; ) {
+             length += n;
+             if (length == data.length) {   // keep the next read length > 0
+                 data = Arrays.copyOf(data, 2 * data.length);
+             }
+         }
+         return ByteBuffer.wrap(data, 0, length);
+     } catch (IOException e) {
+         throw new UncheckedIOException(e);
+     }
+ }
+
+ /**
+  * Same as readHeader(), but converts the compact int data version into a VersionInfo.
+  * @throws IOException if readHeader() rejects the header
+  */
+ public static VersionInfo readHeaderAndDataVersion(
+         ByteBuffer bytes, int dataFormat, Authenticate authenticate) throws IOException {
+     int compactVersion = readHeader(bytes, dataFormat, authenticate);
+     return getVersionInfoFromCompactInt(compactVersion);
+ }
+
+ private static final byte BIG_ENDIAN_ = 1;
public static final byte[] readHeader(InputStream inputStream,
byte dataFormatIDExpected[],
Authenticate authenticate)
@@ -164,6 +168,80 @@
return unicodeVersion;
}
+ /**
+  * Reads an ICU data header, checks the data format, and returns the data version.
+  *
+  * <p>Assumes that the ByteBuffer position is 0 on input.
+  * The buffer byte order is set according to the data.
+  * The buffer position is advanced past the header (including UDataInfo and comment).
+  *
+  * <p>See C++ ucmndata.h and unicode/udata.h.
+  *
+  * @param bytes the data item, beginning with its header
+  * @param dataFormat expected data format; its four bytes are compared with header bytes 12..15
+  * @param authenticate checks the format version (bytes 16..19); may be null to skip the check
+  * @return dataVersion (header bytes 20..23 packed into an int, byte 20 most significant)
+  * @throws IOException if this is not a valid (or is a truncated) ICU data item of dataFormat
+  */
+ public static int readHeader(ByteBuffer bytes, int dataFormat, Authenticate authenticate)
+         throws IOException {
+     assert bytes.position() == 0;
+     // The fixed part of the header (bytes 0..23) is read with absolute gets below.
+     if (bytes.limit() < 24) {
+         throw new IOException("Internal Error: Header size error");
+     }
+     if (bytes.get(2) != MAGIC1 || bytes.get(3) != MAGIC2) {
+         throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_);
+     }
+     byte isBigEndian = bytes.get(8);
+     byte charsetFamily = bytes.get(9);
+     byte sizeofUChar = bytes.get(10);
+     if (isBigEndian < 0 || 1 < isBigEndian ||
+             charsetFamily != CHAR_SET_ || sizeofUChar != CHAR_SIZE_) {
+         throw new IOException(HEADER_AUTHENTICATION_FAILED_);
+     }
+     bytes.order(isBigEndian != 0 ? ByteOrder.BIG_ENDIAN : ByteOrder.LITTLE_ENDIAN);
+
+     int headerSize = bytes.getChar(0);
+     int sizeofUDataInfo = bytes.getChar(4);
+     if (sizeofUDataInfo < 20 || headerSize < (sizeofUDataInfo + 4) ||
+             headerSize > bytes.limit()) {   // position(headerSize) below must be legal
+         throw new IOException("Internal Error: Header size error");
+     }
+     // TODO: Change Authenticate to take int major, minor, milli, micro to avoid array allocation.
+     byte[] formatVersion = { bytes.get(16), bytes.get(17), bytes.get(18), bytes.get(19) };
+     if (bytes.get(12) != (byte)(dataFormat >> 24) ||
+             bytes.get(13) != (byte)(dataFormat >> 16) ||
+             bytes.get(14) != (byte)(dataFormat >> 8) ||
+             bytes.get(15) != (byte)dataFormat ||
+             (authenticate != null && !authenticate.isDataVersionAcceptable(formatVersion))) {
+         throw new IOException(HEADER_AUTHENTICATION_FAILED_ +
+                 String.format("; data format %02x%02x%02x%02x, format version %d.%d.%d.%d",
+                         bytes.get(12), bytes.get(13), bytes.get(14), bytes.get(15),
+                         formatVersion[0] & 0xff, formatVersion[1] & 0xff,
+                         formatVersion[2] & 0xff, formatVersion[3] & 0xff));
+     }
+     bytes.position(headerSize);
+     return (bytes.get(20) << 24) |            // dataVersion, most significant byte first
+             ((bytes.get(21) & 0xff) << 16) |
+             ((bytes.get(22) & 0xff) << 8) |
+             (bytes.get(23) & 0xff);
+ }
+
+ /** Advances the buffer position by skipLength bytes; non-positive lengths are ignored. */
+ public static void skipBytes(ByteBuffer bytes, int skipLength) {
+     if (skipLength <= 0) { return; }
+     bytes.position(bytes.position() + skipLength);
+ }
+
+ /** Returns a VersionInfo for the four bytes of the compact version int (top byte first). */
+ public static VersionInfo getVersionInfoFromCompactInt(int version) {
+     int major = version >>> 24;
+     int minor = (version >>> 16) & 0xff;
+     int milli = (version >>> 8) & 0xff;
+     return VersionInfo.getInstance(major, minor, milli, version & 0xff);
+ }
+
// private variables -------------------------------------------------
/**
@@ -175,7 +253,6 @@
/**
* File format authentication values
*/
- private static final byte BIG_ENDIAN_ = 1;
private static final byte CHAR_SET_ = 0;
private static final byte CHAR_SIZE_ = 2;
@@ -183,7 +260,7 @@
* Error messages
*/
private static final String MAGIC_NUMBER_AUTHENTICATION_FAILED_ =
- "ICU data file error: Not an ICU data file";
+ "ICUBinary data file error: Magic number authentication failed";
private static final String HEADER_AUTHENTICATION_FAILED_ =
- "ICU data file error: Header authentication failed, please check if you have a valid ICU data file";
+ "ICUBinary data file error: Header authentication failed";
}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/ICUData.java Tue Jul 14 16:29:08 2015 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
-
-package sun.text.normalizer;
-
-import java.io.InputStream;
-import java.net.URL;
-import java.security.AccessController;
-import java.security.PrivilegedAction;
-import java.util.MissingResourceException;
-
-/**
- * Provides access to ICU data files as InputStreams. Implements security checking.
- */
-public final class ICUData {
-
- private static InputStream getStream(final Class<ICUData> root, final String resourceName, boolean required) {
- InputStream i = null;
-
- if (System.getSecurityManager() != null) {
- i = AccessController.doPrivileged(new PrivilegedAction<InputStream>() {
- public InputStream run() {
- return root.getResourceAsStream(resourceName);
- }
- });
- } else {
- i = root.getResourceAsStream(resourceName);
- }
-
- if (i == null && required) {
- throw new MissingResourceException("could not locate data", root.getPackage().getName(), resourceName);
- }
- return i;
- }
-
- /*
- * Convenience override that calls getStream(ICUData.class, resourceName, false);
- */
- public static InputStream getStream(String resourceName) {
- return getStream(ICUData.class, resourceName, false);
- }
-
- /*
- * Convenience method that calls getStream(ICUData.class, resourceName, true).
- */
- public static InputStream getRequiredStream(String resourceName) {
- return getStream(ICUData.class, resourceName, true);
- }
-}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/IntTrie.java Tue Jul 14 16:29:08 2015 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,229 +0,0 @@
-/*
- * Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
-
-package sun.text.normalizer;
-
-import java.io.InputStream;
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.util.Arrays;
-
-/**
- * Trie implementation which stores data in int, 32 bits.
- * @author synwee
- * @see com.ibm.icu.impl.Trie
- * @since release 2.1, Jan 01 2002
- */
-public class IntTrie extends Trie
-{
- // public constructors ---------------------------------------------
-
- /**
- * <p>Creates a new Trie with the settings for the trie data.</p>
- * <p>Unserialize the 32-bit-aligned input stream and use the data for the
- * trie.</p>
- * @param inputStream file input stream to a ICU data file, containing
- * the trie
- * @param datamanipulate object which provides methods to parse the char
- * data
- * @throws IOException thrown when data reading fails
- * @draft 2.1
- */
- public IntTrie(InputStream inputStream, DataManipulate datamanipulate)
- throws IOException
- {
- super(inputStream, datamanipulate);
- if (!isIntTrie()) {
- throw new IllegalArgumentException(
- "Data given does not belong to a int trie.");
- }
- }
-
- // public methods --------------------------------------------------
-
- /**
- * Gets the value associated with the codepoint.
- * If no value is associated with the codepoint, a default value will be
- * returned.
- * @param ch codepoint
- * @return offset to data
- * @draft 2.1
- */
- public final int getCodePointValue(int ch)
- {
- int offset = getCodePointOffset(ch);
- return (offset >= 0) ? m_data_[offset] : m_initialValue_;
- }
-
- /**
- * Gets the value to the data which this lead surrogate character points
- * to.
- * Returned data may contain folding offset information for the next
- * trailing surrogate character.
- * This method does not guarantee correct results for trail surrogates.
- * @param ch lead surrogate character
- * @return data value
- * @draft 2.1
- */
- public final int getLeadValue(char ch)
- {
- return m_data_[getLeadOffset(ch)];
- }
-
- /**
- * Get a value from a folding offset (from the value of a lead surrogate)
- * and a trail surrogate.
- * @param leadvalue the value of a lead surrogate that contains the
- * folding offset
- * @param trail surrogate
- * @return trie data value associated with the trail character
- * @draft 2.1
- */
- public final int getTrailValue(int leadvalue, char trail)
- {
- if (m_dataManipulate_ == null) {
- throw new NullPointerException(
- "The field DataManipulate in this Trie is null");
- }
- int offset = m_dataManipulate_.getFoldingOffset(leadvalue);
- if (offset > 0) {
- return m_data_[getRawOffset(offset,
- (char)(trail & SURROGATE_MASK_))];
- }
- return m_initialValue_;
- }
-
- // protected methods -----------------------------------------------
-
- /**
- * <p>Parses the input stream and stores its trie content into a index and
- * data array</p>
- * @param inputStream data input stream containing trie data
- * @exception IOException thrown when data reading fails
- */
- protected final void unserialize(InputStream inputStream)
- throws IOException
- {
- super.unserialize(inputStream);
- // one used for initial value
- m_data_ = new int[m_dataLength_];
- DataInputStream input = new DataInputStream(inputStream);
- for (int i = 0; i < m_dataLength_; i ++) {
- m_data_[i] = input.readInt();
- }
- m_initialValue_ = m_data_[0];
- }
-
- /**
- * Gets the offset to the data which the surrogate pair points to.
- * @param lead lead surrogate
- * @param trail trailing surrogate
- * @return offset to data
- * @draft 2.1
- */
- protected final int getSurrogateOffset(char lead, char trail)
- {
- if (m_dataManipulate_ == null) {
- throw new NullPointerException(
- "The field DataManipulate in this Trie is null");
- }
- // get fold position for the next trail surrogate
- int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead));
-
- // get the real data from the folded lead/trail units
- if (offset > 0) {
- return getRawOffset(offset, (char)(trail & SURROGATE_MASK_));
- }
-
- // return -1 if there is an error, in this case we return the default
- // value: m_initialValue_
- return -1;
- }
-
- /**
- * Gets the value at the argument index.
- * For use internally in TrieIterator
- * @param index value at index will be retrieved
- * @return 32 bit value
- * @see com.ibm.icu.impl.TrieIterator
- * @draft 2.1
- */
- protected final int getValue(int index)
- {
- return m_data_[index];
- }
-
- /**
- * Gets the default initial value
- * @return 32 bit value
- * @draft 2.1
- */
- protected final int getInitialValue()
- {
- return m_initialValue_;
- }
-
- // package private methods -----------------------------------------
-
- /**
- * Internal constructor for builder use
- * @param index the index array to be slotted into this trie
- * @param data the data array to be slotted into this trie
- * @param initialvalue the initial value for this trie
- * @param options trie options to use
- * @param datamanipulate folding implementation
- */
- IntTrie(char index[], int data[], int initialvalue, int options,
- DataManipulate datamanipulate)
- {
- super(index, options, datamanipulate);
- m_data_ = data;
- m_dataLength_ = m_data_.length;
- m_initialValue_ = initialvalue;
- }
-
- // private data members --------------------------------------------
-
- /**
- * Default value
- */
- private int m_initialValue_;
- /**
- * Array of char data
- */
- private int m_data_[];
-}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/Norm2AllModes.java Wed Jul 15 11:05:51 2015 +0900
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ *******************************************************************************
+ * Copyright (C) 2009-2014, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *******************************************************************************
+ */
+
+package sun.text.normalizer;
+
+import java.io.IOException;
+
+final class Norm2AllModes {
+ // Public API dispatch via Normalizer2 subclasses -------------------------- ***
+
+ // Normalizer2 implementation for the old UNORM_NONE.
+ public static final class NoopNormalizer2 extends Normalizer2 {
+ @Override
+ public StringBuilder normalize(CharSequence src, StringBuilder dest) {
+ if(dest!=src) {
+ dest.setLength(0);
+ return dest.append(src);
+ } else {
+ throw new IllegalArgumentException();
+ }
+ }
+
+ @Override
+ public Appendable normalize(CharSequence src, Appendable dest) {
+ if(dest!=src) {
+ try {
+ return dest.append(src);
+ } catch(IOException e) {
+ throw new InternalError(e.toString(), e);
+ }
+ } else {
+ throw new IllegalArgumentException();
+ }
+ }
+
+ @Override
+ public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) {
+ if(first!=second) {
+ return first.append(second);
+ } else {
+ throw new IllegalArgumentException();
+ }
+ }
+
+ @Override
+ public StringBuilder append(StringBuilder first, CharSequence second) {
+ if(first!=second) {
+ return first.append(second);
+ } else {
+ throw new IllegalArgumentException();
+ }
+ }
+
+ @Override
+ public String getDecomposition(int c) {
+ return null;
+ }
+
+ // No getRawDecomposition() in this port; getCombiningClass() keeps the default (0).
+ @Override
+ public boolean isNormalized(CharSequence s) { return true; }
+
+ @Override
+ public int spanQuickCheckYes(CharSequence s) { return s.length(); }
+
+ @Override
+ public boolean hasBoundaryBefore(int c) { return true; }
+ }
+
+ // Intermediate class:
+ // Has NormalizerImpl and does boilerplate argument checking and setup.
+ public static abstract class Normalizer2WithImpl extends Normalizer2 {
+ public Normalizer2WithImpl(NormalizerImpl ni) {
+ impl=ni;
+ }
+
+ // normalize
+ @Override
+ public StringBuilder normalize(CharSequence src, StringBuilder dest) {
+ if(dest==src) {
+ throw new IllegalArgumentException();
+ }
+ dest.setLength(0);
+ normalize(src, new NormalizerImpl.ReorderingBuffer(impl, dest, src.length()));
+ return dest;
+ }
+
+ @Override
+ public Appendable normalize(CharSequence src, Appendable dest) {
+ if(dest==src) {
+ throw new IllegalArgumentException();
+ }
+ NormalizerImpl.ReorderingBuffer buffer=
+ new NormalizerImpl.ReorderingBuffer(impl, dest, src.length());
+ normalize(src, buffer);
+ buffer.flush();
+ return dest;
+ }
+
+ protected abstract void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer);
+
+ // normalize and append
+ @Override
+ public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) {
+ return normalizeSecondAndAppend(first, second, true);
+ }
+
+ @Override
+ public StringBuilder append(StringBuilder first, CharSequence second) {
+ return normalizeSecondAndAppend(first, second, false);
+ }
+
+ public StringBuilder normalizeSecondAndAppend(
+ StringBuilder first, CharSequence second, boolean doNormalize) {
+ if(first==second) {
+ throw new IllegalArgumentException();
+ }
+ normalizeAndAppend(
+ second, doNormalize,
+ new NormalizerImpl.ReorderingBuffer(impl, first, first.length()+second.length()));
+ return first;
+ }
+
+ protected abstract void normalizeAndAppend(
+ CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer);
+
+ @Override
+ public String getDecomposition(int c) {
+ return impl.getDecomposition(c);
+ }
+
+ @Override
+ public int getCombiningClass(int c) {
+ return impl.getCC(impl.getNorm16(c));
+ }
+
+ // quick checks
+ @Override
+ public boolean isNormalized(CharSequence s) {
+ return s.length()==spanQuickCheckYes(s);
+ }
+
+ public final NormalizerImpl impl;
+ }
+
+ public static final class DecomposeNormalizer2 extends Normalizer2WithImpl {
+ public DecomposeNormalizer2(NormalizerImpl ni) {
+ super(ni);
+ }
+
+ @Override
+ protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) {
+ impl.decompose(src, 0, src.length(), buffer);
+ }
+
+ @Override
+ protected void normalizeAndAppend(
+ CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer) {
+ impl.decomposeAndAppend(src, doNormalize, buffer);
+ }
+
+ @Override
+ public int spanQuickCheckYes(CharSequence s) {
+ return impl.decompose(s, 0, s.length(), null);
+ }
+
+ @Override
+ public boolean hasBoundaryBefore(int c) { return impl.hasDecompBoundary(c, true); }
+ }
+
+ public static final class ComposeNormalizer2 extends Normalizer2WithImpl {
+ public ComposeNormalizer2(NormalizerImpl ni, boolean fcc) {
+ super(ni);
+ onlyContiguous=fcc;
+ }
+
+ @Override
+ protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) {
+ impl.compose(src, 0, src.length(), onlyContiguous, true, buffer);
+ }
+
+ @Override
+ protected void normalizeAndAppend(
+ CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer) {
+ impl.composeAndAppend(src, doNormalize, onlyContiguous, buffer);
+ }
+
+ @Override
+ public boolean isNormalized(CharSequence s) {
+ // 5: small destCapacity for substring normalization
+ return impl.compose(s, 0, s.length(),
+ onlyContiguous, false,
+ new NormalizerImpl.ReorderingBuffer(impl, new StringBuilder(), 5));
+ }
+
+ @Override
+ public int spanQuickCheckYes(CharSequence s) {
+ return impl.composeQuickCheck(s, 0, s.length(), onlyContiguous, true)>>>1;
+ }
+
+ @Override
+ public boolean hasBoundaryBefore(int c) { return impl.hasCompBoundaryBefore(c); }
+
+ private final boolean onlyContiguous;
+ }
+
+ // instance cache ---------------------------------------------------------- ***
+
+ private Norm2AllModes(NormalizerImpl ni) {
+ impl=ni;
+ comp=new ComposeNormalizer2(ni, false);
+ decomp=new DecomposeNormalizer2(ni);
+ }
+
+ public final NormalizerImpl impl;
+ public final ComposeNormalizer2 comp;
+ public final DecomposeNormalizer2 decomp;
+
+ private static Norm2AllModes getInstanceFromSingleton(Norm2AllModesSingleton singleton) {
+ if(singleton.exception!=null) {
+ throw singleton.exception;
+ }
+ return singleton.allModes;
+ }
+
+ public static Norm2AllModes getNFCInstance() {
+ return getInstanceFromSingleton(NFCSingleton.INSTANCE);
+ }
+
+ public static Norm2AllModes getNFKCInstance() {
+ return getInstanceFromSingleton(NFKCSingleton.INSTANCE);
+ }
+
+ public static final NoopNormalizer2 NOOP_NORMALIZER2=new NoopNormalizer2();
+
+ private static final class Norm2AllModesSingleton {
+ private Norm2AllModesSingleton(String name) {
+ try {
+ String DATA_FILE_NAME = "/sun/text/resources/" + name + ".icu";
+ NormalizerImpl impl=new NormalizerImpl().load(DATA_FILE_NAME);
+ allModes=new Norm2AllModes(impl);
+ } catch (RuntimeException e) {
+ exception=e;
+ }
+ }
+
+ private Norm2AllModes allModes;
+ private RuntimeException exception;
+ }
+
+ private static final class NFCSingleton {
+ private static final Norm2AllModesSingleton INSTANCE=new Norm2AllModesSingleton("nfc");
+ }
+
+ private static final class NFKCSingleton {
+ private static final Norm2AllModesSingleton INSTANCE=new Norm2AllModesSingleton("nfkc");
+ }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/Normalizer2.java Wed Jul 15 11:05:51 2015 +0900
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ *******************************************************************************
+ * Copyright (C) 2009-2014, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *******************************************************************************
+ */
+
+package sun.text.normalizer;
+
+/**
+ * Unicode normalization functionality for standard Unicode normalization or
+ * for using custom mapping tables.
+ * All instances of this class are unmodifiable/immutable.
+ * The Normalizer2 class is not intended for public subclassing.
+ * <p>
+ * The primary functions are to produce a normalized string and to detect whether
+ * a string is already normalized.
+ * The most commonly used normalization forms are those defined in
+ * http://www.unicode.org/unicode/reports/tr15/
+ * However, this API supports additional normalization forms for specialized purposes.
+ * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
+ * and can be used in implementations of UTS #46.
+ * <p>
+ * Not only are the standard compose and decompose modes supplied,
+ * but additional modes are provided as documented in the Mode enum.
+ * <p>
+ * Some of the functions in this class identify normalization boundaries.
+ * At a normalization boundary, the portions of the string
+ * before it and starting from it do not interact and can be handled independently.
+ * <p>
+ * The spanQuickCheckYes() stops at a normalization boundary.
+ * When the goal is a normalized string, then the text before the boundary
+ * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
+ * <p>
+ * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
+ * a character is guaranteed to be at a normalization boundary,
+ * regardless of context.
+ * This is used for moving from one normalization boundary to the next
+ * or preceding boundary, and for performing iterative normalization.
+ * <p>
+ * Iterative normalization is useful when only a small portion of a
+ * longer string needs to be processed.
+ * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
+ * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
+ * (to process only the substring for which sort key bytes are computed).
+ * <p>
+ * The set of normalization boundaries returned by these functions may not be
+ * complete: There may be more boundaries that could be returned.
+ * Different functions may return different boundaries.
+ * @stable ICU 4.4
+ * @author Markus W. Scherer
+ */
+abstract class Normalizer2 {
+
+ /**
+ * Returns a Normalizer2 instance for Unicode NFC normalization.
+ * Same as ICU4J's getInstance(null, "nfc", Mode.COMPOSE), which this port omits.
+ * Returns an unmodifiable singleton instance.
+ * @return the requested Normalizer2, if successful
+ * @stable ICU 49
+ */
+ public static Normalizer2 getNFCInstance() {
+ return Norm2AllModes.getNFCInstance().comp;
+ }
+
+ /**
+ * Returns a Normalizer2 instance for Unicode NFD normalization.
+ * Same as ICU4J's getInstance(null, "nfc", Mode.DECOMPOSE), which this port omits.
+ * Returns an unmodifiable singleton instance.
+ * @return the requested Normalizer2, if successful
+ * @stable ICU 49
+ */
+ public static Normalizer2 getNFDInstance() {
+ return Norm2AllModes.getNFCInstance().decomp;
+ }
+
+ /**
+ * Returns a Normalizer2 instance for Unicode NFKC normalization.
+ * Same as ICU4J's getInstance(null, "nfkc", Mode.COMPOSE), which this port omits.
+ * Returns an unmodifiable singleton instance.
+ * @return the requested Normalizer2, if successful
+ * @stable ICU 49
+ */
+ public static Normalizer2 getNFKCInstance() {
+ return Norm2AllModes.getNFKCInstance().comp;
+ }
+
+ /**
+ * Returns a Normalizer2 instance for Unicode NFKD normalization.
+ * Same as ICU4J's getInstance(null, "nfkc", Mode.DECOMPOSE), which this port omits.
+ * Returns an unmodifiable singleton instance.
+ * @return the requested Normalizer2, if successful
+ * @stable ICU 49
+ */
+ public static Normalizer2 getNFKDInstance() {
+ return Norm2AllModes.getNFKCInstance().decomp;
+ }
+
+ /**
+ * Returns the normalized form of the source string.
+ * @param src source string
+ * @return normalized src
+ * @stable ICU 4.4
+ */
+ public String normalize(CharSequence src) {
+ if(src instanceof String) {
+ // Fastpath: Do not construct a new String if the src is a String
+ // and is already normalized.
+ int spanLength=spanQuickCheckYes(src);
+ if(spanLength==src.length()) {
+ return (String)src;
+ }
+ StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength);
+ return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
+ }
+ return normalize(src, new StringBuilder(src.length())).toString();
+ }
+
+ /**
+ * Writes the normalized form of the source string to the destination string
+ * (replacing its contents) and returns the destination string.
+ * The source and destination strings must be different objects.
+ * @param src source string
+ * @param dest destination string; its contents are replaced with normalized src
+ * @return dest
+ * @stable ICU 4.4
+ */
+ public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);
+
+ /**
+ * Writes the normalized form of the source string to the destination Appendable
+ * and returns the destination Appendable.
+ * The source and destination strings must be different objects.
+ *
+ * <p>Any {@link java.io.IOException} from dest is rethrown unchecked, wrapped in an {@link InternalError}.
+ *
+ * @param src source string
+ * @param dest destination Appendable; gets normalized src appended
+ * @return dest
+ * @stable ICU 4.6
+ */
+ public abstract Appendable normalize(CharSequence src, Appendable dest);
+
+ /**
+ * Appends the normalized form of the second string to the first string
+ * (merging them at the boundary) and returns the first string.
+ * The result is normalized if the first string was normalized.
+ * The first and second strings must be different objects.
+ * @param first string, should be normalized
+ * @param second string, will be normalized
+ * @return first
+ * @stable ICU 4.4
+ */
+ public abstract StringBuilder normalizeSecondAndAppend(
+ StringBuilder first, CharSequence second);
+
+ /**
+ * Appends the second string to the first string
+ * (merging them at the boundary) and returns the first string.
+ * The result is normalized if both the strings were normalized.
+ * The first and second strings must be different objects.
+ * @param first string, should be normalized
+ * @param second string, should be normalized
+ * @return first
+ * @stable ICU 4.4
+ */
+ public abstract StringBuilder append(StringBuilder first, CharSequence second);
+
+ /**
+ * Gets the decomposition mapping of c.
+ * Roughly equivalent to normalizing the String form of c
+ * on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function
+ * returns null if c does not have a decomposition mapping in this instance's data.
+ * This function is independent of the mode of the Normalizer2.
+ * @param c code point
+ * @return c's decomposition mapping, if any; otherwise null
+ * @stable ICU 4.6
+ */
+ public abstract String getDecomposition(int c);
+
+ /**
+ * Gets the combining class of c.
+ * The default implementation returns 0
+ * but all standard implementations return the Unicode Canonical_Combining_Class value.
+ * @param c code point
+ * @return c's combining class
+ * @stable ICU 49
+ */
+ public int getCombiningClass(int c) { return 0; }
+
+ /**
+ * Tests if the string is normalized.
+ * Internally, in cases where the quickCheck() method would return "maybe"
+ * (which is only possible for the two COMPOSE modes) this method
+ * resolves to "yes" or "no" to provide a definitive result,
+ * at the cost of doing more work in those cases.
+ * @param s input string
+ * @return true if s is normalized
+ * @stable ICU 4.4
+ */
+ public abstract boolean isNormalized(CharSequence s);
+
+ /**
+ * Returns the end of the normalized substring of the input string.
+ * In other words, with <code>end=spanQuickCheckYes(s);</code>
+ * the substring <code>s.subSequence(0, end)</code>
+ * will pass the quick check with a "yes" result.
+ * <p>
+ * The returned end index is usually one or more characters before the
+ * "no" or "maybe" character: The end index is at a normalization boundary.
+ * (See the class documentation for more about normalization boundaries.)
+ * <p>
+ * When the goal is a normalized string and most input strings are expected
+ * to be normalized already, then call this method,
+ * and if it returns a prefix shorter than the input string,
+ * copy that prefix and use normalizeSecondAndAppend() for the remainder.
+ * @param s input string
+ * @return "yes" span end index
+ * @stable ICU 4.4
+ */
+ public abstract int spanQuickCheckYes(CharSequence s);
+
+ /**
+ * Tests if the character always has a normalization boundary before it,
+ * regardless of context.
+ * If true, then the character does not normalization-interact with
+ * preceding characters.
+ * In other words, a string containing this character can be normalized
+ * by processing portions before this character and starting from this
+ * character independently.
+ * This is used for iterative normalization. See the class documentation for details.
+ * @param c character to test
+ * @return true if c has a normalization boundary before it
+ * @stable ICU 4.4
+ */
+ public abstract boolean hasBoundaryBefore(int c);
+
+ /**
+ * Sole constructor. (For invocation by subclass constructors,
+ * typically implicit.)
+ * @internal
+ * deprecated This API is ICU internal only.
+ */
+ protected Normalizer2() {
+ }
+}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerBase.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerBase.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,18 +22,13 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
*******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 2000-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
-
package sun.text.normalizer;
import java.text.CharacterIterator;
@@ -125,8 +120,8 @@
*
* normalize(FCD) may be implemented with NFD.
*
- * For more details on FCD see the collation design document:
- * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
+ * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
+ * http://www.unicode.org/notes/tn5/#FCD
*
* ICU collation performs either NFD or FCD normalization automatically if
* normalization is turned on for the collator object. Beyond collation and
@@ -138,26 +133,88 @@
* often do not encode any combining marks by themselves. For conversion to such
* character encodings the Unicode text needs to be normalized to NFC.
* For more usage examples, see the Unicode Standard Annex.
+ *
+ * Note: The Normalizer class also provides API for iterative normalization.
+ * While the setIndex() and getIndex() refer to indices in the
+ * underlying Unicode input text, the next() and previous() methods
+ * iterate through characters in the normalized output.
+ * This means that there is not necessarily a one-to-one correspondence
+ * between characters returned by next() and previous() and the indices
+ * passed to and returned from setIndex() and getIndex().
+ * It is for this reason that Normalizer does not implement the CharacterIterator interface.
+ *
* @stable ICU 2.8
*/
-
+// Original filename in ICU4J: Normalizer.java
public final class NormalizerBase implements Cloneable {
- //-------------------------------------------------------------------------
- // Private data
- //-------------------------------------------------------------------------
- private char[] buffer = new char[100];
- private int bufferStart = 0;
- private int bufferPos = 0;
- private int bufferLimit = 0;
-
// The input text and our position in it
private UCharacterIterator text;
- private Mode mode = NFC;
- private int options = 0;
+ private Normalizer2 norm2;
+ private Mode mode;
+ private int options;
+
+ // The normalization buffer is the result of normalization
+ // of the source in [currentIndex..nextIndex].
private int currentIndex;
private int nextIndex;
+ // A buffer for holding intermediate results
+ private StringBuilder buffer;
+ private int bufferPos;
+
+ // Helper classes to defer loading of normalization data.
+ private static final class ModeImpl {
+ private ModeImpl(Normalizer2 n2) {
+ normalizer2 = n2;
+ }
+ private final Normalizer2 normalizer2;
+ }
+
+ private static final class NFDModeImpl {
+ private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
+ }
+
+ private static final class NFKDModeImpl {
+ private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
+ }
+
+ private static final class NFCModeImpl {
+ private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
+ }
+
+ private static final class NFKCModeImpl {
+ private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
+ }
+
+ private static final class Unicode32 {
+ private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
+ }
+
+ private static final class NFD32ModeImpl {
+ private static final ModeImpl INSTANCE =
+ new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(),
+ Unicode32.INSTANCE));
+ }
+
+ private static final class NFKD32ModeImpl {
+ private static final ModeImpl INSTANCE =
+ new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(),
+ Unicode32.INSTANCE));
+ }
+
+ private static final class NFC32ModeImpl {
+ private static final ModeImpl INSTANCE =
+ new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(),
+ Unicode32.INSTANCE));
+ }
+
+ private static final class NFKC32ModeImpl {
+ private static final ModeImpl INSTANCE =
+ new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(),
+ Unicode32.INSTANCE));
+ }
+
/**
* Options bit set value to select Unicode 3.2 normalization
* (except NormalizationCorrections).
@@ -166,6 +223,17 @@
*/
public static final int UNICODE_3_2=0x20;
+ public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2;
+
+ /*
+ * Default option for the latest Unicode normalization. This option is
+ * provided mainly for testing.
+ * The value zero means that normalization is done with the fixes for
+ * - Corrigendum 4 (Five CJK Canonical Mapping Errors)
+ * - Corrigendum 5 (Normalization Idempotency)
+ */
+ public static final int UNICODE_LATEST = 0x00;
+
/**
* Constant indicating that the end of the iteration has been reached.
* This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
@@ -175,101 +243,80 @@
/**
* Constants for normalization modes.
+ * <p>
+ * The Mode class is not intended for public subclassing.
+ * Only the Mode constants provided by the Normalizer class should be used,
+ * and any fields or methods should not be called or overridden by users.
* @stable ICU 2.8
*/
- public static class Mode {
- private int modeValue;
- private Mode(int value) {
- modeValue = value;
- }
+ public static abstract class Mode {
/**
- * This method is used for method dispatch
- * @stable ICU 2.6
+ * Sole constructor
+ * @internal
+ * @deprecated This API is ICU internal only.
*/
- protected int normalize(char[] src, int srcStart, int srcLimit,
- char[] dest,int destStart,int destLimit,
- UnicodeSet nx) {
- int srcLen = (srcLimit - srcStart);
- int destLen = (destLimit - destStart);
- if( srcLen > destLen ) {
- return srcLen;
- }
- System.arraycopy(src,srcStart,dest,destStart,srcLen);
- return srcLen;
- }
-
- /**
- * This method is used for method dispatch
- * @stable ICU 2.6
- */
- protected int normalize(char[] src, int srcStart, int srcLimit,
- char[] dest,int destStart,int destLimit,
- int options) {
- return normalize( src, srcStart, srcLimit,
- dest,destStart,destLimit,
- NormalizerImpl.getNX(options)
- );
- }
-
- /**
- * This method is used for method dispatch
- * @stable ICU 2.6
- */
- protected String normalize(String src, int options) {
- return src;
+ @Deprecated
+ protected Mode() {
}
/**
- * This method is used for method dispatch
- * @stable ICU 2.8
- */
- protected int getMinC() {
- return -1;
- }
-
- /**
- * This method is used for method dispatch
- * @stable ICU 2.8
+ * @internal
+ * @deprecated This API is ICU internal only.
*/
- protected int getMask() {
- return -1;
- }
+ @Deprecated
+ protected abstract Normalizer2 getNormalizer2(int options);
+ }
- /**
- * This method is used for method dispatch
- * @stable ICU 2.8
- */
- protected IsPrevBoundary getPrevBoundary() {
- return null;
+ private static Mode toMode(Normalizer.Form form) {
+ switch (form) {
+ case NFC :
+ return NFC;
+ case NFD :
+ return NFD;
+ case NFKC :
+ return NFKC;
+ case NFKD :
+ return NFKD;
}
- /**
- * This method is used for method dispatch
- * @stable ICU 2.8
- */
- protected IsNextBoundary getNextBoundary() {
- return null;
+ throw new IllegalArgumentException("Unexpected normalization form: " +
+ form);
+ }
+
+ private static final class NONEMode extends Mode {
+ protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
+ }
+
+ private static final class NFDMode extends Mode {
+ protected Normalizer2 getNormalizer2(int options) {
+ return (options&UNICODE_3_2) != 0 ?
+ NFD32ModeImpl.INSTANCE.normalizer2 :
+ NFDModeImpl.INSTANCE.normalizer2;
}
+ }
- /**
- * This method is used for method dispatch
- * @stable ICU 2.6
- */
- protected QuickCheckResult quickCheck(char[] src,int start, int limit,
- boolean allowMaybe,UnicodeSet nx) {
- if(allowMaybe) {
- return MAYBE;
- }
- return NO;
+ private static final class NFKDMode extends Mode {
+ protected Normalizer2 getNormalizer2(int options) {
+ return (options&UNICODE_3_2) != 0 ?
+ NFKD32ModeImpl.INSTANCE.normalizer2 :
+ NFKDModeImpl.INSTANCE.normalizer2;
}
+ }
- /**
- * This method is used for method dispatch
- * @stable ICU 2.8
- */
- protected boolean isNFSkippable(int c) {
- return true;
+ private static final class NFCMode extends Mode {
+ protected Normalizer2 getNormalizer2(int options) {
+ return (options&UNICODE_3_2) != 0 ?
+ NFC32ModeImpl.INSTANCE.normalizer2 :
+ NFCModeImpl.INSTANCE.normalizer2;
+ }
+ }
+
+ private static final class NFKCMode extends Mode {
+ protected Normalizer2 getNormalizer2(int options) {
+ return (options&UNICODE_3_2) != 0 ?
+ NFKC32ModeImpl.INSTANCE.normalizer2 :
+ NFKCModeImpl.INSTANCE.normalizer2;
}
}
@@ -277,290 +324,39 @@
* No decomposition/composition.
* @stable ICU 2.8
*/
- public static final Mode NONE = new Mode(1);
+ public static final Mode NONE = new NONEMode();
/**
* Canonical decomposition.
* @stable ICU 2.8
*/
- public static final Mode NFD = new NFDMode(2);
-
- private static final class NFDMode extends Mode {
- private NFDMode(int value) {
- super(value);
- }
-
- protected int normalize(char[] src, int srcStart, int srcLimit,
- char[] dest,int destStart,int destLimit,
- UnicodeSet nx) {
- int[] trailCC = new int[1];
- return NormalizerImpl.decompose(src, srcStart,srcLimit,
- dest, destStart,destLimit,
- false, trailCC,nx);
- }
-
- protected String normalize( String src, int options) {
- return decompose(src,false,options);
- }
-
- protected int getMinC() {
- return NormalizerImpl.MIN_WITH_LEAD_CC;
- }
-
- protected IsPrevBoundary getPrevBoundary() {
- return new IsPrevNFDSafe();
- }
-
- protected IsNextBoundary getNextBoundary() {
- return new IsNextNFDSafe();
- }
-
- protected int getMask() {
- return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD);
- }
-
- protected QuickCheckResult quickCheck(char[] src,int start,
- int limit,boolean allowMaybe,
- UnicodeSet nx) {
- return NormalizerImpl.quickCheck(
- src, start,limit,
- NormalizerImpl.getFromIndexesArr(
- NormalizerImpl.INDEX_MIN_NFD_NO_MAYBE
- ),
- NormalizerImpl.QC_NFD,
- 0,
- allowMaybe,
- nx
- );
- }
-
- protected boolean isNFSkippable(int c) {
- return NormalizerImpl.isNFSkippable(c,this,
- (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD)
- );
- }
- }
+ public static final Mode NFD = new NFDMode();
/**
* Compatibility decomposition.
* @stable ICU 2.8
*/
- public static final Mode NFKD = new NFKDMode(3);
-
- private static final class NFKDMode extends Mode {
- private NFKDMode(int value) {
- super(value);
- }
-
- protected int normalize(char[] src, int srcStart, int srcLimit,
- char[] dest,int destStart,int destLimit,
- UnicodeSet nx) {
- int[] trailCC = new int[1];
- return NormalizerImpl.decompose(src, srcStart,srcLimit,
- dest, destStart,destLimit,
- true, trailCC, nx);
- }
-
- protected String normalize( String src, int options) {
- return decompose(src,true,options);
- }
-
- protected int getMinC() {
- return NormalizerImpl.MIN_WITH_LEAD_CC;
- }
-
- protected IsPrevBoundary getPrevBoundary() {
- return new IsPrevNFDSafe();
- }
-
- protected IsNextBoundary getNextBoundary() {
- return new IsNextNFDSafe();
- }
-
- protected int getMask() {
- return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD);
- }
-
- protected QuickCheckResult quickCheck(char[] src,int start,
- int limit,boolean allowMaybe,
- UnicodeSet nx) {
- return NormalizerImpl.quickCheck(
- src,start,limit,
- NormalizerImpl.getFromIndexesArr(
- NormalizerImpl.INDEX_MIN_NFKD_NO_MAYBE
- ),
- NormalizerImpl.QC_NFKD,
- NormalizerImpl.OPTIONS_COMPAT,
- allowMaybe,
- nx
- );
- }
-
- protected boolean isNFSkippable(int c) {
- return NormalizerImpl.isNFSkippable(c, this,
- (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD)
- );
- }
- }
+ public static final Mode NFKD = new NFKDMode();
/**
* Canonical decomposition followed by canonical composition.
* @stable ICU 2.8
*/
- public static final Mode NFC = new NFCMode(4);
-
- private static final class NFCMode extends Mode{
- private NFCMode(int value) {
- super(value);
- }
- protected int normalize(char[] src, int srcStart, int srcLimit,
- char[] dest,int destStart,int destLimit,
- UnicodeSet nx) {
- return NormalizerImpl.compose( src, srcStart, srcLimit,
- dest,destStart,destLimit,
- 0, nx);
- }
-
- protected String normalize( String src, int options) {
- return compose(src, false, options);
- }
-
- protected int getMinC() {
- return NormalizerImpl.getFromIndexesArr(
- NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE
- );
- }
- protected IsPrevBoundary getPrevBoundary() {
- return new IsPrevTrueStarter();
- }
- protected IsNextBoundary getNextBoundary() {
- return new IsNextTrueStarter();
- }
- protected int getMask() {
- return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFC);
- }
- protected QuickCheckResult quickCheck(char[] src,int start,
- int limit,boolean allowMaybe,
- UnicodeSet nx) {
- return NormalizerImpl.quickCheck(
- src,start,limit,
- NormalizerImpl.getFromIndexesArr(
- NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE
- ),
- NormalizerImpl.QC_NFC,
- 0,
- allowMaybe,
- nx
- );
- }
- protected boolean isNFSkippable(int c) {
- return NormalizerImpl.isNFSkippable(c,this,
- ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY|
- (NormalizerImpl.QC_NFC & NormalizerImpl.QC_ANY_NO)
- )
- );
- }
- };
-
- /**
- * Compatibility decomposition followed by canonical composition.
- * @stable ICU 2.8
- */
- public static final Mode NFKC =new NFKCMode(5);
+ public static final Mode NFC = new NFCMode();
- private static final class NFKCMode extends Mode{
- private NFKCMode(int value) {
- super(value);
- }
- protected int normalize(char[] src, int srcStart, int srcLimit,
- char[] dest,int destStart,int destLimit,
- UnicodeSet nx) {
- return NormalizerImpl.compose(src, srcStart,srcLimit,
- dest, destStart,destLimit,
- NormalizerImpl.OPTIONS_COMPAT, nx);
- }
-
- protected String normalize( String src, int options) {
- return compose(src, true, options);
- }
- protected int getMinC() {
- return NormalizerImpl.getFromIndexesArr(
- NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE
- );
- }
- protected IsPrevBoundary getPrevBoundary() {
- return new IsPrevTrueStarter();
- }
- protected IsNextBoundary getNextBoundary() {
- return new IsNextTrueStarter();
- }
- protected int getMask() {
- return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKC);
- }
- protected QuickCheckResult quickCheck(char[] src,int start,
- int limit,boolean allowMaybe,
- UnicodeSet nx) {
- return NormalizerImpl.quickCheck(
- src,start,limit,
- NormalizerImpl.getFromIndexesArr(
- NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE
- ),
- NormalizerImpl.QC_NFKC,
- NormalizerImpl.OPTIONS_COMPAT,
- allowMaybe,
- nx
- );
- }
- protected boolean isNFSkippable(int c) {
- return NormalizerImpl.isNFSkippable(c, this,
- ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY|
- (NormalizerImpl.QC_NFKC & NormalizerImpl.QC_ANY_NO)
- )
- );
- }
- };
-
- /**
- * Result values for quickCheck().
- * For details see Unicode Technical Report 15.
- * @stable ICU 2.8
- */
- public static final class QuickCheckResult{
- private int resultValue;
- private QuickCheckResult(int value) {
- resultValue=value;
- }
- }
- /**
- * Indicates that string is not in the normalized format
- * @stable ICU 2.8
- */
- public static final QuickCheckResult NO = new QuickCheckResult(0);
-
- /**
- * Indicates that string is in the normalized format
- * @stable ICU 2.8
- */
- public static final QuickCheckResult YES = new QuickCheckResult(1);
-
- /**
- * Indicates it cannot be determined if string is in the normalized
- * format without further thorough checks.
- * @stable ICU 2.8
- */
- public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
+ public static final Mode NFKC =new NFKCMode();
//-------------------------------------------------------------------------
- // Constructors
+ // Iterator constructors
//-------------------------------------------------------------------------
/**
- * Creates a new {@code Normalizer} object for iterating over the
+ * Creates a new {@code NormalizerBase} object for iterating over the
* normalized form of a given string.
* <p>
* The {@code options} parameter specifies which optional
- * {@code Normalizer} features are to be enabled for this object.
- *
+ * {@code NormalizerBase} features are to be enabled for this object.
+ * <p>
* @param str The string to be normalized. The normalization
* will start at the beginning of the string.
*
@@ -576,25 +372,19 @@
this.text = UCharacterIterator.getInstance(str);
this.mode = mode;
this.options=opt;
+ norm2 = mode.getNormalizer2(opt);
+ buffer = new StringBuilder();
}
+ public NormalizerBase(String str, Mode mode) {
+ this(str, mode, 0);
+ }
+
+
/**
- * Creates a new {@code Normalizer} object for iterating over the
+ * Creates a new {@code NormalizerBase} object for iterating over the
* normalized form of the given text.
- *
- * @param iter The input text to be normalized. The normalization
- * will start at the beginning of the string.
- *
- * @param mode The normalization mode.
- */
- public NormalizerBase(CharacterIterator iter, Mode mode) {
- this(iter, mode, UNICODE_LATEST);
- }
-
- /**
- * Creates a new {@code Normalizer} object for iterating over the
- * normalized form of the given text.
- *
+ * <p>
* @param iter The input text to be normalized. The normalization
* will start at the beginning of the string.
*
@@ -607,15 +397,19 @@
* @stable ICU 2.6
*/
public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
- this.text = UCharacterIterator.getInstance(
- (CharacterIterator)iter.clone()
- );
+ this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone());
this.mode = mode;
this.options = opt;
+ norm2 = mode.getNormalizer2(opt);
+ buffer = new StringBuilder();
+ }
+
+ public NormalizerBase(CharacterIterator iter, Mode mode) {
+ this(iter, mode, 0);
}
/**
- * Clones this {@code Normalizer} object. All properties of this
+ * Clones this {@code NormalizerBase} object. All properties of this
* object are duplicated in the new object, including the cloning of any
* {@link CharacterIterator} that was passed in to the constructor
* or to {@link #setText(CharacterIterator) setText}.
@@ -628,11 +422,13 @@
try {
NormalizerBase copy = (NormalizerBase) super.clone();
copy.text = (UCharacterIterator) text.clone();
- //clone the internal buffer
- if (buffer != null) {
- copy.buffer = new char[buffer.length];
- System.arraycopy(buffer,0,copy.buffer,0,buffer.length);
- }
+ copy.mode = mode;
+ copy.options = options;
+ copy.norm2 = norm2;
+ copy.buffer = new StringBuilder(buffer);
+ copy.bufferPos = bufferPos;
+ copy.currentIndex = currentIndex;
+ copy.nextIndex = nextIndex;
return copy;
}
catch (CloneNotSupportedException e) {
@@ -640,150 +436,60 @@
}
}
- //--------------------------------------------------------------------------
- // Static Utility methods
- //--------------------------------------------------------------------------
-
/**
- * Compose a string.
- * The string will be composed according to the specified mode.
- * @param str The string to compose.
- * @param compat If true the string will be composed according to
- * NFKC rules and if false will be composed according to
- * NFC rules.
- * @param options The only recognized option is UNICODE_3_2
- * @return String The composed string
+ * Normalizes a {@code String} using the given normalization operation.
+ * <p>
+ * The {@code options} parameter specifies which optional
+ * {@code NormalizerBase} features are to be enabled for this operation.
+ * Currently the only available option is {@link #UNICODE_3_2}.
+ * If you want the default behavior corresponding to one of the standard
+ * Unicode Normalization Forms, use 0 for this argument.
+ * <p>
+ * @param str the input string to be normalized.
+ * @param mode the normalization mode
+ * @param options the optional features to be enabled.
+ * @return String the normalized string
* @stable ICU 2.6
*/
- public static String compose(String str, boolean compat, int options) {
-
- char[] dest, src;
- if (options == UNICODE_3_2_0_ORIGINAL) {
- String mappedStr = NormalizerImpl.convert(str);
- dest = new char[mappedStr.length()*MAX_BUF_SIZE_COMPOSE];
- src = mappedStr.toCharArray();
- } else {
- dest = new char[str.length()*MAX_BUF_SIZE_COMPOSE];
- src = str.toCharArray();
- }
- int destSize=0;
-
- UnicodeSet nx = NormalizerImpl.getNX(options);
-
- /* reset options bits that should only be set here or inside compose() */
- options&=~(NormalizerImpl.OPTIONS_SETS_MASK|NormalizerImpl.OPTIONS_COMPAT|NormalizerImpl.OPTIONS_COMPOSE_CONTIGUOUS);
-
- if(compat) {
- options|=NormalizerImpl.OPTIONS_COMPAT;
- }
-
- for(;;) {
- destSize=NormalizerImpl.compose(src,0,src.length,
- dest,0,dest.length,options,
- nx);
- if(destSize<=dest.length) {
- return new String(dest,0,destSize);
- } else {
- dest = new char[destSize];
- }
- }
+ public static String normalize(String str, Mode mode, int options) {
+ return mode.getNormalizer2(options).normalize(str);
}
- private static final int MAX_BUF_SIZE_COMPOSE = 2;
- private static final int MAX_BUF_SIZE_DECOMPOSE = 3;
+ public static String normalize(String str, Normalizer.Form form) {
+ return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST);
+ }
- /**
- * Decompose a string.
- * The string will be decomposed according to the specified mode.
- * @param str The string to decompose.
- * @param compat If true the string will be decomposed according to NFKD
- * rules and if false will be decomposed according to NFD
- * rules.
- * @return String The decomposed string
- * @stable ICU 2.8
- */
- public static String decompose(String str, boolean compat) {
- return decompose(str,compat,UNICODE_LATEST);
+ public static String normalize(String str, Normalizer.Form form, int options) {
+ return NormalizerBase.normalize(str, toMode(form), options);
}
/**
- * Decompose a string.
- * The string will be decomposed according to the specified mode.
- * @param str The string to decompose.
- * @param compat If true the string will be decomposed according to NFKD
- * rules and if false will be decomposed according to NFD
- * rules.
- * @param options The normalization options, ORed together (0 for no options).
- * @return String The decomposed string
+ * Test if a string is in a given normalization form.
+ * This is semantically equivalent to {@code str.equals(normalize(str, mode, options))}.
+ *
+ * Unlike quickCheck(), this function returns a definitive result,
+ * never a "maybe".
+ * For NFD, NFKD, and FCD, both functions work exactly the same.
+ * For NFC and NFKC where quickCheck may return "maybe", this function will
+ * perform further tests to arrive at a true/false result.
+ * @param str the input string to be checked to see if it is
+ * normalized
+ * @param mode the normalization mode
+ * @param options Options for use with exclusion set and tailored Normalization
+ * The only option that is currently recognized is UNICODE_3_2
+ * @see #isNormalized
* @stable ICU 2.6
*/
- public static String decompose(String str, boolean compat, int options) {
-
- int[] trailCC = new int[1];
- int destSize=0;
- UnicodeSet nx = NormalizerImpl.getNX(options);
- char[] dest;
-
- if (options == UNICODE_3_2_0_ORIGINAL) {
- String mappedStr = NormalizerImpl.convert(str);
- dest = new char[mappedStr.length()*MAX_BUF_SIZE_DECOMPOSE];
-
- for(;;) {
- destSize=NormalizerImpl.decompose(mappedStr.toCharArray(),0,mappedStr.length(),
- dest,0,dest.length,
- compat,trailCC, nx);
- if(destSize<=dest.length) {
- return new String(dest,0,destSize);
- } else {
- dest = new char[destSize];
- }
- }
- } else {
- dest = new char[str.length()*MAX_BUF_SIZE_DECOMPOSE];
-
- for(;;) {
- destSize=NormalizerImpl.decompose(str.toCharArray(),0,str.length(),
- dest,0,dest.length,
- compat,trailCC, nx);
- if(destSize<=dest.length) {
- return new String(dest,0,destSize);
- } else {
- dest = new char[destSize];
- }
- }
- }
+ public static boolean isNormalized(String str, Mode mode, int options) {
+ return mode.getNormalizer2(options).isNormalized(str);
}
- /**
- * Normalize a string.
- * The string will be normalized according to the specified normalization
- * mode and options.
- * @param src The char array to compose.
- * @param srcStart Start index of the source
- * @param srcLimit Limit index of the source
- * @param dest The char buffer to fill in
- * @param destStart Start index of the destination buffer
- * @param destLimit End index of the destination buffer
- * @param mode The normalization mode; one of Normalizer.NONE,
- * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
- * Normalizer.NFKD, Normalizer.DEFAULT
- * @param options The normalization options, ORed together (0 for no options).
- * @return int The total buffer size needed;if greater than length of
- * result, the output was truncated.
- * @exception IndexOutOfBoundsException if the target capacity is
- * less than the required length
- * @stable ICU 2.6
- */
- public static int normalize(char[] src,int srcStart, int srcLimit,
- char[] dest,int destStart, int destLimit,
- Mode mode, int options) {
- int length = mode.normalize(src,srcStart,srcLimit,dest,destStart,destLimit, options);
+ public static boolean isNormalized(String str, Normalizer.Form form) {
+ return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST);
+ }
- if(length<=(destLimit-destStart)) {
- return length;
- } else {
- throw new IndexOutOfBoundsException(Integer.toString(length));
- }
+ public static boolean isNormalized(String str, Normalizer.Form form, int options) {
+ return NormalizerBase.isNormalized(str, toMode(form), options);
}
//-------------------------------------------------------------------------
@@ -796,8 +502,8 @@
* @stable ICU 2.8
*/
public int current() {
- if(bufferPos<bufferLimit || nextNormalize()) {
- return getCodePointAt(bufferPos);
+ if(bufferPos<buffer.length() || nextNormalize()) {
+ return buffer.codePointAt(bufferPos);
} else {
return DONE;
}
@@ -811,16 +517,15 @@
* @stable ICU 2.8
*/
public int next() {
- if(bufferPos<bufferLimit || nextNormalize()) {
- int c=getCodePointAt(bufferPos);
- bufferPos+=(c>0xFFFF) ? 2 : 1;
+ if(bufferPos<buffer.length() || nextNormalize()) {
+ int c=buffer.codePointAt(bufferPos);
+ bufferPos+=Character.charCount(c);
return c;
} else {
return DONE;
}
}
-
/**
* Return the previous character in the normalized text and decrement
* the iteration position by one. If the beginning
@@ -830,8 +535,8 @@
*/
public int previous() {
if(bufferPos>0 || previousNormalize()) {
- int c=getCodePointAt(bufferPos-1);
- bufferPos-=(c>0xFFFF) ? 2 : 1;
+ int c=buffer.codePointBefore(bufferPos);
+ bufferPos-=Character.charCount(c);
return c;
} else {
return DONE;
@@ -859,8 +564,8 @@
* @stable ICU 2.8
*/
public void setIndexOnly(int index) {
- text.setIndex(index);
- currentIndex=nextIndex=index; // validates index
+ text.setIndex(index); // validates index
+ currentIndex=nextIndex=index;
clearBuffer();
}
@@ -874,7 +579,7 @@
* necessarily a one-to-one correspondence between characters returned
* by {@code next} and {@code previous} and the indices passed to and
* returned from {@code setIndex} and {@link #getIndex}.
- *
+ * <p>
* @param index the desired index in the input text.
*
* @return the first normalized character that is the result of iterating
@@ -882,11 +587,9 @@
*
* @throws IllegalArgumentException if the given index is less than
* {@link #getBeginIndex} or greater than {@link #getEndIndex}.
- * @return The codepoint as an int
- * @deprecated ICU 3.2
+ * deprecated ICU 3.2
* @obsolete ICU 3.2
*/
- @Deprecated
public int setIndex(int index) {
setIndexOnly(index);
return current();
@@ -895,7 +598,7 @@
/**
* Retrieve the index of the start of the input text. This is the begin
* index of the {@code CharacterIterator} or the start (i.e. 0) of the
- * {@code String} over which this {@code Normalizer} is iterating
+ * {@code String} over which this {@code NormalizerBase} is iterating
* @deprecated ICU 2.2. Use startIndex() instead.
* @return The codepoint as an int
* @see #startIndex
@@ -908,7 +611,7 @@
/**
* Retrieve the index of the end of the input text. This is the end index
* of the {@code CharacterIterator} or the length of the {@code String}
- * over which this {@code Normalizer} is iterating
+ * over which this {@code NormalizerBase} is iterating
* @deprecated ICU 2.2. Use endIndex() instead.
* @return The codepoint as an int
* @see #endIndex
@@ -934,7 +637,7 @@
* @stable ICU 2.8
*/
public int getIndex() {
- if(bufferPos<bufferLimit) {
+ if(bufferPos<buffer.length()) {
return currentIndex;
} else {
return nextIndex;
@@ -942,9 +645,9 @@
}
/**
- * Retrieve the index of the end of the input text. This is the end index
+ * Retrieve the index of the end of the input text. This is the end index
* of the {@code CharacterIterator} or the length of the {@code String}
- * over which this {@code Normalizer} is iterating
+ * over which this {@code NormalizerBase} is iterating
* @return The current iteration position
* @stable ICU 2.8
*/
@@ -953,7 +656,7 @@
}
//-------------------------------------------------------------------------
- // Property access methods
+ // Iterator attributes
//-------------------------------------------------------------------------
/**
* Set the normalization mode for this object.
@@ -964,18 +667,18 @@
* until the iteration is able to re-sync at the next base character.
* It is safest to call {@link #setText setText()}, {@link #first},
* {@link #last}, etc. after calling {@code setMode}.
- *
- * @param newMode the new mode for this {@code Normalizer}.
+ * <p>
+ * @param newMode the new mode for this {@code NormalizerBase}.
* The supported modes are:
* <ul>
- * <li>{@link #COMPOSE} - Unicode canonical decompositiion
- * followed by canonical composition.
- * <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decompositiion
- * follwed by canonical composition.
- * <li>{@link #DECOMP} - Unicode canonical decomposition
- * <li>{@link #DECOMP_COMPAT} - Unicode compatibility decomposition.
- * <li>{@link #NO_OP} - Do nothing but return characters
- * from the underlying input text.
+ * <li>{@link #NFC} - Unicode canonical decomposition
+ * followed by canonical composition.
+ * <li>{@link #NFKC} - Unicode compatibility decomposition
+ * followed by canonical composition.
+ * <li>{@link #NFD} - Unicode canonical decomposition
+ * <li>{@link #NFKD} - Unicode compatibility decomposition.
+ * <li>{@link #NONE} - Do nothing but return characters
+ * from the underlying input text.
* </ul>
*
* @see #getMode
@@ -983,9 +686,11 @@
*/
public void setMode(Mode newMode) {
mode = newMode;
+ norm2 = mode.getNormalizer2(options);
}
+
/**
- * Return the basic operation performed by this {@code Normalizer}
+ * Return the basic operation performed by this {@code NormalizerBase}
*
* @see #setMode
* @stable ICU 2.8
@@ -995,688 +700,83 @@
}
/**
- * Set the input text over which this {@code Normalizer} will iterate.
+ * Set the input text over which this {@code NormalizerBase} will iterate.
* The iteration position is set to the beginning of the input text.
* @param newText The new string to be normalized.
* @stable ICU 2.8
*/
public void setText(String newText) {
-
UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
if (newIter == null) {
- throw new InternalError("Could not create a new UCharacterIterator");
+ throw new IllegalStateException("Could not create a new UCharacterIterator");
}
text = newIter;
reset();
}
/**
- * Set the input text over which this {@code Normalizer} will iterate.
+ * Set the input text over which this {@code NormalizerBase} will iterate.
* The iteration position is set to the beginning of the input text.
* @param newText The new string to be normalized.
* @stable ICU 2.8
*/
public void setText(CharacterIterator newText) {
-
UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
if (newIter == null) {
- throw new InternalError("Could not create a new UCharacterIterator");
+ throw new IllegalStateException("Could not create a new UCharacterIterator");
}
text = newIter;
currentIndex=nextIndex=0;
clearBuffer();
}
- //-------------------------------------------------------------------------
- // Private utility methods
- //-------------------------------------------------------------------------
-
-
- /* backward iteration --------------------------------------------------- */
-
- /*
- * read backwards and get norm32
- * return 0 if the character is <minC
- * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
- * surrogate but read second!)
- */
-
- private static long getPrevNorm32(UCharacterIterator src,
- int/*unsigned*/ minC,
- int/*unsigned*/ mask,
- char[] chars) {
- long norm32;
- int ch=0;
- /* need src.hasPrevious() */
- if((ch=src.previous()) == UCharacterIterator.DONE) {
- return 0;
- }
- chars[0]=(char)ch;
- chars[1]=0;
-
- /* check for a surrogate before getting norm32 to see if we need to
- * predecrement further */
- if(chars[0]<minC) {
- return 0;
- } else if(!UTF16.isSurrogate(chars[0])) {
- return NormalizerImpl.getNorm32(chars[0]);
- } else if(UTF16.isLeadSurrogate(chars[0]) || (src.getIndex()==0)) {
- /* unpaired surrogate */
- chars[1]=(char)src.current();
- return 0;
- } else if(UTF16.isLeadSurrogate(chars[1]=(char)src.previous())) {
- norm32=NormalizerImpl.getNorm32(chars[1]);
- if((norm32&mask)==0) {
- /* all surrogate pairs with this lead surrogate have irrelevant
- * data */
- return 0;
- } else {
- /* norm32 must be a surrogate special */
- return NormalizerImpl.getNorm32FromSurrogatePair(norm32,chars[0]);
- }
- } else {
- /* unpaired second surrogate, undo the c2=src.previous() movement */
- src.moveIndex( 1);
- return 0;
- }
+ private void clearBuffer() {
+ buffer.setLength(0);
+ bufferPos=0;
}
- private interface IsPrevBoundary{
- public boolean isPrevBoundary(UCharacterIterator src,
- int/*unsigned*/ minC,
- int/*unsigned*/ mask,
- char[] chars);
- }
- private static final class IsPrevNFDSafe implements IsPrevBoundary{
- /*
- * for NF*D:
- * read backwards and check if the lead combining class is 0
- * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
- * surrogate but read second!)
- */
- public boolean isPrevBoundary(UCharacterIterator src,
- int/*unsigned*/ minC,
- int/*unsigned*/ ccOrQCMask,
- char[] chars) {
-
- return NormalizerImpl.isNFDSafe(getPrevNorm32(src, minC,
- ccOrQCMask, chars),
- ccOrQCMask,
- ccOrQCMask& NormalizerImpl.QC_MASK);
+ private boolean nextNormalize() {
+ clearBuffer();
+ currentIndex=nextIndex;
+ text.setIndex(nextIndex);
+ // Skip at least one character so we make progress.
+ int c=text.nextCodePoint();
+ if(c<0) {
+ return false;
}
+ StringBuilder segment=new StringBuilder().appendCodePoint(c);
+ while((c=text.nextCodePoint())>=0) {
+ if(norm2.hasBoundaryBefore(c)) {
+ text.moveCodePointIndex(-1);
+ break;
+ }
+ segment.appendCodePoint(c);
+ }
+ nextIndex=text.getIndex();
+ norm2.normalize(segment, buffer);
+ return buffer.length()!=0;
}
- private static final class IsPrevTrueStarter implements IsPrevBoundary{
- /*
- * read backwards and check if the character is (or its decomposition
- * begins with) a "true starter" (cc==0 and NF*C_YES)
- * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
- * surrogate but read second!)
- */
- public boolean isPrevBoundary(UCharacterIterator src,
- int/*unsigned*/ minC,
- int/*unsigned*/ ccOrQCMask,
- char[] chars) {
- long norm32;
- int/*unsigned*/ decompQCMask;
-
- decompQCMask=(ccOrQCMask<<2)&0xf; /*decomposition quick check mask*/
- norm32=getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, chars);
- return NormalizerImpl.isTrueStarter(norm32,ccOrQCMask,decompQCMask);
- }
- }
-
- private static int findPreviousIterationBoundary(UCharacterIterator src,
- IsPrevBoundary obj,
- int/*unsigned*/ minC,
- int/*mask*/ mask,
- char[] buffer,
- int[] startIndex) {
- char[] chars=new char[2];
- boolean isBoundary;
-
- /* fill the buffer from the end backwards */
- startIndex[0] = buffer.length;
- chars[0]=0;
- while(src.getIndex()>0 && chars[0]!=UCharacterIterator.DONE) {
- isBoundary=obj.isPrevBoundary(src, minC, mask, chars);
-
- /* always write this character to the front of the buffer */
- /* make sure there is enough space in the buffer */
- if(startIndex[0] < (chars[1]==0 ? 1 : 2)) {
-
- // grow the buffer
- char[] newBuf = new char[buffer.length*2];
- /* move the current buffer contents up */
- System.arraycopy(buffer,startIndex[0],newBuf,
- newBuf.length-(buffer.length-startIndex[0]),
- buffer.length-startIndex[0]);
- //adjust the startIndex
- startIndex[0]+=newBuf.length-buffer.length;
-
- buffer=newBuf;
- newBuf=null;
-
+ private boolean previousNormalize() {
+ clearBuffer();
+ nextIndex=currentIndex;
+ text.setIndex(currentIndex);
+ StringBuilder segment=new StringBuilder();
+ int c;
+ while((c=text.previousCodePoint())>=0) {
+ if(c<=0xffff) {
+ segment.insert(0, (char)c);
+ } else {
+ segment.insert(0, Character.toChars(c));
}
-
- buffer[--startIndex[0]]=chars[0];
- if(chars[1]!=0) {
- buffer[--startIndex[0]]=chars[1];
- }
-
- /* stop if this just-copied character is a boundary */
- if(isBoundary) {
+ if(norm2.hasBoundaryBefore(c)) {
break;
}
}
-
- /* return the length of the buffer contents */
- return buffer.length-startIndex[0];
- }
-
- private static int previous(UCharacterIterator src,
- char[] dest, int destStart, int destLimit,
- Mode mode,
- boolean doNormalize,
- boolean[] pNeededToNormalize,
- int options) {
-
- IsPrevBoundary isPreviousBoundary;
- int destLength, bufferLength;
- int/*unsigned*/ mask;
- int c,c2;
-
- char minC;
- int destCapacity = destLimit-destStart;
- destLength=0;
-
- if(pNeededToNormalize!=null) {
- pNeededToNormalize[0]=false;
- }
- minC = (char)mode.getMinC();
- mask = mode.getMask();
- isPreviousBoundary = mode.getPrevBoundary();
-
- if(isPreviousBoundary==null) {
- destLength=0;
- if((c=src.previous())>=0) {
- destLength=1;
- if(UTF16.isTrailSurrogate((char)c)) {
- c2= src.previous();
- if(c2!= UCharacterIterator.DONE) {
- if(UTF16.isLeadSurrogate((char)c2)) {
- if(destCapacity>=2) {
- dest[1]=(char)c; // trail surrogate
- destLength=2;
- }
- // lead surrogate to be written below
- c=c2;
- } else {
- src.moveIndex(1);
- }
- }
- }
-
- if(destCapacity>0) {
- dest[0]=(char)c;
- }
- }
- return destLength;
- }
-
- char[] buffer = new char[100];
- int[] startIndex= new int[1];
- bufferLength=findPreviousIterationBoundary(src,
- isPreviousBoundary,
- minC, mask,buffer,
- startIndex);
- if(bufferLength>0) {
- if(doNormalize) {
- destLength=NormalizerBase.normalize(buffer,startIndex[0],
- startIndex[0]+bufferLength,
- dest, destStart,destLimit,
- mode, options);
-
- if(pNeededToNormalize!=null) {
- pNeededToNormalize[0]=destLength!=bufferLength ||
- Utility.arrayRegionMatches(
- buffer,0,dest,
- destStart,destLimit
- );
- }
- } else {
- /* just copy the source characters */
- if(destCapacity>0) {
- System.arraycopy(buffer,startIndex[0],dest,0,
- (bufferLength<destCapacity) ?
- bufferLength : destCapacity
- );
- }
- }
- }
-
-
- return destLength;
- }
-
-
-
- /* forward iteration ---------------------------------------------------- */
- /*
- * read forward and check if the character is a next-iteration boundary
- * if c2!=0 then (c, c2) is a surrogate pair
- */
- private interface IsNextBoundary{
- boolean isNextBoundary(UCharacterIterator src,
- int/*unsigned*/ minC,
- int/*unsigned*/ mask,
- int[] chars);
- }
- /*
- * read forward and get norm32
- * return 0 if the character is <minC
- * if c2!=0 then (c2, c) is a surrogate pair
- * always reads complete characters
- */
- private static long /*unsigned*/ getNextNorm32(UCharacterIterator src,
- int/*unsigned*/ minC,
- int/*unsigned*/ mask,
- int[] chars) {
- long norm32;
-
- /* need src.hasNext() to be true */
- chars[0]=src.next();
- chars[1]=0;
-
- if(chars[0]<minC) {
- return 0;
- }
-
- norm32=NormalizerImpl.getNorm32((char)chars[0]);
- if(UTF16.isLeadSurrogate((char)chars[0])) {
- if(src.current()!=UCharacterIterator.DONE &&
- UTF16.isTrailSurrogate((char)(chars[1]=src.current()))) {
- src.moveIndex(1); /* skip the c2 surrogate */
- if((norm32&mask)==0) {
- /* irrelevant data */
- return 0;
- } else {
- /* norm32 must be a surrogate special */
- return NormalizerImpl.getNorm32FromSurrogatePair(norm32,(char)chars[1]);
- }
- } else {
- /* unmatched surrogate */
- return 0;
- }
- }
- return norm32;
- }
-
-
- /*
- * for NF*D:
- * read forward and check if the lead combining class is 0
- * if c2!=0 then (c, c2) is a surrogate pair
- */
- private static final class IsNextNFDSafe implements IsNextBoundary{
- public boolean isNextBoundary(UCharacterIterator src,
- int/*unsigned*/ minC,
- int/*unsigned*/ ccOrQCMask,
- int[] chars) {
- return NormalizerImpl.isNFDSafe(getNextNorm32(src,minC,ccOrQCMask,chars),
- ccOrQCMask, ccOrQCMask&NormalizerImpl.QC_MASK);
- }
- }
-
- /*
- * for NF*C:
- * read forward and check if the character is (or its decomposition begins
- * with) a "true starter" (cc==0 and NF*C_YES)
- * if c2!=0 then (c, c2) is a surrogate pair
- */
- private static final class IsNextTrueStarter implements IsNextBoundary{
- public boolean isNextBoundary(UCharacterIterator src,
- int/*unsigned*/ minC,
- int/*unsigned*/ ccOrQCMask,
- int[] chars) {
- long norm32;
- int/*unsigned*/ decompQCMask;
-
- decompQCMask=(ccOrQCMask<<2)&0xf; /*decomposition quick check mask*/
- norm32=getNextNorm32(src, minC, ccOrQCMask|decompQCMask, chars);
- return NormalizerImpl.isTrueStarter(norm32, ccOrQCMask, decompQCMask);
- }
- }
-
- private static int findNextIterationBoundary(UCharacterIterator src,
- IsNextBoundary obj,
- int/*unsigned*/ minC,
- int/*unsigned*/ mask,
- char[] buffer) {
- if(src.current()==UCharacterIterator.DONE) {
- return 0;
- }
-
- /* get one character and ignore its properties */
- int[] chars = new int[2];
- chars[0]=src.next();
- buffer[0]=(char)chars[0];
- int bufferIndex = 1;
-
- if(UTF16.isLeadSurrogate((char)chars[0])&&
- src.current()!=UCharacterIterator.DONE) {
- if(UTF16.isTrailSurrogate((char)(chars[1]=src.next()))) {
- buffer[bufferIndex++]=(char)chars[1];
- } else {
- src.moveIndex(-1); /* back out the non-trail-surrogate */
- }
- }
-
- /* get all following characters until we see a boundary */
- /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff
- * is part of the string */
- while( src.current()!=UCharacterIterator.DONE) {
- if(obj.isNextBoundary(src, minC, mask, chars)) {
- /* back out the latest movement to stop at the boundary */
- src.moveIndex(chars[1]==0 ? -1 : -2);
- break;
- } else {
- if(bufferIndex+(chars[1]==0 ? 1 : 2)<=buffer.length) {
- buffer[bufferIndex++]=(char)chars[0];
- if(chars[1]!=0) {
- buffer[bufferIndex++]=(char)chars[1];
- }
- } else {
- char[] newBuf = new char[buffer.length*2];
- System.arraycopy(buffer,0,newBuf,0,bufferIndex);
- buffer = newBuf;
- buffer[bufferIndex++]=(char)chars[0];
- if(chars[1]!=0) {
- buffer[bufferIndex++]=(char)chars[1];
- }
- }
- }
- }
-
- /* return the length of the buffer contents */
- return bufferIndex;
+ currentIndex=text.getIndex();
+ norm2.normalize(segment, buffer);
+ bufferPos=buffer.length();
+ return buffer.length()!=0;
}
- private static int next(UCharacterIterator src,
- char[] dest, int destStart, int destLimit,
- NormalizerBase.Mode mode,
- boolean doNormalize,
- boolean[] pNeededToNormalize,
- int options) {
-
- IsNextBoundary isNextBoundary;
- int /*unsigned*/ mask;
- int /*unsigned*/ bufferLength;
- int c,c2;
- char minC;
- int destCapacity = destLimit - destStart;
- int destLength = 0;
- if(pNeededToNormalize!=null) {
- pNeededToNormalize[0]=false;
- }
-
- minC = (char)mode.getMinC();
- mask = mode.getMask();
- isNextBoundary = mode.getNextBoundary();
-
- if(isNextBoundary==null) {
- destLength=0;
- c=src.next();
- if(c!=UCharacterIterator.DONE) {
- destLength=1;
- if(UTF16.isLeadSurrogate((char)c)) {
- c2= src.next();
- if(c2!= UCharacterIterator.DONE) {
- if(UTF16.isTrailSurrogate((char)c2)) {
- if(destCapacity>=2) {
- dest[1]=(char)c2; // trail surrogate
- destLength=2;
- }
- // lead surrogate to be written below
- } else {
- src.moveIndex(-1);
- }
- }
- }
-
- if(destCapacity>0) {
- dest[0]=(char)c;
- }
- }
- return destLength;
- }
-
- char[] buffer=new char[100];
- int[] startIndex = new int[1];
- bufferLength=findNextIterationBoundary(src,isNextBoundary, minC, mask,
- buffer);
- if(bufferLength>0) {
- if(doNormalize) {
- destLength=mode.normalize(buffer,startIndex[0],bufferLength,
- dest,destStart,destLimit, options);
-
- if(pNeededToNormalize!=null) {
- pNeededToNormalize[0]=destLength!=bufferLength ||
- Utility.arrayRegionMatches(buffer,startIndex[0],
- dest,destStart,
- destLength);
- }
- } else {
- /* just copy the source characters */
- if(destCapacity>0) {
- System.arraycopy(buffer,0,dest,destStart,
- Math.min(bufferLength,destCapacity)
- );
- }
-
-
- }
- }
- return destLength;
- }
-
- private void clearBuffer() {
- bufferLimit=bufferStart=bufferPos=0;
- }
-
- private boolean nextNormalize() {
-
- clearBuffer();
- currentIndex=nextIndex;
- text.setIndex(nextIndex);
-
- bufferLimit=next(text,buffer,bufferStart,buffer.length,mode,true,null,options);
-
- nextIndex=text.getIndex();
- return (bufferLimit>0);
- }
-
- private boolean previousNormalize() {
-
- clearBuffer();
- nextIndex=currentIndex;
- text.setIndex(currentIndex);
- bufferLimit=previous(text,buffer,bufferStart,buffer.length,mode,true,null,options);
-
- currentIndex=text.getIndex();
- bufferPos = bufferLimit;
- return bufferLimit>0;
- }
-
- private int getCodePointAt(int index) {
- if( UTF16.isSurrogate(buffer[index])) {
- if(UTF16.isLeadSurrogate(buffer[index])) {
- if((index+1)<bufferLimit &&
- UTF16.isTrailSurrogate(buffer[index+1])) {
- return UCharacterProperty.getRawSupplementary(
- buffer[index],
- buffer[index+1]
- );
- }
- }else if(UTF16.isTrailSurrogate(buffer[index])) {
- if(index>0 && UTF16.isLeadSurrogate(buffer[index-1])) {
- return UCharacterProperty.getRawSupplementary(
- buffer[index-1],
- buffer[index]
- );
- }
- }
- }
- return buffer[index];
-
- }
-
- /**
- * Internal API
- * @internal
- */
- public static boolean isNFSkippable(int c, Mode mode) {
- return mode.isNFSkippable(c);
- }
-
- //
- // Options
- //
-
- /*
- * Default option for Unicode 3.2.0 normalization.
- * Corrigendum 4 was fixed in Unicode 3.2.0 but isn't supported in
- * IDNA/StringPrep.
- * The public review issue #29 was fixed in Unicode 4.1.0. Corrigendum 5
- * allowed Unicode 3.2 to 4.0.1 to apply the fix for PRI #29, but it isn't
- * supported by IDNA/StringPrep as well as Corrigendum 4.
- */
- public static final int UNICODE_3_2_0_ORIGINAL =
- UNICODE_3_2 |
- NormalizerImpl.WITHOUT_CORRIGENDUM4_CORRECTIONS |
- NormalizerImpl.BEFORE_PRI_29;
-
- /*
- * Default option for the latest Unicode normalization. This option is
- * provided mainly for testing.
- * The value zero means that normalization is done with the fixes for
- * - Corrigendum 4 (Five CJK Canonical Mapping Errors)
- * - Corrigendum 5 (Normalization Idempotency)
- */
- public static final int UNICODE_LATEST = 0x00;
-
- //
- // public constructor and methods for java.text.Normalizer and
- // sun.text.Normalizer
- //
-
- /**
- * Creates a new {@code Normalizer} object for iterating over the
- * normalized form of a given string.
- *
- * @param str The string to be normalized. The normalization
- * will start at the beginning of the string.
- *
- * @param mode The normalization mode.
- */
- public NormalizerBase(String str, Mode mode) {
- this(str, mode, UNICODE_LATEST);
- }
-
- /**
- * Normalizes a <code>String</code> using the given normalization form.
- *
- * @param str the input string to be normalized.
- * @param form the normalization form
- */
- public static String normalize(String str, Normalizer.Form form) {
- return normalize(str, form, UNICODE_LATEST);
- }
-
- /**
- * Normalizes a <code>String</code> using the given normalization form.
- *
- * @param str the input string to be normalized.
- * @param form the normalization form
- * @param options the optional features to be enabled.
- */
- public static String normalize(String str, Normalizer.Form form, int options) {
- int len = str.length();
- boolean asciiOnly = true;
- if (len < 80) {
- for (int i = 0; i < len; i++) {
- if (str.charAt(i) > 127) {
- asciiOnly = false;
- break;
- }
- }
- } else {
- char[] a = str.toCharArray();
- for (int i = 0; i < len; i++) {
- if (a[i] > 127) {
- asciiOnly = false;
- break;
- }
- }
- }
-
- switch (form) {
- case NFC :
- return asciiOnly ? str : NFC.normalize(str, options);
- case NFD :
- return asciiOnly ? str : NFD.normalize(str, options);
- case NFKC :
- return asciiOnly ? str : NFKC.normalize(str, options);
- case NFKD :
- return asciiOnly ? str : NFKD.normalize(str, options);
- }
-
- throw new IllegalArgumentException("Unexpected normalization form: " +
- form);
- }
-
- /**
- * Test if a string is in a given normalization form.
- * This is semantically equivalent to source.equals(normalize(source, mode)).
- *
- * Unlike quickCheck(), this function returns a definitive result,
- * never a "maybe".
- * For NFD, NFKD, and FCD, both functions work exactly the same.
- * For NFC and NFKC where quickCheck may return "maybe", this function will
- * perform further tests to arrive at a true/false result.
- * @param str the input string to be checked to see if it is normalized
- * @param form the normalization form
- */
- public static boolean isNormalized(String str, Normalizer.Form form) {
- return isNormalized(str, form, UNICODE_LATEST);
- }
-
- /**
- * Test if a string is in a given normalization form.
- * This is semantically equivalent to source.equals(normalize(source, mode)).
- *
- * Unlike quickCheck(), this function returns a definitive result,
- * never a "maybe".
- * For NFD, NFKD, and FCD, both functions work exactly the same.
- * For NFC and NFKC where quickCheck may return "maybe", this function will
- * perform further tests to arrive at a true/false result.
- * @param str the input string to be checked to see if it is normalized
- * @param form the normalization form
- * @param options the optional features to be enabled.
- */
- public static boolean isNormalized(String str, Normalizer.Form form, int options) {
- switch (form) {
- case NFC:
- return (NFC.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
- case NFD:
- return (NFD.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
- case NFKC:
- return (NFKC.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
- case NFKD:
- return (NFKD.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
- }
-
- throw new IllegalArgumentException("Unexpected normalization form: " +
- form);
- }
}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerDataReader.java Tue Jul 14 16:29:08 2015 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,389 +0,0 @@
-/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
-
-package sun.text.normalizer;
-
-import java.io.DataInputStream;
-import java.io.InputStream;
-import java.io.IOException;
-
-/**
- * @author Ram Viswanadha
- */
-
- /*
- * Description of the format of unorm.icu version 2.1.
- *
- * Main change from version 1 to version 2:
- * Use of new, common Trie instead of normalization-specific tries.
- * Change to version 2.1: add third/auxiliary trie with associated data.
- *
- * For more details of how to use the data structures see the code
- * in unorm.cpp (runtime normalization code) and
- * in gennorm.c and gennorm/store.c (build-time data generation).
- *
- * For the serialized format of Trie see Trie.c/TrieHeader.
- *
- * - Overall partition
- *
- * unorm.icu customarily begins with a UDataInfo structure, see udata.h and .c.
- * After that there are the following structures:
- *
- * char indexes[INDEX_TOP]; -- INDEX_TOP=32, see enum in this file
- *
- * Trie normTrie; -- size in bytes=indexes[INDEX_TRIE_SIZE]
- *
- * char extraData[extraDataTop]; -- extraDataTop=indexes[INDEX_UCHAR_COUNT]
- * extraData[0] contains the number of units for
- * FC_NFKC_Closure (formatVersion>=2.1)
- *
- * char combiningTable[combiningTableTop]; -- combiningTableTop=indexes[INDEX_COMBINE_DATA_COUNT]
- * combiningTableTop may include one 16-bit padding unit
- * to make sure that fcdTrie is 32-bit-aligned
- *
- * Trie fcdTrie; -- size in bytes=indexes[INDEX_FCD_TRIE_SIZE]
- *
- * Trie auxTrie; -- size in bytes=indexes[INDEX_AUX_TRIE_SIZE]
- *
- *
- * The indexes array contains lengths and sizes of the following arrays and structures
- * as well as the following values:
- * indexes[INDEX_COMBINE_FWD_COUNT]=combineFwdTop
- * -- one more than the highest combining index computed for forward-only-combining characters
- * indexes[INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop
- * -- number of combining indexes computed for both-ways-combining characters
- * indexes[INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop
- * -- number of combining indexes computed for backward-only-combining characters
- *
- * indexes[INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD })
- * -- first code point with a quick check NF* value of NO/MAYBE
- *
- *
- * - Tries
- *
- * The main structures are two Trie tables ("compact arrays"),
- * each with one index array and one data array.
- * See Trie.h and Trie.c.
- *
- *
- * - Tries in unorm.icu
- *
- * The first trie (normTrie above)
- * provides data for the NF* quick checks and normalization.
- * The second trie (fcdTrie above) provides data just for FCD checks.
- *
- *
- * - norm32 data words from the first trie
- *
- * The norm32Table contains one 32-bit word "norm32" per code point.
- * It contains the following bit fields:
- * 31..16 extra data index, EXTRA_SHIFT is used to shift this field down
- * if this index is <EXTRA_INDEX_TOP then it is an index into
- * extraData[] where variable-length normalization data for this
- * code point is found
- * if this index is <EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
- * then this is a norm32 for a leading surrogate, and the index
- * value is used together with the following trailing surrogate
- * code unit in the second trie access
- * if this index is >=EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
- * then this is a norm32 for a "special" character,
- * i.e., the character is a Hangul syllable or a Jamo
- * see EXTRA_HANGUL etc.
- * generally, instead of extracting this index from the norm32 and
- * comparing it with the above constants,
- * the normalization code compares the entire norm32 value
- * with MIN_SPECIAL, SURROGATES_TOP, MIN_HANGUL etc.
- *
- * 15..8 combining class (cc) according to UnicodeData.txt
- *
- * 7..6 COMBINES_ANY flags, used in composition to see if a character
- * combines with any following or preceding character(s)
- * at all
- * 7 COMBINES_BACK
- * 6 COMBINES_FWD
- *
- * 5..0 quick check flags, set for "no" or "maybe", with separate flags for
- * each normalization form
- * the higher bits are "maybe" flags; for NF*D there are no such flags
- * the lower bits are "no" flags for all forms, in the same order
- * as the "maybe" flags,
- * which is (MSB to LSB): NFKD NFD NFKC NFC
- * 5..4 QC_ANY_MAYBE
- * 3..0 QC_ANY_NO
- * see further related constants
- *
- *
- * - Extra data per code point
- *
- * "Extra data" is referenced by the index in norm32.
- * It is variable-length data. It is only present, and only those parts
- * of it are, as needed for a given character.
- * The norm32 extra data index is added to the beginning of extraData[]
- * to get to a vector of 16-bit words with data at the following offsets:
- *
- * [-1] Combining index for composition.
- * Stored only if norm32&COMBINES_ANY .
- * [0] Lengths of the canonical and compatibility decomposition strings.
- * Stored only if there are decompositions, i.e.,
- * if norm32&(QC_NFD|QC_NFKD)
- * High byte: length of NFKD, or 0 if none
- * Low byte: length of NFD, or 0 if none
- * Each length byte also has another flag:
- * Bit 7 of a length byte is set if there are non-zero
- * combining classes (cc's) associated with the respective
- * decomposition. If this flag is set, then the decomposition
- * is preceded by a 16-bit word that contains the
- * leading and trailing cc's.
- * Bits 6..0 of a length byte are the length of the
- * decomposition string, not counting the cc word.
- * [1..n] NFD
- * [n+1..] NFKD
- *
- * Each of the two decompositions consists of up to two parts:
- * - The 16-bit words with the leading and trailing cc's.
- * This is only stored if bit 7 of the corresponding length byte
- * is set. In this case, at least one of the cc's is not zero.
- * High byte: leading cc==cc of the first code point in the decomposition string
- * Low byte: trailing cc==cc of the last code point in the decomposition string
- * - The decomposition string in UTF-16, with length code units.
- *
- *
- * - Combining indexes and combiningTable[]
- *
- * Combining indexes are stored at the [-1] offset of the extra data
- * if the character combines forward or backward with any other characters.
- * They are used for (re)composition in NF*C.
- * Values of combining indexes are arranged according to whether a character
- * combines forward, backward, or both ways:
- * forward-only < both ways < backward-only
- *
- * The index values for forward-only and both-ways combining characters
- * are indexes into the combiningTable[].
- * The index values for backward-only combining characters are simply
- * incremented from the preceding index values to be unique.
- *
- * In the combiningTable[], a variable-length list
- * of variable-length (back-index, code point) pair entries is stored
- * for each forward-combining character.
- *
- * These back-indexes are the combining indexes of both-ways or backward-only
- * combining characters that the forward-combining character combines with.
- *
- * Each list is sorted in ascending order of back-indexes.
- * Each list is terminated with the last back-index having bit 15 set.
- *
- * Each pair (back-index, code point) takes up either 2 or 3
- * 16-bit words.
- * The first word of a list entry is the back-index, with its bit 15 set if
- * this is the last pair in the list.
- *
- * The second word contains flags in bits 15..13 that determine
- * if there is a third word and how the combined character is encoded:
- * 15 set if there is a third word in this list entry
- * 14 set if the result is a supplementary character
- * 13 set if the result itself combines forward
- *
- * According to these bits 15..14 of the second word,
- * the result character is encoded as follows:
- * 00 or 01 The result is <=0x1fff and stored in bits 12..0 of
- * the second word.
- * 10 The result is 0x2000..0xffff and stored in the third word.
- * Bits 12..0 of the second word are not used.
- * 11 The result is a supplementary character.
- * Bits 9..0 of the leading surrogate are in bits 9..0 of
- * the second word.
- * Add 0xd800 to these bits to get the complete surrogate.
- * Bits 12..10 of the second word are not used.
- * The trailing surrogate is stored in the third word.
- *
- *
- * - FCD trie
- *
- * The FCD trie is very simple.
- * It is a folded trie with 16-bit data words.
- * In each word, the high byte contains the leading cc of the character,
- * and the low byte contains the trailing cc of the character.
- * These cc's are the cc's of the first and last code points in the
- * canonical decomposition of the character.
- *
- * Since all 16 bits are used for cc's, lead surrogates must be tested
- * by checking the code unit instead of the trie data.
- * This is done only if the 16-bit data word is not zero.
- * If the code unit is a leading surrogate and the data word is not zero,
- * then instead of cc's it contains the offset for the second trie lookup.
- *
- *
- * - Auxiliary trie and data
- *
- *
- * The auxiliary 16-bit trie contains data for additional properties.
- * Bits
- * 15..13 reserved
- * 12 not NFC_Skippable (f) (formatVersion>=2.2)
- * 11 flag: not a safe starter for canonical closure
- * 10 composition exclusion
- * 9.. 0 index into extraData[] to FC_NFKC_Closure string
- * (not for lead surrogate),
- * or lead surrogate offset (for lead surrogate, if 9..0 not zero)
- *
- * Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
- * (used in NormalizerTransliterator)
- *
- * A skippable character is
- * a) unassigned, or ALL of the following:
- * b) of combining class 0.
- * c) not decomposed by this normalization form.
- * AND if NFC or NFKC,
- * d) can never compose with a previous character.
- * e) can never compose with a following character.
- * f) can never change if another character is added.
- * Example: a-breve might satisfy all but f, but if you
- * add an ogonek it changes to a-ogonek + breve
- *
- * a)..e) must be tested from norm32.
- * Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
- * into the auxiliary trie.
- * The same bit is used for NFC and NFKC; (c) differs for them.
- * As usual, we build the "not skippable" flags so that unassigned
- * code points get a 0 bit.
- * This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
- * Test Hangul LV syllables entirely in code.
- *
- *
- * - FC_NFKC_Closure strings in extraData[]
- *
- * Strings are either stored as a single code unit or as the length
- * followed by that many units.
- *
- */
-final class NormalizerDataReader implements ICUBinary.Authenticate {
-
- /**
- * <p>Protected constructor.</p>
- * @param inputStream ICU uprop.dat file input stream
- * @exception IOException throw if data file fails authentication
- * @draft 2.1
- */
- protected NormalizerDataReader(InputStream inputStream)
- throws IOException{
-
- unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
- dataInputStream = new DataInputStream(inputStream);
- }
-
- // protected methods -------------------------------------------------
-
- protected int[] readIndexes(int length)throws IOException{
- int[] indexes = new int[length];
- //Read the indexes
- for (int i = 0; i <length ; i++) {
- indexes[i] = dataInputStream.readInt();
- }
- return indexes;
- }
- /**
- * <p>Reads unorm.icu, parse it into blocks of data to be stored in
- * NormalizerImpl.</P
- * @param normBytes
- * @param fcdBytes
- * @param auxBytes
- * @param extraData
- * @param combiningTable
- * @exception thrown when data reading fails
- * @draft 2.1
- */
- protected void read(byte[] normBytes, byte[] fcdBytes, byte[] auxBytes,
- char[] extraData, char[] combiningTable)
- throws IOException{
-
- //Read the bytes that make up the normTrie
- dataInputStream.readFully(normBytes);
-
- //normTrieStream= new ByteArrayInputStream(normBytes);
-
- //Read the extra data
- for(int i=0;i<extraData.length;i++){
- extraData[i]=dataInputStream.readChar();
- }
-
- //Read the combining class table
- for(int i=0; i<combiningTable.length; i++){
- combiningTable[i]=dataInputStream.readChar();
- }
-
- //Read the fcdTrie
- dataInputStream.readFully(fcdBytes);
-
-
- //Read the AuxTrie
- dataInputStream.readFully(auxBytes);
- }
-
- public byte[] getDataFormatVersion(){
- return DATA_FORMAT_VERSION;
- }
-
- public boolean isDataVersionAcceptable(byte version[])
- {
- return version[0] == DATA_FORMAT_VERSION[0]
- && version[2] == DATA_FORMAT_VERSION[2]
- && version[3] == DATA_FORMAT_VERSION[3];
- }
-
- public byte[] getUnicodeVersion(){
- return unicodeVersion;
- }
- // private data members -------------------------------------------------
-
-
- /**
- * ICU data file input stream
- */
- private DataInputStream dataInputStream;
-
- private byte[] unicodeVersion;
-
- /**
- * File format version that this class understands.
- * No guarantees are made if a older version is used
- * see store.c of gennorm for more information and values
- */
- private static final byte DATA_FORMAT_ID[] = {(byte)0x4E, (byte)0x6F,
- (byte)0x72, (byte)0x6D};
- private static final byte DATA_FORMAT_VERSION[] = {(byte)0x2, (byte)0x2,
- (byte)0x5, (byte)0x2};
-
-}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,614 +22,1898 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
*******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 2009-2014, International Business Machines
+ * Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
-import java.io.BufferedInputStream;
-import java.io.ByteArrayInputStream;
import java.io.IOException;
-import java.io.BufferedInputStream;
-import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.text.Normalizer;
-/**
- * @author Ram Viswanadha
- */
+// Original filename in ICU4J: Normalizer2Impl.java
public final class NormalizerImpl {
- // Static block for the class to initialize its own self
- static final NormalizerImpl IMPL;
+
+ public static final class Hangul {
+ /* Korean Hangul and Jamo constants */
+ public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
+ public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
+ public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
+
+ public static final int HANGUL_BASE=0xac00;
+ public static final int HANGUL_END=0xd7a3;
+
+ public static final int JAMO_L_COUNT=19;
+ public static final int JAMO_V_COUNT=21;
+ public static final int JAMO_T_COUNT=28;
+
+ public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
+ public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
+
+ public static boolean isHangul(int c) {
+ return HANGUL_BASE<=c && c<HANGUL_LIMIT;
+ }
- static
- {
- try
- {
- IMPL = new NormalizerImpl();
+ public static boolean isHangulWithoutJamoT(char c) {
+ c-=HANGUL_BASE;
+ return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
}
- catch (Exception e)
- {
- throw new RuntimeException(e.getMessage());
+
+ /**
+ * Decomposes c, which must be a Hangul syllable, into buffer
+ * and returns the length of the decomposition (2 or 3).
+ */
+ public static int decompose(int c, Appendable buffer) {
+ try {
+ c-=HANGUL_BASE;
+ int c2=c%JAMO_T_COUNT;
+ c/=JAMO_T_COUNT;
+ buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
+ buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
+ if(c2==0) {
+ return 2;
+ } else {
+ buffer.append((char)(JAMO_T_BASE+c2));
+ return 3;
+ }
+ } catch(IOException e) {
+ throw new InternalError(e);
+ }
}
}
- static final int UNSIGNED_BYTE_MASK =0xFF;
- static final long UNSIGNED_INT_MASK = 0xffffffffL;
- /*
- * This new implementation of the normalization code loads its data from
- * unorm.icu, which is generated with the gennorm tool.
- * The format of that file is described at the end of this file.
- */
- private static final String DATA_FILE_NAME = "/sun/text/resources/unorm.icu";
-
- // norm32 value constants
-
- // quick check flags 0..3 set mean "no" for their forms
- public static final int QC_NFC=0x11; /* no|maybe */
- public static final int QC_NFKC=0x22; /* no|maybe */
- public static final int QC_NFD=4; /* no */
- public static final int QC_NFKD=8; /* no */
-
- public static final int QC_ANY_NO=0xf;
-
- /* quick check flags 4..5 mean "maybe" for their forms;
- * test flags>=QC_MAYBE
+ /**
+ * Writable buffer that takes care of canonical ordering.
+ * Its Appendable methods behave like the C++ implementation's
+ * appendZeroCC() methods.
+ * <p>
+ * If dest is a StringBuilder, then the buffer writes directly to it.
+ * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
+ * until no further changes are necessary and whole segments are appended.
+ * append() methods that take combining-class values always write to the StringBuilder.
+ * Other append() methods flush and append to the Appendable.
*/
- public static final int QC_MAYBE=0x10;
- public static final int QC_ANY_MAYBE=0x30;
-
- public static final int QC_MASK=0x3f;
+ public static final class ReorderingBuffer implements Appendable {
+ public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) {
+ impl=ni;
+ app=dest;
+ if (app instanceof StringBuilder) {
+ appIsStringBuilder=true;
+ str=(StringBuilder)dest;
+ // In Java, the constructor subsumes public void init(int destCapacity)
+ str.ensureCapacity(destCapacity);
+ reorderStart=0;
+ if(str.length()==0) {
+ lastCC=0;
+ } else {
+ setIterator();
+ lastCC=previousCC();
+ // Set reorderStart after the last code point with cc<=1 if there is one.
+ if(lastCC>1) {
+ while(previousCC()>1) {}
+ }
+ reorderStart=codePointLimit;
+ }
+ } else {
+ appIsStringBuilder=false;
+ str=new StringBuilder();
+ reorderStart=0;
+ lastCC=0;
+ }
+ }
- private static final int COMBINES_FWD=0x40;
- private static final int COMBINES_BACK=0x80;
- public static final int COMBINES_ANY=0xc0;
- // UnicodeData.txt combining class in bits 15.
- private static final int CC_SHIFT=8;
- public static final int CC_MASK=0xff00;
- // 16 bits for the index to UChars and other extra data
- private static final int EXTRA_SHIFT=16;
+ public boolean isEmpty() { return str.length()==0; }
+ public int length() { return str.length(); }
+ public int getLastCC() { return lastCC; }
+
+ public StringBuilder getStringBuilder() { return str; }
+
+ public boolean equals(CharSequence s, int start, int limit) {
+ return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
+ }
- /* norm32 value constants using >16 bits */
- private static final long MIN_SPECIAL = 0xfc000000 & UNSIGNED_INT_MASK;
- private static final long SURROGATES_TOP = 0xfff00000 & UNSIGNED_INT_MASK;
- private static final long MIN_HANGUL = 0xfff00000 & UNSIGNED_INT_MASK;
-// private static final long MIN_JAMO_V = 0xfff20000 & UNSIGNED_INT_MASK;
- private static final long JAMO_V_TOP = 0xfff30000 & UNSIGNED_INT_MASK;
+ // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
+ public void setLastChar(char c) {
+ str.setCharAt(str.length()-1, c);
+ }
+ public void append(int c, int cc) {
+ if(lastCC<=cc || cc==0) {
+ str.appendCodePoint(c);
+ lastCC=cc;
+ if(cc<=1) {
+ reorderStart=str.length();
+ }
+ } else {
+ insert(c, cc);
+ }
+ }
- /* indexes[] value names */
- /* number of bytes in normalization trie */
- static final int INDEX_TRIE_SIZE = 0;
- /* number of chars in extra data */
- static final int INDEX_CHAR_COUNT = 1;
- /* number of uint16_t words for combining data */
- static final int INDEX_COMBINE_DATA_COUNT = 2;
- /* first code point with quick check NFC NO/MAYBE */
- public static final int INDEX_MIN_NFC_NO_MAYBE = 6;
- /* first code point with quick check NFKC NO/MAYBE */
- public static final int INDEX_MIN_NFKC_NO_MAYBE = 7;
- /* first code point with quick check NFD NO/MAYBE */
- public static final int INDEX_MIN_NFD_NO_MAYBE = 8;
- /* first code point with quick check NFKD NO/MAYBE */
- public static final int INDEX_MIN_NFKD_NO_MAYBE = 9;
- /* number of bytes in FCD trie */
- static final int INDEX_FCD_TRIE_SIZE = 10;
- /* number of bytes in the auxiliary trie */
- static final int INDEX_AUX_TRIE_SIZE = 11;
- /* changing this requires a new formatVersion */
- static final int INDEX_TOP = 32;
-
+ // s must be in NFD, otherwise change the implementation.
+ public void append(CharSequence s, int start, int limit,
+ int leadCC, int trailCC) {
+ if(start==limit) {
+ return;
+ }
+ if(lastCC<=leadCC || leadCC==0) {
+ if(trailCC<=1) {
+ reorderStart=str.length()+(limit-start);
+ } else if(leadCC<=1) {
+ reorderStart=str.length()+1; // Ok if not a code point boundary.
+ }
+ str.append(s, start, limit);
+ lastCC=trailCC;
+ } else {
+ int c=Character.codePointAt(s, start);
+ start+=Character.charCount(c);
+ insert(c, leadCC); // insert first code point
+ while(start<limit) {
+ c=Character.codePointAt(s, start);
+ start+=Character.charCount(c);
+ if(start<limit) {
+ // s must be in NFD, otherwise we need to use getCC().
+ leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
+ } else {
+ leadCC=trailCC;
+ }
+ append(c, leadCC);
+ }
+ }
+ }
- /* AUX constants */
- /* value constants for auxTrie */
- private static final int AUX_UNSAFE_SHIFT = 11;
- private static final int AUX_COMP_EX_SHIFT = 10;
- private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12;
+ // The following append() methods work like C++ appendZeroCC().
+ // They assume that the cc or trailCC of their input is 0.
+ // Most of them implement Appendable interface methods.
+ // Implements Appendable.append(char).
+ public ReorderingBuffer append(char c) {
+ str.append(c);
+ lastCC=0;
+ reorderStart=str.length();
+ return this;
+ }
- private static final int AUX_MAX_FNC = 1<<AUX_COMP_EX_SHIFT;
- private static final int AUX_UNSAFE_MASK = (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK);
- private static final int AUX_FNC_MASK = (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK);
- private static final int AUX_COMP_EX_MASK = (int)((1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK);
- private static final long AUX_NFC_SKIP_F_MASK = ((UNSIGNED_INT_MASK&1)<<AUX_NFC_SKIPPABLE_F_SHIFT);
-
- private static final int MAX_BUFFER_SIZE = 20;
-
- /*******************************/
+ public void appendZeroCC(int c) {
+ str.appendCodePoint(c);
+ lastCC=0;
+ reorderStart=str.length();
+ }
- /* Wrappers for Trie implementations */
- static final class NormTrieImpl implements Trie.DataManipulate{
- static IntTrie normTrie= null;
- /**
- * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
- * data the index array offset of the indexes for that lead surrogate.
- * @param property data value for a surrogate from the trie, including
- * the folding offset
- * @return data offset or 0 if there is no data for the lead surrogate
- */
- /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
- public int getFoldingOffset(int value){
- return BMP_INDEX_LENGTH+
- ((value>>(EXTRA_SHIFT-SURROGATE_BLOCK_BITS))&
- (0x3ff<<SURROGATE_BLOCK_BITS));
+ // Implements Appendable.append(CharSequence).
+ public ReorderingBuffer append(CharSequence s) {
+ if(s.length()!=0) {
+ str.append(s);
+ lastCC=0;
+ reorderStart=str.length();
+ }
+ return this;
+ }
+
+ // Implements Appendable.append(CharSequence, int, int).
+ public ReorderingBuffer append(CharSequence s, int start, int limit) {
+ if(start!=limit) {
+ str.append(s, start, limit);
+ lastCC=0;
+ reorderStart=str.length();
+ }
+ return this;
}
+ /**
+ * Flushes from the intermediate StringBuilder to the Appendable,
+ * if they are different objects.
+ * Used after recomposition.
+ * Must be called at the end when writing to a non-StringBuilder Appendable.
+ */
+ public void flush() {
+ if(appIsStringBuilder) {
+ reorderStart=str.length();
+ } else {
+ try {
+ app.append(str);
+ str.setLength(0);
+ reorderStart=0;
+ } catch(IOException e) {
+ throw new InternalError(e); // Avoid declaring "throws IOException".
+ }
+ }
+ lastCC=0;
+ }
+
+ /**
+ * Flushes from the intermediate StringBuilder to the Appendable,
+ * if they are different objects.
+ * Then appends the new text to the Appendable or StringBuilder.
+ * Normally used after quick check loops find a non-empty sequence.
+ */
+ public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
+ if(appIsStringBuilder) {
+ str.append(s, start, limit);
+ reorderStart=str.length();
+ } else {
+ try {
+ app.append(str).append(s, start, limit);
+ str.setLength(0);
+ reorderStart=0;
+ } catch(IOException e) {
+ throw new InternalError(e); // Avoid declaring "throws IOException".
+ }
+ }
+ lastCC=0;
+ return this;
+ }
+
+ public void remove() {
+ str.setLength(0);
+ lastCC=0;
+ reorderStart=0;
+ }
+
+ public void removeSuffix(int suffixLength) {
+ int oldLength=str.length();
+ str.delete(oldLength-suffixLength, oldLength);
+ lastCC=0;
+ reorderStart=str.length();
+ }
+
+ // Inserts c somewhere before the last character.
+ // Requires 0<cc<lastCC which implies reorderStart<limit.
+ private void insert(int c, int cc) {
+ for(setIterator(), skipPrevious(); previousCC()>cc;) {}
+ // insert c at codePointLimit, after the character with prevCC<=cc
+ if(c<=0xffff) {
+ str.insert(codePointLimit, (char)c);
+ if(cc<=1) {
+ reorderStart=codePointLimit+1;
+ }
+ } else {
+ str.insert(codePointLimit, Character.toChars(c));
+ if(cc<=1) {
+ reorderStart=codePointLimit+2;
+ }
+ }
+ }
+
+ private final NormalizerImpl impl;
+ private final Appendable app;
+ private final StringBuilder str;
+ private final boolean appIsStringBuilder;
+ private int reorderStart;
+ private int lastCC;
+
+ // private backward iterator
+ private void setIterator() { codePointStart=str.length(); }
+ private void skipPrevious() { // Requires 0<codePointStart.
+ codePointLimit=codePointStart;
+ codePointStart=str.offsetByCodePoints(codePointStart, -1);
+ }
+ private int previousCC() { // Returns 0 if there is no previous character.
+ codePointLimit=codePointStart;
+ if(reorderStart>=codePointStart) {
+ return 0;
+ }
+ int c=str.codePointBefore(codePointStart);
+ codePointStart-=Character.charCount(c);
+ if(c<MIN_CCC_LCCC_CP) {
+ return 0;
+ }
+ return getCCFromYesOrMaybe(impl.getNorm16(c));
+ }
+
+ private int codePointStart, codePointLimit;
}
- static final class FCDTrieImpl implements Trie.DataManipulate{
- static CharTrie fcdTrie=null;
- /**
- * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
- * data the index array offset of the indexes for that lead surrogate.
- * @param property data value for a surrogate from the trie, including
- * the folding offset
- * @return data offset or 0 if there is no data for the lead surrogate
- */
- /* fcdTrie: the folding offset is the lead FCD value itself */
- public int getFoldingOffset(int value){
- return value;
+
+ // TODO: Propose as public API on the UTF16 class.
+ // TODO: Propose widening UTF16 methods that take char to take int.
+ // TODO: Propose widening UTF16 methods that take String to take CharSequence.
+ public static final class UTF16Plus {
+ /**
+ * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
+ * is it a lead surrogate?
+ * @param c code unit or code point
+ * @return true or false
+ */
+ public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
+
+ /**
+ * Compares two CharSequence subsequences for binary equality.
+ * @param s1 first sequence
+ * @param start1 start offset in first sequence
+ * @param limit1 limit offset in first sequence
+ * @param s2 second sequence
+ * @param start2 start offset in second sequence
+ * @param limit2 limit offset in second sequence
+ * @return true if s1.subSequence(start1, limit1) contains the same text
+ * as s2.subSequence(start2, limit2)
+ */
+ public static boolean equal(CharSequence s1, int start1, int limit1,
+ CharSequence s2, int start2, int limit2) {
+ if((limit1-start1)!=(limit2-start2)) {
+ return false;
+ }
+ if(s1==s2 && start1==start2) {
+ return true;
+ }
+ while(start1<limit1) {
+ if(s1.charAt(start1++)!=s2.charAt(start2++)) {
+ return false;
+ }
+ }
+ return true;
}
}
- static final class AuxTrieImpl implements Trie.DataManipulate{
- static CharTrie auxTrie = null;
- /**
- * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
- * data the index array offset of the indexes for that lead surrogate.
- * @param property data value for a surrogate from the trie, including
- * the folding offset
- * @return data offset or 0 if there is no data for the lead surrogate
- */
- /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
- public int getFoldingOffset(int value){
- return (value &AUX_FNC_MASK)<<SURROGATE_BLOCK_BITS;
+ public NormalizerImpl() {}
+
+ private static final class IsAcceptable implements ICUBinary.Authenticate {
+ // Implements ICUBinary.Authenticate.isDataVersionAcceptable(byte[]).
+ public boolean isDataVersionAcceptable(byte version[]) {
+ return version[0]==2;
+ }
+ }
+
+ private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
+ private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2"
+
+ public NormalizerImpl load(ByteBuffer bytes) {
+ try {
+ dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
+ int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4
+ if(indexesLength<=IX_MIN_MAYBE_YES) {
+ throw new IOException("Normalizer2 data: not enough indexes");
+ }
+ int[] inIndexes=new int[indexesLength];
+ inIndexes[0]=indexesLength*4;
+ for(int i=1; i<indexesLength; ++i) {
+ inIndexes[i]=bytes.getInt();
+ }
+
+ minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
+ minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
+
+ minYesNo=inIndexes[IX_MIN_YES_NO];
+ minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
+ minNoNo=inIndexes[IX_MIN_NO_NO];
+ limitNoNo=inIndexes[IX_LIMIT_NO_NO];
+ minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
+
+ // Read the normTrie.
+ int offset=inIndexes[IX_NORM_TRIE_OFFSET];
+ int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
+ normTrie=Trie2_16.createFromSerialized(bytes);
+ int trieLength=normTrie.getSerializedLength();
+ if(trieLength>(nextOffset-offset)) {
+ throw new IOException("Normalizer2 data: not enough bytes for normTrie");
+ }
+ ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes
+
+ // Read the composition and mapping data.
+ offset=nextOffset;
+ nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
+ int numChars=(nextOffset-offset)/2;
+ char[] chars;
+ if(numChars!=0) {
+ chars=new char[numChars];
+ for(int i=0; i<numChars; ++i) {
+ chars[i]=bytes.getChar();
+ }
+ maybeYesCompositions=new String(chars);
+ extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
+ }
+
+ // smallFCD: new in formatVersion 2
+ offset=nextOffset;
+ smallFCD=new byte[0x100];
+ for(int i=0; i<0x100; ++i) {
+ smallFCD[i]=bytes.get();
+ }
+
+ // Build tccc180[].
+ // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
+ tccc180=new int[0x180];
+ int bits=0;
+ for(int c=0; c<0x180; bits>>=1) {
+ if((c&0xff)==0) {
+ bits=smallFCD[c>>8]; // one byte per 0x100 code points
+ }
+ if((bits&1)!=0) {
+ for(int i=0; i<0x20; ++i, ++c) {
+ tccc180[c]=getFCD16FromNormData(c)&0xff;
+ }
+ } else {
+ c+=0x20;
+ }
+ }
+
+ return this;
+ } catch(IOException e) {
+ throw new InternalError(e);
+ }
+ }
+
+ public NormalizerImpl load(String name) {
+ return load(ICUBinary.getRequiredData(name));
+ }
+
+ public int getNorm16(int c) {
+ return normTrie.get(c);
+ }
+
+ public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
+
+ public int getCC(int norm16) {
+ if(norm16>=MIN_NORMAL_MAYBE_YES) {
+ return norm16&0xff;
+ }
+ if(norm16<minNoNo || limitNoNo<=norm16) {
+ return 0;
+ }
+ return getCCFromNoNo(norm16);
+ }
+
+ public static int getCCFromYesOrMaybe(int norm16) {
+ return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;
+ }
+
+ /**
+ * Returns the FCD data for code point c.
+ * @param c A Unicode code point.
+ * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
+ */
+ public int getFCD16(int c) {
+ if(c<0) {
+ return 0;
+ } else if(c<0x180) {
+ return tccc180[c];
+ } else if(c<=0xffff) {
+ if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
+ }
+ return getFCD16FromNormData(c);
+ }
+
+ /** Returns the FCD data for U+0000<=c<U+0180. */
+ public int getFCD16FromBelow180(int c) { return tccc180[c]; }
+ /** Returns true if the single-or-lead code unit {@code lead} might have non-zero FCD data. */
+ public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
+ // 0<=lead<=0xffff
+ byte bits=smallFCD[lead>>8];
+ if(bits==0) { return false; }
+ return ((bits>>((lead>>5)&7))&1)!=0;
+ }
+
+ /** Gets the FCD value from the regular normalization data. */
+ public int getFCD16FromNormData(int c) {
+ // Only loops for 1:1 algorithmic mappings.
+ for(;;) {
+ int norm16=getNorm16(c);
+ if(norm16<=minYesNo) {
+ // no decomposition or Hangul syllable, all zeros
+ return 0;
+ } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
+ // combining mark
+ norm16&=0xff;
+ return norm16|(norm16<<8);
+ } else if(norm16>=minMaybeYes) {
+ return 0;
+ } else if(isDecompNoAlgorithmic(norm16)) {
+ c=mapAlgorithmic(c, norm16);
+ } else {
+ // c decomposes, get everything from the variable-length extra data
+ int firstUnit=extraData.charAt(norm16);
+ if((firstUnit&MAPPING_LENGTH_MASK)==0) {
+ // A character that is deleted (maps to an empty string) must
+ // get the worst-case lccc and tccc values because arbitrary
+ // characters on both sides will become adjacent.
+ return 0x1ff;
+ } else {
+ int fcd16=firstUnit>>8; // tccc
+ if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
+ fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc
+ }
+ return fcd16;
+ }
+ }
+ }
+ }
+
+ /**
+ * Gets the decomposition for one code point.
+ * @param c code point
+ * @return c's decomposition, if it has one; returns null if it does not have a decomposition
+ */
+ public String getDecomposition(int c) {
+ int decomp=-1;
+ int norm16;
+ for(;;) {
+ if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
+ // c does not decompose
+ } else if(isHangul(norm16)) {
+ // Hangul syllable: decompose algorithmically
+ StringBuilder buffer=new StringBuilder();
+ Hangul.decompose(c, buffer);
+ return buffer.toString();
+ } else if(isDecompNoAlgorithmic(norm16)) {
+ decomp=c=mapAlgorithmic(c, norm16);
+ continue;
+ } else {
+ // c decomposes, get everything from the variable-length extra data
+ int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK;
+ return extraData.substring(norm16, norm16+length);
+ }
+ if(decomp<0) {
+ return null;
+ } else {
+ return UTF16.valueOf(decomp);
+ }
}
}
- /****************************************************/
+ public static final int MIN_CCC_LCCC_CP=0x300;
+ public static final int MIN_YES_YES_WITH_CC=0xff01;
+ public static final int JAMO_VT=0xff00;
+ public static final int MIN_NORMAL_MAYBE_YES=0xfe00;
+ public static final int MAX_DELTA=0x40;
- private static FCDTrieImpl fcdTrieImpl;
- private static NormTrieImpl normTrieImpl;
- private static AuxTrieImpl auxTrieImpl;
- private static int[] indexes;
- private static char[] combiningTable;
- private static char[] extraData;
+ // Byte offsets from the start of the data, after the generic header.
+ public static final int IX_NORM_TRIE_OFFSET=0;
+ public static final int IX_EXTRA_DATA_OFFSET=1;
+ public static final int IX_SMALL_FCD_OFFSET=2;
+
+ // Code point thresholds for quick check codes.
+ public static final int IX_MIN_DECOMP_NO_CP=8;
+ public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
- private static boolean isDataLoaded;
- private static boolean isFormatVersion_2_1;
- private static boolean isFormatVersion_2_2;
- private static byte[] unicodeVersion;
+ // Norm16 value thresholds for quick check combinations and types of extra data.
+ // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
+ public static final int IX_MIN_YES_NO=10;
+ public static final int IX_MIN_NO_NO=11;
+ public static final int IX_LIMIT_NO_NO=12;
+ public static final int IX_MIN_MAYBE_YES=13;
+
+ // Mappings only in [minYesNoMappingsOnly..minNoNo[.
+ public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
- /**
- * Default buffer size of datafile
- */
- private static final int DATA_BUFFER_SIZE = 25000;
+ public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
+ public static final int MAPPING_LENGTH_MASK=0x1f;
- /**
- * FCD check: everything below this code point is known to have a 0
- * lead combining class
- */
- public static final int MIN_WITH_LEAD_CC=0x300;
+ public static final int COMP_1_LAST_TUPLE=0x8000;
+ public static final int COMP_1_TRIPLE=1;
+ public static final int COMP_1_TRAIL_LIMIT=0x3400;
+ public static final int COMP_1_TRAIL_MASK=0x7ffe;
+ public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit
+ public static final int COMP_2_TRAIL_SHIFT=6;
+ public static final int COMP_2_TRAIL_MASK=0xffc0;
+ // higher-level functionality ------------------------------------------ ***
/**
- * Bit 7 of the length byte for a decomposition string in extra data is
- * a flag indicating whether the decomposition string is
- * preceded by a 16-bit word with the leading and trailing cc
- * of the decomposition (like for A-umlaut);
- * if not, then both cc's are zero (like for compatibility ideographs).
+ * Decomposes s[src, limit[ and writes the result to dest.
+ * limit is the exclusive end index in s; unlike the C++ original it cannot be omitted.
+ * destLengthEstimate is the initial dest buffer capacity; if negative, limit-src is used.
*/
- private static final int DECOMP_FLAG_LENGTH_HAS_CC=0x80;
- /**
- * Bits 6..0 of the length byte contain the actual length.
- */
- private static final int DECOMP_LENGTH_MASK=0x7f;
+ public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
+ int destLengthEstimate) {
+ if(destLengthEstimate<0) {
+ destLengthEstimate=limit-src;
+ }
+ dest.setLength(0);
+ ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
+ decompose(s, src, limit, buffer);
+ }
+
+ // Dual functionality:
+ // buffer!=null: normalize
+ // buffer==null: isNormalized/quickCheck/spanQuickCheckYes
+ public int decompose(CharSequence s, int src, int limit,
+ ReorderingBuffer buffer) {
+ int minNoCP=minDecompNoCP;
+
+ int prevSrc;
+ int c=0;
+ int norm16=0;
+
+ // only for quick check
+ int prevBoundary=src;
+ int prevCC=0;
+
+ for(;;) {
+ // count code units below the minimum or with irrelevant data for the quick check
+ for(prevSrc=src; src!=limit;) {
+ if( (c=s.charAt(src))<minNoCP ||
+ isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
+ ) {
+ ++src;
+ } else if(!UTF16.isSurrogate((char)c)) {
+ break;
+ } else {
+ char c2;
+ if(UTF16Plus.isSurrogateLead(c)) {
+ if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
+ c=Character.toCodePoint((char)c, c2);
+ }
+ } else /* trail surrogate */ {
+ if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
+ --src;
+ c=Character.toCodePoint(c2, (char)c);
+ }
+ }
+ if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
+ src+=Character.charCount(c);
+ } else {
+ break;
+ }
+ }
+ }
+ // copy these code units all at once
+ if(src!=prevSrc) {
+ if(buffer!=null) {
+ buffer.flushAndAppendZeroCC(s, prevSrc, src);
+ } else {
+ prevCC=0;
+ prevBoundary=src;
+ }
+ }
+ if(src==limit) {
+ break;
+ }
+
+ // Check one above-minimum, relevant code point.
+ src+=Character.charCount(c);
+ if(buffer!=null) {
+ decompose(c, norm16, buffer);
+ } else {
+ if(isDecompYes(norm16)) {
+ int cc=getCCFromYesOrMaybe(norm16);
+ if(prevCC<=cc || cc==0) {
+ prevCC=cc;
+ if(cc<=1) {
+ prevBoundary=src;
+ }
+ continue;
+ }
+ }
+ return prevBoundary; // "no" or cc out of order
+ }
+ }
+ return src;
+ }
+
+ public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
+ int limit=s.length();
+ if(limit==0) {
+ return;
+ }
+ if(doDecompose) {
+ decompose(s, 0, limit, buffer);
+ return;
+ }
+ // Just merge the strings at the boundary.
+ int c=Character.codePointAt(s, 0);
+ int src=0;
+ int firstCC, prevCC, cc;
+ firstCC=prevCC=cc=getCC(getNorm16(c));
+ while(cc!=0) {
+ prevCC=cc;
+ src+=Character.charCount(c);
+ if(src>=limit) {
+ break;
+ }
+ c=Character.codePointAt(s, src);
+ cc=getCC(getNorm16(c));
+ };
+ buffer.append(s, 0, src, firstCC, prevCC);
+ buffer.append(s, src, limit);
+ }
+
+ // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
+ // doCompose: normalize
+ // !doCompose: isNormalized (buffer must be empty and initialized)
+ public boolean compose(CharSequence s, int src, int limit,
+ boolean onlyContiguous,
+ boolean doCompose,
+ ReorderingBuffer buffer) {
+ int minNoMaybeCP=minCompNoMaybeCP;
+
+ /*
+ * prevBoundary points to the last character before the current one
+ * that has a composition boundary before it with ccc==0 and quick check "yes".
+ * Keeping track of prevBoundary saves us looking for a composition boundary
+ * when we find a "no" or "maybe".
+ *
+ * When we back out from prevSrc back to prevBoundary,
+ * then we also remove those same characters (which had been simply copied
+ * or canonically-order-inserted) from the ReorderingBuffer.
+ * Therefore, at all times, the [prevBoundary..prevSrc[ source units
+ * must correspond 1:1 to destination units at the end of the destination buffer.
+ */
+ int prevBoundary=src;
+ int prevSrc;
+ int c=0;
+ int norm16=0;
+
+ // only for isNormalized
+ int prevCC=0;
- /** Length of the BMP portion of the index (stage 1) array. */
- private static final int BMP_INDEX_LENGTH=0x10000>>Trie.INDEX_STAGE_1_SHIFT_;
- /** Number of bits of a trail surrogate that are used in index table
- * lookups.
- */
- private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_;
+ for(;;) {
+ // count code units below the minimum or with irrelevant data for the quick check
+ for(prevSrc=src; src!=limit;) {
+ if( (c=s.charAt(src))<minNoMaybeCP ||
+ isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
+ ) {
+ ++src;
+ } else if(!UTF16.isSurrogate((char)c)) {
+ break;
+ } else {
+ char c2;
+ if(UTF16Plus.isSurrogateLead(c)) {
+ if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
+ c=Character.toCodePoint((char)c, c2);
+ }
+ } else /* trail surrogate */ {
+ if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
+ --src;
+ c=Character.toCodePoint(c2, (char)c);
+ }
+ }
+ if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
+ src+=Character.charCount(c);
+ } else {
+ break;
+ }
+ }
+ }
+ // copy these code units all at once
+ if(src!=prevSrc) {
+ if(src==limit) {
+ if(doCompose) {
+ buffer.flushAndAppendZeroCC(s, prevSrc, src);
+ }
+ break;
+ }
+ // Set prevBoundary to the last character in the quick check loop.
+ prevBoundary=src-1;
+ if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
+ Character.isHighSurrogate(s.charAt(prevBoundary-1))
+ ) {
+ --prevBoundary;
+ }
+ if(doCompose) {
+ // The last "quick check yes" character is excluded from the
+ // flush-and-append call in case it needs to be modified.
+ buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
+ buffer.append(s, prevBoundary, src);
+ } else {
+ prevCC=0;
+ }
+ // The start of the current character (c).
+ prevSrc=src;
+ } else if(src==limit) {
+ break;
+ }
+ src+=Character.charCount(c);
+ /*
+ * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
+ * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
+ * or has ccc!=0.
+ * Check for Jamo V/T, then for regular characters.
+ * c is not a Hangul syllable or Jamo L because those have "yes" properties.
+ */
+ if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
+ char prev=s.charAt(prevSrc-1);
+ boolean needToDecompose=false;
+ if(c<Hangul.JAMO_T_BASE) {
+ // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
+ prev-=Hangul.JAMO_L_BASE;
+ if(prev<Hangul.JAMO_L_COUNT) {
+ if(!doCompose) {
+ return false;
+ }
+ char syllable=(char)
+ (Hangul.HANGUL_BASE+
+ (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
+ Hangul.JAMO_T_COUNT);
+ char t;
+ if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
+ ++src;
+ syllable+=t; // The next character was a Jamo T.
+ prevBoundary=src;
+ buffer.setLastChar(syllable);
+ continue;
+ }
+ // If we see L+V+x where x!=T then we drop to the slow path,
+ // decompose and recompose.
+ // This is to deal with NFKC finding normal L and V but a
+ // compatibility variant of a T. We need to either fully compose that
+ // combination here (which would complicate the code and may not work
+ // with strange custom data) or use the slow path -- or else our replacing
+ // two input characters (L+V) with one output character (LV syllable)
+ // would violate the invariant that [prevBoundary..prevSrc[ has the same
+ // length as what we appended to the buffer since prevBoundary.
+ needToDecompose=true;
+ }
+ } else if(Hangul.isHangulWithoutJamoT(prev)) {
+ // c is a Jamo Trailing consonant,
+ // compose with previous Hangul LV that does not contain a Jamo T.
+ if(!doCompose) {
+ return false;
+ }
+ buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE));
+ prevBoundary=src;
+ continue;
+ }
+ if(!needToDecompose) {
+ // The Jamo V/T did not compose into a Hangul syllable.
+ if(doCompose) {
+ buffer.append((char)c);
+ } else {
+ prevCC=0;
+ }
+ continue;
+ }
+ }
+ /*
+ * Source buffer pointers:
+ *
+ * all done quick check current char not yet
+ * "yes" but (c) processed
+ * may combine
+ * forward
+ * [-------------[-------------[-------------[-------------[
+ * | | | | |
+ * orig. src prevBoundary prevSrc src limit
+ *
+ *
+ * Destination buffer pointers inside the ReorderingBuffer:
+ *
+ * all done might take not filled yet
+ * characters for
+ * reordering
+ * [-------------[-------------[-------------[
+ * | | | |
+ * start reorderStart limit |
+ * +remainingCap.+
+ */
+ if(norm16>=MIN_YES_YES_WITH_CC) {
+ int cc=norm16&0xff; // cc!=0
+ if( onlyContiguous && // FCC
+ (doCompose ? buffer.getLastCC() : prevCC)==0 &&
+ prevBoundary<prevSrc &&
+ // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
+ // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
+ // passed the quick check "yes && ccc==0" test.
+ // Check whether the last character was a "yesYes" or a "yesNo".
+ // If a "yesNo", then we get its trailing ccc from its
+ // mapping and check for canonical order.
+ // All other cases are ok.
+ getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
+ ) {
+ // Fails FCD test, need to decompose and contiguously recompose.
+ if(!doCompose) {
+ return false;
+ }
+ } else if(doCompose) {
+ buffer.append(c, cc);
+ continue;
+ } else if(prevCC<=cc) {
+ prevCC=cc;
+ continue;
+ } else {
+ return false;
+ }
+ } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
+ return false;
+ }
- // public utility
- public static int getFromIndexesArr(int index){
- return indexes[index];
- }
+ /*
+ * Find appropriate boundaries around this character,
+ * decompose the source text from between the boundaries,
+ * and recompose it.
+ *
+ * We may need to remove the last few characters from the ReorderingBuffer
+ * to account for source text that was copied or appended
+ * but needs to take part in the recomposition.
+ */
+
+ /*
+ * Find the last composition boundary in [prevBoundary..src[.
+ * It is either the decomposition of the current character (at prevSrc),
+ * or prevBoundary.
+ */
+ if(hasCompBoundaryBefore(c, norm16)) {
+ prevBoundary=prevSrc;
+ } else if(doCompose) {
+ buffer.removeSuffix(prevSrc-prevBoundary);
+ }
- // protected constructor ---------------------------------------------
+ // Find the next composition boundary in [src..limit[ -
+ // modifies src to point to the next starter.
+ src=findNextCompBoundary(s, src, limit);
+
+ // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
+ int recomposeStartIndex=buffer.length();
+ decomposeShort(s, prevBoundary, src, buffer);
+ recompose(buffer, recomposeStartIndex, onlyContiguous);
+ if(!doCompose) {
+ if(!buffer.equals(s, prevBoundary, src)) {
+ return false;
+ }
+ buffer.remove();
+ prevCC=0;
+ }
+
+ // Move to the next starter. We never need to look back before this point again.
+ prevBoundary=src;
+ }
+ return true;
+ }
/**
- * Constructor
- * @exception thrown when data reading fails or data corrupted
- */
- private NormalizerImpl() throws IOException {
- //data should be loaded only once
- if(!isDataLoaded){
-
- // jar access
- InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME);
- BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE);
- NormalizerDataReader reader = new NormalizerDataReader(b);
+ * Very similar to compose(): Make the same changes in both places if relevant.
+ * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
+ * !doSpan: quickCheck
+ * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
+ * bit 0: set if "maybe"; otherwise, if the span length<s.length()
+ * then the quick check result is "no"
+ */
+ public int composeQuickCheck(CharSequence s, int src, int limit,
+ boolean onlyContiguous, boolean doSpan) {
+ int qcResult=0;
+ int minNoMaybeCP=minCompNoMaybeCP;
- // read the indexes
- indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);
-
- byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
-
- int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
- combiningTable = new char[combiningTableTop];
-
- int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
- extraData = new char[extraDataTop];
-
- byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
- byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
+ /*
+ * prevBoundary points to the last character before the current one
+ * that has a composition boundary before it with ccc==0 and quick check "yes".
+ */
+ int prevBoundary=src;
+ int prevSrc;
+ int c=0;
+ int norm16=0;
+ int prevCC=0;
- fcdTrieImpl = new FCDTrieImpl();
- normTrieImpl = new NormTrieImpl();
- auxTrieImpl = new AuxTrieImpl();
-
- // load the rest of the data data and initialize the data members
- reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable);
-
- NormTrieImpl.normTrie = new IntTrie( new ByteArrayInputStream(normBytes),normTrieImpl );
- FCDTrieImpl.fcdTrie = new CharTrie( new ByteArrayInputStream(fcdBytes),fcdTrieImpl );
- AuxTrieImpl.auxTrie = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl );
+ for(;;) {
+ // count code units below the minimum or with irrelevant data for the quick check
+ for(prevSrc=src;;) {
+ if(src==limit) {
+ return (src<<1)|qcResult; // "yes" or "maybe"
+ }
+ if( (c=s.charAt(src))<minNoMaybeCP ||
+ isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
+ ) {
+ ++src;
+ } else if(!UTF16.isSurrogate((char)c)) {
+ break;
+ } else {
+ char c2;
+ if(UTF16Plus.isSurrogateLead(c)) {
+ if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
+ c=Character.toCodePoint((char)c, c2);
+ }
+ } else /* trail surrogate */ {
+ if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
+ --src;
+ c=Character.toCodePoint(c2, (char)c);
+ }
+ }
+ if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
+ src+=Character.charCount(c);
+ } else {
+ break;
+ }
+ }
+ }
+ if(src!=prevSrc) {
+ // Set prevBoundary to the last character in the quick check loop.
+ prevBoundary=src-1;
+ if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
+ Character.isHighSurrogate(s.charAt(prevBoundary-1))
+ ) {
+ --prevBoundary;
+ }
+ prevCC=0;
+ // The start of the current character (c).
+ prevSrc=src;
+ }
- // we reached here without any exceptions so the data is fully
- // loaded set the variable to true
- isDataLoaded = true;
-
- // get the data format version
- byte[] formatVersion = reader.getDataFormatVersion();
-
- isFormatVersion_2_1 =( formatVersion[0]>2
- ||
- (formatVersion[0]==2 && formatVersion[1]>=1)
- );
- isFormatVersion_2_2 =( formatVersion[0]>2
- ||
- (formatVersion[0]==2 && formatVersion[1]>=2)
- );
- unicodeVersion = reader.getUnicodeVersion();
- b.close();
+ src+=Character.charCount(c);
+ /*
+ * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
+ * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
+ * or has ccc!=0.
+ */
+ if(isMaybeOrNonZeroCC(norm16)) {
+ int cc=getCCFromYesOrMaybe(norm16);
+ if( onlyContiguous && // FCC
+ cc!=0 &&
+ prevCC==0 &&
+ prevBoundary<prevSrc &&
+ // prevCC==0 && prevBoundary<prevSrc tell us that
+ // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
+ // passed the quick check "yes && ccc==0" test.
+ // Check whether the last character was a "yesYes" or a "yesNo".
+ // If a "yesNo", then we get its trailing ccc from its
+ // mapping and check for canonical order.
+ // All other cases are ok.
+ getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
+ ) {
+ // Fails FCD test.
+ } else if(prevCC<=cc || cc==0) {
+ prevCC=cc;
+ if(norm16<MIN_YES_YES_WITH_CC) {
+ if(!doSpan) {
+ qcResult=1;
+ } else {
+ return prevBoundary<<1; // spanYes does not care to know it's "maybe"
+ }
+ }
+ continue;
+ }
+ }
+ return prevBoundary<<1; // "no"
}
}
- /* ---------------------------------------------------------------------- */
-
- /* Korean Hangul and Jamo constants */
-
- public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
- public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
- public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
-
- public static final int HANGUL_BASE=0xac00;
-
- public static final int JAMO_L_COUNT=19;
- public static final int JAMO_V_COUNT=21;
- public static final int JAMO_T_COUNT=28;
- public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
-
- private static boolean isHangulWithoutJamoT(char c) {
- c-=HANGUL_BASE;
- return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
- }
-
- /* norm32 helpers */
-
- /* is this a norm32 with a regular index? */
- private static boolean isNorm32Regular(long norm32) {
- return norm32<MIN_SPECIAL;
- }
-
- /* is this a norm32 with a special index for a lead surrogate? */
- private static boolean isNorm32LeadSurrogate(long norm32) {
- return MIN_SPECIAL<=norm32 && norm32<SURROGATES_TOP;
- }
-
- /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
- private static boolean isNorm32HangulOrJamo(long norm32) {
- return norm32>=MIN_HANGUL;
- }
-
- /*
- * Given norm32 for Jamo V or T,
- * is this a Jamo V?
- */
- private static boolean isJamoVTNorm32JamoV(long norm32) {
- return norm32<JAMO_V_TOP;
- }
-
- /* data access primitives ----------------------------------------------- */
-
- public static long/*unsigned*/ getNorm32(char c) {
- return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie.getLeadValue(c)));
- }
-
- public static long/*unsigned*/ getNorm32FromSurrogatePair(long norm32,
- char c2) {
- /*
- * the surrogate index in norm32 stores only the number of the surrogate
- * index block see gennorm/store.c/getFoldedNormValue()
- */
- return ((UNSIGNED_INT_MASK) &
- NormTrieImpl.normTrie.getTrailValue((int)norm32, c2));
- }
- ///CLOVER:OFF
- private static long getNorm32(int c){
- return (UNSIGNED_INT_MASK&(NormTrieImpl.normTrie.getCodePointValue(c)));
- }
-
- /*
- * get a norm32 from text with complete code points
- * (like from decompositions)
- */
- private static long/*unsigned*/ getNorm32(char[] p,int start,
- int/*unsigned*/ mask) {
- long/*unsigned*/ norm32= getNorm32(p[start]);
- if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) {
- /* *p is a lead surrogate, get the real norm32 */
- norm32=getNorm32FromSurrogatePair(norm32, p[start+1]);
+ public void composeAndAppend(CharSequence s,
+ boolean doCompose,
+ boolean onlyContiguous,
+ ReorderingBuffer buffer) {
+ int src=0, limit=s.length();
+ if(!buffer.isEmpty()) {
+ int firstStarterInSrc=findNextCompBoundary(s, 0, limit);
+ if(0!=firstStarterInSrc) {
+ int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
+ buffer.length());
+ StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
+ firstStarterInSrc+16);
+ middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
+ buffer.removeSuffix(buffer.length()-lastStarterInDest);
+ middle.append(s, 0, firstStarterInSrc);
+ compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
+ src=firstStarterInSrc;
+ }
}
- return norm32;
- }
-
- //// for StringPrep
- public static VersionInfo getUnicodeVersion(){
- return VersionInfo.getInstance(unicodeVersion[0], unicodeVersion[1],
- unicodeVersion[2], unicodeVersion[3]);
- }
-
- public static char getFCD16(char c) {
- return FCDTrieImpl.fcdTrie.getLeadValue(c);
- }
-
- public static char getFCD16FromSurrogatePair(char fcd16, char c2) {
- /* the surrogate index in fcd16 is an absolute offset over the
- * start of stage 1
- * */
- return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2);
- }
- public static int getFCD16(int c) {
- return FCDTrieImpl.fcdTrie.getCodePointValue(c);
- }
-
- private static int getExtraDataIndex(long norm32) {
- return (int)(norm32>>EXTRA_SHIFT);
- }
-
- private static final class DecomposeArgs{
- int /*unsigned byte*/ cc;
- int /*unsigned byte*/ trailCC;
- int length;
- }
- /**
- *
- * get the canonical or compatibility decomposition for one character
- *
- * @return index into the extraData array
- */
- private static int/*index*/ decompose(long/*unsigned*/ norm32,
- int/*unsigned*/ qcMask,
- DecomposeArgs args) {
- int p= getExtraDataIndex(norm32);
- args.length=extraData[p++];
-
- if((norm32&qcMask&QC_NFKD)!=0 && args.length>=0x100) {
- /* use compatibility decomposition, skip canonical data */
- p+=((args.length>>7)&1)+(args.length&DECOMP_LENGTH_MASK);
- args.length>>=8;
- }
-
- if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {
- /* get the lead and trail cc's */
- char bothCCs=extraData[p++];
- args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);
- args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;
+ if(doCompose) {
+ compose(s, src, limit, onlyContiguous, true, buffer);
} else {
- /* lead and trail cc's are both 0 */
- args.cc=args.trailCC=0;
- }
-
- args.length&=DECOMP_LENGTH_MASK;
- return p;
- }
-
-
- /**
- * get the canonical decomposition for one character
- * @return index into the extraData array
- */
- private static int decompose(long/*unsigned*/ norm32,
- DecomposeArgs args) {
-
- int p= getExtraDataIndex(norm32);
- args.length=extraData[p++];
-
- if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {
- /* get the lead and trail cc's */
- char bothCCs=extraData[p++];
- args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);
- args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;
- } else {
- /* lead and trail cc's are both 0 */
- args.cc=args.trailCC=0;
- }
-
- args.length&=DECOMP_LENGTH_MASK;
- return p;
- }
-
-
- private static final class NextCCArgs{
- char[] source;
- int next;
- int limit;
- char c;
- char c2;
- }
-
- /*
- * get the combining class of (c, c2)= args.source[args.next++]
- * before: args.next<args.limit after: args.next<=args.limit
- * if only one code unit is used, then c2==0
- */
- private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
- long /*unsigned*/ norm32;
-
- args.c=args.source[args.next++];
-
- norm32= getNorm32(args.c);
- if((norm32 & CC_MASK)==0) {
- args.c2=0;
- return 0;
- } else {
- if(!isNorm32LeadSurrogate(norm32)) {
- args.c2=0;
- } else {
- /* c is a lead surrogate, get the real norm32 */
- if(args.next!=args.limit &&
- UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
- ++args.next;
- norm32=getNorm32FromSurrogatePair(norm32, args.c2);
- } else {
- args.c2=0;
- return 0;
- }
- }
-
- return (int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));
+ buffer.append(s, src, limit);
}
}
- private static final class PrevArgs{
- char[] src;
- int start;
- int current;
- char c;
- char c2;
+ // Dual functionality:
+ // buffer!=null: normalize
+ // buffer==null: isNormalized/quickCheck/spanQuickCheckYes
+ public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
+ // Note: In this function we use buffer.appendZeroCC() because we track
+ // the lead and trail combining classes here, rather than leaving it to
+ // the ReorderingBuffer.
+ // The exception is the call to decomposeShort() which uses the buffer
+ // in the normal way.
+
+ // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
+ // Similar to the prevBoundary in the compose() implementation.
+ int prevBoundary=src;
+ int prevSrc;
+ int c=0;
+ int prevFCD16=0;
+ int fcd16=0;
+
+ for(;;) {
+ // count code units with lccc==0
+ for(prevSrc=src; src!=limit;) {
+ if((c=s.charAt(src))<MIN_CCC_LCCC_CP) {
+ prevFCD16=~c;
+ ++src;
+ } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
+ prevFCD16=0;
+ ++src;
+ } else {
+ if(UTF16.isSurrogate((char)c)) {
+ char c2;
+ if(UTF16Plus.isSurrogateLead(c)) {
+ if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
+ c=Character.toCodePoint((char)c, c2);
+ }
+ } else /* trail surrogate */ {
+ if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
+ --src;
+ c=Character.toCodePoint(c2, (char)c);
+ }
+ }
+ }
+ if((fcd16=getFCD16FromNormData(c))<=0xff) {
+ prevFCD16=fcd16;
+ src+=Character.charCount(c);
+ } else {
+ break;
+ }
+ }
+ }
+ // copy these code units all at once
+ if(src!=prevSrc) {
+ if(src==limit) {
+ if(buffer!=null) {
+ buffer.flushAndAppendZeroCC(s, prevSrc, src);
+ }
+ break;
+ }
+ prevBoundary=src;
+ // We know that the previous character's lccc==0.
+ if(prevFCD16<0) {
+ // Fetching the fcd16 value was deferred for this below-U+0300 code point.
+ int prev=~prevFCD16;
+ prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
+ if(prevFCD16>1) {
+ --prevBoundary;
+ }
+ } else {
+ int p=src-1;
+ if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
+ Character.isHighSurrogate(s.charAt(p-1))
+ ) {
+ --p;
+ // Need to fetch the previous character's FCD value because
+ // prevFCD16 was just for the trail surrogate code point.
+ prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
+ // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
+ }
+ if(prevFCD16>1) {
+ prevBoundary=p;
+ }
+ }
+ if(buffer!=null) {
+ // The last lccc==0 character is excluded from the
+ // flush-and-append call in case it needs to be modified.
+ buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
+ buffer.append(s, prevBoundary, src);
+ }
+ // The start of the current character (c).
+ prevSrc=src;
+ } else if(src==limit) {
+ break;
+ }
+
+ src+=Character.charCount(c);
+ // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
+ // Check for proper order, and decompose locally if necessary.
+ if((prevFCD16&0xff)<=(fcd16>>8)) {
+ // proper order: prev tccc <= current lccc
+ if((fcd16&0xff)<=1) {
+ prevBoundary=src;
+ }
+ if(buffer!=null) {
+ buffer.appendZeroCC(c);
+ }
+ prevFCD16=fcd16;
+ continue;
+ } else if(buffer==null) {
+ return prevBoundary; // quick check "no"
+ } else {
+ /*
+ * Back out the part of the source that we copied or appended
+ * already but is now going to be decomposed.
+ * prevSrc is set to after what was copied/appended.
+ */
+ buffer.removeSuffix(prevSrc-prevBoundary);
+ /*
+ * Find the part of the source that needs to be decomposed,
+ * up to the next safe boundary.
+ */
+ src=findNextFCDBoundary(s, src, limit);
+ /*
+ * The source text does not fulfill the conditions for FCD.
+ * Decompose and reorder a limited piece of the text.
+ */
+ decomposeShort(s, prevBoundary, src, buffer);
+ prevBoundary=src;
+ prevFCD16=0;
+ }
+ }
+ return src;
}
- /*
- * read backwards and get norm32
- * return 0 if the character is <minC
- * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
- * surrogate but read second!)
- */
- private static long /*unsigned*/ getPrevNorm32(PrevArgs args,
- int/*unsigned*/ minC,
- int/*unsigned*/ mask) {
- long/*unsigned*/ norm32;
+ // Note: hasDecompBoundary() could be implemented as aliases to
+ // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
+ // at the cost of building the FCD trie for a decomposition normalizer.
+ public boolean hasDecompBoundary(int c, boolean before) {
+ for(;;) {
+ if(c<minDecompNoCP) {
+ return true;
+ }
+ int norm16=getNorm16(c);
+ if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
+ return true;
+ } else if(norm16>MIN_NORMAL_MAYBE_YES) {
+ return false; // ccc!=0
+ } else if(isDecompNoAlgorithmic(norm16)) {
+ c=mapAlgorithmic(c, norm16);
+ } else {
+ // c decomposes, get everything from the variable-length extra data
+ int firstUnit=extraData.charAt(norm16);
+ if((firstUnit&MAPPING_LENGTH_MASK)==0) {
+ return false;
+ }
+ if(!before) {
+ // decomp after-boundary: same as hasFCDBoundaryAfter(),
+ // fcd16<=1 || trailCC==0
+ if(firstUnit>0x1ff) {
+ return false; // trailCC>1
+ }
+ if(firstUnit<=0xff) {
+ return true; // trailCC==0
+ }
+ // if(trailCC==1) test leadCC==0, same as checking for before-boundary
+ }
+ // true if leadCC==0 (hasFCDBoundaryBefore())
+ return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0;
+ }
+ }
+ }
- args.c=args.src[--args.current];
- args.c2=0;
+ public boolean hasCompBoundaryBefore(int c) {
+ return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
+ }
- /* check for a surrogate before getting norm32 to see if we need to
- * predecrement further
- */
- if(args.c<minC) {
- return 0;
- } else if(!UTF16.isSurrogate(args.c)) {
- return getNorm32(args.c);
- } else if(UTF16.isLeadSurrogate(args.c)) {
- /* unpaired first surrogate */
- return 0;
- } else if(args.current!=args.start &&
- UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
- --args.current;
- norm32=getNorm32(args.c2);
+ private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
+ private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
+ private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
+ private boolean isHangul(int norm16) { return norm16==minYesNo; }
+ private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
+
+ // UBool isCompYes(uint16_t norm16) const {
+ // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
+ // }
+ // UBool isCompYesOrMaybe(uint16_t norm16) const {
+ // return norm16<minNoNo || minMaybeYes<=norm16;
+ // }
+ // private boolean hasZeroCCFromDecompYes(int norm16) {
+ // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
+ // }
+ private boolean isDecompYesAndZeroCC(int norm16) {
+ return norm16<minYesNo ||
+ norm16==JAMO_VT ||
+ (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
+ }
- if((norm32&mask)==0) {
- /* all surrogate pairs with this lead surrogate have
- * only irrelevant data
- */
- return 0;
- } else {
- /* norm32 must be a surrogate special */
- return getNorm32FromSurrogatePair(norm32, args.c);
- }
+ /**
+ * A little faster and simpler than isDecompYesAndZeroCC() but does not include
+ * the MaybeYes which combine-forward and have ccc=0.
+ * (Standard Unicode 5.2 normalization does not have such characters.)
+ */
+ private boolean isMostDecompYesAndZeroCC(int norm16) {
+ return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
+ }
+
+ private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
+
+ // For use with isCompYes().
+ // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
+ // static uint8_t getCCFromYes(uint16_t norm16) {
+ // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
+ // }
+ private int getCCFromNoNo(int norm16) {
+ if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
+ return extraData.charAt(norm16-1)&0xff;
} else {
- /* unpaired second surrogate */
- args.c2=0;
return 0;
}
}
- /*
- * get the combining class of (c, c2)=*--p
- * before: start<p after: start<=p
+ // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
+ int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) {
+ int c;
+ if(cpStart==(cpLimit-1)) {
+ c=s.charAt(cpStart);
+ } else {
+ c=Character.codePointAt(s, cpStart);
+ }
+ int prevNorm16=getNorm16(c);
+ if(prevNorm16<=minYesNo) {
+ return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
+ } else {
+ return extraData.charAt(prevNorm16)>>8; // tccc from yesNo
+ }
+ }
+
+ // Requires algorithmic-NoNo.
+ private int mapAlgorithmic(int c, int norm16) {
+ return c+norm16-(minMaybeYes-MAX_DELTA-1);
+ }
+
+ // Requires minYesNo<norm16<limitNoNo.
+ // private int getMapping(int norm16) { return /*extraData+*/norm16; }
+
+ /**
+ * @return index into maybeYesCompositions, or -1
*/
- private static int /*unsigned byte*/ getPrevCC(PrevArgs args) {
-
- return (int)((UNSIGNED_BYTE_MASK)&(getPrevNorm32(args, MIN_WITH_LEAD_CC,
- CC_MASK)>>CC_SHIFT));
+ private int getCompositionsListForDecompYes(int norm16) {
+ if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
+ return -1;
+ } else {
+ if((norm16-=minMaybeYes)<0) {
+ // norm16<minMaybeYes: index into extraData which is a substring at
+ // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
+ // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
+ norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list
+ }
+ return norm16;
+ }
}
- /*
- * is this a safe boundary character for NF*D?
- * (lead cc==0)
+ /**
+ * @return index into maybeYesCompositions
*/
- public static boolean isNFDSafe(long/*unsigned*/ norm32,
- int/*unsigned*/ccOrQCMask,
- int/*unsigned*/ decompQCMask) {
- if((norm32&ccOrQCMask)==0) {
- return true; /* cc==0 and no decomposition: this is NF*D safe */
+ private int getCompositionsListForComposite(int norm16) {
+ // composite has both mapping & compositions list
+ int firstUnit=extraData.charAt(norm16);
+ return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+ // mapping in maybeYesCompositions
+ 1+ // +1 to skip the first unit with the mapping length
+ (firstUnit&MAPPING_LENGTH_MASK); // + mapping length
+ }
+
+ // Decompose a short piece of text which is likely to contain characters that
+ // fail the quick check loop and/or where the quick check loop's overhead
+ // is unlikely to be amortized.
+ // Called by the compose() and makeFCD() implementations.
+ // Public in Java for collation implementation code.
+ public void decomposeShort(CharSequence s, int src, int limit,
+ ReorderingBuffer buffer) {
+ while(src<limit) {
+ int c=Character.codePointAt(s, src);
+ src+=Character.charCount(c);
+ decompose(c, getNorm16(c), buffer);
}
+ }
- /* inspect its decomposition - maybe a Hangul but not a surrogate here*/
- if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
- DecomposeArgs args=new DecomposeArgs();
- /* decomposes, get everything from the variable-length extra data */
- decompose(norm32, decompQCMask, args);
- return args.cc==0;
- } else {
- /* no decomposition (or Hangul), test the cc directly */
- return (norm32&CC_MASK)==0;
+ private void decompose(int c, int norm16,
+ ReorderingBuffer buffer) {
+ // Only loops for 1:1 algorithmic mappings.
+ for(;;) {
+ // get the decomposition and the lead and trail cc's
+ if(isDecompYes(norm16)) {
+ // c does not decompose
+ buffer.append(c, getCCFromYesOrMaybe(norm16));
+ } else if(isHangul(norm16)) {
+ // Hangul syllable: decompose algorithmically
+ Hangul.decompose(c, buffer);
+ } else if(isDecompNoAlgorithmic(norm16)) {
+ c=mapAlgorithmic(c, norm16);
+ norm16=getNorm16(c);
+ continue;
+ } else {
+ // c decomposes, get everything from the variable-length extra data
+ int firstUnit=extraData.charAt(norm16);
+ int length=firstUnit&MAPPING_LENGTH_MASK;
+ int leadCC, trailCC;
+ trailCC=firstUnit>>8;
+ if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
+ leadCC=extraData.charAt(norm16-1)>>8;
+ } else {
+ leadCC=0;
+ }
+ ++norm16; // skip over the firstUnit
+ buffer.append(extraData, norm16, norm16+length, leadCC, trailCC);
+ }
+ return;
}
}
- /*
- * is this (or does its decomposition begin with) a "true starter"?
- * (cc==0 and NF*C_YES)
+ /**
+ * Finds the recomposition result for
+ * a forward-combining "lead" character,
+ * specified with a pointer to its compositions list,
+ * and a backward-combining "trail" character.
+ *
+ * <p>If the lead and trail characters combine, then this function returns
+ * the following "compositeAndFwd" value:
+ * <pre>
+ * Bits 21..1 composite character
+ * Bit 0 set if the composite is a forward-combining starter
+ * </pre>
+ * otherwise it returns -1.
+ *
+ * <p>The compositions list has (trail, compositeAndFwd) pair entries,
+ * encoded as either pairs or triples of 16-bit units.
+ * The last entry has the high bit of its first unit set.
+ *
+ * <p>The list is sorted by ascending trail characters (there are no duplicates).
+ * A linear search is used.
+ *
+ * <p>See normalizer2impl.h for a more detailed description
+ * of the compositions list format.
*/
- public static boolean isTrueStarter(long/*unsigned*/ norm32,
- int/*unsigned*/ ccOrQCMask,
- int/*unsigned*/ decompQCMask) {
- if((norm32&ccOrQCMask)==0) {
- return true; /* this is a true starter (could be Hangul or Jamo L)*/
- }
-
- /* inspect its decomposition - not a Hangul or a surrogate here */
- if((norm32&decompQCMask)!=0) {
- int p; /* index into extra data array */
- DecomposeArgs args=new DecomposeArgs();
- /* decomposes, get everything from the variable-length extra data */
- p=decompose(norm32, decompQCMask, args);
-
- if(args.cc==0) {
- int/*unsigned*/ qcMask=ccOrQCMask&QC_MASK;
-
- /* does it begin with NFC_YES? */
- if((getNorm32(extraData,p, qcMask)&qcMask)==0) {
- /* yes, the decomposition begins with a true starter */
- return true;
+ private static int combine(String compositions, int list, int trail) {
+ int key1, firstUnit;
+ if(trail<COMP_1_TRAIL_LIMIT) {
+ // trail character is 0..33FF
+ // result entry may have 2 or 3 units
+ key1=(trail<<1);
+ while(key1>(firstUnit=compositions.charAt(list))) {
+ list+=2+(firstUnit&COMP_1_TRIPLE);
+ }
+ if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
+ if((firstUnit&COMP_1_TRIPLE)!=0) {
+ return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
+ } else {
+ return compositions.charAt(list+1);
+ }
+ }
+ } else {
+ // trail character is 3400..10FFFF
+ // result entry has 3 units
+ key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
+ int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
+ int secondUnit;
+ for(;;) {
+ if(key1>(firstUnit=compositions.charAt(list))) {
+ list+=2+(firstUnit&COMP_1_TRIPLE);
+ } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
+ if(key2>(secondUnit=compositions.charAt(list+1))) {
+ if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
+ break;
+ } else {
+ list+=3;
+ }
+ } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
+ return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
+ } else {
+ break;
+ }
+ } else {
+ break;
}
}
}
- return false;
+ return -1;
+ }
+
+ /*
+ * Recomposes the buffer text starting at recomposeStartIndex
+ * (which is in NFD - decomposed and canonically ordered),
+ * and truncates the buffer contents.
+ *
+ * Note that recomposition never lengthens the text:
+ * Any character consists of either one or two code units;
+ * a composition may contain at most one more code unit than the original starter,
+ * while the combining mark that is removed has at least one code unit.
+ */
+    private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
+                           boolean onlyContiguous) {
+        StringBuilder sb=buffer.getStringBuilder();
+        int p=recomposeStartIndex;
+        if(p==sb.length()) {
+            return;
+        }
+
+        int starter, pRemove;
+        int compositionsList;
+        int c, compositeAndFwd;
+        int norm16;
+        int cc, prevCC;
+        boolean starterIsSupplementary;
+
+        // Some of the following variables are not used until we have a forward-combining starter
+        // and are only initialized now to avoid compiler warnings.
+        compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
+        starter=-1;
+        starterIsSupplementary=false;
+        prevCC=0;
+
+        for(;;) {
+            c=sb.codePointAt(p);
+            p+=Character.charCount(c);
+            norm16=getNorm16(c);
+            cc=getCCFromYesOrMaybe(norm16);
+            if( // this character combines backward and
+                isMaybe(norm16) &&
+                // we have seen a starter that combines forward and
+                compositionsList>=0 &&
+                // the backward-combining character is not blocked
+                (prevCC<cc || prevCC==0)) {
+                if(isJamoVT(norm16)) {
+                    // c is a Jamo V/T, see if we can compose it with the previous character.
+                    if(c<Hangul.JAMO_T_BASE) {
+                        // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
+                        char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
+                        if(prev<Hangul.JAMO_L_COUNT) {
+                            pRemove=p-1;
+                            char syllable=(char)
+                                (Hangul.HANGUL_BASE+
+                                 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
+                                 Hangul.JAMO_T_COUNT);
+                            char t;
+                            if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
+                                ++p;
+                                syllable+=t;  // The next character was a Jamo T.
+                            }
+                            sb.setCharAt(starter, syllable);
+                            // remove the Jamo V/T
+                            sb.delete(pRemove, p);
+                            p=pRemove;
+                        }
+                    }
+                    /*
+                     * No "else" for Jamo T:
+                     * Since the input is in NFD, there are no Hangul LV syllables that
+                     * a Jamo T could combine with.
+                     * All Jamo Ts are combined above when handling Jamo Vs.
+                     */
+                    if(p==sb.length()) {
+                        break;
+                    }
+                    compositionsList=-1;
+                    continue;
+                } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
+                    // The starter and the combining mark (c) do combine.
+                    int composite=compositeAndFwd>>1;
+
+                    // Remove the combining mark.
+                    pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark
+                    sb.delete(pRemove, p);
+                    p=pRemove;
+                    // Replace the starter with the composite.
+                    if(starterIsSupplementary) {
+                        if(composite>0xffff) {
+                            // both are supplementary
+                            sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
+                            sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
+                        } else {
+                            sb.setCharAt(starter, (char)composite);  // the composite, not the removed mark c
+                            sb.deleteCharAt(starter+1);
+                            // The composite is shorter than the starter,
+                            // move the intermediate characters forward one.
+                            starterIsSupplementary=false;
+                            --p;
+                        }
+                    } else if(composite>0xffff) {
+                        // The composite is longer than the starter,
+                        // move the intermediate characters back one.
+                        starterIsSupplementary=true;
+                        sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
+                        sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
+                        ++p;
+                    } else {
+                        // both are on the BMP
+                        sb.setCharAt(starter, (char)composite);
+                    }
+
+                    // Keep prevCC because we removed the combining mark.
+
+                    if(p==sb.length()) {
+                        break;
+                    }
+                    // Is the composite a starter that combines forward?
+                    if((compositeAndFwd&1)!=0) {
+                        compositionsList=
+                            getCompositionsListForComposite(getNorm16(composite));
+                    } else {
+                        compositionsList=-1;
+                    }
+
+                    // We combined; continue with looking for compositions.
+                    continue;
+                }
+            }
+
+            // no combination this time
+            prevCC=cc;
+            if(p==sb.length()) {
+                break;
+            }
+
+            // If c did not combine, then check if it is a starter.
+            if(cc==0) {
+                // Found a new starter.
+                if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
+                    // It may combine with something, prepare for it.
+                    if(c<=0xffff) {
+                        starterIsSupplementary=false;
+                        starter=p-1;
+                    } else {
+                        starterIsSupplementary=true;
+                        starter=p-2;
+                    }
+                }
+            } else if(onlyContiguous) {
+                // FCC: no discontiguous compositions; any intervening character blocks.
+                compositionsList=-1;
+            }
+        }
+        buffer.flush();
+    }
+
+    /**
+     * Tells whether there is a composition boundary before c.
+     * That is the case when the decomposition of c begins with a character
+     * that has ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
+     * Shortcut: when c itself has ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()),
+     * the answer is true and c need not be decomposed.
+     */
+    private boolean hasCompBoundaryBefore(int c, int norm16) {
+        while(!isCompYesAndZeroCC(norm16)) {
+            if(isMaybeOrNonZeroCC(norm16)) {
+                return false;
+            }
+            if(isDecompNoAlgorithmic(norm16)) {
+                c=mapAlgorithmic(c, norm16);
+                norm16=getNorm16(c);
+                continue;  // re-test the algorithmically mapped character
+            }
+            // c decomposes, get everything from the variable-length extra data
+            int header=extraData.charAt(norm16);
+            if((header&MAPPING_LENGTH_MASK)==0 ||  // mapping length field is 0
+                    ((header&MAPPING_HAS_CCC_LCCC_WORD)!=0 &&
+                     (extraData.charAt(norm16-1)&0xff00)!=0)) {  // non-zero leadCC
+                return false;
+            }
+            int first=Character.codePointAt(extraData, norm16+1);
+            return isCompYesAndZeroCC(getNorm16(first));
+        }
+        return true;
+    }
+
+    private int findPreviousCompBoundary(CharSequence s, int p) {
+        // Step backward by code points until one has a composition boundary before it.
+        // We could also test hasCompBoundaryAfter() and return the limit of that
+        // code point, but that's probably not worth the extra cost.
+        int pos=p;
+        while(pos>0) {
+            int c=Character.codePointBefore(s, pos);
+            pos-=Character.charCount(c);
+            if(hasCompBoundaryBefore(c)) { break; }
+        }
+        return pos;
+    }
+
+    private int findNextCompBoundary(CharSequence s, int p, int limit) {
+        // Step forward by code points; the result is the index of the first one that
+        // has a composition boundary before it, or where the scan reached limit.
+        int pos=p;
+        for(int c; pos<limit; pos+=Character.charCount(c)) {
+            c=Character.codePointAt(s, pos);
+            int norm16=normTrie.get(c);
+            if(hasCompBoundaryBefore(c, norm16)) { break; }
+        }
+        return pos;
     }
- /* reorder UTF-16 in-place ---------------------------------------------- */
+    private int findNextFCDBoundary(CharSequence s, int p, int limit) {
+        // Step forward by code points; stop at the first one that is below
+        // MIN_CCC_LCCC_CP or whose FCD16 value does not exceed 0xff.
+        int pos=p;
+        for(int c; pos<limit; pos+=Character.charCount(c)) {
+            c=Character.codePointAt(s, pos);
+            if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) { break; }
+        }
+        return pos;
+    }
+
+    /**
+     * Collects, in ascending code point order, the code points that have a canonical
+     * decomposition together with those decompositions (for ComposedCharIter).
+     *
+     * @param chars   receives the decomposable code points
+     * @param decomps receives, at the same index, the decomposition of chars[i]
+     * @return the number of entries stored; never more than the shorter array holds
+     */
+    public static int getDecompose(int chars[], String decomps[]) {
+        Normalizer2 impl = Normalizer2.getNFDInstance();
+        int limit = Math.min(chars.length, decomps.length);
+        int ch = -1;
+        int i = 0;
+        // The scan ends below 0x2fa1e and stops early once the output arrays are full.
+        while (i < limit && ++ch < 0x2fa1e) {
+            // TBD: the range skips below save about 50ms at startup;
+            // a better solution/lookup is needed.
+            if (ch == 0x30ff)
+                ch = 0xf900;
+            else if (ch == 0x115bc)
+                ch = 0x1d15e;
+            else if (ch == 0x1d1c1)
+                ch = 0x2f800;
+            String s = impl.getDecomposition(ch);
+            if (s != null) {
+                chars[i] = ch;
+                decomps[i++] = s;
+            }
+        }
+        return i;
+    }
+
+ //------------------------------------------------------
+ // special method for Collation (RBTableBuilder.build())
+ //------------------------------------------------------
+    private static boolean needSingleQuotation(char c) {
+        if (c < 0x0009 || c > 0x007E) {
+            return false;  // outside every quoted range
+        }
+        return c <= 0x000D || (c >= 0x0020 && c <= 0x002F) || (c >= 0x003A && c <= 0x0040)
+            || (c >= 0x005B && c <= 0x0060) || c >= 0x007B;
+    }
+
+    /**
+     * Returns the canonical decomposition of string. Code units below U+00C0 and Hangul
+     * syllables are copied unchanged; when the text produced for a character starts with
+     * a needSingleQuotation() character, that character is wrapped in single quotes.
+     */
+    public static String canonicalDecomposeWithSingleQuotation(String string) {
+        Normalizer2 impl = Normalizer2.getNFDInstance();
+        char[] src = string.toCharArray();
+        int srcIndex = 0;
+        int srcLimit = src.length;
+        char[] dest = new char[src.length * 3];  //MAX_BUF_SIZE_DECOMPOSE = 3
+        int destIndex = 0;
+        int destLimit = dest.length;
+        int prevSrc;
+        String norm;
+        int reorderStartIndex, length;
+        char c1, c2;
+        int cp;
+        int minNoMaybe = 0x00c0;
+        int cc, prevCC, trailCC;
+        char[] p;
+        int pStart;
+        // initialize
+        reorderStartIndex = 0;
+        prevCC = 0;
+        norm = null;
+        cp = 0;
+        pStart = 0;
+        cc = trailCC = -1; // initialize to bogus value
+        c1 = 0;
+        for (;;) {
+            prevSrc=srcIndex;
+            // quick check: (1) below minNoMaybe (cp is NOT refreshed then) (2) no decomp (3) Hangul
+            while (srcIndex != srcLimit &&
+                   ((c1 = src[srcIndex]) < minNoMaybe ||
+                    (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null ||
+                    (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables
+                prevCC = 0;
+                srcIndex += (c1 < minNoMaybe || cp < 0x10000) ? 1 : 2;
+            }
+
+            // copy these code units all at once
+            if (srcIndex != prevSrc) {
+                length = srcIndex - prevSrc;
+                if ((destIndex + length) > destLimit) { // grow rather than skip the copy
+                    destLimit = (destIndex + length) * 2;
+                    dest = java.util.Arrays.copyOf(dest, destLimit);
+                }
+                System.arraycopy(src,prevSrc,dest,destIndex,length);
+                destIndex += length;
+                reorderStartIndex = destIndex;
+            }
+
+            // end of source reached?
+            if (srcIndex == srcLimit) {
+                break;
+            }
+
+            // cp already contains *src and norm is set for it, increment src
+            srcIndex += (cp < 0x10000) ? 1 : 2;
+
+            if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+                c2 = 0;
+                length = 1;
+
+                if (Character.isHighSurrogate(c1)
+                    || Character.isLowSurrogate(c1)) {
+                    norm = null;
+                }
+            } else {
+                length = 2;
+                c2 = src[srcIndex-1];
+            }
+
+            // get the decomposition and the lead and trail cc's
+            if (norm == null) {
+                // cp does not decompose
+                cc = trailCC = UCharacter.getCombiningClass(cp);
+                p = null;
+                pStart = -1;
+            } else {
+                pStart = 0;
+                p = norm.toCharArray();
+                length = p.length;
+                cc= UCharacter.getCombiningClass(norm.codePointAt(0));
+                // the last code point may be a surrogate pair, so locate it from the end
+                trailCC= UCharacter.getCombiningClass(norm.codePointBefore(length));
+                if (length == 1) {
+                    // fastpath a single code unit from decomposition
+                    c1 = p[pStart];
+                    c2 = 0;
+                    p = null;
+                    pStart = -1;
+                }
+            }
+
+            if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations
+                // buffer overflow
+                char[] tmpBuf = new char[destLimit * 2];
+                System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
+                dest = tmpBuf;
+                destLimit = dest.length;
+            }
+            // append the decomposition to the destination buffer, assume length>0
+            {
+                int reorderSplit = destIndex;
+                if (p == null) {
+                    // fastpath: single code point
+                    if (needSingleQuotation(c1)) {
+                        //if we need single quotation, no need to consider "prevCC"
+                        //and it must NOT be a supplementary pair
+                        dest[destIndex++] = '\'';
+                        dest[destIndex++] = c1;
+                        dest[destIndex++] = '\'';
+                        trailCC = 0;
+                    } else if(cc != 0 && cc < prevCC) {
+                        // (c1, c2) is out of order with respect to the preceding
+                        // text
+                        destIndex += length;
+                        trailCC = insertOrdered(dest, reorderStartIndex,
+                                                reorderSplit, destIndex, c1, c2, cc);
+                    } else {
+                        // just append (c1, c2)
+                        dest[destIndex++] = c1;
+                        if(c2 != 0) {
+                            dest[destIndex++] = c2;
+                        }
+                    }
+                } else {
+                    // general: multiple code points (ordered by themselves)
+                    // from decomposition
+                    if (needSingleQuotation(p[pStart])) {
+                        dest[destIndex++] = '\'';
+                        dest[destIndex++] = p[pStart++];
+                        dest[destIndex++] = '\'';
+                        length--;
+                        do {
+                            dest[destIndex++] = p[pStart++];
+                        } while(--length > 0);
+                    } else if (cc != 0 && cc < prevCC) {
+                        destIndex += length;
+                        trailCC = mergeOrdered(dest, reorderStartIndex,
+                                               reorderSplit, p, pStart,
+                                               pStart+length);
+                    } else {
+                        // just append the decomposition
+                        do {
+                            dest[destIndex++] = p[pStart++];
+                        } while (--length > 0);
+                    }
+                }
+            }
+            prevCC = trailCC;
+            if(prevCC == 0) {
+                reorderStartIndex = destIndex;
+            }
+        }
+
+        return new String(dest, 0, destIndex);
+    }
/**
* simpler, single-character version of mergeOrdered() -
@@ -649,19 +1933,23 @@
private static int/*unsigned byte*/ insertOrdered(char[] source,
int start,
int current, int p,
- char c, char c2,
- int/*unsigned byte*/ cc) {
+ char c1, char c2,
+ int/*unsigned byte*/ cc) {
int back, preBack;
int r;
int prevCC, trailCC=cc;
- if(start<current && cc!=0) {
+ if (start<current && cc!=0) {
// search for the insertion point where cc>=prevCC
preBack=back=current;
+
PrevArgs prevArgs = new PrevArgs();
prevArgs.current = current;
prevArgs.start = start;
prevArgs.src = source;
+ prevArgs.c1 = c1;
+ prevArgs.c2 = c2;
+
// get the prevCC
prevCC=getPrevCC(prevArgs);
preBack = prevArgs.current;
@@ -679,7 +1967,6 @@
back=preBack;
}
-
// this is where we are right now with all these indicies:
// [start]..[pPreBack] 0..? code points that we can ignore
// [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
@@ -690,14 +1977,14 @@
r=p;
do {
source[--r]=source[--current];
- } while(back!=current);
+ } while (back!=current);
}
}
- // insert (c, c2)
- source[current]=c;
- if(c2!=0) {
- source[(current+1)]=c2;
+ // insert (c1, c2)
+ source[current] = c1;
+ if (c2!=0) {
+ source[(current+1)] = c2;
}
// we know the cc of the last code point
@@ -732,8 +2019,7 @@
int current,
char[] data,
int next,
- int limit,
- boolean isOrdered) {
+ int limit) {
int r;
int /*unsigned byte*/ cc, trailCC=0;
boolean adjacent;
@@ -744,7 +2030,7 @@
ncArgs.next = next;
ncArgs.limit = limit;
- if(start!=current || !isOrdered) {
+ if(start!=current) {
while(ncArgs.next<ncArgs.limit) {
cc=getNextCC(ncArgs);
@@ -754,20 +2040,16 @@
if(adjacent) {
current=ncArgs.next;
} else {
- data[current++]=ncArgs.c;
+ data[current++]=ncArgs.c1;
if(ncArgs.c2!=0) {
data[current++]=ncArgs.c2;
}
}
- if(isOrdered) {
- break;
- } else {
- start=current;
- }
+ break;
} else {
r=current+(ncArgs.c2==0 ? 1 : 2);
trailCC=insertOrdered(source,start, current, r,
- ncArgs.c, ncArgs.c2, cc);
+ ncArgs.c1, ncArgs.c2, cc);
current=r;
}
}
@@ -792,1945 +2074,82 @@
}
}
- private static int /*unsigned byte*/ mergeOrdered(char[] source,
- int start,
- int current,
- char[] data,
- final int next,
- final int limit) {
- return mergeOrdered(source,start,current,data,next,limit,true);
+
+ private static final class PrevArgs{ // in/out state for getPrevCC(), which reads backward
+ char[] src; // UTF-16 text that getPrevCC() reads from
+ int start; // lower bound: no lead surrogate is looked for once current equals start
+ int current; // index just past the next code unit to read; getPrevCC() decrements it
+ char c1; // code unit read by getPrevCC() (the trail surrogate when a pair was read)
+ char c2; // lead surrogate when getPrevCC() read a pair, otherwise 0
}
- public static NormalizerBase.QuickCheckResult quickCheck(char[] src,
- int srcStart,
- int srcLimit,
- int minNoMaybe,
- int qcMask,
- int options,
- boolean allowMaybe,
- UnicodeSet nx){
-
- int ccOrQCMask;
- long norm32;
- char c, c2;
- char cc, prevCC;
- long qcNorm32;
- NormalizerBase.QuickCheckResult result;
- ComposePartArgs args = new ComposePartArgs();
- char[] buffer ;
- int start = srcStart;
-
- if(!isDataLoaded) {
- return NormalizerBase.MAYBE;
- }
- // initialize
- ccOrQCMask=CC_MASK|qcMask;
- result=NormalizerBase.YES;
- prevCC=0;
-
- for(;;) {
- for(;;) {
- if(srcStart==srcLimit) {
- return result;
- } else if((c=src[srcStart++])>=minNoMaybe &&
- (( norm32=getNorm32(c)) & ccOrQCMask)!=0) {
- break;
- }
- prevCC=0;
- }
-
-
- // check one above-minimum, relevant code unit
- if(isNorm32LeadSurrogate(norm32)) {
- // c is a lead surrogate, get the real norm32
- if(srcStart!=srcLimit&& UTF16.isTrailSurrogate(c2=src[srcStart])) {
- ++srcStart;
- norm32=getNorm32FromSurrogatePair(norm32,c2);
- } else {
- norm32=0;
- c2=0;
- }
- }else{
- c2=0;
- }
- if(nx_contains(nx, c, c2)) {
- /* excluded: norm32==0 */
- norm32=0;
- }
-
- // check the combining order
- cc=(char)((norm32>>CC_SHIFT)&0xFF);
- if(cc!=0 && cc<prevCC) {
- return NormalizerBase.NO;
- }
- prevCC=cc;
-
- // check for "no" or "maybe" quick check flags
- qcNorm32 = norm32 & qcMask;
- if((qcNorm32& QC_ANY_NO)>=1) {
- result= NormalizerBase.NO;
- break;
- } else if(qcNorm32!=0) {
- // "maybe" can only occur for NFC and NFKC
- if(allowMaybe){
- result=NormalizerBase.MAYBE;
- }else{
- // normalize a section around here to see if it is really
- // normalized or not
- int prevStarter;
- int/*unsigned*/ decompQCMask;
-
- decompQCMask=(qcMask<<2)&0xf; // decomposition quick check mask
-
- // find the previous starter
-
- // set prevStarter to the beginning of the current character
- prevStarter=srcStart-1;
- if(UTF16.isTrailSurrogate(src[prevStarter])) {
- // safe because unpaired surrogates do not result
- // in "maybe"
- --prevStarter;
- }
- prevStarter=findPreviousStarter(src, start, prevStarter,
- ccOrQCMask, decompQCMask,
- (char)minNoMaybe);
-
- // find the next true starter in [src..limit[ - modifies
- // src to point to the next starter
- srcStart=findNextStarter(src,srcStart, srcLimit, qcMask,
- decompQCMask,(char) minNoMaybe);
-
- //set the args for compose part
- args.prevCC = prevCC;
-
- // decompose and recompose [prevStarter..src[
- buffer = composePart(args,prevStarter,src,srcStart,srcLimit,options,nx);
-
- // compare the normalized version with the original
- if(0!=strCompare(buffer,0,args.length,src,prevStarter,srcStart, false)) {
- result=NormalizerBase.NO; // normalization differs
- break;
- }
-
- // continue after the next starter
- }
- }
- }
- return result;
+ private static final class NextCCArgs{ // in/out state for getNextCC(), which reads forward
+ char[] source; // UTF-16 text that getNextCC() reads from
+ int next; // index of the next code unit to read; getNextCC() advances it
+ int limit; // exclusive end: no trail surrogate is looked for once next equals limit
+ char c1; // code unit read by getNextCC() (the lead surrogate when a pair was read)
+ char c2; // trail surrogate when getNextCC() read a pair, otherwise 0
}
-
- //------------------------------------------------------
- // make NFD & NFKD
- //------------------------------------------------------
-
- public static int decompose(char[] src,int srcStart,int srcLimit,
- char[] dest,int destStart,int destLimit,
- boolean compat,int[] outTrailCC,
- UnicodeSet nx) {
-
- char[] buffer = new char[3];
- int prevSrc;
- long norm32;
- int ccOrQCMask, qcMask;
- int reorderStartIndex, length;
- char c, c2, minNoMaybe;
- int/*unsigned byte*/ cc, prevCC, trailCC;
- char[] p;
- int pStart;
- int destIndex = destStart;
- int srcIndex = srcStart;
- if(!compat) {
- minNoMaybe=(char)indexes[INDEX_MIN_NFD_NO_MAYBE];
- qcMask=QC_NFD;
- } else {
- minNoMaybe=(char)indexes[INDEX_MIN_NFKD_NO_MAYBE];
- qcMask=QC_NFKD;
- }
-
- /* initialize */
- ccOrQCMask=CC_MASK|qcMask;
- reorderStartIndex=0;
- prevCC=0;
- norm32=0;
- c=0;
- pStart=0;
-
- cc=trailCC=-1;//initialize to bogus value
-
- for(;;) {
- /* count code units below the minimum or with irrelevant data for
- * the quick check
- */
- prevSrc=srcIndex;
-
- while(srcIndex!=srcLimit &&((c=src[srcIndex])<minNoMaybe ||
- ((norm32=getNorm32(c))&ccOrQCMask)==0)){
- prevCC=0;
- ++srcIndex;
- }
-
- /* copy these code units all at once */
- if(srcIndex!=prevSrc) {
- length=srcIndex-prevSrc;
- if((destIndex+length)<=destLimit) {
- System.arraycopy(src,prevSrc,dest,destIndex,length);
- }
-
- destIndex+=length;
- reorderStartIndex=destIndex;
- }
-
- /* end of source reached? */
- if(srcIndex==srcLimit) {
- break;
- }
-
- /* c already contains *src and norm32 is set for it, increment src*/
- ++srcIndex;
-
- /* check one above-minimum, relevant code unit */
- /*
- * generally, set p and length to the decomposition string
- * in simple cases, p==NULL and (c, c2) will hold the length code
- * units to append in all cases, set cc to the lead and trailCC to
- * the trail combining class
- *
- * the following merge-sort of the current character into the
- * preceding, canonically ordered result text will use the
- * optimized insertOrdered()
- * if there is only one single code point to process;
- * this is indicated with p==NULL, and (c, c2) is the character to
- * insert
- * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
- * for a supplementary character)
- * otherwise, p[length] is merged in with _mergeOrdered()
- */
- if(isNorm32HangulOrJamo(norm32)) {
- if(nx_contains(nx, c)) {
- c2=0;
- p=null;
- length=1;
- } else {
- // Hangul syllable: decompose algorithmically
- p=buffer;
- pStart=0;
- cc=trailCC=0;
-
- c-=HANGUL_BASE;
-
- c2=(char)(c%JAMO_T_COUNT);
- c/=JAMO_T_COUNT;
- if(c2>0) {
- buffer[2]=(char)(JAMO_T_BASE+c2);
- length=3;
- } else {
- length=2;
- }
-
- buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT);
- buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT);
- }
- } else {
- if(isNorm32Regular(norm32)) {
- c2=0;
- length=1;
- } else {
- // c is a lead surrogate, get the real norm32
- if(srcIndex!=srcLimit &&
- UTF16.isTrailSurrogate(c2=src[srcIndex])) {
- ++srcIndex;
- length=2;
- norm32=getNorm32FromSurrogatePair(norm32, c2);
- } else {
- c2=0;
- length=1;
- norm32=0;
- }
- }
-
- /* get the decomposition and the lead and trail cc's */
- if(nx_contains(nx, c, c2)) {
- /* excluded: norm32==0 */
- cc=trailCC=0;
- p=null;
- } else if((norm32&qcMask)==0) {
- /* c does not decompose */
- cc=trailCC=(int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));
- p=null;
- pStart=-1;
- } else {
- DecomposeArgs arg = new DecomposeArgs();
- /* c decomposes, get everything from the variable-length
- * extra data
- */
- pStart=decompose(norm32, qcMask, arg);
- p=extraData;
- length=arg.length;
- cc=arg.cc;
- trailCC=arg.trailCC;
- if(length==1) {
- /* fastpath a single code unit from decomposition */
- c=p[pStart];
- c2=0;
- p=null;
- pStart=-1;
- }
- }
- }
+ private static int /*unsigned*/ getPrevCC(PrevArgs args) {
+ args.c1=args.src[--args.current];
+ args.c2=0;
- /* append the decomposition to the destination buffer, assume
- * length>0
- */
- if((destIndex+length)<=destLimit) {
- int reorderSplit=destIndex;
- if(p==null) {
- /* fastpath: single code point */
- if(cc!=0 && cc<prevCC) {
- /* (c, c2) is out of order with respect to the preceding
- * text
- */
- destIndex+=length;
- trailCC=insertOrdered(dest,reorderStartIndex,
- reorderSplit, destIndex, c, c2, cc);
- } else {
- /* just append (c, c2) */
- dest[destIndex++]=c;
- if(c2!=0) {
- dest[destIndex++]=c2;
- }
- }
- } else {
- /* general: multiple code points (ordered by themselves)
- * from decomposition
- */
- if(cc!=0 && cc<prevCC) {
- /* the decomposition is out of order with respect to the
- * preceding text
- */
- destIndex+=length;
- trailCC=mergeOrdered(dest,reorderStartIndex,
- reorderSplit,p, pStart,pStart+length);
- } else {
- /* just append the decomposition */
- do {
- dest[destIndex++]=p[pStart++];
- } while(--length>0);
- }
- }
- } else {
- /* buffer overflow */
- /* keep incrementing the destIndex for preflighting */
- destIndex+=length;
- }
-
- prevCC=trailCC;
- if(prevCC==0) {
- reorderStartIndex=destIndex;
- }
- }
-
- outTrailCC[0]=prevCC;
-
- return destIndex - destStart;
- }
-
- /* make NFC & NFKC ------------------------------------------------------ */
- private static final class NextCombiningArgs{
- char[] source;
- int start;
- //int limit;
- char c;
- char c2;
- int/*unsigned*/ combiningIndex;
- char /*unsigned byte*/ cc;
- }
-
- /* get the composition properties of the next character */
- private static int /*unsigned*/ getNextCombining(NextCombiningArgs args,
- int limit,
- UnicodeSet nx) {
- long/*unsigned*/ norm32;
- int combineFlags;
- /* get properties */
- args.c=args.source[args.start++];
- norm32=getNorm32(args.c);
-
- /* preset output values for most characters */
- args.c2=0;
- args.combiningIndex=0;
- args.cc=0;
-
- if((norm32&(CC_MASK|COMBINES_ANY))==0) {
+ if (args.c1 < MIN_CCC_LCCC_CP) {
+ return 0;
+ } else if (UTF16.isLeadSurrogate(args.c1)) {
+ /* unpaired first surrogate */
return 0;
+ } else if (!UTF16.isTrailSurrogate(args.c1)) {
+ return UCharacter.getCombiningClass(args.c1);
+ } else if (args.current!=args.start &&
+ UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
+ --args.current;
+ return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1));
} else {
- if(isNorm32Regular(norm32)) {
- /* set cc etc. below */
- } else if(isNorm32HangulOrJamo(norm32)) {
- /* a compatibility decomposition contained Jamos */
- args.combiningIndex=(int)((UNSIGNED_INT_MASK)&(0xfff0|
- (norm32>>EXTRA_SHIFT)));
- return (int)(norm32&COMBINES_ANY);
- } else {
- /* c is a lead surrogate, get the real norm32 */
- if(args.start!=limit && UTF16.isTrailSurrogate(args.c2=
- args.source[args.start])) {
- ++args.start;
- norm32=getNorm32FromSurrogatePair(norm32, args.c2);
- } else {
- args.c2=0;
- return 0;
- }
- }
-
- if(nx_contains(nx, args.c, args.c2)) {
- return 0; /* excluded: norm32==0 */
- }
-
- args.cc= (char)((norm32>>CC_SHIFT)&0xff);
-
- combineFlags=(int)(norm32&COMBINES_ANY);
- if(combineFlags!=0) {
- int index = getExtraDataIndex(norm32);
- args.combiningIndex=index>0 ? extraData[(index-1)] :0;
- }
-
- return combineFlags;
- }
- }
-
- /*
- * given a composition-result starter (c, c2) - which means its cc==0,
- * it combines forward, it has extra data, its norm32!=0,
- * it is not a Hangul or Jamo,
- * get just its combineFwdIndex
- *
- * norm32(c) is special if and only if c2!=0
- */
- private static int/*unsigned*/ getCombiningIndexFromStarter(char c,char c2){
- long/*unsigned*/ norm32;
-
- norm32=getNorm32(c);
- if(c2!=0) {
- norm32=getNorm32FromSurrogatePair(norm32, c2);
- }
- return extraData[(getExtraDataIndex(norm32)-1)];
- }
-
- /*
- * Find the recomposition result for
- * a forward-combining character
- * (specified with a pointer to its part of the combiningTable[])
- * and a backward-combining character
- * (specified with its combineBackIndex).
- *
- * If these two characters combine, then set (value, value2)
- * with the code unit(s) of the composition character.
- *
- * Return value:
- * 0 do not combine
- * 1 combine
- * >1 combine, and the composition is a forward-combining starter
- *
- * See unormimp.h for a description of the composition table format.
- */
- private static int/*unsigned*/ combine(char[]table,int tableStart,
- int/*unsinged*/ combineBackIndex,
- int[] outValues) {
- int/*unsigned*/ key;
- int value,value2;
-
- if(outValues.length<2){
- throw new IllegalArgumentException();
- }
-
- /* search in the starter's composition table */
- for(;;) {
- key=table[tableStart++];
- if(key>=combineBackIndex) {
- break;
- }
- tableStart+= ((table[tableStart]&0x8000) != 0)? 2 : 1;
- }
-
- /* mask off bit 15, the last-entry-in-the-list flag */
- if((key&0x7fff)==combineBackIndex) {
- /* found! combine! */
- value=table[tableStart];
-
- /* is the composition a starter that combines forward? */
- key=(int)((UNSIGNED_INT_MASK)&((value&0x2000)+1));
-
- /* get the composition result code point from the variable-length
- * result value
- */
- if((value&0x8000) != 0) {
- if((value&0x4000) != 0) {
- /* surrogate pair composition result */
- value=(int)((UNSIGNED_INT_MASK)&((value&0x3ff)|0xd800));
- value2=table[tableStart+1];
- } else {
- /* BMP composition result U+2000..U+ffff */
- value=table[tableStart+1];
- value2=0;
- }
- } else {
- /* BMP composition result U+0000..U+1fff */
- value&=0x1fff;
- value2=0;
- }
- outValues[0]=value;
- outValues[1]=value2;
- return key;
- } else {
- /* not found */
+ /* unpaired second surrogate */
+ args.c2=0;
return 0;
}
}
-
- private static final class RecomposeArgs{
- char[] source;
- int start;
- int limit;
- }
- /*
- * recompose the characters in [p..limit[
- * (which is in NFD - decomposed and canonically ordered),
- * adjust limit, and return the trailing cc
- *
- * since for NFKC we may get Jamos in decompositions, we need to
- * recompose those too
- *
- * note that recomposition never lengthens the text:
- * any character consists of either one or two code units;
- * a composition may contain at most one more code unit than the original
- * starter, while the combining mark that is removed has at least one code
- * unit
- */
- private static char/*unsigned byte*/ recompose(RecomposeArgs args, int options, UnicodeSet nx) {
- int remove, q, r;
- int /*unsigned*/ combineFlags;
- int /*unsigned*/ combineFwdIndex, combineBackIndex;
- int /*unsigned*/ result, value=0, value2=0;
- int /*unsigned byte*/ prevCC;
- boolean starterIsSupplementary;
- int starter;
- int[] outValues = new int[2];
- starter=-1; /* no starter */
- combineFwdIndex=0; /* will not be used until starter!=NULL */
- starterIsSupplementary=false; /* will not be used until starter!=NULL */
- prevCC=0;
-
- NextCombiningArgs ncArg = new NextCombiningArgs();
- ncArg.source = args.source;
-
- ncArg.cc =0;
- ncArg.c2 =0;
-
- for(;;) {
- ncArg.start = args.start;
- combineFlags=getNextCombining(ncArg,args.limit,nx);
- combineBackIndex=ncArg.combiningIndex;
- args.start = ncArg.start;
-
- if(((combineFlags&COMBINES_BACK)!=0) && starter!=-1) {
- if((combineBackIndex&0x8000)!=0) {
- /* c is a Jamo V/T, see if we can compose it with the
- * previous character
- */
- /* for the PRI #29 fix, check that there is no intervening combining mark */
- if((options&BEFORE_PRI_29)!=0 || prevCC==0) {
- remove=-1; /* NULL while no Hangul composition */
- combineFlags=0;
- ncArg.c2=args.source[starter];
- if(combineBackIndex==0xfff2) {
- /* Jamo V, compose with previous Jamo L and following
- * Jamo T
- */
- ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE);
- if(ncArg.c2<JAMO_L_COUNT) {
- remove=args.start-1;
- ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+
- (ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT);
- if(args.start!=args.limit &&
- (ncArg.c2=(char)(args.source[args.start]
- -JAMO_T_BASE))<JAMO_T_COUNT) {
- ++args.start;
- ncArg.c+=ncArg.c2;
- } else {
- /* the result is an LV syllable, which is a starter (unlike LVT) */
- combineFlags=COMBINES_FWD;
- }
- if(!nx_contains(nx, ncArg.c)) {
- args.source[starter]=ncArg.c;
- } else {
- /* excluded */
- if(!isHangulWithoutJamoT(ncArg.c)) {
- --args.start; /* undo the ++args.start from reading the Jamo T */
- }
- /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
- remove=args.start;
- }
- }
-
- /*
- * Normally, the following can not occur:
- * Since the input is in NFD, there are no Hangul LV syllables that
- * a Jamo T could combine with.
- * All Jamo Ts are combined above when handling Jamo Vs.
- *
- * However, before the PRI #29 fix, this can occur due to
- * an intervening combining mark between the Hangul LV and the Jamo T.
- */
- } else {
- /* Jamo T, compose with previous Hangul that does not have a Jamo T */
- if(isHangulWithoutJamoT(ncArg.c2)) {
- ncArg.c2+=ncArg.c-JAMO_T_BASE;
- if(!nx_contains(nx, ncArg.c2)) {
- remove=args.start-1;
- args.source[starter]=ncArg.c2;
- }
- }
- }
-
- if(remove!=-1) {
- /* remove the Jamo(s) */
- q=remove;
- r=args.start;
- while(r<args.limit) {
- args.source[q++]=args.source[r++];
- }
- args.start=remove;
- args.limit=q;
- }
-
- ncArg.c2=0; /* c2 held *starter temporarily */
-
- if(combineFlags!=0) {
- /*
- * not starter=NULL because the composition is a Hangul LV syllable
- * and might combine once more (but only before the PRI #29 fix)
- */
+ private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
+ args.c1=args.source[args.next++];
+ args.c2=0;
- /* done? */
- if(args.start==args.limit) {
- return (char)prevCC;
- }
-
- /* the composition is a Hangul LV syllable which is a starter that combines forward */
- combineFwdIndex=0xfff0;
-
- /* we combined; continue with looking for compositions */
- continue;
- }
- }
-
- /*
- * now: cc==0 and the combining index does not include
- * "forward" -> the rest of the loop body will reset starter
- * to NULL; technically, a composed Hangul syllable is a
- * starter, but it does not combine forward now that we have
- * consumed all eligible Jamos; for Jamo V/T, combineFlags
- * does not contain _NORM_COMBINES_FWD
- */
-
- } else if(
- /* the starter is not a Hangul LV or Jamo V/T and */
- !((combineFwdIndex&0x8000)!=0) &&
- /* the combining mark is not blocked and */
- ((options&BEFORE_PRI_29)!=0 ?
- (prevCC!=ncArg.cc || prevCC==0) :
- (prevCC<ncArg.cc || prevCC==0)) &&
- /* the starter and the combining mark (c, c2) do combine */
- 0!=(result=combine(combiningTable,combineFwdIndex,
- combineBackIndex, outValues)) &&
- /* the composition result is not excluded */
- !nx_contains(nx, (char)value, (char)value2)
- ) {
- value=outValues[0];
- value2=outValues[1];
- /* replace the starter with the composition, remove the
- * combining mark
- */
- remove= ncArg.c2==0 ? args.start-1 : args.start-2; /* index to the combining mark */
-
- /* replace the starter with the composition */
- args.source[starter]=(char)value;
- if(starterIsSupplementary) {
- if(value2!=0) {
- /* both are supplementary */
- args.source[starter+1]=(char)value2;
- } else {
- /* the composition is shorter than the starter,
- * move the intermediate characters forward one */
- starterIsSupplementary=false;
- q=starter+1;
- r=q+1;
- while(r<remove) {
- args.source[q++]=args.source[r++];
- }
- --remove;
- }
- } else if(value2!=0) { // for U+1109A, U+1109C, and U+110AB
- starterIsSupplementary=true;
- args.source[starter+1]=(char)value2;
- /* } else { both are on the BMP, nothing more to do */
- }
-
- /* remove the combining mark by moving the following text
- * over it */
- if(remove<args.start) {
- q=remove;
- r=args.start;
- while(r<args.limit) {
- args.source[q++]=args.source[r++];
- }
- args.start=remove;
- args.limit=q;
- }
-
- /* keep prevCC because we removed the combining mark */
-
- /* done? */
- if(args.start==args.limit) {
- return (char)prevCC;
- }
-
- /* is the composition a starter that combines forward? */
- if(result>1) {
- combineFwdIndex=getCombiningIndexFromStarter((char)value,
- (char)value2);
- } else {
- starter=-1;
- }
-
- /* we combined; continue with looking for compositions */
- continue;
- }
- }
-
- /* no combination this time */
- prevCC=ncArg.cc;
- if(args.start==args.limit) {
- return (char)prevCC;
- }
-
- /* if (c, c2) did not combine, then check if it is a starter */
- if(ncArg.cc==0) {
- /* found a new starter; combineFlags==0 if (c, c2) is excluded */
- if((combineFlags&COMBINES_FWD)!=0) {
- /* it may combine with something, prepare for it */
- if(ncArg.c2==0) {
- starterIsSupplementary=false;
- starter=args.start-1;
- } else {
- starterIsSupplementary=false;
- starter=args.start-2;
- }
- combineFwdIndex=combineBackIndex;
- } else {
- /* it will not combine with anything */
- starter=-1;
- }
- } else if((options&OPTIONS_COMPOSE_CONTIGUOUS)!=0) {
- /* FCC: no discontiguous compositions; any intervening character blocks */
- starter=-1;
- }
+ if (UTF16.isTrailSurrogate(args.c1)) {
+ /* unpaired second surrogate */
+ return 0;
+ } else if (!UTF16.isLeadSurrogate(args.c1)) {
+ return UCharacter.getCombiningClass(args.c1);
+ } else if (args.next!=args.limit &&
+ UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
+ ++args.next;
+ return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2));
+ } else {
+ /* unpaired first surrogate */
+ args.c2=0;
+ return 0;
}
}
- // find the last true starter between src[start]....src[current] going
- // backwards and return its index
- private static int findPreviousStarter(char[]src, int srcStart, int current,
- int/*unsigned*/ ccOrQCMask,
- int/*unsigned*/ decompQCMask,
- char minNoMaybe) {
- long norm32;
- PrevArgs args = new PrevArgs();
- args.src = src;
- args.start = srcStart;
- args.current = current;
-
- while(args.start<args.current) {
- norm32= getPrevNorm32(args, minNoMaybe, ccOrQCMask|decompQCMask);
- if(isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
- break;
- }
- }
- return args.current;
- }
-
- /* find the first true starter in [src..limit[ and return the
- * pointer to it
- */
- private static int/*index*/ findNextStarter(char[] src,int start,int limit,
- int/*unsigned*/ qcMask,
- int/*unsigned*/ decompQCMask,
- char minNoMaybe) {
- int p;
- long/*unsigned*/ norm32;
- int ccOrQCMask;
- char c, c2;
-
- ccOrQCMask=CC_MASK|qcMask;
-
- DecomposeArgs decompArgs = new DecomposeArgs();
-
- for(;;) {
- if(start==limit) {
- break; /* end of string */
- }
- c=src[start];
- if(c<minNoMaybe) {
- break; /* catches NUL terminater, too */
- }
-
- norm32=getNorm32(c);
- if((norm32&ccOrQCMask)==0) {
- break; /* true starter */
- }
-
- if(isNorm32LeadSurrogate(norm32)) {
- /* c is a lead surrogate, get the real norm32 */
- if((start+1)==limit ||
- !UTF16.isTrailSurrogate(c2=(src[start+1]))){
- /* unmatched first surrogate: counts as a true starter */
- break;
- }
- norm32=getNorm32FromSurrogatePair(norm32, c2);
-
- if((norm32&ccOrQCMask)==0) {
- break; /* true starter */
- }
- } else {
- c2=0;
- }
-
- /* (c, c2) is not a true starter but its decomposition may be */
- if((norm32&decompQCMask)!=0) {
- /* (c, c2) decomposes, get everything from the variable-length
- * extra data */
- p=decompose(norm32, decompQCMask, decompArgs);
-
- /* get the first character's norm32 to check if it is a true
- * starter */
- if(decompArgs.cc==0 && (getNorm32(extraData,p, qcMask)&qcMask)==0) {
- break; /* true starter */
- }
- }
-
- start+= c2==0 ? 1 : 2; /* not a true starter, continue */
- }
-
- return start;
- }
-
-
- private static final class ComposePartArgs{
- int prevCC;
- int length; /* length of decomposed part */
- }
-
- /* decompose and recompose [prevStarter..src[ */
- private static char[] composePart(ComposePartArgs args,
- int prevStarter,
- char[] src, int start, int limit,
- int options,
- UnicodeSet nx) {
- int recomposeLimit;
- boolean compat =((options&OPTIONS_COMPAT)!=0);
-
- /* decompose [prevStarter..src[ */
- int[] outTrailCC = new int[1];
- char[] buffer = new char[(limit-prevStarter)*MAX_BUFFER_SIZE];
-
- for(;;){
- args.length=decompose(src,prevStarter,(start),
- buffer,0,buffer.length,
- compat,outTrailCC,nx);
- if(args.length<=buffer.length){
- break;
- }else{
- buffer = new char[args.length];
- }
- }
-
- /* recompose the decomposition */
- recomposeLimit=args.length;
-
- if(args.length>=2) {
- RecomposeArgs rcArgs = new RecomposeArgs();
- rcArgs.source = buffer;
- rcArgs.start = 0;
- rcArgs.limit = recomposeLimit;
- args.prevCC=recompose(rcArgs, options, nx);
- recomposeLimit = rcArgs.limit;
- }
-
- /* return with a pointer to the recomposition and its length */
- args.length=recomposeLimit;
- return buffer;
- }
-
- private static boolean composeHangul(char prev, char c,
- long/*unsigned*/ norm32,
- char[] src,int[] srcIndex, int limit,
- boolean compat,
- char[] dest,int destIndex,
- UnicodeSet nx) {
- int start=srcIndex[0];
- if(isJamoVTNorm32JamoV(norm32)) {
- /* c is a Jamo V, compose with previous Jamo L and
- * following Jamo T */
- prev=(char)(prev-JAMO_L_BASE);
- if(prev<JAMO_L_COUNT) {
- c=(char)(HANGUL_BASE+(prev*JAMO_V_COUNT+
- (c-JAMO_V_BASE))*JAMO_T_COUNT);
-
- /* check if the next character is a Jamo T (normal or
- * compatibility) */
- if(start!=limit) {
- char next, t;
-
- next=src[start];
- if((t=(char)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
- /* normal Jamo T */
- ++start;
- c+=t;
- } else if(compat) {
- /* if NFKC, then check for compatibility Jamo T
- * (BMP only) */
- norm32=getNorm32(next);
- if(isNorm32Regular(norm32) && ((norm32&QC_NFKD)!=0)) {
- int p /*index into extra data array*/;
- DecomposeArgs dcArgs = new DecomposeArgs();
- p=decompose(norm32, QC_NFKD, dcArgs);
- if(dcArgs.length==1 &&
- (t=(char)(extraData[p]-JAMO_T_BASE))
- <JAMO_T_COUNT) {
- /* compatibility Jamo T */
- ++start;
- c+=t;
- }
- }
- }
- }
- if(nx_contains(nx, c)) {
- if(!isHangulWithoutJamoT(c)) {
- --start; /* undo ++start from reading the Jamo T */
- }
- return false;
- }
- dest[destIndex]=c;
- srcIndex[0]=start;
- return true;
- }
- } else if(isHangulWithoutJamoT(prev)) {
- /* c is a Jamo T, compose with previous Hangul LV that does not
- * contain a Jamo T */
- c=(char)(prev+(c-JAMO_T_BASE));
- if(nx_contains(nx, c)) {
- return false;
- }
- dest[destIndex]=c;
- srcIndex[0]=start;
- return true;
- }
- return false;
- }
- /*
- public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){
- return compose(src,0,src.length,dest,0,dest.length,compat, nx);
- }
- */
-
- public static int compose(char[] src, int srcStart, int srcLimit,
- char[] dest,int destStart,int destLimit,
- int options,UnicodeSet nx) {
-
- int prevSrc, prevStarter;
- long/*unsigned*/ norm32;
- int ccOrQCMask, qcMask;
- int reorderStartIndex, length;
- char c, c2, minNoMaybe;
- int/*unsigned byte*/ cc, prevCC;
- int[] ioIndex = new int[1];
- int destIndex = destStart;
- int srcIndex = srcStart;
-
- if((options&OPTIONS_COMPAT)!=0) {
- minNoMaybe=(char)indexes[INDEX_MIN_NFKC_NO_MAYBE];
- qcMask=QC_NFKC;
- } else {
- minNoMaybe=(char)indexes[INDEX_MIN_NFC_NO_MAYBE];
- qcMask=QC_NFC;
- }
-
- /*
- * prevStarter points to the last character before the current one
- * that is a "true" starter with cc==0 and quick check "yes".
- *
- * prevStarter will be used instead of looking for a true starter
- * while incrementally decomposing [prevStarter..prevSrc[
- * in _composePart(). Having a good prevStarter allows to just decompose
- * the entire [prevStarter..prevSrc[.
- *
- * When _composePart() backs out from prevSrc back to prevStarter,
- * then it also backs out destIndex by the same amount.
- * Therefore, at all times, the (prevSrc-prevStarter) source units
- * must correspond 1:1 to destination units counted with destIndex,
- * except for reordering.
- * This is true for the qc "yes" characters copied in the fast loop,
- * and for pure reordering.
- * prevStarter must be set forward to src when this is not true:
- * In _composePart() and after composing a Hangul syllable.
- *
- * This mechanism relies on the assumption that the decomposition of a
- * true starter also begins with a true starter. gennorm/store.c checks
- * for this.
- */
- prevStarter=srcIndex;
-
- ccOrQCMask=CC_MASK|qcMask;
- /*destIndex=*/reorderStartIndex=0;/* ####TODO#### check this **/
- prevCC=0;
-
- /* avoid compiler warnings */
- norm32=0;
- c=0;
-
- for(;;) {
- /* count code units below the minimum or with irrelevant data for
- * the quick check */
- prevSrc=srcIndex;
-
- while(srcIndex!=srcLimit && ((c=src[srcIndex])<minNoMaybe ||
- ((norm32=getNorm32(c))&ccOrQCMask)==0)) {
- prevCC=0;
- ++srcIndex;
- }
-
-
- /* copy these code units all at once */
- if(srcIndex!=prevSrc) {
- length=srcIndex-prevSrc;
- if((destIndex+length)<=destLimit) {
- System.arraycopy(src,prevSrc,dest,destIndex,length);
- }
- destIndex+=length;
- reorderStartIndex=destIndex;
-
- /* set prevStarter to the last character in the quick check
- * loop */
- prevStarter=srcIndex-1;
- if(UTF16.isTrailSurrogate(src[prevStarter]) &&
- prevSrc<prevStarter &&
- UTF16.isLeadSurrogate(src[(prevStarter-1)])) {
- --prevStarter;
- }
-
- prevSrc=srcIndex;
- }
-
- /* end of source reached? */
- if(srcIndex==srcLimit) {
- break;
- }
+ private VersionInfo dataVersion;
- /* c already contains *src and norm32 is set for it, increment src*/
- ++srcIndex;
-
- /*
- * source buffer pointers:
- *
- * all done quick check current char not yet
- * "yes" but (c, c2) processed
- * may combine
- * forward
- * [-------------[-------------[-------------[-------------[
- * | | | | |
- * start prevStarter prevSrc src limit
- *
- *
- * destination buffer pointers and indexes:
- *
- * all done might take not filled yet
- * characters for
- * reordering
- * [-------------[-------------[-------------[
- * | | | |
- * dest reorderStartIndex destIndex destCapacity
- */
-
- /* check one above-minimum, relevant code unit */
- /*
- * norm32 is for c=*(src-1), and the quick check flag is "no" or
- * "maybe", and/or cc!=0
- * check for Jamo V/T, then for surrogates and regular characters
- * c is not a Hangul syllable or Jamo L because
- * they are not marked with no/maybe for NFC & NFKC(and their cc==0)
- */
- if(isNorm32HangulOrJamo(norm32)) {
- /*
- * c is a Jamo V/T:
- * try to compose with the previous character, Jamo V also with
- * a following Jamo T, and set values here right now in case we
- * just continue with the main loop
- */
- prevCC=cc=0;
- reorderStartIndex=destIndex;
- ioIndex[0]=srcIndex;
- if(
- destIndex>0 &&
- composeHangul(src[(prevSrc-1)], c, norm32,src, ioIndex,
- srcLimit, (options&OPTIONS_COMPAT)!=0, dest,
- destIndex<=destLimit ? destIndex-1: 0,
- nx)
- ) {
- srcIndex=ioIndex[0];
- prevStarter=srcIndex;
- continue;
- }
-
- srcIndex = ioIndex[0];
-
- /* the Jamo V/T did not compose into a Hangul syllable, just
- * append to dest */
- c2=0;
- length=1;
- prevStarter=prevSrc;
- } else {
- if(isNorm32Regular(norm32)) {
- c2=0;
- length=1;
- } else {
- /* c is a lead surrogate, get the real norm32 */
- if(srcIndex!=srcLimit &&
- UTF16.isTrailSurrogate(c2=src[srcIndex])) {
- ++srcIndex;
- length=2;
- norm32=getNorm32FromSurrogatePair(norm32, c2);
- } else {
- /* c is an unpaired lead surrogate, nothing to do */
- c2=0;
- length=1;
- norm32=0;
- }
- }
- ComposePartArgs args =new ComposePartArgs();
-
- /* we are looking at the character (c, c2) at [prevSrc..src[ */
- if(nx_contains(nx, c, c2)) {
- /* excluded: norm32==0 */
- cc=0;
- } else if((norm32&qcMask)==0) {
- cc=(int)((UNSIGNED_BYTE_MASK)&(norm32>>CC_SHIFT));
- } else {
- char[] p;
-
- /*
- * find appropriate boundaries around this character,
- * decompose the source text from between the boundaries,
- * and recompose it
- *
- * this puts the intermediate text into the side buffer because
- * it might be longer than the recomposition end result,
- * or the destination buffer may be too short or missing
- *
- * note that destIndex may be adjusted backwards to account
- * for source text that passed the quick check but needed to
- * take part in the recomposition
- */
- int decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
- /*
- * find the last true starter in [prevStarter..src[
- * it is either the decomposition of the current character (at prevSrc),
- * or prevStarter
- */
- if(isTrueStarter(norm32, CC_MASK|qcMask, decompQCMask)) {
- prevStarter=prevSrc;
- } else {
- /* adjust destIndex: back out what had been copied with qc "yes" */
- destIndex-=prevSrc-prevStarter;
- }
-
- /* find the next true starter in [src..limit[ */
- srcIndex=findNextStarter(src, srcIndex,srcLimit, qcMask,
- decompQCMask, minNoMaybe);
- //args.prevStarter = prevStarter;
- args.prevCC = prevCC;
- //args.destIndex = destIndex;
- args.length = length;
- p=composePart(args,prevStarter,src,srcIndex,srcLimit,options,nx);
-
- if(p==null) {
- /* an error occurred (out of memory) */
- break;
- }
-
- prevCC = args.prevCC;
- length = args.length;
-
- /* append the recomposed buffer contents to the destination
- * buffer */
- if((destIndex+args.length)<=destLimit) {
- int i=0;
- while(i<args.length) {
- dest[destIndex++]=p[i++];
- --length;
- }
- } else {
- /* buffer overflow */
- /* keep incrementing the destIndex for preflighting */
- destIndex+=length;
- }
-
- prevStarter=srcIndex;
- continue;
- }
- }
-
- /* append the single code point (c, c2) to the destination buffer */
- if((destIndex+length)<=destLimit) {
- if(cc!=0 && cc<prevCC) {
- /* (c, c2) is out of order with respect to the preceding
- * text */
- int reorderSplit= destIndex;
- destIndex+=length;
- prevCC=insertOrdered(dest,reorderStartIndex, reorderSplit,
- destIndex, c, c2, cc);
- } else {
- /* just append (c, c2) */
- dest[destIndex++]=c;
- if(c2!=0) {
- dest[destIndex++]=c2;
- }
- prevCC=cc;
- }
- } else {
- /* buffer overflow */
- /* keep incrementing the destIndex for preflighting */
- destIndex+=length;
- prevCC=cc;
- }
- }
-
- return destIndex - destStart;
- }
-
- public static int getCombiningClass(int c) {
- long norm32;
- norm32=getNorm32(c);
- return (int)((norm32>>CC_SHIFT)&0xFF);
- }
-
- public static boolean isFullCompositionExclusion(int c) {
- if(isFormatVersion_2_1) {
- int aux =AuxTrieImpl.auxTrie.getCodePointValue(c);
- return (aux & AUX_COMP_EX_MASK)!=0;
- } else {
- return false;
- }
- }
-
- public static boolean isCanonSafeStart(int c) {
- if(isFormatVersion_2_1) {
- int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
- return (aux & AUX_UNSAFE_MASK)==0;
- } else {
- return false;
- }
- }
-
- /* Is c an NF<mode>-skippable code point? See unormimp.h. */
- public static boolean isNFSkippable(int c, NormalizerBase.Mode mode, long mask) {
- long /*unsigned int*/ norm32;
- mask = mask & UNSIGNED_INT_MASK;
- char aux;
-
- /* check conditions (a)..(e), see unormimp.h */
- norm32 = getNorm32(c);
-
- if((norm32&mask)!=0) {
- return false; /* fails (a)..(e), not skippable */
- }
-
- if(mode == NormalizerBase.NFD || mode == NormalizerBase.NFKD || mode == NormalizerBase.NONE){
- return true; /* NF*D, passed (a)..(c), is skippable */
- }
- /* check conditions (a)..(e), see unormimp.h */
-
- /* NF*C/FCC, passed (a)..(e) */
- if((norm32& QC_NFD)==0) {
- return true; /* no canonical decomposition, is skippable */
- }
-
- /* check Hangul syllables algorithmically */
- if(isNorm32HangulOrJamo(norm32)) {
- /* Jamo passed (a)..(e) above, must be Hangul */
- return !isHangulWithoutJamoT((char)c); /* LVT are skippable, LV are not */
- }
-
- /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
- /* NF*C, test (f) flag */
- if(!isFormatVersion_2_2) {
- return false; /* no (f) data, say not skippable to be safe */
- }
-
-
- aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
- return (aux&AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
-
- /* } else { FCC, test fcd<=1 instead of the above } */
- }
-
- public static UnicodeSet addPropertyStarts(UnicodeSet set) {
- int c;
-
- /* add the start code point of each same-value range of each trie */
- //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
- TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie);
- RangeValueIterator.Element normResult = new RangeValueIterator.Element();
-
- while(normIter.next(normResult)){
- set.add(normResult.start);
- }
-
- //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
- TrieIterator fcdIter = new TrieIterator(FCDTrieImpl.fcdTrie);
- RangeValueIterator.Element fcdResult = new RangeValueIterator.Element();
-
- while(fcdIter.next(fcdResult)){
- set.add(fcdResult.start);
- }
-
- if(isFormatVersion_2_1){
- //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
- TrieIterator auxIter = new TrieIterator(AuxTrieImpl.auxTrie);
- RangeValueIterator.Element auxResult = new RangeValueIterator.Element();
- while(auxIter.next(auxResult)){
- set.add(auxResult.start);
- }
- }
- /* add Hangul LV syllables and LV+1 because of skippables */
- for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
- set.add(c);
- set.add(c+1);
- }
- set.add(HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
- return set; // for chaining
- }
-
- /**
- * Internal API, used in UCharacter.getIntPropertyValue().
- * @internal
- * @param c code point
- * @param modeValue numeric value compatible with Mode
- * @return numeric value compatible with QuickCheck
- */
- public static final int quickCheck(int c, int modeValue) {
- final int qcMask[/*UNORM_MODE_COUNT*/]={
- 0, 0, QC_NFD, QC_NFKD, QC_NFC, QC_NFKC
- };
-
- int norm32=(int)getNorm32(c)&qcMask[modeValue];
+ // Code point thresholds for quick check codes.
+ private int minDecompNoCP;
+ private int minCompNoMaybeCP;
- if(norm32==0) {
- return 1; // YES
- } else if((norm32&QC_ANY_NO)!=0) {
- return 0; // NO
- } else /* _NORM_QC_ANY_MAYBE */ {
- return 2; // MAYBE;
- }
- }
-
- private static int strCompare(char[] s1, int s1Start, int s1Limit,
- char[] s2, int s2Start, int s2Limit,
- boolean codePointOrder) {
-
- int start1, start2, limit1, limit2;
-
- char c1, c2;
-
- /* setup for fix-up */
- start1=s1Start;
- start2=s2Start;
-
- int length1, length2;
-
- length1 = s1Limit - s1Start;
- length2 = s2Limit - s2Start;
-
- int lengthResult;
-
- if(length1<length2) {
- lengthResult=-1;
- limit1=start1+length1;
- } else if(length1==length2) {
- lengthResult=0;
- limit1=start1+length1;
- } else /* length1>length2 */ {
- lengthResult=1;
- limit1=start1+length2;
- }
-
- if(s1==s2) {
- return lengthResult;
- }
-
- for(;;) {
- /* check pseudo-limit */
- if(s1Start==limit1) {
- return lengthResult;
- }
-
- c1=s1[s1Start];
- c2=s2[s2Start];
- if(c1!=c2) {
- break;
- }
- ++s1Start;
- ++s2Start;
- }
-
- /* setup for fix-up */
- limit1=start1+length1;
- limit2=start2+length2;
-
-
- /* if both values are in or above the surrogate range, fix them up */
- if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
- /* subtract 0x2800 from BMP code points to make them smaller than
- * supplementary ones */
- if(
- ( c1<=0xdbff && (s1Start+1)!=limit1 &&
- UTF16.isTrailSurrogate(s1[(s1Start+1)])
- ) ||
- ( UTF16.isTrailSurrogate(c1) && start1!=s1Start &&
- UTF16.isLeadSurrogate(s1[(s1Start-1)])
- )
- ) {
- /* part of a surrogate pair, leave >=d800 */
- } else {
- /* BMP code point - may be surrogate code point - make <d800 */
- c1-=0x2800;
- }
-
- if(
- ( c2<=0xdbff && (s2Start+1)!=limit2 &&
- UTF16.isTrailSurrogate(s2[(s2Start+1)])
- ) ||
- ( UTF16.isTrailSurrogate(c2) && start2!=s2Start &&
- UTF16.isLeadSurrogate(s2[(s2Start-1)])
- )
- ) {
- /* part of a surrogate pair, leave >=d800 */
- } else {
- /* BMP code point - may be surrogate code point - make <d800 */
- c2-=0x2800;
- }
- }
-
- /* now c1 and c2 are in UTF-32-compatible order */
- return (int)c1-(int)c2;
- }
-
-
- /*
- * Status of tailored normalization
- *
- * This was done initially for investigation on Unicode public review issue 7
- * (http://www.unicode.org/review/). See Jitterbug 2481.
- * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
- * a permanent feature in ICU 2.6 in support of IDNA which requires true
- * Unicode 3.2 normalization.
- * (NormalizationCorrections are rolled into IDNA mapping tables.)
- *
- * Tailored normalization as implemented here allows to "normalize less"
- * than full Unicode normalization would.
- * Based internally on a UnicodeSet of code points that are
- * "excluded from normalization", the normalization functions leave those
- * code points alone ("inert"). This means that tailored normalization
- * still transforms text into a canonically equivalent form.
- * It does not add decompositions to code points that do not have any or
- * change decomposition results.
- *
- * Any function that searches for a safe boundary has not been touched,
- * which means that these functions will be over-pessimistic when
- * exclusions are applied.
- * This should not matter because subsequent checks and normalizations
- * do apply the exclusions; only a little more of the text may be processed
- * than necessary under exclusions.
- *
- * Normalization exclusions have the following effect on excluded code points c:
- * - c is not decomposed
- * - c is not a composition target
- * - c does not combine forward or backward for composition
- * except that this is not implemented for Jamo
- * - c is treated as having a combining class of 0
- */
-
- /*
- * Constants for the bit fields in the options bit set parameter.
- * These need not be public.
- * A user only needs to know the currently assigned values.
- * The number and positions of reserved bits per field can remain private.
- */
- private static final int OPTIONS_NX_MASK=0x1f;
- private static final int OPTIONS_UNICODE_MASK=0xe0;
- public static final int OPTIONS_SETS_MASK=0xff;
-// private static final int OPTIONS_UNICODE_SHIFT=5;
- private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK+1];
-
- /* Constants for options flags for normalization.*/
-
- /**
- * Options bit 0, do not decompose Hangul syllables.
- * @draft ICU 2.6
- */
- private static final int NX_HANGUL = 1;
- /**
- * Options bit 1, do not decompose CJK compatibility characters.
- * @draft ICU 2.6
- */
- private static final int NX_CJK_COMPAT=2;
- /**
- * Options bit 8, use buggy recomposition described in
- * Unicode Public Review Issue #29
- * at http://www.unicode.org/review/resolved-pri.html#pri29
- *
- * Used in IDNA implementation according to strict interpretation
- * of IDNA definition based on Unicode 3.2 which predates PRI #29.
- *
- * See ICU4C unormimp.h
- *
- * @draft ICU 3.2
- */
- public static final int BEFORE_PRI_29=0x100;
-
- /*
- * The following options are used only in some composition functions.
- * They use bits 12 and up to preserve lower bits for the available options
- * space in unorm_compare() -
- * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
- */
-
- /** Options bit 12, for compatibility vs. canonical decomposition. */
- public static final int OPTIONS_COMPAT=0x1000;
- /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
- public static final int OPTIONS_COMPOSE_CONTIGUOUS=0x2000;
-
- /* normalization exclusion sets --------------------------------------------- */
-
- /*
- * Normalization exclusion UnicodeSets are used for tailored normalization;
- * see the comment near the beginning of this file.
- *
- * By specifying one or several sets of code points,
- * those code points become inert for normalization.
- */
- private static final synchronized UnicodeSet internalGetNXHangul() {
- /* internal function, does not check for incoming U_FAILURE */
-
- if(nxCache[NX_HANGUL]==null) {
- nxCache[NX_HANGUL]=new UnicodeSet(0xac00, 0xd7a3);
- }
- return nxCache[NX_HANGUL];
- }
-
- private static final synchronized UnicodeSet internalGetNXCJKCompat() {
- /* internal function, does not check for incoming U_FAILURE */
-
- if(nxCache[NX_CJK_COMPAT]==null) {
-
- /* build a set from [CJK Ideographs]&[has canonical decomposition] */
- UnicodeSet set, hasDecomp;
-
- set=new UnicodeSet("[:Ideographic:]");
-
- /* start with an empty set for [has canonical decomposition] */
- hasDecomp=new UnicodeSet();
-
- /* iterate over all ideographs and remember which canonically decompose */
- UnicodeSetIterator it = new UnicodeSetIterator(set);
- int start, end;
- long norm32;
-
- while(it.nextRange() && (it.codepoint != UnicodeSetIterator.IS_STRING)) {
- start=it.codepoint;
- end=it.codepointEnd;
- while(start<=end) {
- norm32 = getNorm32(start);
- if((norm32 & QC_NFD)>0) {
- hasDecomp.add(start);
- }
- ++start;
- }
- }
-
- /* hasDecomp now contains all ideographs that decompose canonically */
- nxCache[NX_CJK_COMPAT]=hasDecomp;
-
- }
-
- return nxCache[NX_CJK_COMPAT];
- }
-
- private static final synchronized UnicodeSet internalGetNXUnicode(int options) {
- options &= OPTIONS_UNICODE_MASK;
- if(options==0) {
- return null;
- }
-
- if(nxCache[options]==null) {
- /* build a set with all code points that were not designated by the specified Unicode version */
- UnicodeSet set = new UnicodeSet();
-
- switch(options) {
- case NormalizerBase.UNICODE_3_2:
- set.applyPattern("[:^Age=3.2:]");
- break;
- default:
- return null;
- }
-
- nxCache[options]=set;
- }
-
- return nxCache[options];
- }
-
- /* Get a decomposition exclusion set. The data must be loaded. */
- private static final synchronized UnicodeSet internalGetNX(int options) {
- options&=OPTIONS_SETS_MASK;
-
- if(nxCache[options]==null) {
- /* return basic sets */
- if(options==NX_HANGUL) {
- return internalGetNXHangul();
- }
- if(options==NX_CJK_COMPAT) {
- return internalGetNXCJKCompat();
- }
- if((options & OPTIONS_UNICODE_MASK)!=0 && (options & OPTIONS_NX_MASK)==0) {
- return internalGetNXUnicode(options);
- }
-
- /* build a set from multiple subsets */
- UnicodeSet set;
- UnicodeSet other;
-
- set=new UnicodeSet();
-
-
- if((options & NX_HANGUL)!=0 && null!=(other=internalGetNXHangul())) {
- set.addAll(other);
- }
- if((options&NX_CJK_COMPAT)!=0 && null!=(other=internalGetNXCJKCompat())) {
- set.addAll(other);
- }
- if((options&OPTIONS_UNICODE_MASK)!=0 && null!=(other=internalGetNXUnicode(options))) {
- set.addAll(other);
- }
+ // Norm16 value thresholds for quick check combinations and types of extra data.
+ private int minYesNo;
+ private int minYesNoMappingsOnly;
+ private int minNoNo;
+ private int limitNoNo;
+ private int minMaybeYes;
- nxCache[options]=set;
- }
- return nxCache[options];
- }
-
- public static final UnicodeSet getNX(int options) {
- if((options&=OPTIONS_SETS_MASK)==0) {
- /* incoming failure, or no decomposition exclusions requested */
- return null;
- } else {
- return internalGetNX(options);
- }
- }
-
- private static final boolean nx_contains(UnicodeSet nx, int c) {
- return nx!=null && nx.contains(c);
- }
-
- private static final boolean nx_contains(UnicodeSet nx, char c, char c2) {
- return nx!=null && nx.contains(c2==0 ? c : UCharacterProperty.getRawSupplementary(c, c2));
- }
-
-/*****************************************************************************/
-
- /**
- * Get the canonical decomposition
- * sherman for ComposedCharIter
- */
-
- public static int getDecompose(int chars[], String decomps[]) {
- DecomposeArgs args = new DecomposeArgs();
- int length=0;
- long norm32 = 0;
- int ch = -1;
- int index = 0;
- int i = 0;
-
- while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff
- //TBD !!!! the hack code heres save us about 50ms for startup
- //need a better solution/lookup
- if (ch == 0x30ff)
- ch = 0xf900;
- else if (ch == 0x10000)
- ch = 0x1d15e;
- else if (ch == 0x1d1c1)
- ch = 0x2f800;
-
- norm32 = NormalizerImpl.getNorm32(ch);
- if((norm32 & QC_NFD)!=0 && i < chars.length) {
- chars[i] = ch;
- index = decompose(norm32, args);
- decomps[i++] = new String(extraData,index, args.length);
- }
- }
- return i;
- }
-
- //------------------------------------------------------
- // special method for Collation
- //------------------------------------------------------
- private static boolean needSingleQuotation(char c) {
- return (c >= 0x0009 && c <= 0x000D) ||
- (c >= 0x0020 && c <= 0x002F) ||
- (c >= 0x003A && c <= 0x0040) ||
- (c >= 0x005B && c <= 0x0060) ||
- (c >= 0x007B && c <= 0x007E);
- }
-
- public static String canonicalDecomposeWithSingleQuotation(String string) {
- char[] src = string.toCharArray();
- int srcIndex = 0;
- int srcLimit = src.length;
- char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3
- int destIndex = 0;
- int destLimit = dest.length;
-
- char[] buffer = new char[3];
- int prevSrc;
- long norm32;
- int ccOrQCMask;
- int qcMask = QC_NFD;
- int reorderStartIndex, length;
- char c, c2;
- char minNoMaybe = (char)indexes[INDEX_MIN_NFD_NO_MAYBE];
- int cc, prevCC, trailCC;
- char[] p;
- int pStart;
-
-
- // initialize
- ccOrQCMask = CC_MASK | qcMask;
- reorderStartIndex = 0;
- prevCC = 0;
- norm32 = 0;
- c = 0;
- pStart = 0;
-
- cc = trailCC = -1; // initialize to bogus value
- for(;;) {
- prevSrc=srcIndex;
- //quick check (1)less than minNoMaybe (2)no decomp (3)hangual
- while (srcIndex != srcLimit &&
- (( c = src[srcIndex]) < minNoMaybe ||
- ((norm32 = getNorm32(c)) & ccOrQCMask) == 0 ||
- ( c >= '\uac00' && c <= '\ud7a3'))){
-
- prevCC = 0;
- ++srcIndex;
- }
-
- // copy these code units all at once
- if (srcIndex != prevSrc) {
- length = srcIndex - prevSrc;
- if ((destIndex + length) <= destLimit) {
- System.arraycopy(src,prevSrc,dest,destIndex,length);
- }
-
- destIndex += length;
- reorderStartIndex = destIndex;
- }
-
- // end of source reached?
- if(srcIndex == srcLimit) {
- break;
- }
- // c already contains *src and norm32 is set for it, increment src
- ++srcIndex;
-
- if(isNorm32Regular(norm32)) {
- c2 = 0;
- length = 1;
- } else {
- // c is a lead surrogate, get the real norm32
- if(srcIndex != srcLimit &&
- Character.isLowSurrogate(c2 = src[srcIndex])) {
- ++srcIndex;
- length = 2;
- norm32 = getNorm32FromSurrogatePair(norm32, c2);
- } else {
- c2 = 0;
- length = 1;
- norm32 = 0;
- }
- }
+ private Trie2_16 normTrie;
+ private String maybeYesCompositions;
+ private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
+ private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
+ private int[] tccc180; // [0x180] tccc values for U+0000..U+017F
- // get the decomposition and the lead and trail cc's
- if((norm32 & qcMask) == 0) {
- // c does not decompose
- cc = trailCC = (int)((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
- p = null;
- pStart = -1;
- } else {
- DecomposeArgs arg = new DecomposeArgs();
- // c decomposes, get everything from the variable-length
- // extra data
- pStart = decompose(norm32, qcMask, arg);
- p = extraData;
- length = arg.length;
- cc = arg.cc;
- trailCC = arg.trailCC;
- if(length == 1) {
- // fastpath a single code unit from decomposition
- c = p[pStart];
- c2 = 0;
- p = null;
- pStart = -1;
- }
- }
-
- if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations
- // buffer overflow
- char[] tmpBuf = new char[destLimit * 2];
- System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
- dest = tmpBuf;
- destLimit = dest.length;
- }
- // append the decomposition to the destination buffer, assume length>0
- {
- int reorderSplit = destIndex;
- if(p == null) {
- // fastpath: single code point
- if (needSingleQuotation(c)) {
- //if we need single quotation, no need to consider "prevCC"
- //and it must NOT be a supplementary pair
- dest[destIndex++] = '\'';
- dest[destIndex++] = c;
- dest[destIndex++] = '\'';
- trailCC = 0;
- } else if(cc != 0 && cc < prevCC) {
- // (c, c2) is out of order with respect to the preceding
- // text
- destIndex += length;
- trailCC = insertOrdered(dest,reorderStartIndex,
- reorderSplit, destIndex, c, c2, cc);
- } else {
- // just append (c, c2)
- dest[destIndex++] = c;
- if(c2 != 0) {
- dest[destIndex++] = c2;
- }
- }
- } else {
- // general: multiple code points (ordered by themselves)
- // from decomposition
- if (needSingleQuotation(p[pStart])) {
- dest[destIndex++] = '\'';
- dest[destIndex++] = p[pStart++];
- dest[destIndex++] = '\'';
- length--;
- do {
- dest[destIndex++] = p[pStart++];
- } while(--length > 0);
- } else
- if(cc != 0 && cc < prevCC) {
- destIndex += length;
- trailCC = mergeOrdered(dest,reorderStartIndex,
- reorderSplit,p, pStart,pStart+length);
- } else {
- // just append the decomposition
- do {
- dest[destIndex++] = p[pStart++];
- } while(--length > 0);
- }
- }
- }
- prevCC = trailCC;
- if(prevCC == 0) {
- reorderStartIndex = destIndex;
- }
- }
- return new String(dest, 0, destIndex);
- }
-
- //------------------------------------------------------
- // mapping method for IDNA/StringPrep
- //------------------------------------------------------
-
- /*
- * Normalization using NormalizerBase.UNICODE_3_2 option supports Unicode
- * 3.2 normalization with Corrigendum 4 corrections. However, normalization
- * without the corrections is necessary for IDNA/StringPrep support.
- * This method is called when NormalizerBase.UNICODE_3_2_0_ORIGINAL option
- * (= sun.text.Normalizer.UNICODE_3_2) is used and normalizes five
- * characters in Corrigendum 4 before normalization in order to avoid
- * incorrect normalization.
- * For the Corrigendum 4 issue, refer
- * http://www.unicode.org/versions/corrigendum4.html
- */
-
- /*
- * Option used in NormalizerBase.UNICODE_3_2_0_ORIGINAL.
- */
- public static final int WITHOUT_CORRIGENDUM4_CORRECTIONS=0x40000;
-
- private static final char[][] corrigendum4MappingTable = {
- {'\uD844', '\uDF6A'}, // 0x2F868
- {'\u5F33'}, // 0x2F874
- {'\u43AB'}, // 0x2F91F
- {'\u7AAE'}, // 0x2F95F
- {'\u4D57'}}; // 0x2F9BF
-
- /*
- * Removing Corrigendum 4 fix
- * @return normalized text
- */
- public static String convert(String str) {
- if (str == null) {
- return null;
- }
-
- int ch = UCharacterIterator.DONE;
- StringBuffer dest = new StringBuffer();
- UCharacterIterator iter = UCharacterIterator.getInstance(str);
-
- while ((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
- switch (ch) {
- case 0x2F868:
- dest.append(corrigendum4MappingTable[0]);
- break;
- case 0x2F874:
- dest.append(corrigendum4MappingTable[1]);
- break;
- case 0x2F91F:
- dest.append(corrigendum4MappingTable[2]);
- break;
- case 0x2F95F:
- dest.append(corrigendum4MappingTable[3]);
- break;
- case 0x2F9BF:
- dest.append(corrigendum4MappingTable[4]);
- break;
- default:
- UTF16.append(dest,ch);
- break;
- }
- }
-
- return dest.toString();
- }
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/OutputInt.java Wed Jul 15 11:05:51 2015 +0900
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ *******************************************************************************
+ * Copyright (C) 2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *******************************************************************************
+ */
+package sun.text.normalizer;
+
+/**
+ * Simple struct-like class for int output parameters.
+ * Like {@code Output<Integer>} but without auto-boxing.
+ *
+ * @internal but could become public
+ * deprecated This API is ICU internal only.
+ */
+class OutputInt {
+
+ /**
+ * The value field.
+ *
+ * @internal
+ * deprecated This API is ICU internal only.
+ */
+ public int value;
+}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/RangeValueIterator.java Tue Jul 14 16:29:08 2015 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
-
-package sun.text.normalizer;
-
-/**
- * <p>Interface for enabling iteration over sets of
- * {@code <int index, int value>},
- * where index is the sorted integer index in ascending order and value, its
- * associated integer value.
- * <p>The result for each iteration is the consecutive range of
- * {@code <int index, int value>} with the same value. Result is represented by
- * {@code <start, limit, value>} where
- * <ul>
- * <li> start is the starting integer of the result range
- * <li> limit is 1 after the maximum integer that follows start, such that
- * all integers between start and (limit - 1), inclusive, have the same
- * associated integer value.
- * <li> value is the integer value that all integers from start to (limit - 1)
- * share in common.
- * </ul>
- * <p>
- * Hence value(start) = value(start + 1) = .... = value(start + n) = .... =
- * value(limit - 1). However value(start -1) != value(start) and
- * value(limit) != value(start).
- *
- * <p>Most implementations will be created by factory methods, such as the
- * character type iterator in UCharacter.getTypeIterator. See example below.
- *
- * Example of use:<br>
- * <pre>
- * RangeValueIterator iterator = UCharacter.getTypeIterator();
- * RangeValueIterator.Element result = new RangeValueIterator.Element();
- * while (iterator.next(result)) {
- * System.out.println("Codepoint \\u" +
- * Integer.toHexString(result.start) +
- * " to codepoint \\u" +
- * Integer.toHexString(result.limit - 1) +
- * " has the character type " + result.value);
- * }
- * </pre>
- * @author synwee
- * @stable ICU 2.6
- */
-public interface RangeValueIterator
-{
- // public inner class ---------------------------------------------
-
- /**
- * Return result wrapper for com.ibm.icu.util.RangeValueIterator.
- * Stores the start and limit of the continous result range and the
- * common value all integers between [start, limit - 1] has.
- * @stable ICU 2.6
- */
- public class Element
- {
- // public data member ---------------------------------------------
-
- /**
- * Starting integer of the continuous result range that has the same
- * value
- * @stable ICU 2.6
- */
- public int start;
- /**
- * (End + 1) integer of continuous result range that has the same
- * value
- * @stable ICU 2.6
- */
- public int limit;
- /**
- * Gets the common value of the continous result range
- * @stable ICU 2.6
- */
- public int value;
-
- // public constructor --------------------------------------------
-
- /**
- * Empty default constructor to make javadoc happy
- * @stable ICU 2.4
- */
- public Element()
- {
- }
- }
-
- // public methods -------------------------------------------------
-
- /**
- * <p>Gets the next maximal result range with a common value and returns
- * true if we are not at the end of the iteration, false otherwise.</p>
- * <p>If the return boolean is a false, the contents of elements will not
- * be updated.</p>
- * @param element for storing the result range and value
- * @return true if we are not at the end of the iteration, false otherwise.
- * @see Element
- * @stable ICU 2.6
- */
- public boolean next(Element element);
-
- /**
- * Resets the iterator to the beginning of the iteration.
- * @stable ICU 2.6
- */
- public void reset();
-}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/Replaceable.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/Replaceable.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -82,7 +82,7 @@
* @author Alan Liu
* @stable ICU 2.0
*/
-public interface Replaceable {
+interface Replaceable {
/**
* Returns the number of 16-bit code units in the text.
* @return number of 16-bit code units in text
@@ -99,7 +99,6 @@
*/
char charAt(int offset);
- //// for StringPrep
/**
* Copies characters from this object into the destination
* character array. The first character to be copied is at index
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/ReplaceableString.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/ReplaceableString.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -25,13 +25,8 @@
/*
*******************************************************************************
- * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 1996-2009, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
*******************************************************************************
*/
@@ -51,7 +46,7 @@
* @author Alan Liu
* @stable ICU 2.0
*/
-public class ReplaceableString implements Replaceable {
+class ReplaceableString implements Replaceable {
private StringBuffer buf;
@@ -64,7 +59,6 @@
buf = new StringBuffer(str);
}
- //// for StringPrep
/**
* Construct a new object using <code>buf</code> for internal
* storage. The contents of <code>buf</code> at the time of
@@ -98,7 +92,6 @@
return buf.charAt(offset);
}
- //// for StringPrep
/**
* Copies characters from this object into the destination
* character array. The first character to be copied is at index
@@ -118,6 +111,8 @@
* @stable ICU 2.0
*/
public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) {
- Utility.getChars(buf, srcStart, srcLimit, dst, dstStart);
+ if (srcStart != srcLimit) {
+ buf.getChars(srcStart, srcLimit, dst, dstStart);
+ }
}
}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/ReplaceableUCharacterIterator.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/ReplaceableUCharacterIterator.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -47,7 +47,7 @@
*
* What are first, last, and getBeginIndex doing here?!?!?!
*/
-public class ReplaceableUCharacterIterator extends UCharacterIterator {
+class ReplaceableUCharacterIterator extends UCharacterIterator {
// public constructor ------------------------------------------------------
@@ -63,7 +63,6 @@
this.currentIndex = 0;
}
- //// for StringPrep
/**
* Public constructor
* @param buf buffer of text on which the iterator will be based
@@ -164,7 +163,6 @@
this.currentIndex = currentIndex;
}
- //// for StringPrep
public int getText(char[] fillIn, int offset){
int length = replaceable.length();
if(offset < 0 || offset + length > fillIn.length){
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/RuleCharacterIterator.java Tue Jul 14 16:29:08 2015 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,371 +0,0 @@
-/*
- * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
-
-/*
- **********************************************************************
- * Author: Alan Liu
- * Created: September 23 2003
- * Since: ICU 2.8
- **********************************************************************
- */
-
-package sun.text.normalizer;
-
-import java.text.ParsePosition;
-
-/**
- * An iterator that returns 32-bit code points. This class is deliberately
- * <em>not</em> related to any of the JDK or ICU4J character iterator classes
- * in order to minimize complexity.
- * @author Alan Liu
- * @since ICU 2.8
- */
-@SuppressWarnings("deprecation")
-public class RuleCharacterIterator {
-
- // TODO: Ideas for later. (Do not implement if not needed, lest the
- // code coverage numbers go down due to unused methods.)
- // 1. Add a copy constructor, equals() method, clone() method.
- // 2. Rather than return DONE, throw an exception if the end
- // is reached -- this is an alternate usage model, probably not useful.
- // 3. Return isEscaped from next(). If this happens,
- // don't keep an isEscaped member variable.
-
- /**
- * Text being iterated.
- */
- private String text;
-
- /**
- * Position of iterator.
- */
- private ParsePosition pos;
-
- /**
- * Symbol table used to parse and dereference variables. May be null.
- */
- private SymbolTable sym;
-
- /**
- * Current variable expansion, or null if none.
- */
- private char[] buf;
-
- /**
- * Position within buf[]. Meaningless if buf == null.
- */
- private int bufPos;
-
- /**
- * Flag indicating whether the last character was parsed from an escape.
- */
- private boolean isEscaped;
-
- /**
- * Value returned when there are no more characters to iterate.
- */
- public static final int DONE = -1;
-
- /**
- * Bitmask option to enable parsing of variable names.
- * If {@code (options & PARSE_VARIABLES) != 0},
- * then an embedded variable will be expanded to
- * its value. Variables are parsed using the SymbolTable API.
- */
- public static final int PARSE_VARIABLES = 1;
-
- /**
- * Bitmask option to enable parsing of escape sequences.
- * If {@code (options & PARSE_ESCAPES) != 0},
- * then an embedded escape sequence will be expanded
- * to its value. Escapes are parsed using Utility.unescapeAt().
- */
- public static final int PARSE_ESCAPES = 2;
-
- /**
- * Bitmask option to enable skipping of whitespace.
- * If {@code (options & SKIP_WHITESPACE) != 0},
- * then whitespace characters will be silently
- * skipped, as if they were not present in the input. Whitespace
- * characters are defined by UCharacterProperty.isRuleWhiteSpace().
- */
- public static final int SKIP_WHITESPACE = 4;
-
- /**
- * Constructs an iterator over the given text, starting at the given
- * position.
- * @param text the text to be iterated
- * @param sym the symbol table, or null if there is none. If sym is null,
- * then variables will not be deferenced, even if the PARSE_VARIABLES
- * option is set.
- * @param pos upon input, the index of the next character to return. If a
- * variable has been dereferenced, then pos will <em>not</em> increment as
- * characters of the variable value are iterated.
- */
- public RuleCharacterIterator(String text, SymbolTable sym,
- ParsePosition pos) {
- if (text == null || pos.getIndex() > text.length()) {
- throw new IllegalArgumentException();
- }
- this.text = text;
- this.sym = sym;
- this.pos = pos;
- buf = null;
- }
-
- /**
- * Returns true if this iterator has no more characters to return.
- */
- public boolean atEnd() {
- return buf == null && pos.getIndex() == text.length();
- }
-
- /**
- * Returns the next character using the given options, or DONE if there
- * are no more characters, and advance the position to the next
- * character.
- * @param options one or more of the following options, bitwise-OR-ed
- * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
- * @return the current 32-bit code point, or DONE
- */
- public int next(int options) {
- int c = DONE;
- isEscaped = false;
-
- for (;;) {
- c = _current();
- _advance(UTF16.getCharCount(c));
-
- if (c == SymbolTable.SYMBOL_REF && buf == null &&
- (options & PARSE_VARIABLES) != 0 && sym != null) {
- String name = sym.parseReference(text, pos, text.length());
- // If name == null there was an isolated SYMBOL_REF;
- // return it. Caller must be prepared for this.
- if (name == null) {
- break;
- }
- bufPos = 0;
- buf = sym.lookup(name);
- if (buf == null) {
- throw new IllegalArgumentException(
- "Undefined variable: " + name);
- }
- // Handle empty variable value
- if (buf.length == 0) {
- buf = null;
- }
- continue;
- }
-
- if ((options & SKIP_WHITESPACE) != 0 &&
- UCharacterProperty.isRuleWhiteSpace(c)) {
- continue;
- }
-
- if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
- int offset[] = new int[] { 0 };
- c = Utility.unescapeAt(lookahead(), offset);
- jumpahead(offset[0]);
- isEscaped = true;
- if (c < 0) {
- throw new IllegalArgumentException("Invalid escape");
- }
- }
-
- break;
- }
-
- return c;
- }
-
- /**
- * Returns true if the last character returned by next() was
- * escaped. This will only be the case if the option passed in to
- * next() included PARSE_ESCAPED and the next character was an
- * escape sequence.
- */
- public boolean isEscaped() {
- return isEscaped;
- }
-
- /**
- * Returns true if this iterator is currently within a variable expansion.
- */
- public boolean inVariable() {
- return buf != null;
- }
-
- /**
- * Returns an object which, when later passed to setPos(), will
- * restore this iterator's position. Usage idiom:
- *
- * RuleCharacterIterator iterator = ...;
- * Object pos = iterator.getPos(null); // allocate position object
- * for (;;) {
- * pos = iterator.getPos(pos); // reuse position object
- * int c = iterator.next(...);
- * ...
- * }
- * iterator.setPos(pos);
- *
- * @param p a position object previously returned by getPos(),
- * or null. If not null, it will be updated and returned. If
- * null, a new position object will be allocated and returned.
- * @return a position object which may be passed to setPos(),
- * either `p,' or if `p' == null, a newly-allocated object
- */
- public Object getPos(Object p) {
- if (p == null) {
- return new Object[] {buf, new int[] {pos.getIndex(), bufPos}};
- }
- Object[] a = (Object[]) p;
- a[0] = buf;
- int[] v = (int[]) a[1];
- v[0] = pos.getIndex();
- v[1] = bufPos;
- return p;
- }
-
- /**
- * Restores this iterator to the position it had when getPos()
- * returned the given object.
- * @param p a position object previously returned by getPos()
- */
- public void setPos(Object p) {
- Object[] a = (Object[]) p;
- buf = (char[]) a[0];
- int[] v = (int[]) a[1];
- pos.setIndex(v[0]);
- bufPos = v[1];
- }
-
- /**
- * Skips ahead past any ignored characters, as indicated by the given
- * options. This is useful in conjunction with the lookahead() method.
- *
- * Currently, this only has an effect for SKIP_WHITESPACE.
- * @param options one or more of the following options, bitwise-OR-ed
- * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
- */
- public void skipIgnored(int options) {
- if ((options & SKIP_WHITESPACE) != 0) {
- for (;;) {
- int a = _current();
- if (!UCharacterProperty.isRuleWhiteSpace(a)) break;
- _advance(UTF16.getCharCount(a));
- }
- }
- }
-
- /**
- * Returns a string containing the remainder of the characters to be
- * returned by this iterator, without any option processing. If the
- * iterator is currently within a variable expansion, this will only
- * extend to the end of the variable expansion. This method is provided
- * so that iterators may interoperate with string-based APIs. The typical
- * sequence of calls is to call skipIgnored(), then call lookahead(), then
- * parse the string returned by lookahead(), then call jumpahead() to
- * resynchronize the iterator.
- * @return a string containing the characters to be returned by future
- * calls to next()
- */
- public String lookahead() {
- if (buf != null) {
- return new String(buf, bufPos, buf.length - bufPos);
- } else {
- return text.substring(pos.getIndex());
- }
- }
-
- /**
- * Advances the position by the given number of 16-bit code units.
- * This is useful in conjunction with the lookahead() method.
- * @param count the number of 16-bit code units to jump over
- */
- public void jumpahead(int count) {
- if (count < 0) {
- throw new IllegalArgumentException();
- }
- if (buf != null) {
- bufPos += count;
- if (bufPos > buf.length) {
- throw new IllegalArgumentException();
- }
- if (bufPos == buf.length) {
- buf = null;
- }
- } else {
- int i = pos.getIndex() + count;
- pos.setIndex(i);
- if (i > text.length()) {
- throw new IllegalArgumentException();
- }
- }
- }
-
- /**
- * Returns the current 32-bit code point without parsing escapes, parsing
- * variables, or skipping whitespace.
- * @return the current 32-bit code point
- */
- private int _current() {
- if (buf != null) {
- return UTF16.charAt(buf, 0, buf.length, bufPos);
- } else {
- int i = pos.getIndex();
- return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
- }
- }
-
- /**
- * Advances the position by the given amount.
- * @param count the number of 16-bit code units to advance past
- */
- private void _advance(int count) {
- if (buf != null) {
- bufPos += count;
- if (bufPos == buf.length) {
- buf = null;
- }
- } else {
- pos.setIndex(pos.getIndex() + count);
- if (pos.getIndex() > text.length()) {
- pos.setIndex(text.length());
- }
- }
- }
-}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/SymbolTable.java Tue Jul 14 16:29:08 2015 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
-
-package sun.text.normalizer;
-
-import java.text.ParsePosition;
-
-/**
- * An interface that defines both lookup protocol and parsing of
- * symbolic names.
- *
- * <p>A symbol table maintains two kinds of mappings. The first is
- * between symbolic names and their values. For example, if the
- * variable with the name "start" is set to the value "alpha"
- * (perhaps, though not necessarily, through an expression such as
- * "$start=alpha"), then the call lookup("start") will return the
- * char[] array ['a', 'l', 'p', 'h', 'a'].
- *
- * <p>The second kind of mapping is between character values and
- * UnicodeMatcher objects. This is used by RuleBasedTransliterator,
- * which uses characters in the private use area to represent objects
- * such as UnicodeSets. If U+E015 is mapped to the UnicodeSet [a-z],
- * then lookupMatcher(0xE015) will return the UnicodeSet [a-z].
- *
- * <p>Finally, a symbol table defines parsing behavior for symbolic
- * names. All symbolic names start with the SYMBOL_REF character.
- * When a parser encounters this character, it calls parseReference()
- * with the position immediately following the SYMBOL_REF. The symbol
- * table parses the name, if there is one, and returns it.
- *
- * @draft ICU 2.8
- * @deprecated This is a draft API and might change in a future release of ICU.
- */
-@Deprecated
-public interface SymbolTable {
-
- /**
- * The character preceding a symbol reference name.
- * @draft ICU 2.8
- * @deprecated This is a draft API and might change in a future release of ICU.
- */
- @Deprecated
- static final char SYMBOL_REF = '$';
-
- /**
- * Lookup the characters associated with this string and return it.
- * Return {@code null} if no such name exists. The resultant
- * array may have length zero.
- * @param s the symbolic name to lookup
- * @return a char array containing the name's value, or null if
- * there is no mapping for s.
- * @draft ICU 2.8
- * @deprecated This is a draft API and might change in a future release of ICU.
- */
- @Deprecated
- char[] lookup(String s);
-
- /**
- * Lookup the UnicodeMatcher associated with the given character, and
- * return it. Return {@code null} if not found.
- * @param ch a 32-bit code point from 0 to 0x10FFFF inclusive.
- * @return the UnicodeMatcher object represented by the given
- * character, or null if there is no mapping for ch.
- * @draft ICU 2.8
- * @deprecated This is a draft API and might change in a future release of ICU.
- */
- @Deprecated
- UnicodeMatcher lookupMatcher(int ch);
-
- /**
- * Parse a symbol reference name from the given string, starting
- * at the given position. If no valid symbol reference name is
- * found, return null and leave pos unchanged. That is, if the
- * character at pos cannot start a name, or if pos is at or after
- * text.length(), then return null. This indicates an isolated
- * SYMBOL_REF character.
- * @param text the text to parse for the name
- * @param pos on entry, the index of the first character to parse.
- * This is the character following the SYMBOL_REF character. On
- * exit, the index after the last parsed character. If the parse
- * failed, pos is unchanged on exit.
- * @param limit the index after the last character to be parsed.
- * @return the parsed name, or null if there is no valid symbolic
- * name at the given position.
- * @draft ICU 2.8
- * @deprecated This is a draft API and might change in a future release of ICU.
- */
- @Deprecated
- String parseReference(String text, ParsePosition pos, int limit);
-}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/Trie.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/Trie.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,16 +22,12 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
+ ******************************************************************************
+ * Copyright (C) 1996-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ ******************************************************************************
*/
package sun.text.normalizer;
@@ -135,93 +131,62 @@
unserialize(inputStream);
}
- /**
- * Trie constructor
- * @param index array to be used for index
- * @param options used by the trie
- * @param dataManipulate object containing the information to parse the
- * trie data
- */
- protected Trie(char index[], int options, DataManipulate dataManipulate)
- {
- m_options_ = options;
- if(dataManipulate != null) {
- m_dataManipulate_ = dataManipulate;
- } else {
- m_dataManipulate_ = new DefaultGetFoldingOffset();
- }
- m_isLatin1Linear_ = (m_options_ &
- HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
- m_index_ = index;
- m_dataOffset_ = m_index_.length;
- }
-
// protected data members ------------------------------------------
/**
- * Lead surrogate code points' index displacement in the index array.
- * <pre>{@code
- * 0x10000-0xd800=0x2800
- * 0x2800 >> INDEX_STAGE_1_SHIFT_
- * }</pre>
- */
+ * Lead surrogate code points' index displacement in the index array.
+ * <pre>{@code
+ * 0x10000-0xd800=0x2800
+ * 0x2800 >> INDEX_STAGE_1_SHIFT_
+ * }</pre>
+ */
protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
/**
- * Shift size for shifting right the input index. 1..9
- */
+ * Shift size for shifting right the input index. 1..9
+ */
protected static final int INDEX_STAGE_1_SHIFT_ = 5;
/**
- * Shift size for shifting left the index array values.
- * Increases possible data size with 16-bit index values at the cost
- * of compactability.
- * This requires blocks of stage 2 data to be aligned by
- * DATA_GRANULARITY.
- * 0..INDEX_STAGE_1_SHIFT
- */
+ * Shift size for shifting left the index array values.
+ * Increases possible data size with 16-bit index values at the cost
+ * of compactability.
+ * This requires blocks of stage 2 data to be aligned by
+ * DATA_GRANULARITY.
+ * 0..INDEX_STAGE_1_SHIFT
+ */
protected static final int INDEX_STAGE_2_SHIFT_ = 2;
/**
* Number of data values in a stage 2 (data array) block.
*/
protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
/**
- * Mask for getting the lower bits from the input index.
- * DATA_BLOCK_LENGTH - 1.
- */
+ * Mask for getting the lower bits from the input index.
+ * DATA_BLOCK_LENGTH - 1.
+ */
protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
- /** Number of bits of a trail surrogate that are used in index table lookups. */
- protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
/**
- * Number of index (stage 1) entries per lead surrogate.
- * Same as number of index entries for 1024 trail surrogates,
- * {@code ==0x400>>INDEX_STAGE_1_SHIFT_}
+ * Surrogate mask to use when shifting offset to retrieve supplementary
+ * values
*/
- protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
- /** Length of the BMP portion of the index (stage 1) array. */
- protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
- /**
- * Surrogate mask to use when shifting offset to retrieve supplementary
- * values
- */
protected static final int SURROGATE_MASK_ = 0x3FF;
/**
- * Index or UTF16 characters
- */
+ * Index or UTF16 characters
+ */
protected char m_index_[];
/**
- * Internal TrieValue which handles the parsing of the data value.
- * This class is to be implemented by the user
- */
+ * Internal TrieValue which handles the parsing of the data value.
+ * This class is to be implemented by the user
+ */
protected DataManipulate m_dataManipulate_;
/**
- * Start index of the data portion of the trie. CharTrie combines
- * index and data into a char array, so this is used to indicate the
- * initial offset to the data portion.
- * Note this index always points to the initial value.
- */
+ * Start index of the data portion of the trie. CharTrie combines
+ * index and data into a char array, so this is used to indicate the
+ * initial offset to the data portion.
+ * Note this index always points to the initial value.
+ */
protected int m_dataOffset_;
/**
- * Length of the data array
- */
+ * Length of the data array
+ */
protected int m_dataLength_;
// protected methods -----------------------------------------------
@@ -235,19 +200,6 @@
protected abstract int getSurrogateOffset(char lead, char trail);
/**
- * Gets the value at the argument index
- * @param index value at index will be retrieved
- * @return 32 bit value
- */
- protected abstract int getValue(int index);
-
- /**
- * Gets the default initial value
- * @return 32 bit value
- */
- protected abstract int getInitialValue();
-
- /**
* Gets the offset to the data which the index ch after variable offset
* points to.
* Note for locating a non-supplementary character data offset, calling
@@ -297,13 +249,13 @@
}
/**
- * Internal trie getter from a code point.
- * Could be faster(?) but longer with
- * {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }}
- * Gets the offset to data which the codepoint points to
- * @param ch codepoint
- * @return offset to data
- */
+ * Internal trie getter from a code point.
+ * Could be faster(?) but longer with
+ * {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }}
+ * Gets the offset to data which the codepoint points to
+ * @param ch codepoint
+ * @return offset to data
+ */
protected final int getCodePointOffset(int ch)
{
// if ((ch >> 16) == 0) slower
@@ -321,7 +273,7 @@
return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
(char)(ch & SURROGATE_MASK_));
} else {
- // return -1 // if there is an error, in this case we return
+ // return -1 if there is an error, in this case we return
return -1;
}
}
@@ -343,15 +295,6 @@
}
/**
- * Determines if this is a 32 bit trie
- * @return true if options specifies this is a 32 bit trie
- */
- protected final boolean isIntTrie()
- {
- return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
- }
-
- /**
* Determines if this is a 16 bit trie
* @return true if this is a 16 bit trie
*/
@@ -363,8 +306,8 @@
// private data members --------------------------------------------
/**
- * Latin 1 option mask
- */
+ * Latin 1 option mask
+ */
protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
/**
* Constant number to authenticate the byte block
@@ -378,28 +321,28 @@
protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
/**
- * Flag indicator for Latin quick access data block
- */
+ * Flag indicator for Latin quick access data block
+ */
private boolean m_isLatin1Linear_;
/**
- * <p>Trie options field.</p>
- * <p>options bit field:<br>
- * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
- * 8 0 = 16-bit data, 1=32-bit data<br>
- * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br>
- * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br>
- */
+ * <p>Trie options field.</p>
+ * <p>options bit field:<br>
+ * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
+ * 8 0 = 16-bit data, 1=32-bit data<br>
+ * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br>
+ * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br>
+ */
private int m_options_;
// private methods ---------------------------------------------------
/**
- * Authenticates raw data header.
- * Checking the header information, signature and options.
- * @param signature This contains the options and type of a Trie
- * @return true if the header is authenticated valid
- */
+ * Authenticates raw data header.
+ * Checking the header information, signature and options.
+ * @param signature This contains the options and type of a Trie
+ * @return true if the header is authenticated valid
+ */
private final boolean checkHeader(int signature)
{
// check the signature
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/Trie2.java Wed Jul 15 11:05:51 2015 +0900
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ *******************************************************************************
+ * Copyright (C) 2009-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *******************************************************************************
+ */
+
+package sun.text.normalizer;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+
+/**
+ * This is the interface and common implementation of a Unicode Trie2.
+ * It is a kind of compressed table that maps from Unicode code points (0..0x10ffff)
+ * to 16- or 32-bit integer values. It works best when there are ranges of
+ * characters with the same value, which is generally the case with Unicode
+ * character properties.
+ *
+ * This is the second common version of a Unicode trie (hence the name Trie2).
+ *
+ */
+abstract class Trie2 implements Iterable<Trie2.Range> {
+
+ /**
+ * Create a Trie2 from its serialized form. Inverse of utrie2_serialize().
+ *
+ * Reads from the current position and leaves the buffer after the end of the trie.
+ *
+ * The serialized format is identical between ICU4C and ICU4J, so this function
+ * will work with serialized Trie2s from either.
+ *
+ * This implementation only creates 16-bit tries: the returned Trie2 is always
+ * a Trie2_16.
+ *
+ * A serialized trie whose options field has any of the value-bits set is rejected
+ * with an IllegalArgumentException. Trie2_16 also provides a createFromSerialized()
+ * function, which has the more specific return type Trie2_16.
+ *
+ * The serialized Trie2 on the stream may be in either little or big endian byte order.
+ * This allows using serialized Tries from ICU4C without needing to consider the
+ * byte order of the system that created them.
+ *
+ * @param bytes a byte buffer to the serialized form of a UTrie2.
+ * @return An unserialized Trie2, ready for use.
+ * @throws IllegalArgumentException if the stream does not contain a serialized Trie2.
+ * @throws IOException if a read error occurs in the buffer.
+ *
+ */
+ public static Trie2 createFromSerialized(ByteBuffer bytes) throws IOException {
+ // From ICU4C utrie2_impl.h
+ // * Trie2 data structure in serialized form:
+ // *
+ // * UTrie2Header header;
+ // * uint16_t index[header.index2Length];
+ // * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...]
+ // * @internal
+ // */
+ // typedef struct UTrie2Header {
+ // /** "Tri2" in big-endian US-ASCII (0x54726932) */
+ // uint32_t signature;
+
+ // /**
+ // * options bit field:
+ // * 15.. 4 reserved (0)
+ // * 3.. 0 UTrie2ValueBits valueBits
+ // */
+ // uint16_t options;
+ //
+ // /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */
+ // uint16_t indexLength;
+ //
+ // /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */
+ // uint16_t shiftedDataLength;
+ //
+ // /** Null index and data blocks, not shifted. */
+ // uint16_t index2NullOffset, dataNullOffset;
+ //
+ // /**
+ // * First code point of the single-value range ending with U+10ffff,
+ // * rounded up and then shifted right by UTRIE2_SHIFT_1.
+ // */
+ // uint16_t shiftedHighStart;
+ // } UTrie2Header;
+
+ ByteOrder outerByteOrder = bytes.order();
+ try {
+ UTrie2Header header = new UTrie2Header();
+
+ /* check the signature */
+ header.signature = bytes.getInt();
+ switch (header.signature) {
+ case 0x54726932:
+ // The buffer is already set to the trie data byte order.
+ break;
+ case 0x32697254:
+ // Temporarily reverse the byte order.
+ boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN;
+ bytes.order(isBigEndian ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN);
+ header.signature = 0x54726932;
+ break;
+ default:
+ throw new IllegalArgumentException("Buffer does not contain a serialized UTrie2");
+ }
+
+ header.options = bytes.getChar();
+ header.indexLength = bytes.getChar();
+ header.shiftedDataLength = bytes.getChar();
+ header.index2NullOffset = bytes.getChar();
+ header.dataNullOffset = bytes.getChar();
+ header.shiftedHighStart = bytes.getChar();
+
+ if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) != 0) {
+ throw new IllegalArgumentException("UTrie2 serialized format error.");
+ }
+
+ Trie2 This;
+ This = new Trie2_16();
+ This.header = header;
+
+ /* get the length values and offsets */
+ This.indexLength = header.indexLength;
+ This.dataLength = header.shiftedDataLength << UTRIE2_INDEX_SHIFT;
+ This.index2NullOffset = header.index2NullOffset;
+ This.dataNullOffset = header.dataNullOffset;
+ This.highStart = header.shiftedHighStart << UTRIE2_SHIFT_1;
+ This.highValueIndex = This.dataLength - UTRIE2_DATA_GRANULARITY;
+ This.highValueIndex += This.indexLength;
+
+ // Allocate the Trie2 index array. If the data width is 16 bits, the array also
+ // includes the space for the data.
+
+ int indexArraySize = This.indexLength;
+ indexArraySize += This.dataLength;
+ This.index = new char[indexArraySize];
+
+ /* Read in the index */
+ int i;
+ for (i=0; i<This.indexLength; i++) {
+ This.index[i] = bytes.getChar();
+ }
+
+ /* Read in the data. 16 bit data goes in the same array as the index.
+ * 32 bit data goes in its own separate data array.
+ */
+ This.data16 = This.indexLength;
+ for (i=0; i<This.dataLength; i++) {
+ This.index[This.data16 + i] = bytes.getChar();
+ }
+
+ This.data32 = null;
+ This.initialValue = This.index[This.dataNullOffset];
+ This.errorValue = This.index[This.data16+UTRIE2_BAD_UTF8_DATA_OFFSET];
+
+ return This;
+ } finally {
+ bytes.order(outerByteOrder);
+ }
+ }
+
+ /**
+ * Get the value for a code point as stored in the Trie2.
+ *
+ * @param codePoint the code point
+ * @return the value
+ */
+ abstract public int get(int codePoint);
+
+ /**
+ * Get the trie value for a UTF-16 code unit.
+ *
+ * A Trie2 stores two distinct values for input in the lead surrogate
+ * range, one for lead surrogates, which is the value that will be
+ * returned by this function, and a second value that is returned
+ * by Trie2.get().
+ *
+ * For code units outside of the lead surrogate range, this function
+ * returns the same result as Trie2.get().
+ *
+ * This function, together with the alternate value for lead surrogates,
+ * makes possible very efficient processing of UTF-16 strings without
+ * first converting surrogate pairs to their corresponding 32 bit code point
+ * values.
+ *
+ * At build-time, enumerate the contents of the Trie2 to see if there
+ * is non-trivial (non-initialValue) data for any of the supplementary
+ * code points associated with a lead surrogate.
+ * If so, then set a special (application-specific) value for the
+ * lead surrogate code _unit_, with Trie2Writable.setForLeadSurrogateCodeUnit().
+ *
+ * At runtime, use Trie2.getFromU16SingleLead(). If there is non-trivial
+ * data and the code unit is a lead surrogate, then check if a trail surrogate
+ * follows. If so, assemble the supplementary code point and look up its value
+ * with Trie2.get(); otherwise reset the lead
+ * surrogate's value or do a code point lookup for it.
+ *
+ * If there is only trivial data for lead and trail surrogates, then processing
+ * can often skip them. For example, in normalization or case mapping
+ * all characters that do not have any mappings are simply copied as is.
+ *
+ * @param c the code point or lead surrogate value.
+ * @return the value
+ */
+ abstract public int getFromU16SingleLead(char c);
+
+ /**
+ * When iterating over the contents of a Trie2, Elements of this type are produced.
+ * The iterator will return one item for each contiguous range of codepoints having the same value.
+ *
+ * When iterating, the same Range object is reused and returned for each range.
+ * If you need to retain complete iteration results, copy the fields of each returned Range,
+ * or save the range in some other way, before advancing to the next iteration step.
+ */
+ public static class Range {
+ public int startCodePoint;
+ public int endCodePoint; // Inclusive.
+ public int value;
+ public boolean leadSurrogate;
+
+ public boolean equals(Object other) {
+ if (other == null || !(other.getClass().equals(getClass()))) {
+ return false;
+ }
+ Range tother = (Range)other;
+ return this.startCodePoint == tother.startCodePoint &&
+ this.endCodePoint == tother.endCodePoint &&
+ this.value == tother.value &&
+ this.leadSurrogate == tother.leadSurrogate;
+ }
+
+ public int hashCode() {
+ int h = initHash();
+ h = hashUChar32(h, startCodePoint);
+ h = hashUChar32(h, endCodePoint);
+ h = hashInt(h, value);
+ h = hashByte(h, leadSurrogate? 1: 0);
+ return h;
+ }
+ }
+
+ /**
+ * Create an iterator over the value ranges in this Trie2.
+ * Values from the Trie2 are not remapped or filtered, but are returned as they
+ * are stored in the Trie2.
+ *
+ * @return an Iterator
+ */
+ public Iterator<Range> iterator() {
+ return iterator(defaultValueMapper);
+ }
+
+ private static ValueMapper defaultValueMapper = new ValueMapper() {
+ public int map(int in) {
+ return in;
+ }
+ };
+
+ /**
+ * Create an iterator over the value ranges from this Trie2.
+ * Values from the Trie2 are passed through a caller-supplied remapping function,
+ * and it is the remapped values that determine the ranges that
+ * will be produced by the iterator.
+ *
+ *
+ * @param mapper provides a function to remap values obtained from the Trie2.
+ * @return an Iterator
+ */
+ public Iterator<Range> iterator(ValueMapper mapper) {
+ return new Trie2Iterator(mapper);
+ }
+
+ /**
+ * When iterating over the contents of a Trie2, an instance of ValueMapper may
+ * be used to remap the values from the Trie2. The remapped values will be used
+ * both in determining the ranges of codepoints and as the value to be returned
+ * for each range.
+ *
+ * Example of use, with an anonymous implementation of ValueMapper:
+ *
+ *
+ * ValueMapper m = new ValueMapper() {
+ * public int map(int in) {return in & 0x1f;}
+ * };
+ * for (Iterator<Trie2.Range> iter = trie.iterator(m); iter.hasNext(); ) {
+ * Trie2.Range r = iter.next();
+ * ... // Do something with the range r.
+ * }
+ *
+ */
+ public interface ValueMapper {
+ public int map(int originalVal);
+ }
+
+ //--------------------------------------------------------------------------------
+ //
+ // Below this point are internal implementation items. No further public API.
+ //
+ //--------------------------------------------------------------------------------
+
+ /**
+ * Trie2 data structure in serialized form:
+ *
+ * UTrie2Header header;
+ * uint16_t index[header.index2Length];
+ * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...]
+ *
+ * For Java, this is read from the stream into an instance of UTrie2Header.
+ * (The C version just places a struct over the raw serialized data.)
+ *
+ * @internal
+ */
+ static class UTrie2Header {
+ /** "Tri2" in big-endian US-ASCII (0x54726932) */
+ int signature;
+
+ /**
+ * options bit field (uint16_t):
+ * 15.. 4 reserved (0)
+ * 3.. 0 UTrie2ValueBits valueBits
+ */
+ int options;
+
+ /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH (uint16_t) */
+ int indexLength;
+
+ /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT (uint16_t) */
+ int shiftedDataLength;
+
+ /** Null index and data blocks, not shifted. (uint16_t) */
+ int index2NullOffset, dataNullOffset;
+
+ /**
+ * First code point of the single-value range ending with U+10ffff,
+ * rounded up and then shifted right by UTRIE2_SHIFT_1. (uint16_t)
+ */
+ int shiftedHighStart;
+ }
+
+ //
+ // Data members of UTrie2.
+ //
+ UTrie2Header header;
+ char index[]; // Index array. Includes data for 16 bit Tries.
+ int data16; // Offset to data portion of the index array, if 16 bit data.
+ // zero if 32 bit data.
+ int data32[]; // NULL if 16b data is used via index
+
+ int indexLength;
+ int dataLength;
+ int index2NullOffset; // 0xffff if there is no dedicated index-2 null block
+ int initialValue;
+
+ /** Value returned for out-of-range code points and illegal UTF-8. */
+ int errorValue;
+
+ /* Start of the last range which ends at U+10ffff, and its value. */
+ int highStart;
+ int highValueIndex;
+
+ int dataNullOffset;
+
+ /**
+ * Trie2 constants, defining shift widths, index array lengths, etc.
+ *
+ * These are needed for the runtime macros but users can treat these as
+ * implementation details; the actual public API is declared above.
+ */
+
+ static final int UTRIE2_OPTIONS_VALUE_BITS_MASK=0x000f;
+
+
+ /** Shift size for getting the index-1 table offset. */
+ static final int UTRIE2_SHIFT_1=6+5;
+
+ /** Shift size for getting the index-2 table offset. */
+ static final int UTRIE2_SHIFT_2=5;
+
+ /**
+ * Difference between the two shift sizes,
+ * for getting an index-1 offset from an index-2 offset. 6=11-5
+ */
+ static final int UTRIE2_SHIFT_1_2=UTRIE2_SHIFT_1-UTRIE2_SHIFT_2;
+
+ /**
+ * Number of index-1 entries for the BMP. 32=0x20
+ * This part of the index-1 table is omitted from the serialized form.
+ */
+ static final int UTRIE2_OMITTED_BMP_INDEX_1_LENGTH=0x10000>>UTRIE2_SHIFT_1;
+
+ /** Number of entries in an index-2 block. 64=0x40 */
+ static final int UTRIE2_INDEX_2_BLOCK_LENGTH=1<<UTRIE2_SHIFT_1_2;
+
+ /** Mask for getting the lower bits for the in-index-2-block offset. */
+ static final int UTRIE2_INDEX_2_MASK=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
+
+ /** Number of entries in a data block. 32=0x20 */
+ static final int UTRIE2_DATA_BLOCK_LENGTH=1<<UTRIE2_SHIFT_2;
+
+ /** Mask for getting the lower bits for the in-data-block offset. */
+ static final int UTRIE2_DATA_MASK=UTRIE2_DATA_BLOCK_LENGTH-1;
+
+ /**
+ * Shift size for shifting left the index array values.
+ * Increases possible data size with 16-bit index values at the cost
+ * of compactability.
+ * This requires data blocks to be aligned by UTRIE2_DATA_GRANULARITY.
+ */
+ static final int UTRIE2_INDEX_SHIFT=2;
+
+ /** The alignment size of a data block. Also the granularity for compaction. */
+ static final int UTRIE2_DATA_GRANULARITY=1<<UTRIE2_INDEX_SHIFT;
+
+ /**
+ * The part of the index-2 table for U+D800..U+DBFF stores values for
+ * lead surrogate code _units_ not code _points_.
+ * Values for lead surrogate code _points_ are indexed with this portion of the table.
+ * Length=32=0x20=0x400>>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.)
+ */
+ static final int UTRIE2_LSCP_INDEX_2_OFFSET=0x10000>>UTRIE2_SHIFT_2;
+ static final int UTRIE2_LSCP_INDEX_2_LENGTH=0x400>>UTRIE2_SHIFT_2;
+
+ /** Count the lengths of both BMP pieces. 2080=0x820 */
+ static final int UTRIE2_INDEX_2_BMP_LENGTH=UTRIE2_LSCP_INDEX_2_OFFSET+UTRIE2_LSCP_INDEX_2_LENGTH;
+
+ /**
+ * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820.
+ * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2.
+ */
+ static final int UTRIE2_UTF8_2B_INDEX_2_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH;
+ static final int UTRIE2_UTF8_2B_INDEX_2_LENGTH=0x800>>6; /* U+0800 is the first code point after 2-byte UTF-8 */
+
+ /**
+ * The index-1 table, only used for supplementary code points, at offset 2112=0x840.
+ * Variable length, for code points up to highStart, where the last single-value range starts.
+ * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1.
+ * (For 0x100000 supplementary code points U+10000..U+10ffff.)
+ *
+ * The part of the index-2 table for supplementary code points starts
+ * after this index-1 table.
+ *
+ * Both the index-1 table and the following part of the index-2 table
+ * are omitted completely if there is only BMP data.
+ */
+ static final int UTRIE2_INDEX_1_OFFSET=UTRIE2_UTF8_2B_INDEX_2_OFFSET+UTRIE2_UTF8_2B_INDEX_2_LENGTH;
+
+ /**
+ * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80.
+ * Used with linear access for single bytes 0..0xbf for simple error handling.
+ * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH.
+ */
+ static final int UTRIE2_BAD_UTF8_DATA_OFFSET=0x80;
+
+ /**
+ * Implementation class for an iterator over a Trie2.
+ *
+ * Iteration over a Trie2 first returns all of the ranges that are indexed by code points,
+ * then returns the special alternate values for the lead surrogates
+ *
+ * @internal
+ */
+ class Trie2Iterator implements Iterator<Range> {
+
+ // The normal constructor that configures the iterator to cover the complete
+ // contents of the Trie2
+ Trie2Iterator(ValueMapper vm) {
+ mapper = vm;
+ nextStart = 0;
+ limitCP = 0x110000;
+ doLeadSurrogates = true;
+ }
+
+ /**
+ * The main next() function for Trie2 iterators
+ *
+ */
+ public Range next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ if (nextStart >= limitCP) {
+ // Switch over from iterating normal code point values to
+ // doing the alternate lead-surrogate values.
+ doingCodePoints = false;
+ nextStart = 0xd800;
+ }
+ int endOfRange = 0;
+ int val = 0;
+ int mappedVal = 0;
+
+ if (doingCodePoints) {
+ // Iteration over code point values.
+ val = get(nextStart);
+ mappedVal = mapper.map(val);
+ endOfRange = rangeEnd(nextStart, limitCP, val);
+ // Loop once for each range in the Trie2 with the same raw (unmapped) value.
+ // Loop continues so long as the mapped values are the same.
+ for (;;) {
+ if (endOfRange >= limitCP-1) {
+ break;
+ }
+ val = get(endOfRange+1);
+ if (mapper.map(val) != mappedVal) {
+ break;
+ }
+ endOfRange = rangeEnd(endOfRange+1, limitCP, val);
+ }
+ } else {
+ // Iteration over the alternate lead surrogate values.
+ val = getFromU16SingleLead((char)nextStart);
+ mappedVal = mapper.map(val);
+ endOfRange = rangeEndLS((char)nextStart);
+ // Loop once for each range in the Trie2 with the same raw (unmapped) value.
+ // Loop continues so long as the mapped values are the same.
+ for (;;) {
+ if (endOfRange >= 0xdbff) {
+ break;
+ }
+ val = getFromU16SingleLead((char)(endOfRange+1));
+ if (mapper.map(val) != mappedVal) {
+ break;
+ }
+ endOfRange = rangeEndLS((char)(endOfRange+1));
+ }
+ }
+ returnValue.startCodePoint = nextStart;
+ returnValue.endCodePoint = endOfRange;
+ returnValue.value = mappedVal;
+ returnValue.leadSurrogate = !doingCodePoints;
+ nextStart = endOfRange+1;
+ return returnValue;
+ }
+
+ /**
+ *
+ */
+ public boolean hasNext() {
+ return doingCodePoints && (doLeadSurrogates || nextStart < limitCP) || nextStart < 0xdc00;
+ }
+
+ private int rangeEndLS(char startingLS) {
+ if (startingLS >= 0xdbff) {
+ return 0xdbff;
+ }
+
+ int c;
+ int val = getFromU16SingleLead(startingLS);
+ for (c = startingLS+1; c <= 0x0dbff; c++) {
+ if (getFromU16SingleLead((char)c) != val) {
+ break;
+ }
+ }
+ return c-1;
+ }
+
+ //
+ // Iteration State Variables
+ //
+ private ValueMapper mapper;
+ private Range returnValue = new Range();
+ // The starting code point for the next range to be returned.
+ private int nextStart;
+ // The upper limit for the last normal range to be returned. Normally 0x110000, but
+ // may be lower when iterating over the code points for a single lead surrogate.
+ private int limitCP;
+
+ // True while iterating over the Trie2 values for code points.
+ // False while iterating over the alternate values for lead surrogates.
+ private boolean doingCodePoints = true;
+
+ // True if the iterator should iterate the special values for lead surrogates in
+ // addition to the normal values for code points.
+ private boolean doLeadSurrogates = true;
+ }
+
+ /**
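+ * Find the last code point in a contiguous range of code points with the same Trie2 value.
+ * @param start The code point to begin with.
+ * @param limitp Exclusive upper limit of the scan.
+ * @param val The value to match; callers in this file pass the value stored for start.
+ * @return The last contiguous code point with the same value.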
+ */
+ int rangeEnd(int start, int limitp, int val) {
+ int c;
+ int limit = Math.min(highStart, limitp);
+
+ for (c = start+1; c < limit; c++) {
+ if (get(c) != val) {
+ break;
+ }
+ }
+ if (c >= highStart) {
+ c = limitp;
+ }
+ return c - 1;
+ }
+
+
+ //
+ // Hashing implementation functions. FNV hash. Respected public domain algorithm.
+ //
+ private static int initHash() {
+ return 0x811c9DC5; // unsigned 2166136261
+ }
+
+ private static int hashByte(int h, int b) {
+ h = h * 16777619;
+ h = h ^ b;
+ return h;
+ }
+
+ private static int hashUChar32(int h, int c) {
+ h = Trie2.hashByte(h, c & 255);
+ h = Trie2.hashByte(h, (c>>8) & 255);
+ h = Trie2.hashByte(h, c>>16);
+ return h;
+ }
+
+ private static int hashInt(int h, int i) {
+ h = Trie2.hashByte(h, i & 255);
+ h = Trie2.hashByte(h, (i>>8) & 255);
+ h = Trie2.hashByte(h, (i>>16) & 255);
+ h = Trie2.hashByte(h, (i>>24) & 255);
+ return h;
+ }
+
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/Trie2_16.java Wed Jul 15 11:05:51 2015 +0900
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ *******************************************************************************
+ * Copyright (C) 2009-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *******************************************************************************
+ */
+
+package sun.text.normalizer;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+
+/**
+ * @author aheninger
+ *
+ * A read-only Trie2, holding 16 bit data values.
+ *
+ * A Trie2 is a highly optimized data structure for mapping from Unicode
+ * code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit value.
+ *
+ * See class Trie2 for descriptions of the API for accessing the contents of a trie.
+ *
+ * The fundamental data access methods are declared final in this class, with
+ * the intent that applications might gain a little extra performance, when compared
+ * with calling the same methods via the abstract Trie2 base class.
+ */
+public final class Trie2_16 extends Trie2 {
+
+ /**
+ * Internal constructor, not for general use.
+ */
+ Trie2_16() {
+ }
+
+
+ /**
+ * Create a Trie2 from its serialized form. Inverse of utrie2_serialize().
+ * The serialized format is identical between ICU4C and ICU4J, so this function
+ * will work with serialized Trie2s from either.
+ *
+ * The serialized Trie2 in the bytes may be in either little or big endian byte order.
+ * This allows using serialized Tries from ICU4C without needing to consider the
+ * byte order of the system that created them.
+ *
+ * @param bytes a byte buffer to the serialized form of a UTrie2.
+ * @return An unserialized Trie2_16, ready for use.
+ * @throws IllegalArgumentException if the buffer does not contain a serialized Trie2,
+ * or if the value-bits of its options field are non-zero.
+ * @throws IOException if a read error occurs in the buffer.
+ */
+ public static Trie2_16 createFromSerialized(ByteBuffer bytes) throws IOException {
+ return (Trie2_16) Trie2.createFromSerialized(bytes);
+ }
+
+ /**
+ * Get the value for a code point as stored in the Trie2.
+ *
+ * @param codePoint the code point
+ * @return the value
+ */
+ @Override
+ public final int get(int codePoint) {
+ int value;
+ int ix;
+
+ if (codePoint >= 0) {
+ if (codePoint < 0x0d800 || (codePoint > 0x0dbff && codePoint <= 0x0ffff)) {
+ // Ordinary BMP code point, excluding leading surrogates.
+ // BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2 index.
+ // 16 bit data is stored in the index array itself.
+ ix = index[codePoint >> UTRIE2_SHIFT_2];
+ ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
+ value = index[ix];
+ return value;
+ }
+ if (codePoint <= 0xffff) {
+ // Lead Surrogate Code Point. A Separate index section is stored for
+ // lead surrogate code units and code points.
+ // The main index has the code unit data.
+ // For this function, we need the code point data.
+ // Note: this expression could be refactored for slightly improved efficiency, but
+ // surrogate code points will be so rare in practice that it's not worth it.
+ ix = index[UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)];
+ ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
+ value = index[ix];
+ return value;
+ }
+ if (codePoint < highStart) {
+ // Supplemental code point, use two-level lookup.
+ ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> UTRIE2_SHIFT_1);
+ ix = index[ix];
+ ix += (codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK;
+ ix = index[ix];
+ ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
+ value = index[ix];
+ return value;
+ }
+ if (codePoint <= 0x10ffff) {
+ value = index[highValueIndex];
+ return value;
+ }
+ }
+
+ // Fall through. The code point is outside of the legal range of 0..0x10ffff.
+ return errorValue;
+ }
+
+
+ /**
+ * Get a Trie2 value for a UTF-16 code unit.
+ *
+ * This function returns the same value as get() if the input
+ * character is outside of the lead surrogate range
+ *
+ * There are two values stored in a Trie2 for inputs in the lead
+ * surrogate range. This function returns the alternate value,
+ * while Trie2.get() returns the main value.
+ *
+ * @param codeUnit a 16 bit code unit or lead surrogate value.
+ * @return the value
+ */
+ @Override
+ public int getFromU16SingleLead(char codeUnit) {
+ int value;
+ int ix;
+
+ // Because the input is a 16 bit char, we can skip the tests for it being in
+ // the BMP range. It is.
+ ix = index[codeUnit >> UTRIE2_SHIFT_2];
+ ix = (ix << UTRIE2_INDEX_SHIFT) + (codeUnit & UTRIE2_DATA_MASK);
+ value = index[ix];
+ return value;
+ }
+
+ /**
+ * @return the number of bytes of the serialized trie
+ */
+ public int getSerializedLength() {
+ return 16+(header.indexLength+dataLength)*2;
+ }
+}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/TrieIterator.java Tue Jul 14 16:29:08 2015 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,547 +0,0 @@
-/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
-
-package sun.text.normalizer;
-
-/**
- * Class enabling iteration of the values in a Trie.
- * <p>Result of each iteration contains the interval of codepoints that have
- * the same value type and the value type itself.
- * <p>The comparison of each codepoint value is done via extract(), which the
- * default implementation is to return the value as it is.
- * <p>Method extract() can be overwritten to perform manipulations on
- * codepoint values in order to perform specialized comparison.
- * <p>TrieIterator is designed to be a generic iterator for the CharTrie
- * and the IntTrie, hence to accommodate both types of data, the return
- * result will be in terms of int (32 bit) values.
- * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.
- * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
- * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
- * sense, the caller will have to pass a object with a callback function
- * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
- * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
- * codepoints with the same value as determined by
- * UTrieEnumValue(const void *context, uint32_t value). for each range,
- * utrie_enum calls the callback function to perform a task. In this way,
- * icu4c performs the iteration within utrie_enum.
- * To follow the JDK model, icu4j is slightly different from icu4c.
- * Instead of requesting the caller to implement an object for a callback.
- * The caller will have to implement a subclass of TrieIterator, fleshing out
- * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
- * the caller will have to code his own iteration and flesh out the task
- * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
- *
- * <p>There are basically 3 usage scenarios for porting:
- * <p>1) UTrieEnumValue is the only implemented callback then just implement a
- * subclass of TrieIterator and override the extract(int) method. The
- * extract(int) method is analogus to UTrieEnumValue callback.
- *
- * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
- * a subclass of TrieIterator, override the extract method and iterate, e.g.<br>
- * {@code utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
- * set);}<br>
- * In Java:<br>
- * <pre>
- * class TrieIteratorImpl extends TrieIterator{
- * public TrieIteratorImpl(Trie data){
- * super(data);
- * }
- * public int extract(int value){
- * // port the implementation of _enumPropertyStartsValue here
- * }
- * }
- * ....
- * TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
- * while(fcdIter.next(result)) {
- * // port the implementation of _enumPropertyStartsRange
- * }
- * </pre>
- *
- * <p>3) UTrieEnumRange is the only implemented callback then just implement
- * the while loop, when utrie_enum is called
- * <pre>{@code
- * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
- * TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie);
- * while(fcdIter.next(result)){
- * set.add(result.start);
- * }
- * }</pre>
- *
- * @author synwee
- * @see com.ibm.icu.impl.Trie
- * @see com.ibm.icu.lang.UCharacterTypeIterator
- * @since release 2.1, Jan 17 2002
- */
-public class TrieIterator implements RangeValueIterator
-{
-
- // public constructor ---------------------------------------------
-
- /**
- * TrieEnumeration constructor
- * @param trie to be used
- * @exception IllegalArgumentException throw when argument is null.
- */
- public TrieIterator(Trie trie)
- {
- if (trie == null) {
- throw new IllegalArgumentException(
- "Argument trie cannot be null");
- }
- m_trie_ = trie;
- // synwee: check that extract belongs to the child class
- m_initialValue_ = extract(m_trie_.getInitialValue());
- reset();
- }
-
- // public methods -------------------------------------------------
-
- /**
- * <p>Returns true if we are not at the end of the iteration, false
- * otherwise.</p>
- * <p>The next set of codepoints with the same value type will be
- * calculated during this call and returned in the arguement element.</p>
- * @param element return result
- * @return true if we are not at the end of the iteration, false otherwise.
- * @exception NoSuchElementException - if no more elements exist.
- * @see com.ibm.icu.util.RangeValueIterator.Element
- */
- public final boolean next(Element element)
- {
- if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
- return false;
- }
- if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
- calculateNextBMPElement(element)) {
- return true;
- }
- calculateNextSupplementaryElement(element);
- return true;
- }
-
- /**
- * Resets the iterator to the beginning of the iteration
- */
- public final void reset()
- {
- m_currentCodepoint_ = 0;
- m_nextCodepoint_ = 0;
- m_nextIndex_ = 0;
- m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
- if (m_nextBlock_ == 0) {
- m_nextValue_ = m_initialValue_;
- }
- else {
- m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
- }
- m_nextBlockIndex_ = 0;
- m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
- }
-
- // protected methods ----------------------------------------------
-
- /**
- * Called by next() to extracts a 32 bit value from a trie value
- * used for comparison.
- * This method is to be overwritten if special manipulation is to be done
- * to retrieve a relevant comparison.
- * The default function is to return the value as it is.
- * @param value a value from the trie
- * @return extracted value
- */
- protected int extract(int value)
- {
- return value;
- }
-
- // private methods ------------------------------------------------
-
- /**
- * Set the result values
- * @param element return result object
- * @param start codepoint of range
- * @param limit (end + 1) codepoint of range
- * @param value common value of range
- */
- private final void setResult(Element element, int start, int limit,
- int value)
- {
- element.start = start;
- element.limit = limit;
- element.value = value;
- }
-
- /**
- * Finding the next element.
- * This method is called just before returning the result of
- * next().
- * We always store the next element before it is requested.
- * In the case that we have to continue calculations into the
- * supplementary planes, a false will be returned.
- * @param element return result object
- * @return true if the next range is found, false if we have to proceed to
- * the supplementary range.
- */
- private final boolean calculateNextBMPElement(Element element)
- {
- int currentBlock = m_nextBlock_;
- int currentValue = m_nextValue_;
- m_currentCodepoint_ = m_nextCodepoint_;
- m_nextCodepoint_ ++;
- m_nextBlockIndex_ ++;
- if (!checkBlockDetail(currentValue)) {
- setResult(element, m_currentCodepoint_, m_nextCodepoint_,
- currentValue);
- return true;
- }
- // synwee check that next block index == 0 here
- // enumerate BMP - the main loop enumerates data blocks
- while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
- m_nextIndex_ ++;
- // because of the way the character is split to form the index
- // the lead surrogate and trail surrogate can not be in the
- // mid of a block
- if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
- // skip lead surrogate code units,
- // go to lead surrogate codepoints
- m_nextIndex_ = BMP_INDEX_LENGTH_;
- }
- else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
- // go back to regular BMP code points
- m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
- }
-
- m_nextBlockIndex_ = 0;
- if (!checkBlock(currentBlock, currentValue)) {
- setResult(element, m_currentCodepoint_, m_nextCodepoint_,
- currentValue);
- return true;
- }
- }
- m_nextCodepoint_ --; // step one back since this value has not been
- m_nextBlockIndex_ --; // retrieved yet.
- return false;
- }
-
- /**
- * Finds the next supplementary element.
- * For each entry in the trie, the value to be delivered is passed through
- * extract().
- * We always store the next element before it is requested.
- * Called after calculateNextBMP() completes its round of BMP characters.
- * There is a slight difference in the usage of m_currentCodepoint_
- * here as compared to calculateNextBMP(). Though both represents the
- * lower bound of the next element, in calculateNextBMP() it gets set
- * at the start of any loop, where-else, in calculateNextSupplementary()
- * since m_currentCodepoint_ already contains the lower bound of the
- * next element (passed down from calculateNextBMP()), we keep it till
- * the end before resetting it to the new value.
- * Note, if there are no more iterations, it will never get to here.
- * Blocked out by next().
- * @param element return result object
- */
- private final void calculateNextSupplementaryElement(Element element)
- {
- int currentValue = m_nextValue_;
- int currentBlock = m_nextBlock_;
- m_nextCodepoint_ ++;
- m_nextBlockIndex_ ++;
-
- if (UTF16.getTrailSurrogate(m_nextCodepoint_)
- != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
- // this piece is only called when we are in the middle of a lead
- // surrogate block
- if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
- setResult(element, m_currentCodepoint_, m_nextCodepoint_,
- currentValue);
- m_currentCodepoint_ = m_nextCodepoint_;
- return;
- }
- // we have cleared one block
- m_nextIndex_ ++;
- m_nextTrailIndexOffset_ ++;
- if (!checkTrailBlock(currentBlock, currentValue)) {
- setResult(element, m_currentCodepoint_, m_nextCodepoint_,
- currentValue);
- m_currentCodepoint_ = m_nextCodepoint_;
- return;
- }
- }
- int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
- // enumerate supplementary code points
- while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
- // lead surrogate access
- int leadBlock =
- m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
- Trie.INDEX_STAGE_2_SHIFT_;
- if (leadBlock == m_trie_.m_dataOffset_) {
- // no entries for a whole block of lead surrogates
- if (currentValue != m_initialValue_) {
- m_nextValue_ = m_initialValue_;
- m_nextBlock_ = 0;
- m_nextBlockIndex_ = 0;
- setResult(element, m_currentCodepoint_, m_nextCodepoint_,
- currentValue);
- m_currentCodepoint_ = m_nextCodepoint_;
- return;
- }
-
- nextLead += DATA_BLOCK_LENGTH_;
- // number of total affected supplementary codepoints in one
- // block
- // this is not a simple addition of
- // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
- // that we might have moved some of the codepoints
- m_nextCodepoint_ = UCharacterProperty.getRawSupplementary(
- (char)nextLead,
- (char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
- continue;
- }
- if (m_trie_.m_dataManipulate_ == null) {
- throw new NullPointerException(
- "The field DataManipulate in this Trie is null");
- }
- // enumerate trail surrogates for this lead surrogate
- m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
- m_trie_.getValue(leadBlock +
- (nextLead & Trie.INDEX_STAGE_3_MASK_)));
- if (m_nextIndex_ <= 0) {
- // no data for this lead surrogate
- if (currentValue != m_initialValue_) {
- m_nextValue_ = m_initialValue_;
- m_nextBlock_ = 0;
- m_nextBlockIndex_ = 0;
- setResult(element, m_currentCodepoint_, m_nextCodepoint_,
- currentValue);
- m_currentCodepoint_ = m_nextCodepoint_;
- return;
- }
- m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
- } else {
- m_nextTrailIndexOffset_ = 0;
- if (!checkTrailBlock(currentBlock, currentValue)) {
- setResult(element, m_currentCodepoint_, m_nextCodepoint_,
- currentValue);
- m_currentCodepoint_ = m_nextCodepoint_;
- return;
- }
- }
- nextLead ++;
- }
-
- // deliver last range
- setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
- currentValue);
- }
-
- /**
- * Internal block value calculations
- * Performs calculations on a data block to find codepoints in m_nextBlock_
- * after the index m_nextBlockIndex_ that has the same value.
- * Note m_*_ variables at this point is the next codepoint whose value
- * has not been calculated.
- * But when returned with false, it will be the last codepoint whose
- * value has been calculated.
- * @param currentValue the value which other codepoints are tested against
- * @return true if the whole block has the same value as currentValue or if
- * the whole block has been calculated, false otherwise.
- */
- private final boolean checkBlockDetail(int currentValue)
- {
- while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
- m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
- m_nextBlockIndex_));
- if (m_nextValue_ != currentValue) {
- return false;
- }
- ++ m_nextBlockIndex_;
- ++ m_nextCodepoint_;
- }
- return true;
- }
-
- /**
- * Internal block value calculations
- * Performs calculations on a data block to find codepoints in m_nextBlock_
- * that has the same value.
- * Will call checkBlockDetail() if highlevel check fails.
- * Note m_*_ variables at this point is the next codepoint whose value
- * has not been calculated.
- * @param currentBlock the initial block containing all currentValue
- * @param currentValue the value which other codepoints are tested against
- * @return true if the whole block has the same value as currentValue or if
- * the whole block has been calculated, false otherwise.
- */
- private final boolean checkBlock(int currentBlock, int currentValue)
- {
- m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
- Trie.INDEX_STAGE_2_SHIFT_;
- if (m_nextBlock_ == currentBlock &&
- (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
- // the block is the same as the previous one, filled with
- // currentValue
- m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
- }
- else if (m_nextBlock_ == 0) {
- // this is the all-initial-value block
- if (currentValue != m_initialValue_) {
- m_nextValue_ = m_initialValue_;
- m_nextBlockIndex_ = 0;
- return false;
- }
- m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
- }
- else {
- if (!checkBlockDetail(currentValue)) {
- return false;
- }
- }
- return true;
- }
-
- /**
- * Internal block value calculations
- * Performs calculations on multiple data blocks for a set of trail
- * surrogates to find codepoints in m_nextBlock_ that has the same value.
- * Will call checkBlock() for internal block checks.
- * Note m_*_ variables at this point is the next codepoint whose value
- * has not been calculated.
- * @param currentBlock the initial block containing all currentValue
- * @param currentValue the value which other codepoints are tested against
- * @return true if the whole block has the same value as currentValue or if
- * the whole block has been calculated, false otherwise.
- */
- private final boolean checkTrailBlock(int currentBlock,
- int currentValue)
- {
- // enumerate code points for this lead surrogate
- while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
- {
- // if we ever reach here, we are at the start of a new block
- m_nextBlockIndex_ = 0;
- // copy of most of the body of the BMP loop
- if (!checkBlock(currentBlock, currentValue)) {
- return false;
- }
- m_nextTrailIndexOffset_ ++;
- m_nextIndex_ ++;
- }
- return true;
- }
-
- /**
- * Checks if we are beginning at the start of a initial block.
- * If we are then the rest of the codepoints in this initial block
- * has the same values.
- * We increment m_nextCodepoint_ and relevant data members if so.
- * This is used only in for the supplementary codepoints because
- * the offset to the trail indexes could be 0.
- * @return true if we are at the start of a initial block.
- */
- private final boolean checkNullNextTrailIndex()
- {
- if (m_nextIndex_ <= 0) {
- m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
- int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
- int leadBlock =
- m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
- Trie.INDEX_STAGE_2_SHIFT_;
- if (m_trie_.m_dataManipulate_ == null) {
- throw new NullPointerException(
- "The field DataManipulate in this Trie is null");
- }
- m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
- m_trie_.getValue(leadBlock +
- (nextLead & Trie.INDEX_STAGE_3_MASK_)));
- m_nextIndex_ --;
- m_nextBlockIndex_ = DATA_BLOCK_LENGTH_;
- return true;
- }
- return false;
- }
-
- // private data members --------------------------------------------
-
- /**
- * Size of the stage 1 BMP indexes
- */
- private static final int BMP_INDEX_LENGTH_ =
- 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
- /**
- * Lead surrogate minimum value
- */
- private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
- /**
- * Trail surrogate minimum value
- */
- private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
- /**
- * Number of trail surrogate
- */
- private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
- /**
- * Number of stage 1 indexes for supplementary calculations that maps to
- * each lead surrogate character.
- * See second pass into getRawOffset for the trail surrogate character.
- * 10 for significant number of bits for trail surrogates, 5 for what we
- * discard during shifting.
- */
- private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
- 1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
- /**
- * Number of data values in a stage 2 (data array) block.
- */
- private static final int DATA_BLOCK_LENGTH_ =
- 1 << Trie.INDEX_STAGE_1_SHIFT_;
- /**
- * Trie instance
- */
- private Trie m_trie_;
- /**
- * Initial value for trie values
- */
- private int m_initialValue_;
- /**
- * Next element results and data.
- */
- private int m_currentCodepoint_;
- private int m_nextCodepoint_;
- private int m_nextValue_;
- private int m_nextIndex_;
- private int m_nextBlock_;
- private int m_nextBlockIndex_;
- private int m_nextTrailIndexOffset_;
-}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/UBiDiProps.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/UBiDiProps.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -24,74 +24,71 @@
*/
/*
*******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ *
+ * Copyright (C) 2004-2014, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *
*******************************************************************************
-* file name: UBiDiProps.java
-* encoding: US-ASCII
-* tab size: 8 (not used)
-* indentation:4
-*
-* created on: 2005jan16
-* created by: Markus W. Scherer
-*
-* Low-level Unicode bidi/shaping properties access.
-* Java port of ubidi_props.h/.c.
-*/
+ * file name: UBiDiProps.java
+ * encoding: US-ASCII
+ * tab size: 8 (not used)
+ * indentation:4
+ *
+ * created on: 2005jan16
+ * created by: Markus W. Scherer
+ *
+ * Low-level Unicode bidi/shaping properties access.
+ * Java port of ubidi_props.h/.c.
+ */
package sun.text.normalizer;
-import java.io.BufferedInputStream;
-import java.io.DataInputStream;
-import java.io.InputStream;
import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.MissingResourceException;
public final class UBiDiProps {
// constructors etc. --------------------------------------------------- ***
// port of ubidi_openProps()
- public UBiDiProps() throws IOException{
- InputStream is=ICUData.getStream(DATA_FILE_NAME);
- BufferedInputStream b=new BufferedInputStream(is, 4096 /* data buffer size */);
- readData(b);
- b.close();
- is.close();
-
+ private UBiDiProps() throws IOException{
+ ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME);
+ readData(bytes);
}
- private void readData(InputStream is) throws IOException {
- DataInputStream inputStream=new DataInputStream(is);
-
+ private void readData(ByteBuffer bytes) throws IOException {
// read the header
- ICUBinary.readHeader(inputStream, FMT, new IsAcceptable());
+ ICUBinary.readHeader(bytes, FMT, new IsAcceptable());
// read indexes[]
int i, count;
- count=inputStream.readInt();
- if(count<IX_INDEX_TOP) {
+ count=bytes.getInt();
+ if(count<IX_TOP) {
throw new IOException("indexes[0] too small in "+DATA_FILE_NAME);
}
indexes=new int[count];
indexes[0]=count;
for(i=1; i<count; ++i) {
- indexes[i]=inputStream.readInt();
+ indexes[i]=bytes.getInt();
}
// read the trie
- trie=new CharTrie(inputStream, null);
+ trie=Trie2_16.createFromSerialized(bytes);
+ int expectedTrieLength=indexes[IX_TRIE_SIZE];
+ int trieLength=trie.getSerializedLength();
+ if(trieLength>expectedTrieLength) {
+ throw new IOException(DATA_FILE_NAME+": not enough bytes for the trie");
+ }
+ // skip padding after trie bytes
+ ICUBinary.skipBytes(bytes, expectedTrieLength-trieLength);
// read mirrors[]
count=indexes[IX_MIRROR_LENGTH];
if(count>0) {
mirrors=new int[count];
for(i=0; i<count; ++i) {
- mirrors[i]=inputStream.readInt();
+ mirrors[i]=bytes.getInt();
}
}
@@ -99,81 +96,172 @@
count=indexes[IX_JG_LIMIT]-indexes[IX_JG_START];
jgArray=new byte[count];
for(i=0; i<count; ++i) {
- jgArray[i]=inputStream.readByte();
+ jgArray[i]=bytes.get();
+ }
+
+ // read jgArray2[]
+ count=indexes[IX_JG_LIMIT2]-indexes[IX_JG_START2];
+ jgArray2=new byte[count];
+ for(i=0; i<count; ++i) {
+ jgArray2[i]=bytes.get();
}
}
// implement ICUBinary.Authenticate
- private final class IsAcceptable implements ICUBinary.Authenticate {
+ private final static class IsAcceptable implements ICUBinary.Authenticate {
public boolean isDataVersionAcceptable(byte version[]) {
- return version[0]==1 &&
- version[2]==Trie.INDEX_STAGE_1_SHIFT_ && version[3]==Trie.INDEX_STAGE_2_SHIFT_;
+ return version[0]==2;
}
}
- // UBiDiProps singleton
- private static UBiDiProps gBdp=null;
+ // property access functions ------------------------------------------- ***
+
+ public final int getClass(int c) {
+ return getClassFromProps(trie.get(c));
+ }
+
+ private final int getMirror(int c, int props) {
+ int delta=getMirrorDeltaFromProps(props);
+ if(delta!=ESC_MIRROR_DELTA) {
+ return c+delta;
+ } else {
+ /* look for mirror code point in the mirrors[] table */
+ int m;
+ int i, length;
+ int c2;
- // port of ubidi_getSingleton()
- public static final synchronized UBiDiProps getSingleton() throws IOException {
- if(gBdp==null) {
- gBdp=new UBiDiProps();
+ length=indexes[IX_MIRROR_LENGTH];
+
+ /* linear search */
+ for(i=0; i<length; ++i) {
+ m=mirrors[i];
+ c2=getMirrorCodePoint(m);
+ if(c==c2) {
+ /* found c, return its mirror code point using the index in m */
+ return getMirrorCodePoint(mirrors[getMirrorIndex(m)]);
+ } else if(c<c2) {
+ break;
+ }
+ }
+
+ /* c not found, return it itself */
+ return c;
}
- return gBdp;
}
- // UBiDiProps dummy singleton
- private static UBiDiProps gBdpDummy=null;
+ public final int getMirror(int c) {
+ int props=trie.get(c);
+ return getMirror(c, props);
+ }
- private UBiDiProps(boolean makeDummy) { // ignore makeDummy, only creates a unique signature
- indexes=new int[IX_TOP];
- indexes[0]=IX_TOP;
- trie=new CharTrie(0, 0, null); // dummy trie, always returns 0
+ public final int getJoiningType(int c) {
+ return (trie.get(c)&JT_MASK)>>JT_SHIFT;
}
- /**
- * Get a singleton dummy object, one that works with no real data.
- * This can be used when the real data is not available.
- * Using the dummy can reduce checks for available data after an initial failure.
- * Port of ucase_getDummy().
- */
- public static final synchronized UBiDiProps getDummy() {
- if(gBdpDummy==null) {
- gBdpDummy=new UBiDiProps(true);
+ public final int getJoiningGroup(int c) {
+ int start, limit;
+
+ start=indexes[IX_JG_START];
+ limit=indexes[IX_JG_LIMIT];
+ if(start<=c && c<limit) {
+ return (int)jgArray[c-start]&0xff;
}
- return gBdpDummy;
+ start=indexes[IX_JG_START2];
+ limit=indexes[IX_JG_LIMIT2];
+ if(start<=c && c<limit) {
+ return (int)jgArray2[c-start]&0xff;
+ }
+ return UCharacter.JoiningGroup.NO_JOINING_GROUP;
}
- public final int getClass(int c) {
- return getClassFromProps(trie.getCodePointValue(c));
+ public final int getPairedBracketType(int c) {
+ return (trie.get(c)&BPT_MASK)>>BPT_SHIFT;
+ }
+
+ public final int getPairedBracket(int c) {
+ int props=trie.get(c);
+ if((props&BPT_MASK)==0) {
+ return c;
+ } else {
+ return getMirror(c, props);
+ }
}
// data members -------------------------------------------------------- ***
private int indexes[];
private int mirrors[];
private byte jgArray[];
+ private byte jgArray2[];
- private CharTrie trie;
+ private Trie2_16 trie;
// data format constants ----------------------------------------------- ***
private static final String DATA_FILE_NAME = "/sun/text/resources/ubidi.icu";
/* format "BiDi" */
- private static final byte FMT[]={ 0x42, 0x69, 0x44, 0x69 };
+ private static final int FMT=0x42694469;
/* indexes into indexes[] */
- private static final int IX_INDEX_TOP=0;
+ private static final int IX_TRIE_SIZE=2;
private static final int IX_MIRROR_LENGTH=3;
private static final int IX_JG_START=4;
private static final int IX_JG_LIMIT=5;
+ private static final int IX_JG_START2=6; /* new in format version 2.2, ICU 54 */
+ private static final int IX_JG_LIMIT2=7;
private static final int IX_TOP=16;
+ // definitions for 16-bit bidi/shaping properties word ----------------- ***
+
+ /* CLASS_SHIFT=0, */ /* bidi class: 5 bits (4..0) */
+ private static final int JT_SHIFT=5; /* joining type: 3 bits (7..5) */
+
+ private static final int BPT_SHIFT=8; /* Bidi_Paired_Bracket_Type(bpt): 2 bits (9..8) */
+
+ private static final int MIRROR_DELTA_SHIFT=13; /* bidi mirroring delta: 3 bits (15..13) */
+
private static final int CLASS_MASK= 0x0000001f;
+ private static final int JT_MASK= 0x000000e0;
+ private static final int BPT_MASK= 0x00000300;
private static final int getClassFromProps(int props) {
return props&CLASS_MASK;
}
+ private static final boolean getFlagFromProps(int props, int shift) {
+ return ((props>>shift)&1)!=0;
+ }
+ private static final int getMirrorDeltaFromProps(int props) {
+ return (short)props>>MIRROR_DELTA_SHIFT;
+ }
+ private static final int ESC_MIRROR_DELTA=-4;
+
+ // definitions for 32-bit mirror table entry --------------------------- ***
+
+ /* the source Unicode code point takes 21 bits (20..0) */
+ private static final int MIRROR_INDEX_SHIFT=21;
+
+ private static final int getMirrorCodePoint(int m) {
+ return m&0x1fffff;
+ }
+ private static final int getMirrorIndex(int m) {
+ return m>>>MIRROR_INDEX_SHIFT;
+ }
+
+
+ /*
+ * Public singleton instance, created from DATA_FILE_NAME when this class is initialized.
+ */
+ public static final UBiDiProps INSTANCE;
+
+ // This static initializer block must be placed after other static member
+ // initialization; an IOException while loading is rethrown as MissingResourceException.
+ static {
+ try {
+ INSTANCE = new UBiDiProps();
+ } catch (IOException e) {
+ throw (MissingResourceException) new MissingResourceException(e.getMessage(),DATA_FILE_NAME,"").initCause(e);
+ }
+ }
}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacter.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacter.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,40 +22,30 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
+
+/**
+*******************************************************************************
+* Copyright (C) 1996-2014, International Business Machines Corporation and
+* others. All Rights Reserved.
+*******************************************************************************
+*/
package sun.text.normalizer;
-import java.io.IOException;
-import java.util.MissingResourceException;
-
/**
- * <p>
- * The UCharacter class provides extensions to the
- * <a href="http://docs.oracle.com/javase/1.5.0/docs/api/java/lang/Character.html">
+ * <p>The UCharacter class provides extensions to the
+ * <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">
* java.lang.Character</a> class. These extensions provide support for
* more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
* class, provide support for supplementary characters (those with code
* points above U+FFFF).
* Each ICU release supports the latest version of Unicode available at that time.
- * </p>
- * <p>
- * Code points are represented in these API using ints. While it would be
+ *
+ * <p>Code points are represented in these API using ints. While it would be
* more convenient in Java to have a separate primitive datatype for them,
* ints suffice in the meantime.
- * </p>
- * <p>
- * To use this class please add the jar file name icu4j.jar to the
+ *
+ * <p>To use this class please add the jar file name icu4j.jar to the
* class path, since it contains data files which supply the information used
* by this file.<br>
* E.g. In Windows <br>
@@ -64,9 +54,8 @@
* unames.icu from the icu4j source subdirectory
* <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
* <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
- * </p>
- * <p>
- * Aside from the additions for UTF-16 support, and the updated Unicode
+ *
+ * <p>Aside from the additions for UTF-16 support, and the updated Unicode
* properties, the main differences between UCharacter and Character are:
* <ul>
* <li> UCharacter is not designed to be a char wrapper and does not have
@@ -87,8 +76,9 @@
* as having numeric values. This is a semantic change from ICU4J 1.3.1.
* </ul>
* <p>
- * Further detail differences can be determined from the program
- * <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
+ * Further detail on differences can be determined using the program
+ * <a href=
+ * "http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
* com.ibm.icu.dev.test.lang.UCharacterCompare</a>
* </p>
* <p>
@@ -103,8 +93,11 @@
* </p>
* <p>
* For more information see
- * "About the Unicode Character Database" (http://www.unicode.org/ucd/)
- * and the ICU User Guide chapter on Properties (http://www.icu-project.org/userguide/properties.html).
+ * <a href="http://www.unicode.org/ucd/">"About the Unicode Character Database"</a>
+ * (http://www.unicode.org/ucd/)
+ * and the <a href="http://www.icu-project.org/userguide/properties.html">ICU
+ * User Guide chapter on Properties</a>
+ * (http://www.icu-project.org/userguide/properties.html).
* </p>
* <p>
* There are also functions that provide easy migration from C/POSIX functions
@@ -128,12 +121,15 @@
* Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
* (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
* </p>
+ * <p>
+ * API access for C/POSIX character classes is as follows:
* <pre>{@code
- * API access for C/POSIX character classes is as follows:
* - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
* - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
* - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
- * - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|(1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|(1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
+ * - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|
+ * (1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|
+ * (1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
* - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
* - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
* - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM)
@@ -143,21 +139,22 @@
* - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH)
* - print: hasBinaryProperty(c, UProperty.POSIX_PRINT)
* }</pre>
+ * </p>
* <p>
* The C/POSIX character classes are also available in UnicodeSet patterns,
* using patterns like [:graph:] or \p{graph}.
* </p>
- * <p>
- * Note: There are several ICU (and Java) whitespace functions.
- * Comparison:
- * - isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
+ *
+ * There are several ICU (and Java) whitespace functions.
+ * Comparison:<ul>
+ * <li> isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
* most of general categories "Z" (separators) + most whitespace ISO controls
* (including no-break spaces, but excluding IS1..IS4 and ZWSP)
- * - isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
- * - isSpaceChar: just Z (including no-break spaces)
+ * <li> isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
+ * <li> isSpaceChar: just Z (including no-break spaces)</ul>
* </p>
* <p>
- * This class is not subclassable
+ * This class is not subclassable.
* </p>
* @author Syn Wee Quek
* @stable ICU 2.1
@@ -168,6 +165,19 @@
{
/**
+ * Joining Group constants.
+ * @see UProperty#JOINING_GROUP
+ * @stable ICU 2.4
+ */
+ public static interface JoiningGroup
+ {
+ /**
+ * @stable ICU 2.4
+ */
+ public static final int NO_JOINING_GROUP = 0;
+ }
+
+ /**
* Numeric Type constants.
* @see UProperty#NUMERIC_TYPE
* @stable ICU 2.4
@@ -177,7 +187,61 @@
/**
* @stable ICU 2.4
*/
+ public static final int NONE = 0;
+ /**
+ * @stable ICU 2.4
+ */
public static final int DECIMAL = 1;
+ /**
+ * @stable ICU 2.4
+ */
+ public static final int DIGIT = 2;
+ /**
+ * @stable ICU 2.4
+ */
+ public static final int NUMERIC = 3;
+ /**
+ * @stable ICU 2.4
+ */
+ public static final int COUNT = 4;
+ }
+
+ /**
+ * Hangul Syllable Type constants.
+ *
+ * @see UProperty#HANGUL_SYLLABLE_TYPE
+ * @stable ICU 2.6
+ */
+ public static interface HangulSyllableType
+ {
+ /**
+ * @stable ICU 2.6
+ */
+ public static final int NOT_APPLICABLE = 0; /*[NA]*/ /*See note !!*/
+ /**
+ * @stable ICU 2.6
+ */
+ public static final int LEADING_JAMO = 1; /*[L]*/
+ /**
+ * @stable ICU 2.6
+ */
+ public static final int VOWEL_JAMO = 2; /*[V]*/
+ /**
+ * @stable ICU 2.6
+ */
+ public static final int TRAILING_JAMO = 3; /*[T]*/
+ /**
+ * @stable ICU 2.6
+ */
+ public static final int LV_SYLLABLE = 4; /*[LV]*/
+ /**
+ * @stable ICU 2.6
+ */
+ public static final int LVT_SYLLABLE = 5; /*[LVT]*/
+ /**
+ * @stable ICU 2.6
+ */
+ public static final int COUNT = 6;
}
// public data members -----------------------------------------------
@@ -192,22 +256,15 @@
* The highest Unicode code point value (scalar value) according to the
* Unicode Standard.
* This is a 21-bit value (21 bits, rounded up).<br>
- * Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE
+ * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
* @stable ICU 2.1
*/
public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
- /**
- * The minimum value for Supplementary code points
- * @stable ICU 2.1
- */
- public static final int SUPPLEMENTARY_MIN_VALUE =
- UTF16.SUPPLEMENTARY_MIN_VALUE;
-
// public methods ----------------------------------------------------
/**
- * Retrieves the numeric value of a decimal digit code point.
+ * Returns the numeric value of a decimal digit code point.
* <br>This method observes the semantics of
* <code>java.lang.Character.digit()</code>. Note that this
* will return positive values for code points for which isDigit
@@ -231,15 +288,54 @@
*/
public static int digit(int ch, int radix)
{
- // when ch is out of bounds getProperty == 0
- int props = getProperty(ch);
- int value;
- if (getNumericType(props) == NumericType.DECIMAL) {
- value = UCharacterProperty.getUnsignedValue(props);
+ if (2 <= radix && radix <= 36) {
+ int value = digit(ch);
+ if (value < 0) {
+ // ch is not a decimal digit, try latin letters
+ value = UCharacterProperty.getEuropeanDigit(ch);
+ }
+ return (value < radix) ? value : -1;
} else {
- value = getEuropeanDigit(ch);
+ return -1; // invalid radix
}
- return (0 <= value && value < radix) ? value : -1;
+ }
+
+ /**
+ * Returns the numeric value of a decimal digit code point.
+ * <br>This is a convenience overload of <code>digit(int, int)</code>
+ * that provides a decimal radix.
+ * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
+ * treated numeric letters and other numbers as digits. This has
+ * been changed to conform to the java semantics.
+ * @param ch the code point to query
+ * @return the numeric value represented by the code point,
+ * or -1 if the code point is not a decimal digit or if its
+ * value is too large for a decimal radix
+ * @stable ICU 2.1
+ */
+ public static int digit(int ch)
+ {
+ return UCharacterProperty.INSTANCE.digit(ch);
+ }
+
+ /**
+ * Returns a value indicating a code point's Unicode category.
+ * Up-to-date Unicode implementation of java.lang.Character.getType()
+ * except for the above mentioned code points that had their category
+ * changed.<br>
+ * Return results are constants from the interface
+ * <a href=UCharacterCategory.html>UCharacterCategory</a><br>
+ * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with
+ * those returned by java.lang.Character.getType. UCharacterCategory values
+ * match the ones used in ICU4C, while java.lang.Character type
+ * values, though similar, skip the value 17.
+ * @param ch code point whose type is to be determined
+ * @return category which is a value of UCharacterCategory
+ * @stable ICU 2.1
+ */
+ public static int getType(int ch)
+ {
+ return UCharacterProperty.INSTANCE.getType(ch);
}
/**
@@ -254,7 +350,67 @@
*/
public static int getDirection(int ch)
{
- return gBdp.getClass(ch);
+ return UBiDiProps.INSTANCE.getClass(ch);
+ }
+
+ /**
+ * Maps the specified code point to a "mirror-image" code point.
+ * For code points with the "mirrored" property, implementations sometimes
+ * need a "poor man's" mapping to another code point such that the default
+ * glyph may serve as the mirror-image of the default glyph of the
+ * specified code point.<br>
+ * This is useful for text conversion to and from codepages with visual
+ * order, and for displays without glyph selection capabilities.
+ * @param ch code point whose mirror is to be retrieved
+ * @return another code point that may serve as a mirror-image substitute,
+ * or ch itself if there is no such mapping or ch does not have the
+ * "mirrored" property
+ * @stable ICU 2.1
+ */
+ public static int getMirror(int ch)
+ {
+ return UBiDiProps.INSTANCE.getMirror(ch);
+ }
+
+ /**
+ * Maps the specified character to its paired bracket character.
+ * For Bidi_Paired_Bracket_Type!=None, this is the same as getMirror(int).
+ * Otherwise c itself is returned.
+ * See http://www.unicode.org/reports/tr9/
+ *
+ * @param c the code point to be mapped
+ * @return the paired bracket code point,
+ * or c itself if there is no such mapping
+ * (Bidi_Paired_Bracket_Type=None)
+ *
+ * @see UProperty#BIDI_PAIRED_BRACKET
+ * @see UProperty#BIDI_PAIRED_BRACKET_TYPE
+ * @see #getMirror(int)
+ * @stable ICU 52
+ */
+ public static int getBidiPairedBracket(int c) {
+ return UBiDiProps.INSTANCE.getPairedBracket(c);
+ }
+
+ /**
+ * Returns the combining class of the argument codepoint
+ * @param ch code point whose combining is to be retrieved
+ * @return the combining class of the codepoint
+ * @stable ICU 2.1
+ */
+ public static int getCombiningClass(int ch)
+ {
+ return Normalizer2.getNFDInstance().getCombiningClass(ch);
+ }
+
+ /**
+ * Returns the version of Unicode data used.
+ * @return the unicode version number used
+ * @stable ICU 2.1
+ */
+ public static VersionInfo getUnicodeVersion()
+ {
+ return UCharacterProperty.INSTANCE.m_unicodeVersion_;
}
/**
@@ -275,7 +431,7 @@
}
/**
- * <p>Get the "age" of the code point.</p>
+ * <p>Returns the "age" of the code point.</p>
* <p>The "age" is the Unicode version when the code point was first
* designated (as a non-character or for Private Use) or assigned a
* character.
@@ -289,143 +445,95 @@
public static VersionInfo getAge(int ch)
{
if (ch < MIN_VALUE || ch > MAX_VALUE) {
- throw new IllegalArgumentException("Codepoint out of bounds");
+ throw new IllegalArgumentException("Codepoint out of bounds");
}
- return PROPERTY_.getAge(ch);
- }
-
- // private variables -------------------------------------------------
-
- /**
- * Database storing the sets of character property
- */
- private static final UCharacterProperty PROPERTY_;
- /**
- * For optimization
- */
- private static final char[] PROPERTY_TRIE_INDEX_;
- private static final char[] PROPERTY_TRIE_DATA_;
- private static final int PROPERTY_INITIAL_VALUE_;
-
- private static final UBiDiProps gBdp;
-
- // block to initialise character property database
- static
- {
- try
- {
- PROPERTY_ = UCharacterProperty.getInstance();
- PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_;
- PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_;
- PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_;
- }
- catch (Exception e)
- {
- throw new MissingResourceException(e.getMessage(),"","");
- }
-
- UBiDiProps bdp;
- try {
- bdp=UBiDiProps.getSingleton();
- } catch(IOException e) {
- bdp=UBiDiProps.getDummy();
- }
- gBdp=bdp;
+ return UCharacterProperty.INSTANCE.getAge(ch);
}
/**
- * Shift to get numeric type
- */
- private static final int NUMERIC_TYPE_SHIFT_ = 5;
- /**
- * Mask to get numeric type
+ * <p>Returns the property value for a Unicode property type of a code point.
+ * Also returns binary and mask property values.</p>
+ * <p>Unicode, especially in version 3.2, defines many more properties than
+ * the original set in UnicodeData.txt.</p>
+ * <p>The properties APIs are intended to reflect Unicode properties as
+ * defined in the Unicode Character Database (UCD) and Unicode Technical
+ * Reports (UTR). For details about the properties see
+ * http://www.unicode.org/.</p>
+ * <p>For names of Unicode properties see the UCD file PropertyAliases.txt.
+ * </p>
+ * <pre>
+ * Sample usage:
+ * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
+ * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
+ * boolean b = (ideo == 1) ? true : false;
+ * </pre>
+ * @param ch code point to test.
+ * @param type UProperty selector constant, identifies which binary
+ * property to check. Must be
+ * {@code UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT} or
+ * {@code UProperty.INT_START <= type < UProperty.INT_LIMIT} or
+ * {@code UProperty.MASK_START <= type < UProperty.MASK_LIMIT}.
+ * @return numeric value that is directly the property value or,
+ * for enumerated properties, corresponds to the numeric value of
+ * the enumerated constant of the respective property value
+ * enumeration type (cast to enum type if necessary).
+ * Returns 0 or 1 (for false / true) for binary Unicode properties.
+ * Returns a bit-mask for mask properties.
+ * Returns 0 if 'type' is out of bounds or if the Unicode version
+ * does not have data for the property at all, or not for this code
+ * point.
+ * @see UProperty
+ * @see #hasBinaryProperty
+ * @see #getIntPropertyMinValue
+ * @see #getIntPropertyMaxValue
+ * @see #getUnicodeVersion
+ * @stable ICU 2.4
*/
- private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;
-
- // private methods ---------------------------------------------------
-
- /**
- * Getting the digit values of characters like 'A' - 'Z', normal,
- * half-width and full-width. This method assumes that the other digit
- * characters are checked by the calling method.
- * @param ch character to test
- * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
- * its corresponding digit will be returned.
- */
- private static int getEuropeanDigit(int ch) {
- if ((ch > 0x7a && ch < 0xff21)
- || ch < 0x41 || (ch > 0x5a && ch < 0x61)
- || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
- return -1;
- }
- if (ch <= 0x7a) {
- // ch >= 0x41 or ch < 0x61
- return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
- }
- // ch >= 0xff21
- if (ch <= 0xff3a) {
- return ch + 10 - 0xff21;
- }
- // ch >= 0xff41 && ch <= 0xff5a
- return ch + 10 - 0xff41;
+ // for BiDiBase.java
+ public static int getIntPropertyValue(int ch, int type) {
+ return UCharacterProperty.INSTANCE.getIntPropertyValue(ch, type);
}
- /**
- * Gets the numeric type of the property argument
- * @param props 32 bit property
- * @return the numeric type
- */
- private static int getNumericType(int props)
- {
- return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_;
- }
+ // private constructor -----------------------------------------------
/**
- * Gets the property value at the index.
- * This is optimized.
- * Note this is alittle different from CharTrie the index m_trieData_
- * is never negative.
- * This is a duplicate of UCharacterProperty.getProperty. For optimization
- * purposes, this method calls the trie data directly instead of through
- * UCharacterProperty.getProperty.
- * @param ch code point whose property value is to be retrieved
- * @return property value of code point
- * @stable ICU 2.6
+ * Private constructor to prevent instantiation
*/
- private static final int getProperty(int ch)
- {
- if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
- || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
- && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
- // BMP codepoint 0000..D7FF or DC00..FFFF
- try { // using try for ch < 0 is faster than using an if statement
- return PROPERTY_TRIE_DATA_[
- (PROPERTY_TRIE_INDEX_[ch >> 5] << 2)
- + (ch & 0x1f)];
- } catch (ArrayIndexOutOfBoundsException e) {
- return PROPERTY_INITIAL_VALUE_;
- }
- }
- if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
- // lead surrogate D800..DBFF
- return PROPERTY_TRIE_DATA_[
- (PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2)
- + (ch & 0x1f)];
- }
- // for optimization
- if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
- // supplementary code point 10000..10FFFF
- // look at the construction of supplementary characters
- // trail forms the ends of it.
- return PROPERTY_.m_trie_.getSurrogateValue(
- UTF16.getLeadSurrogate(ch),
- (char)(ch & 0x3ff));
- }
- // return m_dataOffset_ if there is an error, in this case we return
- // the default value: m_initialValue_
- // we cannot assume that m_initialValue_ is at offset 0
- // this is for optimization.
- return PROPERTY_INITIAL_VALUE_;
- }
+ private UCharacter() { }
+
+ /*
+ * Copied from UCharacterEnums.java
+ */
+ /**
+ * Character type Mn
+ * @stable ICU 2.1
+ */
+ public static final byte NON_SPACING_MARK = 6;
+ /**
+ * Character type Me
+ * @stable ICU 2.1
+ */
+ public static final byte ENCLOSING_MARK = 7;
+ /**
+ * Character type Mc
+ * @stable ICU 2.1
+ */
+ public static final byte COMBINING_SPACING_MARK = 8;
+ /**
+ * Character type count
+ * @stable ICU 2.1
+ */
+ public static final byte CHAR_CATEGORY_COUNT = 30;
+
+ /**
+ * Directional type R
+ * @stable ICU 2.1
+ */
+ public static final int RIGHT_TO_LEFT = 1;
+ /**
+ * Directional type AL
+ * @stable ICU 2.1
+ */
+ public static final int RIGHT_TO_LEFT_ARABIC = 13;
}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterIterator.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterIterator.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2003, 2006, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -25,13 +25,8 @@
/*
*******************************************************************************
- * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 1996-2014, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
*******************************************************************************
*/
@@ -84,7 +79,6 @@
return new ReplaceableUCharacterIterator(source);
}
- //// for StringPrep
/**
* Returns a <code>UCharacterIterator</code> object given a
* source StringBuffer.
@@ -97,7 +91,7 @@
return new ReplaceableUCharacterIterator(source);
}
- /**
+ /**
* Returns a <code>UCharacterIterator</code> object given a
* CharacterIterator.
* @param source a valid CharacterIterator object.
@@ -112,21 +106,12 @@
// public methods ----------------------------------------------------------
/**
- * Returns the code unit at the current index. If index is out
- * of range, returns DONE. Index is not changed.
- * @return current code unit
- * @stable ICU 2.4
- */
- public abstract int current();
-
- /**
* Returns the length of the text
* @return length of the text
* @stable ICU 2.4
*/
public abstract int getLength();
-
/**
* Gets the current index in text.
* @return current index in text.
@@ -134,7 +119,6 @@
*/
public abstract int getIndex();
-
/**
* Returns the UTF16 code unit at index, and increments to the next
* code unit (post-increment semantics). If index is out of
@@ -183,6 +167,33 @@
*/
public abstract int previous();
+
+ /**
+ * Retreat to the start of the previous code point in the text,
+ * and return it (pre-decrement semantics). If the index is not
+ * preceded by a valid surrogate pair, the behavior is the same
+ * as <code>previous()</code>. Otherwise the iterator is
+ * decremented to the start of the surrogate pair, and the code
+ * point represented by the pair is returned.
+ * @return the previous code point in the text, or DONE if the new
+ * index is before the start of the text.
+ * @stable ICU 2.4
+ */
+ public int previousCodePoint(){
+ int ch1 = previous();
+ if(UTF16.isTrailSurrogate((char)ch1)){
+ int ch2 = previous();
+ if(UTF16.isLeadSurrogate((char)ch2)){
+ return UCharacterProperty.getRawSupplementary((char)ch2,
+ (char)ch1);
+ }else if (ch2 != DONE) {
+ //unmatched trail surrogate so back out
+ next();
+ }
+ }
+ return ch1;
+ }
+
/**
* Sets the index to the specified index in the text.
* @param index the index within the text.
@@ -192,7 +203,14 @@
*/
public abstract void setIndex(int index);
- //// for StringPrep
+ /**
+ * Sets the current index to the start.
+ * @stable ICU 2.4
+ */
+ public void setToStart() {
+ setIndex(0);
+ }
+
/**
* Fills the buffer with the underlying text storage of the iterator
* If the buffer capacity is not enough a exception is thrown. The capacity
@@ -222,20 +240,19 @@
* units.
* @param offset the position within the array to start putting the data.
* @return the number of code units added to fillIn, as a convenience
- * @exception IndexOutOfBounds exception if there is not enough
- * room after offset in the array, or if offset {@literal <} 0.
+ * @exception IndexOutOfBoundsException exception if there is not enough
+ * room after offset in the array, or if offset {@literal <} 0.
* @stable ICU 2.4
*/
public abstract int getText(char[] fillIn, int offset);
- //// for StringPrep
/**
* Convenience override for <code>getText(char[], int)</code> that provides
* an offset of 0.
* @param fillIn an array of chars to fill with the underlying UTF-16 code
* units.
* @return the number of code units added to fillIn, as a convenience
- * @exception IndexOutOfBounds exception if there is not enough
+ * @exception IndexOutOfBoundsException exception if there is not enough
* room in the array.
* @stable ICU 2.4
*/
@@ -243,7 +260,6 @@
return getText(fillIn, 0);
}
- //// for StringPrep
/**
* Convenience method for returning the underlying text storage as a string
* @return the underlying text storage in the iterator as a string
@@ -256,25 +272,32 @@
}
/**
- * Moves the current position by the number of code units
- * specified, either forward or backward depending on the sign
- * of delta (positive or negative respectively). If the resulting
- * index would be less than zero, the index is set to zero, and if
- * the resulting index would be greater than limit, the index is
- * set to limit.
- *
- * @param delta the number of code units to move the current
- * index.
- * @return the new index.
- * @exception IndexOutOfBoundsException is thrown if an invalid index is
+ * Moves the current position by the number of code points
+ * specified, either forward or backward depending on the sign of
+ * delta (positive or negative respectively). If the current index
+ * is at a trail surrogate then the first adjustment is by code
+ * unit, and the remaining adjustments are by code points. If the
+ * resulting index would be less than zero, the index is set to
+ * zero, and if the resulting index would be greater than limit,
+ * the index is set to limit.
+ * @param delta the number of code points to move the current index.
+ * @return the new index
+ * @exception IndexOutOfBoundsException is thrown if an invalid delta is
* supplied
* @stable ICU 2.4
*
*/
- public int moveIndex(int delta) {
- int x = Math.max(0, Math.min(getIndex() + delta, getLength()));
- setIndex(x);
- return x;
+ public int moveCodePointIndex(int delta){
+ if(delta>0){
+ while(delta>0 && nextCodePoint() != DONE){delta--;}
+ }else{
+ while(delta<0 && previousCodePoint() != DONE){delta++;}
+ }
+ if(delta!=0){
+ throw new IndexOutOfBoundsException();
+ }
+
+ return getIndex();
}
/**
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterProperty.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterProperty.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -24,23 +24,21 @@
*/
/*
*******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 1996-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
-import java.io.BufferedInputStream;
-import java.io.InputStream;
import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Iterator;
import java.util.MissingResourceException;
+import sun.text.normalizer.UCharacter.HangulSyllableType;
+import sun.text.normalizer.UCharacter.NumericType;
+
/**
* <p>Internal class used for Unicode character property database.</p>
* <p>This classes store binary data read from uprops.icu.
@@ -56,134 +54,72 @@
* @since release 2.1, february 1st 2002
*/
-public final class UCharacterProperty
+final class UCharacterProperty
{
// public data members -----------------------------------------------
+ /*
+ * public singleton instance
+ */
+ public static final UCharacterProperty INSTANCE;
+
/**
* Trie data
*/
- public CharTrie m_trie_;
- /**
- * Optimization
- * CharTrie index array
- */
- public char[] m_trieIndex_;
- /**
- * Optimization
- * CharTrie data array
- */
- public char[] m_trieData_;
- /**
- * Optimization
- * CharTrie data offset
- */
- public int m_trieInitialValue_;
+ public Trie2_16 m_trie_;
+
/**
* Unicode version
*/
public VersionInfo m_unicodeVersion_;
+ /**
+ * Character type mask
+ */
+ public static final int TYPE_MASK = 0x1F;
+
// uprops.h enum UPropertySource --------------------------------------- ***
+ /** From uchar.c/uprops.icu main trie */
+ public static final int SRC_CHAR=1;
/** From uchar.c/uprops.icu properties vectors trie */
public static final int SRC_PROPSVEC=2;
- /** One more than the highest UPropertySource (SRC_) constant. */
- public static final int SRC_COUNT=9;
+ /** From ubidi_props.c/ubidi.icu */
+ public static final int SRC_BIDI=5;
+ /** From normalizer2impl.cpp/nfc.nrm */
+ public static final int SRC_NFC=8;
+ /** From normalizer2impl.cpp/nfkc.nrm */
+ public static final int SRC_NFKC=9;
// public methods ----------------------------------------------------
/**
- * Java friends implementation
- */
- public void setIndexData(CharTrie.FriendAgent friendagent)
- {
- m_trieIndex_ = friendagent.getPrivateIndex();
- m_trieData_ = friendagent.getPrivateData();
- m_trieInitialValue_ = friendagent.getPrivateInitialValue();
- }
-
- /**
- * Gets the property value at the index.
- * This is optimized.
- * Note this is alittle different from CharTrie the index m_trieData_
- * is never negative.
+ * Gets the main property value for code point ch.
* @param ch code point whose property value is to be retrieved
* @return property value of code point
*/
public final int getProperty(int ch)
{
- if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
- || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
- && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
- // BMP codepoint 0000..D7FF or DC00..FFFF
- // optimized
- try { // using try for ch < 0 is faster than using an if statement
- return m_trieData_[
- (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
- << Trie.INDEX_STAGE_2_SHIFT_)
- + (ch & Trie.INDEX_STAGE_3_MASK_)];
- } catch (ArrayIndexOutOfBoundsException e) {
- return m_trieInitialValue_;
- }
- }
- if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
- // lead surrogate D800..DBFF
- return m_trieData_[
- (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
- + (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
- << Trie.INDEX_STAGE_2_SHIFT_)
- + (ch & Trie.INDEX_STAGE_3_MASK_)];
- }
- if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
- // supplementary code point 10000..10FFFF
- // look at the construction of supplementary characters
- // trail forms the ends of it.
- return m_trie_.getSurrogateValue(
- UTF16.getLeadSurrogate(ch),
- (char)(ch & Trie.SURROGATE_MASK_));
- }
- // ch is out of bounds
- // return m_dataOffset_ if there is an error, in this case we return
- // the default value: m_initialValue_
- // we cannot assume that m_initialValue_ is at offset 0
- // this is for optimization.
- return m_trieInitialValue_;
-
- // this all is an inlined form of return m_trie_.getCodePointValue(ch);
- }
-
- /**
- * Getting the unsigned numeric value of a character embedded in the property
- * argument
- * @param prop the character
- * @return unsigned numberic value
- */
- public static int getUnsignedValue(int prop)
- {
- return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
+ return m_trie_.get(ch);
}
/**
* Gets the unicode additional properties.
- * C version getUnicodeProperties.
+ * Java version of C u_getUnicodeProperties().
* @param codepoint codepoint whose additional properties is to be
* retrieved
- * @param column
+ * @param column The column index.
* @return unicode properties
*/
- public int getAdditional(int codepoint, int column) {
- if (column == -1) {
- return getProperty(codepoint);
+ public int getAdditional(int codepoint, int column) {
+ assert column >= 0;
+ if (column >= m_additionalColumnsCount_) {
+ return 0;
}
- if (column < 0 || column >= m_additionalColumnsCount_) {
- return 0;
- }
- return m_additionalVectors_[
- m_additionalTrie_.getCodePointValue(codepoint) + column];
- }
+ return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
+ }
- /**
+ /**
* <p>Get the "age" of the code point.</p>
* <p>The "age" is the Unicode version when the code point was first
* designated (as a non-character or for Private Use) or assigned a
@@ -203,6 +139,91 @@
version & LAST_NIBBLE_MASK_, 0, 0);
}
+ // int-value and enumerated properties --------------------------------- ***
+
+ public int getType(int c) {
+ return getProperty(c)&TYPE_MASK;
+ }
+
+ /*
+ * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
+ * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
+ */
+ private static final int /* UHangulSyllableType */ gcbToHst[]={
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */
+ HangulSyllableType.LEADING_JAMO, /* U_GCB_L */
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */
+ HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */
+ HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */
+ HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */
+ HangulSyllableType.VOWEL_JAMO /* U_GCB_V */
+ /*
+ * Omit GCB values beyond what we need for hst.
+ * The code below checks for the array length.
+ */
+ };
+
+ private class IntProperty {
+ int column; // SRC_PROPSVEC column, or "source" if mask==0
+ int mask;
+ int shift;
+
+ IntProperty(int column, int mask, int shift) {
+ this.column=column;
+ this.mask=mask;
+ this.shift=shift;
+ }
+
+ IntProperty(int source) {
+ this.column=source;
+ this.mask=0;
+ }
+
+ int getValue(int c) {
+ // systematic, directly stored properties
+ return (getAdditional(c, column)&mask)>>>shift;
+ }
+ }
+
+ private class BiDiIntProperty extends IntProperty {
+ BiDiIntProperty() {
+ super(SRC_BIDI);
+ }
+ }
+
+ private class CombiningClassIntProperty extends IntProperty {
+ CombiningClassIntProperty(int source) {
+ super(source);
+ }
+ }
+
+ private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties
+ int which;
+ int max;
+
+ NormQuickCheckIntProperty(int source, int which, int max) {
+ super(source);
+ this.which=which;
+ this.max=max;
+ }
+ }
+
+ private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE
+ int getValue(int c) {
+ return UBiDiProps.INSTANCE.getPairedBracketType(c);
+ }
+ };
+
+ public int getIntPropertyValue(int c, int which) {
+ if (which == BIDI_PAIRED_BRACKET_TYPE) {
+ return intProp.getValue(c);
+ }
+ return 0; // undefined
+ }
+
/**
* Forms a supplementary code point from the argument character<br>
* Note this is for internal use hence no checks for the validity of the
@@ -217,42 +238,48 @@
}
/**
- * Loads the property data and initialize the UCharacterProperty instance.
- * @throws MissingResourceException when data is missing or data has been corrupted
- */
- public static UCharacterProperty getInstance()
+ * Gets the type mask
+ * @param type character type
+ * @return mask
+ */
+ public static final int getMask(int type)
{
- if(INSTANCE_ == null) {
- try {
- INSTANCE_ = new UCharacterProperty();
- }
- catch (Exception e) {
- throw new MissingResourceException(e.getMessage(),"","");
- }
- }
- return INSTANCE_;
+ return 1 << type;
}
/**
- * Checks if the argument c is to be treated as a white space in ICU
- * rules. Usually ICU rule white spaces are ignored unless quoted.
- * Equivalent to test for Pattern_White_Space Unicode property.
- * Stable set of characters, won't change.
- * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
- * @param c codepoint to check
- * @return true if c is a ICU white space
+ * Returns the digit values of characters like 'A' - 'Z', normal,
+ * half-width and full-width. This method assumes that the other digit
+ * characters are checked by the calling method.
+ * @param ch character to test
+ * @return -1 if ch is not a letter 'A' - 'Z' or 'a' - 'z' (ASCII or
+ * full-width), otherwise its corresponding digit (10..35) is returned.
*/
- public static boolean isRuleWhiteSpace(int c)
- {
- /* "white space" in the sense of ICU rule parsers
- This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
- See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
- U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
- Equivalent to test for Pattern_White_Space Unicode property.
- */
- return (c >= 0x0009 && c <= 0x2029 &&
- (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
- c == 0x200E || c == 0x200F || c >= 0x2028));
+ public static int getEuropeanDigit(int ch) {
+ if ((ch > 0x7a && ch < 0xff21)
+ || ch < 0x41 || (ch > 0x5a && ch < 0x61)
+ || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
+ return -1;
+ }
+ if (ch <= 0x7a) {
+ // ch is in 0x41..0x5a ('A'..'Z') or 0x61..0x7a ('a'..'z')
+ return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
+ }
+ // ch >= 0xff21
+ if (ch <= 0xff3a) {
+ return ch + 10 - 0xff21;
+ }
+ // ch >= 0xff41 && ch <= 0xff5a
+ return ch + 10 - 0xff41;
+ }
+
+ public int digit(int c) {
+ int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
+ if(value<=9) {
+ return value;
+ } else {
+ return -1;
+ }
}
// protected variables -----------------------------------------------
@@ -260,7 +287,7 @@
/**
* Extra property trie
*/
- CharTrie m_additionalTrie_;
+ Trie2_16 m_additionalTrie_;
/**
* Extra property vectors, 1st column for age and second for binary
* properties.
@@ -280,40 +307,24 @@
* 0
*/
int m_maxJTGValue_;
+ /**
+ * Script_Extensions data
+ */
+ public char[] m_scriptExtensions_;
// private variables -------------------------------------------------
- /**
- * UnicodeData.txt property object
- */
- private static UCharacterProperty INSTANCE_ = null;
-
/**
* Default name of the datafile
*/
private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
/**
- * Default buffer size of datafile
- */
- private static final int DATA_BUFFER_SIZE_ = 25000;
-
- /**
- * Numeric value shift
- */
- private static final int VALUE_SHIFT_ = 8;
-
- /**
- * Mask to be applied after shifting to obtain an unsigned numeric value
- */
- private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;
-
- /**
* Shift value for lead surrogate to form a supplementary character.
*/
private static final int LEAD_SURROGATE_SHIFT_ = 10;
/**
- * Offset to add to combined surrogate pair to avoid msking.
+ * Offset to add to combined surrogate pair to avoid masking.
*/
private static final int SURROGATE_OFFSET_ =
UTF16.SUPPLEMENTARY_MIN_VALUE -
@@ -321,7 +332,153 @@
LEAD_SURROGATE_SHIFT_) -
UTF16.TRAIL_SURROGATE_MIN_VALUE;
- // additional properties ----------------------------------------------
+
+ // property data constants -------------------------------------------------
+
+ /**
+ * Numeric types and values in the main properties words.
+ */
+ private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
+ private static final int getNumericTypeValue(int props) {
+ return props >> NUMERIC_TYPE_VALUE_SHIFT_;
+ }
+
+ /* constants for the storage form of numeric types and values */
+ /** No numeric value. */
+ private static final int NTV_NONE_ = 0;
+ /** Decimal digits: nv=0..9 */
+ private static final int NTV_DECIMAL_START_ = 1;
+ /** Other digits: nv=0..9 */
+ private static final int NTV_DIGIT_START_ = 11;
+ /** Small integers: nv=0..154 */
+ private static final int NTV_NUMERIC_START_ = 21;
+
+ private static final int ntvGetType(int ntv) {
+ return
+ (ntv==NTV_NONE_) ? NumericType.NONE :
+ (ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL :
+ (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :
+ NumericType.NUMERIC;
+ }
+
+ /*
+ * Properties in vector word 0
+ * Bits
+ * 31..24 DerivedAge version major/minor one nibble each
+ * 23..22 3..1: Bits 7..0 = Script_Extensions index
+ * 3: Script value from Script_Extensions
+ * 2: Script=Inherited
+ * 1: Script=Common
+ * 0: Script=bits 7..0
+ * 21..20 reserved
+ * 19..17 East Asian Width
+ * 16.. 8 UBlockCode
+ * 7.. 0 UScriptCode
+ */
+ /**
+ * Script_Extensions: mask includes Script
+ */
+ public static final int SCRIPT_X_MASK = 0x00c000ff;
+ //private static final int SCRIPT_X_SHIFT = 22;
+ /**
+ * Integer properties mask and shift values for East Asian cell width.
+ * Equivalent to icu4c UPROPS_EA_MASK
+ */
+ private static final int EAST_ASIAN_MASK_ = 0x000e0000;
+ /**
+ * Integer properties mask and shift values for East Asian cell width.
+ * Equivalent to icu4c UPROPS_EA_SHIFT
+ */
+ private static final int EAST_ASIAN_SHIFT_ = 17;
+ /**
+ * Integer properties mask and shift values for blocks.
+ * Equivalent to icu4c UPROPS_BLOCK_MASK
+ */
+ private static final int BLOCK_MASK_ = 0x0001ff00;
+ /**
+ * Integer properties mask and shift values for blocks.
+ * Equivalent to icu4c UPROPS_BLOCK_SHIFT
+ */
+ private static final int BLOCK_SHIFT_ = 8;
+ /**
+ * Integer properties mask and shift values for scripts.
+ * Equivalent to icu4c UPROPS_SCRIPT_MASK
+ */
+ public static final int SCRIPT_MASK_ = 0x000000ff;
+
+ /**
+ * Additional properties used in internal trie data
+ */
+ /*
+ * Properties in vector word 1
+ * Each bit encodes one binary property.
+ * The following constants represent the bit number, use 1<<UPROPS_XYZ.
+ * UPROPS_BINARY_1_TOP<=32!
+ *
+ * Keep this list of property enums in sync with
+ * propListNames[] in icu/source/tools/genprops/props2.c!
+ *
+ * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
+ */
+ private static final int WHITE_SPACE_PROPERTY_ = 0;
+ private static final int DASH_PROPERTY_ = 1;
+ private static final int HYPHEN_PROPERTY_ = 2;
+ private static final int QUOTATION_MARK_PROPERTY_ = 3;
+ private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
+ private static final int MATH_PROPERTY_ = 5;
+ private static final int HEX_DIGIT_PROPERTY_ = 6;
+ private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
+ private static final int ALPHABETIC_PROPERTY_ = 8;
+ private static final int IDEOGRAPHIC_PROPERTY_ = 9;
+ private static final int DIACRITIC_PROPERTY_ = 10;
+ private static final int EXTENDER_PROPERTY_ = 11;
+ private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
+ private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
+ private static final int GRAPHEME_LINK_PROPERTY_ = 14;
+ private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
+ private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
+ private static final int RADICAL_PROPERTY_ = 17;
+ private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
+ private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
+ private static final int DEPRECATED_PROPERTY_ = 20;
+ private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
+ private static final int XID_START_PROPERTY_ = 22;
+ private static final int XID_CONTINUE_PROPERTY_ = 23;
+ private static final int ID_START_PROPERTY_ = 24;
+ private static final int ID_CONTINUE_PROPERTY_ = 25;
+ private static final int GRAPHEME_BASE_PROPERTY_ = 26;
+ private static final int S_TERM_PROPERTY_ = 27;
+ private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
+ private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */
+ private static final int PATTERN_WHITE_SPACE = 30;
+
+ /*
+ * Properties in vector word 2
+ * Bits
+ * 31..26 reserved
+ * 25..20 Line Break
+ * 19..15 Sentence Break
+ * 14..10 Word Break
+ * 9.. 5 Grapheme Cluster Break
+ * 4.. 0 Decomposition Type
+ */
+ private static final int LB_MASK = 0x03f00000;
+ private static final int LB_SHIFT = 20;
+
+ private static final int SB_MASK = 0x000f8000;
+ private static final int SB_SHIFT = 15;
+
+ private static final int WB_MASK = 0x00007c00;
+ private static final int WB_SHIFT = 10;
+
+ private static final int GCB_MASK = 0x000003e0;
+ private static final int GCB_SHIFT = 5;
+
+ /**
+ * Integer properties mask for decomposition type.
+ * Equivalent to icu4c UPROPS_DT_MASK.
+ */
+ private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
/**
* First nibble shift
@@ -339,31 +496,112 @@
// private constructors --------------------------------------------------
/**
- * Constructor
- * @exception IOException thrown when data reading fails or data corrupted
- */
+ * Constructor
+ * @exception IOException thrown when data reading fails or data corrupted
+ */
private UCharacterProperty() throws IOException
{
// jar access
- InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
- BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
- UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
- reader.read(this);
- b.close();
+ ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
+ m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
+ // Read or skip the 16 indexes.
+ int propertyOffset = bytes.getInt();
+ /* exceptionOffset = */ bytes.getInt();
+ /* caseOffset = */ bytes.getInt();
+ int additionalOffset = bytes.getInt();
+ int additionalVectorsOffset = bytes.getInt();
+ m_additionalColumnsCount_ = bytes.getInt();
+ int scriptExtensionsOffset = bytes.getInt();
+ int reservedOffset7 = bytes.getInt();
+ /* reservedOffset8 = */ bytes.getInt();
+ /* dataTopOffset = */ bytes.getInt();
+ m_maxBlockScriptValue_ = bytes.getInt();
+ m_maxJTGValue_ = bytes.getInt();
+ ICUBinary.skipBytes(bytes, (16 - 12) << 2);
+
+ // read the main properties trie
+ m_trie_ = Trie2_16.createFromSerialized(bytes);
+ int expectedTrieLength = (propertyOffset - 16) * 4;
+ int trieLength = m_trie_.getSerializedLength();
+ if(trieLength > expectedTrieLength) {
+ throw new IOException("uprops.icu: not enough bytes for main trie");
+ }
+ // skip padding after trie bytes
+ ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
+
+ // skip unused intervening data structures
+ ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);
- m_trie_.putIndexData(this);
+ if(m_additionalColumnsCount_ > 0) {
+ // reads the additional property block
+ m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
+ expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
+ trieLength = m_additionalTrie_.getSerializedLength();
+ if(trieLength > expectedTrieLength) {
+ throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
+ }
+ // skip padding after trie bytes
+ ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
+
+ // additional properties
+ int size = scriptExtensionsOffset - additionalVectorsOffset;
+ m_additionalVectors_ = new int[size];
+ for (int i = 0; i < size; i ++) {
+ m_additionalVectors_[i] = bytes.getInt();
+ }
+ }
+
+ // Script_Extensions
+ int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
+ if(numChars > 0) {
+ m_scriptExtensions_ = new char[numChars];
+ for(int i = 0; i < numChars; ++i) {
+ m_scriptExtensions_[i] = bytes.getChar();
+ }
+ }
}
+ private static final class IsAcceptable implements ICUBinary.Authenticate {
+ @Override
+ public boolean isDataVersionAcceptable(byte version[]) {
+ return version[0] == 7;
+ }
+ }
+
+ private static final int DATA_FORMAT = 0x5550726F; // "UPro"
+
public void upropsvec_addPropertyStarts(UnicodeSet set) {
/* add the start code point of each same-value range of the properties vectors trie */
if(m_additionalColumnsCount_>0) {
/* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
- TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
- RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
- while(propsVectorsIter.next(propsVectorsResult)){
- set.add(propsVectorsResult.start);
+ Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();
+ Trie2.Range range;
+ while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
+ set.add(range.startCodePoint);
}
}
}
+ // This static initializer block must be placed after other static member
+ // initialization. A load failure is rethrown with the IOException as its cause.
+ static {
+ try {
+ INSTANCE = new UCharacterProperty();
+ }
+ catch (IOException e) {
+ throw (MissingResourceException) new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,"").initCause(e);
+ }
+ }
+
+
+ // Moved from UProperty.java
+ /**
+ * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
+ * Used in UAX #9: Unicode Bidirectional Algorithm
+ * (http://www.unicode.org/reports/tr9/)
+ * Returns UCharacter.BidiPairedBracketType values.
+ * @stable ICU 52
+ */
+ public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;
+
}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterPropertyReader.java Tue Jul 14 16:29:08 2015 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
-
-package sun.text.normalizer;
-
-import java.io.DataInputStream;
-import java.io.InputStream;
-import java.io.IOException;
-
-/**
-* <p>Internal reader class for ICU data file uprops.icu containing
-* Unicode codepoint data.</p>
-* <p>This class simply reads uprops.icu, authenticates that it is a valid
-* ICU data file and split its contents up into blocks of data for use in
-* <a href=UCharacterProperty.html>com.ibm.icu.impl.UCharacterProperty</a>.
-* </p>
-* <p>uprops.icu which is in big-endian format is jared together with this
-* package.</p>
-*
-* Unicode character properties file format see
-* (ICU4C)/source/tools/genprops/store.c
-*
-* @author Syn Wee Quek
-* @since release 2.1, February 1st 2002
-*/
-final class UCharacterPropertyReader implements ICUBinary.Authenticate
-{
- // public methods ----------------------------------------------------
-
- public boolean isDataVersionAcceptable(byte version[])
- {
- return version[0] == DATA_FORMAT_VERSION_[0]
- && version[2] == DATA_FORMAT_VERSION_[2]
- && version[3] == DATA_FORMAT_VERSION_[3];
- }
-
- // protected constructor ---------------------------------------------
-
- /**
- * <p>Protected constructor.</p>
- * @param inputStream ICU uprop.dat file input stream
- * @exception IOException throw if data file fails authentication
- */
- protected UCharacterPropertyReader(InputStream inputStream)
- throws IOException
- {
- m_unicodeVersion_ = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_,
- this);
- m_dataInputStream_ = new DataInputStream(inputStream);
- }
-
- // protected methods -------------------------------------------------
-
- /**
- * <p>Reads uprops.icu, parse it into blocks of data to be stored in
- * UCharacterProperty.</P
- * @param ucharppty UCharacterProperty instance
- * @exception IOException thrown when data reading fails
- */
- protected void read(UCharacterProperty ucharppty) throws IOException
- {
- // read the indexes
- int count = INDEX_SIZE_;
- m_propertyOffset_ = m_dataInputStream_.readInt();
- count --;
- m_exceptionOffset_ = m_dataInputStream_.readInt();
- count --;
- m_caseOffset_ = m_dataInputStream_.readInt();
- count --;
- m_additionalOffset_ = m_dataInputStream_.readInt();
- count --;
- m_additionalVectorsOffset_ = m_dataInputStream_.readInt();
- count --;
- m_additionalColumnsCount_ = m_dataInputStream_.readInt();
- count --;
- m_reservedOffset_ = m_dataInputStream_.readInt();
- count --;
- m_dataInputStream_.skipBytes(3 << 2);
- count -= 3;
- ucharppty.m_maxBlockScriptValue_ = m_dataInputStream_.readInt();
- count --; // 10
- ucharppty.m_maxJTGValue_ = m_dataInputStream_.readInt();
- count --; // 11
- m_dataInputStream_.skipBytes(count << 2);
-
- // read the trie index block
- // m_props_index_ in terms of ints
- ucharppty.m_trie_ = new CharTrie(m_dataInputStream_, null);
-
- // skip the 32 bit properties block
- int size = m_exceptionOffset_ - m_propertyOffset_;
- m_dataInputStream_.skipBytes(size * 4);
-
- // reads the 32 bit exceptions block
- size = m_caseOffset_ - m_exceptionOffset_;
- m_dataInputStream_.skipBytes(size * 4);
-
- // reads the 32 bit case block
- size = (m_additionalOffset_ - m_caseOffset_) << 1;
- m_dataInputStream_.skipBytes(size * 2);
-
- if(m_additionalColumnsCount_ > 0) {
- // reads the additional property block
- ucharppty.m_additionalTrie_ = new CharTrie(m_dataInputStream_, null);
-
- // additional properties
- size = m_reservedOffset_ - m_additionalVectorsOffset_;
- ucharppty.m_additionalVectors_ = new int[size];
- for (int i = 0; i < size; i ++) {
- ucharppty.m_additionalVectors_[i] = m_dataInputStream_.readInt();
- }
- }
-
- m_dataInputStream_.close();
- ucharppty.m_additionalColumnsCount_ = m_additionalColumnsCount_;
- ucharppty.m_unicodeVersion_ = VersionInfo.getInstance(
- (int)m_unicodeVersion_[0], (int)m_unicodeVersion_[1],
- (int)m_unicodeVersion_[2], (int)m_unicodeVersion_[3]);
- }
-
- // private variables -------------------------------------------------
-
- /**
- * Index size
- */
- private static final int INDEX_SIZE_ = 16;
-
- /**
- * ICU data file input stream
- */
- private DataInputStream m_dataInputStream_;
-
- /**
- * Offset information in the indexes.
- */
- private int m_propertyOffset_;
- private int m_exceptionOffset_;
- private int m_caseOffset_;
- private int m_additionalOffset_;
- private int m_additionalVectorsOffset_;
- private int m_additionalColumnsCount_;
- private int m_reservedOffset_;
- private byte m_unicodeVersion_[];
-
- /**
- * Data format "UPro".
- */
- private static final byte DATA_FORMAT_ID_[] = {(byte)0x55, (byte)0x50,
- (byte)0x72, (byte)0x6F};
- /**
- * Format version; this code works with all versions with the same major
- * version number and the same Trie bit distribution.
- */
- private static final byte DATA_FORMAT_VERSION_[] = {(byte)0x5, (byte)0,
- (byte)Trie.INDEX_STAGE_1_SHIFT_,
- (byte)Trie.INDEX_STAGE_2_SHIFT_};
-}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/UTF16.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/UTF16.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,15 +22,10 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
-/*
+/**
*******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 1996-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
@@ -57,21 +52,21 @@
*
* // iteration forwards: Changes for UTF-32
* int ch;
- * for (int i = 0; i < s.length(); i+=UTF16.getCharCount(ch)) {
- * ch = UTF16.charAt(s,i);
+ * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
+ * ch = UTF16.charAt(s, i);
* doSomethingWith(ch);
* }
*
* // iteration backwards: Original
- * for (int i = s.length() -1; i >= 0; --i) {
+ * for (int i = s.length() - 1; i >= 0; --i) {
* char ch = s.charAt(i);
* doSomethingWith(ch);
* }
*
* // iteration backwards: Changes for UTF-32
* int ch;
- * for (int i = s.length() -1; i > 0; i-=UTF16.getCharCount(ch)) {
- * ch = UTF16.charAt(s,i);
+ * for (int i = s.length() - 1; i > 0; i -= UTF16.getCharCount(ch)) {
+ * ch = UTF16.charAt(s, i);
* doSomethingWith(ch);
* }
* }</pre>
@@ -93,7 +88,7 @@
* back if and only if <code>bounds(string, offset16) != TRAIL</code>.
* </li>
* <li>
- * <strong>Exceptions:</strong> The error checking will throw an exception
+ * <strong>Exceptions:</strong> The error checking will throw an exception
* if indices are out of bounds. Other than that, all methods will
* behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
* values are present. <code>UCharacter.isLegal()</code> can be used to check
@@ -106,10 +101,10 @@
* practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
* </li>
* <li>
- * <strong>Optimization:</strong> The method implementations may need
- * optimization if the compiler doesn't fold static final methods. Since
- * surrogate pairs will form an exceeding small percentage of all the text
- * in the world, the singleton case should always be optimized for.
+ * <strong>Optimization:</strong> The method implementations may need
+ * optimization if the compiler doesn't fold static final methods. Since
+ * surrogate pairs will form an exceedingly small percentage of all the text
+ * in the world, the singleton case should always be optimized for.
* </li>
* </ul>
* @author Mark Davis, with help from Markus Scherer
@@ -135,7 +130,7 @@
* The minimum value for Supplementary code points
* @stable ICU 2.1
*/
- public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
+ public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
/**
* Lead surrogate minimum value
* @stable ICU 2.1
@@ -161,7 +156,41 @@
* @stable ICU 2.1
*/
public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
+ /**
+ * Lead surrogate bitmask
+ */
+ private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
+ /**
+ * Trail surrogate bitmask
+ */
+ private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
+ /**
+ * Surrogate bitmask
+ */
+ private static final int SURROGATE_BITMASK = 0xFFFFF800;
+ /**
+ * Lead surrogate bits
+ */
+ private static final int LEAD_SURROGATE_BITS = 0xD800;
+ /**
+ * Trail surrogate bits
+ */
+ private static final int TRAIL_SURROGATE_BITS = 0xDC00;
+ /**
+ * Surrogate bits
+ */
+ private static final int SURROGATE_BITS = 0xD800;
+ // constructor --------------------------------------------------------
+
+ // /CLOVER:OFF
+ /**
+ * Prevent instance from being created.
+ */
+ private UTF16() {
+ }
+
+ // /CLOVER:ON
// public method ------------------------------------------------------
/**
@@ -222,7 +251,7 @@
}
/**
- * Extract a single UTF-32 value from a substring.
+ * Extract a single UTF-32 value from a string.
* Used when iterating forwards or backwards (with
* <code>UTF16.getCharCount()</code>, as well as random access. If a
* validity check is required, use
@@ -232,19 +261,72 @@
* character will be returned. If a complete supplementary character is
* not found the incomplete character will be returned
* @param source array of UTF-16 chars
- * @param start offset to substring in the source array for analyzing
- * @param limit offset to substring in the source array for analyzing
- * @param offset16 UTF-16 offset relative to start
+ * @param offset16 UTF-16 offset to the start of the character.
* @return UTF-32 value for the UTF-32 value that contains the char at
* offset16. The boundaries of that codepoint are the same as in
* <code>bounds32()</code>.
- * @exception IndexOutOfBoundsException thrown if offset16 is not within
- * the range of start and limit.
+ * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
* @stable ICU 2.1
*/
- public static int charAt(char source[], int start, int limit,
- int offset16)
- {
+ public static int charAt(CharSequence source, int offset16) {
+ char single = source.charAt(offset16);
+ if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
+ return single;
+ }
+ return _charAt(source, offset16, single);
+ }
+
+ private static int _charAt(CharSequence source, int offset16, char single) {
+ if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
+ return single;
+ }
+
+ // Convert the UTF-16 surrogate pair if necessary.
+ // For simplicity in usage, and because the frequency of pairs is
+ // low, look both directions.
+
+ if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
+ ++offset16;
+ if (source.length() != offset16) {
+ char trail = source.charAt(offset16);
+ if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
+ && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
+ return UCharacterProperty.getRawSupplementary(single, trail);
+ }
+ }
+ } else {
+ --offset16;
+ if (offset16 >= 0) {
+ // single is a trail surrogate, so look backward for its lead surrogate
+ char lead = source.charAt(offset16);
+ if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
+ && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
+ return UCharacterProperty.getRawSupplementary(lead, single);
+ }
+ }
+ }
+ return single; // return unmatched surrogate
+ }
+
+ /**
+ * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
+ * (with <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
+ * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
+ * </a></code>
+ * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
+ * character will be returned. If a complete supplementary character is not found the incomplete
+ * character will be returned
+ *
+ * @param source Array of UTF-16 chars
+ * @param start Offset to substring in the source array for analyzing
+ * @param limit Offset to substring in the source array for analyzing
+ * @param offset16 UTF-16 offset relative to start
+ * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
+ * of that codepoint are the same as in <code>bounds32()</code>.
+ * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
+ * @stable ICU 2.1
+ */
+ public static int charAt(char source[], int start, int limit, int offset16) {
offset16 += start;
if (offset16 < start || offset16 >= limit) {
throw new ArrayIndexOutOfBoundsException(offset16);
@@ -259,7 +341,7 @@
// For simplicity in usage, and because the frequency of pairs is
// low, look both directions.
if (single <= LEAD_SURROGATE_MAX_VALUE) {
- offset16 ++;
+ offset16++;
if (offset16 >= limit) {
return single;
}
@@ -272,7 +354,7 @@
if (offset16 == start) {
return single;
}
- offset16 --;
+ offset16--;
char lead = source[offset16];
if (isLeadSurrogate(lead))
return UCharacterProperty.getRawSupplementary(lead, single);
@@ -300,37 +382,34 @@
/**
* Determines whether the code value is a surrogate.
* @param char16 the input character.
- * @return true iff the input character is a surrogate.
+ * @return true if the input character is a surrogate.
* @stable ICU 2.1
*/
public static boolean isSurrogate(char char16)
{
- return LEAD_SURROGATE_MIN_VALUE <= char16 &&
- char16 <= TRAIL_SURROGATE_MAX_VALUE;
+ return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
}
/**
* Determines whether the character is a trail surrogate.
* @param char16 the input character.
- * @return true iff the input character is a trail surrogate.
+ * @return true if the input character is a trail surrogate.
* @stable ICU 2.1
*/
public static boolean isTrailSurrogate(char char16)
{
- return (TRAIL_SURROGATE_MIN_VALUE <= char16 &&
- char16 <= TRAIL_SURROGATE_MAX_VALUE);
+ return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
}
/**
* Determines whether the character is a lead surrogate.
* @param char16 the input character.
- * @return true iff the input character is a lead surrogate
+ * @return true if the input character is a lead surrogate
* @stable ICU 2.1
*/
public static boolean isLeadSurrogate(char char16)
{
- return LEAD_SURROGATE_MIN_VALUE <= char16 &&
- char16 <= LEAD_SURROGATE_MAX_VALUE;
+ return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
}
/**
@@ -359,7 +438,7 @@
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
* on char32 before calling.
* @param char32 the input character.
- * @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise
+ * @return the trail surrogate if the getCharCount(ch) is 2; <br> otherwise
* the character itself
* @stable ICU 2.1
*/
@@ -370,7 +449,7 @@
(char32 & TRAIL_SURROGATE_MASK_));
}
- return (char)char32;
+ return (char) char32;
}
/**
@@ -415,16 +494,15 @@
// Write the UTF-16 values
if (char32 >= SUPPLEMENTARY_MIN_VALUE)
{
- target.append(getLeadSurrogate(char32));
- target.append(getTrailSurrogate(char32));
- }
+ target.append(getLeadSurrogate(char32));
+ target.append(getTrailSurrogate(char32));
+ }
else {
- target.append((char)char32);
+ target.append((char) char32);
}
return target;
}
- //// for StringPrep
/**
* Shifts offset16 by the argument number of codepoints within a subarray.
* @param source char array
@@ -441,20 +519,20 @@
public static int moveCodePointOffset(char source[], int start, int limit,
int offset16, int shift32)
{
- int size = source.length;
- int count;
- char ch;
- int result = offset16 + start;
- if (start<0 || limit<start) {
+ int size = source.length;
+ int count;
+ char ch;
+ int result = offset16 + start;
+ if (start < 0 || limit < start) {
throw new StringIndexOutOfBoundsException(start);
}
- if (limit>size) {
+ if (limit > size) {
throw new StringIndexOutOfBoundsException(limit);
}
- if (offset16<0 || result>limit) {
+ if (offset16 < 0 || result > limit) {
throw new StringIndexOutOfBoundsException(offset16);
}
- if (shift32 > 0 ) {
+ if (shift32 > 0) {
if (shift32 + result > size) {
throw new StringIndexOutOfBoundsException(result);
}
@@ -462,29 +540,29 @@
while (result < limit && count > 0)
{
ch = source[result];
- if (isLeadSurrogate(ch) && (result+1 < limit) &&
- isTrailSurrogate(source[result+1])) {
- result ++;
+ if (isLeadSurrogate(ch) && (result + 1 < limit) &&
+ isTrailSurrogate(source[result + 1])) {
+ result++;
}
- count --;
- result ++;
+ count--;
+ result++;
}
} else {
if (result + shift32 < start) {
throw new StringIndexOutOfBoundsException(result);
}
- for (count=-shift32; count>0; count--) {
+ for (count = -shift32; count > 0; count--) {
result--;
- if (result<start) {
+ if (result < start) {
break;
}
ch = source[result];
- if (isTrailSurrogate(ch) && result>start && isLeadSurrogate(source[result-1])) {
+ if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
result--;
}
}
}
- if (count != 0) {
+ if (count != 0) {
throw new StringIndexOutOfBoundsException(shift32);
}
result -= start;
@@ -501,7 +579,7 @@
/**
* Mask to retrieve the significant value from a trail surrogate.
*/
- private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
+ private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
/**
* Value that all lead surrogate starts with
@@ -509,7 +587,7 @@
private static final int LEAD_SURROGATE_OFFSET_ =
LEAD_SURROGATE_MIN_VALUE -
(SUPPLEMENTARY_MIN_VALUE
- >> LEAD_SURROGATE_SHIFT_);
+ >> LEAD_SURROGATE_SHIFT_);
// private methods ------------------------------------------------------
@@ -527,7 +605,7 @@
private static String toString(int ch)
{
if (ch < SUPPLEMENTARY_MIN_VALUE) {
- return String.valueOf((char)ch);
+ return String.valueOf((char) ch);
}
StringBuilder result = new StringBuilder();
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeMatcher.java Tue Jul 14 16:29:08 2015 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
-
-package sun.text.normalizer;
-
-/**
- * <code>UnicodeMatcher</code> defines a protocol for objects that can
- * match a range of characters in a Replaceable string.
- * @stable ICU 2.0
- */
-public interface UnicodeMatcher {
-
- /**
- * The character at index {@code i}, where
- * {@code i < contextStart || i >= contextLimit},
- * is ETHER. This allows explicit matching by rules and UnicodeSets
- * of text outside the context. In traditional terms, this allows anchoring
- * at the start and/or end.
- * @stable ICU 2.0
- */
- static final char ETHER = '\uFFFF';
-
-}
-
-//eof
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeSet.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeSet.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,29 +22,31 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
*******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 1996-2015, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
-
package sun.text.normalizer;
+import java.io.IOException;
import java.text.ParsePosition;
-import java.util.Iterator;
+import java.util.ArrayList;
import java.util.TreeSet;
/**
- * A mutable set of Unicode characters and multicharacter strings. Objects of this class
- * represent <em>character classes</em> used in regular expressions.
- * A character specifies a subset of Unicode code points. Legal
- * code points are U+0000 to U+10FFFF, inclusive.
+ * A mutable set of Unicode characters and multicharacter strings.
+ * Objects of this class represent <em>character classes</em> used
+ * in regular expressions. A character specifies a subset of Unicode
+ * code points. Legal code points are U+0000 to U+10FFFF, inclusive.
+ *
+ * Note: method freeze() will not only make the set immutable, but
+ * also make important methods much faster:
+ * contains(c), containsNone(...), span(...), spanBack(...) etc.
+ * After the object is frozen, any subsequent call that wants to change
+ * the object will throw UnsupportedOperationException.
*
* <p>The UnicodeSet class is not designed to be subclassed.
*
@@ -118,7 +120,7 @@
* </blockquote>
*
* Any character may be preceded by a backslash in order to remove any special
- * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
+ * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are
* ignored, unless they are escaped.
*
* <p>Property patterns specify a set of characters having a certain
@@ -267,18 +269,24 @@
* </tr>
* </table>
* </blockquote>
- * <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
+ * <p>To iterate over contents of UnicodeSet, the following are available:
+ * <ul><li>{@link #ranges()} to iterate through the ranges</li>
+ * <li>{@link #strings()} to iterate through the strings</li>
+ * <li>{@link #iterator()} to iterate through the entire contents in a single loop.
+ * That method is, however, not particularly efficient, since it "boxes" each code point into a String.
+ * </ul>
+ * All of the above can be used in <b>for</b> loops.
+ * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
+ * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
*
* @author Alan Liu
* @stable ICU 2.0
- * @see UnicodeSetIterator
*/
-@SuppressWarnings("deprecation")
-public class UnicodeSet implements UnicodeMatcher {
+class UnicodeSet {
private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
- // 110000 for codepoints
+ // 110000 for codepoints
/**
* Minimum value that can be stored in a UnicodeSet.
@@ -299,7 +307,7 @@
// NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
// is not private so that UnicodeSetIterator can get access
- TreeSet<String> strings = new TreeSet<>();
+ TreeSet<String> strings = new TreeSet<String>();
/**
* The pattern representation of this set. This may not be the
@@ -310,18 +318,14 @@
* indicating that toPattern() must generate a pattern
* representation from the inversion list.
*/
- private String pat = null;
private static final int START_EXTRA = 16; // initial storage. Must be >= 0
private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
- /**
- * A set of all characters _except_ the second through last characters of
- * certain ranges. These ranges are ranges of characters whose
- * properties are all exactly alike, e.g. CJK Ideographs from
- * U+4E00 to U+9FA5.
- */
- private static UnicodeSet INCLUSIONS[] = null;
+ private static UnicodeSet INCLUSION = null;
+
+ private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null.
+ private volatile UnicodeSetStringSpan stringSpan;
//----------------------------------------------------------------
// Public API
@@ -331,14 +335,22 @@
* Constructs an empty set.
* @stable ICU 2.0
*/
- public UnicodeSet() {
+ private UnicodeSet() {
list = new int[1 + START_EXTRA];
list[len++] = HIGH;
}
/**
- * Constructs a set containing the given range.
- * If {@code end > start} then an empty set is created.
+ * Constructs a copy of an existing set.
+ * @stable ICU 2.0
+ */
+ private UnicodeSet(UnicodeSet other) {
+ set(other);
+ }
+
+ /**
+ * Constructs a set containing the given range. If <code>end >
+ * start</code> then an empty set is created.
*
* @param start first character, inclusive, of range
* @param end last character, inclusive, of range
@@ -359,7 +371,7 @@
*/
public UnicodeSet(String pattern) {
this();
- applyPattern(pattern, null, null, IGNORE_SPACE);
+ applyPattern(pattern, null);
}
/**
@@ -368,172 +380,29 @@
* copied to this object
* @stable ICU 2.0
*/
- @SuppressWarnings("unchecked") // Casting result of clone of a collection
public UnicodeSet set(UnicodeSet other) {
+ checkFrozen();
list = other.list.clone();
len = other.len;
- pat = other.pat;
- strings = (TreeSet)other.strings.clone();
+ strings = new TreeSet<String>(other.strings);
return this;
}
/**
- * Modifies this set to represent the set specified by the given pattern.
- * See the class description for the syntax of the pattern language.
- * Whitespace is ignored.
- * @param pattern a string specifying what characters are in the set
- * @exception java.lang.IllegalArgumentException if the pattern
- * contains a syntax error.
+ * Returns the number of elements in this set (its cardinality).
+ * Note that the elements of a set may include both individual
+ * codepoints and strings.
+ *
+ * @return the number of elements in this set (its cardinality).
* @stable ICU 2.0
*/
- public final UnicodeSet applyPattern(String pattern) {
- return applyPattern(pattern, null, null, IGNORE_SPACE);
- }
-
- /**
- * Append the <code>toPattern()</code> representation of a
- * string to the given <code>StringBuffer</code>.
- */
- private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
- for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
- _appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable);
- }
- }
-
- /**
- * Append the <code>toPattern()</code> representation of a
- * character to the given <code>StringBuffer</code>.
- */
- private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
- if (escapeUnprintable && Utility.isUnprintable(c)) {
- // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
- // unprintable
- if (Utility.escapeUnprintable(buf, c)) {
- return;
- }
- }
- // Okay to let ':' pass through
- switch (c) {
- case '[': // SET_OPEN:
- case ']': // SET_CLOSE:
- case '-': // HYPHEN:
- case '^': // COMPLEMENT:
- case '&': // INTERSECTION:
- case '\\': //BACKSLASH:
- case '{':
- case '}':
- case '$':
- case ':':
- buf.append('\\');
- break;
- default:
- // Escape whitespace
- if (UCharacterProperty.isRuleWhiteSpace(c)) {
- buf.append('\\');
- }
- break;
+ public int size() {
+ int n = 0;
+ int count = getRangeCount();
+ for (int i = 0; i < count; ++i) {
+ n += getRangeEnd(i) - getRangeStart(i) + 1;
}
- UTF16.append(buf, c);
- }
-
- /**
- * Append a string representation of this set to result. This will be
- * a cleaned version of the string passed to applyPattern(), if there
- * is one. Otherwise it will be generated.
- */
- private StringBuffer _toPattern(StringBuffer result,
- boolean escapeUnprintable) {
- if (pat != null) {
- int i;
- int backslashCount = 0;
- for (i=0; i<pat.length(); ) {
- int c = UTF16.charAt(pat, i);
- i += UTF16.getCharCount(c);
- if (escapeUnprintable && Utility.isUnprintable(c)) {
- // If the unprintable character is preceded by an odd
- // number of backslashes, then it has been escaped.
- // Before unescaping it, we delete the final
- // backslash.
- if ((backslashCount % 2) == 1) {
- result.setLength(result.length() - 1);
- }
- Utility.escapeUnprintable(result, c);
- backslashCount = 0;
- } else {
- UTF16.append(result, c);
- if (c == '\\') {
- ++backslashCount;
- } else {
- backslashCount = 0;
- }
- }
- }
- return result;
- }
-
- return _generatePattern(result, escapeUnprintable, true);
- }
-
- /**
- * Generate and append a string representation of this set to result.
- * This does not use this.pat, the cleaned up copy of the string
- * passed to applyPattern().
- * @param includeStrings if false, doesn't include the strings.
- * @stable ICU 3.8
- */
- public StringBuffer _generatePattern(StringBuffer result,
- boolean escapeUnprintable, boolean includeStrings) {
- result.append('[');
-
- int count = getRangeCount();
-
- // If the set contains at least 2 intervals and includes both
- // MIN_VALUE and MAX_VALUE, then the inverse representation will
- // be more economical.
- if (count > 1 &&
- getRangeStart(0) == MIN_VALUE &&
- getRangeEnd(count-1) == MAX_VALUE) {
-
- // Emit the inverse
- result.append('^');
-
- for (int i = 1; i < count; ++i) {
- int start = getRangeEnd(i-1)+1;
- int end = getRangeStart(i)-1;
- _appendToPat(result, start, escapeUnprintable);
- if (start != end) {
- if ((start+1) != end) {
- result.append('-');
- }
- _appendToPat(result, end, escapeUnprintable);
- }
- }
- }
-
- // Default; emit the ranges as pairs
- else {
- for (int i = 0; i < count; ++i) {
- int start = getRangeStart(i);
- int end = getRangeEnd(i);
- _appendToPat(result, start, escapeUnprintable);
- if (start != end) {
- if ((start+1) != end) {
- result.append('-');
- }
- _appendToPat(result, end, escapeUnprintable);
- }
- }
- }
-
- if (includeStrings && strings.size() > 0) {
- Iterator<String> it = strings.iterator();
- while (it.hasNext()) {
- result.append('{');
- _appendToPat(result, it.next(), escapeUnprintable);
- result.append('}');
- }
- }
- return result.append(']');
+ return n + strings.size();
}
// for internal use, after checkFrozen has been called
@@ -559,6 +428,7 @@
* @stable ICU 2.0
*/
public final UnicodeSet add(int c) {
+ checkFrozen();
return add_unchecked(c);
}
@@ -643,7 +513,6 @@
len += 2;
}
- pat = null;
return this;
}
@@ -657,11 +526,11 @@
* @return this object, for chaining
* @stable ICU 2.0
*/
- public final UnicodeSet add(String s) {
+ public final UnicodeSet add(CharSequence s) {
+ checkFrozen();
int cp = getSingleCP(s);
if (cp < 0) {
- strings.add(s);
- pat = null;
+ strings.add(s.toString());
} else {
add_unchecked(cp, cp);
}
@@ -669,11 +538,13 @@
}
/**
+ * Utility for getting code point from single code point CharSequence.
+ * See the public UTF16.getSingleCodePoint()
* @return a code point IF the string consists of a single one.
* otherwise returns -1.
- * @param string to test
+ * @param s the CharSequence to test
*/
- private static int getSingleCP(String s) {
+ private static int getSingleCP(CharSequence s) {
if (s.length() < 1) {
throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
}
@@ -701,6 +572,7 @@
* @stable ICU 2.0
*/
public UnicodeSet complement(int start, int end) {
+ checkFrozen();
if (start < MIN_VALUE || start > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
}
@@ -710,26 +582,6 @@
if (start <= end) {
xor(range(start, end), 2, 0);
}
- pat = null;
- return this;
- }
-
- /**
- * This is equivalent to
- * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
- * @stable ICU 2.0
- */
- public UnicodeSet complement() {
- if (list[0] == LOW) {
- System.arraycopy(list, 1, list, 0, len-1);
- --len;
- } else {
- ensureCapacity(len+1);
- System.arraycopy(list, 0, list, 1, len);
- list[0] = LOW;
- ++len;
- }
- pat = null;
return this;
}
@@ -743,6 +595,12 @@
if (c < MIN_VALUE || c > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
}
+ if (bmpSet != null) {
+ return bmpSet.contains(c);
+ }
+ if (stringSpan != null) {
+ return stringSpan.contains(c);
+ }
/*
// Set i to the index of the start item greater than ch
@@ -751,7 +609,7 @@
while (true) {
if (c < list[++i]) break;
}
- */
+ */
int i = findCodePoint(c);
@@ -790,7 +648,7 @@
// invariant: c < list[hi]
for (;;) {
int i = (lo + hi) >>> 1;
- if (i == lo) return hi;
+ if (i == lo) return hi;
if (c < list[i]) {
hi = i;
} else {
@@ -800,22 +658,6 @@
}
/**
- * Adds all of the elements in the specified set to this set if
- * they're not already present. This operation effectively
- * modifies this set so that its value is the <i>union</i> of the two
- * sets. The behavior of this operation is unspecified if the specified
- * collection is modified while the operation is in progress.
- *
- * @param c set whose elements are to be added to this set.
- * @stable ICU 2.0
- */
- public UnicodeSet addAll(UnicodeSet c) {
- add(c.list, c.len, 0);
- strings.addAll(c.strings);
- return this;
- }
-
- /**
* Retains only the elements in this set that are contained in the
* specified set. In other words, removes from this set all of
* its elements that are not contained in the specified set. This
@@ -826,36 +668,21 @@
* @stable ICU 2.0
*/
public UnicodeSet retainAll(UnicodeSet c) {
+ checkFrozen();
retain(c.list, c.len, 0);
strings.retainAll(c.strings);
return this;
}
/**
- * Removes from this set all of its elements that are contained in the
- * specified set. This operation effectively modifies this
- * set so that its value is the <i>asymmetric set difference</i> of
- * the two sets.
- *
- * @param c set that defines which elements will be removed from
- * this set.
- * @stable ICU 2.0
- */
- public UnicodeSet removeAll(UnicodeSet c) {
- retain(c.list, c.len, 2);
- strings.removeAll(c.strings);
- return this;
- }
-
- /**
* Removes all of the elements from this set. This set will be
* empty after this call returns.
* @stable ICU 2.0
*/
public UnicodeSet clear() {
+ checkFrozen();
list[0] = HIGH;
len = 1;
- pat = null;
strings.clear();
return this;
}
@@ -923,405 +750,18 @@
* of <code>pattern</code>
* @exception java.lang.IllegalArgumentException if the parse fails.
*/
- UnicodeSet applyPattern(String pattern,
- ParsePosition pos,
- SymbolTable symbols,
- int options) {
-
- // Need to build the pattern in a temporary string because
- // _applyPattern calls add() etc., which set pat to empty.
- boolean parsePositionWasNull = pos == null;
- if (parsePositionWasNull) {
- pos = new ParsePosition(0);
- }
-
- StringBuffer rebuiltPat = new StringBuffer();
- RuleCharacterIterator chars =
- new RuleCharacterIterator(pattern, symbols, pos);
- applyPattern(chars, symbols, rebuiltPat, options);
- if (chars.inVariable()) {
- syntaxError(chars, "Extra chars in variable value");
- }
- pat = rebuiltPat.toString();
- if (parsePositionWasNull) {
- int i = pos.getIndex();
-
- // Skip over trailing whitespace
- if ((options & IGNORE_SPACE) != 0) {
- i = Utility.skipWhitespace(pattern, i);
- }
-
- if (i != pattern.length()) {
- throw new IllegalArgumentException("Parse of \"" + pattern +
- "\" failed at " + i);
- }
- }
- return this;
- }
-
- /**
- * Parse the pattern from the given RuleCharacterIterator. The
- * iterator is advanced over the parsed pattern.
- * @param chars iterator over the pattern characters. Upon return
- * it will be advanced to the first character after the parsed
- * pattern, or the end of the iteration if all characters are
- * parsed.
- * @param symbols symbol table to use to parse and dereference
- * variables, or null if none.
- * @param rebuiltPat the pattern that was parsed, rebuilt or
- * copied from the input pattern, as appropriate.
- * @param options a bit mask of zero or more of the following:
- * IGNORE_SPACE, CASE.
- */
- void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
- StringBuffer rebuiltPat, int options) {
- // Syntax characters: [ ] ^ - & { }
-
- // Recognized special forms for chars, sets: c-c s-s s&s
-
- int opts = RuleCharacterIterator.PARSE_VARIABLES |
- RuleCharacterIterator.PARSE_ESCAPES;
- if ((options & IGNORE_SPACE) != 0) {
- opts |= RuleCharacterIterator.SKIP_WHITESPACE;
+ private UnicodeSet applyPattern(String pattern,
+ ParsePosition pos) {
+ if ("[:age=3.2:]".equals(pattern)) {
+ checkFrozen();
+ VersionInfo version = VersionInfo.getInstance("3.2");
+ applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
+ } else {
+ throw new IllegalStateException("UnicodeSet.applyPattern(unexpected pattern "
+ + pattern + ")");
}
- StringBuffer patBuf = new StringBuffer(), buf = null;
- boolean usePat = false;
- UnicodeSet scratch = null;
- Object backup = null;
-
- // mode: 0=before [, 1=between [...], 2=after ]
- // lastItem: 0=none, 1=char, 2=set
- int lastItem = 0, lastChar = 0, mode = 0;
- char op = 0;
-
- boolean invert = false;
-
- clear();
-
- while (mode != 2 && !chars.atEnd()) {
- if (false) {
- // Debugging assertion
- if (!((lastItem == 0 && op == 0) ||
- (lastItem == 1 && (op == 0 || op == '-')) ||
- (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
- throw new IllegalArgumentException();
- }
- }
-
- int c = 0;
- boolean literal = false;
- UnicodeSet nested = null;
-
- // -------- Check for property pattern
-
- // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
- int setMode = 0;
- if (resemblesPropertyPattern(chars, opts)) {
- setMode = 2;
- }
-
- // -------- Parse '[' of opening delimiter OR nested set.
- // If there is a nested set, use `setMode' to define how
- // the set should be parsed. If the '[' is part of the
- // opening delimiter for this pattern, parse special
- // strings "[", "[^", "[-", and "[^-". Check for stand-in
- // characters representing a nested set in the symbol
- // table.
-
- else {
- // Prepare to backup if necessary
- backup = chars.getPos(backup);
- c = chars.next(opts);
- literal = chars.isEscaped();
-
- if (c == '[' && !literal) {
- if (mode == 1) {
- chars.setPos(backup); // backup
- setMode = 1;
- } else {
- // Handle opening '[' delimiter
- mode = 1;
- patBuf.append('[');
- backup = chars.getPos(backup); // prepare to backup
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == '^' && !literal) {
- invert = true;
- patBuf.append('^');
- backup = chars.getPos(backup); // prepare to backup
- c = chars.next(opts);
- literal = chars.isEscaped();
- }
- // Fall through to handle special leading '-';
- // otherwise restart loop for nested [], \p{}, etc.
- if (c == '-') {
- literal = true;
- // Fall through to handle literal '-' below
- } else {
- chars.setPos(backup); // backup
- continue;
- }
- }
- } else if (symbols != null) {
- UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
- if (m != null) {
- try {
- nested = (UnicodeSet) m;
- setMode = 3;
- } catch (ClassCastException e) {
- syntaxError(chars, "Syntax error");
- }
- }
- }
- }
-
- // -------- Handle a nested set. This either is inline in
- // the pattern or represented by a stand-in that has
- // previously been parsed and was looked up in the symbol
- // table.
-
- if (setMode != 0) {
- if (lastItem == 1) {
- if (op != 0) {
- syntaxError(chars, "Char expected after operator");
- }
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- lastItem = op = 0;
- }
-
- if (op == '-' || op == '&') {
- patBuf.append(op);
- }
-
- if (nested == null) {
- if (scratch == null) scratch = new UnicodeSet();
- nested = scratch;
- }
- switch (setMode) {
- case 1:
- nested.applyPattern(chars, symbols, patBuf, options);
- break;
- case 2:
- chars.skipIgnored(opts);
- nested.applyPropertyPattern(chars, patBuf, symbols);
- break;
- case 3: // `nested' already parsed
- nested._toPattern(patBuf, false);
- break;
- }
-
- usePat = true;
-
- if (mode == 0) {
- // Entire pattern is a category; leave parse loop
- set(nested);
- mode = 2;
- break;
- }
-
- switch (op) {
- case '-':
- removeAll(nested);
- break;
- case '&':
- retainAll(nested);
- break;
- case 0:
- addAll(nested);
- break;
- }
-
- op = 0;
- lastItem = 2;
-
- continue;
- }
-
- if (mode == 0) {
- syntaxError(chars, "Missing '['");
- }
-
- // -------- Parse special (syntax) characters. If the
- // current character is not special, or if it is escaped,
- // then fall through and handle it below.
-
- if (!literal) {
- switch (c) {
- case ']':
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- // Treat final trailing '-' as a literal
- if (op == '-') {
- add_unchecked(op, op);
- patBuf.append(op);
- } else if (op == '&') {
- syntaxError(chars, "Trailing '&'");
- }
- patBuf.append(']');
- mode = 2;
- continue;
- case '-':
- if (op == 0) {
- if (lastItem != 0) {
- op = (char) c;
- continue;
- } else {
- // Treat final trailing '-' as a literal
- add_unchecked(c, c);
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == ']' && !literal) {
- patBuf.append("-]");
- mode = 2;
- continue;
- }
- }
- }
- syntaxError(chars, "'-' not after char or set");
- break;
- case '&':
- if (lastItem == 2 && op == 0) {
- op = (char) c;
- continue;
- }
- syntaxError(chars, "'&' not after set");
- break;
- case '^':
- syntaxError(chars, "'^' not after '['");
- break;
- case '{':
- if (op != 0) {
- syntaxError(chars, "Missing operand after operator");
- }
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- lastItem = 0;
- if (buf == null) {
- buf = new StringBuffer();
- } else {
- buf.setLength(0);
- }
- boolean ok = false;
- while (!chars.atEnd()) {
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == '}' && !literal) {
- ok = true;
- break;
- }
- UTF16.append(buf, c);
- }
- if (buf.length() < 1 || !ok) {
- syntaxError(chars, "Invalid multicharacter string");
- }
- // We have new string. Add it to set and continue;
- // we don't need to drop through to the further
- // processing
- add(buf.toString());
- patBuf.append('{');
- _appendToPat(patBuf, buf.toString(), false);
- patBuf.append('}');
- continue;
- case SymbolTable.SYMBOL_REF:
- // symbols nosymbols
- // [a-$] error error (ambiguous)
- // [a$] anchor anchor
- // [a-$x] var "x"* literal '$'
- // [a-$.] error literal '$'
- // *We won't get here in the case of var "x"
- backup = chars.getPos(backup);
- c = chars.next(opts);
- literal = chars.isEscaped();
- boolean anchor = (c == ']' && !literal);
- if (symbols == null && !anchor) {
- c = SymbolTable.SYMBOL_REF;
- chars.setPos(backup);
- break; // literal '$'
- }
- if (anchor && op == 0) {
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- add_unchecked(UnicodeMatcher.ETHER);
- usePat = true;
- patBuf.append(SymbolTable.SYMBOL_REF).append(']');
- mode = 2;
- continue;
- }
- syntaxError(chars, "Unquoted '$'");
- break;
- default:
- break;
- }
- }
-
- // -------- Parse literal characters. This includes both
- // escaped chars ("\u4E01") and non-syntax characters
- // ("a").
-
- switch (lastItem) {
- case 0:
- lastItem = 1;
- lastChar = c;
- break;
- case 1:
- if (op == '-') {
- if (lastChar >= c) {
- // Don't allow redundant (a-a) or empty (b-a) ranges;
- // these are most likely typos.
- syntaxError(chars, "Invalid range");
- }
- add_unchecked(lastChar, c);
- _appendToPat(patBuf, lastChar, false);
- patBuf.append(op);
- _appendToPat(patBuf, c, false);
- lastItem = op = 0;
- } else {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- lastChar = c;
- }
- break;
- case 2:
- if (op != 0) {
- syntaxError(chars, "Set expected after operator");
- }
- lastChar = c;
- lastItem = 1;
- break;
- }
- }
-
- if (mode != 2) {
- syntaxError(chars, "Missing ']'");
- }
-
- chars.skipIgnored(opts);
-
- if (invert) {
- complement();
- }
-
- // Use the rebuilt pattern (pat) only if necessary. Prefer the
- // generated pattern.
- if (usePat) {
- rebuiltPat.append(patBuf.toString());
- } else {
- _generatePattern(rebuiltPat, false, true);
- }
- }
-
- private static void syntaxError(RuleCharacterIterator chars, String msg) {
- throw new IllegalArgumentException("Error: " + msg + " at \"" +
- Utility.escape(chars.toString()) +
- '"');
+ return this;
}
//----------------------------------------------------------------
@@ -1397,7 +837,6 @@
int[] temp = list;
list = buffer;
buffer = temp;
- pat = null;
return this;
}
@@ -1414,88 +853,87 @@
// change from xor is that we have to check overlapping pairs
// polarity bit 1 means a is second, bit 2 means b is.
main:
- while (true) {
- switch (polarity) {
- case 0: // both first; take lower if unequal
- if (a < b) { // take a
- // Back up over overlapping ranges in buffer[]
- if (k > 0 && a <= buffer[k-1]) {
- // Pick latter end value in buffer[] vs. list[]
- a = max(list[i], buffer[--k]);
- } else {
- // No overlap
- buffer[k++] = a;
- a = list[i];
+ while (true) {
+ switch (polarity) {
+ case 0: // both first; take lower if unequal
+ if (a < b) { // take a
+ // Back up over overlapping ranges in buffer[]
+ if (k > 0 && a <= buffer[k-1]) {
+ // Pick latter end value in buffer[] vs. list[]
+ a = max(list[i], buffer[--k]);
+ } else {
+ // No overlap
+ buffer[k++] = a;
+ a = list[i];
+ }
+ i++; // Common if/else code factored out
+ polarity ^= 1;
+ } else if (b < a) { // take b
+ if (k > 0 && b <= buffer[k-1]) {
+ b = max(other[j], buffer[--k]);
+ } else {
+ buffer[k++] = b;
+ b = other[j];
+ }
+ j++;
+ polarity ^= 2;
+ } else { // a == b, take a, drop b
+ if (a == HIGH) break main;
+ // This is symmetrical; it doesn't matter if
+ // we backtrack with a or b. - liu
+ if (k > 0 && a <= buffer[k-1]) {
+ a = max(list[i], buffer[--k]);
+ } else {
+ // No overlap
+ buffer[k++] = a;
+ a = list[i];
+ }
+ i++;
+ polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
}
- i++; // Common if/else code factored out
- polarity ^= 1;
- } else if (b < a) { // take b
- if (k > 0 && b <= buffer[k-1]) {
- b = max(other[j], buffer[--k]);
- } else {
+ break;
+ case 3: // both second; take higher if unequal, and drop other
+ if (b <= a) { // take a
+ if (a == HIGH) break main;
+ buffer[k++] = a;
+ } else { // take b
+ if (b == HIGH) break main;
buffer[k++] = b;
- b = other[j];
}
- j++;
- polarity ^= 2;
- } else { // a == b, take a, drop b
- if (a == HIGH) break main;
- // This is symmetrical; it doesn't matter if
- // we backtrack with a or b. - liu
- if (k > 0 && a <= buffer[k-1]) {
- a = max(list[i], buffer[--k]);
- } else {
- // No overlap
- buffer[k++] = a;
- a = list[i];
- }
- i++;
- polarity ^= 1;
+ a = list[i++]; polarity ^= 1; // factored common code
b = other[j++]; polarity ^= 2;
- }
- break;
- case 3: // both second; take higher if unequal, and drop other
- if (b <= a) { // take a
- if (a == HIGH) break main;
- buffer[k++] = a;
- } else { // take b
- if (b == HIGH) break main;
- buffer[k++] = b;
+ break;
+ case 1: // a second, b first; if b < a, overlap
+ if (a < b) { // no overlap, take a
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // OVERLAP, drop b
+ b = other[j++]; polarity ^= 2;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 2: // a first, b second; if a < b, overlap
+ if (b < a) { // no overlap, take b
+ buffer[k++] = b; b = other[j++]; polarity ^= 2;
+ } else if (a < b) { // OVERLAP, drop a
+ a = list[i++]; polarity ^= 1;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
}
- a = list[i++]; polarity ^= 1; // factored common code
- b = other[j++]; polarity ^= 2;
- break;
- case 1: // a second, b first; if b < a, overlap
- if (a < b) { // no overlap, take a
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- } else if (b < a) { // OVERLAP, drop b
- b = other[j++]; polarity ^= 2;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 2: // a first, b second; if a < b, overlap
- if (b < a) { // no overlap, take b
- buffer[k++] = b; b = other[j++]; polarity ^= 2;
- } else if (a < b) { // OVERLAP, drop a
- a = list[i++]; polarity ^= 1;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
}
- }
buffer[k++] = HIGH; // terminate
len = k;
// swap list and buffer
int[] temp = list;
list = buffer;
buffer = temp;
- pat = null;
return this;
}
@@ -1512,61 +950,60 @@
// change from xor is that we have to check overlapping pairs
// polarity bit 1 means a is second, bit 2 means b is.
main:
- while (true) {
- switch (polarity) {
- case 0: // both first; drop the smaller
- if (a < b) { // drop a
- a = list[i++]; polarity ^= 1;
- } else if (b < a) { // drop b
- b = other[j++]; polarity ^= 2;
- } else { // a == b, take one, drop other
- if (a == HIGH) break main;
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 3: // both second; take lower if unequal
- if (a < b) { // take a
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- } else if (b < a) { // take b
- buffer[k++] = b; b = other[j++]; polarity ^= 2;
- } else { // a == b, take one, drop other
- if (a == HIGH) break main;
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
+ while (true) {
+ switch (polarity) {
+ case 0: // both first; drop the smaller
+ if (a < b) { // drop a
+ a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // drop b
+ b = other[j++]; polarity ^= 2;
+ } else { // a == b, take one, drop other
+ if (a == HIGH) break main;
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 3: // both second; take lower if unequal
+ if (a < b) { // take a
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // take b
+ buffer[k++] = b; b = other[j++]; polarity ^= 2;
+ } else { // a == b, take one, drop other
+ if (a == HIGH) break main;
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 1: // a second, b first;
+ if (a < b) { // NO OVERLAP, drop a
+ a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // OVERLAP, take b
+ buffer[k++] = b; b = other[j++]; polarity ^= 2;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 2: // a first, b second; if a < b, overlap
+ if (b < a) { // no overlap, drop b
+ b = other[j++]; polarity ^= 2;
+ } else if (a < b) { // OVERLAP, take a
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
}
- break;
- case 1: // a second, b first;
- if (a < b) { // NO OVERLAP, drop a
- a = list[i++]; polarity ^= 1;
- } else if (b < a) { // OVERLAP, take b
- buffer[k++] = b; b = other[j++]; polarity ^= 2;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 2: // a first, b second; if a < b, overlap
- if (b < a) { // no overlap, drop b
- b = other[j++]; polarity ^= 2;
- } else if (a < b) { // OVERLAP, take a
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
}
- }
buffer[k++] = HIGH; // terminate
len = k;
// swap list and buffer
int[] temp = list;
list = buffer;
buffer = temp;
- pat = null;
return this;
}
@@ -1582,58 +1019,46 @@
boolean contains(int codePoint);
}
- // VersionInfo for unassigned characters
- static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
+ private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
private static class VersionFilter implements Filter {
VersionInfo version;
-
VersionFilter(VersionInfo version) { this.version = version; }
-
public boolean contains(int ch) {
VersionInfo v = UCharacter.getAge(ch);
// Reference comparison ok; VersionInfo caches and reuses
// unique objects.
return v != NO_VERSION &&
- v.compareTo(version) <= 0;
+ v.compareTo(version) <= 0;
}
}
private static synchronized UnicodeSet getInclusions(int src) {
- if (INCLUSIONS == null) {
- INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
+ if (src != UCharacterProperty.SRC_PROPSVEC) {
+ throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
}
- if(INCLUSIONS[src] == null) {
+
+ if (INCLUSION == null) {
UnicodeSet incl = new UnicodeSet();
- switch(src) {
- case UCharacterProperty.SRC_PROPSVEC:
- UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
- break;
- default:
- throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
- }
- INCLUSIONS[src] = incl;
+ UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
+ INCLUSION = incl;
}
- return INCLUSIONS[src];
+ return INCLUSION;
}
/**
* Generic filter-based scanning code for UCD property UnicodeSets.
*/
private UnicodeSet applyFilter(Filter filter, int src) {
- // Walk through all Unicode characters, noting the start
+ // Logically, walk through all Unicode characters, noting the start
// and end of each range for which filter.contain(c) is
// true. Add each range to a set.
//
- // To improve performance, use the INCLUSIONS set, which
+ // To improve performance, use an inclusions set which
// encodes information about character ranges that are known
- // to have identical properties, such as the CJK Ideographs
- // from U+4E00 to U+9FA5. INCLUSIONS contains all characters
- // except the first characters of such ranges.
- //
- // TODO Where possible, instead of scanning over code points,
- // use internal property data to initialize UnicodeSets for
- // those properties. Scanning code points is slow.
+ // to have identical properties.
+ // getInclusions(src) contains exactly the first characters of
+ // same-value ranges for the given properties "source".
clear();
@@ -1668,204 +1093,315 @@
}
/**
- * Remove leading and trailing rule white space and compress
- * internal rule white space to a single space character.
+ * Is this frozen, according to the Freezable interface?
*
- * @see UCharacterProperty#isRuleWhiteSpace
- */
- private static String mungeCharName(String source) {
- StringBuffer buf = new StringBuffer();
- for (int i=0; i<source.length(); ) {
- int ch = UTF16.charAt(source, i);
- i += UTF16.getCharCount(ch);
- if (UCharacterProperty.isRuleWhiteSpace(ch)) {
- if (buf.length() == 0 ||
- buf.charAt(buf.length() - 1) == ' ') {
- continue;
- }
- ch = ' '; // convert to ' '
- }
- UTF16.append(buf, ch);
- }
- if (buf.length() != 0 &&
- buf.charAt(buf.length() - 1) == ' ') {
- buf.setLength(buf.length() - 1);
- }
- return buf.toString();
- }
-
- /**
- * Modifies this set to contain those code points which have the
- * given value for the given property. Prior contents of this
- * set are lost.
- * @param propertyAlias the property alias
- * @param valueAlias the value alias
- * @param symbols if not null, then symbols are first called to see if a property
- * is available. If true, then everything else is skipped.
- * @return this set
- * @stable ICU 3.2
+ * @return true if this set is frozen, false otherwise
+ * @stable ICU 3.8
*/
- public UnicodeSet applyPropertyAlias(String propertyAlias,
- String valueAlias, SymbolTable symbols) {
- if (valueAlias.length() > 0) {
- if (propertyAlias.equals("Age")) {
- // Must munge name, since
- // VersionInfo.getInstance() does not do
- // 'loose' matching.
- VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
- applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
- return this;
- }
- }
- throw new IllegalArgumentException("Unsupported property: " + propertyAlias);
- }
-
- /**
- * Return true if the given iterator appears to point at a
- * property pattern. Regardless of the result, return with the
- * iterator unchanged.
- * @param chars iterator over the pattern characters. Upon return
- * it will be unchanged.
- * @param iterOpts RuleCharacterIterator options
- */
- private static boolean resemblesPropertyPattern(RuleCharacterIterator chars,
- int iterOpts) {
- boolean result = false;
- iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
- Object pos = chars.getPos(null);
- int c = chars.next(iterOpts);
- if (c == '[' || c == '\\') {
- int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
- result = (c == '[') ? (d == ':') :
- (d == 'N' || d == 'p' || d == 'P');
- }
- chars.setPos(pos);
- return result;
+ public boolean isFrozen() {
+ return (bmpSet != null || stringSpan != null);
}
/**
- * Parse the given property pattern at the given parse position.
- * @param symbols TODO
+ * Freeze this class, according to the Freezable interface.
+ *
+ * @return this
+ * @stable ICU 4.4
*/
- private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) {
- int pos = ppos.getIndex();
-
- // On entry, ppos should point to one of the following locations:
-
- // Minimum length is 5 characters, e.g. \p{L}
- if ((pos+5) > pattern.length()) {
- return null;
- }
-
- boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
- boolean isName = false; // true for \N{pat}, o/w false
- boolean invert = false;
+ public UnicodeSet freeze() {
+ if (!isFrozen()) {
+ // Do most of what compact() does before freezing because
+ // compact() will not work when the set is frozen.
+ // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
- // Look for an opening [:, [:^, \p, or \P
- if (pattern.regionMatches(pos, "[:", 0, 2)) {
- posix = true;
- pos = Utility.skipWhitespace(pattern, pos+2);
- if (pos < pattern.length() && pattern.charAt(pos) == '^') {
- ++pos;
- invert = true;
- }
- } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) ||
- pattern.regionMatches(pos, "\\N", 0, 2)) {
- char c = pattern.charAt(pos+1);
- invert = (c == 'P');
- isName = (c == 'N');
- pos = Utility.skipWhitespace(pattern, pos+2);
- if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
- // Syntax error; "\p" or "\P" not followed by "{"
- return null;
+ // Drop the buffer before allocating the compacted list so that memory is fragmented less.
+ buffer = null;
+ if (list.length > (len + GROW_EXTRA)) {
+ // Make the capacity equal to len or 1.
+ // We don't want to allocate a zero-length array.
+ int capacity = (len == 0) ? 1 : len;
+ int[] oldList = list;
+ list = new int[capacity];
+ for (int i = capacity; i-- > 0;) {
+ list[i] = oldList[i];
+ }
}
- } else {
- // Open delimiter not seen
- return null;
- }
- // Look for the matching close delimiter, either :] or }
- int close = pattern.indexOf(posix ? ":]" : "}", pos);
- if (close < 0) {
- // Syntax error; close delimiter missing
- return null;
- }
-
- // Look for an '=' sign. If this is present, we will parse a
- // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
- // pattern.
- int equals = pattern.indexOf('=', pos);
- String propName, valueName;
- if (equals >= 0 && equals < close && !isName) {
- // Equals seen; parse medium/long pattern
- propName = pattern.substring(pos, equals);
- valueName = pattern.substring(equals+1, close);
- }
-
- else {
- // Handle case where no '=' is seen, and \N{}
- propName = pattern.substring(pos, close);
- valueName = "";
-
- // Handle \N{name}
- if (isName) {
- // This is a little inefficient since it means we have to
- // parse "na" back to UProperty.NAME even though we already
- // know it's UProperty.NAME. If we refactor the API to
- // support args of (int, String) then we can remove
- // "na" and make this a little more efficient.
- valueName = propName;
- propName = "na";
+ // Optimize contains() and span() and similar functions.
+ if (!strings.isEmpty()) {
+ stringSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), UnicodeSetStringSpan.ALL);
+ }
+ if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) {
+ // Optimize for code point spans.
+ // There are no strings, or
+ // all strings are irrelevant for span() etc. because
+ // all of each string's code points are contained in this set.
+ // However, fully contained strings are relevant for spanAndCount(),
+ // so we create both objects.
+ bmpSet = new BMPSet(list, len);
}
}
-
- applyPropertyAlias(propName, valueName, symbols);
-
- if (invert) {
- complement();
- }
-
- // Move to the limit position after the close delimiter
- ppos.setIndex(close + (posix ? 2 : 1));
-
return this;
}
/**
- * Parse a property pattern.
- * @param chars iterator over the pattern characters. Upon return
- * it will be advanced to the first character after the parsed
- * pattern, or the end of the iteration if all characters are
- * parsed.
- * @param rebuiltPat the pattern that was parsed, rebuilt or
- * copied from the input pattern, as appropriate.
- * @param symbols TODO
+ * Span a string using this UnicodeSet.
+ * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+ * @param s The string to be spanned
+ * @param spanCondition The span condition
+ * @return the length of the span
+ * @stable ICU 4.4
+ */
+ public int span(CharSequence s, SpanCondition spanCondition) {
+ return span(s, 0, spanCondition);
+ }
+
+ /**
+ * Span a string using this UnicodeSet.
+ * If the start index is less than 0, span will start from 0.
+ * If the start index is greater than or equal to the string length, span returns the string length.
+ * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+ * @param s The string to be spanned
+ * @param start The index at which the span begins
+ * @param spanCondition The span condition
+ * @return the string index which ends the span (i.e. exclusive)
+ * @stable ICU 4.4
*/
- private void applyPropertyPattern(RuleCharacterIterator chars,
- StringBuffer rebuiltPat, SymbolTable symbols) {
- String patStr = chars.lookahead();
- ParsePosition pos = new ParsePosition(0);
- applyPropertyPattern(patStr, pos, symbols);
- if (pos.getIndex() == 0) {
- syntaxError(chars, "Invalid property pattern");
+ public int span(CharSequence s, int start, SpanCondition spanCondition) {
+ int end = s.length();
+ if (start < 0) {
+ start = 0;
+ } else if (start >= end) {
+ return end;
+ }
+ if (bmpSet != null) {
+ // Frozen set without strings, or no string is relevant for span().
+ return bmpSet.span(s, start, spanCondition, null);
+ }
+ if (stringSpan != null) {
+ return stringSpan.span(s, start, spanCondition);
+ } else if (!strings.isEmpty()) {
+ int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
+ : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
+ UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
+ if (strSpan.needsStringSpanUTF16()) {
+ return strSpan.span(s, start, spanCondition);
+ }
+ }
+
+ return spanCodePointsAndCount(s, start, spanCondition, null);
+ }
+
+ /**
+ * Same as span() but also counts the smallest number of set elements on any path across the span.
+ * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+ * @param outCount An output-only object (must not be null) for returning the count.
+ * @return the limit (exclusive end) of the span
+ */
+ public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) {
+ if (outCount == null) {
+ throw new IllegalArgumentException("outCount must not be null");
}
- chars.jumpahead(pos.getIndex());
- rebuiltPat.append(patStr, 0, pos.getIndex());
+ int end = s.length();
+ if (start < 0) {
+ start = 0;
+ } else if (start >= end) {
+ return end;
+ }
+ if (stringSpan != null) {
+ // We might also have bmpSet != null,
+ // but fully-contained strings are relevant for counting elements.
+ return stringSpan.spanAndCount(s, start, spanCondition, outCount);
+ } else if (bmpSet != null) {
+ return bmpSet.span(s, start, spanCondition, outCount);
+ } else if (!strings.isEmpty()) {
+ int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
+ : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
+ which |= UnicodeSetStringSpan.WITH_COUNT;
+ UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
+ return strSpan.spanAndCount(s, start, spanCondition, outCount);
+ }
+
+ return spanCodePointsAndCount(s, start, spanCondition, outCount);
+ }
+
+ private int spanCodePointsAndCount(CharSequence s, int start,
+ SpanCondition spanCondition, OutputInt outCount) {
+ // Treat every condition other than NOT_CONTAINED (i.e. CONTAINED and SIMPLE) as "contained".
+ boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
+
+ int c;
+ int next = start;
+ int length = s.length();
+ int count = 0;
+ do {
+ c = Character.codePointAt(s, next);
+ if (spanContained != contains(c)) {
+ break;
+ }
+ ++count;
+ next += Character.charCount(c);
+ } while (next < length);
+ if (outCount != null) { outCount.value = count; }
+ return next;
}
- //----------------------------------------------------------------
- // Case folding API
- //----------------------------------------------------------------
+ /**
+ * Span a string backwards (from the fromIndex) using this UnicodeSet.
+ * If the fromIndex is less than or equal to 0, spanBack will return 0.
+ * If fromIndex is greater than the string length, spanBack will start from the string length.
+ * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+ * @param s The string to be spanned
+ * @param fromIndex The index (exclusive) from which the string is spanned backwards
+ * @param spanCondition The span condition
+ * @return The string index which starts the span (i.e. inclusive).
+ * @stable ICU 4.4
+ */
+ public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) {
+ if (fromIndex <= 0) {
+ return 0;
+ }
+ if (fromIndex > s.length()) {
+ fromIndex = s.length();
+ }
+ if (bmpSet != null) {
+ // Frozen set without strings, or no string is relevant for spanBack().
+ return bmpSet.spanBack(s, fromIndex, spanCondition);
+ }
+ if (stringSpan != null) {
+ return stringSpan.spanBack(s, fromIndex, spanCondition);
+ } else if (!strings.isEmpty()) {
+ int which = (spanCondition == SpanCondition.NOT_CONTAINED)
+ ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED
+ : UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
+ UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
+ if (strSpan.needsStringSpanUTF16()) {
+ return strSpan.spanBack(s, fromIndex, spanCondition);
+ }
+ }
+
+ // Treat every condition other than NOT_CONTAINED (i.e. CONTAINED and SIMPLE) as "contained".
+ boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
+
+ int c;
+ int prev = fromIndex;
+ do {
+ c = Character.codePointBefore(s, prev);
+ if (spanContained != contains(c)) {
+ break;
+ }
+ prev -= Character.charCount(c);
+ } while (prev > 0);
+ return prev;
+ }
+
+ /**
+ * Clone a thawed version of this class, according to the Freezable interface.
+ * @return the clone, not frozen
+ * @stable ICU 4.4
+ */
+ public UnicodeSet cloneAsThawed() {
+ UnicodeSet result = new UnicodeSet(this);
+ assert !result.isFrozen();
+ return result;
+ }
+
+ // internal function
+ private void checkFrozen() {
+ if (isFrozen()) {
+ throw new UnsupportedOperationException("Attempt to modify frozen object");
+ }
+ }
/**
- * Bitmask for constructor and applyPattern() indicating that
- * white space should be ignored. If set, ignore characters for
- * which UCharacterProperty.isRuleWhiteSpace() returns true,
- * unless they are quoted or escaped. This may be ORed together
- * with other selectors.
- * @stable ICU 3.8
+ * Argument values for whether span() and similar functions continue while the current character is contained vs.
+ * not contained in the set.
+ * <p>
+ * The functionality is straightforward for sets with only single code points, without strings (which is the common
+ * case):
+ * <ul>
+ * <li>CONTAINED and SIMPLE work the same.
+ * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED.
+ * <li>span() and spanBack() partition any string the
+ * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition).
+ * <li>Using a
+ * complemented (inverted) set and the opposite span conditions yields the same results.
+ * </ul>
+ * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in
+ * the set (for example, whether they overlap with each other) and the string that is processed. For a set with
+ * strings:
+ * <ul>
+ * <li>The complement of the set contains the opposite set of code points, but the same set of strings.
+ * Therefore, complementing both the set and the span conditions may yield different results.
+ * <li>When starting spans
+ * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
+ * because a set string may start before the later position.
+ * <li>span(SIMPLE) may be shorter than
+ * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which
+ * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax",
+ * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED).
+ * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example,
+ * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield
+ * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }.
+ * </ul>
+ * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then
+ * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could
+ * be used.
+ * <p>
+ * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point
+ * boundaries, never in the middle of a surrogate pair.
+ *
+ * @stable ICU 4.4
*/
- public static final int IGNORE_SPACE = 1;
+ public enum SpanCondition {
+ /**
+ * Continues a span() while there is no set element at the current position.
+ * Increments by one code point at a time.
+ * Stops before the first set element (character or string).
+ * (For code points only, this is like while contains(current)==false).
+ * <p>
+ * When span() returns, the substring between where it started and the position it returned consists only of
+ * characters that are not in the set, and none of its strings overlap with the span.
+ *
+ * @stable ICU 4.4
+ */
+ NOT_CONTAINED,
+
+ /**
+ * Spans the longest substring that is a concatenation of set elements (characters or strings).
+ * (For characters only, this is like while contains(current)==true).
+ * <p>
+ * When span() returns, the substring between where it started and the position it returned consists only of set
+ * elements (characters or strings) that are in the set.
+ * <p>
+ * If a set contains strings, then the span will be the longest substring for which there
+ * exists at least one non-overlapping concatenation of set elements (characters or strings).
+ * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
+ * (Java/ICU/Perl regex stops at the first match of an OR.)
+ *
+ * @stable ICU 4.4
+ */
+ CONTAINED,
+
+ /**
+ * Continues a span() while there is a set element at the current position.
+ * Increments by the longest matching element at each position.
+ * (For characters only, this is like while contains(current)==true).
+ * <p>
+ * When span() returns, the substring between where it started and the position it returned consists only of set
+ * elements (characters or strings) that are in the set.
+ * <p>
+ * If a set only contains single characters, then this is the same as CONTAINED.
+ * <p>
+ * If a set contains strings, then the span will be the longest substring with a match at each position with the
+ * longest single set element (character or string).
+ * <p>
+ * Use this span condition together with other longest-match algorithms, such as ICU converters
+ * (ucnv_getUnicodeSet()).
+ *
+ * @stable ICU 4.4
+ */
+ SIMPLE,
+ }
}
-
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeSetIterator.java Tue Jul 14 16:29:08 2015 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,219 +0,0 @@
-/*
- * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
-
-package sun.text.normalizer;
-
-import java.util.Iterator;
-
-/**
- * UnicodeSetIterator iterates over the contents of a UnicodeSet. It
- * iterates over either code points or code point ranges. After all
- * code points or ranges have been returned, it returns the
- * multicharacter strings of the UnicodSet, if any.
- *
- * <p>To iterate over code points, use a loop like this:
- * <pre>
- * UnicodeSetIterator it(set);
- * while (set.next()) {
- * if (set.codepoint != UnicodeSetIterator::IS_STRING) {
- * processCodepoint(set.codepoint);
- * } else {
- * processString(set.string);
- * }
- * }
- * </pre>
- *
- * <p>To iterate over code point ranges, use a loop like this:
- * <pre>
- * UnicodeSetIterator it(set);
- * while (set.nextRange()) {
- * if (set.codepoint != UnicodeSetIterator::IS_STRING) {
- * processCodepointRange(set.codepoint, set.codepointEnd);
- * } else {
- * processString(set.string);
- * }
- * }
- * </pre>
- * @author M. Davis
- * @stable ICU 2.0
- */
-public class UnicodeSetIterator {
-
- /**
- * Value of {@code codepoint} if the iterator points to a string.
- * If {@code codepoint == IS_STRING}, then examine
- * {@code string} for the current iteration result.
- * @stable ICU 2.0
- */
- public static int IS_STRING = -1;
-
- /**
- * Current code point, or the special value {@code IS_STRING}, if
- * the iterator points to a string.
- * @stable ICU 2.0
- */
- public int codepoint;
-
- /**
- * When iterating over ranges using {@code nextRange()},
- * {@code codepointEnd} contains the inclusive end of the
- * iteration range, if {@code codepoint != IS_STRING}. If
- * iterating over code points using {@code next()}, or if
- * {@code codepoint == IS_STRING}, then the value of
- * {@code codepointEnd} is undefined.
- * @stable ICU 2.0
- */
- public int codepointEnd;
-
- /**
- * If {@code codepoint == IS_STRING}, then {@code string} points
- * to the current string. If {@code codepoint != IS_STRING}, the
- * value of {@code string} is undefined.
- * @stable ICU 2.0
- */
- public String string;
-
- /**
- * Create an iterator over the given set.
- * @param set set to iterate over
- * @stable ICU 2.0
- */
- public UnicodeSetIterator(UnicodeSet set) {
- reset(set);
- }
-
- /**
- * Returns the next element in the set, either a code point range
- * or a string. If there are no more elements in the set, return
- * false. If {@code codepoint == IS_STRING}, the value is a
- * string in the {@code string} field. Otherwise the value is a
- * range of one or more code points from {@code codepoint} to
- * {@code codepointeEnd} inclusive.
- *
- * <p>The order of iteration is all code points ranges in sorted
- * order, followed by all strings sorted order. Ranges are
- * disjoint and non-contiguous. {@code string} is undefined
- * unless {@code codepoint == IS_STRING}. Do not mix calls to
- * {@code next()} and {@code nextRange()} without calling
- * {@code reset()} between them. The results of doing so are
- * undefined.
- *
- * @return true if there was another element in the set and this
- * object contains the element.
- * @stable ICU 2.0
- */
- public boolean nextRange() {
- if (nextElement <= endElement) {
- codepointEnd = endElement;
- codepoint = nextElement;
- nextElement = endElement+1;
- return true;
- }
- if (range < endRange) {
- loadRange(++range);
- codepointEnd = endElement;
- codepoint = nextElement;
- nextElement = endElement+1;
- return true;
- }
-
- // stringIterator == null iff there are no string elements remaining
-
- if (stringIterator == null) return false;
- codepoint = IS_STRING; // signal that value is actually a string
- string = stringIterator.next();
- if (!stringIterator.hasNext()) stringIterator = null;
- return true;
- }
-
- /**
- * Sets this iterator to visit the elements of the given set and
- * resets it to the start of that set. The iterator is valid only
- * so long as {@code set} is valid.
- * @param uset the set to iterate over.
- * @stable ICU 2.0
- */
- public void reset(UnicodeSet uset) {
- set = uset;
- reset();
- }
-
- /**
- * Resets this iterator to the start of the set.
- * @stable ICU 2.0
- */
- public void reset() {
- endRange = set.getRangeCount() - 1;
- range = 0;
- endElement = -1;
- nextElement = 0;
- if (endRange >= 0) {
- loadRange(range);
- }
- stringIterator = null;
- if (set.strings != null) {
- stringIterator = set.strings.iterator();
- if (!stringIterator.hasNext()) stringIterator = null;
- }
- }
-
- // ======================= PRIVATES ===========================
-
- private UnicodeSet set;
- private int endRange = 0;
- private int range = 0;
- /**
- * @internal
- */
- protected int endElement;
- /**
- * @internal
- */
- protected int nextElement;
- private Iterator<String> stringIterator = null;
-
- /**
- * Invariant: stringIterator is null when there are no (more) strings remaining
- */
-
- /**
- * @internal
- */
- protected void loadRange(int aRange) {
- nextElement = set.getRangeStart(aRange);
- endElement = set.getRangeEnd(aRange);
- }
-}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeSetStringSpan.java Wed Jul 15 11:05:51 2015 +0900
@@ -0,0 +1,1165 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ ******************************************************************************
+ *
+ * Copyright (C) 2009-2014, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *
+ ******************************************************************************
+ */
+
+package sun.text.normalizer;
+
+import java.util.ArrayList;
+
+import sun.text.normalizer.UnicodeSet.SpanCondition;
+
+/*
+ * Implement span() etc. for a set with strings.
+ * Avoid recursion because of its exponential complexity.
+ * Instead, try multiple paths at once and track them with an IndexList.
+ */
+class UnicodeSetStringSpan {
+
+ /*
+ * Which span() variant will be used? The object is either built for one variant and used once,
+ * or built for all and may be used many times.
+ */
+ public static final int WITH_COUNT = 0x40; // spanAndCount() may be called
+ public static final int FWD = 0x20;
+ public static final int BACK = 0x10;
+ // public static final int UTF16 = 8;
+ public static final int CONTAINED = 2;
+ public static final int NOT_CONTAINED = 1;
+
+ public static final int ALL = 0x7f;
+
+ public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED;
+ public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED;
+ public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED;
+ public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
+
+ /**
+ * Special spanLength short values. (since Java has no unsigned byte type)
+ * All code points in the string are contained in the parent set.
+ */
+ static final short ALL_CP_CONTAINED = 0xff;
+
+ /** The spanLength is >=0xfe. */
+ static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
+
+ /** Set for span(). Same as parent but without strings. */
+ private UnicodeSet spanSet;
+
+ /**
+ * Set for span(not contained).
+ * Same as spanSet, plus characters that start or end strings.
+ */
+ private UnicodeSet spanNotSet;
+
+ /** The strings of the parent set. */
+ private ArrayList<String> strings;
+
+ /** The lengths of span(), spanBack() etc. for each string. */
+ private short[] spanLengths;
+
+ /** Maximum length (in UTF-16 code units) of any of the set strings, relevant or not. */
+ private int maxLength16;
+
+ /** Are there strings that are not fully contained in the code point set? */
+ private boolean someRelevant;
+
+ /** Set up for all variants of span()? */
+ private boolean all;
+
+ /** Span helper */
+ private OffsetList offsets;
+
+ /**
+ * Constructs for all variants of span() (which == ALL), or only for the variants selected by the bits in which.
+ * Initializes as little as possible, for single use. setStrings is stored by reference, not copied.
+ */
+ public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
+ spanSet = new UnicodeSet(0, 0x10ffff);
+ // TODO: With Java 6, just take the parent set's strings as is,
+ // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
+ // Then iterate via the first() and higher() methods.
+ // (We do not want to create multiple Iterator objects in each span().)
+ // See ICU ticket #7454.
+ strings = setStrings;
+ all = (which == ALL);
+ spanSet.retainAll(set);
+ if (0 != (which & NOT_CONTAINED)) {
+ // Default to the same sets.
+ // addToSpanNotSet() will create a separate set if necessary.
+ spanNotSet = spanSet;
+ }
+ offsets = new OffsetList();
+
+ // Determine if the strings even need to be taken into account at all for span() etc.
+ // If any string is relevant, then all strings need to be used for
+ // span(longest match) but only the relevant ones for span(while contained).
+ // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
+ // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
+ // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
+ // This first pass also records the length of the longest string in maxLength16.
+ int stringsLength = strings.size();
+
+ int i, spanLength;
+ someRelevant = false;
+ for (i = 0; i < stringsLength; ++i) {
+ String string = strings.get(i);
+ int length16 = string.length();
+ spanLength = spanSet.span(string, SpanCondition.CONTAINED);
+ if (spanLength < length16) { // Relevant string.
+ someRelevant = true;
+ }
+ if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
+ maxLength16 = length16;
+ }
+ }
+ if (!someRelevant && (which & WITH_COUNT) == 0) {
+ return; // No string is relevant and spanAndCount() was not requested: spanLengths stays unallocated.
+ }
+
+ // Freeze after checking for the need to use strings at all because freezing
+ // a set takes some time and memory which are wasted if there are no relevant strings.
+ if (all) {
+ spanSet.freeze();
+ }
+
+ int spanBackLengthsOffset;
+
+ // Allocate a block of meta data.
+ int allocSize;
+ if (all) {
+ // 2 sets of span lengths
+ allocSize = stringsLength * (2);
+ } else {
+ allocSize = stringsLength; // One set of span lengths.
+ }
+ spanLengths = new short[allocSize];
+
+ if (all) {
+ // Store span lengths for all span() variants.
+ spanBackLengthsOffset = stringsLength;
+ } else {
+ // Store span lengths for only one span() variant.
+ spanBackLengthsOffset = 0;
+ }
+
+ // Second pass: set the meta data (one span length per string and direction) and spanNotSet.
+
+ for (i = 0; i < stringsLength; ++i) {
+ String string = strings.get(i);
+ int length16 = string.length();
+ spanLength = spanSet.span(string, SpanCondition.CONTAINED);
+ if (spanLength < length16) { // Relevant string.
+ if (true /* 0 != (which & UTF16) */) {
+ if (0 != (which & CONTAINED)) {
+ if (0 != (which & FWD)) {
+ spanLengths[i] = makeSpanLengthByte(spanLength);
+ }
+ if (0 != (which & BACK)) {
+ spanLength = length16
+ - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
+ spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
+ }
+ } else /* not CONTAINED, not all, but NOT_CONTAINED */{
+ spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
+ // flag.
+ }
+ }
+ if (0 != (which & NOT_CONTAINED)) {
+ // Add string start and end code points to the spanNotSet so that
+ // a span(while not contained) stops before any string.
+ int c;
+ if (0 != (which & FWD)) {
+ c = string.codePointAt(0);
+ addToSpanNotSet(c);
+ }
+ if (0 != (which & BACK)) {
+ c = string.codePointBefore(length16);
+ addToSpanNotSet(c);
+ }
+ }
+ } else { // Irrelevant string.
+ if (all) {
+ spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
+ } else {
+ // Only one set of span lengths was allocated (spanBackLengthsOffset == 0).
+ spanLengths[i] = ALL_CP_CONTAINED;
+ }
+ }
+ }
+
+ // Finish.
+ if (all) {
+ spanNotSet.freeze();
+ }
+ }
+
+ /**
+ * Do the strings need to be checked in span() etc.? True if the constructor found a string not fully spanned by spanSet.
+ *
+ * @return true if strings need to be checked (call span() here),
+ * false if not (use a BMPSet for best performance).
+ */
+ public boolean needsStringSpanUTF16() {
+ return someRelevant;
+ }
+
+ /** Fast code point membership test: returns whether spanSet (the string-less set) contains c. */
+ public boolean contains(int c) {
+ return spanSet.contains(c);
+ }
+
+ /**
+ * Adds a string's first or last code point to spanNotSet so that a character span ends before any string.
+ * While spanNotSet is still null or the same object as spanSet, it is first replaced by a thawed clone of spanSet.
+ */
+ private void addToSpanNotSet(int c) {
+ if (spanNotSet == null || spanNotSet == spanSet) {
+ if (spanSet.contains(c)) {
+ return; // Nothing to do: c is already in spanSet, so no separate set is needed for it.
+ }
+ spanNotSet = spanSet.cloneAsThawed();
+ }
+ spanNotSet.add(c);
+ }
+
+ /*
+ * Note: In span() when spanLength==0
+ * (after a string match, or at the beginning after an empty code point span)
+ * and in spanNot() and spanNotUTF8(),
+ * string matching could use a binary search because all string matches are done
+ * from the same start index.
+ *
+ * For UTF-8, this would require a comparison function that returns UTF-16 order.
+ *
+ * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
+ * with strings have very few very short strings. For cases with many strings, it might be better to use a different
+ * API and implementation with a DFA (state machine).
+ */
+
+ /*
+ * Algorithm for span(SpanCondition.CONTAINED)
+ *
+ * Theoretical algorithm:
+ * - Iterate through the string, and at each code point boundary:
+ * + If the code point there is in the set, then remember to continue after it.
+ * + If a set string matches at the current position, then remember to continue after it.
+ * + Either recursively span for each code point or string match, or recursively span
+ * for all but the shortest one and iteratively continue the span with the shortest local match.
+ * + Remember the longest recursive span (the farthest end point).
+ * + If there is no match at the current position,
+ * neither for the code point there nor for any set string,
+ * then stop and return the longest recursive span length.
+ *
+ * Optimized implementation:
+ *
+ * (We assume that most sets will have very few very short strings.
+ * A span using a string-less set is extremely fast.)
+ *
+ * Create and cache a spanSet which contains all of the single code points of the original set
+ * but none of its strings.
+ *
+ * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
+ * - Loop:
+ * + Try to match each set string at the end of the spanLength.
+ * ~ Set strings that start with set-contained code points
+ * must be matched with a partial overlap
+ * because the recursive algorithm would have tried to match them at every position.
+ * ~ Set strings that entirely consist of set-contained code points
+ * are irrelevant for span(SpanCondition.CONTAINED)
+ * because the recursive algorithm would continue after them anyway and
+ * find the longest recursive match from their end.
+ * ~ Rather than recursing, note each end point of a set string match.
+ * + If no set string matched after spanSet.span(),
+ * then return with where the spanSet.span() ended.
+ * + If at least one set string matched after spanSet.span(),
+ * then pop the shortest string match end point and continue the loop,
+ * trying to match all set strings from there.
+ * + If at least one more set string matched after a previous string match, then test if the
+ * code point after the previous string match is also contained in the set.
+ * Continue the loop with the shortest end point of
+ * either this code point or a matching set string.
+ * + If no more set string matched after a previous string match,
+ * then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
+ * Stop if spanLength==0, otherwise continue the loop.
+ *
+ * By noting each end point of a set string match, the function visits each string position at most once and
+ * finishes in linear time.
+ *
+ * The recursive algorithm may visit the same string position many times
+ * if multiple paths lead to it and finishes in exponential time.
+ */
+
+ /*
+ * Algorithm for span(SIMPLE)
+ *
+ * Theoretical algorithm:
+ * - Iterate through the string, and at each code point boundary:
+ * + If the code point there is in the set, then remember to continue after it.
+ * + If a set string matches at the current position, then remember to continue after it.
+ * + Continue from the farthest match position and ignore all others.
+ * + If there is no match at the current position, then stop and return the current position.
+ *
+ * Optimized implementation:
+ *
+ * (Same assumption and spanSet as above.)
+ *
+ * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
+ * - Loop:
+ * + Try to match each set string at the end of the spanLength.
+ * ~ Set strings that start with set-contained code points
+ * must be matched with a partial overlap
+ * because the standard algorithm would have tried to match them earlier.
+ * ~ Set strings that entirely consist of set-contained code points
+ * must be matched with a full overlap because the longest-match algorithm
+ * would hide set string matches that end earlier.
+ * Such set strings need not be matched earlier inside the code point span
+ * because the standard algorithm would then have
+ * continued after the set string match anyway.
+ * ~ Remember the longest set string match (farthest end point)
+ * from the earliest starting point.
+ * + If no set string matched after spanSet.span(),
+ * then return with where the spanSet.span() ended.
+ * + If at least one set string matched,
+ * then continue the loop after the longest match from the earliest position.
+ * + If no more set string matched after a previous string match,
+ * then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
+ * Stop if spanLength==0, otherwise continue the loop.
+ */
+ /**
+ * Spans a string, taking the set's strings into account when the code points alone do not reach the end of s.
+ *
+ * @param s The string to be spanned
+ * @param start The index in s at which the span begins
+ * @param spanCondition The span condition
+ * @return the limit (exclusive end) of the span
+ */
+ public int span(CharSequence s, int start, SpanCondition spanCondition) {
+ if (spanCondition == SpanCondition.NOT_CONTAINED) {
+ return spanNot(s, start, null);
+ }
+ int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
+ if (spanLimit == s.length()) {
+ return spanLimit; // The code point span alone reached the end of s.
+ }
+ return spanWithStrings(s, start, spanLimit, spanCondition);
+ }
+
+ /**
+ * Continues a span() beyond the initial code point span by also matching the set strings.
+ * Synchronized because it works on this object's offsets list; span() handles the simple cases without locking.
+ * @param spanLimit the result of spanSet.span(s, start, CONTAINED), as computed by span()
+ * @return the limit (exclusive end) of the span
+ */
+ private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
+ SpanCondition spanCondition) {
+ // Consider strings; they may overlap with the span.
+ int initSize = 0;
+ if (spanCondition == SpanCondition.CONTAINED) {
+ // Use offset list to try all possibilities.
+ initSize = maxLength16;
+ }
+ offsets.setMaxLength(initSize);
+ int length = s.length();
+ int pos = spanLimit, rest = length - spanLimit; // rest: number of chars of s from pos to the end.
+ int spanLength = spanLimit - start; // Length of the initial code point span.
+ int i, stringsLength = strings.size();
+ for (;;) {
+ if (spanCondition == SpanCondition.CONTAINED) {
+ for (i = 0; i < stringsLength; ++i) {
+ int overlap = spanLengths[i];
+ if (overlap == ALL_CP_CONTAINED) {
+ continue; // Irrelevant string.
+ }
+ String string = strings.get(i);
+
+ int length16 = string.length();
+
+ // Try to match this string at pos-overlap..pos.
+ if (overlap >= LONG_SPAN) {
+ overlap = length16;
+ // While contained: No point matching fully inside the code point span.
+ overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
+ // point.
+ }
+ if (overlap > spanLength) {
+ overlap = spanLength;
+ }
+ int inc = length16 - overlap; // Keep overlap+inc==length16.
+ for (;;) {
+ if (inc > rest) {
+ break;
+ }
+ // Try to match if the increment is not listed already.
+ if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
+ if (inc == rest) {
+ return length; // Reached the end of the string.
+ }
+ offsets.addOffset(inc);
+ }
+ if (overlap == 0) {
+ break;
+ }
+ --overlap;
+ ++inc;
+ }
+ }
+ } else /* SIMPLE */{
+ int maxInc = 0, maxOverlap = 0;
+ for (i = 0; i < stringsLength; ++i) {
+ int overlap = spanLengths[i];
+ // For longest match, we do need to try to match even an all-contained string
+ // to find the match from the earliest start.
+
+ String string = strings.get(i);
+
+ int length16 = string.length();
+
+ // Try to match this string at pos-overlap..pos.
+ if (overlap >= LONG_SPAN) {
+ overlap = length16;
+ // Longest match: Need to match fully inside the code point span
+ // to find the match from the earliest start.
+ }
+ if (overlap > spanLength) {
+ overlap = spanLength;
+ }
+ int inc = length16 - overlap; // Keep overlap+inc==length16.
+ for (;;) {
+ if (inc > rest || overlap < maxOverlap) {
+ break;
+ }
+ // Try to match if the string is longer or starts earlier.
+ if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
+ && matches16CPB(s, pos - overlap, length, string, length16)) {
+ maxInc = inc; // Longest match from earliest start.
+ maxOverlap = overlap;
+ break;
+ }
+ --overlap;
+ ++inc;
+ }
+ }
+
+ if (maxInc != 0 || maxOverlap != 0) {
+ // Longest-match algorithm, and there was a string match.
+ // Simply continue after it.
+ pos += maxInc;
+ rest -= maxInc;
+ if (rest == 0) {
+ return length; // Reached the end of the string.
+ }
+ spanLength = 0; // Match strings from after a string match.
+ continue;
+ }
+ }
+ // Finished trying to match all strings at pos.
+
+ if (spanLength != 0 || pos == 0) {
+ // The position is after an unlimited code point span (spanLength!=0),
+ // not after a string match.
+ // The only position where spanLength==0 after a span is pos==0.
+ // Otherwise, an unlimited code point span is only tried again when no
+ // strings match, and if such a non-initial span fails we stop.
+ if (offsets.isEmpty()) {
+ return pos; // No strings matched after a span.
+ }
+ // Match strings from after the next string match.
+ } else {
+ // The position is after a string match (or a single code point).
+ if (offsets.isEmpty()) {
+ // No more strings matched after a previous string match.
+ // Try another code point span from after the last string match.
+ spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
+ spanLength = spanLimit - pos;
+ if (spanLength == rest || // Reached the end of the string, or
+ spanLength == 0 // neither strings nor span progressed.
+ ) {
+ return spanLimit;
+ }
+ pos += spanLength;
+ rest -= spanLength;
+ continue; // spanLength>0: Match strings from after a span.
+ } else {
+ // Try to match only one code point from after a string match if some
+ // string matched beyond it, so that we try all possible positions
+ // and don't overshoot.
+ spanLength = spanOne(spanSet, s, pos, rest);
+ if (spanLength > 0) {
+ if (spanLength == rest) {
+ return length; // Reached the end of the string.
+ }
+ // Match strings after this code point.
+ // There cannot be any increments below it because UnicodeSet strings
+ // contain multiple code points.
+ pos += spanLength;
+ rest -= spanLength;
+ offsets.shift(spanLength);
+ spanLength = 0;
+ continue; // Match strings from after a single code point.
+ }
+ // Match strings from after the next string match.
+ }
+ }
+ int minOffset = offsets.popMinimum(null);
+ pos += minOffset;
+ rest -= minOffset;
+ spanLength = 0; // Match strings from after a string match.
+ }
+ }
+
+ /**
+ * Spans a string and counts the smallest number of set elements on any path across the span.
+ *
+ * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
+ *
+ * <p>If the set does not have any fully-contained strings, then we could optimize this
+ * like span(), but such sets are likely rare, and this is at least still linear.
+ *
+ * @param s The string to be spanned
+ * @param start The index in s at which the span begins
+ * @param spanCondition The span condition
+ * @param outCount Output parameter that receives the count
+ * @return the limit (exclusive end) of the span
+ */
+ public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
+ OutputInt outCount) {
+ if (spanCondition == SpanCondition.NOT_CONTAINED) {
+ return spanNot(s, start, outCount);
+ }
+ // Consider strings; they may overlap with the span,
+ // and they may result in a smaller count than with just code points.
+ if (spanCondition == SpanCondition.CONTAINED) {
+ return spanContainedAndCount(s, start, outCount);
+ }
+ // SIMPLE (not synchronized, does not use offsets)
+ int stringsLength = strings.size();
+ int length = s.length();
+ int pos = start;
+ int rest = length - start;
+ int count = 0;
+ while (rest != 0) {
+ // Try to match the next code point.
+ int cpLength = spanOne(spanSet, s, pos, rest);
+ int maxInc = (cpLength > 0) ? cpLength : 0;
+ // Try to match all of the strings; only a string longer than the current best can replace it.
+ for (int i = 0; i < stringsLength; ++i) {
+ String string = strings.get(i);
+ int length16 = string.length();
+ if (maxInc < length16 && length16 <= rest &&
+ matches16CPB(s, pos, length, string, length16)) {
+ maxInc = length16;
+ }
+ }
+ // We are done if there is no match beyond pos.
+ if (maxInc == 0) {
+ outCount.value = count;
+ return pos;
+ }
+ // Continue from the longest match.
+ ++count;
+ pos += maxInc;
+ rest -= maxInc;
+ }
+ outCount.value = count;
+ return pos;
+ }
+
+ private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
+ // CONTAINED variant of spanAndCount(): use the offset list to try all possibilities.
+ offsets.setMaxLength(maxLength16);
+ int stringsLength = strings.size();
+ int length = s.length();
+ int pos = start;
+ int rest = length - start;
+ int count = 0;
+ while (rest != 0) {
+ // Try to match the next code point.
+ int cpLength = spanOne(spanSet, s, pos, rest);
+ if (cpLength > 0) {
+ offsets.addOffsetAndCount(cpLength, count + 1);
+ }
+ // Try to match all of the strings.
+ for (int i = 0; i < stringsLength; ++i) {
+ String string = strings.get(i);
+ int length16 = string.length();
+ // Note: If the strings were sorted by length, then we could also
+ // avoid trying to match if there is already a match of the same length.
+ if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
+ matches16CPB(s, pos, length, string, length16)) {
+ offsets.addOffsetAndCount(length16, count + 1);
+ }
+ }
+ // We are done if there is no match beyond pos.
+ if (offsets.isEmpty()) {
+ outCount.value = count;
+ return pos;
+ }
+ // Continue from the nearest match.
+ int minOffset = offsets.popMinimum(outCount);
+ count = outCount.value; // Resume with the count that popMinimum() left in outCount.
+ pos += minOffset;
+ rest -= minOffset;
+ }
+ outCount.value = count;
+ return pos;
+ }
+
+ /**
+ * Span a string backwards.
+ *
+ * @param s The string to be spanned
+ * @param spanCondition The span condition
+ * @return The string index which starts the span (i.e. inclusive).
+ */
+ public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
+ if (spanCondition == SpanCondition.NOT_CONTAINED) {
+ return spanNotBack(s, length);
+ }
+ int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
+ if (pos == 0) {
+ return 0;
+ }
+ int spanLength = length - pos;
+
+ // Consider strings; they may overlap with the span.
+ int initSize = 0;
+ if (spanCondition == SpanCondition.CONTAINED) {
+ // Use offset list to try all possibilities.
+ initSize = maxLength16;
+ }
+ offsets.setMaxLength(initSize);
+ int i, stringsLength = strings.size();
+ int spanBackLengthsOffset = 0;
+ if (all) {
+ spanBackLengthsOffset = stringsLength;
+ }
+ for (;;) {
+ if (spanCondition == SpanCondition.CONTAINED) {
+ for (i = 0; i < stringsLength; ++i) {
+ int overlap = spanLengths[spanBackLengthsOffset + i];
+ if (overlap == ALL_CP_CONTAINED) {
+ continue; // Irrelevant string.
+ }
+ String string = strings.get(i);
+
+ int length16 = string.length();
+
+ // Try to match this string at pos-(length16-overlap)..pos-length16.
+ if (overlap >= LONG_SPAN) {
+ overlap = length16;
+ // While contained: No point matching fully inside the code point span.
+ int len1 = 0;
+ len1 = string.offsetByCodePoints(0, 1);
+ overlap -= len1; // Length of the string minus the first code point.
+ }
+ if (overlap > spanLength) {
+ overlap = spanLength;
+ }
+ int dec = length16 - overlap; // Keep dec+overlap==length16.
+ for (;;) {
+ if (dec > pos) {
+ break;
+ }
+ // Try to match if the decrement is not listed already.
+ if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
+ if (dec == pos) {
+ return 0; // Reached the start of the string.
+ }
+ offsets.addOffset(dec);
+ }
+ if (overlap == 0) {
+ break;
+ }
+ --overlap;
+ ++dec;
+ }
+ }
+ } else /* SIMPLE */{
+ int maxDec = 0, maxOverlap = 0;
+ for (i = 0; i < stringsLength; ++i) {
+ int overlap = spanLengths[spanBackLengthsOffset + i];
+ // For longest match, we do need to try to match even an all-contained string
+ // to find the match from the latest end.
+
+ String string = strings.get(i);
+
+ int length16 = string.length();
+
+ // Try to match this string at pos-(length16-overlap)..pos-length16.
+ if (overlap >= LONG_SPAN) {
+ overlap = length16;
+ // Longest match: Need to match fully inside the code point span
+ // to find the match from the latest end.
+ }
+ if (overlap > spanLength) {
+ overlap = spanLength;
+ }
+ int dec = length16 - overlap; // Keep dec+overlap==length16.
+ for (;;) {
+ if (dec > pos || overlap < maxOverlap) {
+ break;
+ }
+ // Try to match if the string is longer or ends later.
+ if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
+ && matches16CPB(s, pos - dec, length, string, length16)) {
+ maxDec = dec; // Longest match from latest end.
+ maxOverlap = overlap;
+ break;
+ }
+ --overlap;
+ ++dec;
+ }
+ }
+
+ if (maxDec != 0 || maxOverlap != 0) {
+ // Longest-match algorithm, and there was a string match.
+ // Simply continue before it.
+ pos -= maxDec;
+ if (pos == 0) {
+ return 0; // Reached the start of the string.
+ }
+ spanLength = 0; // Match strings from before a string match.
+ continue;
+ }
+ }
+ // Finished trying to match all strings at pos.
+
+ if (spanLength != 0 || pos == length) {
+ // The position is before an unlimited code point span (spanLength!=0),
+ // not before a string match.
+ // The only position where spanLength==0 before a span is pos==length.
+ // Otherwise, an unlimited code point span is only tried again when no
+ // strings match, and if such a non-initial span fails we stop.
+ if (offsets.isEmpty()) {
+ return pos; // No strings matched before a span.
+ }
+ // Match strings from before the next string match.
+ } else {
+ // The position is before a string match (or a single code point).
+ if (offsets.isEmpty()) {
+ // No more strings matched before a previous string match.
+ // Try another code point span from before the last string match.
+ int oldPos = pos;
+ pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
+ spanLength = oldPos - pos;
+ if (pos == 0 || // Reached the start of the string, or
+ spanLength == 0 // neither strings nor span progressed.
+ ) {
+ return pos;
+ }
+ continue; // spanLength>0: Match strings from before a span.
+ } else {
+ // Try to match only one code point from before a string match if some
+ // string matched beyond it, so that we try all possible positions
+ // and don't overshoot.
+ spanLength = spanOneBack(spanSet, s, pos);
+ if (spanLength > 0) {
+ if (spanLength == pos) {
+ return 0; // Reached the start of the string.
+ }
+ // Match strings before this code point.
+ // There cannot be any decrements below it because UnicodeSet strings
+ // contain multiple code points.
+ pos -= spanLength;
+ offsets.shift(spanLength);
+ spanLength = 0;
+ continue; // Match strings from before a single code point.
+ }
+ // Match strings from before the next string match.
+ }
+ }
+ pos -= offsets.popMinimum(null);
+ spanLength = 0; // Match strings from before a string match.
+ }
+ }
+
+    /**
+     * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
+     *
+     * <p>Theoretical algorithm: walk the string and, at every code point boundary,
+     * stop and return the current position as soon as either the code point there
+     * is in the set or one of the set strings matches there.
+     *
+     * <p>Optimized implementation (same assumption as for span() above):
+     * a cached spanNotSet contains all of the single code points of the original set
+     * but none of its strings. In addition, the initial code point of every set string
+     * is added to it (and the final code point as well, for spanNotBack()).
+     * The loop then works as follows:
+     * <ol>
+     * <li>Advance with spanNotSet.span(SpanCondition.NOT_CONTAINED).
+     * <li>If the code point at the new position is in the original set, return that position.
+     * <li>If any set string matches at the new position, return that position.
+     * <li>Otherwise nothing matches there: the span stopped on a string-initial code point
+     *     that was added to spanNotSet, but no such string is actually present.
+     *     Skip that code point and continue the loop.
+     * </ol>
+     *
+     * @param s The string to be spanned
+     * @param start The start index that the span begins
+     * @param outCount If not null: Receives the number of code points across the span.
+     * @return the limit (exclusive end) of the span
+     */
+    private int spanNot(CharSequence s, int start, OutputInt outCount) {
+        final int length = s.length();
+        final int stringsLength = strings.size();
+        int pos = start;
+        int count = 0;
+        for (;;) {
+            // Skip ahead to the next code point that is in the set,
+            // or that starts or ends some set string.
+            final int spanLimit;
+            if (outCount != null) {
+                spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
+                // Accumulate the per-call count into the running total and publish it.
+                count += outCount.value;
+                outCount.value = count;
+            } else {
+                spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
+            }
+            if (spanLimit == length) {
+                return length; // Reached the end of the string.
+            }
+            pos = spanLimit;
+            final int rest = length - pos;
+
+            // Is the code point here a member of the original set itself,
+            // rather than just a string start/end?
+            final int cpLength = spanOne(spanSet, s, pos, rest);
+            if (cpLength > 0) {
+                return pos; // There is a set element at pos.
+            }
+
+            // Does any relevant set string match at pos?
+            for (int i = 0; i < stringsLength; ++i) {
+                if (spanLengths[i] != ALL_CP_CONTAINED) {
+                    final String string = strings.get(i);
+                    final int length16 = string.length();
+                    if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
+                        return pos; // There is a set element at pos.
+                    }
+                }
+            }
+
+            // The span stopped on a string start/end that is not in the original set.
+            // cpLength is negative here, so subtracting it steps over that code point.
+            pos -= cpLength;
+            ++count;
+            if (rest + cpLength == 0) {
+                break; // The skipped code point was the last one.
+            }
+        }
+        if (outCount != null) {
+            outCount.value = count;
+        }
+        return length; // Reached the end of the string.
+    }
+
+    /**
+     * Backward counterpart of spanNot(): moves from length toward the start of the string
+     * and stops at the first position where a code point of the original set ends
+     * or where a set string ends.
+     *
+     * @param s The string to be spanned
+     * @param length The index from which the backward span begins
+     * @return the position at which a set element was found, or 0 if the start
+     *         of the string was reached
+     */
+    private int spanNotBack(CharSequence s, int length) {
+        final int stringCount = strings.size();
+        int pos = length;
+        for (;;) {
+            // Move back to the nearest code point that is in the set,
+            // or that starts or ends some set string.
+            pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
+            if (pos == 0) {
+                return 0; // Reached the start of the string.
+            }
+
+            // Is the code point that ends at pos a member of the original set itself,
+            // rather than just a string start/end?
+            final int cpLength = spanOneBack(spanSet, s, pos);
+            if (cpLength > 0) {
+                return pos; // There is a set element at pos.
+            }
+
+            // Does any relevant set string end at pos?
+            // spanLengths is indexed directly, without the offset that spanBack() applies:
+            // only the "irrelevant string" marker is needed, and it is the same in either array.
+            for (int k = 0; k < stringCount; ++k) {
+                if (spanLengths[k] != ALL_CP_CONTAINED) {
+                    final String candidate = strings.get(k);
+                    final int candidateLength = candidate.length();
+                    if (candidateLength <= pos
+                            && matches16CPB(s, pos - candidateLength, length, candidate, candidateLength)) {
+                        return pos; // There is a set element at pos.
+                    }
+                }
+            }
+
+            // The span stopped on a string start/end that is not in the original set.
+            // cpLength is negative here, so adding it steps back over that code point.
+            pos += cpLength;
+            if (pos == 0) {
+                return 0; // Reached the start of the string.
+            }
+        }
+    }
+
+    /**
+     * Clamps a span length to the range storable in the span-lengths table.
+     * Values at or above LONG_SPAN (0xfe==UnicodeSetStringSpan::LONG_SPAN) collapse to LONG_SPAN.
+     */
+    static short makeSpanLengthByte(int spanLength) {
+        if (spanLength >= LONG_SPAN) {
+            return LONG_SPAN;
+        }
+        return (short) spanLength;
+    }
+
+    /**
+     * Compares s[start..start+length) with the first length chars of t,
+     * without any argument checks. Requires length>0.
+     * The comparison runs from the last code unit toward the first.
+     */
+    private static boolean matches16(CharSequence s, int start, final String t, int length) {
+        for (int k = length - 1; k >= 0; --k) {
+            if (s.charAt(start + k) != t.charAt(k)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Matches t against s at start, accepting the match only if it lies on
+     * code point boundaries. The 16-bit strings may be malformed UTF-16;
+     * a match is rejected when either of its edges would split a surrogate pair.
+     *
+     * @param s The string to match in.
+     * @param start The start index of s.
+     * @param limit The limit of the subsequence of s being spanned.
+     * @param t The substring to be matched in s.
+     * @param tlength The length of t.
+     * @return true if t occurs at start and neither edge of the match
+     *         falls between a high and a low surrogate
+     */
+    static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
+        if (!matches16(s, start, t, tlength)) {
+            return false;
+        }
+        // Leading edge: must not sit between a high surrogate and its low surrogate.
+        if (0 < start
+                && Character.isHighSurrogate(s.charAt(start - 1))
+                && Character.isLowSurrogate(s.charAt(start))) {
+            return false;
+        }
+        // Trailing edge: same rule, checked only when more text follows before limit.
+        final int end = start + tlength;
+        final boolean splitsPairAtEnd = end < limit
+                && Character.isHighSurrogate(s.charAt(end - 1))
+                && Character.isLowSurrogate(s.charAt(end));
+        return !splitsPairAtEnd;
+    }
+
+    /**
+     * Tests whether the set contains the code point that begins at start.
+     *
+     * @param set The set to test against.
+     * @param s The string to read from.
+     * @param start The index of the first code unit of the code point.
+     * @param length The number of code units available from start.
+     * @return the code point's length in code units (1 or 2) if the set contains it,
+     *         otherwise that length negated
+     */
+    static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
+        final char lead = s.charAt(start);
+        final boolean leadSurrogate = lead >= 0xd800 && lead <= 0xdbff;
+        if (leadSurrogate && length >= 2) {
+            final char trail = s.charAt(start + 1);
+            if (UTF16.isTrailSurrogate(trail)) {
+                final int supplementary = UCharacterProperty.getRawSupplementary(lead, trail);
+                if (set.contains(supplementary)) {
+                    return 2;
+                }
+                return -2;
+            }
+        }
+        // Single code unit: a BMP character or an unpaired surrogate.
+        if (set.contains(lead)) {
+            return 1;
+        }
+        return -1;
+    }
+
+    /**
+     * Tests whether the set contains the code point that ends just before length.
+     *
+     * @param set The set to test against.
+     * @param s The string to read from.
+     * @param length The exclusive end index of the code point.
+     * @return the code point's length in code units (1 or 2) if the set contains it,
+     *         otherwise that length negated
+     */
+    static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
+        final char trail = s.charAt(length - 1);
+        final boolean trailSurrogate = trail >= 0xdc00 && trail <= 0xdfff;
+        if (trailSurrogate && length >= 2) {
+            final char lead = s.charAt(length - 2);
+            if (UTF16.isLeadSurrogate(lead)) {
+                final int supplementary = UCharacterProperty.getRawSupplementary(lead, trail);
+                if (set.contains(supplementary)) {
+                    return 2;
+                }
+                return -2;
+            }
+        }
+        // Single code unit: a BMP character or an unpaired surrogate.
+        if (set.contains(trail)) {
+            return 1;
+        }
+        return -1;
+    }
+
+    /**
+     * Helper class for UnicodeSetStringSpan.
+     *
+     * <p>Holds the offsets, relative to the current position, from which matching
+     * of a code point or a string should be tried.
+     * Offsets are stored instead of indexes because that simplifies the code and allows
+     * the same list to serve increments (in span()) and decrements (in spanBack()).
+     *
+     * <p>Assumption: The maximum offset is limited, and the offsets stored at any one time
+     * are relatively dense, that is,
+     * gaps of hundreds or thousands of offset values do not normally occur.
+     *
+     * <p>Optionally, the minimum non-negative count for each position is tracked too;
+     * it is intended to count the smallest number of elements of any path leading
+     * to that position.
+     *
+     * <p>The implementation is a circular buffer of count integers. Each entry tells
+     * whether the corresponding offset is in the list and, if so, its path element count.
+     * This avoids inserting into a sorted list of offsets (or absolute indexes)
+     * and physically moving part of the list.
+     *
+     * <p>Note: In principle, the caller should setMaxLength() to
+     * the maximum of the max string length and U16_LENGTH/U8_LENGTH
+     * to account for "long" single code points.
+     *
+     * <p>Note: An earlier version did not track counts and stored only byte flags.
+     * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
+     * the list could be stored as bit flags in a single integer, and instead of
+     * handling a circular buffer with a start list index, the integer would simply
+     * be shifted when lower offsets are removed.
+     * UnicodeSet does not have a limit on the lengths of strings.
+     */
+    private static final class OffsetList {
+        /** Circular buffer; a non-zero entry marks a stored offset and holds its count. */
+        private int[] list;
+        /** Number of offsets currently stored. */
+        private int length;
+        /** Buffer index that stands for offset 0. */
+        private int start;
+
+        public OffsetList() {
+            list = new int[16]; // default size
+        }
+
+        /** Grows the buffer if maxLength exceeds its current size, then empties the list. */
+        public void setMaxLength(int maxLength) {
+            if (maxLength > list.length) {
+                list = new int[maxLength];
+            }
+            clear();
+        }
+
+        /** Removes all offsets and resets the buffer start. */
+        public void clear() {
+            java.util.Arrays.fill(list, 0);
+            length = 0;
+            start = 0;
+        }
+
+        public boolean isEmpty() {
+            return length == 0;
+        }
+
+        /**
+         * Maps an offset to its buffer index, wrapping around the end of the buffer once.
+         *
+         * @param offset [1..maxLength]
+         */
+        private int slot(int offset) {
+            final int index = start + offset;
+            return index >= list.length ? index - list.length : index;
+        }
+
+        /**
+         * Reduces all stored offsets by delta, used when the current position moves by delta.
+         * There must not be any offsets lower than delta.
+         * If there is an offset equal to delta, it is removed.
+         *
+         * @param delta [1..maxLength]
+         */
+        public void shift(int delta) {
+            final int index = slot(delta);
+            if (list[index] != 0) {
+                list[index] = 0;
+                --length;
+            }
+            start = index;
+        }
+
+        /**
+         * Adds an offset. The list must not contain it yet.
+         * @param offset [1..maxLength]
+         */
+        public void addOffset(int offset) {
+            final int index = slot(offset);
+            assert list[index] == 0;
+            list[index] = 1;
+            ++length;
+        }
+
+        /**
+         * Adds an offset and updates its count.
+         * The list may already contain the offset; in that case the smaller count is kept.
+         * @param offset [1..maxLength]
+         */
+        public void addOffsetAndCount(int offset, int count) {
+            assert count > 0;
+            final int index = slot(offset);
+            final int existing = list[index];
+            if (existing == 0) {
+                list[index] = count;
+                ++length;
+            } else if (count < existing) {
+                list[index] = count;
+            }
+        }
+
+        /**
+         * @param offset [1..maxLength]
+         */
+        public boolean containsOffset(int offset) {
+            return list[slot(offset)] != 0;
+        }
+
+        /**
+         * Tells whether the offset is stored with a count no greater than the given one.
+         * @param offset [1..maxLength]
+         */
+        public boolean hasCountAtOffset(int offset, int count) {
+            final int stored = list[slot(offset)];
+            return stored != 0 && stored <= count;
+        }
+
+        /**
+         * Finds the lowest stored offset from a non-empty list, removes it,
+         * and reduces all other offsets by this minimum.
+         * @return min=[1..maxLength]
+         */
+        public int popMinimum(OutputInt outCount) {
+            // Look for the next offset in list[start+1..list.length-1].
+            for (int index = start + 1; index < list.length; ++index) {
+                if (list[index] != 0) {
+                    return take(index, index - start, outCount);
+                }
+            }
+
+            // Wrap around and look for the next offset in list[0..start].
+            // Since the list is not empty, there will be one.
+            int index = 0;
+            while (list[index] == 0) {
+                ++index;
+            }
+            return take(index, list.length - start + index, outCount);
+        }
+
+        /**
+         * Removes the entry at index, makes that index the new buffer start,
+         * reports the removed count through outCount if it is not null,
+         * and hands back the offset the caller computed against the old start.
+         */
+        private int take(int index, int offset, OutputInt outCount) {
+            final int count = list[index];
+            list[index] = 0;
+            --length;
+            start = index;
+            if (outCount != null) {
+                outCount.value = count;
+            }
+            return offset;
+        }
+    }
+}
--- a/jdk/src/java.base/share/classes/sun/text/normalizer/Utility.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.base/share/classes/sun/text/normalizer/Utility.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -24,47 +24,26 @@
*/
/*
*******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 1996-2011, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
*******************************************************************************
*/
package sun.text.normalizer;
-public final class Utility {
+import java.io.IOException;
+import java.util.Locale;
- /**
- * Convenience utility to compare two Object[]s
- * Ought to be in System.
- * @param len the length to compare.
- * The start indices and start+len must be valid.
- */
- public final static boolean arrayRegionMatches(char[] source, int sourceStart,
- char[] target, int targetStart,
- int len)
- {
- int sourceEnd = sourceStart + len;
- int delta = targetStart - sourceStart;
- for (int i = sourceStart; i < sourceEnd; i++) {
- if (source[i]!=target[i + delta])
- return false;
- }
- return true;
- }
+final class Utility {
/**
* Convert characters outside the range U+0020 to U+007F to
* Unicode escapes, and convert backslash to a double backslash.
*/
public static final String escape(String s) {
- StringBuffer buf = new StringBuffer();
+ StringBuilder buf = new StringBuilder();
for (int i=0; i<s.length(); ) {
- int c = UTF16.charAt(s, i);
+ int c = Character.codePointAt(s, i);
i += UTF16.getCharCount(c);
if (c >= ' ' && c <= 0x007F) {
if (c == '\\') {
@@ -75,7 +54,7 @@
} else {
boolean four = c <= 0xFFFF;
buf.append(four ? "\\u" : "\\U");
- hex(c, four ? 4 : 8, buf);
+ buf.append(hex(c, four ? 4 : 8));
}
}
return buf.toString();
@@ -124,7 +103,7 @@
}
/* Fetch first UChar after '\\' */
- c = UTF16.charAt(s, offset);
+ c = Character.codePointAt(s, offset);
offset += UTF16.getCharCount(c);
/* Convert hexadecimal and octal escapes */
@@ -143,7 +122,7 @@
maxDig = 8;
} else {
maxDig = 2;
- }
+ }
break;
default:
dig = UCharacter.digit(c, 8);
@@ -175,7 +154,7 @@
return -1;
}
++offset;
- }
+ }
if (result < 0 || result >= 0x110000) {
return -1;
}
@@ -184,7 +163,7 @@
// escape or as a literal. If so, join them up into a
// supplementary.
if (offset < length &&
- UTF16.isLeadSurrogate((char) result)) {
+ UTF16.isLeadSurrogate((char) result)) {
int ahead = offset+1;
c = s.charAt(offset); // [sic] get 16-bit code unit
if (c == '\\' && ahead < length) {
@@ -194,8 +173,8 @@
}
if (UTF16.isTrailSurrogate((char) c)) {
offset = ahead;
- result = UCharacterProperty.getRawSupplementary(
- (char) result, (char) c);
+ result = UCharacterProperty.getRawSupplementary(
+ (char) result, (char) c);
}
}
offset16[0] = offset;
@@ -226,39 +205,22 @@
}
/**
- * Convert a integer to size width hex uppercase digits.
- * E.g., {@code hex('a', 4, str) => "0041"}.
- * Append the output to the given StringBuffer.
- * If width is too small to fit, nothing will be appended to output.
- */
- public static StringBuffer hex(int ch, int width, StringBuffer output) {
- return appendNumber(output, ch, 16, width);
- }
-
- /**
- * Convert a integer to size width (minimum) hex uppercase digits.
- * E.g., {@code hex('a', 4, str) => "0041"}. If the integer requires more
- * than width digits, more will be used.
+ * Supplies a zero-padded hex representation of an integer (without 0x)
*/
- public static String hex(int ch, int width) {
- StringBuffer buf = new StringBuffer();
- return appendNumber(buf, ch, 16, width).toString();
- }
-
- /**
- * Skip over a sequence of zero or more white space characters
- * at pos. Return the index of the first non-white-space character
- * at or after pos, or str.length(), if there is none.
- */
- public static int skipWhitespace(String str, int pos) {
- while (pos < str.length()) {
- int c = UTF16.charAt(str, pos);
- if (!UCharacterProperty.isRuleWhiteSpace(c)) {
- break;
- }
- pos += UTF16.getCharCount(c);
+ static public String hex(long i, int places) {
+ // Long.MIN_VALUE has no positive counterpart, so the negation below would overflow.
+ if (i == Long.MIN_VALUE) return "-8000000000000000";
+ boolean negative = i < 0;
+ if (negative) {
+ i = -i;
}
- return pos;
+ String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH);
+ if (result.length() < places) {
+ // Build the padding with a loop: slicing a fixed 16-zero template
+ // threw StringIndexOutOfBoundsException whenever places exceeded 16.
+ StringBuilder padded = new StringBuilder(places);
+ for (int missing = places - result.length(); missing > 0; --missing) {
+ padded.append('0');
+ }
+ result = padded.append(result).toString();
+ }
+ // The sign is not counted in places; it is prepended after padding.
+ if (negative) {
+ return '-' + result;
+ }
+ return result;
}
static final char DIGITS[] = {
@@ -269,117 +231,43 @@
};
/**
- * Append the digits of a positive integer to the given
- * <code>StringBuffer</code> in the given radix. This is
- * done recursively since it is easiest to generate the low-
- * order digit first, but it must be appended last.
- *
- * @param result is the <code>StringBuffer</code> to append to
- * @param n is the positive integer
- * @param radix is the radix, from 2 to 36 inclusive
- * @param minDigits is the minimum number of digits to append.
- */
- private static void recursiveAppendNumber(StringBuffer result, int n,
- int radix, int minDigits)
- {
- int digit = n % radix;
-
- if (n >= radix || minDigits > 1) {
- recursiveAppendNumber(result, n / radix, radix, minDigits - 1);
- }
-
- result.append(DIGITS[digit]);
- }
-
- /**
- * Append a number to the given StringBuffer in the given radix.
- * Standard digits '0'-'9' are used and letters 'A'-'Z' for
- * radices 11 through 36.
- * @param result the digits of the number are appended here
- * @param n the number to be converted to digits; may be negative.
- * If negative, a '-' is prepended to the digits.
- * @param radix a radix from 2 to 36 inclusive.
- * @param minDigits the minimum number of digits, not including
- * any '-', to produce. Values less than 2 have no effect. One
- * digit is always emitted regardless of this parameter.
- * @return a reference to result
- */
- public static StringBuffer appendNumber(StringBuffer result, int n,
- int radix, int minDigits)
- throws IllegalArgumentException
- {
- if (radix < 2 || radix > 36) {
- throw new IllegalArgumentException("Illegal radix " + radix);
- }
-
-
- int abs = n;
-
- if (n < 0) {
- abs = -n;
- result.append("-");
- }
-
- recursiveAppendNumber(result, abs, radix, minDigits);
-
- return result;
- }
-
- /**
* Return true if the character is NOT printable ASCII. The tab,
* newline and linefeed characters are considered unprintable.
*/
public static boolean isUnprintable(int c) {
+ // Printable ASCII runs from 0x20 (decimal 32) through 0x7E (decimal 126), inclusive.
return !(c >= 0x20 && c <= 0x7E);
}
/**
- * Escape unprintable characters using {@code <backslash>uxxxx} notation
- * for U+0000 to U+FFFF and {@code <backslash>Uxxxxxxxx} for U+10000 and
+ * Escape unprintable characters using <backslash>uxxxx notation
+ * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
* above. If the character is printable ASCII, then do nothing
* and return FALSE. Otherwise, append the escaped notation and
* return TRUE.
+ *
+ * @param result the Appendable that receives the escaped notation
+ * @param c the code point to examine
+ * @return true if an escape was appended, false if c is printable ASCII
+ * @throws IllegalArgumentException if appending to result fails with an IOException;
+ * the IOException is kept as the cause
*/
- public static boolean escapeUnprintable(StringBuffer result, int c) {
- if (isUnprintable(c)) {
- result.append('\\');
- if ((c & ~0xFFFF) != 0) {
- result.append('U');
- result.append(DIGITS[0xF&(c>>28)]);
- result.append(DIGITS[0xF&(c>>24)]);
- result.append(DIGITS[0xF&(c>>20)]);
- result.append(DIGITS[0xF&(c>>16)]);
- } else {
- result.append('u');
+ public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
+ try {
+ if (isUnprintable(c)) {
+ result.append('\\');
+ if ((c & ~0xFFFF) != 0) {
+ result.append('U');
+ result.append(DIGITS[0xF&(c>>28)]);
+ result.append(DIGITS[0xF&(c>>24)]);
+ result.append(DIGITS[0xF&(c>>20)]);
+ result.append(DIGITS[0xF&(c>>16)]);
+ } else {
+ result.append('u');
+ }
+ result.append(DIGITS[0xF&(c>>12)]);
+ result.append(DIGITS[0xF&(c>>8)]);
+ result.append(DIGITS[0xF&(c>>4)]);
+ result.append(DIGITS[0xF&c]);
+ return true;
}
- result.append(DIGITS[0xF&(c>>12)]);
- result.append(DIGITS[0xF&(c>>8)]);
- result.append(DIGITS[0xF&(c>>4)]);
- result.append(DIGITS[0xF&c]);
- return true;
+ return false;
+ } catch (IOException e) {
+ throw new IllegalArgumentException(e);
}
- return false;
}
-
- /**
- * Similar to StringBuffer.getChars, version 1.3.
- * Since JDK 1.2 implements StringBuffer.getChars differently, this method
- * is here to provide consistent results.
- * To be removed after JDK 1.2 ceased to be the reference platform.
- * @param src source string buffer
- * @param srcBegin offset to the start of the src to retrieve from
- * @param srcEnd offset to the end of the src to retrieve from
- * @param dst char array to store the retrieved chars
- * @param dstBegin offset to the start of the destination char array to
- * store the retrieved chars
- */
- public static void getChars(StringBuffer src, int srcBegin, int srcEnd,
- char dst[], int dstBegin)
- {
- if (srcBegin == srcEnd) {
- return;
- }
- src.getChars(srcBegin, srcEnd, dst, dstBegin);
- }
-
}
Binary file jdk/src/java.base/share/classes/sun/text/resources/nfc.icu has changed
Binary file jdk/src/java.base/share/classes/sun/text/resources/nfkc.icu has changed
Binary file jdk/src/java.base/share/classes/sun/text/resources/nfkc_cf.icu has changed
Binary file jdk/src/java.base/share/classes/sun/text/resources/ubidi.icu has changed
Binary file jdk/src/java.base/share/classes/sun/text/resources/unorm.icu has changed
Binary file jdk/src/java.base/share/classes/sun/text/resources/uprops.icu has changed
--- a/jdk/src/java.desktop/share/classes/java/awt/font/NumericShaper.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/src/java.desktop/share/classes/java/awt/font/NumericShaper.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2000, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -318,7 +318,17 @@
/**
* The Meetei Mayek range with the Meetei Mayek digits.
*/
- MEETEI_MAYEK ('\uabf0', '\uabc0', '\uac00');
+ MEETEI_MAYEK ('\uabf0', '\uabc0', '\uac00'),
+ /**
+ * The Sinhala range with the Sinhala digits.
+ * @since 1.9
+ */
+ SINHALA ('\u0de6', '\u0d80', '\u0e00'),
+ /**
+ * The Myanmar Extended-B range with the Myanmar Tai Laing digits.
+ * @since 1.9
+ */
+ MYANMAR_TAI_LAING ('\ua9f0', '\ua9e0', '\uaa00');
private static int toRangeIndex(Range script) {
int index = script.ordinal();
@@ -624,15 +634,25 @@
0x02e5, 0x02ee,
0x02ef, 0x0370,
0x0374, 0x0376,
- 0x037e, 0x0386,
+ 0x0378, 0x037a,
+ 0x037e, 0x037f,
+ 0x0380, 0x0386,
0x0387, 0x0388,
+ 0x038b, 0x038c,
+ 0x038d, 0x038e,
+ 0x03a2, 0x03a3,
0x03f6, 0x03f7,
0x0483, 0x048a,
- 0x058a, 0x05be,
+ 0x0530, 0x0531,
+ 0x0557, 0x0559,
+ 0x0560, 0x0561,
+ 0x0588, 0x0589,
+ 0x058a, 0x0590,
+ 0x0591, 0x05be,
0x05bf, 0x05c0,
0x05c1, 0x05c3,
0x05c4, 0x05c6,
- 0x05c7, 0x05d0,
+ 0x05c7, 0x05c8,
0x0600, 0x0608,
0x0609, 0x060b,
0x060c, 0x060d,
@@ -643,15 +663,15 @@
0x06e7, 0x06ee,
0x06f0, 0x06fa,
0x0711, 0x0712,
- 0x0730, 0x074d,
+ 0x0730, 0x074b,
0x07a6, 0x07b1,
0x07eb, 0x07f4,
0x07f6, 0x07fa,
0x0816, 0x081a,
0x081b, 0x0824,
0x0825, 0x0828,
- 0x0829, 0x0830,
- 0x0859, 0x085e,
+ 0x0829, 0x082e,
+ 0x0859, 0x085c,
0x08e4, 0x0903,
0x093a, 0x093b,
0x093c, 0x093d,
@@ -660,57 +680,161 @@
0x0951, 0x0958,
0x0962, 0x0964,
0x0981, 0x0982,
- 0x09bc, 0x09bd,
+ 0x0984, 0x0985,
+ 0x098d, 0x098f,
+ 0x0991, 0x0993,
+ 0x09a9, 0x09aa,
+ 0x09b1, 0x09b2,
+ 0x09b3, 0x09b6,
+ 0x09ba, 0x09bd,
0x09c1, 0x09c7,
+ 0x09c9, 0x09cb,
0x09cd, 0x09ce,
+ 0x09cf, 0x09d7,
+ 0x09d8, 0x09dc,
+ 0x09de, 0x09df,
0x09e2, 0x09e6,
0x09f2, 0x09f4,
0x09fb, 0x0a03,
- 0x0a3c, 0x0a3e,
+ 0x0a04, 0x0a05,
+ 0x0a0b, 0x0a0f,
+ 0x0a11, 0x0a13,
+ 0x0a29, 0x0a2a,
+ 0x0a31, 0x0a32,
+ 0x0a34, 0x0a35,
+ 0x0a37, 0x0a38,
+ 0x0a3a, 0x0a3e,
0x0a41, 0x0a59,
+ 0x0a5d, 0x0a5e,
+ 0x0a5f, 0x0a66,
0x0a70, 0x0a72,
0x0a75, 0x0a83,
- 0x0abc, 0x0abd,
+ 0x0a84, 0x0a85,
+ 0x0a8e, 0x0a8f,
+ 0x0a92, 0x0a93,
+ 0x0aa9, 0x0aaa,
+ 0x0ab1, 0x0ab2,
+ 0x0ab4, 0x0ab5,
+ 0x0aba, 0x0abd,
0x0ac1, 0x0ac9,
+ 0x0aca, 0x0acb,
0x0acd, 0x0ad0,
+ 0x0ad1, 0x0ae0,
0x0ae2, 0x0ae6,
0x0af1, 0x0b02,
- 0x0b3c, 0x0b3d,
+ 0x0b04, 0x0b05,
+ 0x0b0d, 0x0b0f,
+ 0x0b11, 0x0b13,
+ 0x0b29, 0x0b2a,
+ 0x0b31, 0x0b32,
+ 0x0b34, 0x0b35,
+ 0x0b3a, 0x0b3d,
0x0b3f, 0x0b40,
0x0b41, 0x0b47,
+ 0x0b49, 0x0b4b,
0x0b4d, 0x0b57,
+ 0x0b58, 0x0b5c,
+ 0x0b5e, 0x0b5f,
0x0b62, 0x0b66,
- 0x0b82, 0x0b83,
+ 0x0b78, 0x0b83,
+ 0x0b84, 0x0b85,
+ 0x0b8b, 0x0b8e,
+ 0x0b91, 0x0b92,
+ 0x0b96, 0x0b99,
+ 0x0b9b, 0x0b9c,
+ 0x0b9d, 0x0b9e,
+ 0x0ba0, 0x0ba3,
+ 0x0ba5, 0x0ba8,
+ 0x0bab, 0x0bae,
+ 0x0bba, 0x0bbe,
0x0bc0, 0x0bc1,
+ 0x0bc3, 0x0bc6,
+ 0x0bc9, 0x0bca,
0x0bcd, 0x0bd0,
+ 0x0bd1, 0x0bd7,
+ 0x0bd8, 0x0be6,
0x0bf3, 0x0c01,
+ 0x0c04, 0x0c05,
+ 0x0c0d, 0x0c0e,
+ 0x0c11, 0x0c12,
+ 0x0c29, 0x0c2a,
+ 0x0c3a, 0x0c3d,
0x0c3e, 0x0c41,
- 0x0c46, 0x0c58,
+ 0x0c45, 0x0c58,
+ 0x0c5a, 0x0c60,
0x0c62, 0x0c66,
- 0x0c78, 0x0c7f,
- 0x0cbc, 0x0cbd,
+ 0x0c70, 0x0c7f,
+ 0x0c80, 0x0c82,
+ 0x0c84, 0x0c85,
+ 0x0c8d, 0x0c8e,
+ 0x0c91, 0x0c92,
+ 0x0ca9, 0x0caa,
+ 0x0cb4, 0x0cb5,
+ 0x0cba, 0x0cbd,
+ 0x0cc5, 0x0cc6,
+ 0x0cc9, 0x0cca,
0x0ccc, 0x0cd5,
+ 0x0cd7, 0x0cde,
+ 0x0cdf, 0x0ce0,
0x0ce2, 0x0ce6,
+ 0x0cf0, 0x0cf1,
+ 0x0cf3, 0x0d02,
+ 0x0d04, 0x0d05,
+ 0x0d0d, 0x0d0e,
+ 0x0d11, 0x0d12,
+ 0x0d3b, 0x0d3d,
0x0d41, 0x0d46,
+ 0x0d49, 0x0d4a,
0x0d4d, 0x0d4e,
+ 0x0d4f, 0x0d57,
+ 0x0d58, 0x0d60,
0x0d62, 0x0d66,
- 0x0dca, 0x0dcf,
+ 0x0d76, 0x0d79,
+ 0x0d80, 0x0d82,
+ 0x0d84, 0x0d85,
+ 0x0d97, 0x0d9a,
+ 0x0db2, 0x0db3,
+ 0x0dbc, 0x0dbd,
+ 0x0dbe, 0x0dc0,
+ 0x0dc7, 0x0dcf,
0x0dd2, 0x0dd8,
+ 0x0de0, 0x0de6,
+ 0x0df0, 0x0df2,
+ 0x0df5, 0x0e01,
0x0e31, 0x0e32,
0x0e34, 0x0e40,
0x0e47, 0x0e4f,
+ 0x0e5c, 0x0e81,
+ 0x0e83, 0x0e84,
+ 0x0e85, 0x0e87,
+ 0x0e89, 0x0e8a,
+ 0x0e8b, 0x0e8d,
+ 0x0e8e, 0x0e94,
+ 0x0e98, 0x0e99,
+ 0x0ea0, 0x0ea1,
+ 0x0ea4, 0x0ea5,
+ 0x0ea6, 0x0ea7,
+ 0x0ea8, 0x0eaa,
+ 0x0eac, 0x0ead,
0x0eb1, 0x0eb2,
0x0eb4, 0x0ebd,
- 0x0ec8, 0x0ed0,
+ 0x0ebe, 0x0ec0,
+ 0x0ec5, 0x0ec6,
+ 0x0ec7, 0x0ed0,
+ 0x0eda, 0x0edc,
+ 0x0ee0, 0x0f00,
0x0f18, 0x0f1a,
0x0f35, 0x0f36,
0x0f37, 0x0f38,
0x0f39, 0x0f3e,
- 0x0f71, 0x0f7f,
+ 0x0f48, 0x0f49,
+ 0x0f6d, 0x0f7f,
0x0f80, 0x0f85,
0x0f86, 0x0f88,
0x0f8d, 0x0fbe,
0x0fc6, 0x0fc7,
+ 0x0fcd, 0x0fce,
+ 0x0fdb, 0x1000,
0x102d, 0x1031,
0x1032, 0x1038,
0x1039, 0x103b,
@@ -722,66 +846,119 @@
0x1085, 0x1087,
0x108d, 0x108e,
0x109d, 0x109e,
- 0x135d, 0x1360,
+ 0x10c6, 0x10c7,
+ 0x10c8, 0x10cd,
+ 0x10ce, 0x10d0,
+ 0x1249, 0x124a,
+ 0x124e, 0x1250,
+ 0x1257, 0x1258,
+ 0x1259, 0x125a,
+ 0x125e, 0x1260,
+ 0x1289, 0x128a,
+ 0x128e, 0x1290,
+ 0x12b1, 0x12b2,
+ 0x12b6, 0x12b8,
+ 0x12bf, 0x12c0,
+ 0x12c1, 0x12c2,
+ 0x12c6, 0x12c8,
+ 0x12d7, 0x12d8,
+ 0x1311, 0x1312,
+ 0x1316, 0x1318,
+ 0x135b, 0x1360,
+ 0x137d, 0x1380,
0x1390, 0x13a0,
- 0x1400, 0x1401,
+ 0x13f5, 0x1401,
0x1680, 0x1681,
0x169b, 0x16a0,
+ 0x16f9, 0x1700,
+ 0x170d, 0x170e,
0x1712, 0x1720,
0x1732, 0x1735,
+ 0x1737, 0x1740,
0x1752, 0x1760,
- 0x1772, 0x1780,
+ 0x176d, 0x176e,
+ 0x1771, 0x1780,
0x17b4, 0x17b6,
0x17b7, 0x17be,
0x17c6, 0x17c7,
0x17c9, 0x17d4,
0x17db, 0x17dc,
0x17dd, 0x17e0,
- 0x17f0, 0x1810,
+ 0x17ea, 0x1810,
+ 0x181a, 0x1820,
+ 0x1878, 0x1880,
0x18a9, 0x18aa,
- 0x1920, 0x1923,
+ 0x18ab, 0x18b0,
+ 0x18f6, 0x1900,
+ 0x191f, 0x1923,
0x1927, 0x1929,
+ 0x192c, 0x1930,
0x1932, 0x1933,
0x1939, 0x1946,
- 0x19de, 0x1a00,
+ 0x196e, 0x1970,
+ 0x1975, 0x1980,
+ 0x19ac, 0x19b0,
+ 0x19ca, 0x19d0,
+ 0x19db, 0x1a00,
0x1a17, 0x1a19,
+ 0x1a1b, 0x1a1e,
0x1a56, 0x1a57,
0x1a58, 0x1a61,
0x1a62, 0x1a63,
0x1a65, 0x1a6d,
0x1a73, 0x1a80,
- 0x1b00, 0x1b04,
+ 0x1a8a, 0x1a90,
+ 0x1a9a, 0x1aa0,
+ 0x1aae, 0x1b04,
0x1b34, 0x1b35,
0x1b36, 0x1b3b,
0x1b3c, 0x1b3d,
0x1b42, 0x1b43,
+ 0x1b4c, 0x1b50,
0x1b6b, 0x1b74,
- 0x1b80, 0x1b82,
+ 0x1b7d, 0x1b82,
0x1ba2, 0x1ba6,
0x1ba8, 0x1baa,
- 0x1bab, 0x1bac,
+ 0x1bab, 0x1bae,
0x1be6, 0x1be7,
0x1be8, 0x1bea,
0x1bed, 0x1bee,
0x1bef, 0x1bf2,
+ 0x1bf4, 0x1bfc,
0x1c2c, 0x1c34,
0x1c36, 0x1c3b,
- 0x1cd0, 0x1cd3,
+ 0x1c4a, 0x1c4d,
+ 0x1c80, 0x1cc0,
+ 0x1cc8, 0x1cd3,
0x1cd4, 0x1ce1,
0x1ce2, 0x1ce9,
0x1ced, 0x1cee,
0x1cf4, 0x1cf5,
+ 0x1cf7, 0x1d00,
0x1dc0, 0x1e00,
+ 0x1f16, 0x1f18,
+ 0x1f1e, 0x1f20,
+ 0x1f46, 0x1f48,
+ 0x1f4e, 0x1f50,
+ 0x1f58, 0x1f59,
+ 0x1f5a, 0x1f5b,
+ 0x1f5c, 0x1f5d,
+ 0x1f5e, 0x1f5f,
+ 0x1f7e, 0x1f80,
+ 0x1fb5, 0x1fb6,
0x1fbd, 0x1fbe,
0x1fbf, 0x1fc2,
+ 0x1fc5, 0x1fc6,
0x1fcd, 0x1fd0,
- 0x1fdd, 0x1fe0,
+ 0x1fd4, 0x1fd6,
+ 0x1fdc, 0x1fe0,
0x1fed, 0x1ff2,
+ 0x1ff5, 0x1ff6,
0x1ffd, 0x200e,
0x2010, 0x2071,
- 0x2074, 0x207f,
+ 0x2072, 0x207f,
0x2080, 0x2090,
- 0x20a0, 0x2102,
+ 0x209d, 0x2102,
0x2103, 0x2107,
0x2108, 0x210a,
0x2114, 0x2115,
@@ -801,35 +978,59 @@
0x24ea, 0x26ac,
0x26ad, 0x2800,
0x2900, 0x2c00,
+ 0x2c2f, 0x2c30,
+ 0x2c5f, 0x2c60,
0x2ce5, 0x2ceb,
0x2cef, 0x2cf2,
- 0x2cf9, 0x2d00,
- 0x2d7f, 0x2d80,
- 0x2de0, 0x3005,
+ 0x2cf4, 0x2d00,
+ 0x2d26, 0x2d27,
+ 0x2d28, 0x2d2d,
+ 0x2d2e, 0x2d30,
+ 0x2d68, 0x2d6f,
+ 0x2d71, 0x2d80,
+ 0x2d97, 0x2da0,
+ 0x2da7, 0x2da8,
+ 0x2daf, 0x2db0,
+ 0x2db7, 0x2db8,
+ 0x2dbf, 0x2dc0,
+ 0x2dc7, 0x2dc8,
+ 0x2dcf, 0x2dd0,
+ 0x2dd7, 0x2dd8,
+ 0x2ddf, 0x3005,
0x3008, 0x3021,
- 0x302a, 0x3031,
+ 0x302a, 0x302e,
+ 0x3030, 0x3031,
0x3036, 0x3038,
0x303d, 0x3041,
- 0x3099, 0x309d,
+ 0x3097, 0x309d,
0x30a0, 0x30a1,
0x30fb, 0x30fc,
- 0x31c0, 0x31f0,
+ 0x3100, 0x3105,
+ 0x312e, 0x3131,
+ 0x318f, 0x3190,
+ 0x31bb, 0x31f0,
0x321d, 0x3220,
0x3250, 0x3260,
0x327c, 0x327f,
0x32b1, 0x32c0,
0x32cc, 0x32d0,
+ 0x32ff, 0x3300,
0x3377, 0x337b,
0x33de, 0x33e0,
0x33ff, 0x3400,
- 0x4dc0, 0x4e00,
- 0xa490, 0xa4d0,
+ 0x4db6, 0x4e00,
+ 0x9fcd, 0xa000,
+ 0xa48d, 0xa4d0,
0xa60d, 0xa610,
+ 0xa62c, 0xa640,
0xa66f, 0xa680,
- 0xa69f, 0xa6a0,
+ 0xa69e, 0xa6a0,
0xa6f0, 0xa6f2,
- 0xa700, 0xa722,
+ 0xa6f8, 0xa722,
0xa788, 0xa789,
+ 0xa78f, 0xa790,
+ 0xa7ae, 0xa7b0,
+ 0xa7b2, 0xa7f7,
0xa802, 0xa803,
0xa806, 0xa807,
0xa80b, 0xa80c,
@@ -838,77 +1039,241 @@
0xa838, 0xa840,
0xa874, 0xa880,
0xa8c4, 0xa8ce,
- 0xa8e0, 0xa8f2,
+ 0xa8da, 0xa8f2,
+ 0xa8fc, 0xa900,
0xa926, 0xa92e,
0xa947, 0xa952,
- 0xa980, 0xa983,
+ 0xa954, 0xa95f,
+ 0xa97d, 0xa983,
0xa9b3, 0xa9b4,
0xa9b6, 0xa9ba,
0xa9bc, 0xa9bd,
+ 0xa9ce, 0xa9cf,
+ 0xa9da, 0xa9de,
+ 0xa9e5, 0xa9e6,
+ 0xa9ff, 0xaa00,
0xaa29, 0xaa2f,
0xaa31, 0xaa33,
0xaa35, 0xaa40,
0xaa43, 0xaa44,
0xaa4c, 0xaa4d,
+ 0xaa4e, 0xaa50,
+ 0xaa5a, 0xaa5c,
+ 0xaa7c, 0xaa7d,
0xaab0, 0xaab1,
0xaab2, 0xaab5,
0xaab7, 0xaab9,
0xaabe, 0xaac0,
0xaac1, 0xaac2,
+ 0xaac3, 0xaadb,
0xaaec, 0xaaee,
0xaaf6, 0xab01,
+ 0xab07, 0xab09,
+ 0xab0f, 0xab11,
+ 0xab17, 0xab20,
+ 0xab27, 0xab28,
+ 0xab2f, 0xab30,
+ 0xab60, 0xab64,
+ 0xab66, 0xabc0,
0xabe5, 0xabe6,
0xabe8, 0xabe9,
0xabed, 0xabf0,
+ 0xabfa, 0xac00,
+ 0xd7a4, 0xd7b0,
+ 0xd7c7, 0xd7cb,
+ 0xd7fc, 0xe000,
+ 0xfa6e, 0xfa70,
+ 0xfada, 0xfb00,
+ 0xfb07, 0xfb13,
+ 0xfb18, 0xfb1d,
0xfb1e, 0xfb1f,
0xfb29, 0xfb2a,
- 0xfd3e, 0xfd50,
- 0xfdfd, 0xfe70,
+ 0xfd3e, 0xfd40,
+ 0xfdd0, 0xfdf0,
+ 0xfdfd, 0xfdfe,
+ 0xfe00, 0xfe70,
0xfeff, 0xff21,
0xff3b, 0xff41,
0xff5b, 0xff66,
- 0xffe0, 0x10000,
+ 0xffbf, 0xffc2,
+ 0xffc8, 0xffca,
+ 0xffd0, 0xffd2,
+ 0xffd8, 0xffda,
+ 0xffdd, 0x10000,
+ 0x1000c, 0x1000d,
+ 0x10027, 0x10028,
+ 0x1003b, 0x1003c,
+ 0x1003e, 0x1003f,
+ 0x1004e, 0x10050,
+ 0x1005e, 0x10080,
+ 0x100fb, 0x10100,
0x10101, 0x10102,
+ 0x10103, 0x10107,
+ 0x10134, 0x10137,
0x10140, 0x101d0,
0x101fd, 0x10280,
+ 0x1029d, 0x102a0,
+ 0x102d1, 0x10300,
+ 0x10324, 0x10330,
+ 0x1034b, 0x10350,
+ 0x10376, 0x10380,
+ 0x1039e, 0x1039f,
+ 0x103c4, 0x103c8,
+ 0x103d6, 0x10400,
+ 0x1049e, 0x104a0,
+ 0x104aa, 0x10500,
+ 0x10528, 0x10530,
+ 0x10564, 0x1056f,
+ 0x10570, 0x10600,
+ 0x10737, 0x10740,
+ 0x10756, 0x10760,
+ 0x10768, 0x10800,
0x1091f, 0x10920,
- 0x10a01, 0x10a10,
- 0x10a38, 0x10a40,
+ 0x10a01, 0x10a04,
+ 0x10a05, 0x10a07,
+ 0x10a0c, 0x10a10,
+ 0x10a38, 0x10a3b,
+ 0x10a3f, 0x10a40,
+ 0x10ae5, 0x10ae7,
0x10b39, 0x10b40,
- 0x10e60, 0x11000,
+ 0x10e60, 0x10e7f,
0x11001, 0x11002,
0x11038, 0x11047,
- 0x11052, 0x11066,
- 0x11080, 0x11082,
+ 0x1104e, 0x11066,
+ 0x11070, 0x11082,
0x110b3, 0x110b7,
0x110b9, 0x110bb,
- 0x11100, 0x11103,
+ 0x110c2, 0x110d0,
+ 0x110e9, 0x110f0,
+ 0x110fa, 0x11103,
0x11127, 0x1112c,
0x1112d, 0x11136,
- 0x11180, 0x11182,
+ 0x11144, 0x11150,
+ 0x11173, 0x11174,
+ 0x11177, 0x11182,
0x111b6, 0x111bf,
+ 0x111c9, 0x111cd,
+ 0x111ce, 0x111d0,
+ 0x111db, 0x111e1,
+ 0x111f5, 0x11200,
+ 0x11212, 0x11213,
+ 0x1122f, 0x11232,
+ 0x11234, 0x11235,
+ 0x11236, 0x11238,
+ 0x1123e, 0x112b0,
+ 0x112df, 0x112e0,
+ 0x112e3, 0x112f0,
+ 0x112fa, 0x11302,
+ 0x11304, 0x11305,
+ 0x1130d, 0x1130f,
+ 0x11311, 0x11313,
+ 0x11329, 0x1132a,
+ 0x11331, 0x11332,
+ 0x11334, 0x11335,
+ 0x1133a, 0x1133d,
+ 0x11340, 0x11341,
+ 0x11345, 0x11347,
+ 0x11349, 0x1134b,
+ 0x1134e, 0x11357,
+ 0x11358, 0x1135d,
+ 0x11364, 0x11480,
+ 0x114b3, 0x114b9,
+ 0x114ba, 0x114bb,
+ 0x114bf, 0x114c1,
+ 0x114c2, 0x114c4,
+ 0x114c8, 0x114d0,
+ 0x114da, 0x11580,
+ 0x115b2, 0x115b8,
+ 0x115bc, 0x115be,
+ 0x115bf, 0x115c1,
+ 0x115ca, 0x11600,
+ 0x11633, 0x1163b,
+ 0x1163d, 0x1163e,
+ 0x1163f, 0x11641,
+ 0x11645, 0x11650,
+ 0x1165a, 0x11680,
0x116ab, 0x116ac,
0x116ad, 0x116ae,
0x116b0, 0x116b6,
0x116b7, 0x116c0,
- 0x16f8f, 0x16f93,
+ 0x116ca, 0x118a0,
+ 0x118f3, 0x118ff,
+ 0x11900, 0x11ac0,
+ 0x11af9, 0x12000,
+ 0x12399, 0x12400,
+ 0x1246f, 0x12470,
+ 0x12475, 0x13000,
+ 0x1342f, 0x16800,
+ 0x16a39, 0x16a40,
+ 0x16a5f, 0x16a60,
+ 0x16a6a, 0x16a6e,
+ 0x16a70, 0x16ad0,
+ 0x16aee, 0x16af5,
+ 0x16af6, 0x16b00,
+ 0x16b30, 0x16b37,
+ 0x16b46, 0x16b50,
+ 0x16b5a, 0x16b5b,
+ 0x16b62, 0x16b63,
+ 0x16b78, 0x16b7d,
+ 0x16b90, 0x16f00,
+ 0x16f45, 0x16f50,
+ 0x16f7f, 0x16f93,
+ 0x16fa0, 0x1b000,
+ 0x1b002, 0x1bc00,
+ 0x1bc6b, 0x1bc70,
+ 0x1bc7d, 0x1bc80,
+ 0x1bc89, 0x1bc90,
+ 0x1bc9a, 0x1bc9c,
+ 0x1bc9d, 0x1bc9f,
+ 0x1bca0, 0x1d000,
+ 0x1d0f6, 0x1d100,
+ 0x1d127, 0x1d129,
0x1d167, 0x1d16a,
0x1d173, 0x1d183,
0x1d185, 0x1d18c,
0x1d1aa, 0x1d1ae,
- 0x1d200, 0x1d360,
+ 0x1d1de, 0x1d360,
+ 0x1d372, 0x1d400,
+ 0x1d455, 0x1d456,
+ 0x1d49d, 0x1d49e,
+ 0x1d4a0, 0x1d4a2,
+ 0x1d4a3, 0x1d4a5,
+ 0x1d4a7, 0x1d4a9,
+ 0x1d4ad, 0x1d4ae,
+ 0x1d4ba, 0x1d4bb,
+ 0x1d4bc, 0x1d4bd,
+ 0x1d4c4, 0x1d4c5,
+ 0x1d506, 0x1d507,
+ 0x1d50b, 0x1d50d,
+ 0x1d515, 0x1d516,
+ 0x1d51d, 0x1d51e,
+ 0x1d53a, 0x1d53b,
+ 0x1d53f, 0x1d540,
+ 0x1d545, 0x1d546,
+ 0x1d547, 0x1d54a,
+ 0x1d551, 0x1d552,
+ 0x1d6a6, 0x1d6a8,
0x1d6db, 0x1d6dc,
0x1d715, 0x1d716,
0x1d74f, 0x1d750,
0x1d789, 0x1d78a,
0x1d7c3, 0x1d7c4,
- 0x1d7ce, 0x1ee00,
- 0x1eef0, 0x1f110,
+ 0x1d7cc, 0x1e800,
+ 0x1e8d0, 0x1e8d7,
+ 0x1eef0, 0x1eef2,
+ 0x1f000, 0x1f110,
+ 0x1f12f, 0x1f130,
0x1f16a, 0x1f170,
- 0x1f300, 0x1f48c,
- 0x1f48d, 0x1f524,
- 0x1f525, 0x20000,
- 0xe0001, 0xf0000,
+ 0x1f19b, 0x1f1e6,
+ 0x1f203, 0x1f210,
+ 0x1f23b, 0x1f240,
+ 0x1f249, 0x1f250,
+ 0x1f252, 0x20000,
+ 0x2a6d7, 0x2a700,
+ 0x2b735, 0x2b740,
+ 0x2b81e, 0x2f800,
+ 0x2fa1e, 0xf0000,
+ 0xffffe, 0x100000,
0x10fffe, 0x10ffff // sentinel
};
--- a/jdk/test/java/awt/font/NumericShaper/ShapingTest.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/test/java/awt/font/NumericShaper/ShapingTest.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -23,7 +23,7 @@
/*
* @test
- * @bug 6842557 6943963 6959267
+ * @bug 6842557 6943963 6959267 8032446
* @summary confirm that shaping works as expected. (Mainly for new characters which were added in Unicode 5 and 6)
* used where appropriate.
*/
@@ -40,6 +40,7 @@
test6842557();
test6943963();
test6903266();
+ test8032446();
if (err) {
throw new RuntimeException("shape() returned unexpected value.");
@@ -138,6 +139,18 @@
checkResult("Range.MEETEI_MAYEK", ns, given, expected);
}
+ private static void test8032446() {
+ NumericShaper ns = getContextualShaper(EnumSet.of(Range.SINHALA));
+ String given = "\u0d85 012";
+ String expected = "\u0d85 \u0de6\u0de7\u0de8";
+ checkResult("Range.SINHALA", ns, given, expected);
+
+ ns = getContextualShaper(EnumSet.of(Range.MYANMAR_TAI_LAING));
+ given = "\ua9e2 012";
+ expected = "\ua9e2 \ua9f0\ua9f1\ua9f2";
+ checkResult("Range.MYANMAR_TAI_LAING", ns, given, expected);
+ }
+
private static void checkResult(String ranges, NumericShaper ns,
String given, String expected) {
char[] text = given.toCharArray();
--- a/jdk/test/java/lang/Character/CheckProp.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/test/java/lang/Character/CheckProp.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2011, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -24,7 +24,7 @@
/**
* @test
- * @bug 7037261 7070436 7198195
+ * @bug 7037261 7070436 7198195 8032446
* @summary Check j.l.Character.isLowerCase/isUppercase/isAlphabetic/isIdeographic
*/
--- a/jdk/test/java/lang/Character/CheckScript.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/test/java/lang/Character/CheckScript.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,6 +1,5 @@
-
/*
- * Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -24,7 +23,7 @@
/**
* @test
- * @bug 6945564 6959267 7033561 7070436 7198195
+ * @bug 6945564 6959267 7033561 7070436 7198195 8032446
* @summary Check that the j.l.Character.UnicodeScript
*/
--- a/jdk/test/java/lang/Character/PropList.txt Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/test/java/lang/Character/PropList.txt Wed Jul 15 11:05:51 2015 +0900
@@ -1,8 +1,8 @@
-# PropList-6.2.0.txt
-# Date: 2012-05-23, 20:34:59 GMT [MD]
+# PropList-7.0.0.txt
+# Date: 2014-02-19, 15:51:26 GMT [MD]
#
# Unicode Character Database
-# Copyright (c) 1991-2012 Unicode, Inc.
+# Copyright (c) 1991-2014 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@@ -13,7 +13,6 @@
0085 ; White_Space # Cc <control-0085>
00A0 ; White_Space # Zs NO-BREAK SPACE
1680 ; White_Space # Zs OGHAM SPACE MARK
-180E ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR
2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE
2028 ; White_Space # Zl LINE SEPARATOR
2029 ; White_Space # Zp PARAGRAPH SEPARATOR
@@ -21,14 +20,16 @@
205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE
3000 ; White_Space # Zs IDEOGRAPHIC SPACE
-# Total code points: 26
+# Total code points: 25
# ================================================
+061C ; Bidi_Control # Cf ARABIC LETTER MARK
200E..200F ; Bidi_Control # Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK
202A..202E ; Bidi_Control # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE
+2066..2069 ; Bidi_Control # Cf [4] LEFT-TO-RIGHT ISOLATE..POP DIRECTIONAL ISOLATE
-# Total code points: 7
+# Total code points: 12
# ================================================
@@ -51,6 +52,7 @@
2E17 ; Dash # Pd DOUBLE OBLIQUE HYPHEN
2E1A ; Dash # Pd HYPHEN WITH DIAERESIS
2E3A..2E3B ; Dash # Pd [2] TWO-EM DASH..THREE-EM DASH
+2E40 ; Dash # Pd DOUBLE HYPHEN
301C ; Dash # Pd WAVE DASH
3030 ; Dash # Pd WAVY DASH
30A0 ; Dash # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN
@@ -59,7 +61,7 @@
FE63 ; Dash # Pd SMALL HYPHEN-MINUS
FF0D ; Dash # Pd FULLWIDTH HYPHEN-MINUS
-# Total code points: 27
+# Total code points: 28
# ================================================
@@ -91,6 +93,7 @@
201F ; Quotation_Mark # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK
2039 ; Quotation_Mark # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK
203A ; Quotation_Mark # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+2E42 ; Quotation_Mark # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK
300C ; Quotation_Mark # Ps LEFT CORNER BRACKET
300D ; Quotation_Mark # Pe RIGHT CORNER BRACKET
300E ; Quotation_Mark # Ps LEFT WHITE CORNER BRACKET
@@ -106,7 +109,7 @@
FF62 ; Quotation_Mark # Ps HALFWIDTH LEFT CORNER BRACKET
FF63 ; Quotation_Mark # Pe HALFWIDTH RIGHT CORNER BRACKET
-# Total code points: 29
+# Total code points: 30
# ================================================
@@ -136,6 +139,7 @@
1361..1368 ; Terminal_Punctuation # Po [8] ETHIOPIC WORDSPACE..ETHIOPIC PARAGRAPH SEPARATOR
166D..166E ; Terminal_Punctuation # Po [2] CANADIAN SYLLABICS CHI SIGN..CANADIAN SYLLABICS FULL STOP
16EB..16ED ; Terminal_Punctuation # Po [3] RUNIC SINGLE PUNCTUATION..RUNIC CROSS PUNCTUATION
+1735..1736 ; Terminal_Punctuation # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
17D4..17D6 ; Terminal_Punctuation # Po [3] KHMER SIGN KHAN..KHMER SIGN CAMNUC PII KUUH
17DA ; Terminal_Punctuation # Po KHMER SIGN KOOMUUT
1802..1805 ; Terminal_Punctuation # Po [4] MONGOLIAN COMMA..MONGOLIAN FOUR DOTS
@@ -149,6 +153,8 @@
203C..203D ; Terminal_Punctuation # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG
2047..2049 ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK
2E2E ; Terminal_Punctuation # Po REVERSED QUESTION MARK
+2E3C ; Terminal_Punctuation # Po STENOGRAPHIC FULL STOP
+2E41 ; Terminal_Punctuation # Po REVERSED COMMA
3001..3002 ; Terminal_Punctuation # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP
A4FE..A4FF ; Terminal_Punctuation # Po [2] LISU PUNCTUATION COMMA..LISU PUNCTUATION FULL STOP
A60D..A60F ; Terminal_Punctuation # Po [3] VAI COMMA..VAI QUESTION MARK
@@ -174,14 +180,27 @@
103D0 ; Terminal_Punctuation # Po OLD PERSIAN WORD DIVIDER
10857 ; Terminal_Punctuation # Po IMPERIAL ARAMAIC SECTION SIGN
1091F ; Terminal_Punctuation # Po PHOENICIAN WORD SEPARATOR
+10A56..10A57 ; Terminal_Punctuation # Po [2] KHAROSHTHI PUNCTUATION DANDA..KHAROSHTHI PUNCTUATION DOUBLE DANDA
+10AF0..10AF5 ; Terminal_Punctuation # Po [6] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION TWO DOTS
10B3A..10B3F ; Terminal_Punctuation # Po [6] TINY TWO DOTS OVER ONE DOT PUNCTUATION..LARGE ONE RING OVER TWO RINGS PUNCTUATION
+10B99..10B9C ; Terminal_Punctuation # Po [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT
11047..1104D ; Terminal_Punctuation # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS
110BE..110C1 ; Terminal_Punctuation # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA
11141..11143 ; Terminal_Punctuation # Po [3] CHAKMA DANDA..CHAKMA QUESTION MARK
111C5..111C6 ; Terminal_Punctuation # Po [2] SHARADA DANDA..SHARADA DOUBLE DANDA
-12470..12473 ; Terminal_Punctuation # Po [4] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON
+111CD ; Terminal_Punctuation # Po SHARADA SUTRA MARK
+11238..1123C ; Terminal_Punctuation # Po [5] KHOJKI DANDA..KHOJKI DOUBLE SECTION MARK
+115C2..115C5 ; Terminal_Punctuation # Po [4] SIDDHAM DANDA..SIDDHAM SEPARATOR BAR
+115C9 ; Terminal_Punctuation # Po SIDDHAM END OF TEXT MARK
+11641..11642 ; Terminal_Punctuation # Po [2] MODI DANDA..MODI DOUBLE DANDA
+12470..12474 ; Terminal_Punctuation # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON
+16A6E..16A6F ; Terminal_Punctuation # Po [2] MRO DANDA..MRO DOUBLE DANDA
+16AF5 ; Terminal_Punctuation # Po BASSA VAH FULL STOP
+16B37..16B39 ; Terminal_Punctuation # Po [3] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN CIM CHEEM
+16B44 ; Terminal_Punctuation # Po PAHAWH HMONG SIGN XAUS
+1BC9F ; Terminal_Punctuation # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
-# Total code points: 176
+# Total code points: 214
# ================================================
@@ -230,6 +249,10 @@
21D5..21DB ; Other_Math # So [7] UP DOWN DOUBLE ARROW..RIGHTWARDS TRIPLE ARROW
21DD ; Other_Math # So RIGHTWARDS SQUIGGLE ARROW
21E4..21E5 ; Other_Math # So [2] LEFTWARDS ARROW TO BAR..RIGHTWARDS ARROW TO BAR
+2308 ; Other_Math # Ps LEFT CEILING
+2309 ; Other_Math # Pe RIGHT CEILING
+230A ; Other_Math # Ps LEFT FLOOR
+230B ; Other_Math # Pe RIGHT FLOOR
23B4..23B5 ; Other_Math # So [2] TOP SQUARE BRACKET..BOTTOM SQUARE BRACKET
23B7 ; Other_Math # So RADICAL SYMBOL BOTTOM
23D0 ; Other_Math # So VERTICAL LINE EXTENSION
@@ -358,7 +381,7 @@
1EEA5..1EEA9 ; Other_Math # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
1EEAB..1EEBB ; Other_Math # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
-# Total code points: 1358
+# Total code points: 1362
# ================================================
@@ -403,8 +426,7 @@
0825..0827 ; Other_Alphabetic # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
0829..082C ; Other_Alphabetic # Mn [4] SAMARITAN VOWEL SIGN LONG I..SAMARITAN VOWEL SIGN SUKUN
08E4..08E9 ; Other_Alphabetic # Mn [6] ARABIC CURLY FATHA..ARABIC CURLY KASRATAN
-08F0..08FE ; Other_Alphabetic # Mn [15] ARABIC OPEN FATHATAN..ARABIC DAMMA WITH DOT
-0900..0902 ; Other_Alphabetic # Mn [3] DEVANAGARI SIGN INVERTED CANDRABINDU..DEVANAGARI SIGN ANUSVARA
+08F0..0902 ; Other_Alphabetic # Mn [19] ARABIC OPEN FATHATAN..DEVANAGARI SIGN ANUSVARA
0903 ; Other_Alphabetic # Mc DEVANAGARI SIGN VISARGA
093A ; Other_Alphabetic # Mn DEVANAGARI VOWEL SIGN OE
093B ; Other_Alphabetic # Mc DEVANAGARI VOWEL SIGN OOE
@@ -457,6 +479,7 @@
0BC6..0BC8 ; Other_Alphabetic # Mc [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI
0BCA..0BCC ; Other_Alphabetic # Mc [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU
0BD7 ; Other_Alphabetic # Mc TAMIL AU LENGTH MARK
+0C00 ; Other_Alphabetic # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE
0C01..0C03 ; Other_Alphabetic # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA
0C3E..0C40 ; Other_Alphabetic # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
0C41..0C44 ; Other_Alphabetic # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR
@@ -464,6 +487,7 @@
0C4A..0C4C ; Other_Alphabetic # Mn [3] TELUGU VOWEL SIGN O..TELUGU VOWEL SIGN AU
0C55..0C56 ; Other_Alphabetic # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK
0C62..0C63 ; Other_Alphabetic # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL
+0C81 ; Other_Alphabetic # Mn KANNADA SIGN CANDRABINDU
0C82..0C83 ; Other_Alphabetic # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA
0CBE ; Other_Alphabetic # Mc KANNADA VOWEL SIGN AA
0CBF ; Other_Alphabetic # Mn KANNADA VOWEL SIGN I
@@ -474,6 +498,7 @@
0CCC ; Other_Alphabetic # Mn KANNADA VOWEL SIGN AU
0CD5..0CD6 ; Other_Alphabetic # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
0CE2..0CE3 ; Other_Alphabetic # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
+0D01 ; Other_Alphabetic # Mn MALAYALAM SIGN CANDRABINDU
0D02..0D03 ; Other_Alphabetic # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D3E..0D40 ; Other_Alphabetic # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II
0D41..0D44 ; Other_Alphabetic # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR
@@ -538,7 +563,8 @@
19B0..19C0 ; Other_Alphabetic # Mc [17] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE VOWEL SIGN IY
19C8..19C9 ; Other_Alphabetic # Mc [2] NEW TAI LUE TONE MARK-1..NEW TAI LUE TONE MARK-2
1A17..1A18 ; Other_Alphabetic # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U
-1A19..1A1B ; Other_Alphabetic # Mc [3] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN AE
+1A19..1A1A ; Other_Alphabetic # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O
+1A1B ; Other_Alphabetic # Mn BUGINESE VOWEL SIGN AE
1A55 ; Other_Alphabetic # Mc TAI THAM CONSONANT SIGN MEDIAL RA
1A56 ; Other_Alphabetic # Mn TAI THAM CONSONANT SIGN MEDIAL LA
1A57 ; Other_Alphabetic # Mc TAI THAM CONSONANT SIGN LA TANG LAI
@@ -564,7 +590,7 @@
1BA2..1BA5 ; Other_Alphabetic # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU
1BA6..1BA7 ; Other_Alphabetic # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG
1BA8..1BA9 ; Other_Alphabetic # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG
-1BAC..1BAD ; Other_Alphabetic # Mc [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA
+1BAC..1BAD ; Other_Alphabetic # Mn [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA
1BE7 ; Other_Alphabetic # Mc BATAK VOWEL SIGN E
1BE8..1BE9 ; Other_Alphabetic # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE
1BEA..1BEC ; Other_Alphabetic # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O
@@ -575,6 +601,7 @@
1C2C..1C33 ; Other_Alphabetic # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T
1C34..1C35 ; Other_Alphabetic # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG
1CF2..1CF3 ; Other_Alphabetic # Mc [2] VEDIC SIGN ARDHAVISARGA..VEDIC SIGN ROTATED ARDHAVISARGA
+1DE7..1DF4 ; Other_Alphabetic # Mn [14] COMBINING LATIN SMALL LETTER ALPHA..COMBINING LATIN SMALL LETTER U WITH DIAERESIS
24B6..24E9 ; Other_Alphabetic # So [52] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN SMALL LETTER Z
2DE0..2DFF ; Other_Alphabetic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS
A674..A67B ; Other_Alphabetic # Mn [8] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC LETTER OMEGA
@@ -616,6 +643,7 @@
ABE8 ; Other_Alphabetic # Mn MEETEI MAYEK VOWEL SIGN UNAP
ABE9..ABEA ; Other_Alphabetic # Mc [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG
FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA
+10376..1037A ; Other_Alphabetic # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII
10A01..10A03 ; Other_Alphabetic # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R
10A05..10A06 ; Other_Alphabetic # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O
10A0C..10A0F ; Other_Alphabetic # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA
@@ -636,14 +664,54 @@
111B3..111B5 ; Other_Alphabetic # Mc [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II
111B6..111BE ; Other_Alphabetic # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O
111BF ; Other_Alphabetic # Mc SHARADA VOWEL SIGN AU
+1122C..1122E ; Other_Alphabetic # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II
+1122F..11231 ; Other_Alphabetic # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI
+11232..11233 ; Other_Alphabetic # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU
+11234 ; Other_Alphabetic # Mn KHOJKI SIGN ANUSVARA
+11237 ; Other_Alphabetic # Mn KHOJKI SIGN SHADDA
+112DF ; Other_Alphabetic # Mn KHUDAWADI SIGN ANUSVARA
+112E0..112E2 ; Other_Alphabetic # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
+112E3..112E8 ; Other_Alphabetic # Mn [6] KHUDAWADI VOWEL SIGN U..KHUDAWADI VOWEL SIGN AU
+11301 ; Other_Alphabetic # Mn GRANTHA SIGN CANDRABINDU
+11302..11303 ; Other_Alphabetic # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA
+1133E..1133F ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I
+11340 ; Other_Alphabetic # Mn GRANTHA VOWEL SIGN II
+11341..11344 ; Other_Alphabetic # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR
+11347..11348 ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI
+1134B..1134C ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN OO..GRANTHA VOWEL SIGN AU
+11357 ; Other_Alphabetic # Mc GRANTHA AU LENGTH MARK
+11362..11363 ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL
+114B0..114B2 ; Other_Alphabetic # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II
+114B3..114B8 ; Other_Alphabetic # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL
+114B9 ; Other_Alphabetic # Mc TIRHUTA VOWEL SIGN E
+114BA ; Other_Alphabetic # Mn TIRHUTA VOWEL SIGN SHORT E
+114BB..114BE ; Other_Alphabetic # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU
+114BF..114C0 ; Other_Alphabetic # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA
+114C1 ; Other_Alphabetic # Mc TIRHUTA SIGN VISARGA
+115AF..115B1 ; Other_Alphabetic # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II
+115B2..115B5 ; Other_Alphabetic # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR
+115B8..115BB ; Other_Alphabetic # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU
+115BC..115BD ; Other_Alphabetic # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA
+115BE ; Other_Alphabetic # Mc SIDDHAM SIGN VISARGA
+11630..11632 ; Other_Alphabetic # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II
+11633..1163A ; Other_Alphabetic # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI
+1163B..1163C ; Other_Alphabetic # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU
+1163D ; Other_Alphabetic # Mn MODI SIGN ANUSVARA
+1163E ; Other_Alphabetic # Mc MODI SIGN VISARGA
+11640 ; Other_Alphabetic # Mn MODI SIGN ARDHACANDRA
116AB ; Other_Alphabetic # Mn TAKRI SIGN ANUSVARA
116AC ; Other_Alphabetic # Mc TAKRI SIGN VISARGA
116AD ; Other_Alphabetic # Mn TAKRI VOWEL SIGN AA
116AE..116AF ; Other_Alphabetic # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II
116B0..116B5 ; Other_Alphabetic # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU
+16B30..16B36 ; Other_Alphabetic # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
16F51..16F7E ; Other_Alphabetic # Mc [46] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN NG
+1BC9E ; Other_Alphabetic # Mn DUPLOYAN DOUBLE MARK
+1F130..1F149 ; Other_Alphabetic # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z
+1F150..1F169 ; Other_Alphabetic # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
+1F170..1F189 ; Other_Alphabetic # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
-# Total code points: 922
+# Total code points: 1116
# ================================================
@@ -746,6 +814,7 @@
1939..193B ; Diacritic # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I
1A75..1A7C ; Diacritic # Mn [8] TAI THAM SIGN TONE-1..TAI THAM SIGN KHUEN-LUE KARAN
1A7F ; Diacritic # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT
+1AB0..1ABD ; Diacritic # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
1B34 ; Diacritic # Mn BALINESE SIGN REREKAN
1B44 ; Diacritic # Mc BALINESE ADEG ADEG
1B6B..1B73 ; Diacritic # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG
@@ -760,8 +829,10 @@
1CE2..1CE8 ; Diacritic # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
1CED ; Diacritic # Mn VEDIC SIGN TIRYAK
1CF4 ; Diacritic # Mn VEDIC TONE CANDRA ABOVE
+1CF8..1CF9 ; Diacritic # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
1D2C..1D6A ; Diacritic # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI
1DC4..1DCF ; Diacritic # Mn [12] COMBINING MACRON-ACUTE..COMBINING ZIGZAG BELOW
+1DF5 ; Diacritic # Mn COMBINING UP TACK ABOVE
1DFD..1DFF ; Diacritic # Mn [3] COMBINING ALMOST EQUAL TO BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
1FBD ; Diacritic # Sk GREEK KORONIS
1FBF..1FC1 ; Diacritic # Sk [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI
@@ -779,6 +850,7 @@
A66F ; Diacritic # Mn COMBINING CYRILLIC VZMET
A67C..A67D ; Diacritic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK
A67F ; Diacritic # Lm CYRILLIC PAYEROK
+A69C..A69D ; Diacritic # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN
A6F0..A6F1 ; Diacritic # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS
A717..A71F ; Diacritic # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK
A720..A721 ; Diacritic # Sk [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE
@@ -791,26 +863,45 @@
A953 ; Diacritic # Mc REJANG VIRAMA
A9B3 ; Diacritic # Mn JAVANESE SIGN CECAK TELU
A9C0 ; Diacritic # Mc JAVANESE PANGKON
+A9E5 ; Diacritic # Mn MYANMAR SIGN SHAN SAW
AA7B ; Diacritic # Mc MYANMAR SIGN PAO KAREN TONE
+AA7C ; Diacritic # Mn MYANMAR SIGN TAI LAING TONE-2
+AA7D ; Diacritic # Mc MYANMAR SIGN TAI LAING TONE-5
AABF ; Diacritic # Mn TAI VIET TONE MAI EK
AAC0 ; Diacritic # Lo TAI VIET TONE MAI NUENG
AAC1 ; Diacritic # Mn TAI VIET TONE MAI THO
AAC2 ; Diacritic # Lo TAI VIET TONE MAI SONG
AAF6 ; Diacritic # Mn MEETEI MAYEK VIRAMA
+AB5B ; Diacritic # Sk MODIFIER BREVE WITH INVERTED BREVE
+AB5C..AB5F ; Diacritic # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
ABEC ; Diacritic # Mc MEETEI MAYEK LUM IYEK
ABED ; Diacritic # Mn MEETEI MAYEK APUN IYEK
FB1E ; Diacritic # Mn HEBREW POINT JUDEO-SPANISH VARIKA
-FE20..FE26 ; Diacritic # Mn [7] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON
+FE20..FE2D ; Diacritic # Mn [14] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON BELOW
FF3E ; Diacritic # Sk FULLWIDTH CIRCUMFLEX ACCENT
FF40 ; Diacritic # Sk FULLWIDTH GRAVE ACCENT
FF70 ; Diacritic # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
FF9E..FF9F ; Diacritic # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
FFE3 ; Diacritic # Sk FULLWIDTH MACRON
+102E0 ; Diacritic # Mn COPTIC EPACT THOUSANDS MARK
+10AE5..10AE6 ; Diacritic # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
110B9..110BA ; Diacritic # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA
11133..11134 ; Diacritic # Mn [2] CHAKMA VIRAMA..CHAKMA MAAYYAA
+11173 ; Diacritic # Mn MAHAJANI SIGN NUKTA
111C0 ; Diacritic # Mc SHARADA SIGN VIRAMA
+11235 ; Diacritic # Mc KHOJKI SIGN VIRAMA
+11236 ; Diacritic # Mn KHOJKI SIGN NUKTA
+112E9..112EA ; Diacritic # Mn [2] KHUDAWADI SIGN NUKTA..KHUDAWADI SIGN VIRAMA
+1133C ; Diacritic # Mn GRANTHA SIGN NUKTA
+1134D ; Diacritic # Mc GRANTHA SIGN VIRAMA
+11366..1136C ; Diacritic # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX
+11370..11374 ; Diacritic # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA
+114C2..114C3 ; Diacritic # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA
+115BF..115C0 ; Diacritic # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA
+1163F ; Diacritic # Mn MODI SIGN VIRAMA
116B6 ; Diacritic # Mc TAKRI SIGN VIRAMA
116B7 ; Diacritic # Mn TAKRI SIGN NUKTA
+16AF0..16AF4 ; Diacritic # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
16F8F..16F92 ; Diacritic # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW
16F93..16F9F ; Diacritic # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8
1D167..1D169 ; Diacritic # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
@@ -818,8 +909,9 @@
1D17B..1D182 ; Diacritic # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE
1D185..1D18B ; Diacritic # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
1D1AA..1D1AD ; Diacritic # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO
+1E8D0..1E8D6 ; Diacritic # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
-# Total code points: 693
+# Total code points: 766
# ================================================
@@ -841,12 +933,16 @@
A015 ; Extender # Lm YI SYLLABLE WU
A60C ; Extender # Lm VAI SYLLABLE LENGTHENER
A9CF ; Extender # Lm JAVANESE PANGRANGKEP
+A9E6 ; Extender # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION
AA70 ; Extender # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION
AADD ; Extender # Lm TAI VIET SYMBOL SAM
AAF3..AAF4 ; Extender # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK
FF70 ; Extender # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
+1135D ; Extender # Lo GRANTHA SIGN PLUTA
+115C6..115C8 ; Extender # Po [3] SIDDHAM REPETITION MARK-1..SIDDHAM REPETITION MARK-3
+16B42..16B43 ; Extender # Lm [2] PAHAWH HMONG SIGN VOS NRUA..PAHAWH HMONG SIGN IB YAM
-# Total code points: 31
+# Total code points: 38
# ================================================
@@ -866,17 +962,22 @@
2170..217F ; Other_Lowercase # Nl [16] SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND
24D0..24E9 ; Other_Lowercase # So [26] CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
2C7C..2C7D ; Other_Lowercase # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V
+A69C..A69D ; Other_Lowercase # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN
A770 ; Other_Lowercase # Lm MODIFIER LETTER US
A7F8..A7F9 ; Other_Lowercase # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
+AB5C..AB5F ; Other_Lowercase # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
-# Total code points: 183
+# Total code points: 189
# ================================================
2160..216F ; Other_Uppercase # Nl [16] ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND
24B6..24CF ; Other_Uppercase # So [26] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z
+1F130..1F149 ; Other_Uppercase # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z
+1F150..1F169 ; Other_Uppercase # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
+1F170..1F189 ; Other_Uppercase # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
-# Total code points: 42
+# Total code points: 120
# ================================================
@@ -918,10 +1019,15 @@
200C..200D ; Other_Grapheme_Extend # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
302E..302F ; Other_Grapheme_Extend # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK
FF9E..FF9F ; Other_Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
+1133E ; Other_Grapheme_Extend # Mc GRANTHA VOWEL SIGN AA
+11357 ; Other_Grapheme_Extend # Mc GRANTHA AU LENGTH MARK
+114B0 ; Other_Grapheme_Extend # Mc TIRHUTA VOWEL SIGN AA
+114BD ; Other_Grapheme_Extend # Mc TIRHUTA VOWEL SIGN SHORT O
+115AF ; Other_Grapheme_Extend # Mc SIDDHAM VOWEL SIGN AA
1D165 ; Other_Grapheme_Extend # Mc MUSICAL SYMBOL COMBINING STEM
1D16E..1D172 ; Other_Grapheme_Extend # Mc [5] MUSICAL SYMBOL COMBINING FLAG-1..MUSICAL SYMBOL COMBINING FLAG-5
-# Total code points: 25
+# Total code points: 30
# ================================================
@@ -966,7 +1072,7 @@
034F ; Other_Default_Ignorable_Code_Point # Mn COMBINING GRAPHEME JOINER
115F..1160 ; Other_Default_Ignorable_Code_Point # Lo [2] HANGUL CHOSEONG FILLER..HANGUL JUNGSEONG FILLER
17B4..17B5 ; Other_Default_Ignorable_Code_Point # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
-2065..2069 ; Other_Default_Ignorable_Code_Point # Cn [5] <reserved-2065>..<reserved-2069>
+2065 ; Other_Default_Ignorable_Code_Point # Cn <reserved-2065>
3164 ; Other_Default_Ignorable_Code_Point # Lo HANGUL FILLER
FFA0 ; Other_Default_Ignorable_Code_Point # Lo HALFWIDTH HANGUL FILLER
FFF0..FFF8 ; Other_Default_Ignorable_Code_Point # Cn [9] <reserved-FFF0>..<reserved-FFF8>
@@ -975,7 +1081,7 @@
E0080..E00FF ; Other_Default_Ignorable_Code_Point # Cn [128] <reserved-E0080>..<reserved-E00FF>
E01F0..E0FFF ; Other_Default_Ignorable_Code_Point # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
-# Total code points: 3780
+# Total code points: 3776
# ================================================
@@ -1060,8 +1166,6 @@
0021 ; STerm # Po EXCLAMATION MARK
002E ; STerm # Po FULL STOP
003F ; STerm # Po QUESTION MARK
-055C ; STerm # Po ARMENIAN EXCLAMATION MARK
-055E ; STerm # Po ARMENIAN QUESTION MARK
0589 ; STerm # Po ARMENIAN FULL STOP
061F ; STerm # Po ARABIC QUESTION MARK
06D4 ; STerm # Po ARABIC FULL STOP
@@ -1084,6 +1188,7 @@
203C..203D ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG
2047..2049 ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK
2E2E ; STerm # Po REVERSED QUESTION MARK
+2E3C ; STerm # Po STENOGRAPHIC FULL STOP
3002 ; STerm # Po IDEOGRAPHIC FULL STOP
A4FF ; STerm # Po LISU PUNCTUATION FULL STOP
A60E..A60F ; STerm # Po [2] VAI FULL STOP..VAI QUESTION MARK
@@ -1107,8 +1212,19 @@
110BE..110C1 ; STerm # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA
11141..11143 ; STerm # Po [3] CHAKMA DANDA..CHAKMA QUESTION MARK
111C5..111C6 ; STerm # Po [2] SHARADA DANDA..SHARADA DOUBLE DANDA
+111CD ; STerm # Po SHARADA SUTRA MARK
+11238..11239 ; STerm # Po [2] KHOJKI DANDA..KHOJKI DOUBLE DANDA
+1123B..1123C ; STerm # Po [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK
+115C2..115C3 ; STerm # Po [2] SIDDHAM DANDA..SIDDHAM DOUBLE DANDA
+115C9 ; STerm # Po SIDDHAM END OF TEXT MARK
+11641..11642 ; STerm # Po [2] MODI DANDA..MODI DOUBLE DANDA
+16A6E..16A6F ; STerm # Po [2] MRO DANDA..MRO DOUBLE DANDA
+16AF5 ; STerm # Po BASSA VAH FULL STOP
+16B37..16B38 ; STerm # Po [2] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS TSHAB CEEB
+16B44 ; STerm # Po PAHAWH HMONG SIGN XAUS
+1BC9F ; STerm # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
-# Total code points: 83
+# Total code points: 99
# ================================================
@@ -1210,7 +1326,10 @@
21D5..21F3 ; Pattern_Syntax # So [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW
21F4..22FF ; Pattern_Syntax # Sm [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION BAG MEMBERSHIP
2300..2307 ; Pattern_Syntax # So [8] DIAMETER SIGN..WAVY LINE
-2308..230B ; Pattern_Syntax # Sm [4] LEFT CEILING..RIGHT FLOOR
+2308 ; Pattern_Syntax # Ps LEFT CEILING
+2309 ; Pattern_Syntax # Pe RIGHT CEILING
+230A ; Pattern_Syntax # Ps LEFT FLOOR
+230B ; Pattern_Syntax # Pe RIGHT FLOOR
230C..231F ; Pattern_Syntax # So [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER
2320..2321 ; Pattern_Syntax # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL
2322..2328 ; Pattern_Syntax # So [7] FROWN..KEYBOARD
@@ -1222,8 +1341,8 @@
239B..23B3 ; Pattern_Syntax # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM
23B4..23DB ; Pattern_Syntax # So [40] TOP SQUARE BRACKET..FUSE
23DC..23E1 ; Pattern_Syntax # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET
-23E2..23F3 ; Pattern_Syntax # So [18] WHITE TRAPEZIUM..HOURGLASS WITH FLOWING SAND
-23F4..23FF ; Pattern_Syntax # Cn [12] <reserved-23F4>..<reserved-23FF>
+23E2..23FA ; Pattern_Syntax # So [25] WHITE TRAPEZIUM..BLACK CIRCLE FOR RECORD
+23FB..23FF ; Pattern_Syntax # Cn [5] <reserved-23FB>..<reserved-23FF>
2400..2426 ; Pattern_Syntax # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO
2427..243F ; Pattern_Syntax # Cn [25] <reserved-2427>..<reserved-243F>
2440..244A ; Pattern_Syntax # So [11] OCR HOOK..OCR DOUBLE BACKSLASH
@@ -1236,9 +1355,7 @@
25F8..25FF ; Pattern_Syntax # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE
2600..266E ; Pattern_Syntax # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN
266F ; Pattern_Syntax # Sm MUSIC SHARP SIGN
-2670..26FF ; Pattern_Syntax # So [144] WEST SYRIAC CROSS..WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE
-2700 ; Pattern_Syntax # Cn <reserved-2700>
-2701..2767 ; Pattern_Syntax # So [103] UPPER BLADE SCISSORS..ROTATED FLORAL HEART BULLET
+2670..2767 ; Pattern_Syntax # So [248] WEST SYRIAC CROSS..ROTATED FLORAL HEART BULLET
2768 ; Pattern_Syntax # Ps MEDIUM LEFT PARENTHESIS ORNAMENT
2769 ; Pattern_Syntax # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT
276A ; Pattern_Syntax # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
@@ -1306,9 +1423,16 @@
2B30..2B44 ; Pattern_Syntax # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET
2B45..2B46 ; Pattern_Syntax # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW
2B47..2B4C ; Pattern_Syntax # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR
-2B4D..2B4F ; Pattern_Syntax # Cn [3] <reserved-2B4D>..<reserved-2B4F>
-2B50..2B59 ; Pattern_Syntax # So [10] WHITE MEDIUM STAR..HEAVY CIRCLED SALTIRE
-2B5A..2BFF ; Pattern_Syntax # Cn [166] <reserved-2B5A>..<reserved-2BFF>
+2B4D..2B73 ; Pattern_Syntax # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR
+2B74..2B75 ; Pattern_Syntax # Cn [2] <reserved-2B74>..<reserved-2B75>
+2B76..2B95 ; Pattern_Syntax # So [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW
+2B96..2B97 ; Pattern_Syntax # Cn [2] <reserved-2B96>..<reserved-2B97>
+2B98..2BB9 ; Pattern_Syntax # So [34] THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD..UP ARROWHEAD IN A RECTANGLE BOX
+2BBA..2BBC ; Pattern_Syntax # Cn [3] <reserved-2BBA>..<reserved-2BBC>
+2BBD..2BC8 ; Pattern_Syntax # So [12] BALLOT BOX WITH LIGHT X..BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED
+2BC9 ; Pattern_Syntax # Cn <reserved-2BC9>
+2BCA..2BD1 ; Pattern_Syntax # So [8] TOP HALF BLACK CIRCLE..UNCERTAINTY SIGN
+2BD2..2BFF ; Pattern_Syntax # Cn [46] <reserved-2BD2>..<reserved-2BFF>
2E00..2E01 ; Pattern_Syntax # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER
2E02 ; Pattern_Syntax # Pi LEFT SUBSTITUTION BRACKET
2E03 ; Pattern_Syntax # Pf RIGHT SUBSTITUTION BRACKET
@@ -1342,7 +1466,11 @@
2E2F ; Pattern_Syntax # Lm VERTICAL TILDE
2E30..2E39 ; Pattern_Syntax # Po [10] RING POINT..TOP HALF SECTION SIGN
2E3A..2E3B ; Pattern_Syntax # Pd [2] TWO-EM DASH..THREE-EM DASH
-2E3C..2E7F ; Pattern_Syntax # Cn [68] <reserved-2E3C>..<reserved-2E7F>
+2E3C..2E3F ; Pattern_Syntax # Po [4] STENOGRAPHIC FULL STOP..CAPITULUM
+2E40 ; Pattern_Syntax # Pd DOUBLE HYPHEN
+2E41 ; Pattern_Syntax # Po REVERSED COMMA
+2E42 ; Pattern_Syntax # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK
+2E43..2E7F ; Pattern_Syntax # Cn [61] <reserved-2E43>..<reserved-2E7F>
3001..3003 ; Pattern_Syntax # Po [3] IDEOGRAPHIC COMMA..DITTO MARK
3008 ; Pattern_Syntax # Ps LEFT ANGLE BRACKET
3009 ; Pattern_Syntax # Pe RIGHT ANGLE BRACKET
@@ -1368,8 +1496,8 @@
301E..301F ; Pattern_Syntax # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
3020 ; Pattern_Syntax # So POSTAL MARK FACE
3030 ; Pattern_Syntax # Pd WAVY DASH
-FD3E ; Pattern_Syntax # Ps ORNATE LEFT PARENTHESIS
-FD3F ; Pattern_Syntax # Pe ORNATE RIGHT PARENTHESIS
+FD3E ; Pattern_Syntax # Pe ORNATE LEFT PARENTHESIS
+FD3F ; Pattern_Syntax # Ps ORNATE RIGHT PARENTHESIS
FE45..FE46 ; Pattern_Syntax # Po [2] SESAME DOT..WHITE SESAME DOT
# Total code points: 2760
--- a/jdk/test/java/lang/Character/PropertyValueAliases.txt Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/test/java/lang/Character/PropertyValueAliases.txt Wed Jul 15 11:05:51 2015 +0900
@@ -1,8 +1,8 @@
-# PropertyValueAliases-6.2.0.txt
-# Date: 2012-08-14, 16:05:11 GMT [MD]
+# PropertyValueAliases-7.0.0.txt
+# Date: 2014-05-14, 23:55:16 GMT [MD]
#
# Unicode Character Database
-# Copyright (c) 1991-2012 Unicode, Inc.
+# Copyright (c) 1991-2014 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@@ -32,13 +32,14 @@
#
# Loose matching should be applied to all property names and property values, with
# the exception of String Property values. With loose matching of property names and
-# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property
-# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1".
+# values, the case distinctions, whitespace, hyphens, and '_' are ignored.
+# For Numeric Property values, numeric equivalence is applied: thus "01.00"
+# is equivalent to "1".
#
# NOTE: Property value names are NOT unique across properties. For example:
#
# AL means Arabic Letter for the Bidi_Class property, and
-# AL means Above_Left for the Combining_Class property, and
+# AL means Above_Left for the Canonical_Combining_Class property, and
# AL means Alphabetic for the Line_Break property.
#
# In addition, some property names may be the same as some property value names.
@@ -74,6 +75,8 @@
age; 6.0 ; V6_0
age; 6.1 ; V6_1
age; 6.2 ; V6_2
+age; 6.3 ; V6_3
+age; 7.0 ; V7_0
age; NA ; Unassigned
# Alphabetic (Alpha)
@@ -91,14 +94,18 @@
bc ; EN ; European_Number
bc ; ES ; European_Separator
bc ; ET ; European_Terminator
+bc ; FSI ; First_Strong_Isolate
bc ; L ; Left_To_Right
bc ; LRE ; Left_To_Right_Embedding
+bc ; LRI ; Left_To_Right_Isolate
bc ; LRO ; Left_To_Right_Override
bc ; NSM ; Nonspacing_Mark
bc ; ON ; Other_Neutral
bc ; PDF ; Pop_Directional_Format
+bc ; PDI ; Pop_Directional_Isolate
bc ; R ; Right_To_Left
bc ; RLE ; Right_To_Left_Embedding
+bc ; RLI ; Right_To_Left_Isolate
bc ; RLO ; Right_To_Left_Override
bc ; S ; Segment_Separator
bc ; WS ; White_Space
@@ -117,6 +124,17 @@
# @missing: 0000..10FFFF; Bidi_Mirroring_Glyph; <none>
+# Bidi_Paired_Bracket (bpb)
+
+# @missing: 0000..10FFFF; Bidi_Paired_Bracket; <none>
+
+# Bidi_Paired_Bracket_Type (bpt)
+
+bpt; c ; Close
+bpt; n ; None
+bpt; o ; Open
+# @missing: 0000..10FFFF; Bidi_Paired_Bracket_Type; n
+
# Block (blk)
blk; Aegean_Numbers ; Aegean_Numbers
@@ -138,6 +156,7 @@
blk; Balinese ; Balinese
blk; Bamum ; Bamum
blk; Bamum_Sup ; Bamum_Supplement
+blk; Bassa_Vah ; Bassa_Vah
blk; Batak ; Batak
blk; Bengali ; Bengali
blk; Block_Elements ; Block_Elements
@@ -150,6 +169,7 @@
blk; Buhid ; Buhid
blk; Byzantine_Music ; Byzantine_Musical_Symbols
blk; Carian ; Carian
+blk; Caucasian_Albanian ; Caucasian_Albanian
blk; Chakma ; Chakma
blk; Cham ; Cham
blk; Cherokee ; Cherokee
@@ -168,6 +188,7 @@
blk; Compat_Jamo ; Hangul_Compatibility_Jamo
blk; Control_Pictures ; Control_Pictures
blk; Coptic ; Coptic
+blk; Coptic_Epact_Numbers ; Coptic_Epact_Numbers
blk; Counting_Rod ; Counting_Rod_Numerals
blk; Cuneiform ; Cuneiform
blk; Cuneiform_Numbers ; Cuneiform_Numbers_And_Punctuation
@@ -181,11 +202,14 @@
blk; Devanagari ; Devanagari
blk; Devanagari_Ext ; Devanagari_Extended
blk; Diacriticals ; Combining_Diacritical_Marks
+blk; Diacriticals_Ext ; Combining_Diacritical_Marks_Extended
blk; Diacriticals_For_Symbols ; Combining_Diacritical_Marks_For_Symbols; Combining_Marks_For_Symbols
blk; Diacriticals_Sup ; Combining_Diacritical_Marks_Supplement
blk; Dingbats ; Dingbats
blk; Domino ; Domino_Tiles
+blk; Duployan ; Duployan
blk; Egyptian_Hieroglyphs ; Egyptian_Hieroglyphs
+blk; Elbasan ; Elbasan
blk; Emoticons ; Emoticons
blk; Enclosed_Alphanum ; Enclosed_Alphanumerics
blk; Enclosed_Alphanum_Sup ; Enclosed_Alphanumeric_Supplement
@@ -196,10 +220,12 @@
blk; Ethiopic_Ext_A ; Ethiopic_Extended_A
blk; Ethiopic_Sup ; Ethiopic_Supplement
blk; Geometric_Shapes ; Geometric_Shapes
+blk; Geometric_Shapes_Ext ; Geometric_Shapes_Extended
blk; Georgian ; Georgian
blk; Georgian_Sup ; Georgian_Supplement
blk; Glagolitic ; Glagolitic
blk; Gothic ; Gothic
+blk; Grantha ; Grantha
blk; Greek ; Greek_And_Coptic
blk; Greek_Ext ; Greek_Extended
blk; Gujarati ; Gujarati
@@ -233,6 +259,8 @@
blk; Kharoshthi ; Kharoshthi
blk; Khmer ; Khmer
blk; Khmer_Symbols ; Khmer_Symbols
+blk; Khojki ; Khojki
+blk; Khudawadi ; Khudawadi
blk; Lao ; Lao
blk; Latin_1_Sup ; Latin_1_Supplement ; Latin_1
blk; Latin_Ext_A ; Latin_Extended_A
@@ -240,22 +268,27 @@
blk; Latin_Ext_B ; Latin_Extended_B
blk; Latin_Ext_C ; Latin_Extended_C
blk; Latin_Ext_D ; Latin_Extended_D
+blk; Latin_Ext_E ; Latin_Extended_E
blk; Lepcha ; Lepcha
blk; Letterlike_Symbols ; Letterlike_Symbols
blk; Limbu ; Limbu
+blk; Linear_A ; Linear_A
blk; Linear_B_Ideograms ; Linear_B_Ideograms
blk; Linear_B_Syllabary ; Linear_B_Syllabary
blk; Lisu ; Lisu
blk; Low_Surrogates ; Low_Surrogates
blk; Lycian ; Lycian
blk; Lydian ; Lydian
+blk; Mahajani ; Mahajani
blk; Mahjong ; Mahjong_Tiles
blk; Malayalam ; Malayalam
blk; Mandaic ; Mandaic
+blk; Manichaean ; Manichaean
blk; Math_Alphanum ; Mathematical_Alphanumeric_Symbols
blk; Math_Operators ; Mathematical_Operators
blk; Meetei_Mayek ; Meetei_Mayek
blk; Meetei_Mayek_Ext ; Meetei_Mayek_Extensions
+blk; Mende_Kikakui ; Mende_Kikakui
blk; Meroitic_Cursive ; Meroitic_Cursive
blk; Meroitic_Hieroglyphs ; Meroitic_Hieroglyphs
blk; Miao ; Miao
@@ -265,12 +298,16 @@
blk; Misc_Pictographs ; Miscellaneous_Symbols_And_Pictographs
blk; Misc_Symbols ; Miscellaneous_Symbols
blk; Misc_Technical ; Miscellaneous_Technical
+blk; Modi ; Modi
blk; Modifier_Letters ; Spacing_Modifier_Letters
blk; Modifier_Tone_Letters ; Modifier_Tone_Letters
blk; Mongolian ; Mongolian
+blk; Mro ; Mro
blk; Music ; Musical_Symbols
blk; Myanmar ; Myanmar
blk; Myanmar_Ext_A ; Myanmar_Extended_A
+blk; Myanmar_Ext_B ; Myanmar_Extended_B
+blk; Nabataean ; Nabataean
blk; NB ; No_Block
blk; New_Tai_Lue ; New_Tai_Lue
blk; NKo ; NKo
@@ -279,17 +316,24 @@
blk; Ogham ; Ogham
blk; Ol_Chiki ; Ol_Chiki
blk; Old_Italic ; Old_Italic
+blk; Old_North_Arabian ; Old_North_Arabian
+blk; Old_Permic ; Old_Permic
blk; Old_Persian ; Old_Persian
blk; Old_South_Arabian ; Old_South_Arabian
blk; Old_Turkic ; Old_Turkic
blk; Oriya ; Oriya
+blk; Ornamental_Dingbats ; Ornamental_Dingbats
blk; Osmanya ; Osmanya
+blk; Pahawh_Hmong ; Pahawh_Hmong
+blk; Palmyrene ; Palmyrene
+blk; Pau_Cin_Hau ; Pau_Cin_Hau
blk; Phags_Pa ; Phags_Pa
blk; Phaistos ; Phaistos_Disc
blk; Phoenician ; Phoenician
blk; Phonetic_Ext ; Phonetic_Extensions
blk; Phonetic_Ext_Sup ; Phonetic_Extensions_Supplement
blk; Playing_Cards ; Playing_Cards
+blk; Psalter_Pahlavi ; Psalter_Pahlavi
blk; PUA ; Private_Use_Area ; Private_Use
blk; Punctuation ; General_Punctuation
blk; Rejang ; Rejang
@@ -299,7 +343,10 @@
blk; Saurashtra ; Saurashtra
blk; Sharada ; Sharada
blk; Shavian ; Shavian
+blk; Shorthand_Format_Controls ; Shorthand_Format_Controls
+blk; Siddham ; Siddham
blk; Sinhala ; Sinhala
+blk; Sinhala_Archaic_Numbers ; Sinhala_Archaic_Numbers
blk; Small_Forms ; Small_Form_Variants
blk; Sora_Sompeng ; Sora_Sompeng
blk; Specials ; Specials
@@ -307,6 +354,7 @@
blk; Sundanese_Sup ; Sundanese_Supplement
blk; Sup_Arrows_A ; Supplemental_Arrows_A
blk; Sup_Arrows_B ; Supplemental_Arrows_B
+blk; Sup_Arrows_C ; Supplemental_Arrows_C
blk; Sup_Math_Operators ; Supplemental_Mathematical_Operators
blk; Sup_PUA_A ; Supplementary_Private_Use_Area_A
blk; Sup_PUA_B ; Supplementary_Private_Use_Area_B
@@ -328,6 +376,7 @@
blk; Thai ; Thai
blk; Tibetan ; Tibetan
blk; Tifinagh ; Tifinagh
+blk; Tirhuta ; Tirhuta
blk; Transport_And_Map ; Transport_And_Map_Symbols
blk; UCAS ; Unified_Canadian_Aboriginal_Syllabics; Canadian_Syllabics
blk; UCAS_Ext ; Unified_Canadian_Aboriginal_Syllabics_Extended
@@ -337,6 +386,7 @@
blk; Vertical_Forms ; Vertical_Forms
blk; VS ; Variation_Selectors
blk; VS_Sup ; Variation_Selectors_Supplement
+blk; Warang_Citi ; Warang_Citi
blk; Yi_Radicals ; Yi_Radicals
blk; Yi_Syllables ; Yi_Syllables
blk; Yijing ; Yijing_Hexagram_Symbols
@@ -578,6 +628,7 @@
gc ; Zl ; Line_Separator
gc ; Zp ; Paragraph_Separator
gc ; Zs ; Space_Separator
+# @missing: 0000..10FFFF; General_Category; Unassigned
# Grapheme_Base (Gr_Base)
@@ -662,7 +713,6 @@
InMC; Bottom ; Bottom
InMC; Bottom_And_Right ; Bottom_And_Right
-InMC; Invisible ; Invisible
InMC; Left ; Left
InMC; Left_And_Right ; Left_And_Right
InMC; NA ; NA
@@ -680,17 +730,27 @@
InSC; Avagraha ; Avagraha
InSC; Bindu ; Bindu
+InSC; Brahmi_Joining_Number ; Brahmi_Joining_Number
+InSC; Cantillation_Mark ; Cantillation_Mark
InSC; Consonant ; Consonant
InSC; Consonant_Dead ; Consonant_Dead
InSC; Consonant_Final ; Consonant_Final
InSC; Consonant_Head_Letter ; Consonant_Head_Letter
InSC; Consonant_Medial ; Consonant_Medial
InSC; Consonant_Placeholder ; Consonant_Placeholder
-InSC; Consonant_Repha ; Consonant_Repha
+InSC; Consonant_Preceding_Repha ; Consonant_Preceding_Repha
InSC; Consonant_Subjoined ; Consonant_Subjoined
+InSC; Consonant_Succeeding_Repha ; Consonant_Succeeding_Repha
+InSC; Gemination_Mark ; Gemination_Mark
+InSC; Invisible_Stacker ; Invisible_Stacker
+InSC; Joiner ; Joiner
InSC; Modifying_Letter ; Modifying_Letter
+InSC; Non_Joiner ; Non_Joiner
InSC; Nukta ; Nukta
+InSC; Number ; Number
+InSC; Number_Joiner ; Number_Joiner
InSC; Other ; Other
+InSC; Pure_Killer ; Pure_Killer
InSC; Register_Shifter ; Register_Shifter
InSC; Tone_Letter ; Tone_Letter
InSC; Tone_Mark ; Tone_Mark
@@ -702,7 +762,6 @@
# Jamo_Short_Name (JSN)
-# @missing: 0000..10FFFF; Jamo_Short_Name; <none>
JSN; A ; A
JSN; AE ; AE
JSN; B ; B
@@ -755,6 +814,7 @@
JSN; YI ; YI
JSN; YO ; YO
JSN; YU ; YU
+# @missing: 0000..10FFFF; Jamo_Short_Name; <none>
# Join_Control (Join_C)
@@ -789,6 +849,33 @@
jg ; Knotted_Heh ; Knotted_Heh
jg ; Lam ; Lam
jg ; Lamadh ; Lamadh
+jg ; Manichaean_Aleph ; Manichaean_Aleph
+jg ; Manichaean_Ayin ; Manichaean_Ayin
+jg ; Manichaean_Beth ; Manichaean_Beth
+jg ; Manichaean_Daleth ; Manichaean_Daleth
+jg ; Manichaean_Dhamedh ; Manichaean_Dhamedh
+jg ; Manichaean_Five ; Manichaean_Five
+jg ; Manichaean_Gimel ; Manichaean_Gimel
+jg ; Manichaean_Heth ; Manichaean_Heth
+jg ; Manichaean_Hundred ; Manichaean_Hundred
+jg ; Manichaean_Kaph ; Manichaean_Kaph
+jg ; Manichaean_Lamedh ; Manichaean_Lamedh
+jg ; Manichaean_Mem ; Manichaean_Mem
+jg ; Manichaean_Nun ; Manichaean_Nun
+jg ; Manichaean_One ; Manichaean_One
+jg ; Manichaean_Pe ; Manichaean_Pe
+jg ; Manichaean_Qoph ; Manichaean_Qoph
+jg ; Manichaean_Resh ; Manichaean_Resh
+jg ; Manichaean_Sadhe ; Manichaean_Sadhe
+jg ; Manichaean_Samekh ; Manichaean_Samekh
+jg ; Manichaean_Taw ; Manichaean_Taw
+jg ; Manichaean_Ten ; Manichaean_Ten
+jg ; Manichaean_Teth ; Manichaean_Teth
+jg ; Manichaean_Thamedh ; Manichaean_Thamedh
+jg ; Manichaean_Twenty ; Manichaean_Twenty
+jg ; Manichaean_Waw ; Manichaean_Waw
+jg ; Manichaean_Yodh ; Manichaean_Yodh
+jg ; Manichaean_Zayin ; Manichaean_Zayin
jg ; Meem ; Meem
jg ; Mim ; Mim
jg ; No_Joining_Group ; No_Joining_Group
@@ -806,6 +893,7 @@
jg ; Seen ; Seen
jg ; Semkath ; Semkath
jg ; Shin ; Shin
+jg ; Straight_Waw ; Straight_Waw
jg ; Swash_Kaf ; Swash_Kaf
jg ; Syriac_Waw ; Syriac_Waw
jg ; Tah ; Tah
@@ -884,6 +972,10 @@
Lower; N ; No ; F ; False
Lower; Y ; Yes ; T ; True
+# Lowercase_Mapping (lc)
+
+# @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
+
# Math (Math)
Math; N ; No ; F ; False
@@ -1006,12 +1098,14 @@
# Script (sc)
+sc ; Aghb ; Caucasian_Albanian
sc ; Arab ; Arabic
sc ; Armi ; Imperial_Aramaic
sc ; Armn ; Armenian
sc ; Avst ; Avestan
sc ; Bali ; Balinese
sc ; Bamu ; Bamum
+sc ; Bass ; Bassa_Vah
sc ; Batk ; Batak
sc ; Beng ; Bengali
sc ; Bopo ; Bopomofo
@@ -1029,11 +1123,14 @@
sc ; Cyrl ; Cyrillic
sc ; Deva ; Devanagari
sc ; Dsrt ; Deseret
+sc ; Dupl ; Duployan
sc ; Egyp ; Egyptian_Hieroglyphs
+sc ; Elba ; Elbasan
sc ; Ethi ; Ethiopic
sc ; Geor ; Georgian
sc ; Glag ; Glagolitic
sc ; Goth ; Gothic
+sc ; Gran ; Grantha
sc ; Grek ; Greek
sc ; Gujr ; Gujarati
sc ; Guru ; Gurmukhi
@@ -1042,6 +1139,7 @@
sc ; Hano ; Hanunoo
sc ; Hebr ; Hebrew
sc ; Hira ; Hiragana
+sc ; Hmng ; Pahawh_Hmong
sc ; Hrkt ; Katakana_Or_Hiragana
sc ; Ital ; Old_Italic
sc ; Java ; Javanese
@@ -1049,6 +1147,7 @@
sc ; Kana ; Katakana
sc ; Khar ; Kharoshthi
sc ; Khmr ; Khmer
+sc ; Khoj ; Khojki
sc ; Knda ; Kannada
sc ; Kthi ; Kaithi
sc ; Lana ; Tai_Tham
@@ -1056,25 +1155,37 @@
sc ; Latn ; Latin
sc ; Lepc ; Lepcha
sc ; Limb ; Limbu
+sc ; Lina ; Linear_A
sc ; Linb ; Linear_B
sc ; Lisu ; Lisu
sc ; Lyci ; Lycian
sc ; Lydi ; Lydian
+sc ; Mahj ; Mahajani
sc ; Mand ; Mandaic
+sc ; Mani ; Manichaean
+sc ; Mend ; Mende_Kikakui
sc ; Merc ; Meroitic_Cursive
sc ; Mero ; Meroitic_Hieroglyphs
sc ; Mlym ; Malayalam
+sc ; Modi ; Modi
sc ; Mong ; Mongolian
+sc ; Mroo ; Mro
sc ; Mtei ; Meetei_Mayek
sc ; Mymr ; Myanmar
+sc ; Narb ; Old_North_Arabian
+sc ; Nbat ; Nabataean
sc ; Nkoo ; Nko
sc ; Ogam ; Ogham
sc ; Olck ; Ol_Chiki
sc ; Orkh ; Old_Turkic
sc ; Orya ; Oriya
sc ; Osma ; Osmanya
+sc ; Palm ; Palmyrene
+sc ; Pauc ; Pau_Cin_Hau
+sc ; Perm ; Old_Permic
sc ; Phag ; Phags_Pa
sc ; Phli ; Inscriptional_Pahlavi
+sc ; Phlp ; Psalter_Pahlavi
sc ; Phnx ; Phoenician
sc ; Plrd ; Miao
sc ; Prti ; Inscriptional_Parthian
@@ -1085,6 +1196,8 @@
sc ; Saur ; Saurashtra
sc ; Shaw ; Shavian
sc ; Shrd ; Sharada
+sc ; Sidd ; Siddham
+sc ; Sind ; Khudawadi
sc ; Sinh ; Sinhala
sc ; Sora ; Sora_Sompeng
sc ; Sund ; Sundanese
@@ -1102,8 +1215,10 @@
sc ; Thaa ; Thaana
sc ; Thai ; Thai
sc ; Tibt ; Tibetan
+sc ; Tirh ; Tirhuta
sc ; Ugar ; Ugaritic
sc ; Vaii ; Vai
+sc ; Wara ; Warang_Citi
sc ; Xpeo ; Old_Persian
sc ; Xsux ; Cuneiform
sc ; Yiii ; Yi
@@ -1159,6 +1274,10 @@
Term; N ; No ; F ; False
Term; Y ; Yes ; T ; True
+# Titlecase_Mapping (tc)
+
+# @missing: 0000..10FFFF; Titlecase_Mapping; <code point>
+
# Unicode_1_Name (na1)
# @missing: 0000..10FFFF; Unicode_1_Name; <none>
@@ -1173,6 +1292,10 @@
Upper; N ; No ; F ; False
Upper; Y ; Yes ; T ; True
+# Uppercase_Mapping (uc)
+
+# @missing: 0000..10FFFF; Uppercase_Mapping; <code point>
+
# Variation_Selector (VS)
VS ; N ; No ; F ; False
@@ -1186,9 +1309,11 @@
# Word_Break (WB)
WB ; CR ; CR
+WB ; DQ ; Double_Quote
WB ; EX ; ExtendNumLet
WB ; Extend ; Extend
WB ; FO ; Format
+WB ; HL ; Hebrew_Letter
WB ; KA ; Katakana
WB ; LE ; ALetter
WB ; LF ; LF
@@ -1198,6 +1323,7 @@
WB ; NL ; Newline
WB ; NU ; Numeric
WB ; RI ; Regional_Indicator
+WB ; SQ ; Single_Quote
WB ; XX ; Other
# XID_Continue (XIDC)
--- a/jdk/test/java/lang/Character/Scripts.txt Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/test/java/lang/Character/Scripts.txt Wed Jul 15 11:05:51 2015 +0900
@@ -1,8 +1,8 @@
-# Scripts-6.2.0.txt
-# Date: 2012-06-04, 17:21:29 GMT [MD]
+# Scripts-7.0.0.txt
+# Date: 2014-05-15, 00:11:35 GMT [MD]
#
# Unicode Character Database
-# Copyright (c) 1991-2012 Unicode, Inc.
+# Copyright (c) 1991-2014 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@@ -83,8 +83,10 @@
0385 ; Common # Sk GREEK DIALYTIKA TONOS
0387 ; Common # Po GREEK ANO TELEIA
0589 ; Common # Po ARMENIAN FULL STOP
+0605 ; Common # Cf ARABIC NUMBER MARK ABOVE
060C ; Common # Po ARABIC COMMA
061B ; Common # Po ARABIC SEMICOLON
+061C ; Common # Cf ARABIC LETTER MARK
061F ; Common # Po ARABIC QUESTION MARK
0640 ; Common # Lm ARABIC TATWEEL
0660..0669 ; Common # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
@@ -136,7 +138,7 @@
2055..205E ; Common # Po [10] FLOWER PUNCTUATION MARK..VERTICAL FOUR DOTS
205F ; Common # Zs MEDIUM MATHEMATICAL SPACE
2060..2064 ; Common # Cf [5] WORD JOINER..INVISIBLE PLUS
-206A..206F ; Common # Cf [6] INHIBIT SYMMETRIC SWAPPING..NOMINAL DIGIT SHAPES
+2066..206F ; Common # Cf [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES
2070 ; Common # No SUPERSCRIPT ZERO
2074..2079 ; Common # No [6] SUPERSCRIPT FOUR..SUPERSCRIPT NINE
207A..207C ; Common # Sm [3] SUPERSCRIPT PLUS SIGN..SUPERSCRIPT EQUALS SIGN
@@ -146,7 +148,7 @@
208A..208C ; Common # Sm [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN
208D ; Common # Ps SUBSCRIPT LEFT PARENTHESIS
208E ; Common # Pe SUBSCRIPT RIGHT PARENTHESIS
-20A0..20BA ; Common # Sc [27] EURO-CURRENCY SIGN..TURKISH LIRA SIGN
+20A0..20BD ; Common # Sc [30] EURO-CURRENCY SIGN..RUBLE SIGN
2100..2101 ; Common # So [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT
2102 ; Common # L& DOUBLE-STRUCK CAPITAL C
2103..2106 ; Common # So [4] DEGREE CELSIUS..CADA UNA
@@ -200,7 +202,10 @@
21D5..21F3 ; Common # So [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW
21F4..22FF ; Common # Sm [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION BAG MEMBERSHIP
2300..2307 ; Common # So [8] DIAMETER SIGN..WAVY LINE
-2308..230B ; Common # Sm [4] LEFT CEILING..RIGHT FLOOR
+2308 ; Common # Ps LEFT CEILING
+2309 ; Common # Pe RIGHT CEILING
+230A ; Common # Ps LEFT FLOOR
+230B ; Common # Pe RIGHT FLOOR
230C..231F ; Common # So [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER
2320..2321 ; Common # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL
2322..2328 ; Common # So [7] FROWN..KEYBOARD
@@ -212,7 +217,7 @@
239B..23B3 ; Common # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM
23B4..23DB ; Common # So [40] TOP SQUARE BRACKET..FUSE
23DC..23E1 ; Common # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET
-23E2..23F3 ; Common # So [18] WHITE TRAPEZIUM..HOURGLASS WITH FLOWING SAND
+23E2..23FA ; Common # So [25] WHITE TRAPEZIUM..BLACK CIRCLE FOR RECORD
2400..2426 ; Common # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO
2440..244A ; Common # So [11] OCR HOOK..OCR DOUBLE BACKSLASH
2460..249B ; Common # No [60] CIRCLED DIGIT ONE..NUMBER TWENTY FULL STOP
@@ -226,8 +231,7 @@
25F8..25FF ; Common # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE
2600..266E ; Common # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN
266F ; Common # Sm MUSIC SHARP SIGN
-2670..26FF ; Common # So [144] WEST SYRIAC CROSS..WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE
-2701..2767 ; Common # So [103] UPPER BLADE SCISSORS..ROTATED FLORAL HEART BULLET
+2670..2767 ; Common # So [248] WEST SYRIAC CROSS..ROTATED FLORAL HEART BULLET
2768 ; Common # Ps MEDIUM LEFT PARENTHESIS ORNAMENT
2769 ; Common # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT
276A ; Common # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
@@ -295,7 +299,11 @@
2B30..2B44 ; Common # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET
2B45..2B46 ; Common # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW
2B47..2B4C ; Common # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR
-2B50..2B59 ; Common # So [10] WHITE MEDIUM STAR..HEAVY CIRCLED SALTIRE
+2B4D..2B73 ; Common # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR
+2B76..2B95 ; Common # So [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW
+2B98..2BB9 ; Common # So [34] THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD..UP ARROWHEAD IN A RECTANGLE BOX
+2BBD..2BC8 ; Common # So [12] BALLOT BOX WITH LIGHT X..BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED
+2BCA..2BD1 ; Common # So [8] TOP HALF BLACK CIRCLE..UNCERTAINTY SIGN
2E00..2E01 ; Common # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER
2E02 ; Common # Pi LEFT SUBSTITUTION BRACKET
2E03 ; Common # Pf RIGHT SUBSTITUTION BRACKET
@@ -329,6 +337,10 @@
2E2F ; Common # Lm VERTICAL TILDE
2E30..2E39 ; Common # Po [10] RING POINT..TOP HALF SECTION SIGN
2E3A..2E3B ; Common # Pd [2] TWO-EM DASH..THREE-EM DASH
+2E3C..2E3F ; Common # Po [4] STENOGRAPHIC FULL STOP..CAPITULUM
+2E40 ; Common # Pd DOUBLE HYPHEN
+2E41 ; Common # Po REVERSED COMMA
+2E42 ; Common # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK
2FF0..2FFB ; Common # So [12] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID
3000 ; Common # Zs IDEOGRAPHIC SPACE
3001..3003 ; Common # Po [3] IDEOGRAPHIC COMMA..DITTO MARK
@@ -392,9 +404,11 @@
A836..A837 ; Common # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
A838 ; Common # Sc NORTH INDIC RUPEE MARK
A839 ; Common # So NORTH INDIC QUANTITY MARK
-FD3E ; Common # Ps ORNATE LEFT PARENTHESIS
-FD3F ; Common # Pe ORNATE RIGHT PARENTHESIS
-FDFD ; Common # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
+A92E ; Common # Po KAYAH LI SIGN CWI
+A9CF ; Common # Lm JAVANESE PANGRANGKEP
+AB5B ; Common # Sk MODIFIER BREVE WITH INVERTED BREVE
+FD3E ; Common # Pe ORNATE LEFT PARENTHESIS
+FD3F ; Common # Ps ORNATE RIGHT PARENTHESIS
FE10..FE16 ; Common # Po [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK
FE17 ; Common # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET
FE18 ; Common # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET
@@ -487,6 +501,8 @@
10137..1013F ; Common # So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT
10190..1019B ; Common # So [12] ROMAN SEXTANS SIGN..ROMAN CENTURIAL SIGN
101D0..101FC ; Common # So [45] PHAISTOS DISC SIGN PEDESTRIAN..PHAISTOS DISC SIGN WAVY BAND
+102E1..102FB ; Common # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
+1BCA0..1BCA3 ; Common # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
1D000..1D0F5 ; Common # So [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO
1D100..1D126 ; Common # So [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2
1D129..1D164 ; Common # So [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
@@ -543,10 +559,10 @@
1F000..1F02B ; Common # So [44] MAHJONG TILE EAST WIND..MAHJONG TILE BACK
1F030..1F093 ; Common # So [100] DOMINO TILE HORIZONTAL BACK..DOMINO TILE VERTICAL-06-06
1F0A0..1F0AE ; Common # So [15] PLAYING CARD BACK..PLAYING CARD KING OF SPADES
-1F0B1..1F0BE ; Common # So [14] PLAYING CARD ACE OF HEARTS..PLAYING CARD KING OF HEARTS
+1F0B1..1F0BF ; Common # So [15] PLAYING CARD ACE OF HEARTS..PLAYING CARD RED JOKER
1F0C1..1F0CF ; Common # So [15] PLAYING CARD ACE OF DIAMONDS..PLAYING CARD BLACK JOKER
-1F0D1..1F0DF ; Common # So [15] PLAYING CARD ACE OF CLUBS..PLAYING CARD WHITE JOKER
-1F100..1F10A ; Common # No [11] DIGIT ZERO FULL STOP..DIGIT NINE COMMA
+1F0D1..1F0F5 ; Common # So [37] PLAYING CARD ACE OF CLUBS..PLAYING CARD TRUMP-21
+1F100..1F10C ; Common # No [13] DIGIT ZERO FULL STOP..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO
1F110..1F12E ; Common # So [31] PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED WZ
1F130..1F16B ; Common # So [60] SQUARED LATIN CAPITAL LETTER A..RAISED MD SIGN
1F170..1F19A ; Common # So [43] NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VS
@@ -555,28 +571,29 @@
1F210..1F23A ; Common # So [43] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-55B6
1F240..1F248 ; Common # So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557
1F250..1F251 ; Common # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
-1F300..1F320 ; Common # So [33] CYCLONE..SHOOTING STAR
-1F330..1F335 ; Common # So [6] CHESTNUT..CACTUS
-1F337..1F37C ; Common # So [70] TULIP..BABY BOTTLE
-1F380..1F393 ; Common # So [20] RIBBON..GRADUATION CAP
-1F3A0..1F3C4 ; Common # So [37] CAROUSEL HORSE..SURFER
-1F3C6..1F3CA ; Common # So [5] TROPHY..SWIMMER
-1F3E0..1F3F0 ; Common # So [17] HOUSE BUILDING..EUROPEAN CASTLE
-1F400..1F43E ; Common # So [63] RAT..PAW PRINTS
-1F440 ; Common # So EYES
-1F442..1F4F7 ; Common # So [182] EAR..CAMERA
-1F4F9..1F4FC ; Common # So [4] VIDEO CAMERA..VIDEOCASSETTE
-1F500..1F53D ; Common # So [62] TWISTED RIGHTWARDS ARROWS..DOWN-POINTING SMALL RED TRIANGLE
-1F540..1F543 ; Common # So [4] CIRCLED CROSS POMMEE..NOTCHED LEFT SEMICIRCLE WITH THREE DOTS
-1F550..1F567 ; Common # So [24] CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY
-1F5FB..1F640 ; Common # So [70] MOUNT FUJI..WEARY CAT FACE
-1F645..1F64F ; Common # So [11] FACE WITH NO GOOD GESTURE..PERSON WITH FOLDED HANDS
-1F680..1F6C5 ; Common # So [70] ROCKET..LEFT LUGGAGE
+1F300..1F32C ; Common # So [45] CYCLONE..WIND BLOWING FACE
+1F330..1F37D ; Common # So [78] CHESTNUT..FORK AND KNIFE WITH PLATE
+1F380..1F3CE ; Common # So [79] RIBBON..RACING CAR
+1F3D4..1F3F7 ; Common # So [36] SNOW CAPPED MOUNTAIN..LABEL
+1F400..1F4FE ; Common # So [255] RAT..PORTABLE STEREO
+1F500..1F54A ; Common # So [75] TWISTED RIGHTWARDS ARROWS..DOVE OF PEACE
+1F550..1F579 ; Common # So [42] CLOCK FACE ONE OCLOCK..JOYSTICK
+1F57B..1F5A3 ; Common # So [41] LEFT HAND TELEPHONE RECEIVER..BLACK DOWN POINTING BACKHAND INDEX
+1F5A5..1F642 ; Common # So [158] DESKTOP COMPUTER..SLIGHTLY SMILING FACE
+1F645..1F6CF ; Common # So [139] FACE WITH NO GOOD GESTURE..BED
+1F6E0..1F6EC ; Common # So [13] HAMMER AND WRENCH..AIRPLANE ARRIVING
+1F6F0..1F6F3 ; Common # So [4] SATELLITE..PASSENGER SHIP
1F700..1F773 ; Common # So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
+1F780..1F7D4 ; Common # So [85] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..HEAVY TWELVE POINTED PINWHEEL STAR
+1F800..1F80B ; Common # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
+1F810..1F847 ; Common # So [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW
+1F850..1F859 ; Common # So [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW
+1F860..1F887 ; Common # So [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW
+1F890..1F8AD ; Common # So [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS
E0001 ; Common # Cf LANGUAGE TAG
E0020..E007F ; Common # Cf [96] TAG SPACE..CANCEL TAG
-# Total code points: 6413
+# Total code points: 7129
# ================================================
@@ -618,16 +635,20 @@
A770 ; Latin # Lm MODIFIER LETTER US
A771..A787 ; Latin # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T
A78B..A78E ; Latin # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT
-A790..A793 ; Latin # L& [4] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER C WITH BAR
-A7A0..A7AA ; Latin # L& [11] LATIN CAPITAL LETTER G WITH OBLIQUE STROKE..LATIN CAPITAL LETTER H WITH HOOK
+A790..A7AD ; Latin # L& [30] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER L WITH BELT
+A7B0..A7B1 ; Latin # L& [2] LATIN CAPITAL LETTER TURNED K..LATIN CAPITAL LETTER TURNED T
+A7F7 ; Latin # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I
A7F8..A7F9 ; Latin # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
A7FA ; Latin # L& LATIN LETTER SMALL CAPITAL TURNED M
A7FB..A7FF ; Latin # Lo [5] LATIN EPIGRAPHIC LETTER REVERSED F..LATIN EPIGRAPHIC LETTER ARCHAIC M
+AB30..AB5A ; Latin # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
+AB5C..AB5F ; Latin # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
+AB64 ; Latin # L& LATIN SMALL LETTER INVERTED ALPHA
FB00..FB06 ; Latin # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST
FF21..FF3A ; Latin # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
FF41..FF5A ; Latin # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
-# Total code points: 1272
+# Total code points: 1338
# ================================================
@@ -636,6 +657,7 @@
0376..0377 ; Greek # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
037A ; Greek # Lm GREEK YPOGEGRAMMENI
037B..037D ; Greek # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL
+037F ; Greek # L& GREEK CAPITAL LETTER YOT
0384 ; Greek # Sk GREEK TONOS
0386 ; Greek # L& GREEK CAPITAL LETTER ALPHA WITH TONOS
0388..038A ; Greek # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS
@@ -675,15 +697,18 @@
1FF6..1FFC ; Greek # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
1FFD..1FFE ; Greek # Sk [2] GREEK OXIA..GREEK DASIA
2126 ; Greek # L& OHM SIGN
+AB65 ; Greek # L& GREEK LETTER SMALL CAPITAL OMEGA
10140..10174 ; Greek # Nl [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS
10175..10178 ; Greek # No [4] GREEK ONE HALF SIGN..GREEK THREE QUARTERS SIGN
10179..10189 ; Greek # So [17] GREEK YEAR SIGN..GREEK TRYBLION BASE SIGN
-1018A ; Greek # No GREEK ZERO SIGN
+1018A..1018B ; Greek # No [2] GREEK ZERO SIGN..GREEK ONE QUARTER SIGN
+1018C ; Greek # So GREEK SINUSOID SIGN
+101A0 ; Greek # So GREEK SYMBOL TAU RHO
1D200..1D241 ; Greek # So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
1D242..1D244 ; Greek # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
1D245 ; Greek # So GREEK MUSICAL LEIMMA
-# Total code points: 511
+# Total code points: 516
# ================================================
@@ -692,7 +717,7 @@
0483..0484 ; Cyrillic # Mn [2] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC PALATALIZATION
0487 ; Cyrillic # Mn COMBINING CYRILLIC POKRYTIE
0488..0489 ; Cyrillic # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN
-048A..0527 ; Cyrillic # L& [158] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER SHHA WITH DESCENDER
+048A..052F ; Cyrillic # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER
1D2B ; Cyrillic # L& CYRILLIC LETTER SMALL CAPITAL EL
1D78 ; Cyrillic # Lm MODIFIER LETTER CYRILLIC EN
2DE0..2DFF ; Cyrillic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS
@@ -704,10 +729,11 @@
A674..A67D ; Cyrillic # Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK
A67E ; Cyrillic # Po CYRILLIC KAVYKA
A67F ; Cyrillic # Lm CYRILLIC PAYEROK
-A680..A697 ; Cyrillic # L& [24] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER SHWE
+A680..A69B ; Cyrillic # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O
+A69C..A69D ; Cyrillic # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN
A69F ; Cyrillic # Mn COMBINING CYRILLIC LETTER IOTIFIED E
-# Total code points: 417
+# Total code points: 431
# ================================================
@@ -716,10 +742,11 @@
055A..055F ; Armenian # Po [6] ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION MARK
0561..0587 ; Armenian # L& [39] ARMENIAN SMALL LETTER AYB..ARMENIAN SMALL LIGATURE ECH YIWN
058A ; Armenian # Pd ARMENIAN HYPHEN
+058D..058E ; Armenian # So [2] RIGHT-FACING ARMENIAN ETERNITY SIGN..LEFT-FACING ARMENIAN ETERNITY SIGN
058F ; Armenian # Sc ARMENIAN DRAM SIGN
FB13..FB17 ; Armenian # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH
-# Total code points: 91
+# Total code points: 93
# ================================================
@@ -779,9 +806,8 @@
06FD..06FE ; Arabic # So [2] ARABIC SIGN SINDHI AMPERSAND..ARABIC SIGN SINDHI POSTPOSITION MEN
06FF ; Arabic # Lo ARABIC LETTER HEH WITH INVERTED V
0750..077F ; Arabic # Lo [48] ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS ABOVE
-08A0 ; Arabic # Lo ARABIC LETTER BEH WITH SMALL V BELOW
-08A2..08AC ; Arabic # Lo [11] ARABIC LETTER JEEM WITH TWO DOTS ABOVE..ARABIC LETTER ROHINGYA YEH
-08E4..08FE ; Arabic # Mn [27] ARABIC CURLY FATHA..ARABIC DAMMA WITH DOT
+08A0..08B2 ; Arabic # Lo [19] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER ZAIN WITH INVERTED V ABOVE
+08E4..08FF ; Arabic # Mn [28] ARABIC CURLY FATHA..ARABIC MARK SIDEWAYS NOON GHUNNA
FB50..FBB1 ; Arabic # Lo [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM
FBB2..FBC1 ; Arabic # Sk [16] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL SMALL TAH BELOW
FBD3..FD3D ; Arabic # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM
@@ -789,6 +815,7 @@
FD92..FDC7 ; Arabic # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM
FDF0..FDFB ; Arabic # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU
FDFC ; Arabic # Sc RIAL SIGN
+FDFD ; Arabic # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
FE70..FE74 ; Arabic # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM
FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM
10E60..10E7E ; Arabic # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS
@@ -827,7 +854,7 @@
1EEAB..1EEBB ; Arabic # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
1EEF0..1EEF1 ; Arabic # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL
-# Total code points: 1235
+# Total code points: 1244
# ================================================
@@ -870,17 +897,17 @@
0966..096F ; Devanagari # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
0970 ; Devanagari # Po DEVANAGARI ABBREVIATION SIGN
0971 ; Devanagari # Lm DEVANAGARI SIGN HIGH SPACING DOT
-0972..0977 ; Devanagari # Lo [6] DEVANAGARI LETTER CANDRA A..DEVANAGARI LETTER UUE
-0979..097F ; Devanagari # Lo [7] DEVANAGARI LETTER ZHA..DEVANAGARI LETTER BBA
+0972..097F ; Devanagari # Lo [14] DEVANAGARI LETTER CANDRA A..DEVANAGARI LETTER BBA
A8E0..A8F1 ; Devanagari # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA
A8F2..A8F7 ; Devanagari # Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA
A8F8..A8FA ; Devanagari # Po [3] DEVANAGARI SIGN PUSHPIKA..DEVANAGARI CARET
A8FB ; Devanagari # Lo DEVANAGARI HEADSTROKE
-# Total code points: 151
+# Total code points: 152
# ================================================
+0980 ; Bengali # Lo BENGALI ANJI
0981 ; Bengali # Mn BENGALI SIGN CANDRABINDU
0982..0983 ; Bengali # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA
0985..098C ; Bengali # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L
@@ -908,7 +935,7 @@
09FA ; Bengali # So BENGALI ISSHAR
09FB ; Bengali # Sc BENGALI GANDA MARK
-# Total code points: 92
+# Total code points: 93
# ================================================
@@ -1025,12 +1052,12 @@
# ================================================
+0C00 ; Telugu # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE
0C01..0C03 ; Telugu # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA
0C05..0C0C ; Telugu # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L
0C0E..0C10 ; Telugu # Lo [3] TELUGU LETTER E..TELUGU LETTER AI
0C12..0C28 ; Telugu # Lo [23] TELUGU LETTER O..TELUGU LETTER NA
-0C2A..0C33 ; Telugu # Lo [10] TELUGU LETTER PA..TELUGU LETTER LLA
-0C35..0C39 ; Telugu # Lo [5] TELUGU LETTER VA..TELUGU LETTER HA
+0C2A..0C39 ; Telugu # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA
0C3D ; Telugu # Lo TELUGU SIGN AVAGRAHA
0C3E..0C40 ; Telugu # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
0C41..0C44 ; Telugu # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR
@@ -1044,10 +1071,11 @@
0C78..0C7E ; Telugu # No [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR
0C7F ; Telugu # So TELUGU SIGN TUUMU
-# Total code points: 93
+# Total code points: 95
# ================================================
+0C81 ; Kannada # Mn KANNADA SIGN CANDRABINDU
0C82..0C83 ; Kannada # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA
0C85..0C8C ; Kannada # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L
0C8E..0C90 ; Kannada # Lo [3] KANNADA LETTER E..KANNADA LETTER AI
@@ -1070,10 +1098,11 @@
0CE6..0CEF ; Kannada # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
0CF1..0CF2 ; Kannada # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
-# Total code points: 86
+# Total code points: 87
# ================================================
+0D01 ; Malayalam # Mn MALAYALAM SIGN CANDRABINDU
0D02..0D03 ; Malayalam # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D05..0D0C ; Malayalam # Lo [8] MALAYALAM LETTER A..MALAYALAM LETTER VOCALIC L
0D0E..0D10 ; Malayalam # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI
@@ -1093,7 +1122,7 @@
0D79 ; Malayalam # So MALAYALAM DATE MARK
0D7A..0D7F ; Malayalam # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K
-# Total code points: 98
+# Total code points: 99
# ================================================
@@ -1108,10 +1137,12 @@
0DD2..0DD4 ; Sinhala # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA
0DD6 ; Sinhala # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA
0DD8..0DDF ; Sinhala # Mc [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA
+0DE6..0DEF ; Sinhala # Nd [10] SINHALA LITH DIGIT ZERO..SINHALA LITH DIGIT NINE
0DF2..0DF3 ; Sinhala # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA
0DF4 ; Sinhala # Po SINHALA PUNCTUATION KUNDDALIYA
+111E1..111F4 ; Sinhala # No [20] SINHALA ARCHAIC DIGIT ONE..SINHALA ARCHAIC NUMBER ONE THOUSAND
-# Total code points: 80
+# Total code points: 110
# ================================================
@@ -1234,14 +1265,23 @@
109A..109C ; Myanmar # Mc [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A
109D ; Myanmar # Mn MYANMAR VOWEL SIGN AITON AI
109E..109F ; Myanmar # So [2] MYANMAR SYMBOL SHAN ONE..MYANMAR SYMBOL SHAN EXCLAMATION
+A9E0..A9E4 ; Myanmar # Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA
+A9E5 ; Myanmar # Mn MYANMAR SIGN SHAN SAW
+A9E6 ; Myanmar # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION
+A9E7..A9EF ; Myanmar # Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA
+A9F0..A9F9 ; Myanmar # Nd [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE
+A9FA..A9FE ; Myanmar # Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA
AA60..AA6F ; Myanmar # Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA
AA70 ; Myanmar # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION
AA71..AA76 ; Myanmar # Lo [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM
AA77..AA79 ; Myanmar # So [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO
AA7A ; Myanmar # Lo MYANMAR LETTER AITON RA
AA7B ; Myanmar # Mc MYANMAR SIGN PAO KAREN TONE
+AA7C ; Myanmar # Mn MYANMAR SIGN TAI LAING TONE-2
+AA7D ; Myanmar # Mc MYANMAR SIGN TAI LAING TONE-5
+AA7E..AA7F ; Myanmar # Lo [2] MYANMAR LETTER SHWE PALAUNG CHA..MYANMAR LETTER SHWE PALAUNG SHA
-# Total code points: 188
+# Total code points: 223
# ================================================
@@ -1345,8 +1385,9 @@
16A0..16EA ; Runic # Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X
16EE..16F0 ; Runic # Nl [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL
+16F1..16F8 ; Runic # Lo [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC
-# Total code points: 78
+# Total code points: 86
# ================================================
@@ -1377,7 +1418,7 @@
1806 ; Mongolian # Pd MONGOLIAN TODO SOFT HYPHEN
1807..180A ; Mongolian # Po [4] MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER..MONGOLIAN NIRUGU
180B..180D ; Mongolian # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
-180E ; Mongolian # Zs MONGOLIAN VOWEL SEPARATOR
+180E ; Mongolian # Cf MONGOLIAN VOWEL SEPARATOR
1810..1819 ; Mongolian # Nd [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE
1820..1842 ; Mongolian # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI
1843 ; Mongolian # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN
@@ -1452,10 +1493,10 @@
# ================================================
-10300..1031E ; Old_Italic # Lo [31] OLD ITALIC LETTER A..OLD ITALIC LETTER UU
+10300..1031F ; Old_Italic # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS
10320..10323 ; Old_Italic # No [4] OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL FIFTY
-# Total code points: 35
+# Total code points: 36
# ================================================
@@ -1479,12 +1520,15 @@
064B..0655 ; Inherited # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
0670 ; Inherited # Mn ARABIC LETTER SUPERSCRIPT ALEF
0951..0952 ; Inherited # Mn [2] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI STRESS SIGN ANUDATTA
+1AB0..1ABD ; Inherited # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
+1ABE ; Inherited # Me COMBINING PARENTHESES OVERLAY
1CD0..1CD2 ; Inherited # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA
1CD4..1CE0 ; Inherited # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
1CE2..1CE8 ; Inherited # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
1CED ; Inherited # Mn VEDIC SIGN TIRYAK
1CF4 ; Inherited # Mn VEDIC TONE CANDRA ABOVE
-1DC0..1DE6 ; Inherited # Mn [39] COMBINING DOTTED GRAVE ACCENT..COMBINING LATIN SMALL LETTER Z
+1CF8..1CF9 ; Inherited # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
+1DC0..1DF5 ; Inherited # Mn [54] COMBINING DOTTED GRAVE ACCENT..COMBINING UP TACK ABOVE
1DFC..1DFF ; Inherited # Mn [4] COMBINING DOUBLE INVERTED BREVE BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
200C..200D ; Inherited # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
20D0..20DC ; Inherited # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
@@ -1495,15 +1539,16 @@
302A..302D ; Inherited # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
3099..309A ; Inherited # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
FE00..FE0F ; Inherited # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16
-FE20..FE26 ; Inherited # Mn [7] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON
+FE20..FE2D ; Inherited # Mn [14] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON BELOW
101FD ; Inherited # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE
+102E0 ; Inherited # Mn COPTIC EPACT THOUSANDS MARK
1D167..1D169 ; Inherited # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
1D17B..1D182 ; Inherited # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE
1D185..1D18B ; Inherited # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
1D1AA..1D1AD ; Inherited # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO
E0100..E01EF ; Inherited # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 523
+# Total code points: 563
# ================================================
@@ -1537,7 +1582,7 @@
# ================================================
-1900..191C ; Limbu # Lo [29] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER HA
+1900..191E ; Limbu # Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA
1920..1922 ; Limbu # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
1923..1926 ; Limbu # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU
1927..1928 ; Limbu # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O
@@ -1550,7 +1595,7 @@
1944..1945 ; Limbu # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK
1946..194F ; Limbu # Nd [10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE
-# Total code points: 66
+# Total code points: 68
# ================================================
@@ -1612,7 +1657,8 @@
1A00..1A16 ; Buginese # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA
1A17..1A18 ; Buginese # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U
-1A19..1A1B ; Buginese # Mc [3] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN AE
+1A19..1A1A ; Buginese # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O
+1A1B ; Buginese # Mn BUGINESE VOWEL SIGN AE
1A1E..1A1F ; Buginese # Po [2] BUGINESE PALLAWA..BUGINESE END OF SECTION
# Total code points: 30
@@ -1724,11 +1770,11 @@
# ================================================
-12000..1236E ; Cuneiform # Lo [879] CUNEIFORM SIGN A..CUNEIFORM SIGN ZUM
-12400..12462 ; Cuneiform # Nl [99] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN OLD ASSYRIAN ONE QUARTER
-12470..12473 ; Cuneiform # Po [4] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON
+12000..12398 ; Cuneiform # Lo [921] CUNEIFORM SIGN A..CUNEIFORM SIGN UM TIMES ME
+12400..1246E ; Cuneiform # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
+12470..12474 ; Cuneiform # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON
-# Total code points: 982
+# Total code points: 1037
# ================================================
@@ -1767,8 +1813,7 @@
1BA6..1BA7 ; Sundanese # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG
1BA8..1BA9 ; Sundanese # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG
1BAA ; Sundanese # Mc SUNDANESE SIGN PAMAAEH
-1BAB ; Sundanese # Mn SUNDANESE SIGN VIRAMA
-1BAC..1BAD ; Sundanese # Mc [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA
+1BAB..1BAD ; Sundanese # Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA
1BAE..1BAF ; Sundanese # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA
1BB0..1BB9 ; Sundanese # Nd [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE
1BBA..1BBF ; Sundanese # Lo [6] SUNDANESE AVAGRAHA..SUNDANESE LETTER FINAL M
@@ -1825,9 +1870,9 @@
A900..A909 ; Kayah_Li # Nd [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE
A90A..A925 ; Kayah_Li # Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO
A926..A92D ; Kayah_Li # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU
-A92E..A92F ; Kayah_Li # Po [2] KAYAH LI SIGN CWI..KAYAH LI SIGN SHYA
+A92F ; Kayah_Li # Po KAYAH LI SIGN SHYA
-# Total code points: 48
+# Total code points: 47
# ================================================
@@ -1974,11 +2019,10 @@
A9BC ; Javanese # Mn JAVANESE VOWEL SIGN PEPET
A9BD..A9C0 ; Javanese # Mc [4] JAVANESE CONSONANT SIGN KERET..JAVANESE PANGKON
A9C1..A9CD ; Javanese # Po [13] JAVANESE LEFT RERENGGAN..JAVANESE TURNED PADA PISELEH
-A9CF ; Javanese # Lm JAVANESE PANGRANGKEP
A9D0..A9D9 ; Javanese # Nd [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE
A9DE..A9DF ; Javanese # Po [2] JAVANESE PADA TIRTA TUMETES..JAVANESE PADA ISEN-ISEN
-# Total code points: 91
+# Total code points: 90
# ================================================
@@ -2080,8 +2124,9 @@
11047..1104D ; Brahmi # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS
11052..11065 ; Brahmi # No [20] BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND
11066..1106F ; Brahmi # Nd [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE
+1107F ; Brahmi # Mn BRAHMI NUMBER JOINER
-# Total code points: 108
+# Total code points: 109
# ================================================
@@ -2136,9 +2181,11 @@
111BF..111C0 ; Sharada # Mc [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA
111C1..111C4 ; Sharada # Lo [4] SHARADA SIGN AVAGRAHA..SHARADA OM
111C5..111C8 ; Sharada # Po [4] SHARADA DANDA..SHARADA SEPARATOR
+111CD ; Sharada # Po SHARADA SUTRA MARK
111D0..111D9 ; Sharada # Nd [10] SHARADA DIGIT ZERO..SHARADA DIGIT NINE
+111DA ; Sharada # Lo SHARADA EKAM
-# Total code points: 83
+# Total code points: 85
# ================================================
@@ -2161,4 +2208,244 @@
# Total code points: 66
+# ================================================
+
+10530..10563 ; Caucasian_Albanian # Lo [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW
+1056F ; Caucasian_Albanian # Po CAUCASIAN ALBANIAN CITATION MARK
+
+# Total code points: 53
+
+# ================================================
+
+16AD0..16AED ; Bassa_Vah # Lo [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I
+16AF0..16AF4 ; Bassa_Vah # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
+16AF5 ; Bassa_Vah # Po BASSA VAH FULL STOP
+
+# Total code points: 36
+
+# ================================================
+
+1BC00..1BC6A ; Duployan # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
+1BC70..1BC7C ; Duployan # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK
+1BC80..1BC88 ; Duployan # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL
+1BC90..1BC99 ; Duployan # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW
+1BC9C ; Duployan # So DUPLOYAN SIGN O WITH CROSS
+1BC9D..1BC9E ; Duployan # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK
+1BC9F ; Duployan # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
+
+# Total code points: 143
+
+# ================================================
+
+10500..10527 ; Elbasan # Lo [40] ELBASAN LETTER A..ELBASAN LETTER KHE
+
+# Total code points: 40
+
+# ================================================
+
+11301 ; Grantha # Mn GRANTHA SIGN CANDRABINDU
+11302..11303 ; Grantha # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA
+11305..1130C ; Grantha # Lo [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L
+1130F..11310 ; Grantha # Lo [2] GRANTHA LETTER EE..GRANTHA LETTER AI
+11313..11328 ; Grantha # Lo [22] GRANTHA LETTER OO..GRANTHA LETTER NA
+1132A..11330 ; Grantha # Lo [7] GRANTHA LETTER PA..GRANTHA LETTER RA
+11332..11333 ; Grantha # Lo [2] GRANTHA LETTER LA..GRANTHA LETTER LLA
+11335..11339 ; Grantha # Lo [5] GRANTHA LETTER VA..GRANTHA LETTER HA
+1133C ; Grantha # Mn GRANTHA SIGN NUKTA
+1133D ; Grantha # Lo GRANTHA SIGN AVAGRAHA
+1133E..1133F ; Grantha # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I
+11340 ; Grantha # Mn GRANTHA VOWEL SIGN II
+11341..11344 ; Grantha # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR
+11347..11348 ; Grantha # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI
+1134B..1134D ; Grantha # Mc [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA
+11357 ; Grantha # Mc GRANTHA AU LENGTH MARK
+1135D..11361 ; Grantha # Lo [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL
+11362..11363 ; Grantha # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL
+11366..1136C ; Grantha # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX
+11370..11374 ; Grantha # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA
+
+# Total code points: 83
+
+# ================================================
+
+16B00..16B2F ; Pahawh_Hmong # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU
+16B30..16B36 ; Pahawh_Hmong # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
+16B37..16B3B ; Pahawh_Hmong # Po [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM
+16B3C..16B3F ; Pahawh_Hmong # So [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB
+16B40..16B43 ; Pahawh_Hmong # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM
+16B44 ; Pahawh_Hmong # Po PAHAWH HMONG SIGN XAUS
+16B45 ; Pahawh_Hmong # So PAHAWH HMONG SIGN CIM TSOV ROG
+16B50..16B59 ; Pahawh_Hmong # Nd [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE
+16B5B..16B61 ; Pahawh_Hmong # No [7] PAHAWH HMONG NUMBER TENS..PAHAWH HMONG NUMBER TRILLIONS
+16B63..16B77 ; Pahawh_Hmong # Lo [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS
+16B7D..16B8F ; Pahawh_Hmong # Lo [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ
+
+# Total code points: 127
+
+# ================================================
+
+11200..11211 ; Khojki # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA
+11213..1122B ; Khojki # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA
+1122C..1122E ; Khojki # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II
+1122F..11231 ; Khojki # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI
+11232..11233 ; Khojki # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU
+11234 ; Khojki # Mn KHOJKI SIGN ANUSVARA
+11235 ; Khojki # Mc KHOJKI SIGN VIRAMA
+11236..11237 ; Khojki # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
+11238..1123D ; Khojki # Po [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN
+
+# Total code points: 61
+
+# ================================================
+
+10600..10736 ; Linear_A # Lo [311] LINEAR A SIGN AB001..LINEAR A SIGN A664
+10740..10755 ; Linear_A # Lo [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE
+10760..10767 ; Linear_A # Lo [8] LINEAR A SIGN A800..LINEAR A SIGN A807
+
+# Total code points: 341
+
+# ================================================
+
+11150..11172 ; Mahajani # Lo [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA
+11173 ; Mahajani # Mn MAHAJANI SIGN NUKTA
+11174..11175 ; Mahajani # Po [2] MAHAJANI ABBREVIATION SIGN..MAHAJANI SECTION MARK
+11176 ; Mahajani # Lo MAHAJANI LIGATURE SHRI
+
+# Total code points: 39
+
+# ================================================
+
+10AC0..10AC7 ; Manichaean # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW
+10AC8 ; Manichaean # So MANICHAEAN SIGN UD
+10AC9..10AE4 ; Manichaean # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW
+10AE5..10AE6 ; Manichaean # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
+10AEB..10AEF ; Manichaean # No [5] MANICHAEAN NUMBER ONE..MANICHAEAN NUMBER ONE HUNDRED
+10AF0..10AF6 ; Manichaean # Po [7] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION LINE FILLER
+
+# Total code points: 51
+
+# ================================================
+
+1E800..1E8C4 ; Mende_Kikakui # Lo [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON
+1E8C7..1E8CF ; Mende_Kikakui # No [9] MENDE KIKAKUI DIGIT ONE..MENDE KIKAKUI DIGIT NINE
+1E8D0..1E8D6 ; Mende_Kikakui # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
+
+# Total code points: 213
+
+# ================================================
+
+11600..1162F ; Modi # Lo [48] MODI LETTER A..MODI LETTER LLA
+11630..11632 ; Modi # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II
+11633..1163A ; Modi # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI
+1163B..1163C ; Modi # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU
+1163D ; Modi # Mn MODI SIGN ANUSVARA
+1163E ; Modi # Mc MODI SIGN VISARGA
+1163F..11640 ; Modi # Mn [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA
+11641..11643 ; Modi # Po [3] MODI DANDA..MODI ABBREVIATION SIGN
+11644 ; Modi # Lo MODI SIGN HUVA
+11650..11659 ; Modi # Nd [10] MODI DIGIT ZERO..MODI DIGIT NINE
+
+# Total code points: 79
+
+# ================================================
+
+16A40..16A5E ; Mro # Lo [31] MRO LETTER TA..MRO LETTER TEK
+16A60..16A69 ; Mro # Nd [10] MRO DIGIT ZERO..MRO DIGIT NINE
+16A6E..16A6F ; Mro # Po [2] MRO DANDA..MRO DOUBLE DANDA
+
+# Total code points: 43
+
+# ================================================
+
+10A80..10A9C ; Old_North_Arabian # Lo [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH
+10A9D..10A9F ; Old_North_Arabian # No [3] OLD NORTH ARABIAN NUMBER ONE..OLD NORTH ARABIAN NUMBER TWENTY
+
+# Total code points: 32
+
+# ================================================
+
+10880..1089E ; Nabataean # Lo [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW
+108A7..108AF ; Nabataean # No [9] NABATAEAN NUMBER ONE..NABATAEAN NUMBER ONE HUNDRED
+
+# Total code points: 40
+
+# ================================================
+
+10860..10876 ; Palmyrene # Lo [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW
+10877..10878 ; Palmyrene # So [2] PALMYRENE LEFT-POINTING FLEURON..PALMYRENE RIGHT-POINTING FLEURON
+10879..1087F ; Palmyrene # No [7] PALMYRENE NUMBER ONE..PALMYRENE NUMBER TWENTY
+
+# Total code points: 32
+
+# ================================================
+
+11AC0..11AF8 ; Pau_Cin_Hau # Lo [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL
+
+# Total code points: 57
+
+# ================================================
+
+10350..10375 ; Old_Permic # Lo [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA
+10376..1037A ; Old_Permic # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII
+
+# Total code points: 43
+
+# ================================================
+
+10B80..10B91 ; Psalter_Pahlavi # Lo [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW
+10B99..10B9C ; Psalter_Pahlavi # Po [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT
+10BA9..10BAF ; Psalter_Pahlavi # No [7] PSALTER PAHLAVI NUMBER ONE..PSALTER PAHLAVI NUMBER ONE HUNDRED
+
+# Total code points: 29
+
+# ================================================
+
+11580..115AE ; Siddham # Lo [47] SIDDHAM LETTER A..SIDDHAM LETTER HA
+115AF..115B1 ; Siddham # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II
+115B2..115B5 ; Siddham # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR
+115B8..115BB ; Siddham # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU
+115BC..115BD ; Siddham # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA
+115BE ; Siddham # Mc SIDDHAM SIGN VISARGA
+115BF..115C0 ; Siddham # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA
+115C1..115C9 ; Siddham # Po [9] SIDDHAM SIGN SIDDHAM..SIDDHAM END OF TEXT MARK
+
+# Total code points: 72
+
+# ================================================
+
+112B0..112DE ; Khudawadi # Lo [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA
+112DF ; Khudawadi # Mn KHUDAWADI SIGN ANUSVARA
+112E0..112E2 ; Khudawadi # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
+112E3..112EA ; Khudawadi # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
+112F0..112F9 ; Khudawadi # Nd [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE
+
+# Total code points: 69
+
+# ================================================
+
+11480..114AF ; Tirhuta # Lo [48] TIRHUTA ANJI..TIRHUTA LETTER HA
+114B0..114B2 ; Tirhuta # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II
+114B3..114B8 ; Tirhuta # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL
+114B9 ; Tirhuta # Mc TIRHUTA VOWEL SIGN E
+114BA ; Tirhuta # Mn TIRHUTA VOWEL SIGN SHORT E
+114BB..114BE ; Tirhuta # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU
+114BF..114C0 ; Tirhuta # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA
+114C1 ; Tirhuta # Mc TIRHUTA SIGN VISARGA
+114C2..114C3 ; Tirhuta # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA
+114C4..114C5 ; Tirhuta # Lo [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG
+114C6 ; Tirhuta # Po TIRHUTA ABBREVIATION SIGN
+114C7 ; Tirhuta # Lo TIRHUTA OM
+114D0..114D9 ; Tirhuta # Nd [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE
+
+# Total code points: 82
+
+# ================================================
+
+118A0..118DF ; Warang_Citi # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO
+118E0..118E9 ; Warang_Citi # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE
+118EA..118F2 ; Warang_Citi # No [9] WARANG CITI NUMBER TEN..WARANG CITI NUMBER NINETY
+118FF ; Warang_Citi # Lo WARANG CITI OM
+
+# Total code points: 84
+
# EOF
--- a/jdk/test/java/text/Bidi/BidiConformance.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/test/java/text/Bidi/BidiConformance.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2009, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -23,7 +23,7 @@
/*
* @test
- * @bug 6850113
+ * @bug 6850113 8032446
* @summary confirm the behavior of new Bidi implementation. (Backward compatibility)
*/
@@ -40,6 +40,8 @@
private static boolean verbose = false;
private static boolean abort = false;
+ private static final byte MAX_EXPLICIT_LEVEL = 125;
+
public static void main(String[] args) {
for (int i = 0; i < args.length; i++) {
String arg = args[i];
@@ -368,15 +370,15 @@
AttributedString astr = new AttributedString(paragraph);
astr.addAttribute(TextAttribute.RUN_DIRECTION,
TextAttribute.RUN_DIRECTION_RTL);
- astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(-61),
+ astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(-MAX_EXPLICIT_LEVEL),
start, limit);
try {
bidi = new Bidi(astr.getIterator());
for (int i = start; i < limit; i++) {
- if (bidi.getLevelAt(i) != 61) {
+ if (bidi.getLevelAt(i) != MAX_EXPLICIT_LEVEL) {
errorHandling("Bidi(AttributedCharacterIterator).getLevelAt(" +
i + ") should not be " + bidi.getLevelAt(i) +
- " but 60 when BIDI_EMBEDDING is -61.");
+ " but MAX_EXPLICIT_LEVEL-1 when BIDI_EMBEDDING is -MAX_EXPLICIT_LEVEL.");
}
}
}
@@ -387,14 +389,14 @@
astr = new AttributedString(paragraph);
astr.addAttribute(TextAttribute.RUN_DIRECTION,
TextAttribute.RUN_DIRECTION_RTL);
- astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(-62),
+ astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(-(MAX_EXPLICIT_LEVEL+1)),
start, limit);
try {
bidi = new Bidi(astr.getIterator());
for (int i = start; i < limit; i++) {
if (bidi.getLevelAt(i) != 1) {
errorHandling("Bidi(AttributedCharacterIterator).getLevelAt() " +
- "should be 1 when BIDI_EMBEDDING is -62.");
+ "should be 1 when BIDI_EMBEDDING is -(MAX_EXPLICIT_LEVEL+1).");
}
}
}
@@ -405,14 +407,14 @@
astr = new AttributedString(paragraph);
astr.addAttribute(TextAttribute.RUN_DIRECTION,
TextAttribute.RUN_DIRECTION_RTL);
- astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(60),
+ astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(MAX_EXPLICIT_LEVEL-1),
start, limit);
try {
bidi = new Bidi(astr.getIterator());
for (int i = start; i < limit; i++) {
- if (bidi.getLevelAt(i) != 61) {
+ if (bidi.getLevelAt(i) != MAX_EXPLICIT_LEVEL) {
errorHandling("Bidi(AttributedCharacterIterator).getLevelAt() " +
- "should be 61 when BIDI_EMBEDDING is 60.");
+ "should be MAX_EXPLICIT_LEVEL when BIDI_EMBEDDING is MAX_EXPLICIT_LEVEL-1.");
}
}
}
@@ -423,15 +425,15 @@
astr = new AttributedString(paragraph);
astr.addAttribute(TextAttribute.RUN_DIRECTION,
TextAttribute.RUN_DIRECTION_RTL);
- astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(61),
+ astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(MAX_EXPLICIT_LEVEL),
start, limit);
try {
bidi = new Bidi(astr.getIterator());
for (int i = start; i < limit; i++) {
- if (bidi.getLevelAt(i) != 61) {
+ if (bidi.getLevelAt(i) != MAX_EXPLICIT_LEVEL) {
errorHandling("Bidi(AttributedCharacterIterator).getLevelAt(" +
i + ") should not be " + bidi.getLevelAt(i) +
- " but 61 when BIDI_EMBEDDING is 61.");
+ " but MAX_EXPLICIT_LEVEL when BIDI_EMBEDDING is MAX_EXPLICIT_LEVEL.");
}
}
}
@@ -442,15 +444,15 @@
astr = new AttributedString(paragraph);
astr.addAttribute(TextAttribute.RUN_DIRECTION,
TextAttribute.RUN_DIRECTION_RTL);
- astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(62),
+ astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(MAX_EXPLICIT_LEVEL+1),
start, limit);
try {
bidi = new Bidi(astr.getIterator());
for (int i = start; i < limit; i++) {
if (bidi.getLevelAt(i) != 1) {
- errorHandling("Bidi(AttributedCharacterIterator).getLevelAt()" +
- " should not be " + bidi.getLevelAt(i) +
- " but 1 when BIDI_EMBEDDING is 62.");
+ errorHandling("Bidi(AttributedCharacterIterator).getLevelAt(" +
+ i + ") should not be " + bidi.getLevelAt(i) +
+ " but 1 when BIDI_EMBEDDING is MAX_EXPLICIT_LEVEL+1.");
}
}
}
@@ -536,8 +538,8 @@
}
byte[] actualLevels = new byte[text.length];
- byte[] validEmbeddings1 = {0, -61, -60, -2, -1};
- byte[] expectedLevels1 = {0, 61, 60, 2, 1};
+ byte[] validEmbeddings1 = {0, -MAX_EXPLICIT_LEVEL, -(MAX_EXPLICIT_LEVEL-1), -2, -1};
+ byte[] expectedLevels1 = {0, MAX_EXPLICIT_LEVEL, MAX_EXPLICIT_LEVEL-1, 2, 1};
try {
bidi = new Bidi(text, 0, validEmbeddings1, 0, 5,
Bidi.DIRECTION_LEFT_TO_RIGHT);
@@ -553,11 +555,11 @@
}
catch (Exception e) {
errorHandling("Bidi(char[], ...) should not throw an exception " +
- "when embeddings is valid(-61).");
+ "when embeddings is valid(-MAX_EXPLICIT_LEVEL).");
}
- byte[] validEmbeddings2 = {0, 61, 60, 2, 1};
- byte[] expectedLevels2 = {0, 62, 60, 2, 2};
+ byte[] validEmbeddings2 = {0, MAX_EXPLICIT_LEVEL, MAX_EXPLICIT_LEVEL-1, 2, 1};
+ byte[] expectedLevels2 = {0, MAX_EXPLICIT_LEVEL+1, MAX_EXPLICIT_LEVEL-1, 2, 2};
try {
bidi = new Bidi(text, 0, validEmbeddings2, 0, 5,
Bidi.DIRECTION_LEFT_TO_RIGHT);
@@ -573,35 +575,35 @@
}
catch (Exception e) {
errorHandling("Bidi(char[], ...) should not throw an exception " +
- "when embeddings is valid(61).");
+ "when embeddings is valid(MAX_EXPLICIT_LEVEL).");
}
- byte[] invalidEmbeddings1 = {0, -62, 0, 0, 0};
+ byte[] invalidEmbeddings1 = {0, -(MAX_EXPLICIT_LEVEL+1), 0, 0, 0};
try {
bidi = new Bidi(text, 0, invalidEmbeddings1, 0, 5,
Bidi.DIRECTION_LEFT_TO_RIGHT);
if (bidi.getLevelAt(1) != 0) {
errorHandling("Bidi(char[], ...).getLevelAt(1) should be 0 " +
- "when embeddings[1] is -62.");
+ "when embeddings[1] is -(MAX_EXPLICIT_LEVEL+1).");
}
}
catch (Exception e) {
errorHandling("Bidi(char[], ...) should not throw an exception " +
- "even when embeddings includes -62.");
+ "even when embeddings includes -(MAX_EXPLICIT_LEVEL+1).");
}
- byte[] invalidEmbeddings2 = {0, 62, 0, 0, 0};
+ byte[] invalidEmbeddings2 = {0, MAX_EXPLICIT_LEVEL+1, 0, 0, 0};
try {
bidi = new Bidi(text, 0, invalidEmbeddings2, 0, 5,
Bidi.DIRECTION_LEFT_TO_RIGHT);
if (bidi.getLevelAt(1) != 0) {
errorHandling("Bidi(char[], ...).getLevelAt(1) should be 0 " +
- "when embeddings[1] is 62.");
+ "when embeddings[1] is MAX_EXPLICIT_LEVEL+1.");
}
}
catch (Exception e) {
errorHandling("Bidi(char[], ...) should not throw an exception " +
- "even when embeddings includes 62.");
+ "even when embeddings includes MAX_EXPLICIT_LEVEL+1.");
}
try {
@@ -1595,6 +1597,10 @@
private static final char PDF = '\u202C';
private static final char LRO = '\u202D';
private static final char RLO = '\u202E';
+ private static final char LRI = '\u2066';
+ private static final char RLI = '\u2067';
+ private static final char FSI = '\u2068';
+ private static final char PDI = '\u2069';
/*
* 0x05D0-0x05EA: [R] Hewbrew letters (Strong)
@@ -2002,8 +2008,8 @@
/* For Text #18 */
{" ABC (" + ArabicABC + " " + Arabic123 + ") 123.",
- "0000001111222112220", "0000001111222112220",
- "0000001111222112220", "1222111111222112221"},
+ "0000001111222002220", "0000001111222002220",
+ "0000001111222002220", "1222111111222112221"},
/* For Text #19 */
{" " + HebrewABC + " (ABC 123) " + NKo123 + ".",
@@ -2028,6 +2034,90 @@
PDF,
"22222221111111111111110", "22222221111111111111110",
"22222221111111111111110", "44444443333333333333331"},
+
+ /* For Text #23 */
+ {" ABC (" + Arabic123 + " " + ArabicABC + ") 123.",
+ "0000002221111002220", "0000002221111002220",
+ "0000002221111002220", "1222112221111112221"},
+
+ /* For Text #24 */
+ {" 123 (" + ArabicABC + " " + Arabic123 + ") ABC.",
+ "1222111111222112221", "1222111111222112221",
+ "0000001111222000000", "1222111111222112221"},
+
+ /* For Text #25 */
+ {" 123 (" + Arabic123 + " " + ArabicABC + ") ABC.",
+ "1222112221111112221", "1222112221111112221",
+ "0000002221111000000", "1222112221111112221"},
+
+ /* For Text #26 */
+ {" " + ArabicABC + " (ABC 123) " + Arabic123 + ".",
+ "1111112222222112221", "1111112222222112221",
+ "0111000000000002220", "1111112222222112221"},
+
+ /* For Text #27 */
+ {" " + ArabicABC + " (123 ABC) " + Arabic123 + ".",
+ "1111112221222112221", "1111112221222112221",
+ "0111002220000002220", "1111112221222112221"},
+
+ /* For Text #28 */
+ {" " + Arabic123 + " (ABC 123) " + ArabicABC + ".",
+ "0222000000000001110", "0222000000000001110",
+ "0222000000000001110", "1222112222222111111"},
+
+ /* For Text #29 */
+ {" " + Arabic123 + " (123 ABC) " + ArabicABC + ".",
+ "0222000000000001110", "0222000000000001110",
+ "0222000000000001110", "1222112221222111111"},
+
+ /* For Text #30 */
+ {RLI + "ABC " + ArabicABC + " " + ArabicABC + "." + PDI,
+ "02221111111110", "14443333333331",
+ "02221111111110", "14443333333331"},
+
+ /* For Text #31 */
+ {"ABC abc \"" + RLI + "IJK " + ArabicABC + " " + ArabicABC + PDI +
+ ".\" \"" + RLI + ArabicABC + " " + ArabicABC + PDI + ",\" xyz XYZ.",
+ "0000000000222111111110000001111111000000000000",
+ "0000000000222111111110000001111111000000000000",
+ "0000000000222111111110000001111111000000000000",
+ "2222222222444333333332222223333333222222222221"},
+
+ /* For Text #32 */
+ {ArabicABC + " " + ArabicABC + " '" + LRI + "abc def \"" + RLI +
+ "xyz " + ArabicABC + " " + ArabicABC + PDI + "\"" + PDI + "'?",
+ "111111111122222222224443333333322111",
+ "111111111122222222224443333333322111",
+ "111111100022222222224443333333322000",
+ "111111111122222222224443333333322111"},
+
+ /* For Text #33 */
+ {FSI + Arabic123 + " ABC " + ArabicABC + " " + ArabicABC + "." + PDI,
+ "044422222333333320", "144422222333333321",
+ "044422222333333320", "144422222333333321"},
+
+ /* For Text #34 */
+ {FSI + "123 ABC " + ArabicABC + " " + ArabicABC + "." + PDI,
+ "022222222333333320", "122222222333333321",
+ "022222222333333320", "122222222333333321"},
+
+ /* For Text #35 */
+ {FSI + "123 " + ArabicABC + " ABC " + ArabicABC + "." + PDI,
+ "022211111222111110", "144433333444333331",
+ "022211111222111110", "144433333444333331"},
+
+ /* For Text #36 */
+ {FSI + Arabic123 + " " + ArabicABC + " ABC " + ArabicABC + "." + PDI,
+ "022211111222111110", "144433333444333331",
+ "022211111222111110", "144433333444333331"},
+
+ /* For Text #37 */
+ {FSI + Arabic123 + " 123." + PDI,
+ "0444222220", "1444222221", "0444222220", "1444222221"},
+
+ /* For Text #38 */
+ {FSI + "123 " + Arabic123 + "." + PDI,
+ "0222244420", "1222244421", "0222244420", "1222244421"},
};
/* Golden data for baseIsLeftToRight() results */
@@ -2060,10 +2150,32 @@
{true, true, true, false},
{false, false, true, false},
- /* For Text #20 - $22 */
+ /* For Text #20 - $24 */
+ {true, true, true, false},
{true, true, true, false},
{true, true, true, false},
{true, true, true, false},
+ {false, false, true, false},
+
+ /* For Text #25 - $29 */
+ {false, false, true, false},
+ {false, false, true, false},
+ {false, false, true, false},
+ {true, true, true, false},
+ {true, true, true, false},
+
+ /* For Text #30 - $34 */
+ {true, false, true, false},
+ {true, true, true, false},
+ {false, false, true, false},
+ {true, false, true, false},
+ {true , false, true, false},
+
+ /* For Text #35 - $38 */
+ {true, false, true, false},
+ {true, false, true, false},
+ {true, false, true, false},
+ {true, false, true, false},
};
/* Golden data for isLeftToRight() & isRightToLeft() results */
@@ -2097,7 +2209,29 @@
{{false, false, false, false}, {false, false, false, false}},
{{false, false, false, false}, {false, false, false, false}},
- /* For Text #20 - $22 */
+ /* For Text #20 - $24 */
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+
+ /* For Text #25 - $29 */
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+
+ /* For Text #30 - $34 */
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+
+ /* For Text #35 - $38 */
+ {{false, false, false, false}, {false, false, false, false}},
{{false, false, false, false}, {false, false, false, false}},
{{false, false, false, false}, {false, false, false, false}},
{{false, false, false, false}, {false, false, false, false}},
@@ -2113,8 +2247,13 @@
true, true, true, true, true,
true, true, true, true, true,
- /* For Text #20 - $22 */
- true, true, true,
+ /* For Text #20 - $29 */
+ true, true, true, true, true,
+ true, true, true, true, true,
+
+ /* For Text #30 - $38 */
+ true, true, true, true, true,
+ true, true, true, true,
};
/* --------------------------------------------------------------------- */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/test/java/text/BreakIterator/Bug8032446.java Wed Jul 15 11:05:51 2015 +0900
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+/*
+ * @test
+ * @bug 8032446
+ * @summary Confirm that BreakIterator works as expected with new characters in Unicode 7.
+ */
+
+import java.text.*;
+import java.util.*;
+
+public class Bug8032446 {
+
+ public static void main(String[] args) {
+ boolean err = false;
+
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0x10860; i <= 0x10876; i++) { // Palmyrene Letters
+ sb.append(Character.toChars(i));
+ }
+ sb.append(" ");
+ for (int i = 0x10879; i <= 0x1087D; i++) { // Palmyrene Numbers
+ sb.append(Character.toChars(i));
+ }
+ String s = sb.toString();
+
+ BreakIterator bi = BreakIterator.getWordInstance(Locale.ROOT);
+ bi.setText(s);
+ bi.first();
+
+ if (bi.next() != s.indexOf(' ')) {
+ throw new RuntimeException("Unexpected word breaking.");
+ }
+ }
+
+}
--- a/jdk/test/sun/net/idn/NFS4StringPrep.java Tue Jul 14 16:29:08 2015 -0700
+++ b/jdk/test/sun/net/idn/NFS4StringPrep.java Wed Jul 15 11:05:51 2015 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -32,7 +32,6 @@
import java.io.UnsupportedEncodingException;
import java.text.ParseException;
-import sun.text.normalizer.ICUData;
import sun.net.idn.StringPrep;
import sun.text.normalizer.UCharacterIterator;