740 0xE0000, 0xE007F |
741 0xE0000, 0xE007F |
741 }; |
742 }; |
742 private static final int NONBMP_BLOCK_START = 84; |
743 private static final int NONBMP_BLOCK_START = 84; |
743 |
744 |
744 static protected RangeToken getRange(String name, boolean positive) { |
745 static protected RangeToken getRange(String name, boolean positive) { |
745 if (Token.categories.size() == 0) { |
746 // use local variable for better performance |
746 synchronized (Token.categories) { |
747 Map<String, Token> localCat = Token.categories; |
747 Token[] ranges = new Token[Token.categoryNames.length]; |
748 if (localCat == null) { |
748 for (int i = 0; i < ranges.length; i ++) { |
749 synchronized (lock) { |
749 ranges[i] = Token.createRange(); |
750 localCat = Token.categories; |
750 } |
751 if (localCat == null) { |
751 int type; |
752 Map<String, Token> tmpCat = new HashMap<>(); |
752 for (int i = 0; i < 0x10000; i ++) { |
753 Map<String, Token> tmpCat2 = new HashMap<>(); |
753 type = Character.getType((char)i); |
754 |
754 if (type == Character.START_PUNCTUATION || |
755 Token[] ranges = new Token[Token.categoryNames.length]; |
755 type == Character.END_PUNCTUATION) { |
756 for (int i = 0; i < ranges.length; i ++) { |
756 //build table of Pi values |
757 ranges[i] = Token.createRange(); |
757 if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C || |
758 } |
758 i == 0x201F || i == 0x2039) { |
759 int type; |
759 type = CHAR_INIT_QUOTE; |
760 for (int i = 0; i < 0x10000; i ++) { |
|
761 type = Character.getType((char)i); |
|
762 if (type == Character.START_PUNCTUATION || |
|
763 type == Character.END_PUNCTUATION) { |
|
764 //build table of Pi values |
|
765 if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C || |
|
766 i == 0x201F || i == 0x2039) { |
|
767 type = CHAR_INIT_QUOTE; |
|
768 } |
|
769 //build table of Pf values |
|
770 if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) { |
|
771 type = CHAR_FINAL_QUOTE; |
|
772 } |
760 } |
773 } |
761 //build table of Pf values |
774 ranges[type].addRange(i, i); |
762 if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) { |
775 switch (type) { |
763 type = CHAR_FINAL_QUOTE; |
776 case Character.UPPERCASE_LETTER: |
|
777 case Character.LOWERCASE_LETTER: |
|
778 case Character.TITLECASE_LETTER: |
|
779 case Character.MODIFIER_LETTER: |
|
780 case Character.OTHER_LETTER: |
|
781 type = CHAR_LETTER; |
|
782 break; |
|
783 case Character.NON_SPACING_MARK: |
|
784 case Character.COMBINING_SPACING_MARK: |
|
785 case Character.ENCLOSING_MARK: |
|
786 type = CHAR_MARK; |
|
787 break; |
|
788 case Character.DECIMAL_DIGIT_NUMBER: |
|
789 case Character.LETTER_NUMBER: |
|
790 case Character.OTHER_NUMBER: |
|
791 type = CHAR_NUMBER; |
|
792 break; |
|
793 case Character.SPACE_SEPARATOR: |
|
794 case Character.LINE_SEPARATOR: |
|
795 case Character.PARAGRAPH_SEPARATOR: |
|
796 type = CHAR_SEPARATOR; |
|
797 break; |
|
798 case Character.CONTROL: |
|
799 case Character.FORMAT: |
|
800 case Character.SURROGATE: |
|
801 case Character.PRIVATE_USE: |
|
802 case Character.UNASSIGNED: |
|
803 type = CHAR_OTHER; |
|
804 break; |
|
805 case Character.CONNECTOR_PUNCTUATION: |
|
806 case Character.DASH_PUNCTUATION: |
|
807 case Character.START_PUNCTUATION: |
|
808 case Character.END_PUNCTUATION: |
|
809 case CHAR_INIT_QUOTE: |
|
810 case CHAR_FINAL_QUOTE: |
|
811 case Character.OTHER_PUNCTUATION: |
|
812 type = CHAR_PUNCTUATION; |
|
813 break; |
|
814 case Character.MATH_SYMBOL: |
|
815 case Character.CURRENCY_SYMBOL: |
|
816 case Character.MODIFIER_SYMBOL: |
|
817 case Character.OTHER_SYMBOL: |
|
818 type = CHAR_SYMBOL; |
|
819 break; |
|
820 default: |
|
821 throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type); |
|
822 } |
|
823 ranges[type].addRange(i, i); |
|
824 } // for all characters |
|
825 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX); |
|
826 |
|
827 for (int i = 0; i < ranges.length; i ++) { |
|
828 if (Token.categoryNames[i] != null) { |
|
829 if (i == Character.UNASSIGNED) { // Unassigned |
|
830 ranges[i].addRange(0x10000, Token.UTF16_MAX); |
|
831 } |
|
832 tmpCat.put(Token.categoryNames[i], ranges[i]); |
|
833 tmpCat2.put(Token.categoryNames[i], |
|
834 Token.complementRanges(ranges[i])); |
764 } |
835 } |
765 } |
836 } |
766 ranges[type].addRange(i, i); |
837 //REVISIT: do we really need to support block names as in Unicode 3.1 |
767 switch (type) { |
838 // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)? |
768 case Character.UPPERCASE_LETTER: |
839 // |
769 case Character.LOWERCASE_LETTER: |
840 StringBuilder buffer = new StringBuilder(50); |
770 case Character.TITLECASE_LETTER: |
841 for (int i = 0; i < Token.blockNames.length; i ++) { |
771 case Character.MODIFIER_LETTER: |
842 Token r1 = Token.createRange(); |
772 case Character.OTHER_LETTER: |
843 int location; |
773 type = CHAR_LETTER; |
844 if (i < NONBMP_BLOCK_START) { |
774 break; |
845 location = i*2; |
775 case Character.NON_SPACING_MARK: |
846 int rstart = Token.blockRanges.charAt(location); |
776 case Character.COMBINING_SPACING_MARK: |
847 int rend = Token.blockRanges.charAt(location+1); |
777 case Character.ENCLOSING_MARK: |
848 //DEBUGING |
778 type = CHAR_MARK; |
849 //System.out.println(n+" " +Integer.toHexString(rstart) |
779 break; |
850 // +"-"+ Integer.toHexString(rend)); |
780 case Character.DECIMAL_DIGIT_NUMBER: |
851 r1.addRange(rstart, rend); |
781 case Character.LETTER_NUMBER: |
852 } else { |
782 case Character.OTHER_NUMBER: |
853 location = (i - NONBMP_BLOCK_START) * 2; |
783 type = CHAR_NUMBER; |
854 r1.addRange(Token.nonBMPBlockRanges[location], |
784 break; |
855 Token.nonBMPBlockRanges[location + 1]); |
785 case Character.SPACE_SEPARATOR: |
856 } |
786 case Character.LINE_SEPARATOR: |
857 String n = Token.blockNames[i]; |
787 case Character.PARAGRAPH_SEPARATOR: |
858 if (n.equals("Specials")) |
788 type = CHAR_SEPARATOR; |
859 r1.addRange(0xfff0, 0xfffd); |
789 break; |
860 if (n.equals("Private Use")) { |
790 case Character.CONTROL: |
861 r1.addRange(0xF0000,0xFFFFD); |
791 case Character.FORMAT: |
862 r1.addRange(0x100000,0x10FFFD); |
792 case Character.SURROGATE: |
863 } |
793 case Character.PRIVATE_USE: |
864 tmpCat.put(n, r1); |
794 case Character.UNASSIGNED: |
865 tmpCat2.put(n, Token.complementRanges(r1)); |
795 type = CHAR_OTHER; |
866 buffer.setLength(0); |
796 break; |
867 buffer.append("Is"); |
797 case Character.CONNECTOR_PUNCTUATION: |
868 if (n.indexOf(' ') >= 0) { |
798 case Character.DASH_PUNCTUATION: |
869 for (int ci = 0; ci < n.length(); ci ++) |
799 case Character.START_PUNCTUATION: |
870 if (n.charAt(ci) != ' ') buffer.append(n.charAt(ci)); |
800 case Character.END_PUNCTUATION: |
871 } |
801 case CHAR_INIT_QUOTE: |
872 else { |
802 case CHAR_FINAL_QUOTE: |
873 buffer.append(n); |
803 case Character.OTHER_PUNCTUATION: |
874 } |
804 type = CHAR_PUNCTUATION; |
875 Token.setAlias(tmpCat, tmpCat2, buffer.toString(), n, true); |
805 break; |
|
806 case Character.MATH_SYMBOL: |
|
807 case Character.CURRENCY_SYMBOL: |
|
808 case Character.MODIFIER_SYMBOL: |
|
809 case Character.OTHER_SYMBOL: |
|
810 type = CHAR_SYMBOL; |
|
811 break; |
|
812 default: |
|
813 throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type); |
|
814 } |
876 } |
815 ranges[type].addRange(i, i); |
877 |
816 } // for all characters |
878 // TR#18 1.2 |
817 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX); |
879 Token.setAlias(tmpCat, tmpCat2, "ASSIGNED", "Cn", false); |
818 |
880 Token.setAlias(tmpCat, tmpCat2, "UNASSIGNED", "Cn", true); |
819 for (int i = 0; i < ranges.length; i ++) { |
881 Token all = Token.createRange(); |
820 if (Token.categoryNames[i] != null) { |
882 all.addRange(0, Token.UTF16_MAX); |
821 if (i == Character.UNASSIGNED) { // Unassigned |
883 tmpCat.put("ALL", all); |
822 ranges[i].addRange(0x10000, Token.UTF16_MAX); |
884 tmpCat2.put("ALL", Token.complementRanges(all)); |
823 } |
885 Token.registerNonXS("ASSIGNED"); |
824 Token.categories.put(Token.categoryNames[i], ranges[i]); |
886 Token.registerNonXS("UNASSIGNED"); |
825 Token.categories2.put(Token.categoryNames[i], |
887 Token.registerNonXS("ALL"); |
826 Token.complementRanges(ranges[i])); |
888 |
827 } |
889 Token isalpha = Token.createRange(); |
828 } |
890 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu |
829 //REVISIT: do we really need to support block names as in Unicode 3.1 |
891 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll |
830 // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)? |
892 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo |
831 // |
893 tmpCat.put("IsAlpha", isalpha); |
832 StringBuilder buffer = new StringBuilder(50); |
894 tmpCat2.put("IsAlpha", Token.complementRanges(isalpha)); |
833 for (int i = 0; i < Token.blockNames.length; i ++) { |
895 Token.registerNonXS("IsAlpha"); |
834 Token r1 = Token.createRange(); |
896 |
835 int location; |
897 Token isalnum = Token.createRange(); |
836 if (i < NONBMP_BLOCK_START) { |
898 isalnum.mergeRanges(isalpha); // Lu Ll Lo |
837 location = i*2; |
899 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd |
838 int rstart = Token.blockRanges.charAt(location); |
900 tmpCat.put("IsAlnum", isalnum); |
839 int rend = Token.blockRanges.charAt(location+1); |
901 tmpCat2.put("IsAlnum", Token.complementRanges(isalnum)); |
840 //DEBUGING |
902 Token.registerNonXS("IsAlnum"); |
841 //System.out.println(n+" " +Integer.toHexString(rstart) |
903 |
842 // +"-"+ Integer.toHexString(rend)); |
904 Token isspace = Token.createRange(); |
843 r1.addRange(rstart, rend); |
905 isspace.mergeRanges(Token.token_spaces); |
844 } else { |
906 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z |
845 location = (i - NONBMP_BLOCK_START) * 2; |
907 tmpCat.put("IsSpace", isspace); |
846 r1.addRange(Token.nonBMPBlockRanges[location], |
908 tmpCat2.put("IsSpace", Token.complementRanges(isspace)); |
847 Token.nonBMPBlockRanges[location + 1]); |
909 Token.registerNonXS("IsSpace"); |
848 } |
910 |
849 String n = Token.blockNames[i]; |
911 Token isword = Token.createRange(); |
850 if (n.equals("Specials")) |
912 isword.mergeRanges(isalnum); // Lu Ll Lo Nd |
851 r1.addRange(0xfff0, 0xfffd); |
913 isword.addRange('_', '_'); |
852 if (n.equals("Private Use")) { |
914 tmpCat.put("IsWord", isword); |
853 r1.addRange(0xF0000,0xFFFFD); |
915 tmpCat2.put("IsWord", Token.complementRanges(isword)); |
854 r1.addRange(0x100000,0x10FFFD); |
916 Token.registerNonXS("IsWord"); |
855 } |
917 |
856 Token.categories.put(n, r1); |
918 Token isascii = Token.createRange(); |
857 Token.categories2.put(n, Token.complementRanges(r1)); |
919 isascii.addRange(0, 127); |
858 buffer.setLength(0); |
920 tmpCat.put("IsASCII", isascii); |
859 buffer.append("Is"); |
921 tmpCat2.put("IsASCII", Token.complementRanges(isascii)); |
860 if (n.indexOf(' ') >= 0) { |
922 Token.registerNonXS("IsASCII"); |
861 for (int ci = 0; ci < n.length(); ci ++) |
923 |
862 if (n.charAt(ci) != ' ') buffer.append(n.charAt(ci)); |
924 Token isnotgraph = Token.createRange(); |
863 } |
925 isnotgraph.mergeRanges(ranges[CHAR_OTHER]); |
864 else { |
926 isnotgraph.addRange(' ', ' '); |
865 buffer.append(n); |
927 tmpCat.put("IsGraph", Token.complementRanges(isnotgraph)); |
866 } |
928 tmpCat2.put("IsGraph", isnotgraph); |
867 Token.setAlias(buffer.toString(), n, true); |
929 Token.registerNonXS("IsGraph"); |
868 } |
930 |
869 |
931 Token isxdigit = Token.createRange(); |
870 // TR#18 1.2 |
932 isxdigit.addRange('0', '9'); |
871 Token.setAlias("ASSIGNED", "Cn", false); |
933 isxdigit.addRange('A', 'F'); |
872 Token.setAlias("UNASSIGNED", "Cn", true); |
934 isxdigit.addRange('a', 'f'); |
873 Token all = Token.createRange(); |
935 tmpCat.put("IsXDigit", Token.complementRanges(isxdigit)); |
874 all.addRange(0, Token.UTF16_MAX); |
936 tmpCat2.put("IsXDigit", isxdigit); |
875 Token.categories.put("ALL", all); |
937 Token.registerNonXS("IsXDigit"); |
876 Token.categories2.put("ALL", Token.complementRanges(all)); |
938 |
877 Token.registerNonXS("ASSIGNED"); |
939 Token.setAlias(tmpCat, tmpCat2, "IsDigit", "Nd", true); |
878 Token.registerNonXS("UNASSIGNED"); |
940 Token.setAlias(tmpCat, tmpCat2, "IsUpper", "Lu", true); |
879 Token.registerNonXS("ALL"); |
941 Token.setAlias(tmpCat, tmpCat2, "IsLower", "Ll", true); |
880 |
942 Token.setAlias(tmpCat, tmpCat2, "IsCntrl", "C", true); |
881 Token isalpha = Token.createRange(); |
943 Token.setAlias(tmpCat, tmpCat2, "IsPrint", "C", false); |
882 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu |
944 Token.setAlias(tmpCat, tmpCat2, "IsPunct", "P", true); |
883 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll |
945 Token.registerNonXS("IsDigit"); |
884 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo |
946 Token.registerNonXS("IsUpper"); |
885 Token.categories.put("IsAlpha", isalpha); |
947 Token.registerNonXS("IsLower"); |
886 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha)); |
948 Token.registerNonXS("IsCntrl"); |
887 Token.registerNonXS("IsAlpha"); |
949 Token.registerNonXS("IsPrint"); |
888 |
950 Token.registerNonXS("IsPunct"); |
889 Token isalnum = Token.createRange(); |
951 |
890 isalnum.mergeRanges(isalpha); // Lu Ll Lo |
952 Token.setAlias(tmpCat, tmpCat2, "alpha", "IsAlpha", true); |
891 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd |
953 Token.setAlias(tmpCat, tmpCat2, "alnum", "IsAlnum", true); |
892 Token.categories.put("IsAlnum", isalnum); |
954 Token.setAlias(tmpCat, tmpCat2, "ascii", "IsASCII", true); |
893 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum)); |
955 Token.setAlias(tmpCat, tmpCat2, "cntrl", "IsCntrl", true); |
894 Token.registerNonXS("IsAlnum"); |
956 Token.setAlias(tmpCat, tmpCat2, "digit", "IsDigit", true); |
895 |
957 Token.setAlias(tmpCat, tmpCat2, "graph", "IsGraph", true); |
896 Token isspace = Token.createRange(); |
958 Token.setAlias(tmpCat, tmpCat2, "lower", "IsLower", true); |
897 isspace.mergeRanges(Token.token_spaces); |
959 Token.setAlias(tmpCat, tmpCat2, "print", "IsPrint", true); |
898 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z |
960 Token.setAlias(tmpCat, tmpCat2, "punct", "IsPunct", true); |
899 Token.categories.put("IsSpace", isspace); |
961 Token.setAlias(tmpCat, tmpCat2, "space", "IsSpace", true); |
900 Token.categories2.put("IsSpace", Token.complementRanges(isspace)); |
962 Token.setAlias(tmpCat, tmpCat2, "upper", "IsUpper", true); |
901 Token.registerNonXS("IsSpace"); |
963 Token.setAlias(tmpCat, tmpCat2, "word", "IsWord", true); // Perl extension |
902 |
964 Token.setAlias(tmpCat, tmpCat2, "xdigit", "IsXDigit", true); |
903 Token isword = Token.createRange(); |
965 Token.registerNonXS("alpha"); |
904 isword.mergeRanges(isalnum); // Lu Ll Lo Nd |
966 Token.registerNonXS("alnum"); |
905 isword.addRange('_', '_'); |
967 Token.registerNonXS("ascii"); |
906 Token.categories.put("IsWord", isword); |
968 Token.registerNonXS("cntrl"); |
907 Token.categories2.put("IsWord", Token.complementRanges(isword)); |
969 Token.registerNonXS("digit"); |
908 Token.registerNonXS("IsWord"); |
970 Token.registerNonXS("graph"); |
909 |
971 Token.registerNonXS("lower"); |
910 Token isascii = Token.createRange(); |
972 Token.registerNonXS("print"); |
911 isascii.addRange(0, 127); |
973 Token.registerNonXS("punct"); |
912 Token.categories.put("IsASCII", isascii); |
974 Token.registerNonXS("space"); |
913 Token.categories2.put("IsASCII", Token.complementRanges(isascii)); |
975 Token.registerNonXS("upper"); |
914 Token.registerNonXS("IsASCII"); |
976 Token.registerNonXS("word"); |
915 |
977 Token.registerNonXS("xdigit"); |
916 Token isnotgraph = Token.createRange(); |
978 Token.categories = localCat = Collections.unmodifiableMap(tmpCat); |
917 isnotgraph.mergeRanges(ranges[CHAR_OTHER]); |
979 Token.categories2 = Collections.unmodifiableMap(tmpCat2); |
918 isnotgraph.addRange(' ', ' '); |
980 } // localCat == null |
919 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph)); |
|
920 Token.categories2.put("IsGraph", isnotgraph); |
|
921 Token.registerNonXS("IsGraph"); |
|
922 |
|
923 Token isxdigit = Token.createRange(); |
|
924 isxdigit.addRange('0', '9'); |
|
925 isxdigit.addRange('A', 'F'); |
|
926 isxdigit.addRange('a', 'f'); |
|
927 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit)); |
|
928 Token.categories2.put("IsXDigit", isxdigit); |
|
929 Token.registerNonXS("IsXDigit"); |
|
930 |
|
931 Token.setAlias("IsDigit", "Nd", true); |
|
932 Token.setAlias("IsUpper", "Lu", true); |
|
933 Token.setAlias("IsLower", "Ll", true); |
|
934 Token.setAlias("IsCntrl", "C", true); |
|
935 Token.setAlias("IsPrint", "C", false); |
|
936 Token.setAlias("IsPunct", "P", true); |
|
937 Token.registerNonXS("IsDigit"); |
|
938 Token.registerNonXS("IsUpper"); |
|
939 Token.registerNonXS("IsLower"); |
|
940 Token.registerNonXS("IsCntrl"); |
|
941 Token.registerNonXS("IsPrint"); |
|
942 Token.registerNonXS("IsPunct"); |
|
943 |
|
944 Token.setAlias("alpha", "IsAlpha", true); |
|
945 Token.setAlias("alnum", "IsAlnum", true); |
|
946 Token.setAlias("ascii", "IsASCII", true); |
|
947 Token.setAlias("cntrl", "IsCntrl", true); |
|
948 Token.setAlias("digit", "IsDigit", true); |
|
949 Token.setAlias("graph", "IsGraph", true); |
|
950 Token.setAlias("lower", "IsLower", true); |
|
951 Token.setAlias("print", "IsPrint", true); |
|
952 Token.setAlias("punct", "IsPunct", true); |
|
953 Token.setAlias("space", "IsSpace", true); |
|
954 Token.setAlias("upper", "IsUpper", true); |
|
955 Token.setAlias("word", "IsWord", true); // Perl extension |
|
956 Token.setAlias("xdigit", "IsXDigit", true); |
|
957 Token.registerNonXS("alpha"); |
|
958 Token.registerNonXS("alnum"); |
|
959 Token.registerNonXS("ascii"); |
|
960 Token.registerNonXS("cntrl"); |
|
961 Token.registerNonXS("digit"); |
|
962 Token.registerNonXS("graph"); |
|
963 Token.registerNonXS("lower"); |
|
964 Token.registerNonXS("print"); |
|
965 Token.registerNonXS("punct"); |
|
966 Token.registerNonXS("space"); |
|
967 Token.registerNonXS("upper"); |
|
968 Token.registerNonXS("word"); |
|
969 Token.registerNonXS("xdigit"); |
|
970 } // synchronized |
981 } // synchronized |
971 } // if null |
982 } // if null |
972 RangeToken tok = positive ? (RangeToken)Token.categories.get(name) |
983 return positive ? (RangeToken)localCat.get(name) |
973 : (RangeToken)Token.categories2.get(name); |
984 : (RangeToken)Token.categories2.get(name); |
974 //if (tok == null) System.out.println(name); |
|
975 return tok; |
|
976 } |
985 } |
977 static protected RangeToken getRange(String name, boolean positive, boolean xs) { |
986 static protected RangeToken getRange(String name, boolean positive, boolean xs) { |
978 RangeToken range = Token.getRange(name, positive); |
987 RangeToken range = Token.getRange(name, positive); |
979 if (xs && range != null && Token.isRegisterNonXS(name)) |
988 if (xs && range != null && Token.isRegisterNonXS(name)) |
980 range = null; |
989 range = null; |