jdk/src/java.base/share/classes/java/util/regex/Pattern.java
changeset 37882 e7f3cf12e739
parent 37880 60ec48925dc6
child 38777 826eb7091523
equal deleted inserted replaced
37881:fda31effa7b3 37882:e7f3cf12e739
     1 /*
     1 /*
     2  * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
     2  * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4  *
     4  *
     5  * This code is free software; you can redistribute it and/or modify it
     5  * This code is free software; you can redistribute it and/or modify it
     6  * under the terms of the GNU General Public License version 2 only, as
     6  * under the terms of the GNU General Public License version 2 only, as
     7  * published by the Free Software Foundation.  Oracle designates this
     7  * published by the Free Software Foundation.  Oracle designates this
    24  */
    24  */
    25 
    25 
    26 package java.util.regex;
    26 package java.util.regex;
    27 
    27 
    28 import java.text.Normalizer;
    28 import java.text.Normalizer;
       
    29 import java.text.Normalizer.Form;
    29 import java.util.Locale;
    30 import java.util.Locale;
    30 import java.util.Iterator;
    31 import java.util.Iterator;
    31 import java.util.Map;
    32 import java.util.Map;
    32 import java.util.ArrayList;
    33 import java.util.ArrayList;
    33 import java.util.HashMap;
    34 import java.util.HashMap;
       
    35 import java.util.LinkedHashSet;
       
    36 import java.util.List;
       
    37 import java.util.Set;
    34 import java.util.Arrays;
    38 import java.util.Arrays;
    35 import java.util.NoSuchElementException;
    39 import java.util.NoSuchElementException;
    36 import java.util.Spliterator;
    40 import java.util.Spliterator;
    37 import java.util.Spliterators;
    41 import java.util.Spliterators;
    38 import java.util.function.Predicate;
    42 import java.util.function.Predicate;
   982      * Temporary storage used by parsing pattern slice.
   986      * Temporary storage used by parsing pattern slice.
   983      */
   987      */
   984     transient int[] buffer;
   988     transient int[] buffer;
   985 
   989 
   986     /**
   990     /**
       
   991      * A temporary storage used for predicate for double return.
       
   992      */
       
   993     transient CharPredicate predicate;
       
   994 
       
   995     /**
   987      * Map the "name" of the "named capturing group" to its group id
   996      * Map the "name" of the "named capturing group" to its group id
   988      * node.
   997      * node.
   989      */
   998      */
   990     transient volatile Map<String, Integer> namedGroups;
   999     transient volatile Map<String, Integer> namedGroups;
   991 
  1000 
   992     /**
  1001     /**
   993      * Temporary storage used while parsing group references.
  1002      * Temporary storage used while parsing group references.
   994      */
  1003      */
   995     transient GroupHead[] groupNodes;
  1004     transient GroupHead[] groupNodes;
       
  1005 
       
  1006     /**
       
  1007      * Temporary storage used to store the top level closure nodes.
       
  1008      */
       
  1009     transient List<Node> topClosureNodes;
       
  1010 
       
  1011     /**
       
  1012      * The number of top greedy closure nodes in this Pattern. Used by
       
  1013      * matchers to allocate storage needed for a IntHashSet to keep the
       
  1014      * beginning pos {@code i} of all failed match.
       
  1015      */
       
  1016     transient int localTCNCount;
       
  1017 
       
  1018     /*
       
  1019      * Turn off the stop-exponential-backtracking optimization if there
       
  1020      * is a group ref in the pattern.
       
  1021      */
       
  1022     transient boolean hasGroupRef;
   996 
  1023 
   997     /**
  1024     /**
   998      * Temporary null terminated code point array used by pattern compiling.
  1025      * Temporary null terminated code point array used by pattern compiling.
   999      */
  1026      */
  1000     private transient int[] temp;
  1027     private transient int[] temp;
  1024 
  1051 
  1025     /**
  1052     /**
  1026      * If the Start node might possibly match supplementary characters.
  1053      * If the Start node might possibly match supplementary characters.
  1027      * It is set to true during compiling if
  1054      * It is set to true during compiling if
  1028      * (1) There is supplementary char in pattern, or
  1055      * (1) There is supplementary char in pattern, or
  1029      * (2) There is complement node of Category or Block
  1056      * (2) There is complement node of a "family" CharProperty
  1030      */
  1057      */
  1031     private transient boolean hasSupplementary;
  1058     private transient boolean hasSupplementary;
  1032 
  1059 
  1033     /**
  1060     /**
  1034      * Compiles the given regular expression into a pattern.
  1061      * Compiles the given regular expression into a pattern.
  1336         s.defaultReadObject();
  1363         s.defaultReadObject();
  1337 
  1364 
  1338         // Initialize counts
  1365         // Initialize counts
  1339         capturingGroupCount = 1;
  1366         capturingGroupCount = 1;
  1340         localCount = 0;
  1367         localCount = 0;
       
  1368         localTCNCount = 0;
  1341 
  1369 
  1342         // if length > 0, the Pattern is lazily compiled
  1370         // if length > 0, the Pattern is lazily compiled
  1343         if (pattern.length() == 0) {
  1371         if (pattern.length() == 0) {
  1344             root = new Start(lastAccept);
  1372             root = new Start(lastAccept);
  1345             matchRoot = lastAccept;
  1373             matchRoot = lastAccept;
  1366             flags |= UNICODE_CASE;
  1394             flags |= UNICODE_CASE;
  1367 
  1395 
  1368         // Reset group index count
  1396         // Reset group index count
  1369         capturingGroupCount = 1;
  1397         capturingGroupCount = 1;
  1370         localCount = 0;
  1398         localCount = 0;
       
  1399         localTCNCount = 0;
  1371 
  1400 
  1372         if (pattern.length() > 0) {
  1401         if (pattern.length() > 0) {
  1373             compile();
  1402             compile();
  1374         } else {
  1403         } else {
  1375             root = new Start(lastAccept);
  1404             root = new Start(lastAccept);
  1376             matchRoot = lastAccept;
  1405             matchRoot = lastAccept;
  1377         }
  1406         }
  1378     }
  1407     }
  1379 
  1408 
  1380     /**
  1409     /**
  1381      * The pattern is converted to normalized form ({@linkplain
  1410      * The pattern is converted to normalized form ({@link
  1382      * java.text.Normalizer.Form.NFD NFD}, canonical decomposition)
  1411      * java.text.Normalizer.Form.NFC NFC}, canonical decomposition,
  1383      * and then a pure group is constructed to match canonical
  1412      * followed by canonical composition for the character class
  1384      * equivalences of the characters.
  1413      * part, and {@link java.text.Normalizer.Form.NFD NFD},
  1385      */
  1414      * canonical decomposition) for the rest), and then a pure
  1386     private void normalize() {
  1415      * group is constructed to match canonical equivalences of the
  1387         int lastCodePoint = -1;
  1416      * characters.
  1388 
  1417      */
  1389         // Convert pattern into normalized form
  1418     private static String normalize(String pattern) {
  1390         normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
  1419         int plen = pattern.length();
  1391         patternLength = normalizedPattern.length();
  1420         StringBuilder pbuf = new StringBuilder(plen);
  1392 
  1421         char last = 0;
  1393         // Modify pattern to match canonical equivalences
  1422         int lastStart = 0;
  1394         StringBuilder newPattern = new StringBuilder(patternLength);
  1423         char cc = 0;
  1395         for(int i=0; i<patternLength; ) {
  1424         for (int i = 0; i < plen;) {
  1396             int c = normalizedPattern.codePointAt(i);
  1425             char c = pattern.charAt(i);
  1397             StringBuilder sequenceBuffer;
  1426             if (cc == 0 &&    // top level
  1398             if ((Character.getType(c) == Character.NON_SPACING_MARK)
  1427                 c == '\\' && i + 1 < plen && pattern.charAt(i + 1) == '\\') {
  1399                 && (lastCodePoint != -1)) {
  1428                 i += 2; last = 0;
  1400                 sequenceBuffer = new StringBuilder();
  1429                 continue;
  1401                 sequenceBuffer.appendCodePoint(lastCodePoint);
  1430             }
  1402                 sequenceBuffer.appendCodePoint(c);
  1431             if (c == '[' && last != '\\') {
  1403                 while(Character.getType(c) == Character.NON_SPACING_MARK) {
  1432                 if (cc == 0) {
  1404                     i += Character.charCount(c);
  1433                     if (lastStart < i)
  1405                     if (i >= patternLength)
  1434                         normalizeSlice(pattern, lastStart, i, pbuf);
  1406                         break;
  1435                     lastStart = i;
  1407                     c = normalizedPattern.codePointAt(i);
  1436                 }
  1408                     sequenceBuffer.appendCodePoint(c);
  1437                 cc++;
  1409                 }
  1438             } else if (c == ']' && last != '\\') {
  1410                 String ea = produceEquivalentAlternation(
  1439                 cc--;
  1411                                                sequenceBuffer.toString());
  1440                 if (cc == 0) {
  1412                 newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
  1441                     normalizeClazz(pattern, lastStart, i + 1, pbuf);
  1413                 newPattern.append("(?:").append(ea).append(")");
  1442                     lastStart = i + 1;
  1414             } else if (c == '[' && lastCodePoint != '\\') {
  1443                 }
  1415                 i = normalizeCharClass(newPattern, i);
  1444             }
  1416             } else {
  1445             last = c;
  1417                 newPattern.appendCodePoint(c);
  1446             i++;
  1418             }
  1447         }
  1419             lastCodePoint = c;
  1448         assert (cc == 0);
  1420             i += Character.charCount(c);
  1449         if (lastStart < plen)
  1421         }
  1450             normalizeSlice(pattern, lastStart, plen, pbuf);
  1422         normalizedPattern = newPattern.toString();
  1451         return pbuf.toString();
  1423     }
  1452     }
  1424 
  1453 
  1425     /**
  1454     private static void normalizeSlice(String src, int off, int limit,
  1426      * Complete the character class being parsed and add a set
  1455                                        StringBuilder dst)
  1427      * of alternations to it that will match the canonical equivalences
  1456     {
  1428      * of the characters within the class.
  1457         int len = src.length();
  1429      */
  1458         int off0 = off;
  1430     private int normalizeCharClass(StringBuilder newPattern, int i) {
  1459         while (off < limit && ASCII.isAscii(src.charAt(off))) {
  1431         StringBuilder charClass = new StringBuilder();
  1460             off++;
  1432         StringBuilder eq = null;
  1461         }
  1433         int lastCodePoint = -1;
  1462         if (off == limit) {
  1434         String result;
  1463             dst.append(src, off0, limit);
  1435 
  1464             return;
  1436         i++;
  1465         }
  1437         charClass.append("[");
  1466         off--;
  1438         while(true) {
  1467         if (off < off0)
  1439             int c = normalizedPattern.codePointAt(i);
  1468             off = off0;
  1440             StringBuilder sequenceBuffer;
  1469         else
  1441 
  1470             dst.append(src, off0, off);
  1442             if (c == ']' && lastCodePoint != '\\') {
  1471         while (off < limit) {
  1443                 charClass.append((char)c);
  1472             int ch0 = src.codePointAt(off);
  1444                 break;
  1473             if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) {
  1445             } else if (Character.getType(c) == Character.NON_SPACING_MARK) {
  1474                 dst.append((char)ch0);
  1446                 sequenceBuffer = new StringBuilder();
  1475                 off++;
  1447                 sequenceBuffer.appendCodePoint(lastCodePoint);
  1476                 continue;
  1448                 while(Character.getType(c) == Character.NON_SPACING_MARK) {
  1477             }
  1449                     sequenceBuffer.appendCodePoint(c);
  1478             int j = off + Character.charCount(ch0);
  1450                     i += Character.charCount(c);
  1479             int ch1;
  1451                     if (i >= normalizedPattern.length())
  1480             while (j < limit) {
  1452                         break;
  1481                 ch1 = src.codePointAt(j);
  1453                     c = normalizedPattern.codePointAt(i);
  1482                 if (Grapheme.isBoundary(ch0, ch1))
  1454                 }
  1483                     break;
  1455                 String ea = produceEquivalentAlternation(
  1484                 ch0 = ch1;
  1456                                                   sequenceBuffer.toString());
  1485                 j += Character.charCount(ch1);
  1457 
  1486             }
  1458                 charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
  1487             String seq = src.substring(off, j);
  1459                 if (eq == null)
  1488             String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD);
  1460                     eq = new StringBuilder();
  1489             off = j;
  1461                 eq.append('|');
  1490             if (nfd.length() > 1) {
  1462                 eq.append(ea);
  1491                 ch0 = nfd.codePointAt(0);
  1463             } else {
  1492                 ch1 = nfd.codePointAt(Character.charCount(ch0));
  1464                 charClass.appendCodePoint(c);
  1493                 if (Character.getType(ch1) == Character.NON_SPACING_MARK) {
  1465                 i++;
  1494                     Set<String> altns = new LinkedHashSet<>();
  1466             }
  1495                     altns.add(seq);
  1467             if (i == normalizedPattern.length())
  1496                     produceEquivalentAlternation(nfd, altns);
  1468                 throw error("Unclosed character class");
  1497                     dst.append("(?:");
  1469             lastCodePoint = c;
  1498                     altns.forEach( s -> dst.append(s + "|"));
  1470         }
  1499                     dst.delete(dst.length() - 1, dst.length());
  1471 
  1500                     dst.append(")");
  1472         if (eq != null) {
  1501                     continue;
  1473             result = "(?:"+charClass.toString()+eq.toString()+")";
  1502                 }
  1474         } else {
  1503             }
  1475             result = charClass.toString();
  1504             String nfc = Normalizer.normalize(seq, Normalizer.Form.NFC);
  1476         }
  1505             if (!seq.equals(nfc) && !nfd.equals(nfc))
  1477 
  1506                 dst.append("(?:" + seq + "|" + nfd  + "|" + nfc + ")");
  1478         newPattern.append(result);
  1507             else if (!seq.equals(nfd))
  1479         return i;
  1508                 dst.append("(?:" + seq + "|" + nfd + ")");
       
  1509             else
       
  1510                 dst.append(seq);
       
  1511         }
       
  1512     }
       
  1513 
       
  1514     private static void normalizeClazz(String src, int off, int limit,
       
  1515                                        StringBuilder dst)
       
  1516     {
       
  1517         dst.append(Normalizer.normalize(src.substring(off, limit), Form.NFC));
  1480     }
  1518     }
  1481 
  1519 
  1482     /**
  1520     /**
  1483      * Given a specific sequence composed of a regular character and
  1521      * Given a specific sequence composed of a regular character and
  1484      * combining marks that follow it, produce the alternation that will
  1522      * combining marks that follow it, produce the alternation that will
  1485      * match all canonical equivalences of that sequence.
  1523      * match all canonical equivalences of that sequence.
  1486      */
  1524      */
  1487     private String produceEquivalentAlternation(String source) {
  1525     private static void produceEquivalentAlternation(String src,
  1488         int len = countChars(source, 0, 1);
  1526                                                      Set<String> dst)
  1489         if (source.length() == len)
  1527     {
  1490             // source has one character.
  1528         int len = countChars(src, 0, 1);
  1491             return source;
  1529         if (src.length() == len) {
  1492 
  1530             dst.add(src);  // source has one character.
  1493         String base = source.substring(0,len);
  1531             return;
  1494         String combiningMarks = source.substring(len);
  1532         }
  1495 
  1533         String base = src.substring(0,len);
       
  1534         String combiningMarks = src.substring(len);
  1496         String[] perms = producePermutations(combiningMarks);
  1535         String[] perms = producePermutations(combiningMarks);
  1497         StringBuilder result = new StringBuilder(source);
       
  1498 
       
  1499         // Add combined permutations
  1536         // Add combined permutations
  1500         for(int x=0; x<perms.length; x++) {
  1537         for(int x = 0; x < perms.length; x++) {
  1501             String next = base + perms[x];
  1538             String next = base + perms[x];
  1502             if (x>0)
  1539             dst.add(next);
  1503                 result.append("|"+next);
       
  1504             next = composeOneStep(next);
  1540             next = composeOneStep(next);
  1505             if (next != null)
  1541             if (next != null) {
  1506                 result.append("|"+produceEquivalentAlternation(next));
  1542                 produceEquivalentAlternation(next, dst);
  1507         }
  1543             }
  1508         return result.toString();
  1544         }
  1509     }
  1545     }
  1510 
  1546 
  1511     /**
  1547     /**
  1512      * Returns an array of strings that have all the possible
  1548      * Returns an array of strings that have all the possible
  1513      * permutations of the characters in the input string.
  1549      * permutations of the characters in the input string.
  1515      * of a set of combining marks. Note that some of the permutations
  1551      * of a set of combining marks. Note that some of the permutations
  1516      * are invalid because of combining class collisions, and these
  1552      * are invalid because of combining class collisions, and these
  1517      * possibilities must be removed because they are not canonically
  1553      * possibilities must be removed because they are not canonically
  1518      * equivalent.
  1554      * equivalent.
  1519      */
  1555      */
  1520     private String[] producePermutations(String input) {
  1556     private static String[] producePermutations(String input) {
  1521         if (input.length() == countChars(input, 0, 1))
  1557         if (input.length() == countChars(input, 0, 1))
  1522             return new String[] {input};
  1558             return new String[] {input};
  1523 
  1559 
  1524         if (input.length() == countChars(input, 0, 2)) {
  1560         if (input.length() == countChars(input, 0, 2)) {
  1525             int c0 = Character.codePointAt(input, 0);
  1561             int c0 = Character.codePointAt(input, 0);
  1573         String[] result = new String[index];
  1609         String[] result = new String[index];
  1574         System.arraycopy(temp, 0, result, 0, index);
  1610         System.arraycopy(temp, 0, result, 0, index);
  1575         return result;
  1611         return result;
  1576     }
  1612     }
  1577 
  1613 
  1578     private int getClass(int c) {
  1614     private static int getClass(int c) {
  1579         return sun.text.Normalizer.getCombiningClass(c);
  1615         return sun.text.Normalizer.getCombiningClass(c);
  1580     }
  1616     }
  1581 
  1617 
  1582     /**
  1618     /**
  1583      * Attempts to compose input by combining the first character
  1619      * Attempts to compose input by combining the first character
  1584      * with the first combining mark following it. Returns a String
  1620      * with the first combining mark following it. Returns a String
  1585      * that is the composition of the leading character with its first
  1621      * that is the composition of the leading character with its first
  1586      * combining mark followed by the remaining combining marks. Returns
  1622      * combining mark followed by the remaining combining marks. Returns
  1587      * null if the first two characters cannot be further composed.
  1623      * null if the first two characters cannot be further composed.
  1588      */
  1624      */
  1589     private String composeOneStep(String input) {
  1625     private static String composeOneStep(String input) {
  1590         int len = countChars(input, 0, 2);
  1626         int len = countChars(input, 0, 2);
  1591         String firstTwoCharacters = input.substring(0, len);
  1627         String firstTwoCharacters = input.substring(0, len);
  1592         String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
  1628         String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
  1593 
       
  1594         if (result.equals(firstTwoCharacters))
  1629         if (result.equals(firstTwoCharacters))
  1595             return null;
  1630             return null;
  1596         else {
  1631         else {
  1597             String remainder = input.substring(len);
  1632             String remainder = input.substring(len);
  1598             return result + remainder;
  1633             return result + remainder;
  1675      * of the expression which will create the object tree.
  1710      * of the expression which will create the object tree.
  1676      */
  1711      */
  1677     private void compile() {
  1712     private void compile() {
  1678         // Handle canonical equivalences
  1713         // Handle canonical equivalences
  1679         if (has(CANON_EQ) && !has(LITERAL)) {
  1714         if (has(CANON_EQ) && !has(LITERAL)) {
  1680             normalize();
  1715             normalizedPattern = normalize(pattern);
  1681         } else {
  1716         } else {
  1682             normalizedPattern = pattern;
  1717             normalizedPattern = pattern;
  1683         }
  1718         }
  1684         patternLength = normalizedPattern.length();
  1719         patternLength = normalizedPattern.length();
  1685 
  1720 
  1705 
  1740 
  1706         // Allocate all temporary objects here.
  1741         // Allocate all temporary objects here.
  1707         buffer = new int[32];
  1742         buffer = new int[32];
  1708         groupNodes = new GroupHead[10];
  1743         groupNodes = new GroupHead[10];
  1709         namedGroups = null;
  1744         namedGroups = null;
       
  1745         topClosureNodes = new ArrayList<>(10);
  1710 
  1746 
  1711         if (has(LITERAL)) {
  1747         if (has(LITERAL)) {
  1712             // Literal pattern handling
  1748             // Literal pattern handling
  1713             matchRoot = newSlice(temp, patternLength, hasSupplementary);
  1749             matchRoot = newSlice(temp, patternLength, hasSupplementary);
  1714             matchRoot.next = lastAccept;
  1750             matchRoot.next = lastAccept;
  1735             root = matchRoot;
  1771             root = matchRoot;
  1736         } else {
  1772         } else {
  1737             root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
  1773             root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
  1738         }
  1774         }
  1739 
  1775 
       
  1776         // Optimize the greedy Loop to prevent exponential backtracking, IF there
       
  1777         // is no group ref in this pattern. With a non-negative localTCNCount value,
       
  1778         // the greedy type Loop, Curly will skip the backtracking for any starting
       
  1779         // position "i" that failed in the past.
       
  1780         if (!hasGroupRef) {
       
  1781             for (Node node : topClosureNodes) {
       
  1782                 if (node instanceof Loop) {
       
  1783                     // non-deterministic-greedy-group
       
  1784                     ((Loop)node).posIndex = localTCNCount++;
       
  1785                 }
       
  1786             }
       
  1787         }
       
  1788 
  1740         // Release temporary storage
  1789         // Release temporary storage
  1741         temp = null;
  1790         temp = null;
  1742         buffer = null;
  1791         buffer = null;
  1743         groupNodes = null;
  1792         groupNodes = null;
  1744         patternLength = 0;
  1793         patternLength = 0;
  1745         compiled = true;
  1794         compiled = true;
       
  1795         topClosureNodes = null;
  1746     }
  1796     }
  1747 
  1797 
  1748     Map<String, Integer> namedGroups() {
  1798     Map<String, Integer> namedGroups() {
  1749         Map<String, Integer> groups = namedGroups;
  1799         Map<String, Integer> groups = namedGroups;
  1750         if (groups == null) {
  1800         if (groups == null) {
  1751             namedGroups = groups = new HashMap<>(2);
  1801             namedGroups = groups = new HashMap<>(2);
  1752         }
  1802         }
  1753         return groups;
  1803         return groups;
  1754     }
       
  1755 
       
  1756     /**
       
  1757      * Used to print out a subtree of the Pattern to help with debugging.
       
  1758      */
       
  1759     private static void printObjectTree(Node node) {
       
  1760         while(node != null) {
       
  1761             if (node instanceof Prolog) {
       
  1762                 System.out.println(node);
       
  1763                 printObjectTree(((Prolog)node).loop);
       
  1764                 System.out.println("**** end contents prolog loop");
       
  1765             } else if (node instanceof Loop) {
       
  1766                 System.out.println(node);
       
  1767                 printObjectTree(((Loop)node).body);
       
  1768                 System.out.println("**** end contents Loop body");
       
  1769             } else if (node instanceof Curly) {
       
  1770                 System.out.println(node);
       
  1771                 printObjectTree(((Curly)node).atom);
       
  1772                 System.out.println("**** end contents Curly body");
       
  1773             } else if (node instanceof GroupCurly) {
       
  1774                 System.out.println(node);
       
  1775                 printObjectTree(((GroupCurly)node).atom);
       
  1776                 System.out.println("**** end contents GroupCurly body");
       
  1777             } else if (node instanceof GroupTail) {
       
  1778                 System.out.println(node);
       
  1779                 System.out.println("Tail next is "+node.next);
       
  1780                 return;
       
  1781             } else {
       
  1782                 System.out.println(node);
       
  1783             }
       
  1784             node = node.next;
       
  1785             if (node != null)
       
  1786                 System.out.println("->next:");
       
  1787             if (node == Pattern.accept) {
       
  1788                 System.out.println("Accept Node");
       
  1789                 node = null;
       
  1790             }
       
  1791        }
       
  1792     }
  1804     }
  1793 
  1805 
  1794     /**
  1806     /**
  1795      * Used to accumulate information about a subtree of the object graph
  1807      * Used to accumulate information about a subtree of the object graph
  1796      * so that optimizations can be applied to the subtree.
  1808      * so that optimizations can be applied to the subtree.
  2081                     tail.next = node;
  2093                     tail.next = node;
  2082                 // Double return: Tail was returned in root
  2094                 // Double return: Tail was returned in root
  2083                 tail = root;
  2095                 tail = root;
  2084                 continue;
  2096                 continue;
  2085             case '[':
  2097             case '[':
  2086                 node = clazz(true);
  2098                 if (has(CANON_EQ) && !has(LITERAL))
       
  2099                     node = new NFCCharProperty(clazz(true));
       
  2100                 else
       
  2101                     node = newCharProperty(clazz(true));
  2087                 break;
  2102                 break;
  2088             case '\\':
  2103             case '\\':
  2089                 ch = nextEscaped();
  2104                 ch = nextEscaped();
  2090                 if (ch == 'p' || ch == 'P') {
  2105                 if (ch == 'p' || ch == 'P') {
  2091                     boolean oneLetter = true;
  2106                     boolean oneLetter = true;
  2094                     if (ch != '{') {
  2109                     if (ch != '{') {
  2095                         unread();
  2110                         unread();
  2096                     } else {
  2111                     } else {
  2097                         oneLetter = false;
  2112                         oneLetter = false;
  2098                     }
  2113                     }
  2099                     node = family(oneLetter, comp);
  2114                     // node = newCharProperty(family(oneLetter, comp));
       
  2115                     if (has(CANON_EQ) && !has(LITERAL))
       
  2116                         node = new NFCCharProperty(family(oneLetter, comp));
       
  2117                     else
       
  2118                         node = newCharProperty(family(oneLetter, comp));
  2100                 } else {
  2119                 } else {
  2101                     unread();
  2120                     unread();
  2102                     node = atom();
  2121                     node = atom();
  2103                 }
  2122                 }
  2104                 break;
  2123                 break;
  2121                     node = new Dollar(has(MULTILINE));
  2140                     node = new Dollar(has(MULTILINE));
  2122                 break;
  2141                 break;
  2123             case '.':
  2142             case '.':
  2124                 next();
  2143                 next();
  2125                 if (has(DOTALL)) {
  2144                 if (has(DOTALL)) {
  2126                     node = new All();
  2145                     node = new CharProperty(ALL);
  2127                 } else {
  2146                 } else {
  2128                     if (has(UNIX_LINES))
  2147                     if (has(UNIX_LINES)) {
  2129                         node = new UnixDot();
  2148                         node = new CharProperty(UNIXDOT);
  2130                     else {
  2149                     } else {
  2131                         node = new Dot();
  2150                         node = new CharProperty(DOT);
  2132                     }
  2151                     }
  2133                 }
  2152                 }
  2134                 break;
  2153                 break;
  2135             case '|':
  2154             case '|':
  2136             case ')':
  2155             case ')':
  2153                 node = atom();
  2172                 node = atom();
  2154                 break;
  2173                 break;
  2155             }
  2174             }
  2156 
  2175 
  2157             node = closure(node);
  2176             node = closure(node);
  2158 
  2177             /* save the top dot-greedy nodes (.*, .+) as well
       
  2178             if (node instanceof GreedyCharProperty &&
       
  2179                 ((GreedyCharProperty)node).cp instanceof Dot) {
       
  2180                 topClosureNodes.add(node);
       
  2181             }
       
  2182             */
  2159             if (head == null) {
  2183             if (head == null) {
  2160                 head = tail = node;
  2184                 head = tail = node;
  2161             } else {
  2185             } else {
  2162                 tail.next = node;
  2186                 tail.next = node;
  2163                 tail = node;
  2187                 tail = node;
  2211                         ch = next(); // Consume { if present
  2235                         ch = next(); // Consume { if present
  2212                         if (ch != '{')
  2236                         if (ch != '{')
  2213                             unread();
  2237                             unread();
  2214                         else
  2238                         else
  2215                             oneLetter = false;
  2239                             oneLetter = false;
  2216                         return family(oneLetter, comp);
  2240                         if (has(CANON_EQ) && !has(LITERAL))
       
  2241                             return new NFCCharProperty(family(oneLetter, comp));
       
  2242                         else
       
  2243                             return newCharProperty(family(oneLetter, comp));
  2217                     }
  2244                     }
  2218                 }
  2245                 }
  2219                 unread();
  2246                 unread();
  2220                 prev = cursor;
  2247                 prev = cursor;
  2221                 ch = escape(false, first == 0, false);
  2248                 ch = escape(false, first == 0, false);
  2249                 continue;
  2276                 continue;
  2250             }
  2277             }
  2251             break;
  2278             break;
  2252         }
  2279         }
  2253         if (first == 1) {
  2280         if (first == 1) {
  2254             return newSingle(buffer[0]);
  2281             return newCharProperty(single(buffer[0]));
  2255         } else {
  2282         } else {
  2256             return newSlice(buffer, first, hasSupplementary);
  2283             return newSlice(buffer, first, hasSupplementary);
  2257         }
  2284         }
  2258     }
  2285     }
  2259 
  2286 
  2300             default:
  2327             default:
  2301                 done = true;
  2328                 done = true;
  2302                 break;
  2329                 break;
  2303             }
  2330             }
  2304         }
  2331         }
       
  2332         hasGroupRef = true;
  2305         if (has(CASE_INSENSITIVE))
  2333         if (has(CASE_INSENSITIVE))
  2306             return new CIBackRef(refNum, has(UNICODE_CASE));
  2334             return new CIBackRef(refNum, has(UNICODE_CASE));
  2307         else
  2335         else
  2308             return new BackRef(refNum);
  2336             return new BackRef(refNum);
  2309     }
  2337     }
  2344             if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
  2372             if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
  2345             return -1;
  2373             return -1;
  2346         case 'C':
  2374         case 'C':
  2347             break;
  2375             break;
  2348         case 'D':
  2376         case 'D':
  2349             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2377             if (create) {
  2350                                ? new Utype(UnicodeProp.DIGIT).complement()
  2378                 predicate = has(UNICODE_CHARACTER_CLASS) ?
  2351                                : new Ctype(ASCII.DIGIT).complement();
  2379                             CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
       
  2380                 predicate = predicate.negate();
       
  2381                 if (!inclass)
       
  2382                     root = newCharProperty(predicate);
       
  2383             }
  2352             return -1;
  2384             return -1;
  2353         case 'E':
  2385         case 'E':
  2354         case 'F':
  2386         case 'F':
  2355             break;
  2387             break;
  2356         case 'G':
  2388         case 'G':
  2357             if (inclass) break;
  2389             if (inclass) break;
  2358             if (create) root = new LastMatch();
  2390             if (create) root = new LastMatch();
  2359             return -1;
  2391             return -1;
  2360         case 'H':
  2392         case 'H':
  2361             if (create) root = new HorizWS().complement();
  2393             if (create) {
       
  2394                 predicate = HorizWS.negate();
       
  2395                 if (!inclass)
       
  2396                     root = newCharProperty(predicate);
       
  2397             }
  2362             return -1;
  2398             return -1;
  2363         case 'I':
  2399         case 'I':
  2364         case 'J':
  2400         case 'J':
  2365         case 'K':
  2401         case 'K':
  2366         case 'L':
  2402         case 'L':
  2375         case 'R':
  2411         case 'R':
  2376             if (inclass) break;
  2412             if (inclass) break;
  2377             if (create) root = new LineEnding();
  2413             if (create) root = new LineEnding();
  2378             return -1;
  2414             return -1;
  2379         case 'S':
  2415         case 'S':
  2380             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2416             if (create) {
  2381                                ? new Utype(UnicodeProp.WHITE_SPACE).complement()
  2417                 predicate = has(UNICODE_CHARACTER_CLASS) ?
  2382                                : new Ctype(ASCII.SPACE).complement();
  2418                             CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
       
  2419                 predicate = predicate.negate();
       
  2420                 if (!inclass)
       
  2421                     root = newCharProperty(predicate);
       
  2422             }
  2383             return -1;
  2423             return -1;
  2384         case 'T':
  2424         case 'T':
  2385         case 'U':
  2425         case 'U':
  2386             break;
  2426             break;
  2387         case 'V':
  2427         case 'V':
  2388             if (create) root = new VertWS().complement();
  2428             if (create) {
       
  2429                 predicate = VertWS.negate();
       
  2430                 if (!inclass)
       
  2431                     root = newCharProperty(predicate);
       
  2432             }
  2389             return -1;
  2433             return -1;
  2390         case 'W':
  2434         case 'W':
  2391             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2435             if (create) {
  2392                                ? new Utype(UnicodeProp.WORD).complement()
  2436                 predicate = has(UNICODE_CHARACTER_CLASS) ?
  2393                                : new Ctype(ASCII.WORD).complement();
  2437                             CharPredicates.WORD : CharPredicates.ASCII_WORD;
       
  2438                 predicate = predicate.negate();
       
  2439                 if (!inclass)
       
  2440                     root = newCharProperty(predicate);
       
  2441             }
  2394             return -1;
  2442             return -1;
  2395         case 'X':
  2443         case 'X':
  2396             if (inclass) break;
  2444             if (inclass) break;
  2397             if (create) {
  2445             if (create) {
  2398                 root = new XGrapheme();
  2446                 root = new XGrapheme();
  2428             }
  2476             }
  2429             return -1;
  2477             return -1;
  2430         case 'c':
  2478         case 'c':
  2431             return c();
  2479             return c();
  2432         case 'd':
  2480         case 'd':
  2433             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2481             if (create) {
  2434                                ? new Utype(UnicodeProp.DIGIT)
  2482                 predicate = has(UNICODE_CHARACTER_CLASS) ?
  2435                                : new Ctype(ASCII.DIGIT);
  2483                             CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
       
  2484                 if (!inclass)
       
  2485                     root = newCharProperty(predicate);
       
  2486             }
  2436             return -1;
  2487             return -1;
  2437         case 'e':
  2488         case 'e':
  2438             return '\033';
  2489             return '\033';
  2439         case 'f':
  2490         case 'f':
  2440             return '\f';
  2491             return '\f';
  2441         case 'g':
  2492         case 'g':
  2442             break;
  2493             break;
  2443         case 'h':
  2494         case 'h':
  2444             if (create) root = new HorizWS();
  2495             if (create) {
       
  2496                 predicate = HorizWS;
       
  2497                 if (!inclass)
       
  2498                     root = newCharProperty(predicate);
       
  2499             }
  2445             return -1;
  2500             return -1;
  2446         case 'i':
  2501         case 'i':
  2447         case 'j':
  2502         case 'j':
  2448             break;
  2503             break;
  2449         case 'k':
  2504         case 'k':
  2453                 throw error("\\k is not followed by '<' for named capturing group");
  2508                 throw error("\\k is not followed by '<' for named capturing group");
  2454             String name = groupname(read());
  2509             String name = groupname(read());
  2455             if (!namedGroups().containsKey(name))
  2510             if (!namedGroups().containsKey(name))
  2456                 throw error("(named capturing group <"+ name+"> does not exit");
  2511                 throw error("(named capturing group <"+ name+"> does not exit");
  2457             if (create) {
  2512             if (create) {
       
  2513                 hasGroupRef = true;
  2458                 if (has(CASE_INSENSITIVE))
  2514                 if (has(CASE_INSENSITIVE))
  2459                     root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
  2515                     root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
  2460                 else
  2516                 else
  2461                     root = new BackRef(namedGroups().get(name));
  2517                     root = new BackRef(namedGroups().get(name));
  2462             }
  2518             }
  2471         case 'q':
  2527         case 'q':
  2472             break;
  2528             break;
  2473         case 'r':
  2529         case 'r':
  2474             return '\r';
  2530             return '\r';
  2475         case 's':
  2531         case 's':
  2476             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2532             if (create) {
  2477                                ? new Utype(UnicodeProp.WHITE_SPACE)
  2533                 predicate = has(UNICODE_CHARACTER_CLASS) ?
  2478                                : new Ctype(ASCII.SPACE);
  2534                             CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
       
  2535                 if (!inclass)
       
  2536                     root = newCharProperty(predicate);
       
  2537             }
  2479             return -1;
  2538             return -1;
  2480         case 't':
  2539         case 't':
  2481             return '\t';
  2540             return '\t';
  2482         case 'u':
  2541         case 'u':
  2483             return u();
  2542             return u();
  2490             // the start or end value, such as [\v-...] or [...-\v], in
  2549             // the start or end value, such as [\v-...] or [...-\v], in
  2491             // which a single definite value (0x0B) is expected. For
  2550             // which a single definite value (0x0B) is expected. For
  2492             // compatibility concern '\013'/0x0B is returned if isrange.
  2551             // compatibility concern '\013'/0x0B is returned if isrange.
  2493             if (isrange)
  2552             if (isrange)
  2494                 return '\013';
  2553                 return '\013';
  2495             if (create) root = new VertWS();
  2554             if (create) {
       
  2555                 predicate = VertWS;
       
  2556                 if (!inclass)
       
  2557                     root = newCharProperty(predicate);
       
  2558             }
  2496             return -1;
  2559             return -1;
  2497         case 'w':
  2560         case 'w':
  2498             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2561             if (create) {
  2499                                ? new Utype(UnicodeProp.WORD)
  2562                 predicate = has(UNICODE_CHARACTER_CLASS) ?
  2500                                : new Ctype(ASCII.WORD);
  2563                             CharPredicates.WORD : CharPredicates.ASCII_WORD;
       
  2564                 if (!inclass)
       
  2565                     root = newCharProperty(predicate);
       
  2566             }
  2501             return -1;
  2567             return -1;
  2502         case 'x':
  2568         case 'x':
  2503             return x();
  2569             return x();
  2504         case 'y':
  2570         case 'y':
  2505             break;
  2571             break;
  2518      *
  2584      *
  2519      * Consumes a ] on the way out if consume is true. Usually consume
  2585      * Consumes a ] on the way out if consume is true. Usually consume
  2520      * is true except for the case of [abc&&def] where def is a separate
  2586      * is true except for the case of [abc&&def] where def is a separate
  2521      * right hand node with "understood" brackets.
  2587      * right hand node with "understood" brackets.
  2522      */
  2588      */
  2523     private CharProperty clazz(boolean consume) {
  2589     private CharPredicate clazz(boolean consume) {
  2524         CharProperty prev = null;
  2590         CharPredicate prev = null;
  2525         CharProperty node = null;
  2591         CharPredicate curr = null;
  2526         BitClass bits = new BitClass();
  2592         BitClass bits = new BitClass();
  2527         boolean include = true;
  2593         BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch];
  2528         boolean firstInClass = true;
  2594 
       
  2595         boolean isNeg = false;
       
  2596         boolean hasBits = false;
  2529         int ch = next();
  2597         int ch = next();
       
  2598 
       
  2599         // Negates if first char in a class, otherwise literal
       
  2600         if (ch == '^' && temp[cursor-1] == '[') {
       
  2601             ch = next();
       
  2602             isNeg = true;
       
  2603         }
  2530         for (;;) {
  2604         for (;;) {
  2531             switch (ch) {
  2605             switch (ch) {
  2532                 case '^':
       
  2533                     // Negates if first char in a class, otherwise literal
       
  2534                     if (firstInClass) {
       
  2535                         if (temp[cursor-1] != '[')
       
  2536                             break;
       
  2537                         ch = next();
       
  2538                         include = !include;
       
  2539                         continue;
       
  2540                     } else {
       
  2541                         // ^ not first in class, treat as literal
       
  2542                         break;
       
  2543                     }
       
  2544                 case '[':
  2606                 case '[':
  2545                     firstInClass = false;
  2607                     curr = clazz(true);
  2546                     node = clazz(true);
       
  2547                     if (prev == null)
  2608                     if (prev == null)
  2548                         prev = node;
  2609                         prev = curr;
  2549                     else
  2610                     else
  2550                         prev = union(prev, node);
  2611                         prev = prev.union(curr);
  2551                     ch = peek();
  2612                     ch = peek();
  2552                     continue;
  2613                     continue;
  2553                 case '&':
  2614                 case '&':
  2554                     firstInClass = false;
       
  2555                     ch = next();
  2615                     ch = next();
  2556                     if (ch == '&') {
  2616                     if (ch == '&') {
  2557                         ch = next();
  2617                         ch = next();
  2558                         CharProperty rightNode = null;
  2618                         CharPredicate right = null;
  2559                         while (ch != ']' && ch != '&') {
  2619                         while (ch != ']' && ch != '&') {
  2560                             if (ch == '[') {
  2620                             if (ch == '[') {
  2561                                 if (rightNode == null)
  2621                                 if (right == null)
  2562                                     rightNode = clazz(true);
  2622                                     right = clazz(true);
  2563                                 else
  2623                                 else
  2564                                     rightNode = union(rightNode, clazz(true));
  2624                                     right = right.union(clazz(true));
  2565                             } else { // abc&&def
  2625                             } else { // abc&&def
  2566                                 unread();
  2626                                 unread();
  2567                                 rightNode = clazz(false);
  2627                                 right = clazz(false);
  2568                             }
  2628                             }
  2569                             ch = peek();
  2629                             ch = peek();
  2570                         }
  2630                         }
  2571                         if (rightNode != null)
  2631                         if (hasBits) {
  2572                             node = rightNode;
  2632                             // bits used, union has high precedence
       
  2633                             if (prev == null) {
       
  2634                                 prev = curr = bitsP;
       
  2635                             } else {
       
  2636                                 prev = prev.union(bitsP);
       
  2637                             }
       
  2638                             hasBits = false;
       
  2639                         }
       
  2640                         if (right != null)
       
  2641                             curr = right;
  2573                         if (prev == null) {
  2642                         if (prev == null) {
  2574                             if (rightNode == null)
  2643                             if (right == null)
  2575                                 throw error("Bad class syntax");
  2644                                 throw error("Bad class syntax");
  2576                             else
  2645                             else
  2577                                 prev = rightNode;
  2646                                 prev = right;
  2578                         } else {
  2647                         } else {
  2579                             prev = intersection(prev, node);
  2648                             prev = prev.and(curr);
  2580                         }
  2649                         }
  2581                     } else {
  2650                     } else {
  2582                         // treat as a literal &
  2651                         // treat as a literal &
  2583                         unread();
  2652                         unread();
  2584                         break;
  2653                         break;
  2585                     }
  2654                     }
  2586                     continue;
  2655                     continue;
  2587                 case 0:
  2656                 case 0:
  2588                     firstInClass = false;
       
  2589                     if (cursor >= patternLength)
  2657                     if (cursor >= patternLength)
  2590                         throw error("Unclosed character class");
  2658                         throw error("Unclosed character class");
  2591                     break;
  2659                     break;
  2592                 case ']':
  2660                 case ']':
  2593                     firstInClass = false;
  2661                     if (prev != null || hasBits) {
  2594                     if (prev != null) {
       
  2595                         if (consume)
  2662                         if (consume)
  2596                             next();
  2663                             next();
       
  2664                         if (prev == null)
       
  2665                             prev = bitsP;
       
  2666                         else if (hasBits)
       
  2667                             prev = prev.union(bitsP);
       
  2668                         if (isNeg)
       
  2669                             return prev.negate();
  2597                         return prev;
  2670                         return prev;
  2598                     }
  2671                     }
  2599                     break;
  2672                     break;
  2600                 default:
  2673                 default:
  2601                     firstInClass = false;
       
  2602                     break;
  2674                     break;
  2603             }
  2675             }
  2604             node = range(bits);
  2676             curr = range(bits);
  2605             if (include) {
  2677             if (curr == null) {    // the bits used
  2606                 if (prev == null) {
  2678                 hasBits = true;
  2607                     prev = node;
       
  2608                 } else {
       
  2609                     if (prev != node)
       
  2610                         prev = union(prev, node);
       
  2611                 }
       
  2612             } else {
  2679             } else {
  2613                 if (prev == null) {
  2680                 if (prev == null)
  2614                     prev = node.complement();
  2681                     prev = curr;
  2615                 } else {
  2682                 else if (prev != curr)
  2616                     if (prev != node)
  2683                     prev = prev.union(curr);
  2617                         prev = setDifference(prev, node);
       
  2618                 }
       
  2619             }
  2684             }
  2620             ch = peek();
  2685             ch = peek();
  2621         }
  2686         }
  2622     }
  2687     }
  2623 
  2688 
  2624     private CharProperty bitsOrSingle(BitClass bits, int ch) {
  2689     private CharPredicate bitsOrSingle(BitClass bits, int ch) {
  2625         /* Bits can only handle codepoints in [u+0000-u+00ff] range.
  2690         /* Bits can only handle codepoints in [u+0000-u+00ff] range.
  2626            Use "single" node instead of bits when dealing with unicode
  2691            Use "single" node instead of bits when dealing with unicode
  2627            case folding for codepoints listed below.
  2692            case folding for codepoints listed below.
  2628            (1)Uppercase out of range: u+00ff, u+00b5
  2693            (1)Uppercase out of range: u+00ff, u+00b5
  2629               toUpperCase(u+00ff) -> u+0178
  2694               toUpperCase(u+00ff) -> u+0178
  2641         */
  2706         */
  2642         int d;
  2707         int d;
  2643         if (ch < 256 &&
  2708         if (ch < 256 &&
  2644             !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
  2709             !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
  2645               (ch == 0xff || ch == 0xb5 ||
  2710               (ch == 0xff || ch == 0xb5 ||
  2646                ch == 0x49 || ch == 0x69 ||  //I and i
  2711                ch == 0x49 || ch == 0x69 ||    //I and i
  2647                ch == 0x53 || ch == 0x73 ||  //S and s
  2712                ch == 0x53 || ch == 0x73 ||    //S and s
  2648                ch == 0x4b || ch == 0x6b ||  //K and k
  2713                ch == 0x4b || ch == 0x6b ||    //K and k
  2649                ch == 0xc5 || ch == 0xe5)))  //A+ring
  2714                ch == 0xc5 || ch == 0xe5))) {  //A+ring
  2650             return bits.add(ch, flags());
  2715             bits.add(ch, flags());
  2651         return newSingle(ch);
  2716             return null;
       
  2717         }
       
  2718         return single(ch);
       
  2719     }
       
  2720 
       
  2721     /**
       
  2722      *  Returns a suitably optimized, single character predicate
       
  2723      */
       
  2724     private CharPredicate single(final int ch) {
       
  2725         if (has(CASE_INSENSITIVE)) {
       
  2726             int lower, upper;
       
  2727             if (has(UNICODE_CASE)) {
       
  2728                 upper = Character.toUpperCase(ch);
       
  2729                 lower = Character.toLowerCase(upper);
       
  2730                 // Unicode case insensitive matches
       
  2731                 if (upper != lower)
       
  2732                     return SingleU(lower);
       
  2733             } else if (ASCII.isAscii(ch)) {
       
  2734                 lower = ASCII.toLower(ch);
       
  2735                 upper = ASCII.toUpper(ch);
       
  2736                 // Case insensitive matches a given BMP character
       
  2737                 if (lower != upper)
       
  2738                     return SingleI(lower, upper);
       
  2739             }
       
  2740         }
       
  2741         if (isSupplementary(ch))
       
  2742             return SingleS(ch);
       
  2743         return Single(ch);  // Match a given BMP character
  2652     }
  2744     }
  2653 
  2745 
  2654     /**
  2746     /**
  2655      * Parse a single character or a character range in a character class
  2747      * Parse a single character or a character range in a character class
  2656      * and return its representative node.
  2748      * and return its representative node.
  2657      */
  2749      */
  2658     private CharProperty range(BitClass bits) {
  2750     private CharPredicate range(BitClass bits) {
  2659         int ch = peek();
  2751         int ch = peek();
  2660         if (ch == '\\') {
  2752         if (ch == '\\') {
  2661             ch = nextEscaped();
  2753             ch = nextEscaped();
  2662             if (ch == 'p' || ch == 'P') { // A property
  2754             if (ch == 'p' || ch == 'P') { // A property
  2663                 boolean comp = (ch == 'P');
  2755                 boolean comp = (ch == 'P');
  2672             } else { // ordinary escape
  2764             } else { // ordinary escape
  2673                 boolean isrange = temp[cursor+1] == '-';
  2765                 boolean isrange = temp[cursor+1] == '-';
  2674                 unread();
  2766                 unread();
  2675                 ch = escape(true, true, isrange);
  2767                 ch = escape(true, true, isrange);
  2676                 if (ch == -1)
  2768                 if (ch == -1)
  2677                     return (CharProperty) root;
  2769                     return predicate;
  2678             }
  2770             }
  2679         } else {
  2771         } else {
  2680             next();
  2772             next();
  2681         }
  2773         }
  2682         if (ch >= 0) {
  2774         if (ch >= 0) {
  2694                         next();
  2786                         next();
  2695                     }
  2787                     }
  2696                     if (m < ch) {
  2788                     if (m < ch) {
  2697                         throw error("Illegal character range");
  2789                         throw error("Illegal character range");
  2698                     }
  2790                     }
  2699                     if (has(CASE_INSENSITIVE))
  2791                     if (has(CASE_INSENSITIVE)) {
  2700                         return caseInsensitiveRangeFor(ch, m);
  2792                         if (has(UNICODE_CASE))
  2701                     else
  2793                             return CIRangeU(ch, m);
  2702                         return rangeFor(ch, m);
  2794                         return CIRange(ch, m);
       
  2795                     } else {
       
  2796                         return Range(ch, m);
       
  2797                     }
  2703                 }
  2798                 }
  2704             }
  2799             }
  2705             return bitsOrSingle(bits, ch);
  2800             return bitsOrSingle(bits, ch);
  2706         }
  2801         }
  2707         throw error("Unexpected character '"+((char)ch)+"'");
  2802         throw error("Unexpected character '"+((char)ch)+"'");
  2708     }
  2803     }
  2709 
  2804 
  2710     /**
  2805     /**
  2711      * Parses a Unicode character family and returns its representative node.
  2806      * Parses a Unicode character family and returns its representative node.
  2712      */
  2807      */
  2713     private CharProperty family(boolean singleLetter,
  2808     private CharPredicate family(boolean singleLetter, boolean isComplement) {
  2714                                 boolean maybeComplement)
       
  2715     {
       
  2716         next();
  2809         next();
  2717         String name;
  2810         String name;
  2718         CharProperty node = null;
  2811         CharPredicate p = null;
  2719 
  2812 
  2720         if (singleLetter) {
  2813         if (singleLetter) {
  2721             int c = temp[cursor];
  2814             int c = temp[cursor];
  2722             if (!Character.isSupplementaryCodePoint(c)) {
  2815             if (!Character.isSupplementaryCodePoint(c)) {
  2723                 name = String.valueOf((char)c);
  2816                 name = String.valueOf((char)c);
  2745             String value = name.substring(i + 1);
  2838             String value = name.substring(i + 1);
  2746             name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
  2839             name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
  2747             switch (name) {
  2840             switch (name) {
  2748                 case "sc":
  2841                 case "sc":
  2749                 case "script":
  2842                 case "script":
  2750                     node = unicodeScriptPropertyFor(value);
  2843                     p = CharPredicates.forUnicodeScript(value);
  2751                     break;
  2844                     break;
  2752                 case "blk":
  2845                 case "blk":
  2753                 case "block":
  2846                 case "block":
  2754                     node = unicodeBlockPropertyFor(value);
  2847                     p = CharPredicates.forUnicodeBlock(value);
  2755                     break;
  2848                     break;
  2756                 case "gc":
  2849                 case "gc":
  2757                 case "general_category":
  2850                 case "general_category":
  2758                     node = charPropertyNodeFor(value);
  2851                     p = CharPredicates.forProperty(value);
  2759                     break;
  2852                     break;
  2760                 default:
  2853                 default:
  2761                     throw error("Unknown Unicode property {name=<" + name + ">, "
  2854                     break;
  2762                                 + "value=<" + value + ">}");
  2855             }
  2763             }
  2856             if (p == null)
       
  2857                 throw error("Unknown Unicode property {name=<" + name + ">, "
       
  2858                              + "value=<" + value + ">}");
       
  2859 
  2764         } else {
  2860         } else {
  2765             if (name.startsWith("In")) {
  2861             if (name.startsWith("In")) {
  2766                 // \p{inBlockName}
  2862                 // \p{InBlockName}
  2767                 node = unicodeBlockPropertyFor(name.substring(2));
  2863                 p = CharPredicates.forUnicodeBlock(name.substring(2));
  2768             } else if (name.startsWith("Is")) {
  2864             } else if (name.startsWith("Is")) {
  2769                 // \p{isGeneralCategory} and \p{isScriptName}
  2865                 // \p{IsGeneralCategory} and \p{IsScriptName}
  2770                 name = name.substring(2);
  2866                 name = name.substring(2);
  2771                 UnicodeProp uprop = UnicodeProp.forName(name);
  2867                 p = CharPredicates.forUnicodeProperty(name);
  2772                 if (uprop != null)
  2868                 if (p == null)
  2773                     node = new Utype(uprop);
  2869                     p = CharPredicates.forProperty(name);
  2774                 if (node == null)
  2870                 if (p == null)
  2775                     node = CharPropertyNames.charPropertyFor(name);
  2871                     p = CharPredicates.forUnicodeScript(name);
  2776                 if (node == null)
       
  2777                     node = unicodeScriptPropertyFor(name);
       
  2778             } else {
  2872             } else {
  2779                 if (has(UNICODE_CHARACTER_CLASS)) {
  2873                 if (has(UNICODE_CHARACTER_CLASS)) {
  2780                     UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
  2874                     p = CharPredicates.forPOSIXName(name);
  2781                     if (uprop != null)
  2875                 }
  2782                         node = new Utype(uprop);
  2876                 if (p == null)
  2783                 }
  2877                     p = CharPredicates.forProperty(name);
  2784                 if (node == null)
  2878             }
  2785                     node = charPropertyNodeFor(name);
  2879             if (p == null)
  2786             }
  2880                 throw error("Unknown character property name {In/Is" + name + "}");
  2787         }
  2881         }
  2788         if (maybeComplement) {
  2882         if (isComplement) {
  2789             if (node instanceof Category || node instanceof Block)
  2883             // it might be too expensive to detect if a complement of
  2790                 hasSupplementary = true;
  2884             // CharProperty can match "certain" supplementary. So just
  2791             node = node.complement();
  2885             // go with StartS.
  2792         }
  2886             hasSupplementary = true;
  2793         return node;
  2887             p = p.negate();
  2794     }
  2888         }
  2795 
  2889         return p;
  2796 
  2890     }
  2797     /**
  2891 
  2798      * Returns a CharProperty matching all characters belong to
  2892     private CharProperty newCharProperty(CharPredicate p) {
  2799      * a UnicodeScript.
       
  2800      */
       
  2801     private CharProperty unicodeScriptPropertyFor(String name) {
       
  2802         final Character.UnicodeScript script;
       
  2803         try {
       
  2804             script = Character.UnicodeScript.forName(name);
       
  2805         } catch (IllegalArgumentException iae) {
       
  2806             throw error("Unknown character script name {" + name + "}");
       
  2807         }
       
  2808         return new Script(script);
       
  2809     }
       
  2810 
       
  2811     /**
       
  2812      * Returns a CharProperty matching all characters in a UnicodeBlock.
       
  2813      */
       
  2814     private CharProperty unicodeBlockPropertyFor(String name) {
       
  2815         final Character.UnicodeBlock block;
       
  2816         try {
       
  2817             block = Character.UnicodeBlock.forName(name);
       
  2818         } catch (IllegalArgumentException iae) {
       
  2819             throw error("Unknown character block name {" + name + "}");
       
  2820         }
       
  2821         return new Block(block);
       
  2822     }
       
  2823 
       
  2824     /**
       
  2825      * Returns a CharProperty matching all characters in a named property.
       
  2826      */
       
  2827     private CharProperty charPropertyNodeFor(String name) {
       
  2828         CharProperty p = CharPropertyNames.charPropertyFor(name);
       
  2829         if (p == null)
  2893         if (p == null)
  2830             throw error("Unknown character property name {" + name + "}");
  2894             return null;
  2831         return p;
  2895         if (p instanceof BmpCharPredicate)
       
  2896             return new BmpCharProperty((BmpCharPredicate)p);
       
  2897         else
       
  2898             return new CharProperty(p);
  2832     }
  2899     }
  2833 
  2900 
  2834     /**
  2901     /**
  2835      * Parses and returns the name of a "named capturing group", the trailing
  2902      * Parses and returns the name of a "named capturing group", the trailing
  2836      * ">" is consumed after parsing.
  2903      * ">" is consumed after parsing.
  2857     private Node group0() {
  2924     private Node group0() {
  2858         boolean capturingGroup = false;
  2925         boolean capturingGroup = false;
  2859         Node head = null;
  2926         Node head = null;
  2860         Node tail = null;
  2927         Node tail = null;
  2861         int save = flags;
  2928         int save = flags;
       
  2929         int saveTCNCount = topClosureNodes.size();
  2862         root = null;
  2930         root = null;
  2863         int ch = next();
  2931         int ch = next();
  2864         if (ch == '?') {
  2932         if (ch == '?') {
  2865             ch = skip();
  2933             ch = skip();
  2866             switch (ch) {
  2934             switch (ch) {
  2882                 break;
  2950                 break;
  2883             case '>':   // (?>xxx)  independent group
  2951             case '>':   // (?>xxx)  independent group
  2884                 head = createGroup(true);
  2952                 head = createGroup(true);
  2885                 tail = root;
  2953                 tail = root;
  2886                 head.next = expr(tail);
  2954                 head.next = expr(tail);
  2887                 head = tail = new Ques(head, INDEPENDENT);
  2955                 head = tail = new Ques(head, Qtype.INDEPENDENT);
  2888                 break;
  2956                 break;
  2889             case '<':   // (?<xxx)  look behind
  2957             case '<':   // (?<xxx)  look behind
  2890                 ch = read();
  2958                 ch = read();
  2891                 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
  2959                 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
  2892                     // named captured group
  2960                     // named captured group
  2926                                    new NotBehind(head, info.maxLength,
  2994                                    new NotBehind(head, info.maxLength,
  2927                                                  info.minLength));
  2995                                                  info.minLength));
  2928                 } else {
  2996                 } else {
  2929                     throw error("Unknown look-behind group");
  2997                     throw error("Unknown look-behind group");
  2930                 }
  2998                 }
       
  2999                 // clear all top-closure-nodes inside lookbehind
       
  3000                 if (saveTCNCount < topClosureNodes.size())
       
  3001                     topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear();
  2931                 break;
  3002                 break;
  2932             case '$':
  3003             case '$':
  2933             case '@':
  3004             case '@':
  2934                 throw error("Unknown group type");
  3005                 throw error("Unknown group type");
  2935             default:    // (?xxx:) inlined match flags
  3006             default:    // (?xxx:) inlined match flags
  2966         if (head == tail) { // Zero length assertion
  3037         if (head == tail) { // Zero length assertion
  2967             root = node;
  3038             root = node;
  2968             return node;    // Dual return
  3039             return node;    // Dual return
  2969         }
  3040         }
  2970 
  3041 
       
  3042         // have group closure, clear all inner closure nodes from the
       
  3043         // top list (no backtracking stopper optimization for inner
       
  3044         if (saveTCNCount < topClosureNodes.size())
       
  3045             topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear();
       
  3046 
  2971         if (node instanceof Ques) {
  3047         if (node instanceof Ques) {
  2972             Ques ques = (Ques) node;
  3048             Ques ques = (Ques) node;
  2973             if (ques.type == POSSESSIVE) {
  3049             if (ques.type == Qtype.POSSESSIVE) {
  2974                 root = node;
  3050                 root = node;
  2975                 return node;
  3051                 return node;
  2976             }
  3052             }
  2977             tail.next = new BranchConn();
  3053             tail.next = new BranchConn();
  2978             tail = tail.next;
  3054             tail = tail.next;
  2979             if (ques.type == GREEDY) {
  3055             if (ques.type == Qtype.GREEDY) {
  2980                 head = new Branch(head, null, tail);
  3056                 head = new Branch(head, null, tail);
  2981             } else { // Reluctant quantifier
  3057             } else { // Reluctant quantifier
  2982                 head = new Branch(null, head, tail);
  3058                 head = new Branch(null, head, tail);
  2983             }
  3059             }
  2984             root = tail;
  3060             root = tail;
  2985             return head;
  3061             return head;
  2986         } else if (node instanceof Curly) {
  3062         } else if (node instanceof Curly) {
  2987             Curly curly = (Curly) node;
  3063             Curly curly = (Curly) node;
  2988             if (curly.type == POSSESSIVE) {
  3064             if (curly.type == Qtype.POSSESSIVE) {
  2989                 root = node;
  3065                 root = node;
  2990                 return node;
  3066                 return node;
  2991             }
  3067             }
  2992             // Discover if the group is deterministic
  3068             // Discover if the group is deterministic
  2993             TreeInfo info = new TreeInfo();
  3069             TreeInfo info = new TreeInfo();
  3000                                              capturingGroup);
  3076                                              capturingGroup);
  3001                 return head;
  3077                 return head;
  3002             } else { // Non-deterministic
  3078             } else { // Non-deterministic
  3003                 int temp = ((GroupHead) head).localIndex;
  3079                 int temp = ((GroupHead) head).localIndex;
  3004                 Loop loop;
  3080                 Loop loop;
  3005                 if (curly.type == GREEDY)
  3081                 if (curly.type == Qtype.GREEDY) {
  3006                     loop = new Loop(this.localCount, temp);
  3082                     loop = new Loop(this.localCount, temp);
  3007                 else  // Reluctant Curly
  3083                     // add the max_reps greedy to the top-closure-node list
       
  3084                     if (curly.cmax == MAX_REPS)
       
  3085                         topClosureNodes.add(loop);
       
  3086                 } else {  // Reluctant Curly
  3008                     loop = new LazyLoop(this.localCount, temp);
  3087                     loop = new LazyLoop(this.localCount, temp);
       
  3088                 }
  3009                 Prolog prolog = new Prolog(loop);
  3089                 Prolog prolog = new Prolog(loop);
  3010                 this.localCount += 1;
  3090                 this.localCount += 1;
  3011                 loop.cmin = curly.cmin;
  3091                 loop.cmin = curly.cmin;
  3012                 loop.cmax = curly.cmax;
  3092                 loop.cmax = curly.cmax;
  3013                 loop.body = head;
  3093                 loop.body = head;
  3029         int groupIndex = 0;
  3109         int groupIndex = 0;
  3030         if (!anonymous)
  3110         if (!anonymous)
  3031             groupIndex = capturingGroupCount++;
  3111             groupIndex = capturingGroupCount++;
  3032         GroupHead head = new GroupHead(localIndex);
  3112         GroupHead head = new GroupHead(localIndex);
  3033         root = new GroupTail(localIndex, groupIndex);
  3113         root = new GroupTail(localIndex, groupIndex);
       
  3114 
       
  3115         // for debug/print only, head.match does NOT need the "tail" info
       
  3116         head.tail = (GroupTail)root;
       
  3117 
  3034         if (!anonymous && groupIndex < 10)
  3118         if (!anonymous && groupIndex < 10)
  3035             groupNodes[groupIndex] = head;
  3119             groupNodes[groupIndex] = head;
  3036         return head;
  3120         return head;
  3037     }
  3121     }
  3038 
  3122 
  3117         }
  3201         }
  3118     }
  3202     }
  3119 
  3203 
  3120     static final int MAX_REPS   = 0x7FFFFFFF;
  3204     static final int MAX_REPS   = 0x7FFFFFFF;
  3121 
  3205 
  3122     static final int GREEDY     = 0;
  3206     static enum Qtype {
  3123 
  3207         GREEDY, LAZY, POSSESSIVE, INDEPENDENT
  3124     static final int LAZY       = 1;
  3208     }
  3125 
  3209 
  3126     static final int POSSESSIVE = 2;
  3210     private Node curly(Node prev, int cmin) {
  3127 
  3211         int ch = next();
  3128     static final int INDEPENDENT = 3;
  3212         if (ch == '?') {
       
  3213             next();
       
  3214             return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY);
       
  3215         } else if (ch == '+') {
       
  3216             next();
       
  3217             return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE);
       
  3218         }
       
  3219         if (prev instanceof BmpCharProperty) {
       
  3220             return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);
       
  3221         } else if (prev instanceof CharProperty) {
       
  3222             return new CharPropertyGreedy((CharProperty)prev, cmin);
       
  3223         }
       
  3224         return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY);
       
  3225     }
  3129 
  3226 
  3130     /**
  3227     /**
  3131      * Processes repetition. If the next character peeked is a quantifier
  3228      * Processes repetition. If the next character peeked is a quantifier
  3132      * then new nodes must be appended to handle the repetition.
  3229      * then new nodes must be appended to handle the repetition.
  3133      * Prev could be a single or a group, so it could be a chain of nodes.
  3230      * Prev could be a single or a group, so it could be a chain of nodes.
  3138         switch (ch) {
  3235         switch (ch) {
  3139         case '?':
  3236         case '?':
  3140             ch = next();
  3237             ch = next();
  3141             if (ch == '?') {
  3238             if (ch == '?') {
  3142                 next();
  3239                 next();
  3143                 return new Ques(prev, LAZY);
  3240                 return new Ques(prev, Qtype.LAZY);
  3144             } else if (ch == '+') {
  3241             } else if (ch == '+') {
  3145                 next();
  3242                 next();
  3146                 return new Ques(prev, POSSESSIVE);
  3243                 return new Ques(prev, Qtype.POSSESSIVE);
  3147             }
  3244             }
  3148             return new Ques(prev, GREEDY);
  3245             return new Ques(prev, Qtype.GREEDY);
  3149         case '*':
  3246         case '*':
  3150             ch = next();
  3247             return curly(prev, 0);
  3151             if (ch == '?') {
       
  3152                 next();
       
  3153                 return new Curly(prev, 0, MAX_REPS, LAZY);
       
  3154             } else if (ch == '+') {
       
  3155                 next();
       
  3156                 return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
       
  3157             }
       
  3158             return new Curly(prev, 0, MAX_REPS, GREEDY);
       
  3159         case '+':
  3248         case '+':
  3160             ch = next();
  3249             return curly(prev, 1);
  3161             if (ch == '?') {
       
  3162                 next();
       
  3163                 return new Curly(prev, 1, MAX_REPS, LAZY);
       
  3164             } else if (ch == '+') {
       
  3165                 next();
       
  3166                 return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
       
  3167             }
       
  3168             return new Curly(prev, 1, MAX_REPS, GREEDY);
       
  3169         case '{':
  3250         case '{':
  3170             ch = temp[cursor+1];
  3251             ch = temp[cursor+1];
  3171             if (ASCII.isDigit(ch)) {
  3252             if (ASCII.isDigit(ch)) {
  3172                 skip();
  3253                 skip();
  3173                 int cmin = 0;
  3254                 int cmin = 0;
  3192                     throw error("Illegal repetition range");
  3273                     throw error("Illegal repetition range");
  3193                 Curly curly;
  3274                 Curly curly;
  3194                 ch = peek();
  3275                 ch = peek();
  3195                 if (ch == '?') {
  3276                 if (ch == '?') {
  3196                     next();
  3277                     next();
  3197                     curly = new Curly(prev, cmin, cmax, LAZY);
  3278                     curly = new Curly(prev, cmin, cmax, Qtype.LAZY);
  3198                 } else if (ch == '+') {
  3279                 } else if (ch == '+') {
  3199                     next();
  3280                     next();
  3200                     curly = new Curly(prev, cmin, cmax, POSSESSIVE);
  3281                     curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);
  3201                 } else {
  3282                 } else {
  3202                     curly = new Curly(prev, cmin, cmax, GREEDY);
  3283                     curly = new Curly(prev, cmin, cmax, Qtype.GREEDY);
  3203                 }
  3284                 }
  3204                 return curly;
  3285                 return curly;
  3205             } else {
  3286             } else {
  3206                 throw error("Illegal repetition");
  3287                 throw error("Illegal repetition");
  3207             }
  3288             }
  3374     /**
  3455     /**
  3375      *  Creates a bit vector for matching Latin-1 values. A normal BitClass
  3456      *  Creates a bit vector for matching Latin-1 values. A normal BitClass
  3376      *  never matches values above Latin-1, and a complemented BitClass always
  3457      *  never matches values above Latin-1, and a complemented BitClass always
  3377      *  matches values above Latin-1.
  3458      *  matches values above Latin-1.
  3378      */
  3459      */
  3379     private static final class BitClass extends BmpCharProperty {
  3460     static final class BitClass extends BmpCharProperty {
  3380         final boolean[] bits;
  3461         final boolean[] bits;
  3381         BitClass() { bits = new boolean[256]; }
  3462         BitClass() {
  3382         private BitClass(boolean[] bits) { this.bits = bits; }
  3463             this(new boolean[256]);
       
  3464         }
       
  3465         private BitClass(boolean[] bits) {
       
  3466             super( ch -> ch < 256 && bits[ch]);
       
  3467             this.bits = bits;
       
  3468         }
  3383         BitClass add(int c, int flags) {
  3469         BitClass add(int c, int flags) {
  3384             assert c >= 0 && c <= 255;
  3470             assert c >= 0 && c <= 255;
  3385             if ((flags & CASE_INSENSITIVE) != 0) {
  3471             if ((flags & CASE_INSENSITIVE) != 0) {
  3386                 if (ASCII.isAscii(c)) {
  3472                 if (ASCII.isAscii(c)) {
  3387                     bits[ASCII.toUpper(c)] = true;
  3473                     bits[ASCII.toUpper(c)] = true;
  3392                 }
  3478                 }
  3393             }
  3479             }
  3394             bits[c] = true;
  3480             bits[c] = true;
  3395             return this;
  3481             return this;
  3396         }
  3482         }
  3397         boolean isSatisfiedBy(int ch) {
       
  3398             return ch < 256 && bits[ch];
       
  3399         }
       
  3400     }
       
  3401 
       
  3402     /**
       
  3403      *  Returns a suitably optimized, single character matcher.
       
  3404      */
       
  3405     private CharProperty newSingle(final int ch) {
       
  3406         if (has(CASE_INSENSITIVE)) {
       
  3407             int lower, upper;
       
  3408             if (has(UNICODE_CASE)) {
       
  3409                 upper = Character.toUpperCase(ch);
       
  3410                 lower = Character.toLowerCase(upper);
       
  3411                 if (upper != lower)
       
  3412                     return new SingleU(lower);
       
  3413             } else if (ASCII.isAscii(ch)) {
       
  3414                 lower = ASCII.toLower(ch);
       
  3415                 upper = ASCII.toUpper(ch);
       
  3416                 if (lower != upper)
       
  3417                     return new SingleI(lower, upper);
       
  3418             }
       
  3419         }
       
  3420         if (isSupplementary(ch))
       
  3421             return new SingleS(ch);    // Match a given Unicode character
       
  3422         return new Single(ch);         // Match a given BMP character
       
  3423     }
  3483     }
  3424 
  3484 
  3425     /**
  3485     /**
  3426      *  Utility method for creating a string slice matcher.
  3486      *  Utility method for creating a string slice matcher.
  3427      */
  3487      */
  3825 
  3885 
  3826     /**
  3886     /**
  3827      * Abstract node class to match one character satisfying some
  3887      * Abstract node class to match one character satisfying some
  3828      * boolean property.
  3888      * boolean property.
  3829      */
  3889      */
  3830     private abstract static class CharProperty extends Node {
  3890     static class CharProperty extends Node {
  3831         abstract boolean isSatisfiedBy(int ch);
  3891         CharPredicate predicate;
  3832         CharProperty complement() {
  3892 
  3833             return new CharProperty() {
  3893         CharProperty (CharPredicate predicate) {
  3834                     boolean isSatisfiedBy(int ch) {
  3894             this.predicate = predicate;
  3835                         return ! CharProperty.this.isSatisfiedBy(ch);}};
       
  3836         }
  3895         }
  3837         boolean match(Matcher matcher, int i, CharSequence seq) {
  3896         boolean match(Matcher matcher, int i, CharSequence seq) {
  3838             if (i < matcher.to) {
  3897             if (i < matcher.to) {
  3839                 int ch = Character.codePointAt(seq, i);
  3898                 int ch = Character.codePointAt(seq, i);
  3840                 return isSatisfiedBy(ch)
  3899                 return predicate.is(ch) &&
  3841                     && next.match(matcher, i+Character.charCount(ch), seq);
  3900                        next.match(matcher, i + Character.charCount(ch), seq);
  3842             } else {
  3901             } else {
  3843                 matcher.hitEnd = true;
  3902                 matcher.hitEnd = true;
  3844                 return false;
  3903                 return false;
  3845             }
  3904             }
  3846         }
  3905         }
  3853 
  3912 
  3854     /**
  3913     /**
  3855      * Optimized version of CharProperty that works only for
  3914      * Optimized version of CharProperty that works only for
  3856      * properties never satisfied by Supplementary characters.
  3915      * properties never satisfied by Supplementary characters.
  3857      */
  3916      */
  3858     private abstract static class BmpCharProperty extends CharProperty {
  3917     private static class BmpCharProperty extends CharProperty {
       
  3918         BmpCharProperty (BmpCharPredicate predicate) {
       
  3919             super(predicate);
       
  3920         }
  3859         boolean match(Matcher matcher, int i, CharSequence seq) {
  3921         boolean match(Matcher matcher, int i, CharSequence seq) {
  3860             if (i < matcher.to) {
  3922             if (i < matcher.to) {
  3861                 return isSatisfiedBy(seq.charAt(i))
  3923                 return predicate.is(seq.charAt(i)) &&
  3862                     && next.match(matcher, i+1, seq);
  3924                        next.match(matcher, i + 1, seq);
  3863             } else {
  3925             } else {
  3864                 matcher.hitEnd = true;
  3926                 matcher.hitEnd = true;
  3865                 return false;
  3927                 return false;
  3866             }
  3928             }
  3867         }
  3929         }
  3868     }
  3930     }
  3869 
  3931 
  3870     /**
  3932     private static class NFCCharProperty extends Node {
  3871      * Node class that matches a Supplementary Unicode character
  3933         CharPredicate predicate;
  3872      */
  3934         NFCCharProperty (CharPredicate predicate) {
  3873     static final class SingleS extends CharProperty {
  3935             this.predicate = predicate;
  3874         final int c;
  3936         }
  3875         SingleS(int c) { this.c = c; }
  3937 
  3876         boolean isSatisfiedBy(int ch) {
  3938         boolean match(Matcher matcher, int i, CharSequence seq) {
  3877             return ch == c;
  3939             if (i < matcher.to) {
  3878         }
  3940                 int ch0 = Character.codePointAt(seq, i);
  3879     }
  3941                 int n = Character.charCount(ch0);
  3880 
  3942                 int j = i + n;
  3881     /**
  3943                 while (j < matcher.to) {
  3882      * Optimization -- matches a given BMP character
  3944                     int ch1 = Character.codePointAt(seq, j);
  3883      */
  3945                     if (Grapheme.isBoundary(ch0, ch1))
  3884     static final class Single extends BmpCharProperty {
  3946                         break;
  3885         final int c;
  3947                     ch0 = ch1;
  3886         Single(int c) { this.c = c; }
  3948                     j += Character.charCount(ch1);
  3887         boolean isSatisfiedBy(int ch) {
  3949                 }
  3888             return ch == c;
  3950                 if (i + n == j) {    // single, assume nfc cp
  3889         }
  3951                     if (predicate.is(ch0))
  3890     }
  3952                         return next.match(matcher, j, seq);
  3891 
  3953                 } else {
  3892     /**
  3954                     while (i + n < j) {
  3893      * Case insensitive matches a given BMP character
  3955                         String nfc = Normalizer.normalize(
  3894      */
  3956                             seq.toString().substring(i, j), Normalizer.Form.NFC);
  3895     static final class SingleI extends BmpCharProperty {
  3957                         if (nfc.codePointCount(0, nfc.length()) == 1) {
  3896         final int lower;
  3958                             if (predicate.is(nfc.codePointAt(0)) &&
  3897         final int upper;
  3959                                 next.match(matcher, j, seq)) {
  3898         SingleI(int lower, int upper) {
  3960                                 return true;
  3899             this.lower = lower;
  3961                             }
  3900             this.upper = upper;
  3962                         }
  3901         }
  3963 
  3902         boolean isSatisfiedBy(int ch) {
  3964                         ch0 = Character.codePointBefore(seq, j);
  3903             return ch == lower || ch == upper;
  3965                         j -= Character.charCount(ch0);
  3904         }
  3966                     }
  3905     }
  3967                 }
  3906 
  3968                 if (j < matcher.to)
  3907     /**
  3969                     return false;
  3908      * Unicode case insensitive matches a given Unicode character
  3970             }
  3909      */
  3971             matcher.hitEnd = true;
  3910     static final class SingleU extends CharProperty {
  3972             return false;
  3911         final int lower;
  3973         }
  3912         SingleU(int lower) {
  3974 
  3913             this.lower = lower;
  3975         boolean study(TreeInfo info) {
  3914         }
  3976             info.minLength++;
  3915         boolean isSatisfiedBy(int ch) {
  3977             info.deterministic = false;
  3916             return lower == ch ||
  3978             return next.study(info);
  3917                 lower == Character.toLowerCase(Character.toUpperCase(ch));
       
  3918         }
       
  3919     }
       
  3920 
       
  3921     /**
       
  3922      * Node class that matches a Unicode block.
       
  3923      */
       
  3924     static final class Block extends CharProperty {
       
  3925         final Character.UnicodeBlock block;
       
  3926         Block(Character.UnicodeBlock block) {
       
  3927             this.block = block;
       
  3928         }
       
  3929         boolean isSatisfiedBy(int ch) {
       
  3930             return block == Character.UnicodeBlock.of(ch);
       
  3931         }
       
  3932     }
       
  3933 
       
  3934     /**
       
  3935      * Node class that matches a Unicode script
       
  3936      */
       
  3937     static final class Script extends CharProperty {
       
  3938         final Character.UnicodeScript script;
       
  3939         Script(Character.UnicodeScript script) {
       
  3940             this.script = script;
       
  3941         }
       
  3942         boolean isSatisfiedBy(int ch) {
       
  3943             return script == Character.UnicodeScript.of(ch);
       
  3944         }
       
  3945     }
       
  3946 
       
  3947     /**
       
  3948      * Node class that matches a Unicode category.
       
  3949      */
       
  3950     static final class Category extends CharProperty {
       
  3951         final int typeMask;
       
  3952         Category(int typeMask) { this.typeMask = typeMask; }
       
  3953         boolean isSatisfiedBy(int ch) {
       
  3954             return (typeMask & (1 << Character.getType(ch))) != 0;
       
  3955         }
       
  3956     }
       
  3957 
       
  3958     /**
       
  3959      * Node class that matches a Unicode "type"
       
  3960      */
       
  3961     static final class Utype extends CharProperty {
       
  3962         final UnicodeProp uprop;
       
  3963         Utype(UnicodeProp uprop) { this.uprop = uprop; }
       
  3964         boolean isSatisfiedBy(int ch) {
       
  3965             return uprop.is(ch);
       
  3966         }
       
  3967     }
       
  3968 
       
  3969     /**
       
  3970      * Node class that matches a POSIX type.
       
  3971      */
       
  3972     static final class Ctype extends BmpCharProperty {
       
  3973         final int ctype;
       
  3974         Ctype(int ctype) { this.ctype = ctype; }
       
  3975         boolean isSatisfiedBy(int ch) {
       
  3976             return ch < 128 && ASCII.isType(ch, ctype);
       
  3977         }
       
  3978     }
       
  3979 
       
  3980     /**
       
  3981      * Node class that matches a Perl vertical whitespace
       
  3982      */
       
  3983     static final class VertWS extends BmpCharProperty {
       
  3984         boolean isSatisfiedBy(int cp) {
       
  3985             return (cp >= 0x0A && cp <= 0x0D) ||
       
  3986                    cp == 0x85 || cp == 0x2028 || cp == 0x2029;
       
  3987         }
       
  3988     }
       
  3989 
       
  3990     /**
       
  3991      * Node class that matches a Perl horizontal whitespace
       
  3992      */
       
  3993     static final class HorizWS extends BmpCharProperty {
       
  3994         boolean isSatisfiedBy(int cp) {
       
  3995             return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
       
  3996                    cp == 0x1680 || cp == 0x180e ||
       
  3997                    cp >= 0x2000 && cp <= 0x200a ||
       
  3998                    cp == 0x202f || cp == 0x205f || cp == 0x3000;
       
  3999         }
  3979         }
  4000     }
  3980     }
  4001 
  3981 
  4002     /**
  3982     /**
  4003      * Node class that matches an unicode extended grapheme cluster
  3983      * Node class that matches an unicode extended grapheme cluster
  4215         int toLower(int c) {
  4195         int toLower(int c) {
  4216             return Character.toLowerCase(Character.toUpperCase(c));
  4196             return Character.toLowerCase(Character.toUpperCase(c));
  4217         }
  4197         }
  4218     }
  4198     }
  4219 
  4199 
  4220     private static boolean inRange(int lower, int ch, int upper) {
       
  4221         return lower <= ch && ch <= upper;
       
  4222     }
       
  4223 
       
  4224     /**
       
  4225      * Returns node for matching characters within an explicit value range.
       
  4226      */
       
  4227     private static CharProperty rangeFor(final int lower,
       
  4228                                          final int upper) {
       
  4229         return new CharProperty() {
       
  4230                 boolean isSatisfiedBy(int ch) {
       
  4231                     return inRange(lower, ch, upper);}};
       
  4232     }
       
  4233 
       
  4234     /**
       
  4235      * Returns node for matching characters within an explicit value
       
  4236      * range in a case insensitive manner.
       
  4237      */
       
  4238     private CharProperty caseInsensitiveRangeFor(final int lower,
       
  4239                                                  final int upper) {
       
  4240         if (has(UNICODE_CASE))
       
  4241             return new CharProperty() {
       
  4242                 boolean isSatisfiedBy(int ch) {
       
  4243                     if (inRange(lower, ch, upper))
       
  4244                         return true;
       
  4245                     int up = Character.toUpperCase(ch);
       
  4246                     return inRange(lower, up, upper) ||
       
  4247                            inRange(lower, Character.toLowerCase(up), upper);}};
       
  4248         return new CharProperty() {
       
  4249             boolean isSatisfiedBy(int ch) {
       
  4250                 return inRange(lower, ch, upper) ||
       
  4251                     ASCII.isAscii(ch) &&
       
  4252                         (inRange(lower, ASCII.toUpper(ch), upper) ||
       
  4253                          inRange(lower, ASCII.toLower(ch), upper));
       
  4254             }};
       
  4255     }
       
  4256 
       
  4257     /**
       
  4258      * Implements the Unicode category ALL and the dot metacharacter when
       
  4259      * in dotall mode.
       
  4260      */
       
  4261     static final class All extends CharProperty {
       
  4262         boolean isSatisfiedBy(int ch) {
       
  4263             return true;
       
  4264         }
       
  4265     }
       
  4266 
       
  4267     /**
       
  4268      * Node class for the dot metacharacter when dotall is not enabled.
       
  4269      */
       
  4270     static final class Dot extends CharProperty {
       
  4271         boolean isSatisfiedBy(int ch) {
       
  4272             return (ch != '\n' && ch != '\r'
       
  4273                     && (ch|1) != '\u2029'
       
  4274                     && ch != '\u0085');
       
  4275         }
       
  4276     }
       
  4277 
       
  4278     /**
       
  4279      * Node class for the dot metacharacter when dotall is not enabled
       
  4280      * but UNIX_LINES is enabled.
       
  4281      */
       
  4282     static final class UnixDot extends CharProperty {
       
  4283         boolean isSatisfiedBy(int ch) {
       
  4284             return ch != '\n';
       
  4285         }
       
  4286     }
       
  4287 
       
  4288     /**
  4200     /**
  4289      * The 0 or 1 quantifier. This one class implements all three types.
  4201      * The 0 or 1 quantifier. This one class implements all three types.
  4290      */
  4202      */
  4291     static final class Ques extends Node {
  4203     static final class Ques extends Node {
  4292         Node atom;
  4204         Node atom;
  4293         int type;
  4205         Qtype type;
  4294         Ques(Node node, int type) {
  4206         Ques(Node node, Qtype type) {
  4295             this.atom = node;
  4207             this.atom = node;
  4296             this.type = type;
  4208             this.type = type;
  4297         }
  4209         }
  4298         boolean match(Matcher matcher, int i, CharSequence seq) {
  4210         boolean match(Matcher matcher, int i, CharSequence seq) {
  4299             switch (type) {
  4211             switch (type) {
  4309             default:
  4221             default:
  4310                 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
  4222                 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
  4311             }
  4223             }
  4312         }
  4224         }
  4313         boolean study(TreeInfo info) {
  4225         boolean study(TreeInfo info) {
  4314             if (type != INDEPENDENT) {
  4226             if (type != Qtype.INDEPENDENT) {
  4315                 int minL = info.minLength;
  4227                 int minL = info.minLength;
  4316                 atom.study(info);
  4228                 atom.study(info);
  4317                 info.minLength = minL;
  4229                 info.minLength = minL;
  4318                 info.deterministic = false;
  4230                 info.deterministic = false;
  4319                 return next.study(info);
  4231                 return next.study(info);
  4323             }
  4235             }
  4324         }
  4236         }
  4325     }
  4237     }
  4326 
  4238 
  4327     /**
  4239     /**
       
  4240      * Handles the greedy style repetition with the minimum either be
       
  4241      * 0 or 1 and the maximum be MAX_REPS, for * and + quantifier.
       
  4242      */
       
  4243     static class CharPropertyGreedy extends Node {
       
  4244         final CharPredicate predicate;
       
  4245         final int cmin;
       
  4246 
       
  4247         CharPropertyGreedy(CharProperty cp, int cmin) {
       
  4248             this.predicate = cp.predicate;
       
  4249             this.cmin = cmin;
       
  4250         }
       
  4251         boolean match(Matcher matcher, int i,  CharSequence seq) {
       
  4252             int n = 0;
       
  4253             int to = matcher.to;
       
  4254             // greedy, all the way down
       
  4255             while (i < to) {
       
  4256                 int ch = Character.codePointAt(seq, i);
       
  4257                 if (!predicate.is(ch))
       
  4258                    break;
       
  4259                 i += Character.charCount(ch);
       
  4260                 n++;
       
  4261             }
       
  4262             if (i >= to) {
       
  4263                 matcher.hitEnd = true;
       
  4264             }
       
  4265             while (n >= cmin) {
       
  4266                 if (next.match(matcher, i, seq))
       
  4267                     return true;
       
  4268                 if (n == cmin)
       
  4269                     return false;
       
  4270                  // backing off if match fails
       
  4271                 int ch = Character.codePointBefore(seq, i);
       
  4272                 i -= Character.charCount(ch);
       
  4273                 n--;
       
  4274             }
       
  4275             return false;
       
  4276         }
       
  4277 
       
  4278         boolean study(TreeInfo info) {
       
  4279             info.minLength += cmin;
       
  4280             if (info.maxValid) {
       
  4281                 info.maxLength += MAX_REPS;
       
  4282             }
       
  4283             info.deterministic = false;
       
  4284             return next.study(info);
       
  4285         }
       
  4286     }
       
  4287 
       
  4288     static final class BmpCharPropertyGreedy extends CharPropertyGreedy {
       
  4289 
       
  4290         BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) {
       
  4291             super(bcp, cmin);
       
  4292         }
       
  4293 
       
  4294         boolean match(Matcher matcher, int i,  CharSequence seq) {
       
  4295             int n = 0;
       
  4296             int to = matcher.to;
       
  4297             while (i < to && predicate.is(seq.charAt(i))) {
       
  4298                 i++; n++;
       
  4299             }
       
  4300             if (i >= to) {
       
  4301                 matcher.hitEnd = true;
       
  4302             }
       
  4303             while (n >= cmin) {
       
  4304                 if (next.match(matcher, i, seq))
       
  4305                     return true;
       
  4306                 i--; n--;  // backing off if match fails
       
  4307             }
       
  4308             return false;
       
  4309         }
       
  4310     }
       
  4311 
       
  4312     /**
  4328      * Handles the curly-brace style repetition with a specified minimum and
  4313      * Handles the curly-brace style repetition with a specified minimum and
  4329      * maximum occurrences. The * quantifier is handled as a special case.
  4314      * maximum occurrences. The * quantifier is handled as a special case.
  4330      * This class handles the three types.
  4315      * This class handles the three types.
  4331      */
  4316      */
  4332     static final class Curly extends Node {
  4317     static final class Curly extends Node {
  4333         Node atom;
  4318         Node atom;
  4334         int type;
  4319         Qtype type;
  4335         int cmin;
  4320         int cmin;
  4336         int cmax;
  4321         int cmax;
  4337 
  4322 
  4338         Curly(Node node, int cmin, int cmax, int type) {
  4323         Curly(Node node, int cmin, int cmax, Qtype type) {
  4339             this.atom = node;
  4324             this.atom = node;
  4340             this.type = type;
  4325             this.type = type;
  4341             this.cmin = cmin;
  4326             this.cmin = cmin;
  4342             this.cmax = cmax;
  4327             this.cmax = cmax;
  4343         }
  4328         }
  4348                     i = matcher.last;
  4333                     i = matcher.last;
  4349                     continue;
  4334                     continue;
  4350                 }
  4335                 }
  4351                 return false;
  4336                 return false;
  4352             }
  4337             }
  4353             if (type == GREEDY)
  4338             if (type == Qtype.GREEDY)
  4354                 return match0(matcher, i, j, seq);
  4339                 return match0(matcher, i, j, seq);
  4355             else if (type == LAZY)
  4340             else if (type == Qtype.LAZY)
  4356                 return match1(matcher, i, j, seq);
  4341                 return match1(matcher, i, j, seq);
  4357             else
  4342             else
  4358                 return match2(matcher, i, j, seq);
  4343                 return match2(matcher, i, j, seq);
  4359         }
  4344         }
  4360         // Greedy match.
  4345         // Greedy match.
  4472      * If capture is true then this class saves group settings and ensures
  4457      * If capture is true then this class saves group settings and ensures
  4473      * that groups are unset when backing off of a group match.
  4458      * that groups are unset when backing off of a group match.
  4474      */
  4459      */
  4475     static final class GroupCurly extends Node {
  4460     static final class GroupCurly extends Node {
  4476         Node atom;
  4461         Node atom;
  4477         int type;
  4462         Qtype type;
  4478         int cmin;
  4463         int cmin;
  4479         int cmax;
  4464         int cmax;
  4480         int localIndex;
  4465         int localIndex;
  4481         int groupIndex;
  4466         int groupIndex;
  4482         boolean capture;
  4467         boolean capture;
  4483 
  4468 
  4484         GroupCurly(Node node, int cmin, int cmax, int type, int local,
  4469         GroupCurly(Node node, int cmin, int cmax, Qtype type, int local,
  4485                    int group, boolean capture) {
  4470                    int group, boolean capture) {
  4486             this.atom = node;
  4471             this.atom = node;
  4487             this.type = type;
  4472             this.type = type;
  4488             this.cmin = cmin;
  4473             this.cmin = cmin;
  4489             this.cmax = cmax;
  4474             this.cmax = cmax;
  4519                     ret = false;
  4504                     ret = false;
  4520                     break;
  4505                     break;
  4521                 }
  4506                 }
  4522             }
  4507             }
  4523             if (ret) {
  4508             if (ret) {
  4524                 if (type == GREEDY) {
  4509                 if (type == Qtype.GREEDY) {
  4525                     ret = match0(matcher, i, cmin, seq);
  4510                     ret = match0(matcher, i, cmin, seq);
  4526                 } else if (type == LAZY) {
  4511                 } else if (type == Qtype.LAZY) {
  4527                     ret = match1(matcher, i, cmin, seq);
  4512                     ret = match1(matcher, i, cmin, seq);
  4528                 } else {
  4513                 } else {
  4529                     ret = match2(matcher, i, cmin, seq);
  4514                     ret = match2(matcher, i, cmin, seq);
  4530                 }
  4515                 }
  4531             }
  4516             }
  4767      * indicate that we do not want to unset the group if the reference
  4752      * indicate that we do not want to unset the group if the reference
  4768      * doesn't match.
  4753      * doesn't match.
  4769      */
  4754      */
  4770     static final class GroupHead extends Node {
  4755     static final class GroupHead extends Node {
  4771         int localIndex;
  4756         int localIndex;
       
  4757         GroupTail tail;    // for debug/print only, match does not need to know
  4772         GroupHead(int localCount) {
  4758         GroupHead(int localCount) {
  4773             localIndex = localCount;
  4759             localIndex = localCount;
  4774         }
  4760         }
  4775         boolean match(Matcher matcher, int i, CharSequence seq) {
  4761         boolean match(Matcher matcher, int i, CharSequence seq) {
  4776             int save = matcher.locals[localIndex];
  4762             int save = matcher.locals[localIndex];
  4874     static class Loop extends Node {
  4860     static class Loop extends Node {
  4875         Node body;
  4861         Node body;
  4876         int countIndex; // local count index in matcher locals
  4862         int countIndex; // local count index in matcher locals
  4877         int beginIndex; // group beginning index
  4863         int beginIndex; // group beginning index
  4878         int cmin, cmax;
  4864         int cmin, cmax;
       
  4865         int posIndex;
  4879         Loop(int countIndex, int beginIndex) {
  4866         Loop(int countIndex, int beginIndex) {
  4880             this.countIndex = countIndex;
  4867             this.countIndex = countIndex;
  4881             this.beginIndex = beginIndex;
  4868             this.beginIndex = beginIndex;
       
  4869             this.posIndex = -1;
  4882         }
  4870         }
  4883         boolean match(Matcher matcher, int i, CharSequence seq) {
  4871         boolean match(Matcher matcher, int i, CharSequence seq) {
  4884             // Avoid infinite loop in zero-length case.
  4872             // Avoid infinite loop in zero-length case.
  4885             if (i > matcher.locals[beginIndex]) {
  4873             if (i > matcher.locals[beginIndex]) {
  4886                 int count = matcher.locals[countIndex];
  4874                 int count = matcher.locals[countIndex];
  4899                     return b;
  4887                     return b;
  4900                 }
  4888                 }
  4901                 // This block is for after we have the minimum
  4889                 // This block is for after we have the minimum
  4902                 // iterations required for the loop to match
  4890                 // iterations required for the loop to match
  4903                 if (count < cmax) {
  4891                 if (count < cmax) {
       
  4892                     // Let's check if we have already tried and failed
       
  4893                     // at this starting position "i" in the past.
       
  4894                     // If yes, then just return false wihtout trying
       
  4895                     // again, to stop the exponential backtracking.
       
  4896                     if (posIndex != -1 &&
       
  4897                         matcher.localsPos[posIndex].contains(i)) {
       
  4898                         return next.match(matcher, i, seq);
       
  4899                     }
  4904                     matcher.locals[countIndex] = count + 1;
  4900                     matcher.locals[countIndex] = count + 1;
  4905                     boolean b = body.match(matcher, i, seq);
  4901                     boolean b = body.match(matcher, i, seq);
  4906                     // If match failed we must backtrack, so
  4902                     // If match failed we must backtrack, so
  4907                     // the loop count should NOT be incremented
  4903                     // the loop count should NOT be incremented
  4908                     if (!b)
  4904                     if (b)
  4909                         matcher.locals[countIndex] = count;
       
  4910                     else
       
  4911                         return true;
  4905                         return true;
       
  4906                     matcher.locals[countIndex] = count;
       
  4907                     // save the failed position
       
  4908                     if (posIndex != -1) {
       
  4909                         matcher.localsPos[posIndex].add(i);
       
  4910                     }
  4912                 }
  4911                 }
  4913             }
  4912             }
  4914             return next.match(matcher, i, seq);
  4913             return next.match(matcher, i, seq);
  4915         }
  4914         }
  4916         boolean matchInit(Matcher matcher, int i, CharSequence seq) {
  4915         boolean matchInit(Matcher matcher, int i, CharSequence seq) {
  4917             int save = matcher.locals[countIndex];
  4916             int save = matcher.locals[countIndex];
  4918             boolean ret = false;
  4917             boolean ret = false;
       
  4918             if (posIndex != -1 && matcher.localsPos[posIndex] == null) {
       
  4919                 matcher.localsPos[posIndex] = new IntHashSet();
       
  4920             }
  4919             if (0 < cmin) {
  4921             if (0 < cmin) {
  4920                 matcher.locals[countIndex] = 1;
  4922                 matcher.locals[countIndex] = 1;
  4921                 ret = body.match(matcher, i, seq);
  4923                 ret = body.match(matcher, i, seq);
  4922             } else if (0 < cmax) {
  4924             } else if (0 < cmax) {
  4923                 matcher.locals[countIndex] = 1;
  4925                 matcher.locals[countIndex] = 1;
  5360             return !conditionMatched && next.match(matcher, i, seq);
  5362             return !conditionMatched && next.match(matcher, i, seq);
  5361         }
  5363         }
  5362     }
  5364     }
  5363 
  5365 
  5364     /**
  5366     /**
  5365      * Returns the set union of two CharProperty nodes.
       
  5366      */
       
  5367     private static CharProperty union(final CharProperty lhs,
       
  5368                                       final CharProperty rhs) {
       
  5369         return new CharProperty() {
       
  5370                 boolean isSatisfiedBy(int ch) {
       
  5371                     return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
       
  5372     }
       
  5373 
       
  5374     /**
       
  5375      * Returns the set intersection of two CharProperty nodes.
       
  5376      */
       
  5377     private static CharProperty intersection(final CharProperty lhs,
       
  5378                                              final CharProperty rhs) {
       
  5379         return new CharProperty() {
       
  5380                 boolean isSatisfiedBy(int ch) {
       
  5381                     return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
       
  5382     }
       
  5383 
       
  5384     /**
       
  5385      * Returns the set difference of two CharProperty nodes.
       
  5386      */
       
  5387     private static CharProperty setDifference(final CharProperty lhs,
       
  5388                                               final CharProperty rhs) {
       
  5389         return new CharProperty() {
       
  5390                 boolean isSatisfiedBy(int ch) {
       
  5391                     return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
       
  5392     }
       
  5393 
       
  5394     /**
       
  5395      * Handles word boundaries. Includes a field to allow this one class to
  5367      * Handles word boundaries. Includes a field to allow this one class to
  5396      * deal with the different types of word boundaries we can match. The word
  5368      * deal with the different types of word boundaries we can match. The word
  5397      * characters include underscores, letters, and digits. Non spacing marks
  5369      * characters include underscores, letters, and digits. Non spacing marks
  5398      * can are also part of a word if they have a base character, otherwise
  5370      * can are also part of a word if they have a base character, otherwise
  5399      * they are ignored for purposes of finding word boundaries.
  5371      * they are ignored for purposes of finding word boundaries.
  5409             type = n;
  5381             type = n;
  5410             this.useUWORD = useUWORD;
  5382             this.useUWORD = useUWORD;
  5411         }
  5383         }
  5412 
  5384 
  5413         boolean isWord(int ch) {
  5385         boolean isWord(int ch) {
  5414             return useUWORD ? UnicodeProp.WORD.is(ch)
  5386             return useUWORD ? CharPredicates.WORD.is(ch)
  5415                             : (ch == '_' || Character.isLetterOrDigit(ch));
  5387                             : (ch == '_' || Character.isLetterOrDigit(ch));
  5416         }
  5388         }
  5417 
  5389 
  5418         int check(Matcher matcher, int i, CharSequence seq) {
  5390         int check(Matcher matcher, int i, CharSequence seq) {
  5419             int ch;
  5391             int ch;
  5655             matcher.hitEnd = true;
  5627             matcher.hitEnd = true;
  5656             return false;
  5628             return false;
  5657         }
  5629         }
  5658     }
  5630     }
  5659 
  5631 
  5660 ///////////////////////////////////////////////////////////////////////////////
  5632     @FunctionalInterface
  5661 ///////////////////////////////////////////////////////////////////////////////
  5633     static interface CharPredicate {
       
  5634         boolean is(int ch);
       
  5635 
       
  5636         default CharPredicate and(CharPredicate p) {
       
  5637             return ch -> is(ch) && p.is(ch);
       
  5638         }
       
  5639         default CharPredicate union(CharPredicate p) {
       
  5640             return ch -> is(ch) || p.is(ch);
       
  5641         }
       
  5642         default CharPredicate union(CharPredicate p1,
       
  5643                                     CharPredicate p2 ) {
       
  5644             return ch -> is(ch) || p1.is(ch) || p2.is(ch);
       
  5645         }
       
  5646         default CharPredicate negate() {
       
  5647             return ch -> !is(ch);
       
  5648         }
       
  5649     }
       
  5650 
       
  5651     static interface BmpCharPredicate extends CharPredicate {
       
  5652 
       
  5653         default CharPredicate and(CharPredicate p) {
       
  5654             if(p instanceof BmpCharPredicate)
       
  5655                 return (BmpCharPredicate)(ch -> is(ch) && p.is(ch));
       
  5656             return ch -> is(ch) && p.is(ch);
       
  5657         }
       
  5658         default CharPredicate union(CharPredicate p) {
       
  5659             if (p instanceof BmpCharPredicate)
       
  5660                 return (BmpCharPredicate)(ch -> is(ch) || p.is(ch));
       
  5661             return ch -> is(ch) || p.is(ch);
       
  5662         }
       
  5663         static CharPredicate union(CharPredicate... predicates) {
       
  5664             CharPredicate cp = ch -> {
       
  5665                 for (CharPredicate p : predicates) {
       
  5666                     if (!p.is(ch))
       
  5667                         return false;
       
  5668                 }
       
  5669                 return true;
       
  5670             };
       
  5671             for (CharPredicate p : predicates) {
       
  5672                 if (! (p instanceof BmpCharPredicate))
       
  5673                     return cp;
       
  5674             }
       
  5675             return (BmpCharPredicate)cp;
       
  5676         }
       
  5677     }
       
  5678 
       
  5679     /**
       
  5680      * matches a Perl vertical whitespace
       
  5681      */
       
  5682     static BmpCharPredicate VertWS = cp ->
       
  5683         (cp >= 0x0A && cp <= 0x0D) || cp == 0x85 || cp == 0x2028 || cp == 0x2029;
       
  5684 
       
  5685     /**
       
  5686      * matches a Perl horizontal whitespace
       
  5687      */
       
  5688     static BmpCharPredicate HorizWS = cp ->
       
  5689         cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 ||
       
  5690         cp == 0x180e || cp >= 0x2000 && cp <= 0x200a ||  cp == 0x202f ||
       
  5691         cp == 0x205f || cp == 0x3000;
       
  5692 
       
  5693     /**
       
  5694      *  for the Unicode category ALL and the dot metacharacter when
       
  5695      *  in dotall mode.
       
  5696      */
       
  5697     static CharPredicate ALL = ch -> true;
       
  5698 
       
  5699     /**
       
  5700      * for the dot metacharacter when dotall is not enabled.
       
  5701      */
       
  5702     static CharPredicate DOT = ch -> (ch != '\n' && ch != '\r'
       
  5703                                           && (ch|1) != '\u2029'
       
  5704                                           && ch != '\u0085');
       
  5705     /**
       
  5706      *  the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled.
       
  5707      */
       
  5708     static CharPredicate UNIXDOT = ch ->  ch != '\n';
       
  5709 
       
  5710     /**
       
  5711      * Indicate that matches a Supplementary Unicode character
       
  5712      */
       
  5713     static CharPredicate SingleS(int c) {
       
  5714         return ch -> ch == c;
       
  5715     }
       
  5716 
       
  5717     /**
       
  5718      * A bmp/optimized predicate of single
       
  5719      */
       
  5720     static BmpCharPredicate Single(int c) {
       
  5721         return ch -> ch == c;
       
  5722     }
       
  5723 
       
  5724     /**
       
  5725      * Case insensitive matches a given BMP character
       
  5726      */
       
  5727     static BmpCharPredicate SingleI(int lower, int upper) {
       
  5728         return ch -> ch == lower || ch == upper;
       
  5729     }
       
  5730 
       
  5731     /**
       
  5732      * Unicode case insensitive matches a given Unicode character
       
  5733      */
       
  5734     static CharPredicate SingleU(int lower) {
       
  5735         return ch -> lower == ch ||
       
  5736                      lower == Character.toLowerCase(Character.toUpperCase(ch));
       
  5737     }
       
  5738 
       
  5739     private static boolean inRange(int lower, int ch, int upper) {
       
  5740         return lower <= ch && ch <= upper;
       
  5741     }
       
  5742 
       
  5743     /**
       
  5744      * Charactrs within a explicit value range
       
  5745      */
       
  5746     static CharPredicate Range(int lower, int upper) {
       
  5747         if (upper < Character.MIN_HIGH_SURROGATE ||
       
  5748             lower > Character.MAX_HIGH_SURROGATE &&
       
  5749             upper < Character.MIN_SUPPLEMENTARY_CODE_POINT)
       
  5750             return (BmpCharPredicate)(ch -> inRange(lower, ch, upper));
       
  5751         return ch -> inRange(lower, ch, upper);
       
  5752     }
       
  5753 
       
  5754    /**
       
  5755     * Charactrs within a explicit value range in a case insensitive manner.
       
  5756     */
       
  5757     static CharPredicate CIRange(int lower, int upper) {
       
  5758         return ch -> inRange(lower, ch, upper) ||
       
  5759                      ASCII.isAscii(ch) &&
       
  5760                      (inRange(lower, ASCII.toUpper(ch), upper) ||
       
  5761                       inRange(lower, ASCII.toLower(ch), upper));
       
  5762     }
       
  5763 
       
  5764     static CharPredicate CIRangeU(int lower, int upper) {
       
  5765         return ch -> {
       
  5766             if (inRange(lower, ch, upper))
       
  5767                 return true;
       
  5768             int up = Character.toUpperCase(ch);
       
  5769             return inRange(lower, up, upper) ||
       
  5770                    inRange(lower, Character.toLowerCase(up), upper);
       
  5771         };
       
  5772     }
  5662 
  5773 
  5663     /**
  5774     /**
  5664      *  This must be the very first initializer.
  5775      *  This must be the very first initializer.
  5665      */
  5776      */
  5666     static Node accept = new Node();
  5777     static final Node accept = new Node();
  5667 
  5778 
  5668     static Node lastAccept = new LastNode();
  5779     static final Node lastAccept = new LastNode();
  5669 
       
  5670     private static class CharPropertyNames {
       
  5671 
       
  5672         static CharProperty charPropertyFor(String name) {
       
  5673             CharPropertyFactory m = map.get(name);
       
  5674             return m == null ? null : m.make();
       
  5675         }
       
  5676 
       
  5677         private abstract static class CharPropertyFactory {
       
  5678             abstract CharProperty make();
       
  5679         }
       
  5680 
       
  5681         private static void defCategory(String name,
       
  5682                                         final int typeMask) {
       
  5683             map.put(name, new CharPropertyFactory() {
       
  5684                     CharProperty make() { return new Category(typeMask);}});
       
  5685         }
       
  5686 
       
  5687         private static void defRange(String name,
       
  5688                                      final int lower, final int upper) {
       
  5689             map.put(name, new CharPropertyFactory() {
       
  5690                     CharProperty make() { return rangeFor(lower, upper);}});
       
  5691         }
       
  5692 
       
  5693         private static void defCtype(String name,
       
  5694                                      final int ctype) {
       
  5695             map.put(name, new CharPropertyFactory() {
       
  5696                     CharProperty make() { return new Ctype(ctype);}});
       
  5697         }
       
  5698 
       
  5699         private abstract static class CloneableProperty
       
  5700             extends CharProperty implements Cloneable
       
  5701         {
       
  5702             public CloneableProperty clone() {
       
  5703                 try {
       
  5704                     return (CloneableProperty) super.clone();
       
  5705                 } catch (CloneNotSupportedException e) {
       
  5706                     throw new AssertionError(e);
       
  5707                 }
       
  5708             }
       
  5709         }
       
  5710 
       
  5711         private static void defClone(String name,
       
  5712                                      final CloneableProperty p) {
       
  5713             map.put(name, new CharPropertyFactory() {
       
  5714                     CharProperty make() { return p.clone();}});
       
  5715         }
       
  5716 
       
  5717         private static final HashMap<String, CharPropertyFactory> map
       
  5718             = new HashMap<>();
       
  5719 
       
  5720         static {
       
  5721             // Unicode character property aliases, defined in
       
  5722             // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
       
  5723             defCategory("Cn", 1<<Character.UNASSIGNED);
       
  5724             defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
       
  5725             defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
       
  5726             defCategory("Lt", 1<<Character.TITLECASE_LETTER);
       
  5727             defCategory("Lm", 1<<Character.MODIFIER_LETTER);
       
  5728             defCategory("Lo", 1<<Character.OTHER_LETTER);
       
  5729             defCategory("Mn", 1<<Character.NON_SPACING_MARK);
       
  5730             defCategory("Me", 1<<Character.ENCLOSING_MARK);
       
  5731             defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
       
  5732             defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
       
  5733             defCategory("Nl", 1<<Character.LETTER_NUMBER);
       
  5734             defCategory("No", 1<<Character.OTHER_NUMBER);
       
  5735             defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
       
  5736             defCategory("Zl", 1<<Character.LINE_SEPARATOR);
       
  5737             defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
       
  5738             defCategory("Cc", 1<<Character.CONTROL);
       
  5739             defCategory("Cf", 1<<Character.FORMAT);
       
  5740             defCategory("Co", 1<<Character.PRIVATE_USE);
       
  5741             defCategory("Cs", 1<<Character.SURROGATE);
       
  5742             defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
       
  5743             defCategory("Ps", 1<<Character.START_PUNCTUATION);
       
  5744             defCategory("Pe", 1<<Character.END_PUNCTUATION);
       
  5745             defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
       
  5746             defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
       
  5747             defCategory("Sm", 1<<Character.MATH_SYMBOL);
       
  5748             defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
       
  5749             defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
       
  5750             defCategory("So", 1<<Character.OTHER_SYMBOL);
       
  5751             defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
       
  5752             defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
       
  5753             defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
       
  5754                               (1<<Character.LOWERCASE_LETTER) |
       
  5755                               (1<<Character.TITLECASE_LETTER) |
       
  5756                               (1<<Character.MODIFIER_LETTER)  |
       
  5757                               (1<<Character.OTHER_LETTER)));
       
  5758             defCategory("M", ((1<<Character.NON_SPACING_MARK) |
       
  5759                               (1<<Character.ENCLOSING_MARK)   |
       
  5760                               (1<<Character.COMBINING_SPACING_MARK)));
       
  5761             defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
       
  5762                               (1<<Character.LETTER_NUMBER)        |
       
  5763                               (1<<Character.OTHER_NUMBER)));
       
  5764             defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
       
  5765                               (1<<Character.LINE_SEPARATOR)  |
       
  5766                               (1<<Character.PARAGRAPH_SEPARATOR)));
       
  5767             defCategory("C", ((1<<Character.CONTROL)     |
       
  5768                               (1<<Character.FORMAT)      |
       
  5769                               (1<<Character.PRIVATE_USE) |
       
  5770                               (1<<Character.SURROGATE))); // Other
       
  5771             defCategory("P", ((1<<Character.DASH_PUNCTUATION)      |
       
  5772                               (1<<Character.START_PUNCTUATION)     |
       
  5773                               (1<<Character.END_PUNCTUATION)       |
       
  5774                               (1<<Character.CONNECTOR_PUNCTUATION) |
       
  5775                               (1<<Character.OTHER_PUNCTUATION)     |
       
  5776                               (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
       
  5777                               (1<<Character.FINAL_QUOTE_PUNCTUATION)));
       
  5778             defCategory("S", ((1<<Character.MATH_SYMBOL)     |
       
  5779                               (1<<Character.CURRENCY_SYMBOL) |
       
  5780                               (1<<Character.MODIFIER_SYMBOL) |
       
  5781                               (1<<Character.OTHER_SYMBOL)));
       
  5782             defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
       
  5783                                (1<<Character.LOWERCASE_LETTER) |
       
  5784                                (1<<Character.TITLECASE_LETTER)));
       
  5785             defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
       
  5786                                (1<<Character.LOWERCASE_LETTER) |
       
  5787                                (1<<Character.TITLECASE_LETTER) |
       
  5788                                (1<<Character.MODIFIER_LETTER)  |
       
  5789                                (1<<Character.OTHER_LETTER)     |
       
  5790                                (1<<Character.DECIMAL_DIGIT_NUMBER)));
       
  5791             defRange("L1", 0x00, 0xFF); // Latin-1
       
  5792             map.put("all", new CharPropertyFactory() {
       
  5793                     CharProperty make() { return new All(); }});
       
  5794 
       
  5795             // Posix regular expression character classes, defined in
       
  5796             // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
       
  5797             defRange("ASCII", 0x00, 0x7F);   // ASCII
       
  5798             defCtype("Alnum", ASCII.ALNUM);  // Alphanumeric characters
       
  5799             defCtype("Alpha", ASCII.ALPHA);  // Alphabetic characters
       
  5800             defCtype("Blank", ASCII.BLANK);  // Space and tab characters
       
  5801             defCtype("Cntrl", ASCII.CNTRL);  // Control characters
       
  5802             defRange("Digit", '0', '9');     // Numeric characters
       
  5803             defCtype("Graph", ASCII.GRAPH);  // printable and visible
       
  5804             defRange("Lower", 'a', 'z');     // Lower-case alphabetic
       
  5805             defRange("Print", 0x20, 0x7E);   // Printable characters
       
  5806             defCtype("Punct", ASCII.PUNCT);  // Punctuation characters
       
  5807             defCtype("Space", ASCII.SPACE);  // Space characters
       
  5808             defRange("Upper", 'A', 'Z');     // Upper-case alphabetic
       
  5809             defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
       
  5810 
       
  5811             // Java character properties, defined by methods in Character.java
       
  5812             defClone("javaLowerCase", new CloneableProperty() {
       
  5813                 boolean isSatisfiedBy(int ch) {
       
  5814                     return Character.isLowerCase(ch);}});
       
  5815             defClone("javaUpperCase", new CloneableProperty() {
       
  5816                 boolean isSatisfiedBy(int ch) {
       
  5817                     return Character.isUpperCase(ch);}});
       
  5818             defClone("javaAlphabetic", new CloneableProperty() {
       
  5819                 boolean isSatisfiedBy(int ch) {
       
  5820                     return Character.isAlphabetic(ch);}});
       
  5821             defClone("javaIdeographic", new CloneableProperty() {
       
  5822                 boolean isSatisfiedBy(int ch) {
       
  5823                     return Character.isIdeographic(ch);}});
       
  5824             defClone("javaTitleCase", new CloneableProperty() {
       
  5825                 boolean isSatisfiedBy(int ch) {
       
  5826                     return Character.isTitleCase(ch);}});
       
  5827             defClone("javaDigit", new CloneableProperty() {
       
  5828                 boolean isSatisfiedBy(int ch) {
       
  5829                     return Character.isDigit(ch);}});
       
  5830             defClone("javaDefined", new CloneableProperty() {
       
  5831                 boolean isSatisfiedBy(int ch) {
       
  5832                     return Character.isDefined(ch);}});
       
  5833             defClone("javaLetter", new CloneableProperty() {
       
  5834                 boolean isSatisfiedBy(int ch) {
       
  5835                     return Character.isLetter(ch);}});
       
  5836             defClone("javaLetterOrDigit", new CloneableProperty() {
       
  5837                 boolean isSatisfiedBy(int ch) {
       
  5838                     return Character.isLetterOrDigit(ch);}});
       
  5839             defClone("javaJavaIdentifierStart", new CloneableProperty() {
       
  5840                 boolean isSatisfiedBy(int ch) {
       
  5841                     return Character.isJavaIdentifierStart(ch);}});
       
  5842             defClone("javaJavaIdentifierPart", new CloneableProperty() {
       
  5843                 boolean isSatisfiedBy(int ch) {
       
  5844                     return Character.isJavaIdentifierPart(ch);}});
       
  5845             defClone("javaUnicodeIdentifierStart", new CloneableProperty() {
       
  5846                 boolean isSatisfiedBy(int ch) {
       
  5847                     return Character.isUnicodeIdentifierStart(ch);}});
       
  5848             defClone("javaUnicodeIdentifierPart", new CloneableProperty() {
       
  5849                 boolean isSatisfiedBy(int ch) {
       
  5850                     return Character.isUnicodeIdentifierPart(ch);}});
       
  5851             defClone("javaIdentifierIgnorable", new CloneableProperty() {
       
  5852                 boolean isSatisfiedBy(int ch) {
       
  5853                     return Character.isIdentifierIgnorable(ch);}});
       
  5854             defClone("javaSpaceChar", new CloneableProperty() {
       
  5855                 boolean isSatisfiedBy(int ch) {
       
  5856                     return Character.isSpaceChar(ch);}});
       
  5857             defClone("javaWhitespace", new CloneableProperty() {
       
  5858                 boolean isSatisfiedBy(int ch) {
       
  5859                     return Character.isWhitespace(ch);}});
       
  5860             defClone("javaISOControl", new CloneableProperty() {
       
  5861                 boolean isSatisfiedBy(int ch) {
       
  5862                     return Character.isISOControl(ch);}});
       
  5863             defClone("javaMirrored", new CloneableProperty() {
       
  5864                 boolean isSatisfiedBy(int ch) {
       
  5865                     return Character.isMirrored(ch);}});
       
  5866         }
       
  5867     }
       
  5868 
  5780 
  5869     /**
  5781     /**
  5870      * Creates a predicate which can be used to match a string.
  5782      * Creates a predicate which can be used to match a string.
  5871      *
  5783      *
  5872      * @return  The predicate which can be used for matching on a string
  5784      * @return  The predicate which can be used for matching on a string