changeset 37882 | e7f3cf12e739 |
parent 37880 | 60ec48925dc6 |
child 38777 | 826eb7091523 |
37881:fda31effa7b3 | 37882:e7f3cf12e739 |
---|---|
1 /* |
1 /* |
2 * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved. |
2 * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved. |
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 * |
4 * |
5 * This code is free software; you can redistribute it and/or modify it |
5 * This code is free software; you can redistribute it and/or modify it |
6 * under the terms of the GNU General Public License version 2 only, as |
6 * under the terms of the GNU General Public License version 2 only, as |
7 * published by the Free Software Foundation. Oracle designates this |
7 * published by the Free Software Foundation. Oracle designates this |
24 */ |
24 */ |
25 |
25 |
26 package java.util.regex; |
26 package java.util.regex; |
27 |
27 |
28 import java.text.Normalizer; |
28 import java.text.Normalizer; |
29 import java.text.Normalizer.Form; |
|
29 import java.util.Locale; |
30 import java.util.Locale; |
30 import java.util.Iterator; |
31 import java.util.Iterator; |
31 import java.util.Map; |
32 import java.util.Map; |
32 import java.util.ArrayList; |
33 import java.util.ArrayList; |
33 import java.util.HashMap; |
34 import java.util.HashMap; |
35 import java.util.LinkedHashSet; |
|
36 import java.util.List; |
|
37 import java.util.Set; |
|
34 import java.util.Arrays; |
38 import java.util.Arrays; |
35 import java.util.NoSuchElementException; |
39 import java.util.NoSuchElementException; |
36 import java.util.Spliterator; |
40 import java.util.Spliterator; |
37 import java.util.Spliterators; |
41 import java.util.Spliterators; |
38 import java.util.function.Predicate; |
42 import java.util.function.Predicate; |
982 * Temporary storage used by parsing pattern slice. |
986 * Temporary storage used by parsing pattern slice. |
983 */ |
987 */ |
984 transient int[] buffer; |
988 transient int[] buffer; |
985 |
989 |
986 /** |
990 /** |
991 * A temporary storage used for predicate for double return. |
|
992 */ |
|
993 transient CharPredicate predicate; |
|
994 |
|
995 /** |
|
987 * Map the "name" of the "named capturing group" to its group id |
996 * Map the "name" of the "named capturing group" to its group id |
988 * node. |
997 * node. |
989 */ |
998 */ |
990 transient volatile Map<String, Integer> namedGroups; |
999 transient volatile Map<String, Integer> namedGroups; |
991 |
1000 |
992 /** |
1001 /** |
993 * Temporary storage used while parsing group references. |
1002 * Temporary storage used while parsing group references. |
994 */ |
1003 */ |
995 transient GroupHead[] groupNodes; |
1004 transient GroupHead[] groupNodes; |
1005 |
|
1006 /** |
|
1007 * Temporary storage used to store the top level closure nodes. |
|
1008 */ |
|
1009 transient List<Node> topClosureNodes; |
|
1010 |
|
1011 /** |
|
1012 * The number of top greedy closure nodes in this Pattern. Used by |
|
1013 * matchers to allocate storage needed for a IntHashSet to keep the |
|
1014 * beginning pos {@code i} of all failed match. |
|
1015 */ |
|
1016 transient int localTCNCount; |
|
1017 |
|
1018 /* |
|
1019 * Turn off the stop-exponential-backtracking optimization if there |
|
1020 * is a group ref in the pattern. |
|
1021 */ |
|
1022 transient boolean hasGroupRef; |
|
996 |
1023 |
997 /** |
1024 /** |
998 * Temporary null terminated code point array used by pattern compiling. |
1025 * Temporary null terminated code point array used by pattern compiling. |
999 */ |
1026 */ |
1000 private transient int[] temp; |
1027 private transient int[] temp; |
1024 |
1051 |
1025 /** |
1052 /** |
1026 * If the Start node might possibly match supplementary characters. |
1053 * If the Start node might possibly match supplementary characters. |
1027 * It is set to true during compiling if |
1054 * It is set to true during compiling if |
1028 * (1) There is supplementary char in pattern, or |
1055 * (1) There is supplementary char in pattern, or |
1029 * (2) There is complement node of Category or Block |
1056 * (2) There is complement node of a "family" CharProperty |
1030 */ |
1057 */ |
1031 private transient boolean hasSupplementary; |
1058 private transient boolean hasSupplementary; |
1032 |
1059 |
1033 /** |
1060 /** |
1034 * Compiles the given regular expression into a pattern. |
1061 * Compiles the given regular expression into a pattern. |
1336 s.defaultReadObject(); |
1363 s.defaultReadObject(); |
1337 |
1364 |
1338 // Initialize counts |
1365 // Initialize counts |
1339 capturingGroupCount = 1; |
1366 capturingGroupCount = 1; |
1340 localCount = 0; |
1367 localCount = 0; |
1368 localTCNCount = 0; |
|
1341 |
1369 |
1342 // if length > 0, the Pattern is lazily compiled |
1370 // if length > 0, the Pattern is lazily compiled |
1343 if (pattern.length() == 0) { |
1371 if (pattern.length() == 0) { |
1344 root = new Start(lastAccept); |
1372 root = new Start(lastAccept); |
1345 matchRoot = lastAccept; |
1373 matchRoot = lastAccept; |
1366 flags |= UNICODE_CASE; |
1394 flags |= UNICODE_CASE; |
1367 |
1395 |
1368 // Reset group index count |
1396 // Reset group index count |
1369 capturingGroupCount = 1; |
1397 capturingGroupCount = 1; |
1370 localCount = 0; |
1398 localCount = 0; |
1399 localTCNCount = 0; |
|
1371 |
1400 |
1372 if (pattern.length() > 0) { |
1401 if (pattern.length() > 0) { |
1373 compile(); |
1402 compile(); |
1374 } else { |
1403 } else { |
1375 root = new Start(lastAccept); |
1404 root = new Start(lastAccept); |
1376 matchRoot = lastAccept; |
1405 matchRoot = lastAccept; |
1377 } |
1406 } |
1378 } |
1407 } |
1379 |
1408 |
1380 /** |
1409 /** |
1381 * The pattern is converted to normalized form ({@linkplain |
1410 * The pattern is converted to normalized form ({@link |
1382 * java.text.Normalizer.Form.NFD NFD}, canonical decomposition) |
1411 * java.text.Normalizer.Form.NFC NFC}, canonical decomposition, |
1383 * and then a pure group is constructed to match canonical |
1412 * followed by canonical composition for the character class |
1384 * equivalences of the characters. |
1413 * part, and {@link java.text.Normalizer.Form.NFD NFD}, |
1385 */ |
1414 * canonical decomposition) for the rest), and then a pure |
1386 private void normalize() { |
1415 * group is constructed to match canonical equivalences of the |
1387 int lastCodePoint = -1; |
1416 * characters. |
1388 |
1417 */ |
1389 // Convert pattern into normalized form |
1418 private static String normalize(String pattern) { |
1390 normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD); |
1419 int plen = pattern.length(); |
1391 patternLength = normalizedPattern.length(); |
1420 StringBuilder pbuf = new StringBuilder(plen); |
1392 |
1421 char last = 0; |
1393 // Modify pattern to match canonical equivalences |
1422 int lastStart = 0; |
1394 StringBuilder newPattern = new StringBuilder(patternLength); |
1423 char cc = 0; |
1395 for(int i=0; i<patternLength; ) { |
1424 for (int i = 0; i < plen;) { |
1396 int c = normalizedPattern.codePointAt(i); |
1425 char c = pattern.charAt(i); |
1397 StringBuilder sequenceBuffer; |
1426 if (cc == 0 && // top level |
1398 if ((Character.getType(c) == Character.NON_SPACING_MARK) |
1427 c == '\\' && i + 1 < plen && pattern.charAt(i + 1) == '\\') { |
1399 && (lastCodePoint != -1)) { |
1428 i += 2; last = 0; |
1400 sequenceBuffer = new StringBuilder(); |
1429 continue; |
1401 sequenceBuffer.appendCodePoint(lastCodePoint); |
1430 } |
1402 sequenceBuffer.appendCodePoint(c); |
1431 if (c == '[' && last != '\\') { |
1403 while(Character.getType(c) == Character.NON_SPACING_MARK) { |
1432 if (cc == 0) { |
1404 i += Character.charCount(c); |
1433 if (lastStart < i) |
1405 if (i >= patternLength) |
1434 normalizeSlice(pattern, lastStart, i, pbuf); |
1406 break; |
1435 lastStart = i; |
1407 c = normalizedPattern.codePointAt(i); |
1436 } |
1408 sequenceBuffer.appendCodePoint(c); |
1437 cc++; |
1409 } |
1438 } else if (c == ']' && last != '\\') { |
1410 String ea = produceEquivalentAlternation( |
1439 cc--; |
1411 sequenceBuffer.toString()); |
1440 if (cc == 0) { |
1412 newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint)); |
1441 normalizeClazz(pattern, lastStart, i + 1, pbuf); |
1413 newPattern.append("(?:").append(ea).append(")"); |
1442 lastStart = i + 1; |
1414 } else if (c == '[' && lastCodePoint != '\\') { |
1443 } |
1415 i = normalizeCharClass(newPattern, i); |
1444 } |
1416 } else { |
1445 last = c; |
1417 newPattern.appendCodePoint(c); |
1446 i++; |
1418 } |
1447 } |
1419 lastCodePoint = c; |
1448 assert (cc == 0); |
1420 i += Character.charCount(c); |
1449 if (lastStart < plen) |
1421 } |
1450 normalizeSlice(pattern, lastStart, plen, pbuf); |
1422 normalizedPattern = newPattern.toString(); |
1451 return pbuf.toString(); |
1423 } |
1452 } |
1424 |
1453 |
1425 /** |
1454 private static void normalizeSlice(String src, int off, int limit, |
1426 * Complete the character class being parsed and add a set |
1455 StringBuilder dst) |
1427 * of alternations to it that will match the canonical equivalences |
1456 { |
1428 * of the characters within the class. |
1457 int len = src.length(); |
1429 */ |
1458 int off0 = off; |
1430 private int normalizeCharClass(StringBuilder newPattern, int i) { |
1459 while (off < limit && ASCII.isAscii(src.charAt(off))) { |
1431 StringBuilder charClass = new StringBuilder(); |
1460 off++; |
1432 StringBuilder eq = null; |
1461 } |
1433 int lastCodePoint = -1; |
1462 if (off == limit) { |
1434 String result; |
1463 dst.append(src, off0, limit); |
1435 |
1464 return; |
1436 i++; |
1465 } |
1437 charClass.append("["); |
1466 off--; |
1438 while(true) { |
1467 if (off < off0) |
1439 int c = normalizedPattern.codePointAt(i); |
1468 off = off0; |
1440 StringBuilder sequenceBuffer; |
1469 else |
1441 |
1470 dst.append(src, off0, off); |
1442 if (c == ']' && lastCodePoint != '\\') { |
1471 while (off < limit) { |
1443 charClass.append((char)c); |
1472 int ch0 = src.codePointAt(off); |
1444 break; |
1473 if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) { |
1445 } else if (Character.getType(c) == Character.NON_SPACING_MARK) { |
1474 dst.append((char)ch0); |
1446 sequenceBuffer = new StringBuilder(); |
1475 off++; |
1447 sequenceBuffer.appendCodePoint(lastCodePoint); |
1476 continue; |
1448 while(Character.getType(c) == Character.NON_SPACING_MARK) { |
1477 } |
1449 sequenceBuffer.appendCodePoint(c); |
1478 int j = off + Character.charCount(ch0); |
1450 i += Character.charCount(c); |
1479 int ch1; |
1451 if (i >= normalizedPattern.length()) |
1480 while (j < limit) { |
1452 break; |
1481 ch1 = src.codePointAt(j); |
1453 c = normalizedPattern.codePointAt(i); |
1482 if (Grapheme.isBoundary(ch0, ch1)) |
1454 } |
1483 break; |
1455 String ea = produceEquivalentAlternation( |
1484 ch0 = ch1; |
1456 sequenceBuffer.toString()); |
1485 j += Character.charCount(ch1); |
1457 |
1486 } |
1458 charClass.setLength(charClass.length()-Character.charCount(lastCodePoint)); |
1487 String seq = src.substring(off, j); |
1459 if (eq == null) |
1488 String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD); |
1460 eq = new StringBuilder(); |
1489 off = j; |
1461 eq.append('|'); |
1490 if (nfd.length() > 1) { |
1462 eq.append(ea); |
1491 ch0 = nfd.codePointAt(0); |
1463 } else { |
1492 ch1 = nfd.codePointAt(Character.charCount(ch0)); |
1464 charClass.appendCodePoint(c); |
1493 if (Character.getType(ch1) == Character.NON_SPACING_MARK) { |
1465 i++; |
1494 Set<String> altns = new LinkedHashSet<>(); |
1466 } |
1495 altns.add(seq); |
1467 if (i == normalizedPattern.length()) |
1496 produceEquivalentAlternation(nfd, altns); |
1468 throw error("Unclosed character class"); |
1497 dst.append("(?:"); |
1469 lastCodePoint = c; |
1498 altns.forEach( s -> dst.append(s + "|")); |
1470 } |
1499 dst.delete(dst.length() - 1, dst.length()); |
1471 |
1500 dst.append(")"); |
1472 if (eq != null) { |
1501 continue; |
1473 result = "(?:"+charClass.toString()+eq.toString()+")"; |
1502 } |
1474 } else { |
1503 } |
1475 result = charClass.toString(); |
1504 String nfc = Normalizer.normalize(seq, Normalizer.Form.NFC); |
1476 } |
1505 if (!seq.equals(nfc) && !nfd.equals(nfc)) |
1477 |
1506 dst.append("(?:" + seq + "|" + nfd + "|" + nfc + ")"); |
1478 newPattern.append(result); |
1507 else if (!seq.equals(nfd)) |
1479 return i; |
1508 dst.append("(?:" + seq + "|" + nfd + ")"); |
1509 else |
|
1510 dst.append(seq); |
|
1511 } |
|
1512 } |
|
1513 |
|
1514 private static void normalizeClazz(String src, int off, int limit, |
|
1515 StringBuilder dst) |
|
1516 { |
|
1517 dst.append(Normalizer.normalize(src.substring(off, limit), Form.NFC)); |
|
1480 } |
1518 } |
1481 |
1519 |
1482 /** |
1520 /** |
1483 * Given a specific sequence composed of a regular character and |
1521 * Given a specific sequence composed of a regular character and |
1484 * combining marks that follow it, produce the alternation that will |
1522 * combining marks that follow it, produce the alternation that will |
1485 * match all canonical equivalences of that sequence. |
1523 * match all canonical equivalences of that sequence. |
1486 */ |
1524 */ |
1487 private String produceEquivalentAlternation(String source) { |
1525 private static void produceEquivalentAlternation(String src, |
1488 int len = countChars(source, 0, 1); |
1526 Set<String> dst) |
1489 if (source.length() == len) |
1527 { |
1490 // source has one character. |
1528 int len = countChars(src, 0, 1); |
1491 return source; |
1529 if (src.length() == len) { |
1492 |
1530 dst.add(src); // source has one character. |
1493 String base = source.substring(0,len); |
1531 return; |
1494 String combiningMarks = source.substring(len); |
1532 } |
1495 |
1533 String base = src.substring(0,len); |
1534 String combiningMarks = src.substring(len); |
|
1496 String[] perms = producePermutations(combiningMarks); |
1535 String[] perms = producePermutations(combiningMarks); |
1497 StringBuilder result = new StringBuilder(source); |
|
1498 |
|
1499 // Add combined permutations |
1536 // Add combined permutations |
1500 for(int x=0; x<perms.length; x++) { |
1537 for(int x = 0; x < perms.length; x++) { |
1501 String next = base + perms[x]; |
1538 String next = base + perms[x]; |
1502 if (x>0) |
1539 dst.add(next); |
1503 result.append("|"+next); |
|
1504 next = composeOneStep(next); |
1540 next = composeOneStep(next); |
1505 if (next != null) |
1541 if (next != null) { |
1506 result.append("|"+produceEquivalentAlternation(next)); |
1542 produceEquivalentAlternation(next, dst); |
1507 } |
1543 } |
1508 return result.toString(); |
1544 } |
1509 } |
1545 } |
1510 |
1546 |
1511 /** |
1547 /** |
1512 * Returns an array of strings that have all the possible |
1548 * Returns an array of strings that have all the possible |
1513 * permutations of the characters in the input string. |
1549 * permutations of the characters in the input string. |
1515 * of a set of combining marks. Note that some of the permutations |
1551 * of a set of combining marks. Note that some of the permutations |
1516 * are invalid because of combining class collisions, and these |
1552 * are invalid because of combining class collisions, and these |
1517 * possibilities must be removed because they are not canonically |
1553 * possibilities must be removed because they are not canonically |
1518 * equivalent. |
1554 * equivalent. |
1519 */ |
1555 */ |
1520 private String[] producePermutations(String input) { |
1556 private static String[] producePermutations(String input) { |
1521 if (input.length() == countChars(input, 0, 1)) |
1557 if (input.length() == countChars(input, 0, 1)) |
1522 return new String[] {input}; |
1558 return new String[] {input}; |
1523 |
1559 |
1524 if (input.length() == countChars(input, 0, 2)) { |
1560 if (input.length() == countChars(input, 0, 2)) { |
1525 int c0 = Character.codePointAt(input, 0); |
1561 int c0 = Character.codePointAt(input, 0); |
1573 String[] result = new String[index]; |
1609 String[] result = new String[index]; |
1574 System.arraycopy(temp, 0, result, 0, index); |
1610 System.arraycopy(temp, 0, result, 0, index); |
1575 return result; |
1611 return result; |
1576 } |
1612 } |
1577 |
1613 |
1578 private int getClass(int c) { |
1614 private static int getClass(int c) { |
1579 return sun.text.Normalizer.getCombiningClass(c); |
1615 return sun.text.Normalizer.getCombiningClass(c); |
1580 } |
1616 } |
1581 |
1617 |
1582 /** |
1618 /** |
1583 * Attempts to compose input by combining the first character |
1619 * Attempts to compose input by combining the first character |
1584 * with the first combining mark following it. Returns a String |
1620 * with the first combining mark following it. Returns a String |
1585 * that is the composition of the leading character with its first |
1621 * that is the composition of the leading character with its first |
1586 * combining mark followed by the remaining combining marks. Returns |
1622 * combining mark followed by the remaining combining marks. Returns |
1587 * null if the first two characters cannot be further composed. |
1623 * null if the first two characters cannot be further composed. |
1588 */ |
1624 */ |
1589 private String composeOneStep(String input) { |
1625 private static String composeOneStep(String input) { |
1590 int len = countChars(input, 0, 2); |
1626 int len = countChars(input, 0, 2); |
1591 String firstTwoCharacters = input.substring(0, len); |
1627 String firstTwoCharacters = input.substring(0, len); |
1592 String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC); |
1628 String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC); |
1593 |
|
1594 if (result.equals(firstTwoCharacters)) |
1629 if (result.equals(firstTwoCharacters)) |
1595 return null; |
1630 return null; |
1596 else { |
1631 else { |
1597 String remainder = input.substring(len); |
1632 String remainder = input.substring(len); |
1598 return result + remainder; |
1633 return result + remainder; |
1675 * of the expression which will create the object tree. |
1710 * of the expression which will create the object tree. |
1676 */ |
1711 */ |
1677 private void compile() { |
1712 private void compile() { |
1678 // Handle canonical equivalences |
1713 // Handle canonical equivalences |
1679 if (has(CANON_EQ) && !has(LITERAL)) { |
1714 if (has(CANON_EQ) && !has(LITERAL)) { |
1680 normalize(); |
1715 normalizedPattern = normalize(pattern); |
1681 } else { |
1716 } else { |
1682 normalizedPattern = pattern; |
1717 normalizedPattern = pattern; |
1683 } |
1718 } |
1684 patternLength = normalizedPattern.length(); |
1719 patternLength = normalizedPattern.length(); |
1685 |
1720 |
1705 |
1740 |
1706 // Allocate all temporary objects here. |
1741 // Allocate all temporary objects here. |
1707 buffer = new int[32]; |
1742 buffer = new int[32]; |
1708 groupNodes = new GroupHead[10]; |
1743 groupNodes = new GroupHead[10]; |
1709 namedGroups = null; |
1744 namedGroups = null; |
1745 topClosureNodes = new ArrayList<>(10); |
|
1710 |
1746 |
1711 if (has(LITERAL)) { |
1747 if (has(LITERAL)) { |
1712 // Literal pattern handling |
1748 // Literal pattern handling |
1713 matchRoot = newSlice(temp, patternLength, hasSupplementary); |
1749 matchRoot = newSlice(temp, patternLength, hasSupplementary); |
1714 matchRoot.next = lastAccept; |
1750 matchRoot.next = lastAccept; |
1735 root = matchRoot; |
1771 root = matchRoot; |
1736 } else { |
1772 } else { |
1737 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); |
1773 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); |
1738 } |
1774 } |
1739 |
1775 |
1776 // Optimize the greedy Loop to prevent exponential backtracking, IF there |
|
1777 // is no group ref in this pattern. With a non-negative localTCNCount value, |
|
1778 // the greedy type Loop, Curly will skip the backtracking for any starting |
|
1779 // position "i" that failed in the past. |
|
1780 if (!hasGroupRef) { |
|
1781 for (Node node : topClosureNodes) { |
|
1782 if (node instanceof Loop) { |
|
1783 // non-deterministic-greedy-group |
|
1784 ((Loop)node).posIndex = localTCNCount++; |
|
1785 } |
|
1786 } |
|
1787 } |
|
1788 |
|
1740 // Release temporary storage |
1789 // Release temporary storage |
1741 temp = null; |
1790 temp = null; |
1742 buffer = null; |
1791 buffer = null; |
1743 groupNodes = null; |
1792 groupNodes = null; |
1744 patternLength = 0; |
1793 patternLength = 0; |
1745 compiled = true; |
1794 compiled = true; |
1795 topClosureNodes = null; |
|
1746 } |
1796 } |
1747 |
1797 |
1748 Map<String, Integer> namedGroups() { |
1798 Map<String, Integer> namedGroups() { |
1749 Map<String, Integer> groups = namedGroups; |
1799 Map<String, Integer> groups = namedGroups; |
1750 if (groups == null) { |
1800 if (groups == null) { |
1751 namedGroups = groups = new HashMap<>(2); |
1801 namedGroups = groups = new HashMap<>(2); |
1752 } |
1802 } |
1753 return groups; |
1803 return groups; |
1754 } |
|
1755 |
|
1756 /** |
|
1757 * Used to print out a subtree of the Pattern to help with debugging. |
|
1758 */ |
|
1759 private static void printObjectTree(Node node) { |
|
1760 while(node != null) { |
|
1761 if (node instanceof Prolog) { |
|
1762 System.out.println(node); |
|
1763 printObjectTree(((Prolog)node).loop); |
|
1764 System.out.println("**** end contents prolog loop"); |
|
1765 } else if (node instanceof Loop) { |
|
1766 System.out.println(node); |
|
1767 printObjectTree(((Loop)node).body); |
|
1768 System.out.println("**** end contents Loop body"); |
|
1769 } else if (node instanceof Curly) { |
|
1770 System.out.println(node); |
|
1771 printObjectTree(((Curly)node).atom); |
|
1772 System.out.println("**** end contents Curly body"); |
|
1773 } else if (node instanceof GroupCurly) { |
|
1774 System.out.println(node); |
|
1775 printObjectTree(((GroupCurly)node).atom); |
|
1776 System.out.println("**** end contents GroupCurly body"); |
|
1777 } else if (node instanceof GroupTail) { |
|
1778 System.out.println(node); |
|
1779 System.out.println("Tail next is "+node.next); |
|
1780 return; |
|
1781 } else { |
|
1782 System.out.println(node); |
|
1783 } |
|
1784 node = node.next; |
|
1785 if (node != null) |
|
1786 System.out.println("->next:"); |
|
1787 if (node == Pattern.accept) { |
|
1788 System.out.println("Accept Node"); |
|
1789 node = null; |
|
1790 } |
|
1791 } |
|
1792 } |
1804 } |
1793 |
1805 |
1794 /** |
1806 /** |
1795 * Used to accumulate information about a subtree of the object graph |
1807 * Used to accumulate information about a subtree of the object graph |
1796 * so that optimizations can be applied to the subtree. |
1808 * so that optimizations can be applied to the subtree. |
2081 tail.next = node; |
2093 tail.next = node; |
2082 // Double return: Tail was returned in root |
2094 // Double return: Tail was returned in root |
2083 tail = root; |
2095 tail = root; |
2084 continue; |
2096 continue; |
2085 case '[': |
2097 case '[': |
2086 node = clazz(true); |
2098 if (has(CANON_EQ) && !has(LITERAL)) |
2099 node = new NFCCharProperty(clazz(true)); |
|
2100 else |
|
2101 node = newCharProperty(clazz(true)); |
|
2087 break; |
2102 break; |
2088 case '\\': |
2103 case '\\': |
2089 ch = nextEscaped(); |
2104 ch = nextEscaped(); |
2090 if (ch == 'p' || ch == 'P') { |
2105 if (ch == 'p' || ch == 'P') { |
2091 boolean oneLetter = true; |
2106 boolean oneLetter = true; |
2094 if (ch != '{') { |
2109 if (ch != '{') { |
2095 unread(); |
2110 unread(); |
2096 } else { |
2111 } else { |
2097 oneLetter = false; |
2112 oneLetter = false; |
2098 } |
2113 } |
2099 node = family(oneLetter, comp); |
2114 // node = newCharProperty(family(oneLetter, comp)); |
2115 if (has(CANON_EQ) && !has(LITERAL)) |
|
2116 node = new NFCCharProperty(family(oneLetter, comp)); |
|
2117 else |
|
2118 node = newCharProperty(family(oneLetter, comp)); |
|
2100 } else { |
2119 } else { |
2101 unread(); |
2120 unread(); |
2102 node = atom(); |
2121 node = atom(); |
2103 } |
2122 } |
2104 break; |
2123 break; |
2121 node = new Dollar(has(MULTILINE)); |
2140 node = new Dollar(has(MULTILINE)); |
2122 break; |
2141 break; |
2123 case '.': |
2142 case '.': |
2124 next(); |
2143 next(); |
2125 if (has(DOTALL)) { |
2144 if (has(DOTALL)) { |
2126 node = new All(); |
2145 node = new CharProperty(ALL); |
2127 } else { |
2146 } else { |
2128 if (has(UNIX_LINES)) |
2147 if (has(UNIX_LINES)) { |
2129 node = new UnixDot(); |
2148 node = new CharProperty(UNIXDOT); |
2130 else { |
2149 } else { |
2131 node = new Dot(); |
2150 node = new CharProperty(DOT); |
2132 } |
2151 } |
2133 } |
2152 } |
2134 break; |
2153 break; |
2135 case '|': |
2154 case '|': |
2136 case ')': |
2155 case ')': |
2153 node = atom(); |
2172 node = atom(); |
2154 break; |
2173 break; |
2155 } |
2174 } |
2156 |
2175 |
2157 node = closure(node); |
2176 node = closure(node); |
2158 |
2177 /* save the top dot-greedy nodes (.*, .+) as well |
2178 if (node instanceof GreedyCharProperty && |
|
2179 ((GreedyCharProperty)node).cp instanceof Dot) { |
|
2180 topClosureNodes.add(node); |
|
2181 } |
|
2182 */ |
|
2159 if (head == null) { |
2183 if (head == null) { |
2160 head = tail = node; |
2184 head = tail = node; |
2161 } else { |
2185 } else { |
2162 tail.next = node; |
2186 tail.next = node; |
2163 tail = node; |
2187 tail = node; |
2211 ch = next(); // Consume { if present |
2235 ch = next(); // Consume { if present |
2212 if (ch != '{') |
2236 if (ch != '{') |
2213 unread(); |
2237 unread(); |
2214 else |
2238 else |
2215 oneLetter = false; |
2239 oneLetter = false; |
2216 return family(oneLetter, comp); |
2240 if (has(CANON_EQ) && !has(LITERAL)) |
2241 return new NFCCharProperty(family(oneLetter, comp)); |
|
2242 else |
|
2243 return newCharProperty(family(oneLetter, comp)); |
|
2217 } |
2244 } |
2218 } |
2245 } |
2219 unread(); |
2246 unread(); |
2220 prev = cursor; |
2247 prev = cursor; |
2221 ch = escape(false, first == 0, false); |
2248 ch = escape(false, first == 0, false); |
2249 continue; |
2276 continue; |
2250 } |
2277 } |
2251 break; |
2278 break; |
2252 } |
2279 } |
2253 if (first == 1) { |
2280 if (first == 1) { |
2254 return newSingle(buffer[0]); |
2281 return newCharProperty(single(buffer[0])); |
2255 } else { |
2282 } else { |
2256 return newSlice(buffer, first, hasSupplementary); |
2283 return newSlice(buffer, first, hasSupplementary); |
2257 } |
2284 } |
2258 } |
2285 } |
2259 |
2286 |
2300 default: |
2327 default: |
2301 done = true; |
2328 done = true; |
2302 break; |
2329 break; |
2303 } |
2330 } |
2304 } |
2331 } |
2332 hasGroupRef = true; |
|
2305 if (has(CASE_INSENSITIVE)) |
2333 if (has(CASE_INSENSITIVE)) |
2306 return new CIBackRef(refNum, has(UNICODE_CASE)); |
2334 return new CIBackRef(refNum, has(UNICODE_CASE)); |
2307 else |
2335 else |
2308 return new BackRef(refNum); |
2336 return new BackRef(refNum); |
2309 } |
2337 } |
2344 if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); |
2372 if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); |
2345 return -1; |
2373 return -1; |
2346 case 'C': |
2374 case 'C': |
2347 break; |
2375 break; |
2348 case 'D': |
2376 case 'D': |
2349 if (create) root = has(UNICODE_CHARACTER_CLASS) |
2377 if (create) { |
2350 ? new Utype(UnicodeProp.DIGIT).complement() |
2378 predicate = has(UNICODE_CHARACTER_CLASS) ? |
2351 : new Ctype(ASCII.DIGIT).complement(); |
2379 CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT; |
2380 predicate = predicate.negate(); |
|
2381 if (!inclass) |
|
2382 root = newCharProperty(predicate); |
|
2383 } |
|
2352 return -1; |
2384 return -1; |
2353 case 'E': |
2385 case 'E': |
2354 case 'F': |
2386 case 'F': |
2355 break; |
2387 break; |
2356 case 'G': |
2388 case 'G': |
2357 if (inclass) break; |
2389 if (inclass) break; |
2358 if (create) root = new LastMatch(); |
2390 if (create) root = new LastMatch(); |
2359 return -1; |
2391 return -1; |
2360 case 'H': |
2392 case 'H': |
2361 if (create) root = new HorizWS().complement(); |
2393 if (create) { |
2394 predicate = HorizWS.negate(); |
|
2395 if (!inclass) |
|
2396 root = newCharProperty(predicate); |
|
2397 } |
|
2362 return -1; |
2398 return -1; |
2363 case 'I': |
2399 case 'I': |
2364 case 'J': |
2400 case 'J': |
2365 case 'K': |
2401 case 'K': |
2366 case 'L': |
2402 case 'L': |
2375 case 'R': |
2411 case 'R': |
2376 if (inclass) break; |
2412 if (inclass) break; |
2377 if (create) root = new LineEnding(); |
2413 if (create) root = new LineEnding(); |
2378 return -1; |
2414 return -1; |
2379 case 'S': |
2415 case 'S': |
2380 if (create) root = has(UNICODE_CHARACTER_CLASS) |
2416 if (create) { |
2381 ? new Utype(UnicodeProp.WHITE_SPACE).complement() |
2417 predicate = has(UNICODE_CHARACTER_CLASS) ? |
2382 : new Ctype(ASCII.SPACE).complement(); |
2418 CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE; |
2419 predicate = predicate.negate(); |
|
2420 if (!inclass) |
|
2421 root = newCharProperty(predicate); |
|
2422 } |
|
2383 return -1; |
2423 return -1; |
2384 case 'T': |
2424 case 'T': |
2385 case 'U': |
2425 case 'U': |
2386 break; |
2426 break; |
2387 case 'V': |
2427 case 'V': |
2388 if (create) root = new VertWS().complement(); |
2428 if (create) { |
2429 predicate = VertWS.negate(); |
|
2430 if (!inclass) |
|
2431 root = newCharProperty(predicate); |
|
2432 } |
|
2389 return -1; |
2433 return -1; |
2390 case 'W': |
2434 case 'W': |
2391 if (create) root = has(UNICODE_CHARACTER_CLASS) |
2435 if (create) { |
2392 ? new Utype(UnicodeProp.WORD).complement() |
2436 predicate = has(UNICODE_CHARACTER_CLASS) ? |
2393 : new Ctype(ASCII.WORD).complement(); |
2437 CharPredicates.WORD : CharPredicates.ASCII_WORD; |
2438 predicate = predicate.negate(); |
|
2439 if (!inclass) |
|
2440 root = newCharProperty(predicate); |
|
2441 } |
|
2394 return -1; |
2442 return -1; |
2395 case 'X': |
2443 case 'X': |
2396 if (inclass) break; |
2444 if (inclass) break; |
2397 if (create) { |
2445 if (create) { |
2398 root = new XGrapheme(); |
2446 root = new XGrapheme(); |
2428 } |
2476 } |
2429 return -1; |
2477 return -1; |
2430 case 'c': |
2478 case 'c': |
2431 return c(); |
2479 return c(); |
2432 case 'd': |
2480 case 'd': |
2433 if (create) root = has(UNICODE_CHARACTER_CLASS) |
2481 if (create) { |
2434 ? new Utype(UnicodeProp.DIGIT) |
2482 predicate = has(UNICODE_CHARACTER_CLASS) ? |
2435 : new Ctype(ASCII.DIGIT); |
2483 CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT; |
2484 if (!inclass) |
|
2485 root = newCharProperty(predicate); |
|
2486 } |
|
2436 return -1; |
2487 return -1; |
2437 case 'e': |
2488 case 'e': |
2438 return '\033'; |
2489 return '\033'; |
2439 case 'f': |
2490 case 'f': |
2440 return '\f'; |
2491 return '\f'; |
2441 case 'g': |
2492 case 'g': |
2442 break; |
2493 break; |
2443 case 'h': |
2494 case 'h': |
2444 if (create) root = new HorizWS(); |
2495 if (create) { |
2496 predicate = HorizWS; |
|
2497 if (!inclass) |
|
2498 root = newCharProperty(predicate); |
|
2499 } |
|
2445 return -1; |
2500 return -1; |
2446 case 'i': |
2501 case 'i': |
2447 case 'j': |
2502 case 'j': |
2448 break; |
2503 break; |
2449 case 'k': |
2504 case 'k': |
2453 throw error("\\k is not followed by '<' for named capturing group"); |
2508 throw error("\\k is not followed by '<' for named capturing group"); |
2454 String name = groupname(read()); |
2509 String name = groupname(read()); |
2455 if (!namedGroups().containsKey(name)) |
2510 if (!namedGroups().containsKey(name)) |
2456 throw error("(named capturing group <"+ name+"> does not exit"); |
2511 throw error("(named capturing group <"+ name+"> does not exit"); |
2457 if (create) { |
2512 if (create) { |
2513 hasGroupRef = true; |
|
2458 if (has(CASE_INSENSITIVE)) |
2514 if (has(CASE_INSENSITIVE)) |
2459 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE)); |
2515 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE)); |
2460 else |
2516 else |
2461 root = new BackRef(namedGroups().get(name)); |
2517 root = new BackRef(namedGroups().get(name)); |
2462 } |
2518 } |
2471 case 'q': |
2527 case 'q': |
2472 break; |
2528 break; |
2473 case 'r': |
2529 case 'r': |
2474 return '\r'; |
2530 return '\r'; |
2475 case 's': |
2531 case 's': |
2476 if (create) root = has(UNICODE_CHARACTER_CLASS) |
2532 if (create) { |
2477 ? new Utype(UnicodeProp.WHITE_SPACE) |
2533 predicate = has(UNICODE_CHARACTER_CLASS) ? |
2478 : new Ctype(ASCII.SPACE); |
2534 CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE; |
2535 if (!inclass) |
|
2536 root = newCharProperty(predicate); |
|
2537 } |
|
2479 return -1; |
2538 return -1; |
2480 case 't': |
2539 case 't': |
2481 return '\t'; |
2540 return '\t'; |
2482 case 'u': |
2541 case 'u': |
2483 return u(); |
2542 return u(); |
2490 // the start or end value, such as [\v-...] or [...-\v], in |
2549 // the start or end value, such as [\v-...] or [...-\v], in |
2491 // which a single definite value (0x0B) is expected. For |
2550 // which a single definite value (0x0B) is expected. For |
2492 // compatibility concern '\013'/0x0B is returned if isrange. |
2551 // compatibility concern '\013'/0x0B is returned if isrange. |
2493 if (isrange) |
2552 if (isrange) |
2494 return '\013'; |
2553 return '\013'; |
2495 if (create) root = new VertWS(); |
2554 if (create) { |
2555 predicate = VertWS; |
|
2556 if (!inclass) |
|
2557 root = newCharProperty(predicate); |
|
2558 } |
|
2496 return -1; |
2559 return -1; |
2497 case 'w': |
2560 case 'w': |
2498 if (create) root = has(UNICODE_CHARACTER_CLASS) |
2561 if (create) { |
2499 ? new Utype(UnicodeProp.WORD) |
2562 predicate = has(UNICODE_CHARACTER_CLASS) ? |
2500 : new Ctype(ASCII.WORD); |
2563 CharPredicates.WORD : CharPredicates.ASCII_WORD; |
2564 if (!inclass) |
|
2565 root = newCharProperty(predicate); |
|
2566 } |
|
2501 return -1; |
2567 return -1; |
2502 case 'x': |
2568 case 'x': |
2503 return x(); |
2569 return x(); |
2504 case 'y': |
2570 case 'y': |
2505 break; |
2571 break; |
2518 * |
2584 * |
2519 * Consumes a ] on the way out if consume is true. Usually consume |
2585 * Consumes a ] on the way out if consume is true. Usually consume |
2520 * is true except for the case of [abc&&def] where def is a separate |
2586 * is true except for the case of [abc&&def] where def is a separate |
2521 * right hand node with "understood" brackets. |
2587 * right hand node with "understood" brackets. |
2522 */ |
2588 */ |
2523 private CharProperty clazz(boolean consume) { |
2589 private CharPredicate clazz(boolean consume) { |
2524 CharProperty prev = null; |
2590 CharPredicate prev = null; |
2525 CharProperty node = null; |
2591 CharPredicate curr = null; |
2526 BitClass bits = new BitClass(); |
2592 BitClass bits = new BitClass(); |
2527 boolean include = true; |
2593 BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch]; |
2528 boolean firstInClass = true; |
2594 |
2595 boolean isNeg = false; |
|
2596 boolean hasBits = false; |
|
2529 int ch = next(); |
2597 int ch = next(); |
2598 |
|
2599 // Negates if first char in a class, otherwise literal |
|
2600 if (ch == '^' && temp[cursor-1] == '[') { |
|
2601 ch = next(); |
|
2602 isNeg = true; |
|
2603 } |
|
2530 for (;;) { |
2604 for (;;) { |
2531 switch (ch) { |
2605 switch (ch) { |
2532 case '^': |
|
2533 // Negates if first char in a class, otherwise literal |
|
2534 if (firstInClass) { |
|
2535 if (temp[cursor-1] != '[') |
|
2536 break; |
|
2537 ch = next(); |
|
2538 include = !include; |
|
2539 continue; |
|
2540 } else { |
|
2541 // ^ not first in class, treat as literal |
|
2542 break; |
|
2543 } |
|
2544 case '[': |
2606 case '[': |
2545 firstInClass = false; |
2607 curr = clazz(true); |
2546 node = clazz(true); |
|
2547 if (prev == null) |
2608 if (prev == null) |
2548 prev = node; |
2609 prev = curr; |
2549 else |
2610 else |
2550 prev = union(prev, node); |
2611 prev = prev.union(curr); |
2551 ch = peek(); |
2612 ch = peek(); |
2552 continue; |
2613 continue; |
2553 case '&': |
2614 case '&': |
2554 firstInClass = false; |
|
2555 ch = next(); |
2615 ch = next(); |
2556 if (ch == '&') { |
2616 if (ch == '&') { |
2557 ch = next(); |
2617 ch = next(); |
2558 CharProperty rightNode = null; |
2618 CharPredicate right = null; |
2559 while (ch != ']' && ch != '&') { |
2619 while (ch != ']' && ch != '&') { |
2560 if (ch == '[') { |
2620 if (ch == '[') { |
2561 if (rightNode == null) |
2621 if (right == null) |
2562 rightNode = clazz(true); |
2622 right = clazz(true); |
2563 else |
2623 else |
2564 rightNode = union(rightNode, clazz(true)); |
2624 right = right.union(clazz(true)); |
2565 } else { // abc&&def |
2625 } else { // abc&&def |
2566 unread(); |
2626 unread(); |
2567 rightNode = clazz(false); |
2627 right = clazz(false); |
2568 } |
2628 } |
2569 ch = peek(); |
2629 ch = peek(); |
2570 } |
2630 } |
2571 if (rightNode != null) |
2631 if (hasBits) { |
2572 node = rightNode; |
2632 // bits used, union has high precedence |
2633 if (prev == null) { |
|
2634 prev = curr = bitsP; |
|
2635 } else { |
|
2636 prev = prev.union(bitsP); |
|
2637 } |
|
2638 hasBits = false; |
|
2639 } |
|
2640 if (right != null) |
|
2641 curr = right; |
|
2573 if (prev == null) { |
2642 if (prev == null) { |
2574 if (rightNode == null) |
2643 if (right == null) |
2575 throw error("Bad class syntax"); |
2644 throw error("Bad class syntax"); |
2576 else |
2645 else |
2577 prev = rightNode; |
2646 prev = right; |
2578 } else { |
2647 } else { |
2579 prev = intersection(prev, node); |
2648 prev = prev.and(curr); |
2580 } |
2649 } |
2581 } else { |
2650 } else { |
2582 // treat as a literal & |
2651 // treat as a literal & |
2583 unread(); |
2652 unread(); |
2584 break; |
2653 break; |
2585 } |
2654 } |
2586 continue; |
2655 continue; |
2587 case 0: |
2656 case 0: |
2588 firstInClass = false; |
|
2589 if (cursor >= patternLength) |
2657 if (cursor >= patternLength) |
2590 throw error("Unclosed character class"); |
2658 throw error("Unclosed character class"); |
2591 break; |
2659 break; |
2592 case ']': |
2660 case ']': |
2593 firstInClass = false; |
2661 if (prev != null || hasBits) { |
2594 if (prev != null) { |
|
2595 if (consume) |
2662 if (consume) |
2596 next(); |
2663 next(); |
2664 if (prev == null) |
|
2665 prev = bitsP; |
|
2666 else if (hasBits) |
|
2667 prev = prev.union(bitsP); |
|
2668 if (isNeg) |
|
2669 return prev.negate(); |
|
2597 return prev; |
2670 return prev; |
2598 } |
2671 } |
2599 break; |
2672 break; |
2600 default: |
2673 default: |
2601 firstInClass = false; |
|
2602 break; |
2674 break; |
2603 } |
2675 } |
2604 node = range(bits); |
2676 curr = range(bits); |
2605 if (include) { |
2677 if (curr == null) { // the bits used |
2606 if (prev == null) { |
2678 hasBits = true; |
2607 prev = node; |
|
2608 } else { |
|
2609 if (prev != node) |
|
2610 prev = union(prev, node); |
|
2611 } |
|
2612 } else { |
2679 } else { |
2613 if (prev == null) { |
2680 if (prev == null) |
2614 prev = node.complement(); |
2681 prev = curr; |
2615 } else { |
2682 else if (prev != curr) |
2616 if (prev != node) |
2683 prev = prev.union(curr); |
2617 prev = setDifference(prev, node); |
|
2618 } |
|
2619 } |
2684 } |
2620 ch = peek(); |
2685 ch = peek(); |
2621 } |
2686 } |
2622 } |
2687 } |
2623 |
2688 |
2624 private CharProperty bitsOrSingle(BitClass bits, int ch) { |
2689 private CharPredicate bitsOrSingle(BitClass bits, int ch) { |
2625 /* Bits can only handle codepoints in [u+0000-u+00ff] range. |
2690 /* Bits can only handle codepoints in [u+0000-u+00ff] range. |
2626 Use "single" node instead of bits when dealing with unicode |
2691 Use "single" node instead of bits when dealing with unicode |
2627 case folding for codepoints listed below. |
2692 case folding for codepoints listed below. |
2628 (1)Uppercase out of range: u+00ff, u+00b5 |
2693 (1)Uppercase out of range: u+00ff, u+00b5 |
2629 toUpperCase(u+00ff) -> u+0178 |
2694 toUpperCase(u+00ff) -> u+0178 |
2641 */ |
2706 */ |
2642 int d; |
2707 int d; |
2643 if (ch < 256 && |
2708 if (ch < 256 && |
2644 !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) && |
2709 !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) && |
2645 (ch == 0xff || ch == 0xb5 || |
2710 (ch == 0xff || ch == 0xb5 || |
2646 ch == 0x49 || ch == 0x69 || //I and i |
2711 ch == 0x49 || ch == 0x69 || //I and i |
2647 ch == 0x53 || ch == 0x73 || //S and s |
2712 ch == 0x53 || ch == 0x73 || //S and s |
2648 ch == 0x4b || ch == 0x6b || //K and k |
2713 ch == 0x4b || ch == 0x6b || //K and k |
2649 ch == 0xc5 || ch == 0xe5))) //A+ring |
2714 ch == 0xc5 || ch == 0xe5))) { //A+ring |
2650 return bits.add(ch, flags()); |
2715 bits.add(ch, flags()); |
2651 return newSingle(ch); |
2716 return null; |
2717 } |
|
2718 return single(ch); |
|
2719 } |
|
2720 |
|
2721 /** |
|
2722 * Returns a suitably optimized, single character predicate |
|
2723 */ |
|
2724 private CharPredicate single(final int ch) { |
|
2725 if (has(CASE_INSENSITIVE)) { |
|
2726 int lower, upper; |
|
2727 if (has(UNICODE_CASE)) { |
|
2728 upper = Character.toUpperCase(ch); |
|
2729 lower = Character.toLowerCase(upper); |
|
2730 // Unicode case insensitive matches |
|
2731 if (upper != lower) |
|
2732 return SingleU(lower); |
|
2733 } else if (ASCII.isAscii(ch)) { |
|
2734 lower = ASCII.toLower(ch); |
|
2735 upper = ASCII.toUpper(ch); |
|
2736 // Case insensitive matches a given BMP character |
|
2737 if (lower != upper) |
|
2738 return SingleI(lower, upper); |
|
2739 } |
|
2740 } |
|
2741 if (isSupplementary(ch)) |
|
2742 return SingleS(ch); |
|
2743 return Single(ch); // Match a given BMP character |
|
2652 } |
2744 } |
2653 |
2745 |
2654 /** |
2746 /** |
2655 * Parse a single character or a character range in a character class |
2747 * Parse a single character or a character range in a character class |
2656 * and return its representative node. |
2748 * and return its representative node. |
2657 */ |
2749 */ |
2658 private CharProperty range(BitClass bits) { |
2750 private CharPredicate range(BitClass bits) { |
2659 int ch = peek(); |
2751 int ch = peek(); |
2660 if (ch == '\\') { |
2752 if (ch == '\\') { |
2661 ch = nextEscaped(); |
2753 ch = nextEscaped(); |
2662 if (ch == 'p' || ch == 'P') { // A property |
2754 if (ch == 'p' || ch == 'P') { // A property |
2663 boolean comp = (ch == 'P'); |
2755 boolean comp = (ch == 'P'); |
2672 } else { // ordinary escape |
2764 } else { // ordinary escape |
2673 boolean isrange = temp[cursor+1] == '-'; |
2765 boolean isrange = temp[cursor+1] == '-'; |
2674 unread(); |
2766 unread(); |
2675 ch = escape(true, true, isrange); |
2767 ch = escape(true, true, isrange); |
2676 if (ch == -1) |
2768 if (ch == -1) |
2677 return (CharProperty) root; |
2769 return predicate; |
2678 } |
2770 } |
2679 } else { |
2771 } else { |
2680 next(); |
2772 next(); |
2681 } |
2773 } |
2682 if (ch >= 0) { |
2774 if (ch >= 0) { |
2694 next(); |
2786 next(); |
2695 } |
2787 } |
2696 if (m < ch) { |
2788 if (m < ch) { |
2697 throw error("Illegal character range"); |
2789 throw error("Illegal character range"); |
2698 } |
2790 } |
2699 if (has(CASE_INSENSITIVE)) |
2791 if (has(CASE_INSENSITIVE)) { |
2700 return caseInsensitiveRangeFor(ch, m); |
2792 if (has(UNICODE_CASE)) |
2701 else |
2793 return CIRangeU(ch, m); |
2702 return rangeFor(ch, m); |
2794 return CIRange(ch, m); |
2795 } else { |
|
2796 return Range(ch, m); |
|
2797 } |
|
2703 } |
2798 } |
2704 } |
2799 } |
2705 return bitsOrSingle(bits, ch); |
2800 return bitsOrSingle(bits, ch); |
2706 } |
2801 } |
2707 throw error("Unexpected character '"+((char)ch)+"'"); |
2802 throw error("Unexpected character '"+((char)ch)+"'"); |
2708 } |
2803 } |
2709 |
2804 |
2710 /** |
2805 /** |
2711 * Parses a Unicode character family and returns its representative node. |
2806 * Parses a Unicode character family and returns its representative node. |
2712 */ |
2807 */ |
2713 private CharProperty family(boolean singleLetter, |
2808 private CharPredicate family(boolean singleLetter, boolean isComplement) { |
2714 boolean maybeComplement) |
|
2715 { |
|
2716 next(); |
2809 next(); |
2717 String name; |
2810 String name; |
2718 CharProperty node = null; |
2811 CharPredicate p = null; |
2719 |
2812 |
2720 if (singleLetter) { |
2813 if (singleLetter) { |
2721 int c = temp[cursor]; |
2814 int c = temp[cursor]; |
2722 if (!Character.isSupplementaryCodePoint(c)) { |
2815 if (!Character.isSupplementaryCodePoint(c)) { |
2723 name = String.valueOf((char)c); |
2816 name = String.valueOf((char)c); |
2745 String value = name.substring(i + 1); |
2838 String value = name.substring(i + 1); |
2746 name = name.substring(0, i).toLowerCase(Locale.ENGLISH); |
2839 name = name.substring(0, i).toLowerCase(Locale.ENGLISH); |
2747 switch (name) { |
2840 switch (name) { |
2748 case "sc": |
2841 case "sc": |
2749 case "script": |
2842 case "script": |
2750 node = unicodeScriptPropertyFor(value); |
2843 p = CharPredicates.forUnicodeScript(value); |
2751 break; |
2844 break; |
2752 case "blk": |
2845 case "blk": |
2753 case "block": |
2846 case "block": |
2754 node = unicodeBlockPropertyFor(value); |
2847 p = CharPredicates.forUnicodeBlock(value); |
2755 break; |
2848 break; |
2756 case "gc": |
2849 case "gc": |
2757 case "general_category": |
2850 case "general_category": |
2758 node = charPropertyNodeFor(value); |
2851 p = CharPredicates.forProperty(value); |
2759 break; |
2852 break; |
2760 default: |
2853 default: |
2761 throw error("Unknown Unicode property {name=<" + name + ">, " |
2854 break; |
2762 + "value=<" + value + ">}"); |
2855 } |
2763 } |
2856 if (p == null) |
2857 throw error("Unknown Unicode property {name=<" + name + ">, " |
|
2858 + "value=<" + value + ">}"); |
|
2859 |
|
2764 } else { |
2860 } else { |
2765 if (name.startsWith("In")) { |
2861 if (name.startsWith("In")) { |
2766 // \p{inBlockName} |
2862 // \p{InBlockName} |
2767 node = unicodeBlockPropertyFor(name.substring(2)); |
2863 p = CharPredicates.forUnicodeBlock(name.substring(2)); |
2768 } else if (name.startsWith("Is")) { |
2864 } else if (name.startsWith("Is")) { |
2769 // \p{isGeneralCategory} and \p{isScriptName} |
2865 // \p{IsGeneralCategory} and \p{IsScriptName} |
2770 name = name.substring(2); |
2866 name = name.substring(2); |
2771 UnicodeProp uprop = UnicodeProp.forName(name); |
2867 p = CharPredicates.forUnicodeProperty(name); |
2772 if (uprop != null) |
2868 if (p == null) |
2773 node = new Utype(uprop); |
2869 p = CharPredicates.forProperty(name); |
2774 if (node == null) |
2870 if (p == null) |
2775 node = CharPropertyNames.charPropertyFor(name); |
2871 p = CharPredicates.forUnicodeScript(name); |
2776 if (node == null) |
|
2777 node = unicodeScriptPropertyFor(name); |
|
2778 } else { |
2872 } else { |
2779 if (has(UNICODE_CHARACTER_CLASS)) { |
2873 if (has(UNICODE_CHARACTER_CLASS)) { |
2780 UnicodeProp uprop = UnicodeProp.forPOSIXName(name); |
2874 p = CharPredicates.forPOSIXName(name); |
2781 if (uprop != null) |
2875 } |
2782 node = new Utype(uprop); |
2876 if (p == null) |
2783 } |
2877 p = CharPredicates.forProperty(name); |
2784 if (node == null) |
2878 } |
2785 node = charPropertyNodeFor(name); |
2879 if (p == null) |
2786 } |
2880 throw error("Unknown character property name {In/Is" + name + "}"); |
2787 } |
2881 } |
2788 if (maybeComplement) { |
2882 if (isComplement) { |
2789 if (node instanceof Category || node instanceof Block) |
2883 // it might be too expensive to detect if a complement of |
2790 hasSupplementary = true; |
2884 // CharProperty can match "certain" supplementary. So just |
2791 node = node.complement(); |
2885 // go with StartS. |
2792 } |
2886 hasSupplementary = true; |
2793 return node; |
2887 p = p.negate(); |
2794 } |
2888 } |
2795 |
2889 return p; |
2796 |
2890 } |
2797 /** |
2891 |
2798 * Returns a CharProperty matching all characters belong to |
2892 private CharProperty newCharProperty(CharPredicate p) { |
2799 * a UnicodeScript. |
|
2800 */ |
|
2801 private CharProperty unicodeScriptPropertyFor(String name) { |
|
2802 final Character.UnicodeScript script; |
|
2803 try { |
|
2804 script = Character.UnicodeScript.forName(name); |
|
2805 } catch (IllegalArgumentException iae) { |
|
2806 throw error("Unknown character script name {" + name + "}"); |
|
2807 } |
|
2808 return new Script(script); |
|
2809 } |
|
2810 |
|
2811 /** |
|
2812 * Returns a CharProperty matching all characters in a UnicodeBlock. |
|
2813 */ |
|
2814 private CharProperty unicodeBlockPropertyFor(String name) { |
|
2815 final Character.UnicodeBlock block; |
|
2816 try { |
|
2817 block = Character.UnicodeBlock.forName(name); |
|
2818 } catch (IllegalArgumentException iae) { |
|
2819 throw error("Unknown character block name {" + name + "}"); |
|
2820 } |
|
2821 return new Block(block); |
|
2822 } |
|
2823 |
|
2824 /** |
|
2825 * Returns a CharProperty matching all characters in a named property. |
|
2826 */ |
|
2827 private CharProperty charPropertyNodeFor(String name) { |
|
2828 CharProperty p = CharPropertyNames.charPropertyFor(name); |
|
2829 if (p == null) |
2893 if (p == null) |
2830 throw error("Unknown character property name {" + name + "}"); |
2894 return null; |
2831 return p; |
2895 if (p instanceof BmpCharPredicate) |
2896 return new BmpCharProperty((BmpCharPredicate)p); |
|
2897 else |
|
2898 return new CharProperty(p); |
|
2832 } |
2899 } |
2833 |
2900 |
2834 /** |
2901 /** |
2835 * Parses and returns the name of a "named capturing group", the trailing |
2902 * Parses and returns the name of a "named capturing group", the trailing |
2836 * ">" is consumed after parsing. |
2903 * ">" is consumed after parsing. |
2857 private Node group0() { |
2924 private Node group0() { |
2858 boolean capturingGroup = false; |
2925 boolean capturingGroup = false; |
2859 Node head = null; |
2926 Node head = null; |
2860 Node tail = null; |
2927 Node tail = null; |
2861 int save = flags; |
2928 int save = flags; |
2929 int saveTCNCount = topClosureNodes.size(); |
|
2862 root = null; |
2930 root = null; |
2863 int ch = next(); |
2931 int ch = next(); |
2864 if (ch == '?') { |
2932 if (ch == '?') { |
2865 ch = skip(); |
2933 ch = skip(); |
2866 switch (ch) { |
2934 switch (ch) { |
2882 break; |
2950 break; |
2883 case '>': // (?>xxx) independent group |
2951 case '>': // (?>xxx) independent group |
2884 head = createGroup(true); |
2952 head = createGroup(true); |
2885 tail = root; |
2953 tail = root; |
2886 head.next = expr(tail); |
2954 head.next = expr(tail); |
2887 head = tail = new Ques(head, INDEPENDENT); |
2955 head = tail = new Ques(head, Qtype.INDEPENDENT); |
2888 break; |
2956 break; |
2889 case '<': // (?<xxx) look behind |
2957 case '<': // (?<xxx) look behind |
2890 ch = read(); |
2958 ch = read(); |
2891 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) { |
2959 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) { |
2892 // named captured group |
2960 // named captured group |
2926 new NotBehind(head, info.maxLength, |
2994 new NotBehind(head, info.maxLength, |
2927 info.minLength)); |
2995 info.minLength)); |
2928 } else { |
2996 } else { |
2929 throw error("Unknown look-behind group"); |
2997 throw error("Unknown look-behind group"); |
2930 } |
2998 } |
2999 // clear all top-closure-nodes inside lookbehind |
|
3000 if (saveTCNCount < topClosureNodes.size()) |
|
3001 topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear(); |
|
2931 break; |
3002 break; |
2932 case '$': |
3003 case '$': |
2933 case '@': |
3004 case '@': |
2934 throw error("Unknown group type"); |
3005 throw error("Unknown group type"); |
2935 default: // (?xxx:) inlined match flags |
3006 default: // (?xxx:) inlined match flags |
2966 if (head == tail) { // Zero length assertion |
3037 if (head == tail) { // Zero length assertion |
2967 root = node; |
3038 root = node; |
2968 return node; // Dual return |
3039 return node; // Dual return |
2969 } |
3040 } |
2970 |
3041 |
3042 // have group closure, clear all inner closure nodes from the |
|
3043 // top list (no backtracking stopper optimization for inner |
|
3044 if (saveTCNCount < topClosureNodes.size()) |
|
3045 topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear(); |
|
3046 |
|
2971 if (node instanceof Ques) { |
3047 if (node instanceof Ques) { |
2972 Ques ques = (Ques) node; |
3048 Ques ques = (Ques) node; |
2973 if (ques.type == POSSESSIVE) { |
3049 if (ques.type == Qtype.POSSESSIVE) { |
2974 root = node; |
3050 root = node; |
2975 return node; |
3051 return node; |
2976 } |
3052 } |
2977 tail.next = new BranchConn(); |
3053 tail.next = new BranchConn(); |
2978 tail = tail.next; |
3054 tail = tail.next; |
2979 if (ques.type == GREEDY) { |
3055 if (ques.type == Qtype.GREEDY) { |
2980 head = new Branch(head, null, tail); |
3056 head = new Branch(head, null, tail); |
2981 } else { // Reluctant quantifier |
3057 } else { // Reluctant quantifier |
2982 head = new Branch(null, head, tail); |
3058 head = new Branch(null, head, tail); |
2983 } |
3059 } |
2984 root = tail; |
3060 root = tail; |
2985 return head; |
3061 return head; |
2986 } else if (node instanceof Curly) { |
3062 } else if (node instanceof Curly) { |
2987 Curly curly = (Curly) node; |
3063 Curly curly = (Curly) node; |
2988 if (curly.type == POSSESSIVE) { |
3064 if (curly.type == Qtype.POSSESSIVE) { |
2989 root = node; |
3065 root = node; |
2990 return node; |
3066 return node; |
2991 } |
3067 } |
2992 // Discover if the group is deterministic |
3068 // Discover if the group is deterministic |
2993 TreeInfo info = new TreeInfo(); |
3069 TreeInfo info = new TreeInfo(); |
3000 capturingGroup); |
3076 capturingGroup); |
3001 return head; |
3077 return head; |
3002 } else { // Non-deterministic |
3078 } else { // Non-deterministic |
3003 int temp = ((GroupHead) head).localIndex; |
3079 int temp = ((GroupHead) head).localIndex; |
3004 Loop loop; |
3080 Loop loop; |
3005 if (curly.type == GREEDY) |
3081 if (curly.type == Qtype.GREEDY) { |
3006 loop = new Loop(this.localCount, temp); |
3082 loop = new Loop(this.localCount, temp); |
3007 else // Reluctant Curly |
3083 // add the max_reps greedy to the top-closure-node list |
3084 if (curly.cmax == MAX_REPS) |
|
3085 topClosureNodes.add(loop); |
|
3086 } else { // Reluctant Curly |
|
3008 loop = new LazyLoop(this.localCount, temp); |
3087 loop = new LazyLoop(this.localCount, temp); |
3088 } |
|
3009 Prolog prolog = new Prolog(loop); |
3089 Prolog prolog = new Prolog(loop); |
3010 this.localCount += 1; |
3090 this.localCount += 1; |
3011 loop.cmin = curly.cmin; |
3091 loop.cmin = curly.cmin; |
3012 loop.cmax = curly.cmax; |
3092 loop.cmax = curly.cmax; |
3013 loop.body = head; |
3093 loop.body = head; |
3029 int groupIndex = 0; |
3109 int groupIndex = 0; |
3030 if (!anonymous) |
3110 if (!anonymous) |
3031 groupIndex = capturingGroupCount++; |
3111 groupIndex = capturingGroupCount++; |
3032 GroupHead head = new GroupHead(localIndex); |
3112 GroupHead head = new GroupHead(localIndex); |
3033 root = new GroupTail(localIndex, groupIndex); |
3113 root = new GroupTail(localIndex, groupIndex); |
3114 |
|
3115 // for debug/print only, head.match does NOT need the "tail" info |
|
3116 head.tail = (GroupTail)root; |
|
3117 |
|
3034 if (!anonymous && groupIndex < 10) |
3118 if (!anonymous && groupIndex < 10) |
3035 groupNodes[groupIndex] = head; |
3119 groupNodes[groupIndex] = head; |
3036 return head; |
3120 return head; |
3037 } |
3121 } |
3038 |
3122 |
3117 } |
3201 } |
3118 } |
3202 } |
3119 |
3203 |
3120 static final int MAX_REPS = 0x7FFFFFFF; |
3204 static final int MAX_REPS = 0x7FFFFFFF; |
3121 |
3205 |
3122 static final int GREEDY = 0; |
3206 static enum Qtype { |
3123 |
3207 GREEDY, LAZY, POSSESSIVE, INDEPENDENT |
3124 static final int LAZY = 1; |
3208 } |
3125 |
3209 |
3126 static final int POSSESSIVE = 2; |
3210 private Node curly(Node prev, int cmin) { |
3127 |
3211 int ch = next(); |
3128 static final int INDEPENDENT = 3; |
3212 if (ch == '?') { |
3213 next(); |
|
3214 return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY); |
|
3215 } else if (ch == '+') { |
|
3216 next(); |
|
3217 return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE); |
|
3218 } |
|
3219 if (prev instanceof BmpCharProperty) { |
|
3220 return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin); |
|
3221 } else if (prev instanceof CharProperty) { |
|
3222 return new CharPropertyGreedy((CharProperty)prev, cmin); |
|
3223 } |
|
3224 return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY); |
|
3225 } |
|
3129 |
3226 |
3130 /** |
3227 /** |
3131 * Processes repetition. If the next character peeked is a quantifier |
3228 * Processes repetition. If the next character peeked is a quantifier |
3132 * then new nodes must be appended to handle the repetition. |
3229 * then new nodes must be appended to handle the repetition. |
3133 * Prev could be a single or a group, so it could be a chain of nodes. |
3230 * Prev could be a single or a group, so it could be a chain of nodes. |
3138 switch (ch) { |
3235 switch (ch) { |
3139 case '?': |
3236 case '?': |
3140 ch = next(); |
3237 ch = next(); |
3141 if (ch == '?') { |
3238 if (ch == '?') { |
3142 next(); |
3239 next(); |
3143 return new Ques(prev, LAZY); |
3240 return new Ques(prev, Qtype.LAZY); |
3144 } else if (ch == '+') { |
3241 } else if (ch == '+') { |
3145 next(); |
3242 next(); |
3146 return new Ques(prev, POSSESSIVE); |
3243 return new Ques(prev, Qtype.POSSESSIVE); |
3147 } |
3244 } |
3148 return new Ques(prev, GREEDY); |
3245 return new Ques(prev, Qtype.GREEDY); |
3149 case '*': |
3246 case '*': |
3150 ch = next(); |
3247 return curly(prev, 0); |
3151 if (ch == '?') { |
|
3152 next(); |
|
3153 return new Curly(prev, 0, MAX_REPS, LAZY); |
|
3154 } else if (ch == '+') { |
|
3155 next(); |
|
3156 return new Curly(prev, 0, MAX_REPS, POSSESSIVE); |
|
3157 } |
|
3158 return new Curly(prev, 0, MAX_REPS, GREEDY); |
|
3159 case '+': |
3248 case '+': |
3160 ch = next(); |
3249 return curly(prev, 1); |
3161 if (ch == '?') { |
|
3162 next(); |
|
3163 return new Curly(prev, 1, MAX_REPS, LAZY); |
|
3164 } else if (ch == '+') { |
|
3165 next(); |
|
3166 return new Curly(prev, 1, MAX_REPS, POSSESSIVE); |
|
3167 } |
|
3168 return new Curly(prev, 1, MAX_REPS, GREEDY); |
|
3169 case '{': |
3250 case '{': |
3170 ch = temp[cursor+1]; |
3251 ch = temp[cursor+1]; |
3171 if (ASCII.isDigit(ch)) { |
3252 if (ASCII.isDigit(ch)) { |
3172 skip(); |
3253 skip(); |
3173 int cmin = 0; |
3254 int cmin = 0; |
3192 throw error("Illegal repetition range"); |
3273 throw error("Illegal repetition range"); |
3193 Curly curly; |
3274 Curly curly; |
3194 ch = peek(); |
3275 ch = peek(); |
3195 if (ch == '?') { |
3276 if (ch == '?') { |
3196 next(); |
3277 next(); |
3197 curly = new Curly(prev, cmin, cmax, LAZY); |
3278 curly = new Curly(prev, cmin, cmax, Qtype.LAZY); |
3198 } else if (ch == '+') { |
3279 } else if (ch == '+') { |
3199 next(); |
3280 next(); |
3200 curly = new Curly(prev, cmin, cmax, POSSESSIVE); |
3281 curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE); |
3201 } else { |
3282 } else { |
3202 curly = new Curly(prev, cmin, cmax, GREEDY); |
3283 curly = new Curly(prev, cmin, cmax, Qtype.GREEDY); |
3203 } |
3284 } |
3204 return curly; |
3285 return curly; |
3205 } else { |
3286 } else { |
3206 throw error("Illegal repetition"); |
3287 throw error("Illegal repetition"); |
3207 } |
3288 } |
3374 /** |
3455 /** |
3375 * Creates a bit vector for matching Latin-1 values. A normal BitClass |
3456 * Creates a bit vector for matching Latin-1 values. A normal BitClass |
3376 * never matches values above Latin-1, and a complemented BitClass always |
3457 * never matches values above Latin-1, and a complemented BitClass always |
3377 * matches values above Latin-1. |
3458 * matches values above Latin-1. |
3378 */ |
3459 */ |
3379 private static final class BitClass extends BmpCharProperty { |
3460 static final class BitClass extends BmpCharProperty { |
3380 final boolean[] bits; |
3461 final boolean[] bits; |
3381 BitClass() { bits = new boolean[256]; } |
3462 BitClass() { |
3382 private BitClass(boolean[] bits) { this.bits = bits; } |
3463 this(new boolean[256]); |
3464 } |
|
3465 private BitClass(boolean[] bits) { |
|
3466 super( ch -> ch < 256 && bits[ch]); |
|
3467 this.bits = bits; |
|
3468 } |
|
3383 BitClass add(int c, int flags) { |
3469 BitClass add(int c, int flags) { |
3384 assert c >= 0 && c <= 255; |
3470 assert c >= 0 && c <= 255; |
3385 if ((flags & CASE_INSENSITIVE) != 0) { |
3471 if ((flags & CASE_INSENSITIVE) != 0) { |
3386 if (ASCII.isAscii(c)) { |
3472 if (ASCII.isAscii(c)) { |
3387 bits[ASCII.toUpper(c)] = true; |
3473 bits[ASCII.toUpper(c)] = true; |
3392 } |
3478 } |
3393 } |
3479 } |
3394 bits[c] = true; |
3480 bits[c] = true; |
3395 return this; |
3481 return this; |
3396 } |
3482 } |
3397 boolean isSatisfiedBy(int ch) { |
|
3398 return ch < 256 && bits[ch]; |
|
3399 } |
|
3400 } |
|
3401 |
|
3402 /** |
|
3403 * Returns a suitably optimized, single character matcher. |
|
3404 */ |
|
3405 private CharProperty newSingle(final int ch) { |
|
3406 if (has(CASE_INSENSITIVE)) { |
|
3407 int lower, upper; |
|
3408 if (has(UNICODE_CASE)) { |
|
3409 upper = Character.toUpperCase(ch); |
|
3410 lower = Character.toLowerCase(upper); |
|
3411 if (upper != lower) |
|
3412 return new SingleU(lower); |
|
3413 } else if (ASCII.isAscii(ch)) { |
|
3414 lower = ASCII.toLower(ch); |
|
3415 upper = ASCII.toUpper(ch); |
|
3416 if (lower != upper) |
|
3417 return new SingleI(lower, upper); |
|
3418 } |
|
3419 } |
|
3420 if (isSupplementary(ch)) |
|
3421 return new SingleS(ch); // Match a given Unicode character |
|
3422 return new Single(ch); // Match a given BMP character |
|
3423 } |
3483 } |
3424 |
3484 |
3425 /** |
3485 /** |
3426 * Utility method for creating a string slice matcher. |
3486 * Utility method for creating a string slice matcher. |
3427 */ |
3487 */ |
3825 |
3885 |
3826 /** |
3886 /** |
3827 * Abstract node class to match one character satisfying some |
3887 * Abstract node class to match one character satisfying some |
3828 * boolean property. |
3888 * boolean property. |
3829 */ |
3889 */ |
3830 private abstract static class CharProperty extends Node { |
3890 static class CharProperty extends Node { |
3831 abstract boolean isSatisfiedBy(int ch); |
3891 CharPredicate predicate; |
3832 CharProperty complement() { |
3892 |
3833 return new CharProperty() { |
3893 CharProperty (CharPredicate predicate) { |
3834 boolean isSatisfiedBy(int ch) { |
3894 this.predicate = predicate; |
3835 return ! CharProperty.this.isSatisfiedBy(ch);}}; |
|
3836 } |
3895 } |
3837 boolean match(Matcher matcher, int i, CharSequence seq) { |
3896 boolean match(Matcher matcher, int i, CharSequence seq) { |
3838 if (i < matcher.to) { |
3897 if (i < matcher.to) { |
3839 int ch = Character.codePointAt(seq, i); |
3898 int ch = Character.codePointAt(seq, i); |
3840 return isSatisfiedBy(ch) |
3899 return predicate.is(ch) && |
3841 && next.match(matcher, i+Character.charCount(ch), seq); |
3900 next.match(matcher, i + Character.charCount(ch), seq); |
3842 } else { |
3901 } else { |
3843 matcher.hitEnd = true; |
3902 matcher.hitEnd = true; |
3844 return false; |
3903 return false; |
3845 } |
3904 } |
3846 } |
3905 } |
3853 |
3912 |
3854 /** |
3913 /** |
3855 * Optimized version of CharProperty that works only for |
3914 * Optimized version of CharProperty that works only for |
3856 * properties never satisfied by Supplementary characters. |
3915 * properties never satisfied by Supplementary characters. |
3857 */ |
3916 */ |
3858 private abstract static class BmpCharProperty extends CharProperty { |
3917 private static class BmpCharProperty extends CharProperty { |
3918 BmpCharProperty (BmpCharPredicate predicate) { |
|
3919 super(predicate); |
|
3920 } |
|
3859 boolean match(Matcher matcher, int i, CharSequence seq) { |
3921 boolean match(Matcher matcher, int i, CharSequence seq) { |
3860 if (i < matcher.to) { |
3922 if (i < matcher.to) { |
3861 return isSatisfiedBy(seq.charAt(i)) |
3923 return predicate.is(seq.charAt(i)) && |
3862 && next.match(matcher, i+1, seq); |
3924 next.match(matcher, i + 1, seq); |
3863 } else { |
3925 } else { |
3864 matcher.hitEnd = true; |
3926 matcher.hitEnd = true; |
3865 return false; |
3927 return false; |
3866 } |
3928 } |
3867 } |
3929 } |
3868 } |
3930 } |
3869 |
3931 |
3870 /** |
3932 private static class NFCCharProperty extends Node { |
3871 * Node class that matches a Supplementary Unicode character |
3933 CharPredicate predicate; |
3872 */ |
3934 NFCCharProperty (CharPredicate predicate) { |
3873 static final class SingleS extends CharProperty { |
3935 this.predicate = predicate; |
3874 final int c; |
3936 } |
3875 SingleS(int c) { this.c = c; } |
3937 |
3876 boolean isSatisfiedBy(int ch) { |
3938 boolean match(Matcher matcher, int i, CharSequence seq) { |
3877 return ch == c; |
3939 if (i < matcher.to) { |
3878 } |
3940 int ch0 = Character.codePointAt(seq, i); |
3879 } |
3941 int n = Character.charCount(ch0); |
3880 |
3942 int j = i + n; |
3881 /** |
3943 while (j < matcher.to) { |
3882 * Optimization -- matches a given BMP character |
3944 int ch1 = Character.codePointAt(seq, j); |
3883 */ |
3945 if (Grapheme.isBoundary(ch0, ch1)) |
3884 static final class Single extends BmpCharProperty { |
3946 break; |
3885 final int c; |
3947 ch0 = ch1; |
3886 Single(int c) { this.c = c; } |
3948 j += Character.charCount(ch1); |
3887 boolean isSatisfiedBy(int ch) { |
3949 } |
3888 return ch == c; |
3950 if (i + n == j) { // single, assume nfc cp |
3889 } |
3951 if (predicate.is(ch0)) |
3890 } |
3952 return next.match(matcher, j, seq); |
3891 |
3953 } else { |
3892 /** |
3954 while (i + n < j) { |
3893 * Case insensitive matches a given BMP character |
3955 String nfc = Normalizer.normalize( |
3894 */ |
3956 seq.toString().substring(i, j), Normalizer.Form.NFC); |
3895 static final class SingleI extends BmpCharProperty { |
3957 if (nfc.codePointCount(0, nfc.length()) == 1) { |
3896 final int lower; |
3958 if (predicate.is(nfc.codePointAt(0)) && |
3897 final int upper; |
3959 next.match(matcher, j, seq)) { |
3898 SingleI(int lower, int upper) { |
3960 return true; |
3899 this.lower = lower; |
3961 } |
3900 this.upper = upper; |
3962 } |
3901 } |
3963 |
3902 boolean isSatisfiedBy(int ch) { |
3964 ch0 = Character.codePointBefore(seq, j); |
3903 return ch == lower || ch == upper; |
3965 j -= Character.charCount(ch0); |
3904 } |
3966 } |
3905 } |
3967 } |
3906 |
3968 if (j < matcher.to) |
3907 /** |
3969 return false; |
3908 * Unicode case insensitive matches a given Unicode character |
3970 } |
3909 */ |
3971 matcher.hitEnd = true; |
3910 static final class SingleU extends CharProperty { |
3972 return false; |
3911 final int lower; |
3973 } |
3912 SingleU(int lower) { |
3974 |
3913 this.lower = lower; |
3975 boolean study(TreeInfo info) { |
3914 } |
3976 info.minLength++; |
3915 boolean isSatisfiedBy(int ch) { |
3977 info.deterministic = false; |
3916 return lower == ch || |
3978 return next.study(info); |
3917 lower == Character.toLowerCase(Character.toUpperCase(ch)); |
|
3918 } |
|
3919 } |
|
3920 |
|
3921 /** |
|
3922 * Node class that matches a Unicode block. |
|
3923 */ |
|
3924 static final class Block extends CharProperty { |
|
3925 final Character.UnicodeBlock block; |
|
3926 Block(Character.UnicodeBlock block) { |
|
3927 this.block = block; |
|
3928 } |
|
3929 boolean isSatisfiedBy(int ch) { |
|
3930 return block == Character.UnicodeBlock.of(ch); |
|
3931 } |
|
3932 } |
|
3933 |
|
3934 /** |
|
3935 * Node class that matches a Unicode script |
|
3936 */ |
|
3937 static final class Script extends CharProperty { |
|
3938 final Character.UnicodeScript script; |
|
3939 Script(Character.UnicodeScript script) { |
|
3940 this.script = script; |
|
3941 } |
|
3942 boolean isSatisfiedBy(int ch) { |
|
3943 return script == Character.UnicodeScript.of(ch); |
|
3944 } |
|
3945 } |
|
3946 |
|
3947 /** |
|
3948 * Node class that matches a Unicode category. |
|
3949 */ |
|
3950 static final class Category extends CharProperty { |
|
3951 final int typeMask; |
|
3952 Category(int typeMask) { this.typeMask = typeMask; } |
|
3953 boolean isSatisfiedBy(int ch) { |
|
3954 return (typeMask & (1 << Character.getType(ch))) != 0; |
|
3955 } |
|
3956 } |
|
3957 |
|
3958 /** |
|
3959 * Node class that matches a Unicode "type" |
|
3960 */ |
|
3961 static final class Utype extends CharProperty { |
|
3962 final UnicodeProp uprop; |
|
3963 Utype(UnicodeProp uprop) { this.uprop = uprop; } |
|
3964 boolean isSatisfiedBy(int ch) { |
|
3965 return uprop.is(ch); |
|
3966 } |
|
3967 } |
|
3968 |
|
3969 /** |
|
3970 * Node class that matches a POSIX type. |
|
3971 */ |
|
3972 static final class Ctype extends BmpCharProperty { |
|
3973 final int ctype; |
|
3974 Ctype(int ctype) { this.ctype = ctype; } |
|
3975 boolean isSatisfiedBy(int ch) { |
|
3976 return ch < 128 && ASCII.isType(ch, ctype); |
|
3977 } |
|
3978 } |
|
3979 |
|
3980 /** |
|
3981 * Node class that matches a Perl vertical whitespace |
|
3982 */ |
|
3983 static final class VertWS extends BmpCharProperty { |
|
3984 boolean isSatisfiedBy(int cp) { |
|
3985 return (cp >= 0x0A && cp <= 0x0D) || |
|
3986 cp == 0x85 || cp == 0x2028 || cp == 0x2029; |
|
3987 } |
|
3988 } |
|
3989 |
|
3990 /** |
|
3991 * Node class that matches a Perl horizontal whitespace |
|
3992 */ |
|
3993 static final class HorizWS extends BmpCharProperty { |
|
3994 boolean isSatisfiedBy(int cp) { |
|
3995 return cp == 0x09 || cp == 0x20 || cp == 0xa0 || |
|
3996 cp == 0x1680 || cp == 0x180e || |
|
3997 cp >= 0x2000 && cp <= 0x200a || |
|
3998 cp == 0x202f || cp == 0x205f || cp == 0x3000; |
|
3999 } |
3979 } |
4000 } |
3980 } |
4001 |
3981 |
4002 /** |
3982 /** |
4003 * Node class that matches an unicode extended grapheme cluster |
3983 * Node class that matches an unicode extended grapheme cluster |
4215 int toLower(int c) { |
4195 int toLower(int c) { |
4216 return Character.toLowerCase(Character.toUpperCase(c)); |
4196 return Character.toLowerCase(Character.toUpperCase(c)); |
4217 } |
4197 } |
4218 } |
4198 } |
4219 |
4199 |
4220 private static boolean inRange(int lower, int ch, int upper) { |
|
4221 return lower <= ch && ch <= upper; |
|
4222 } |
|
4223 |
|
4224 /** |
|
4225 * Returns node for matching characters within an explicit value range. |
|
4226 */ |
|
4227 private static CharProperty rangeFor(final int lower, |
|
4228 final int upper) { |
|
4229 return new CharProperty() { |
|
4230 boolean isSatisfiedBy(int ch) { |
|
4231 return inRange(lower, ch, upper);}}; |
|
4232 } |
|
4233 |
|
4234 /** |
|
4235 * Returns node for matching characters within an explicit value |
|
4236 * range in a case insensitive manner. |
|
4237 */ |
|
4238 private CharProperty caseInsensitiveRangeFor(final int lower, |
|
4239 final int upper) { |
|
4240 if (has(UNICODE_CASE)) |
|
4241 return new CharProperty() { |
|
4242 boolean isSatisfiedBy(int ch) { |
|
4243 if (inRange(lower, ch, upper)) |
|
4244 return true; |
|
4245 int up = Character.toUpperCase(ch); |
|
4246 return inRange(lower, up, upper) || |
|
4247 inRange(lower, Character.toLowerCase(up), upper);}}; |
|
4248 return new CharProperty() { |
|
4249 boolean isSatisfiedBy(int ch) { |
|
4250 return inRange(lower, ch, upper) || |
|
4251 ASCII.isAscii(ch) && |
|
4252 (inRange(lower, ASCII.toUpper(ch), upper) || |
|
4253 inRange(lower, ASCII.toLower(ch), upper)); |
|
4254 }}; |
|
4255 } |
|
4256 |
|
4257 /** |
|
4258 * Implements the Unicode category ALL and the dot metacharacter when |
|
4259 * in dotall mode. |
|
4260 */ |
|
4261 static final class All extends CharProperty { |
|
4262 boolean isSatisfiedBy(int ch) { |
|
4263 return true; |
|
4264 } |
|
4265 } |
|
4266 |
|
4267 /** |
|
4268 * Node class for the dot metacharacter when dotall is not enabled. |
|
4269 */ |
|
4270 static final class Dot extends CharProperty { |
|
4271 boolean isSatisfiedBy(int ch) { |
|
4272 return (ch != '\n' && ch != '\r' |
|
4273 && (ch|1) != '\u2029' |
|
4274 && ch != '\u0085'); |
|
4275 } |
|
4276 } |
|
4277 |
|
4278 /** |
|
4279 * Node class for the dot metacharacter when dotall is not enabled |
|
4280 * but UNIX_LINES is enabled. |
|
4281 */ |
|
4282 static final class UnixDot extends CharProperty { |
|
4283 boolean isSatisfiedBy(int ch) { |
|
4284 return ch != '\n'; |
|
4285 } |
|
4286 } |
|
4287 |
|
4288 /** |
4200 /** |
4289 * The 0 or 1 quantifier. This one class implements all three types. |
4201 * The 0 or 1 quantifier. This one class implements all three types. |
4290 */ |
4202 */ |
4291 static final class Ques extends Node { |
4203 static final class Ques extends Node { |
4292 Node atom; |
4204 Node atom; |
4293 int type; |
4205 Qtype type; |
4294 Ques(Node node, int type) { |
4206 Ques(Node node, Qtype type) { |
4295 this.atom = node; |
4207 this.atom = node; |
4296 this.type = type; |
4208 this.type = type; |
4297 } |
4209 } |
4298 boolean match(Matcher matcher, int i, CharSequence seq) { |
4210 boolean match(Matcher matcher, int i, CharSequence seq) { |
4299 switch (type) { |
4211 switch (type) { |
4309 default: |
4221 default: |
4310 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq); |
4222 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq); |
4311 } |
4223 } |
4312 } |
4224 } |
4313 boolean study(TreeInfo info) { |
4225 boolean study(TreeInfo info) { |
4314 if (type != INDEPENDENT) { |
4226 if (type != Qtype.INDEPENDENT) { |
4315 int minL = info.minLength; |
4227 int minL = info.minLength; |
4316 atom.study(info); |
4228 atom.study(info); |
4317 info.minLength = minL; |
4229 info.minLength = minL; |
4318 info.deterministic = false; |
4230 info.deterministic = false; |
4319 return next.study(info); |
4231 return next.study(info); |
4323 } |
4235 } |
4324 } |
4236 } |
4325 } |
4237 } |
4326 |
4238 |
4327 /** |
4239 /** |
4240 * Handles the greedy style repetition with the minimum either be |
|
4241 * 0 or 1 and the maximum be MAX_REPS, for * and + quantifier. |
|
4242 */ |
|
4243 static class CharPropertyGreedy extends Node { |
|
4244 final CharPredicate predicate; |
|
4245 final int cmin; |
|
4246 |
|
4247 CharPropertyGreedy(CharProperty cp, int cmin) { |
|
4248 this.predicate = cp.predicate; |
|
4249 this.cmin = cmin; |
|
4250 } |
|
4251 boolean match(Matcher matcher, int i, CharSequence seq) { |
|
4252 int n = 0; |
|
4253 int to = matcher.to; |
|
4254 // greedy, all the way down |
|
4255 while (i < to) { |
|
4256 int ch = Character.codePointAt(seq, i); |
|
4257 if (!predicate.is(ch)) |
|
4258 break; |
|
4259 i += Character.charCount(ch); |
|
4260 n++; |
|
4261 } |
|
4262 if (i >= to) { |
|
4263 matcher.hitEnd = true; |
|
4264 } |
|
4265 while (n >= cmin) { |
|
4266 if (next.match(matcher, i, seq)) |
|
4267 return true; |
|
4268 if (n == cmin) |
|
4269 return false; |
|
4270 // backing off if match fails |
|
4271 int ch = Character.codePointBefore(seq, i); |
|
4272 i -= Character.charCount(ch); |
|
4273 n--; |
|
4274 } |
|
4275 return false; |
|
4276 } |
|
4277 |
|
4278 boolean study(TreeInfo info) { |
|
4279 info.minLength += cmin; |
|
4280 if (info.maxValid) { |
|
4281 info.maxLength += MAX_REPS; |
|
4282 } |
|
4283 info.deterministic = false; |
|
4284 return next.study(info); |
|
4285 } |
|
4286 } |
|
4287 |
|
4288 static final class BmpCharPropertyGreedy extends CharPropertyGreedy { |
|
4289 |
|
4290 BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) { |
|
4291 super(bcp, cmin); |
|
4292 } |
|
4293 |
|
4294 boolean match(Matcher matcher, int i, CharSequence seq) { |
|
4295 int n = 0; |
|
4296 int to = matcher.to; |
|
4297 while (i < to && predicate.is(seq.charAt(i))) { |
|
4298 i++; n++; |
|
4299 } |
|
4300 if (i >= to) { |
|
4301 matcher.hitEnd = true; |
|
4302 } |
|
4303 while (n >= cmin) { |
|
4304 if (next.match(matcher, i, seq)) |
|
4305 return true; |
|
4306 i--; n--; // backing off if match fails |
|
4307 } |
|
4308 return false; |
|
4309 } |
|
4310 } |
|
4311 |
|
4312 /** |
|
4328 * Handles the curly-brace style repetition with a specified minimum and |
4313 * Handles the curly-brace style repetition with a specified minimum and |
4329 * maximum occurrences. The * quantifier is handled as a special case. |
4314 * maximum occurrences. The * quantifier is handled as a special case. |
4330 * This class handles the three types. |
4315 * This class handles the three types. |
4331 */ |
4316 */ |
4332 static final class Curly extends Node { |
4317 static final class Curly extends Node { |
4333 Node atom; |
4318 Node atom; |
4334 int type; |
4319 Qtype type; |
4335 int cmin; |
4320 int cmin; |
4336 int cmax; |
4321 int cmax; |
4337 |
4322 |
4338 Curly(Node node, int cmin, int cmax, int type) { |
4323 Curly(Node node, int cmin, int cmax, Qtype type) { |
4339 this.atom = node; |
4324 this.atom = node; |
4340 this.type = type; |
4325 this.type = type; |
4341 this.cmin = cmin; |
4326 this.cmin = cmin; |
4342 this.cmax = cmax; |
4327 this.cmax = cmax; |
4343 } |
4328 } |
4348 i = matcher.last; |
4333 i = matcher.last; |
4349 continue; |
4334 continue; |
4350 } |
4335 } |
4351 return false; |
4336 return false; |
4352 } |
4337 } |
4353 if (type == GREEDY) |
4338 if (type == Qtype.GREEDY) |
4354 return match0(matcher, i, j, seq); |
4339 return match0(matcher, i, j, seq); |
4355 else if (type == LAZY) |
4340 else if (type == Qtype.LAZY) |
4356 return match1(matcher, i, j, seq); |
4341 return match1(matcher, i, j, seq); |
4357 else |
4342 else |
4358 return match2(matcher, i, j, seq); |
4343 return match2(matcher, i, j, seq); |
4359 } |
4344 } |
4360 // Greedy match. |
4345 // Greedy match. |
4472 * If capture is true then this class saves group settings and ensures |
4457 * If capture is true then this class saves group settings and ensures |
4473 * that groups are unset when backing off of a group match. |
4458 * that groups are unset when backing off of a group match. |
4474 */ |
4459 */ |
4475 static final class GroupCurly extends Node { |
4460 static final class GroupCurly extends Node { |
4476 Node atom; |
4461 Node atom; |
4477 int type; |
4462 Qtype type; |
4478 int cmin; |
4463 int cmin; |
4479 int cmax; |
4464 int cmax; |
4480 int localIndex; |
4465 int localIndex; |
4481 int groupIndex; |
4466 int groupIndex; |
4482 boolean capture; |
4467 boolean capture; |
4483 |
4468 |
4484 GroupCurly(Node node, int cmin, int cmax, int type, int local, |
4469 GroupCurly(Node node, int cmin, int cmax, Qtype type, int local, |
4485 int group, boolean capture) { |
4470 int group, boolean capture) { |
4486 this.atom = node; |
4471 this.atom = node; |
4487 this.type = type; |
4472 this.type = type; |
4488 this.cmin = cmin; |
4473 this.cmin = cmin; |
4489 this.cmax = cmax; |
4474 this.cmax = cmax; |
4519 ret = false; |
4504 ret = false; |
4520 break; |
4505 break; |
4521 } |
4506 } |
4522 } |
4507 } |
4523 if (ret) { |
4508 if (ret) { |
4524 if (type == GREEDY) { |
4509 if (type == Qtype.GREEDY) { |
4525 ret = match0(matcher, i, cmin, seq); |
4510 ret = match0(matcher, i, cmin, seq); |
4526 } else if (type == LAZY) { |
4511 } else if (type == Qtype.LAZY) { |
4527 ret = match1(matcher, i, cmin, seq); |
4512 ret = match1(matcher, i, cmin, seq); |
4528 } else { |
4513 } else { |
4529 ret = match2(matcher, i, cmin, seq); |
4514 ret = match2(matcher, i, cmin, seq); |
4530 } |
4515 } |
4531 } |
4516 } |
4767 * indicate that we do not want to unset the group if the reference |
4752 * indicate that we do not want to unset the group if the reference |
4768 * doesn't match. |
4753 * doesn't match. |
4769 */ |
4754 */ |
4770 static final class GroupHead extends Node { |
4755 static final class GroupHead extends Node { |
4771 int localIndex; |
4756 int localIndex; |
4757 GroupTail tail; // for debug/print only, match does not need to know |
|
4772 GroupHead(int localCount) { |
4758 GroupHead(int localCount) { |
4773 localIndex = localCount; |
4759 localIndex = localCount; |
4774 } |
4760 } |
4775 boolean match(Matcher matcher, int i, CharSequence seq) { |
4761 boolean match(Matcher matcher, int i, CharSequence seq) { |
4776 int save = matcher.locals[localIndex]; |
4762 int save = matcher.locals[localIndex]; |
4874 static class Loop extends Node { |
4860 static class Loop extends Node { |
4875 Node body; |
4861 Node body; |
4876 int countIndex; // local count index in matcher locals |
4862 int countIndex; // local count index in matcher locals |
4877 int beginIndex; // group beginning index |
4863 int beginIndex; // group beginning index |
4878 int cmin, cmax; |
4864 int cmin, cmax; |
4865 int posIndex; |
|
4879 Loop(int countIndex, int beginIndex) { |
4866 Loop(int countIndex, int beginIndex) { |
4880 this.countIndex = countIndex; |
4867 this.countIndex = countIndex; |
4881 this.beginIndex = beginIndex; |
4868 this.beginIndex = beginIndex; |
4869 this.posIndex = -1; |
|
4882 } |
4870 } |
4883 boolean match(Matcher matcher, int i, CharSequence seq) { |
4871 boolean match(Matcher matcher, int i, CharSequence seq) { |
4884 // Avoid infinite loop in zero-length case. |
4872 // Avoid infinite loop in zero-length case. |
4885 if (i > matcher.locals[beginIndex]) { |
4873 if (i > matcher.locals[beginIndex]) { |
4886 int count = matcher.locals[countIndex]; |
4874 int count = matcher.locals[countIndex]; |
4899 return b; |
4887 return b; |
4900 } |
4888 } |
4901 // This block is for after we have the minimum |
4889 // This block is for after we have the minimum |
4902 // iterations required for the loop to match |
4890 // iterations required for the loop to match |
4903 if (count < cmax) { |
4891 if (count < cmax) { |
4892 // Let's check if we have already tried and failed |
|
4893 // at this starting position "i" in the past. |
|
4894 // If yes, then just return false wihtout trying |
|
4895 // again, to stop the exponential backtracking. |
|
4896 if (posIndex != -1 && |
|
4897 matcher.localsPos[posIndex].contains(i)) { |
|
4898 return next.match(matcher, i, seq); |
|
4899 } |
|
4904 matcher.locals[countIndex] = count + 1; |
4900 matcher.locals[countIndex] = count + 1; |
4905 boolean b = body.match(matcher, i, seq); |
4901 boolean b = body.match(matcher, i, seq); |
4906 // If match failed we must backtrack, so |
4902 // If match failed we must backtrack, so |
4907 // the loop count should NOT be incremented |
4903 // the loop count should NOT be incremented |
4908 if (!b) |
4904 if (b) |
4909 matcher.locals[countIndex] = count; |
|
4910 else |
|
4911 return true; |
4905 return true; |
4906 matcher.locals[countIndex] = count; |
|
4907 // save the failed position |
|
4908 if (posIndex != -1) { |
|
4909 matcher.localsPos[posIndex].add(i); |
|
4910 } |
|
4912 } |
4911 } |
4913 } |
4912 } |
4914 return next.match(matcher, i, seq); |
4913 return next.match(matcher, i, seq); |
4915 } |
4914 } |
4916 boolean matchInit(Matcher matcher, int i, CharSequence seq) { |
4915 boolean matchInit(Matcher matcher, int i, CharSequence seq) { |
4917 int save = matcher.locals[countIndex]; |
4916 int save = matcher.locals[countIndex]; |
4918 boolean ret = false; |
4917 boolean ret = false; |
4918 if (posIndex != -1 && matcher.localsPos[posIndex] == null) { |
|
4919 matcher.localsPos[posIndex] = new IntHashSet(); |
|
4920 } |
|
4919 if (0 < cmin) { |
4921 if (0 < cmin) { |
4920 matcher.locals[countIndex] = 1; |
4922 matcher.locals[countIndex] = 1; |
4921 ret = body.match(matcher, i, seq); |
4923 ret = body.match(matcher, i, seq); |
4922 } else if (0 < cmax) { |
4924 } else if (0 < cmax) { |
4923 matcher.locals[countIndex] = 1; |
4925 matcher.locals[countIndex] = 1; |
5360 return !conditionMatched && next.match(matcher, i, seq); |
5362 return !conditionMatched && next.match(matcher, i, seq); |
5361 } |
5363 } |
5362 } |
5364 } |
5363 |
5365 |
5364 /** |
5366 /** |
5365 * Returns the set union of two CharProperty nodes. |
|
5366 */ |
|
5367 private static CharProperty union(final CharProperty lhs, |
|
5368 final CharProperty rhs) { |
|
5369 return new CharProperty() { |
|
5370 boolean isSatisfiedBy(int ch) { |
|
5371 return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}}; |
|
5372 } |
|
5373 |
|
5374 /** |
|
5375 * Returns the set intersection of two CharProperty nodes. |
|
5376 */ |
|
5377 private static CharProperty intersection(final CharProperty lhs, |
|
5378 final CharProperty rhs) { |
|
5379 return new CharProperty() { |
|
5380 boolean isSatisfiedBy(int ch) { |
|
5381 return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}}; |
|
5382 } |
|
5383 |
|
5384 /** |
|
5385 * Returns the set difference of two CharProperty nodes. |
|
5386 */ |
|
5387 private static CharProperty setDifference(final CharProperty lhs, |
|
5388 final CharProperty rhs) { |
|
5389 return new CharProperty() { |
|
5390 boolean isSatisfiedBy(int ch) { |
|
5391 return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}}; |
|
5392 } |
|
5393 |
|
5394 /** |
|
5395 * Handles word boundaries. Includes a field to allow this one class to |
5367 * Handles word boundaries. Includes a field to allow this one class to |
5396 * deal with the different types of word boundaries we can match. The word |
5368 * deal with the different types of word boundaries we can match. The word |
5397 * characters include underscores, letters, and digits. Non spacing marks |
5369 * characters include underscores, letters, and digits. Non spacing marks |
5398 * can are also part of a word if they have a base character, otherwise |
5370 * can are also part of a word if they have a base character, otherwise |
5399 * they are ignored for purposes of finding word boundaries. |
5371 * they are ignored for purposes of finding word boundaries. |
5409 type = n; |
5381 type = n; |
5410 this.useUWORD = useUWORD; |
5382 this.useUWORD = useUWORD; |
5411 } |
5383 } |
5412 |
5384 |
5413 boolean isWord(int ch) { |
5385 boolean isWord(int ch) { |
5414 return useUWORD ? UnicodeProp.WORD.is(ch) |
5386 return useUWORD ? CharPredicates.WORD.is(ch) |
5415 : (ch == '_' || Character.isLetterOrDigit(ch)); |
5387 : (ch == '_' || Character.isLetterOrDigit(ch)); |
5416 } |
5388 } |
5417 |
5389 |
5418 int check(Matcher matcher, int i, CharSequence seq) { |
5390 int check(Matcher matcher, int i, CharSequence seq) { |
5419 int ch; |
5391 int ch; |
5655 matcher.hitEnd = true; |
5627 matcher.hitEnd = true; |
5656 return false; |
5628 return false; |
5657 } |
5629 } |
5658 } |
5630 } |
5659 |
5631 |
5660 /////////////////////////////////////////////////////////////////////////////// |
5632 @FunctionalInterface |
5661 /////////////////////////////////////////////////////////////////////////////// |
5633 static interface CharPredicate { |
5634 boolean is(int ch); |
|
5635 |
|
5636 default CharPredicate and(CharPredicate p) { |
|
5637 return ch -> is(ch) && p.is(ch); |
|
5638 } |
|
5639 default CharPredicate union(CharPredicate p) { |
|
5640 return ch -> is(ch) || p.is(ch); |
|
5641 } |
|
5642 default CharPredicate union(CharPredicate p1, |
|
5643 CharPredicate p2 ) { |
|
5644 return ch -> is(ch) || p1.is(ch) || p2.is(ch); |
|
5645 } |
|
5646 default CharPredicate negate() { |
|
5647 return ch -> !is(ch); |
|
5648 } |
|
5649 } |
|
5650 |
|
5651 static interface BmpCharPredicate extends CharPredicate { |
|
5652 |
|
5653 default CharPredicate and(CharPredicate p) { |
|
5654 if(p instanceof BmpCharPredicate) |
|
5655 return (BmpCharPredicate)(ch -> is(ch) && p.is(ch)); |
|
5656 return ch -> is(ch) && p.is(ch); |
|
5657 } |
|
5658 default CharPredicate union(CharPredicate p) { |
|
5659 if (p instanceof BmpCharPredicate) |
|
5660 return (BmpCharPredicate)(ch -> is(ch) || p.is(ch)); |
|
5661 return ch -> is(ch) || p.is(ch); |
|
5662 } |
|
5663 static CharPredicate union(CharPredicate... predicates) { |
|
5664 CharPredicate cp = ch -> { |
|
5665 for (CharPredicate p : predicates) { |
|
5666 if (!p.is(ch)) |
|
5667 return false; |
|
5668 } |
|
5669 return true; |
|
5670 }; |
|
5671 for (CharPredicate p : predicates) { |
|
5672 if (! (p instanceof BmpCharPredicate)) |
|
5673 return cp; |
|
5674 } |
|
5675 return (BmpCharPredicate)cp; |
|
5676 } |
|
5677 } |
|
5678 |
|
5679 /** |
|
5680 * matches a Perl vertical whitespace |
|
5681 */ |
|
5682 static BmpCharPredicate VertWS = cp -> |
|
5683 (cp >= 0x0A && cp <= 0x0D) || cp == 0x85 || cp == 0x2028 || cp == 0x2029; |
|
5684 |
|
5685 /** |
|
5686 * matches a Perl horizontal whitespace |
|
5687 */ |
|
5688 static BmpCharPredicate HorizWS = cp -> |
|
5689 cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 || |
|
5690 cp == 0x180e || cp >= 0x2000 && cp <= 0x200a || cp == 0x202f || |
|
5691 cp == 0x205f || cp == 0x3000; |
|
5692 |
|
5693 /** |
|
5694 * for the Unicode category ALL and the dot metacharacter when |
|
5695 * in dotall mode. |
|
5696 */ |
|
5697 static CharPredicate ALL = ch -> true; |
|
5698 |
|
5699 /** |
|
5700 * for the dot metacharacter when dotall is not enabled. |
|
5701 */ |
|
5702 static CharPredicate DOT = ch -> (ch != '\n' && ch != '\r' |
|
5703 && (ch|1) != '\u2029' |
|
5704 && ch != '\u0085'); |
|
5705 /** |
|
5706 * the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled. |
|
5707 */ |
|
5708 static CharPredicate UNIXDOT = ch -> ch != '\n'; |
|
5709 |
|
5710 /** |
|
5711 * Indicate that matches a Supplementary Unicode character |
|
5712 */ |
|
5713 static CharPredicate SingleS(int c) { |
|
5714 return ch -> ch == c; |
|
5715 } |
|
5716 |
|
5717 /** |
|
5718 * A bmp/optimized predicate of single |
|
5719 */ |
|
5720 static BmpCharPredicate Single(int c) { |
|
5721 return ch -> ch == c; |
|
5722 } |
|
5723 |
|
5724 /** |
|
5725 * Case insensitive matches a given BMP character |
|
5726 */ |
|
5727 static BmpCharPredicate SingleI(int lower, int upper) { |
|
5728 return ch -> ch == lower || ch == upper; |
|
5729 } |
|
5730 |
|
5731 /** |
|
5732 * Unicode case insensitive matches a given Unicode character |
|
5733 */ |
|
5734 static CharPredicate SingleU(int lower) { |
|
5735 return ch -> lower == ch || |
|
5736 lower == Character.toLowerCase(Character.toUpperCase(ch)); |
|
5737 } |
|
5738 |
|
5739 private static boolean inRange(int lower, int ch, int upper) { |
|
5740 return lower <= ch && ch <= upper; |
|
5741 } |
|
5742 |
|
5743 /** |
|
5744 * Charactrs within a explicit value range |
|
5745 */ |
|
5746 static CharPredicate Range(int lower, int upper) { |
|
5747 if (upper < Character.MIN_HIGH_SURROGATE || |
|
5748 lower > Character.MAX_HIGH_SURROGATE && |
|
5749 upper < Character.MIN_SUPPLEMENTARY_CODE_POINT) |
|
5750 return (BmpCharPredicate)(ch -> inRange(lower, ch, upper)); |
|
5751 return ch -> inRange(lower, ch, upper); |
|
5752 } |
|
5753 |
|
5754 /** |
|
5755 * Charactrs within a explicit value range in a case insensitive manner. |
|
5756 */ |
|
5757 static CharPredicate CIRange(int lower, int upper) { |
|
5758 return ch -> inRange(lower, ch, upper) || |
|
5759 ASCII.isAscii(ch) && |
|
5760 (inRange(lower, ASCII.toUpper(ch), upper) || |
|
5761 inRange(lower, ASCII.toLower(ch), upper)); |
|
5762 } |
|
5763 |
|
5764 static CharPredicate CIRangeU(int lower, int upper) { |
|
5765 return ch -> { |
|
5766 if (inRange(lower, ch, upper)) |
|
5767 return true; |
|
5768 int up = Character.toUpperCase(ch); |
|
5769 return inRange(lower, up, upper) || |
|
5770 inRange(lower, Character.toLowerCase(up), upper); |
|
5771 }; |
|
5772 } |
|
5662 |
5773 |
5663 /** |
5774 /** |
5664 * This must be the very first initializer. |
5775 * This must be the very first initializer. |
5665 */ |
5776 */ |
5666 static Node accept = new Node(); |
5777 static final Node accept = new Node(); |
5667 |
5778 |
5668 static Node lastAccept = new LastNode(); |
5779 static final Node lastAccept = new LastNode(); |
5669 |
|
5670 private static class CharPropertyNames { |
|
5671 |
|
5672 static CharProperty charPropertyFor(String name) { |
|
5673 CharPropertyFactory m = map.get(name); |
|
5674 return m == null ? null : m.make(); |
|
5675 } |
|
5676 |
|
5677 private abstract static class CharPropertyFactory { |
|
5678 abstract CharProperty make(); |
|
5679 } |
|
5680 |
|
5681 private static void defCategory(String name, |
|
5682 final int typeMask) { |
|
5683 map.put(name, new CharPropertyFactory() { |
|
5684 CharProperty make() { return new Category(typeMask);}}); |
|
5685 } |
|
5686 |
|
5687 private static void defRange(String name, |
|
5688 final int lower, final int upper) { |
|
5689 map.put(name, new CharPropertyFactory() { |
|
5690 CharProperty make() { return rangeFor(lower, upper);}}); |
|
5691 } |
|
5692 |
|
5693 private static void defCtype(String name, |
|
5694 final int ctype) { |
|
5695 map.put(name, new CharPropertyFactory() { |
|
5696 CharProperty make() { return new Ctype(ctype);}}); |
|
5697 } |
|
5698 |
|
5699 private abstract static class CloneableProperty |
|
5700 extends CharProperty implements Cloneable |
|
5701 { |
|
5702 public CloneableProperty clone() { |
|
5703 try { |
|
5704 return (CloneableProperty) super.clone(); |
|
5705 } catch (CloneNotSupportedException e) { |
|
5706 throw new AssertionError(e); |
|
5707 } |
|
5708 } |
|
5709 } |
|
5710 |
|
5711 private static void defClone(String name, |
|
5712 final CloneableProperty p) { |
|
5713 map.put(name, new CharPropertyFactory() { |
|
5714 CharProperty make() { return p.clone();}}); |
|
5715 } |
|
5716 |
|
5717 private static final HashMap<String, CharPropertyFactory> map |
|
5718 = new HashMap<>(); |
|
5719 |
|
5720 static { |
|
5721 // Unicode character property aliases, defined in |
|
5722 // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt |
|
5723 defCategory("Cn", 1<<Character.UNASSIGNED); |
|
5724 defCategory("Lu", 1<<Character.UPPERCASE_LETTER); |
|
5725 defCategory("Ll", 1<<Character.LOWERCASE_LETTER); |
|
5726 defCategory("Lt", 1<<Character.TITLECASE_LETTER); |
|
5727 defCategory("Lm", 1<<Character.MODIFIER_LETTER); |
|
5728 defCategory("Lo", 1<<Character.OTHER_LETTER); |
|
5729 defCategory("Mn", 1<<Character.NON_SPACING_MARK); |
|
5730 defCategory("Me", 1<<Character.ENCLOSING_MARK); |
|
5731 defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK); |
|
5732 defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER); |
|
5733 defCategory("Nl", 1<<Character.LETTER_NUMBER); |
|
5734 defCategory("No", 1<<Character.OTHER_NUMBER); |
|
5735 defCategory("Zs", 1<<Character.SPACE_SEPARATOR); |
|
5736 defCategory("Zl", 1<<Character.LINE_SEPARATOR); |
|
5737 defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR); |
|
5738 defCategory("Cc", 1<<Character.CONTROL); |
|
5739 defCategory("Cf", 1<<Character.FORMAT); |
|
5740 defCategory("Co", 1<<Character.PRIVATE_USE); |
|
5741 defCategory("Cs", 1<<Character.SURROGATE); |
|
5742 defCategory("Pd", 1<<Character.DASH_PUNCTUATION); |
|
5743 defCategory("Ps", 1<<Character.START_PUNCTUATION); |
|
5744 defCategory("Pe", 1<<Character.END_PUNCTUATION); |
|
5745 defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION); |
|
5746 defCategory("Po", 1<<Character.OTHER_PUNCTUATION); |
|
5747 defCategory("Sm", 1<<Character.MATH_SYMBOL); |
|
5748 defCategory("Sc", 1<<Character.CURRENCY_SYMBOL); |
|
5749 defCategory("Sk", 1<<Character.MODIFIER_SYMBOL); |
|
5750 defCategory("So", 1<<Character.OTHER_SYMBOL); |
|
5751 defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION); |
|
5752 defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION); |
|
5753 defCategory("L", ((1<<Character.UPPERCASE_LETTER) | |
|
5754 (1<<Character.LOWERCASE_LETTER) | |
|
5755 (1<<Character.TITLECASE_LETTER) | |
|
5756 (1<<Character.MODIFIER_LETTER) | |
|
5757 (1<<Character.OTHER_LETTER))); |
|
5758 defCategory("M", ((1<<Character.NON_SPACING_MARK) | |
|
5759 (1<<Character.ENCLOSING_MARK) | |
|
5760 (1<<Character.COMBINING_SPACING_MARK))); |
|
5761 defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) | |
|
5762 (1<<Character.LETTER_NUMBER) | |
|
5763 (1<<Character.OTHER_NUMBER))); |
|
5764 defCategory("Z", ((1<<Character.SPACE_SEPARATOR) | |
|
5765 (1<<Character.LINE_SEPARATOR) | |
|
5766 (1<<Character.PARAGRAPH_SEPARATOR))); |
|
5767 defCategory("C", ((1<<Character.CONTROL) | |
|
5768 (1<<Character.FORMAT) | |
|
5769 (1<<Character.PRIVATE_USE) | |
|
5770 (1<<Character.SURROGATE))); // Other |
|
5771 defCategory("P", ((1<<Character.DASH_PUNCTUATION) | |
|
5772 (1<<Character.START_PUNCTUATION) | |
|
5773 (1<<Character.END_PUNCTUATION) | |
|
5774 (1<<Character.CONNECTOR_PUNCTUATION) | |
|
5775 (1<<Character.OTHER_PUNCTUATION) | |
|
5776 (1<<Character.INITIAL_QUOTE_PUNCTUATION) | |
|
5777 (1<<Character.FINAL_QUOTE_PUNCTUATION))); |
|
5778 defCategory("S", ((1<<Character.MATH_SYMBOL) | |
|
5779 (1<<Character.CURRENCY_SYMBOL) | |
|
5780 (1<<Character.MODIFIER_SYMBOL) | |
|
5781 (1<<Character.OTHER_SYMBOL))); |
|
5782 defCategory("LC", ((1<<Character.UPPERCASE_LETTER) | |
|
5783 (1<<Character.LOWERCASE_LETTER) | |
|
5784 (1<<Character.TITLECASE_LETTER))); |
|
5785 defCategory("LD", ((1<<Character.UPPERCASE_LETTER) | |
|
5786 (1<<Character.LOWERCASE_LETTER) | |
|
5787 (1<<Character.TITLECASE_LETTER) | |
|
5788 (1<<Character.MODIFIER_LETTER) | |
|
5789 (1<<Character.OTHER_LETTER) | |
|
5790 (1<<Character.DECIMAL_DIGIT_NUMBER))); |
|
5791 defRange("L1", 0x00, 0xFF); // Latin-1 |
|
5792 map.put("all", new CharPropertyFactory() { |
|
5793 CharProperty make() { return new All(); }}); |
|
5794 |
|
5795 // Posix regular expression character classes, defined in |
|
5796 // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html |
|
5797 defRange("ASCII", 0x00, 0x7F); // ASCII |
|
5798 defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters |
|
5799 defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters |
|
5800 defCtype("Blank", ASCII.BLANK); // Space and tab characters |
|
5801 defCtype("Cntrl", ASCII.CNTRL); // Control characters |
|
5802 defRange("Digit", '0', '9'); // Numeric characters |
|
5803 defCtype("Graph", ASCII.GRAPH); // printable and visible |
|
5804 defRange("Lower", 'a', 'z'); // Lower-case alphabetic |
|
5805 defRange("Print", 0x20, 0x7E); // Printable characters |
|
5806 defCtype("Punct", ASCII.PUNCT); // Punctuation characters |
|
5807 defCtype("Space", ASCII.SPACE); // Space characters |
|
5808 defRange("Upper", 'A', 'Z'); // Upper-case alphabetic |
|
5809 defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits |
|
5810 |
|
5811 // Java character properties, defined by methods in Character.java |
|
5812 defClone("javaLowerCase", new CloneableProperty() { |
|
5813 boolean isSatisfiedBy(int ch) { |
|
5814 return Character.isLowerCase(ch);}}); |
|
5815 defClone("javaUpperCase", new CloneableProperty() { |
|
5816 boolean isSatisfiedBy(int ch) { |
|
5817 return Character.isUpperCase(ch);}}); |
|
5818 defClone("javaAlphabetic", new CloneableProperty() { |
|
5819 boolean isSatisfiedBy(int ch) { |
|
5820 return Character.isAlphabetic(ch);}}); |
|
5821 defClone("javaIdeographic", new CloneableProperty() { |
|
5822 boolean isSatisfiedBy(int ch) { |
|
5823 return Character.isIdeographic(ch);}}); |
|
5824 defClone("javaTitleCase", new CloneableProperty() { |
|
5825 boolean isSatisfiedBy(int ch) { |
|
5826 return Character.isTitleCase(ch);}}); |
|
5827 defClone("javaDigit", new CloneableProperty() { |
|
5828 boolean isSatisfiedBy(int ch) { |
|
5829 return Character.isDigit(ch);}}); |
|
5830 defClone("javaDefined", new CloneableProperty() { |
|
5831 boolean isSatisfiedBy(int ch) { |
|
5832 return Character.isDefined(ch);}}); |
|
5833 defClone("javaLetter", new CloneableProperty() { |
|
5834 boolean isSatisfiedBy(int ch) { |
|
5835 return Character.isLetter(ch);}}); |
|
5836 defClone("javaLetterOrDigit", new CloneableProperty() { |
|
5837 boolean isSatisfiedBy(int ch) { |
|
5838 return Character.isLetterOrDigit(ch);}}); |
|
5839 defClone("javaJavaIdentifierStart", new CloneableProperty() { |
|
5840 boolean isSatisfiedBy(int ch) { |
|
5841 return Character.isJavaIdentifierStart(ch);}}); |
|
5842 defClone("javaJavaIdentifierPart", new CloneableProperty() { |
|
5843 boolean isSatisfiedBy(int ch) { |
|
5844 return Character.isJavaIdentifierPart(ch);}}); |
|
5845 defClone("javaUnicodeIdentifierStart", new CloneableProperty() { |
|
5846 boolean isSatisfiedBy(int ch) { |
|
5847 return Character.isUnicodeIdentifierStart(ch);}}); |
|
5848 defClone("javaUnicodeIdentifierPart", new CloneableProperty() { |
|
5849 boolean isSatisfiedBy(int ch) { |
|
5850 return Character.isUnicodeIdentifierPart(ch);}}); |
|
5851 defClone("javaIdentifierIgnorable", new CloneableProperty() { |
|
5852 boolean isSatisfiedBy(int ch) { |
|
5853 return Character.isIdentifierIgnorable(ch);}}); |
|
5854 defClone("javaSpaceChar", new CloneableProperty() { |
|
5855 boolean isSatisfiedBy(int ch) { |
|
5856 return Character.isSpaceChar(ch);}}); |
|
5857 defClone("javaWhitespace", new CloneableProperty() { |
|
5858 boolean isSatisfiedBy(int ch) { |
|
5859 return Character.isWhitespace(ch);}}); |
|
5860 defClone("javaISOControl", new CloneableProperty() { |
|
5861 boolean isSatisfiedBy(int ch) { |
|
5862 return Character.isISOControl(ch);}}); |
|
5863 defClone("javaMirrored", new CloneableProperty() { |
|
5864 boolean isSatisfiedBy(int ch) { |
|
5865 return Character.isMirrored(ch);}}); |
|
5866 } |
|
5867 } |
|
5868 |
5780 |
5869 /** |
5781 /** |
5870 * Creates a predicate which can be used to match a string. |
5782 * Creates a predicate which can be used to match a string. |
5871 * |
5783 * |
5872 * @return The predicate which can be used for matching on a string |
5784 * @return The predicate which can be used for matching on a string |