jdk-sandbox: comparison jdk/src/java.base/share/classes/java/util/regex/Pattern.java

equal deleted inserted replaced

-:fda31effa7b3
+:e7f3cf12e739
 /*
-* Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
+* Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 */
 package java.util.regex;
 import java.text.Normalizer;
+import java.text.Normalizer.Form;
 import java.util.Locale;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
 import java.util.Arrays;
 import java.util.NoSuchElementException;
 import java.util.Spliterator;
 import java.util.Spliterators;
 import java.util.function.Predicate;
 * Temporary storage used by parsing pattern slice.
 */
 transient int[] buffer;
 /**
+* A temporary storage used for predicate for double return.
+*/
+transient CharPredicate predicate;
+/**
 * Map the "name" of the "named capturing group" to its group id
 * node.
 */
 transient volatile Map<String, Integer> namedGroups;
 /**
 * Temporary storage used while parsing group references.
 */
 transient GroupHead[] groupNodes;
+/**
+* Temporary storage used to store the top level closure nodes.
+*/
+transient List<Node> topClosureNodes;
+/**
+* The number of top greedy closure nodes in this Pattern. Used by
+* matchers to allocate storage needed for a IntHashSet to keep the
+* beginning pos {@code i} of all failed match.
+*/
+transient int localTCNCount;
+/*
+* Turn off the stop-exponential-backtracking optimization if there
+* is a group ref in the pattern.
+*/
+transient boolean hasGroupRef;
 /**
 * Temporary null terminated code point array used by pattern compiling.
 */
 private transient int[] temp;
 /**
 * If the Start node might possibly match supplementary characters.
 * It is set to true during compiling if
 * (1) There is supplementary char in pattern, or
-* (2) There is complement node of Category or Block
+* (2) There is complement node of a "family" CharProperty
 */
 private transient boolean hasSupplementary;
 /**
 * Compiles the given regular expression into a pattern.
 s.defaultReadObject();
 // Initialize counts
 capturingGroupCount = 1;
 localCount = 0;
+localTCNCount = 0;
 // if length > 0, the Pattern is lazily compiled
 if (pattern.length() == 0) {
 root = new Start(lastAccept);
 matchRoot = lastAccept;
 flags |= UNICODE_CASE;
 // Reset group index count
 capturingGroupCount = 1;
 localCount = 0;
+localTCNCount = 0;
 if (pattern.length() > 0) {
 compile();
 } else {
 root = new Start(lastAccept);
 matchRoot = lastAccept;
 }
 }
 /**
-* The pattern is converted to normalized form ({@linkplain
+* The pattern is converted to normalized form ({@link
-* java.text.Normalizer.Form.NFD NFD}, canonical decomposition)
+* java.text.Normalizer.Form.NFC NFC}, canonical decomposition,
-* and then a pure group is constructed to match canonical
+* followed by canonical composition for the character class
-* equivalences of the characters.
+* part, and {@link java.text.Normalizer.Form.NFD NFD},
-*/
+* canonical decomposition) for the rest), and then a pure
-private void normalize() {
+* group is constructed to match canonical equivalences of the
-int lastCodePoint = -1;
+* characters.
+*/
-// Convert pattern into normalized form
+private static String normalize(String pattern) {
-normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
+int plen = pattern.length();
-patternLength = normalizedPattern.length();
+StringBuilder pbuf = new StringBuilder(plen);
+char last = 0;
-// Modify pattern to match canonical equivalences
+int lastStart = 0;
-StringBuilder newPattern = new StringBuilder(patternLength);
+char cc = 0;
-for(int i=0; i<patternLength; ) {
+for (int i = 0; i < plen;) {
-int c = normalizedPattern.codePointAt(i);
+char c = pattern.charAt(i);
-StringBuilder sequenceBuffer;
+if (cc == 0 &&    // top level
-if ((Character.getType(c) == Character.NON_SPACING_MARK)
+c == '\\' && i + 1 < plen && pattern.charAt(i + 1) == '\\') {
-&& (lastCodePoint != -1)) {
+i += 2; last = 0;
-sequenceBuffer = new StringBuilder();
+continue;
-sequenceBuffer.appendCodePoint(lastCodePoint);
+}
-sequenceBuffer.appendCodePoint(c);
+if (c == '[' && last != '\\') {
-while(Character.getType(c) == Character.NON_SPACING_MARK) {
+if (cc == 0) {
-i += Character.charCount(c);
+if (lastStart < i)
-if (i >= patternLength)
+normalizeSlice(pattern, lastStart, i, pbuf);
-break;
+lastStart = i;
-c = normalizedPattern.codePointAt(i);
+}
-sequenceBuffer.appendCodePoint(c);
+cc++;
-}
+} else if (c == ']' && last != '\\') {
-String ea = produceEquivalentAlternation(
+cc--;
-sequenceBuffer.toString());
+if (cc == 0) {
-newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
+normalizeClazz(pattern, lastStart, i + 1, pbuf);
-newPattern.append("(?:").append(ea).append(")");
+lastStart = i + 1;
-} else if (c == '[' && lastCodePoint != '\\') {
+}
-i = normalizeCharClass(newPattern, i);
+}
-} else {
+last = c;
-newPattern.appendCodePoint(c);
+i++;
 }
-lastCodePoint = c;
+assert (cc == 0);
-i += Character.charCount(c);
+if (lastStart < plen)
-}
+normalizeSlice(pattern, lastStart, plen, pbuf);
-normalizedPattern = newPattern.toString();
+return pbuf.toString();
 }
-/**
+private static void normalizeSlice(String src, int off, int limit,
-* Complete the character class being parsed and add a set
+StringBuilder dst)
-* of alternations to it that will match the canonical equivalences
+{
-* of the characters within the class.
+int len = src.length();
-*/
+int off0 = off;
-private int normalizeCharClass(StringBuilder newPattern, int i) {
+while (off < limit && ASCII.isAscii(src.charAt(off))) {
-StringBuilder charClass = new StringBuilder();
+off++;
-StringBuilder eq = null;
+}
-int lastCodePoint = -1;
+if (off == limit) {
-String result;
+dst.append(src, off0, limit);
+return;
-i++;
+}
-charClass.append("[");
+off--;
-while(true) {
+if (off < off0)
-int c = normalizedPattern.codePointAt(i);
+off = off0;
-StringBuilder sequenceBuffer;
+else
+dst.append(src, off0, off);
-if (c == ']' && lastCodePoint != '\\') {
+while (off < limit) {
-charClass.append((char)c);
+int ch0 = src.codePointAt(off);
-break;
+if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) {
-} else if (Character.getType(c) == Character.NON_SPACING_MARK) {
+dst.append((char)ch0);
-sequenceBuffer = new StringBuilder();
+off++;
-sequenceBuffer.appendCodePoint(lastCodePoint);
+continue;
-while(Character.getType(c) == Character.NON_SPACING_MARK) {
+}
-sequenceBuffer.appendCodePoint(c);
+int j = off + Character.charCount(ch0);
-i += Character.charCount(c);
+int ch1;
-if (i >= normalizedPattern.length())
+while (j < limit) {
-break;
+ch1 = src.codePointAt(j);
-c = normalizedPattern.codePointAt(i);
+if (Grapheme.isBoundary(ch0, ch1))
-}
+break;
-String ea = produceEquivalentAlternation(
+ch0 = ch1;
-sequenceBuffer.toString());
+j += Character.charCount(ch1);
+}
-charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
+String seq = src.substring(off, j);
-if (eq == null)
+String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD);
-eq = new StringBuilder();
+off = j;
-eq.append('|');
+if (nfd.length() > 1) {
-eq.append(ea);
+ch0 = nfd.codePointAt(0);
-} else {
+ch1 = nfd.codePointAt(Character.charCount(ch0));
-charClass.appendCodePoint(c);
+if (Character.getType(ch1) == Character.NON_SPACING_MARK) {
-i++;
+Set<String> altns = new LinkedHashSet<>();
-}
+altns.add(seq);
-if (i == normalizedPattern.length())
+produceEquivalentAlternation(nfd, altns);
-throw error("Unclosed character class");
+dst.append("(?:");
-lastCodePoint = c;
+altns.forEach( s -> dst.append(s + "|"));
-}
+dst.delete(dst.length() - 1, dst.length());
+dst.append(")");
-if (eq != null) {
+continue;
-result = "(?:"+charClass.toString()+eq.toString()+")";
+}
-} else {
+}
-result = charClass.toString();
+String nfc = Normalizer.normalize(seq, Normalizer.Form.NFC);
-}
+if (!seq.equals(nfc) && !nfd.equals(nfc))
+dst.append("(?:" + seq + "|" + nfd  + "|" + nfc + ")");
-newPattern.append(result);
+else if (!seq.equals(nfd))
-return i;
+dst.append("(?:" + seq + "|" + nfd + ")");
+else
+dst.append(seq);
+}
+}
+private static void normalizeClazz(String src, int off, int limit,
+StringBuilder dst)
+{
+dst.append(Normalizer.normalize(src.substring(off, limit), Form.NFC));
 }
 /**
 * Given a specific sequence composed of a regular character and
 * combining marks that follow it, produce the alternation that will
 * match all canonical equivalences of that sequence.
 */
-private String produceEquivalentAlternation(String source) {
+private static void produceEquivalentAlternation(String src,
-int len = countChars(source, 0, 1);
+Set<String> dst)
-if (source.length() == len)
+{
-// source has one character.
+int len = countChars(src, 0, 1);
-return source;
+if (src.length() == len) {
+dst.add(src);  // source has one character.
-String base = source.substring(0,len);
+return;
-String combiningMarks = source.substring(len);
+}
+String base = src.substring(0,len);
+String combiningMarks = src.substring(len);
 String[] perms = producePermutations(combiningMarks);
-StringBuilder result = new StringBuilder(source);
 // Add combined permutations
-for(int x=0; x<perms.length; x++) {
+for(int x = 0; x < perms.length; x++) {
 String next = base + perms[x];
-if (x>0)
+dst.add(next);
-result.append("|"+next);
 next = composeOneStep(next);
-if (next != null)
+if (next != null) {
-result.append("|"+produceEquivalentAlternation(next));
+produceEquivalentAlternation(next, dst);
 }
-return result.toString();
+}
 }
 /**
 * Returns an array of strings that have all the possible
 * permutations of the characters in the input string.
 * of a set of combining marks. Note that some of the permutations
 * are invalid because of combining class collisions, and these
 * possibilities must be removed because they are not canonically
 * equivalent.
 */
-private String[] producePermutations(String input) {
+private static String[] producePermutations(String input) {
 if (input.length() == countChars(input, 0, 1))
 return new String[] {input};
 if (input.length() == countChars(input, 0, 2)) {
 int c0 = Character.codePointAt(input, 0);
 String[] result = new String[index];
 System.arraycopy(temp, 0, result, 0, index);
 return result;
 }
-private int getClass(int c) {
+private static int getClass(int c) {
 return sun.text.Normalizer.getCombiningClass(c);
 }
 /**
 * Attempts to compose input by combining the first character
 * with the first combining mark following it. Returns a String
 * that is the composition of the leading character with its first
 * combining mark followed by the remaining combining marks. Returns
 * null if the first two characters cannot be further composed.
 */
-private String composeOneStep(String input) {
+private static String composeOneStep(String input) {
 int len = countChars(input, 0, 2);
 String firstTwoCharacters = input.substring(0, len);
 String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
 if (result.equals(firstTwoCharacters))
 return null;
 else {
 String remainder = input.substring(len);
 return result + remainder;
 * of the expression which will create the object tree.
 */
 private void compile() {
 // Handle canonical equivalences
 if (has(CANON_EQ) && !has(LITERAL)) {
-normalize();
+normalizedPattern = normalize(pattern);
 } else {
 normalizedPattern = pattern;
 }
 patternLength = normalizedPattern.length();
 // Allocate all temporary objects here.
 buffer = new int[32];
 groupNodes = new GroupHead[10];
 namedGroups = null;
+topClosureNodes = new ArrayList<>(10);
 if (has(LITERAL)) {
 // Literal pattern handling
 matchRoot = newSlice(temp, patternLength, hasSupplementary);
 matchRoot.next = lastAccept;
 root = matchRoot;
 } else {
 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
 }
+// Optimize the greedy Loop to prevent exponential backtracking, IF there
+// is no group ref in this pattern. With a non-negative localTCNCount value,
+// the greedy type Loop, Curly will skip the backtracking for any starting
+// position "i" that failed in the past.
+if (!hasGroupRef) {
+for (Node node : topClosureNodes) {
+if (node instanceof Loop) {
+// non-deterministic-greedy-group
+((Loop)node).posIndex = localTCNCount++;
+}
+}
+}
 // Release temporary storage
 temp = null;
 buffer = null;
 groupNodes = null;
 patternLength = 0;
 compiled = true;
+topClosureNodes = null;
 }
 Map<String, Integer> namedGroups() {
 Map<String, Integer> groups = namedGroups;
 if (groups == null) {
 namedGroups = groups = new HashMap<>(2);
 }
 return groups;
-}
-/**
-* Used to print out a subtree of the Pattern to help with debugging.
-*/
-private static void printObjectTree(Node node) {
-while(node != null) {
-if (node instanceof Prolog) {
-System.out.println(node);
-printObjectTree(((Prolog)node).loop);
-System.out.println("**** end contents prolog loop");
-} else if (node instanceof Loop) {
-System.out.println(node);
-printObjectTree(((Loop)node).body);
-System.out.println("**** end contents Loop body");
-} else if (node instanceof Curly) {
-System.out.println(node);
-printObjectTree(((Curly)node).atom);
-System.out.println("**** end contents Curly body");
-} else if (node instanceof GroupCurly) {
-System.out.println(node);
-printObjectTree(((GroupCurly)node).atom);
-System.out.println("**** end contents GroupCurly body");
-} else if (node instanceof GroupTail) {
-System.out.println(node);
-System.out.println("Tail next is "+node.next);
-return;
-} else {
-System.out.println(node);
-}
-node = node.next;
-if (node != null)
-System.out.println("->next:");
-if (node == Pattern.accept) {
-System.out.println("Accept Node");
-node = null;
-}
-}
 }
 /**
 * Used to accumulate information about a subtree of the object graph
 * so that optimizations can be applied to the subtree.
 tail.next = node;
 // Double return: Tail was returned in root
 tail = root;
 continue;
 case '[':
-node = clazz(true);
+if (has(CANON_EQ) && !has(LITERAL))
+node = new NFCCharProperty(clazz(true));
+else
+node = newCharProperty(clazz(true));
 break;
 case '\\':
 ch = nextEscaped();
 if (ch == 'p' || ch == 'P') {
 boolean oneLetter = true;
 if (ch != '{') {
 unread();
 } else {
 oneLetter = false;
 }
-node = family(oneLetter, comp);
+// node = newCharProperty(family(oneLetter, comp));
+if (has(CANON_EQ) && !has(LITERAL))
+node = new NFCCharProperty(family(oneLetter, comp));
+else
+node = newCharProperty(family(oneLetter, comp));
 } else {
 unread();
 node = atom();
 }
 break;
 node = new Dollar(has(MULTILINE));
 break;
 case '.':
 next();
 if (has(DOTALL)) {
-node = new All();
+node = new CharProperty(ALL);
 } else {
-if (has(UNIX_LINES))
+if (has(UNIX_LINES)) {
-node = new UnixDot();
+node = new CharProperty(UNIXDOT);
-else {
+} else {
-node = new Dot();
+node = new CharProperty(DOT);
 }
 }
 break;
 case '|':
 case ')':
 node = atom();
 break;
 }
 node = closure(node);
+/* save the top dot-greedy nodes (.*, .+) as well
+if (node instanceof GreedyCharProperty &&
+((GreedyCharProperty)node).cp instanceof Dot) {
+topClosureNodes.add(node);
+}
+*/
 if (head == null) {
 head = tail = node;
 } else {
 tail.next = node;
 tail = node;
 ch = next(); // Consume { if present
 if (ch != '{')
 unread();
 else
 oneLetter = false;
-return family(oneLetter, comp);
+if (has(CANON_EQ) && !has(LITERAL))
+return new NFCCharProperty(family(oneLetter, comp));
+else
+return newCharProperty(family(oneLetter, comp));
 }
 }
 unread();
 prev = cursor;
 ch = escape(false, first == 0, false);
 continue;
 }
 break;
 }
 if (first == 1) {
-return newSingle(buffer[0]);
+return newCharProperty(single(buffer[0]));
 } else {
 return newSlice(buffer, first, hasSupplementary);
 }
 }
 default:
 done = true;
 break;
 }
 }
+hasGroupRef = true;
 if (has(CASE_INSENSITIVE))
 return new CIBackRef(refNum, has(UNICODE_CASE));
 else
 return new BackRef(refNum);
 }
 if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
 return -1;
 case 'C':
 break;
 case 'D':
-if (create) root = has(UNICODE_CHARACTER_CLASS)
+if (create) {
-? new Utype(UnicodeProp.DIGIT).complement()
+predicate = has(UNICODE_CHARACTER_CLASS) ?
-: new Ctype(ASCII.DIGIT).complement();
+CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
+predicate = predicate.negate();
+if (!inclass)
+root = newCharProperty(predicate);
+}
 return -1;
 case 'E':
 case 'F':
 break;
 case 'G':
 if (inclass) break;
 if (create) root = new LastMatch();
 return -1;
 case 'H':
-if (create) root = new HorizWS().complement();
+if (create) {
+predicate = HorizWS.negate();
+if (!inclass)
+root = newCharProperty(predicate);
+}
 return -1;
 case 'I':
 case 'J':
 case 'K':
 case 'L':
 case 'R':
 if (inclass) break;
 if (create) root = new LineEnding();
 return -1;
 case 'S':
-if (create) root = has(UNICODE_CHARACTER_CLASS)
+if (create) {
-? new Utype(UnicodeProp.WHITE_SPACE).complement()
+predicate = has(UNICODE_CHARACTER_CLASS) ?
-: new Ctype(ASCII.SPACE).complement();
+CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
+predicate = predicate.negate();
+if (!inclass)
+root = newCharProperty(predicate);
+}
 return -1;
 case 'T':
 case 'U':
 break;
 case 'V':
-if (create) root = new VertWS().complement();
+if (create) {
+predicate = VertWS.negate();
+if (!inclass)
+root = newCharProperty(predicate);
+}
 return -1;
 case 'W':
-if (create) root = has(UNICODE_CHARACTER_CLASS)
+if (create) {
-? new Utype(UnicodeProp.WORD).complement()
+predicate = has(UNICODE_CHARACTER_CLASS) ?
-: new Ctype(ASCII.WORD).complement();
+CharPredicates.WORD : CharPredicates.ASCII_WORD;
+predicate = predicate.negate();
+if (!inclass)
+root = newCharProperty(predicate);
+}
 return -1;
 case 'X':
 if (inclass) break;
 if (create) {
 root = new XGrapheme();
 }
 return -1;
 case 'c':
 return c();
 case 'd':
-if (create) root = has(UNICODE_CHARACTER_CLASS)
+if (create) {
-? new Utype(UnicodeProp.DIGIT)
+predicate = has(UNICODE_CHARACTER_CLASS) ?
-: new Ctype(ASCII.DIGIT);
+CharPredicates.DIGIT : CharPredicates.ASCII_DIGIT;
+if (!inclass)
+root = newCharProperty(predicate);
+}
 return -1;
 case 'e':
 return '\033';
 case 'f':
 return '\f';
 case 'g':
 break;
 case 'h':
-if (create) root = new HorizWS();
+if (create) {
+predicate = HorizWS;
+if (!inclass)
+root = newCharProperty(predicate);
+}
 return -1;
 case 'i':
 case 'j':
 break;
 case 'k':
 throw error("\\k is not followed by '<' for named capturing group");
 String name = groupname(read());
 if (!namedGroups().containsKey(name))
 throw error("(named capturing group <"+ name+"> does not exit");
 if (create) {
+hasGroupRef = true;
 if (has(CASE_INSENSITIVE))
 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
 else
 root = new BackRef(namedGroups().get(name));
 }
 case 'q':
 break;
 case 'r':
 return '\r';
 case 's':
-if (create) root = has(UNICODE_CHARACTER_CLASS)
+if (create) {
-? new Utype(UnicodeProp.WHITE_SPACE)
+predicate = has(UNICODE_CHARACTER_CLASS) ?
-: new Ctype(ASCII.SPACE);
+CharPredicates.WHITE_SPACE : CharPredicates.ASCII_SPACE;
+if (!inclass)
+root = newCharProperty(predicate);
+}
 return -1;
 case 't':
 return '\t';
 case 'u':
 return u();
 // the start or end value, such as [\v-...] or [...-\v], in
 // which a single definite value (0x0B) is expected. For
 // compatibility concern '\013'/0x0B is returned if isrange.
 if (isrange)
 return '\013';
-if (create) root = new VertWS();
+if (create) {
+predicate = VertWS;
+if (!inclass)
+root = newCharProperty(predicate);
+}
 return -1;
 case 'w':
-if (create) root = has(UNICODE_CHARACTER_CLASS)
+if (create) {
-? new Utype(UnicodeProp.WORD)
+predicate = has(UNICODE_CHARACTER_CLASS) ?
-: new Ctype(ASCII.WORD);
+CharPredicates.WORD : CharPredicates.ASCII_WORD;
+if (!inclass)
+root = newCharProperty(predicate);
+}
 return -1;
 case 'x':
 return x();
 case 'y':
 break;
 *
 * Consumes a ] on the way out if consume is true. Usually consume
 * is true except for the case of [abc&&def] where def is a separate
 * right hand node with "understood" brackets.
 */
-private CharProperty clazz(boolean consume) {
+private CharPredicate clazz(boolean consume) {
-CharProperty prev = null;
+CharPredicate prev = null;
-CharProperty node = null;
+CharPredicate curr = null;
 BitClass bits = new BitClass();
-boolean include = true;
+BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch];
-boolean firstInClass = true;
+boolean isNeg = false;
+boolean hasBits = false;
 int ch = next();
+// Negates if first char in a class, otherwise literal
+if (ch == '^' && temp[cursor-1] == '[') {
+ch = next();
+isNeg = true;
+}
 for (;;) {
 switch (ch) {
-case '^':
-// Negates if first char in a class, otherwise literal
-if (firstInClass) {
-if (temp[cursor-1] != '[')
-break;
-ch = next();
-include = !include;
-continue;
-} else {
-// ^ not first in class, treat as literal
-break;
-}
 case '[':
-firstInClass = false;
+curr = clazz(true);
-node = clazz(true);
 if (prev == null)
-prev = node;
+prev = curr;
 else
-prev = union(prev, node);
+prev = prev.union(curr);
 ch = peek();
 continue;
 case '&':
-firstInClass = false;
 ch = next();
 if (ch == '&') {
 ch = next();
-CharProperty rightNode = null;
+CharPredicate right = null;
 while (ch != ']' && ch != '&') {
 if (ch == '[') {
-if (rightNode == null)
+if (right == null)
-rightNode = clazz(true);
+right = clazz(true);
 else
-rightNode = union(rightNode, clazz(true));
+right = right.union(clazz(true));
 } else { // abc&&def
 unread();
-rightNode = clazz(false);
+right = clazz(false);
 }
 ch = peek();
 }
-if (rightNode != null)
+if (hasBits) {
-node = rightNode;
+// bits used, union has high precedence
+if (prev == null) {
+prev = curr = bitsP;
+} else {
+prev = prev.union(bitsP);
+}
+hasBits = false;
+}
+if (right != null)
+curr = right;
 if (prev == null) {
-if (rightNode == null)
+if (right == null)
 throw error("Bad class syntax");
 else
-prev = rightNode;
+prev = right;
 } else {
-prev = intersection(prev, node);
+prev = prev.and(curr);
 }
 } else {
 // treat as a literal &
 unread();
 break;
 }
 continue;
 case 0:
-firstInClass = false;
 if (cursor >= patternLength)
 throw error("Unclosed character class");
 break;
 case ']':
-firstInClass = false;
+if (prev != null || hasBits) {
-if (prev != null) {
 if (consume)
 next();
+if (prev == null)
+prev = bitsP;
+else if (hasBits)
+prev = prev.union(bitsP);
+if (isNeg)
+return prev.negate();
 return prev;
 }
 break;
 default:
-firstInClass = false;
 break;
 }
-node = range(bits);
+curr = range(bits);
-if (include) {
+if (curr == null) {    // the bits used
-if (prev == null) {
+hasBits = true;
-prev = node;
-} else {
-if (prev != node)
-prev = union(prev, node);
-}
 } else {
-if (prev == null) {
+if (prev == null)
-prev = node.complement();
+prev = curr;
-} else {
+else if (prev != curr)
-if (prev != node)
+prev = prev.union(curr);
-prev = setDifference(prev, node);
-}
 }
 ch = peek();
 }
 }
-private CharProperty bitsOrSingle(BitClass bits, int ch) {
+private CharPredicate bitsOrSingle(BitClass bits, int ch) {
 /* Bits can only handle codepoints in [u+0000-u+00ff] range.
 Use "single" node instead of bits when dealing with unicode
 case folding for codepoints listed below.
 (1)Uppercase out of range: u+00ff, u+00b5
 toUpperCase(u+00ff) -> u+0178
 */
 int d;
 if (ch < 256 &&
 !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
 (ch == 0xff || ch == 0xb5 ||
-ch == 0x49 || ch == 0x69 ||  //I and i
+ch == 0x49 || ch == 0x69 ||    //I and i
-ch == 0x53 || ch == 0x73 ||  //S and s
+ch == 0x53 || ch == 0x73 ||    //S and s
-ch == 0x4b || ch == 0x6b ||  //K and k
+ch == 0x4b || ch == 0x6b ||    //K and k
-ch == 0xc5 || ch == 0xe5)))  //A+ring
+ch == 0xc5 || ch == 0xe5))) {  //A+ring
-return bits.add(ch, flags());
+bits.add(ch, flags());
-return newSingle(ch);
+return null;
+}
+return single(ch);
+}
+/**
+*  Returns a suitably optimized, single character predicate
+*/
+private CharPredicate single(final int ch) {
+if (has(CASE_INSENSITIVE)) {
+int lower, upper;
+if (has(UNICODE_CASE)) {
+upper = Character.toUpperCase(ch);
+lower = Character.toLowerCase(upper);
+// Unicode case insensitive matches
+if (upper != lower)
+return SingleU(lower);
+} else if (ASCII.isAscii(ch)) {
+lower = ASCII.toLower(ch);
+upper = ASCII.toUpper(ch);
+// Case insensitive matches a given BMP character
+if (lower != upper)
+return SingleI(lower, upper);
+}
+}
+if (isSupplementary(ch))
+return SingleS(ch);
+return Single(ch);  // Match a given BMP character
 }
 /**
 * Parse a single character or a character range in a character class
 * and return its representative node.
 */
-private CharProperty range(BitClass bits) {
+private CharPredicate range(BitClass bits) {
 int ch = peek();
 if (ch == '\\') {
 ch = nextEscaped();
 if (ch == 'p' || ch == 'P') { // A property
 boolean comp = (ch == 'P');
 } else { // ordinary escape
 boolean isrange = temp[cursor+1] == '-';
 unread();
 ch = escape(true, true, isrange);
 if (ch == -1)
-return (CharProperty) root;
+return predicate;
 }
 } else {
 next();
 }
 if (ch >= 0) {
 next();
 }
 if (m < ch) {
 throw error("Illegal character range");
 }
-if (has(CASE_INSENSITIVE))
+if (has(CASE_INSENSITIVE)) {
-return caseInsensitiveRangeFor(ch, m);
+if (has(UNICODE_CASE))
-else
+return CIRangeU(ch, m);
-return rangeFor(ch, m);
+return CIRange(ch, m);
+} else {
+return Range(ch, m);
+}
 }
 }
 return bitsOrSingle(bits, ch);
 }
 throw error("Unexpected character '"+((char)ch)+"'");
 }
 /**
 * Parses a Unicode character family and returns its representative node.
 */
-private CharProperty family(boolean singleLetter,
+private CharPredicate family(boolean singleLetter, boolean isComplement) {
-boolean maybeComplement)
-{
 next();
 String name;
-CharProperty node = null;
+CharPredicate p = null;
 if (singleLetter) {
 int c = temp[cursor];
 if (!Character.isSupplementaryCodePoint(c)) {
 name = String.valueOf((char)c);
 String value = name.substring(i + 1);
 name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
 switch (name) {
 case "sc":
 case "script":
-node = unicodeScriptPropertyFor(value);
+p = CharPredicates.forUnicodeScript(value);
 break;
 case "blk":
 case "block":
-node = unicodeBlockPropertyFor(value);
+p = CharPredicates.forUnicodeBlock(value);
 break;
 case "gc":
 case "general_category":
-node = charPropertyNodeFor(value);
+p = CharPredicates.forProperty(value);
 break;
 default:
-throw error("Unknown Unicode property {name=<" + name + ">, "
+break;
-+ "value=<" + value + ">}");
+}
-}
+if (p == null)
+throw error("Unknown Unicode property {name=<" + name + ">, "
++ "value=<" + value + ">}");
 } else {
 if (name.startsWith("In")) {
-// \p{inBlockName}
+// \p{InBlockName}
-node = unicodeBlockPropertyFor(name.substring(2));
+p = CharPredicates.forUnicodeBlock(name.substring(2));
 } else if (name.startsWith("Is")) {
-// \p{isGeneralCategory} and \p{isScriptName}
+// \p{IsGeneralCategory} and \p{IsScriptName}
 name = name.substring(2);
-UnicodeProp uprop = UnicodeProp.forName(name);
+p = CharPredicates.forUnicodeProperty(name);
-if (uprop != null)
+if (p == null)
-node = new Utype(uprop);
+p = CharPredicates.forProperty(name);
-if (node == null)
+if (p == null)
-node = CharPropertyNames.charPropertyFor(name);
+p = CharPredicates.forUnicodeScript(name);
-if (node == null)
-node = unicodeScriptPropertyFor(name);
 } else {
 if (has(UNICODE_CHARACTER_CLASS)) {
-UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
+p = CharPredicates.forPOSIXName(name);
-if (uprop != null)
+}
-node = new Utype(uprop);
+if (p == null)
-}
+p = CharPredicates.forProperty(name);
-if (node == null)
+}
-node = charPropertyNodeFor(name);
+if (p == null)
-}
+throw error("Unknown character property name {In/Is" + name + "}");
 }
-if (maybeComplement) {
+if (isComplement) {
-if (node instanceof Category || node instanceof Block)
+// it might be too expensive to detect if a complement of
-hasSupplementary = true;
+// CharProperty can match "certain" supplementary. So just
-node = node.complement();
+// go with StartS.
-}
+hasSupplementary = true;
-return node;
+p = p.negate();
 }
+return p;
+}
-/**
-* Returns a CharProperty matching all characters belong to
+private CharProperty newCharProperty(CharPredicate p) {
-* a UnicodeScript.
-*/
-private CharProperty unicodeScriptPropertyFor(String name) {
-final Character.UnicodeScript script;
-try {
-script = Character.UnicodeScript.forName(name);
-} catch (IllegalArgumentException iae) {
-throw error("Unknown character script name {" + name + "}");
-}
-return new Script(script);
-}
-/**
-* Returns a CharProperty matching all characters in a UnicodeBlock.
-*/
-private CharProperty unicodeBlockPropertyFor(String name) {
-final Character.UnicodeBlock block;
-try {
-block = Character.UnicodeBlock.forName(name);
-} catch (IllegalArgumentException iae) {
-throw error("Unknown character block name {" + name + "}");
-}
-return new Block(block);
-}
-/**
-* Returns a CharProperty matching all characters in a named property.
-*/
-private CharProperty charPropertyNodeFor(String name) {
-CharProperty p = CharPropertyNames.charPropertyFor(name);
 if (p == null)
-throw error("Unknown character property name {" + name + "}");
+return null;
-return p;
+if (p instanceof BmpCharPredicate)
+return new BmpCharProperty((BmpCharPredicate)p);
+else
+return new CharProperty(p);
 }
 /**
 * Parses and returns the name of a "named capturing group", the trailing
 * ">" is consumed after parsing.
 private Node group0() {
 boolean capturingGroup = false;
 Node head = null;
 Node tail = null;
 int save = flags;
+int saveTCNCount = topClosureNodes.size();
 root = null;
 int ch = next();
 if (ch == '?') {
 ch = skip();
 switch (ch) {
 break;
 case '>':   // (?>xxx)  independent group
 head = createGroup(true);
 tail = root;
 head.next = expr(tail);
-head = tail = new Ques(head, INDEPENDENT);
+head = tail = new Ques(head, Qtype.INDEPENDENT);
 break;
 case '<':   // (?<xxx)  look behind
 ch = read();
 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
 // named captured group
 new NotBehind(head, info.maxLength,
 info.minLength));
 } else {
 throw error("Unknown look-behind group");
 }
+// clear all top-closure-nodes inside lookbehind
+if (saveTCNCount < topClosureNodes.size())
+topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear();
 break;
 case '$':
 case '@':
 throw error("Unknown group type");
 default:    // (?xxx:) inlined match flags
 if (head == tail) { // Zero length assertion
 root = node;
 return node;    // Dual return
 }
+// have group closure, clear all inner closure nodes from the
+// top list (no backtracking stopper optimization for inner
+if (saveTCNCount < topClosureNodes.size())
+topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear();
 if (node instanceof Ques) {
 Ques ques = (Ques) node;
-if (ques.type == POSSESSIVE) {
+if (ques.type == Qtype.POSSESSIVE) {
 root = node;
 return node;
 }
 tail.next = new BranchConn();
 tail = tail.next;
-if (ques.type == GREEDY) {
+if (ques.type == Qtype.GREEDY) {
 head = new Branch(head, null, tail);
 } else { // Reluctant quantifier
 head = new Branch(null, head, tail);
 }
 root = tail;
 return head;
 } else if (node instanceof Curly) {
 Curly curly = (Curly) node;
-if (curly.type == POSSESSIVE) {
+if (curly.type == Qtype.POSSESSIVE) {
 root = node;
 return node;
 }
 // Discover if the group is deterministic
 TreeInfo info = new TreeInfo();
 capturingGroup);
 return head;
 } else { // Non-deterministic
 int temp = ((GroupHead) head).localIndex;
 Loop loop;
-if (curly.type == GREEDY)
+if (curly.type == Qtype.GREEDY) {
 loop = new Loop(this.localCount, temp);
-else  // Reluctant Curly
+// add the max_reps greedy to the top-closure-node list
+if (curly.cmax == MAX_REPS)
+topClosureNodes.add(loop);
+} else {  // Reluctant Curly
 loop = new LazyLoop(this.localCount, temp);
+}
 Prolog prolog = new Prolog(loop);
 this.localCount += 1;
 loop.cmin = curly.cmin;
 loop.cmax = curly.cmax;
 loop.body = head;
 int groupIndex = 0;
 if (!anonymous)
 groupIndex = capturingGroupCount++;
 GroupHead head = new GroupHead(localIndex);
 root = new GroupTail(localIndex, groupIndex);
+// for debug/print only, head.match does NOT need the "tail" info
+head.tail = (GroupTail)root;
 if (!anonymous && groupIndex < 10)
 groupNodes[groupIndex] = head;
 return head;
 }
 }
 }
 static final int MAX_REPS   = 0x7FFFFFFF;
-static final int GREEDY     = 0;
+static enum Qtype {
+GREEDY, LAZY, POSSESSIVE, INDEPENDENT
-static final int LAZY       = 1;
+}
-static final int POSSESSIVE = 2;
+private Node curly(Node prev, int cmin) {
+int ch = next();
-static final int INDEPENDENT = 3;
+if (ch == '?') {
+next();
+return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY);
+} else if (ch == '+') {
+next();
+return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE);
+}
+if (prev instanceof BmpCharProperty) {
+return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);
+} else if (prev instanceof CharProperty) {
+return new CharPropertyGreedy((CharProperty)prev, cmin);
+}
+return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY);
+}
 /**
 * Processes repetition. If the next character peeked is a quantifier
 * then new nodes must be appended to handle the repetition.
 * Prev could be a single or a group, so it could be a chain of nodes.
 switch (ch) {
 case '?':
 ch = next();
 if (ch == '?') {
 next();
-return new Ques(prev, LAZY);
+return new Ques(prev, Qtype.LAZY);
 } else if (ch == '+') {
 next();
-return new Ques(prev, POSSESSIVE);
+return new Ques(prev, Qtype.POSSESSIVE);
 }
-return new Ques(prev, GREEDY);
+return new Ques(prev, Qtype.GREEDY);
 case '*':
-ch = next();
+return curly(prev, 0);
-if (ch == '?') {
-next();
-return new Curly(prev, 0, MAX_REPS, LAZY);
-} else if (ch == '+') {
-next();
-return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
-}
-return new Curly(prev, 0, MAX_REPS, GREEDY);
 case '+':
-ch = next();
+return curly(prev, 1);
-if (ch == '?') {
-next();
-return new Curly(prev, 1, MAX_REPS, LAZY);
-} else if (ch == '+') {
-next();
-return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
-}
-return new Curly(prev, 1, MAX_REPS, GREEDY);
 case '{':
 ch = temp[cursor+1];
 if (ASCII.isDigit(ch)) {
 skip();
 int cmin = 0;
 throw error("Illegal repetition range");
 Curly curly;
 ch = peek();
 if (ch == '?') {
 next();
-curly = new Curly(prev, cmin, cmax, LAZY);
+curly = new Curly(prev, cmin, cmax, Qtype.LAZY);
 } else if (ch == '+') {
 next();
-curly = new Curly(prev, cmin, cmax, POSSESSIVE);
+curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);
 } else {
-curly = new Curly(prev, cmin, cmax, GREEDY);
+curly = new Curly(prev, cmin, cmax, Qtype.GREEDY);
 }
 return curly;
 } else {
 throw error("Illegal repetition");
 }
 /**
 *  Creates a bit vector for matching Latin-1 values. A normal BitClass
 *  never matches values above Latin-1, and a complemented BitClass always
 *  matches values above Latin-1.
 */
-private static final class BitClass extends BmpCharProperty {
+static final class BitClass extends BmpCharProperty {
 final boolean[] bits;
-BitClass() { bits = new boolean[256]; }
+BitClass() {
-private BitClass(boolean[] bits) { this.bits = bits; }
+this(new boolean[256]);
+}
+private BitClass(boolean[] bits) {
+super( ch -> ch < 256 && bits[ch]);
+this.bits = bits;
+}
 BitClass add(int c, int flags) {
 assert c >= 0 && c <= 255;
 if ((flags & CASE_INSENSITIVE) != 0) {
 if (ASCII.isAscii(c)) {
 bits[ASCII.toUpper(c)] = true;
 }
 }
 bits[c] = true;
 return this;
 }
-boolean isSatisfiedBy(int ch) {
-return ch < 256 && bits[ch];
-}
-}
-/**
-*  Returns a suitably optimized, single character matcher.
-*/
-private CharProperty newSingle(final int ch) {
-if (has(CASE_INSENSITIVE)) {
-int lower, upper;
-if (has(UNICODE_CASE)) {
-upper = Character.toUpperCase(ch);
-lower = Character.toLowerCase(upper);
-if (upper != lower)
-return new SingleU(lower);
-} else if (ASCII.isAscii(ch)) {
-lower = ASCII.toLower(ch);
-upper = ASCII.toUpper(ch);
-if (lower != upper)
-return new SingleI(lower, upper);
-}
-}
-if (isSupplementary(ch))
-return new SingleS(ch);    // Match a given Unicode character
-return new Single(ch);         // Match a given BMP character
 }
 /**
 *  Utility method for creating a string slice matcher.
 */
 /**
 * Abstract node class to match one character satisfying some
 * boolean property.
 */
-private abstract static class CharProperty extends Node {
+static class CharProperty extends Node {
-abstract boolean isSatisfiedBy(int ch);
+CharPredicate predicate;
-CharProperty complement() {
-return new CharProperty() {
+CharProperty (CharPredicate predicate) {
-boolean isSatisfiedBy(int ch) {
+this.predicate = predicate;
-return ! CharProperty.this.isSatisfiedBy(ch);}};
 }
 boolean match(Matcher matcher, int i, CharSequence seq) {
 if (i < matcher.to) {
 int ch = Character.codePointAt(seq, i);
-return isSatisfiedBy(ch)
+return predicate.is(ch) &&
-&& next.match(matcher, i+Character.charCount(ch), seq);
+next.match(matcher, i + Character.charCount(ch), seq);
 } else {
 matcher.hitEnd = true;
 return false;
 }
 }
 /**
 * Optimized version of CharProperty that works only for
 * properties never satisfied by Supplementary characters.
 */
-private abstract static class BmpCharProperty extends CharProperty {
+private static class BmpCharProperty extends CharProperty {
+BmpCharProperty (BmpCharPredicate predicate) {
+super(predicate);
+}
 boolean match(Matcher matcher, int i, CharSequence seq) {
 if (i < matcher.to) {
-return isSatisfiedBy(seq.charAt(i))
+return predicate.is(seq.charAt(i)) &&
-&& next.match(matcher, i+1, seq);
+next.match(matcher, i + 1, seq);
 } else {
 matcher.hitEnd = true;
 return false;
 }
 }
 }
-/**
+private static class NFCCharProperty extends Node {
-* Node class that matches a Supplementary Unicode character
+CharPredicate predicate;
-*/
+NFCCharProperty (CharPredicate predicate) {
-static final class SingleS extends CharProperty {
+this.predicate = predicate;
-final int c;
+}
-SingleS(int c) { this.c = c; }
-boolean isSatisfiedBy(int ch) {
+boolean match(Matcher matcher, int i, CharSequence seq) {
-return ch == c;
+if (i < matcher.to) {
-}
+int ch0 = Character.codePointAt(seq, i);
-}
+int n = Character.charCount(ch0);
+int j = i + n;
-/**
+while (j < matcher.to) {
-* Optimization -- matches a given BMP character
+int ch1 = Character.codePointAt(seq, j);
-*/
+if (Grapheme.isBoundary(ch0, ch1))
-static final class Single extends BmpCharProperty {
+break;
-final int c;
+ch0 = ch1;
-Single(int c) { this.c = c; }
+j += Character.charCount(ch1);
-boolean isSatisfiedBy(int ch) {
+}
-return ch == c;
+if (i + n == j) {    // single, assume nfc cp
-}
+if (predicate.is(ch0))
-}
+return next.match(matcher, j, seq);
+} else {
-/**
+while (i + n < j) {
-* Case insensitive matches a given BMP character
+String nfc = Normalizer.normalize(
-*/
+seq.toString().substring(i, j), Normalizer.Form.NFC);
-static final class SingleI extends BmpCharProperty {
+if (nfc.codePointCount(0, nfc.length()) == 1) {
-final int lower;
+if (predicate.is(nfc.codePointAt(0)) &&
-final int upper;
+next.match(matcher, j, seq)) {
-SingleI(int lower, int upper) {
+return true;
-this.lower = lower;
+}
-this.upper = upper;
+}
-}
-boolean isSatisfiedBy(int ch) {
+ch0 = Character.codePointBefore(seq, j);
-return ch == lower || ch == upper;
+j -= Character.charCount(ch0);
 }
 }
+if (j < matcher.to)
-/**
+return false;
-* Unicode case insensitive matches a given Unicode character
+}
-*/
+matcher.hitEnd = true;
-static final class SingleU extends CharProperty {
+return false;
-final int lower;
+}
-SingleU(int lower) {
-this.lower = lower;
+boolean study(TreeInfo info) {
-}
+info.minLength++;
-boolean isSatisfiedBy(int ch) {
+info.deterministic = false;
-return lower == ch ||
+return next.study(info);
-lower == Character.toLowerCase(Character.toUpperCase(ch));
-}
-}
-/**
-* Node class that matches a Unicode block.
-*/
-static final class Block extends CharProperty {
-final Character.UnicodeBlock block;
-Block(Character.UnicodeBlock block) {
-this.block = block;
-}
-boolean isSatisfiedBy(int ch) {
-return block == Character.UnicodeBlock.of(ch);
-}
-}
-/**
-* Node class that matches a Unicode script
-*/
-static final class Script extends CharProperty {
-final Character.UnicodeScript script;
-Script(Character.UnicodeScript script) {
-this.script = script;
-}
-boolean isSatisfiedBy(int ch) {
-return script == Character.UnicodeScript.of(ch);
-}
-}
-/**
-* Node class that matches a Unicode category.
-*/
-static final class Category extends CharProperty {
-final int typeMask;
-Category(int typeMask) { this.typeMask = typeMask; }
-boolean isSatisfiedBy(int ch) {
-return (typeMask & (1 << Character.getType(ch))) != 0;
-}
-}
-/**
-* Node class that matches a Unicode "type"
-*/
-static final class Utype extends CharProperty {
-final UnicodeProp uprop;
-Utype(UnicodeProp uprop) { this.uprop = uprop; }
-boolean isSatisfiedBy(int ch) {
-return uprop.is(ch);
-}
-}
-/**
-* Node class that matches a POSIX type.
-*/
-static final class Ctype extends BmpCharProperty {
-final int ctype;
-Ctype(int ctype) { this.ctype = ctype; }
-boolean isSatisfiedBy(int ch) {
-return ch < 128 && ASCII.isType(ch, ctype);
-}
-}
-/**
-* Node class that matches a Perl vertical whitespace
-*/
-static final class VertWS extends BmpCharProperty {
-boolean isSatisfiedBy(int cp) {
-return (cp >= 0x0A && cp <= 0x0D) ||
-cp == 0x85 || cp == 0x2028 || cp == 0x2029;
-}
-}
-/**
-* Node class that matches a Perl horizontal whitespace
-*/
-static final class HorizWS extends BmpCharProperty {
-boolean isSatisfiedBy(int cp) {
-return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
-cp == 0x1680 || cp == 0x180e ||
-cp >= 0x2000 && cp <= 0x200a ||
-cp == 0x202f || cp == 0x205f || cp == 0x3000;
 }
 }
 /**
 * Node class that matches an unicode extended grapheme cluster
 int toLower(int c) {
 return Character.toLowerCase(Character.toUpperCase(c));
 }
 }
-private static boolean inRange(int lower, int ch, int upper) {
-return lower <= ch && ch <= upper;
-}
-/**
-* Returns node for matching characters within an explicit value range.
-*/
-private static CharProperty rangeFor(final int lower,
-final int upper) {
-return new CharProperty() {
-boolean isSatisfiedBy(int ch) {
-return inRange(lower, ch, upper);}};
-}
-/**
-* Returns node for matching characters within an explicit value
-* range in a case insensitive manner.
-*/
-private CharProperty caseInsensitiveRangeFor(final int lower,
-final int upper) {
-if (has(UNICODE_CASE))
-return new CharProperty() {
-boolean isSatisfiedBy(int ch) {
-if (inRange(lower, ch, upper))
-return true;
-int up = Character.toUpperCase(ch);
-return inRange(lower, up, upper) ||
-inRange(lower, Character.toLowerCase(up), upper);}};
-return new CharProperty() {
-boolean isSatisfiedBy(int ch) {
-return inRange(lower, ch, upper) ||
-ASCII.isAscii(ch) &&
-(inRange(lower, ASCII.toUpper(ch), upper) ||
-inRange(lower, ASCII.toLower(ch), upper));
-}};
-}
-/**
-* Implements the Unicode category ALL and the dot metacharacter when
-* in dotall mode.
-*/
-static final class All extends CharProperty {
-boolean isSatisfiedBy(int ch) {
-return true;
-}
-}
-/**
-* Node class for the dot metacharacter when dotall is not enabled.
-*/
-static final class Dot extends CharProperty {
-boolean isSatisfiedBy(int ch) {
-return (ch != '\n' && ch != '\r'
-&& (ch|1) != '\u2029'
-&& ch != '\u0085');
-}
-}
-/**
-* Node class for the dot metacharacter when dotall is not enabled
-* but UNIX_LINES is enabled.
-*/
-static final class UnixDot extends CharProperty {
-boolean isSatisfiedBy(int ch) {
-return ch != '\n';
-}
-}
 /**
 * The 0 or 1 quantifier. This one class implements all three types.
 */
 static final class Ques extends Node {
 Node atom;
-int type;
+Qtype type;
-Ques(Node node, int type) {
+Ques(Node node, Qtype type) {
 this.atom = node;
 this.type = type;
 }
 boolean match(Matcher matcher, int i, CharSequence seq) {
 switch (type) {
 default:
 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
 }
 }
 boolean study(TreeInfo info) {
-if (type != INDEPENDENT) {
+if (type != Qtype.INDEPENDENT) {
 int minL = info.minLength;
 atom.study(info);
 info.minLength = minL;
 info.deterministic = false;
 return next.study(info);
 }
 }
 }
 /**
+* Handles the greedy style repetition with the minimum either be
+* 0 or 1 and the maximum be MAX_REPS, for * and + quantifier.
+*/
+static class CharPropertyGreedy extends Node {
+final CharPredicate predicate;
+final int cmin;
+CharPropertyGreedy(CharProperty cp, int cmin) {
+this.predicate = cp.predicate;
+this.cmin = cmin;
+}
+boolean match(Matcher matcher, int i,  CharSequence seq) {
+int n = 0;
+int to = matcher.to;
+// greedy, all the way down
+while (i < to) {
+int ch = Character.codePointAt(seq, i);
+if (!predicate.is(ch))
+break;
+i += Character.charCount(ch);
+n++;
+}
+if (i >= to) {
+matcher.hitEnd = true;
+}
+while (n >= cmin) {
+if (next.match(matcher, i, seq))
+return true;
+if (n == cmin)
+return false;
+// backing off if match fails
+int ch = Character.codePointBefore(seq, i);
+i -= Character.charCount(ch);
+n--;
+}
+return false;
+}
+boolean study(TreeInfo info) {
+info.minLength += cmin;
+if (info.maxValid) {
+info.maxLength += MAX_REPS;
+}
+info.deterministic = false;
+return next.study(info);
+}
+}
+static final class BmpCharPropertyGreedy extends CharPropertyGreedy {
+BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) {
+super(bcp, cmin);
+}
+boolean match(Matcher matcher, int i,  CharSequence seq) {
+int n = 0;
+int to = matcher.to;
+while (i < to && predicate.is(seq.charAt(i))) {
+i++; n++;
+}
+if (i >= to) {
+matcher.hitEnd = true;
+}
+while (n >= cmin) {
+if (next.match(matcher, i, seq))
+return true;
+i--; n--;  // backing off if match fails
+}
+return false;
+}
+}
+/**
 * Handles the curly-brace style repetition with a specified minimum and
 * maximum occurrences. The * quantifier is handled as a special case.
 * This class handles the three types.
 */
 static final class Curly extends Node {
 Node atom;
-int type;
+Qtype type;
 int cmin;
 int cmax;
-Curly(Node node, int cmin, int cmax, int type) {
+Curly(Node node, int cmin, int cmax, Qtype type) {
 this.atom = node;
 this.type = type;
 this.cmin = cmin;
 this.cmax = cmax;
 }
 i = matcher.last;
 continue;
 }
 return false;
 }
-if (type == GREEDY)
+if (type == Qtype.GREEDY)
 return match0(matcher, i, j, seq);
-else if (type == LAZY)
+else if (type == Qtype.LAZY)
 return match1(matcher, i, j, seq);
 else
 return match2(matcher, i, j, seq);
 }
 // Greedy match.
 * If capture is true then this class saves group settings and ensures
 * that groups are unset when backing off of a group match.
 */
 static final class GroupCurly extends Node {
 Node atom;
-int type;
+Qtype type;
 int cmin;
 int cmax;
 int localIndex;
 int groupIndex;
 boolean capture;
-GroupCurly(Node node, int cmin, int cmax, int type, int local,
+GroupCurly(Node node, int cmin, int cmax, Qtype type, int local,
 int group, boolean capture) {
 this.atom = node;
 this.type = type;
 this.cmin = cmin;
 this.cmax = cmax;
 ret = false;
 break;
 }
 }
 if (ret) {
-if (type == GREEDY) {
+if (type == Qtype.GREEDY) {
 ret = match0(matcher, i, cmin, seq);
-} else if (type == LAZY) {
+} else if (type == Qtype.LAZY) {
 ret = match1(matcher, i, cmin, seq);
 } else {
 ret = match2(matcher, i, cmin, seq);
 }
 }
 * indicate that we do not want to unset the group if the reference
 * doesn't match.
 */
 static final class GroupHead extends Node {
 int localIndex;
+GroupTail tail;    // for debug/print only, match does not need to know
 GroupHead(int localCount) {
 localIndex = localCount;
 }
 boolean match(Matcher matcher, int i, CharSequence seq) {
 int save = matcher.locals[localIndex];
 static class Loop extends Node {
 Node body;
 int countIndex; // local count index in matcher locals
 int beginIndex; // group beginning index
 int cmin, cmax;
+int posIndex;
 Loop(int countIndex, int beginIndex) {
 this.countIndex = countIndex;
 this.beginIndex = beginIndex;
+this.posIndex = -1;
 }
 boolean match(Matcher matcher, int i, CharSequence seq) {
 // Avoid infinite loop in zero-length case.
 if (i > matcher.locals[beginIndex]) {
 int count = matcher.locals[countIndex];
 return b;
 }
 // This block is for after we have the minimum
 // iterations required for the loop to match
 if (count < cmax) {
+// Let's check if we have already tried and failed
+// at this starting position "i" in the past.
+// If yes, then just return false wihtout trying
+// again, to stop the exponential backtracking.
+if (posIndex != -1 &&
+matcher.localsPos[posIndex].contains(i)) {
+return next.match(matcher, i, seq);
+}
 matcher.locals[countIndex] = count + 1;
 boolean b = body.match(matcher, i, seq);
 // If match failed we must backtrack, so
 // the loop count should NOT be incremented
-if (!b)
+if (b)
-matcher.locals[countIndex] = count;
-else
 return true;
+matcher.locals[countIndex] = count;
+// save the failed position
+if (posIndex != -1) {
+matcher.localsPos[posIndex].add(i);
+}
 }
 }
 return next.match(matcher, i, seq);
 }
 boolean matchInit(Matcher matcher, int i, CharSequence seq) {
 int save = matcher.locals[countIndex];
 boolean ret = false;
+if (posIndex != -1 && matcher.localsPos[posIndex] == null) {
+matcher.localsPos[posIndex] = new IntHashSet();
+}
 if (0 < cmin) {
 matcher.locals[countIndex] = 1;
 ret = body.match(matcher, i, seq);
 } else if (0 < cmax) {
 matcher.locals[countIndex] = 1;
 return !conditionMatched && next.match(matcher, i, seq);
 }
 }
 /**
-* Returns the set union of two CharProperty nodes.
-*/
-private static CharProperty union(final CharProperty lhs,
-final CharProperty rhs) {
-return new CharProperty() {
-boolean isSatisfiedBy(int ch) {
-return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
-}
-/**
-* Returns the set intersection of two CharProperty nodes.
-*/
-private static CharProperty intersection(final CharProperty lhs,
-final CharProperty rhs) {
-return new CharProperty() {
-boolean isSatisfiedBy(int ch) {
-return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
-}
-/**
-* Returns the set difference of two CharProperty nodes.
-*/
-private static CharProperty setDifference(final CharProperty lhs,
-final CharProperty rhs) {
-return new CharProperty() {
-boolean isSatisfiedBy(int ch) {
-return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
-}
-/**
 * Handles word boundaries. Includes a field to allow this one class to
 * deal with the different types of word boundaries we can match. The word
 * characters include underscores, letters, and digits. Non spacing marks
 * can are also part of a word if they have a base character, otherwise
 * they are ignored for purposes of finding word boundaries.
 type = n;
 this.useUWORD = useUWORD;
 }
 boolean isWord(int ch) {
-return useUWORD ? UnicodeProp.WORD.is(ch)
+return useUWORD ? CharPredicates.WORD.is(ch)
 : (ch == '_' || Character.isLetterOrDigit(ch));
 }
 int check(Matcher matcher, int i, CharSequence seq) {
 int ch;
 matcher.hitEnd = true;
 return false;
 }
 }
-///////////////////////////////////////////////////////////////////////////////
+@FunctionalInterface
-///////////////////////////////////////////////////////////////////////////////
+static interface CharPredicate {
+boolean is(int ch);
+default CharPredicate and(CharPredicate p) {
+return ch -> is(ch) && p.is(ch);
+}
+default CharPredicate union(CharPredicate p) {
+return ch -> is(ch) || p.is(ch);
+}
+default CharPredicate union(CharPredicate p1,
+CharPredicate p2 ) {
+return ch -> is(ch) || p1.is(ch) || p2.is(ch);
+}
+default CharPredicate negate() {
+return ch -> !is(ch);
+}
+}
+static interface BmpCharPredicate extends CharPredicate {
+default CharPredicate and(CharPredicate p) {
+if(p instanceof BmpCharPredicate)
+return (BmpCharPredicate)(ch -> is(ch) && p.is(ch));
+return ch -> is(ch) && p.is(ch);
+}
+default CharPredicate union(CharPredicate p) {
+if (p instanceof BmpCharPredicate)
+return (BmpCharPredicate)(ch -> is(ch) || p.is(ch));
+return ch -> is(ch) || p.is(ch);
+}
+static CharPredicate union(CharPredicate... predicates) {
+CharPredicate cp = ch -> {
+for (CharPredicate p : predicates) {
+if (!p.is(ch))
+return false;
+}
+return true;
+};
+for (CharPredicate p : predicates) {
+if (! (p instanceof BmpCharPredicate))
+return cp;
+}
+return (BmpCharPredicate)cp;
+}
+}
+/**
+* matches a Perl vertical whitespace
+*/
+static BmpCharPredicate VertWS = cp ->
+(cp >= 0x0A && cp <= 0x0D) || cp == 0x85 || cp == 0x2028 || cp == 0x2029;
+/**
+* matches a Perl horizontal whitespace
+*/
+static BmpCharPredicate HorizWS = cp ->
+cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 ||
+cp == 0x180e || cp >= 0x2000 && cp <= 0x200a ||  cp == 0x202f ||
+cp == 0x205f || cp == 0x3000;
+/**
+*  for the Unicode category ALL and the dot metacharacter when
+*  in dotall mode.
+*/
+static CharPredicate ALL = ch -> true;
+/**
+* for the dot metacharacter when dotall is not enabled.
+*/
+static CharPredicate DOT = ch -> (ch != '\n' && ch != '\r'
+&& (ch|1) != '\u2029'
+&& ch != '\u0085');
+/**
+*  the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled.
+*/
+static CharPredicate UNIXDOT = ch ->  ch != '\n';
+/**
+* Indicate that matches a Supplementary Unicode character
+*/
+static CharPredicate SingleS(int c) {
+return ch -> ch == c;
+}
+/**
+* A bmp/optimized predicate of single
+*/
+static BmpCharPredicate Single(int c) {
+return ch -> ch == c;
+}
+/**
+* Case insensitive matches a given BMP character
+*/
+static BmpCharPredicate SingleI(int lower, int upper) {
+return ch -> ch == lower || ch == upper;
+}
+/**
+* Unicode case insensitive matches a given Unicode character
+*/
+static CharPredicate SingleU(int lower) {
+return ch -> lower == ch ||
+lower == Character.toLowerCase(Character.toUpperCase(ch));
+}
+private static boolean inRange(int lower, int ch, int upper) {
+return lower <= ch && ch <= upper;
+}
+/**
+* Charactrs within a explicit value range
+*/
+static CharPredicate Range(int lower, int upper) {
+if (upper < Character.MIN_HIGH_SURROGATE ||
+lower > Character.MAX_HIGH_SURROGATE &&
+upper < Character.MIN_SUPPLEMENTARY_CODE_POINT)
+return (BmpCharPredicate)(ch -> inRange(lower, ch, upper));
+return ch -> inRange(lower, ch, upper);
+}
+/**
+* Charactrs within a explicit value range in a case insensitive manner.
+*/
+static CharPredicate CIRange(int lower, int upper) {
+return ch -> inRange(lower, ch, upper) ||
+ASCII.isAscii(ch) &&
+(inRange(lower, ASCII.toUpper(ch), upper) ||
+inRange(lower, ASCII.toLower(ch), upper));
+}
+static CharPredicate CIRangeU(int lower, int upper) {
+return ch -> {
+if (inRange(lower, ch, upper))
+return true;
+int up = Character.toUpperCase(ch);
+return inRange(lower, up, upper) ||
+inRange(lower, Character.toLowerCase(up), upper);
+};
+}
 /**
 *  This must be the very first initializer.
 */
-static Node accept = new Node();
+static final Node accept = new Node();
-static Node lastAccept = new LastNode();
+static final Node lastAccept = new LastNode();
-private static class CharPropertyNames {
-static CharProperty charPropertyFor(String name) {
-CharPropertyFactory m = map.get(name);
-return m == null ? null : m.make();
-}
-private abstract static class CharPropertyFactory {
-abstract CharProperty make();
-}
-private static void defCategory(String name,
-final int typeMask) {
-map.put(name, new CharPropertyFactory() {
-CharProperty make() { return new Category(typeMask);}});
-}
-private static void defRange(String name,
-final int lower, final int upper) {
-map.put(name, new CharPropertyFactory() {
-CharProperty make() { return rangeFor(lower, upper);}});
-}
-private static void defCtype(String name,
-final int ctype) {
-map.put(name, new CharPropertyFactory() {
-CharProperty make() { return new Ctype(ctype);}});
-}
-private abstract static class CloneableProperty
-extends CharProperty implements Cloneable
-{
-public CloneableProperty clone() {
-try {
-return (CloneableProperty) super.clone();
-} catch (CloneNotSupportedException e) {
-throw new AssertionError(e);
-}
-}
-}
-private static void defClone(String name,
-final CloneableProperty p) {
-map.put(name, new CharPropertyFactory() {
-CharProperty make() { return p.clone();}});
-}
-private static final HashMap<String, CharPropertyFactory> map
-= new HashMap<>();
-static {
-// Unicode character property aliases, defined in
-// http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
-defCategory("Cn", 1<<Character.UNASSIGNED);
-defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
-defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
-defCategory("Lt", 1<<Character.TITLECASE_LETTER);
-defCategory("Lm", 1<<Character.MODIFIER_LETTER);
-defCategory("Lo", 1<<Character.OTHER_LETTER);
-defCategory("Mn", 1<<Character.NON_SPACING_MARK);
-defCategory("Me", 1<<Character.ENCLOSING_MARK);
-defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
-defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
-defCategory("Nl", 1<<Character.LETTER_NUMBER);
-defCategory("No", 1<<Character.OTHER_NUMBER);
-defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
-defCategory("Zl", 1<<Character.LINE_SEPARATOR);
-defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
-defCategory("Cc", 1<<Character.CONTROL);
-defCategory("Cf", 1<<Character.FORMAT);
-defCategory("Co", 1<<Character.PRIVATE_USE);
-defCategory("Cs", 1<<Character.SURROGATE);
-defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
-defCategory("Ps", 1<<Character.START_PUNCTUATION);
-defCategory("Pe", 1<<Character.END_PUNCTUATION);
-defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
-defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
-defCategory("Sm", 1<<Character.MATH_SYMBOL);
-defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
-defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
-defCategory("So", 1<<Character.OTHER_SYMBOL);
-defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
-defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
-defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
-(1<<Character.LOWERCASE_LETTER) |
-(1<<Character.TITLECASE_LETTER) |
-(1<<Character.MODIFIER_LETTER)  |
-(1<<Character.OTHER_LETTER)));
-defCategory("M", ((1<<Character.NON_SPACING_MARK) |
-(1<<Character.ENCLOSING_MARK)   |
-(1<<Character.COMBINING_SPACING_MARK)));
-defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
-(1<<Character.LETTER_NUMBER)        |
-(1<<Character.OTHER_NUMBER)));
-defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
-(1<<Character.LINE_SEPARATOR)  |
-(1<<Character.PARAGRAPH_SEPARATOR)));
-defCategory("C", ((1<<Character.CONTROL)     |
-(1<<Character.FORMAT)      |
-(1<<Character.PRIVATE_USE) |
-(1<<Character.SURROGATE))); // Other
-defCategory("P", ((1<<Character.DASH_PUNCTUATION)      |
-(1<<Character.START_PUNCTUATION)     |
-(1<<Character.END_PUNCTUATION)       |
-(1<<Character.CONNECTOR_PUNCTUATION) |
-(1<<Character.OTHER_PUNCTUATION)     |
-(1<<Character.INITIAL_QUOTE_PUNCTUATION) |
-(1<<Character.FINAL_QUOTE_PUNCTUATION)));
-defCategory("S", ((1<<Character.MATH_SYMBOL)     |
-(1<<Character.CURRENCY_SYMBOL) |
-(1<<Character.MODIFIER_SYMBOL) |
-(1<<Character.OTHER_SYMBOL)));
-defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
-(1<<Character.LOWERCASE_LETTER) |
-(1<<Character.TITLECASE_LETTER)));
-defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
-(1<<Character.LOWERCASE_LETTER) |
-(1<<Character.TITLECASE_LETTER) |
-(1<<Character.MODIFIER_LETTER)  |
-(1<<Character.OTHER_LETTER)     |
-(1<<Character.DECIMAL_DIGIT_NUMBER)));
-defRange("L1", 0x00, 0xFF); // Latin-1
-map.put("all", new CharPropertyFactory() {
-CharProperty make() { return new All(); }});
-// Posix regular expression character classes, defined in
-// http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
-defRange("ASCII", 0x00, 0x7F);   // ASCII
-defCtype("Alnum", ASCII.ALNUM);  // Alphanumeric characters
-defCtype("Alpha", ASCII.ALPHA);  // Alphabetic characters
-defCtype("Blank", ASCII.BLANK);  // Space and tab characters
-defCtype("Cntrl", ASCII.CNTRL);  // Control characters
-defRange("Digit", '0', '9');     // Numeric characters
-defCtype("Graph", ASCII.GRAPH);  // printable and visible
-defRange("Lower", 'a', 'z');     // Lower-case alphabetic
-defRange("Print", 0x20, 0x7E);   // Printable characters
-defCtype("Punct", ASCII.PUNCT);  // Punctuation characters
-defCtype("Space", ASCII.SPACE);  // Space characters
-defRange("Upper", 'A', 'Z');     // Upper-case alphabetic
-defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
-// Java character properties, defined by methods in Character.java
-defClone("javaLowerCase", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isLowerCase(ch);}});
-defClone("javaUpperCase", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isUpperCase(ch);}});
-defClone("javaAlphabetic", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isAlphabetic(ch);}});
-defClone("javaIdeographic", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isIdeographic(ch);}});
-defClone("javaTitleCase", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isTitleCase(ch);}});
-defClone("javaDigit", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isDigit(ch);}});
-defClone("javaDefined", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isDefined(ch);}});
-defClone("javaLetter", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isLetter(ch);}});
-defClone("javaLetterOrDigit", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isLetterOrDigit(ch);}});
-defClone("javaJavaIdentifierStart", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isJavaIdentifierStart(ch);}});
-defClone("javaJavaIdentifierPart", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isJavaIdentifierPart(ch);}});
-defClone("javaUnicodeIdentifierStart", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isUnicodeIdentifierStart(ch);}});
-defClone("javaUnicodeIdentifierPart", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isUnicodeIdentifierPart(ch);}});
-defClone("javaIdentifierIgnorable", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isIdentifierIgnorable(ch);}});
-defClone("javaSpaceChar", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isSpaceChar(ch);}});
-defClone("javaWhitespace", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isWhitespace(ch);}});
-defClone("javaISOControl", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isISOControl(ch);}});
-defClone("javaMirrored", new CloneableProperty() {
-boolean isSatisfiedBy(int ch) {
-return Character.isMirrored(ch);}});
-}
-}
 /**
 * Creates a predicate which can be used to match a string.
 *
 * @return  The predicate which can be used for matching on a string

changeset 37882	e7f3cf12e739
parent 37880	60ec48925dc6
child 38777	826eb7091523