jdk/make/src/classes/build/tools/generatebreakiteratordata/CharSet.java
changeset 21805 c7d7946239de
parent 17950 b2d5b298ec6e
child 23010 6dadb192ad81
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/make/src/classes/build/tools/generatebreakiteratordata/CharSet.java	Thu Nov 14 11:19:32 2013 +0100
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
+ * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
+ *
+ * The original version of this source code and documentation
+ * is copyrighted and owned by Taligent, Inc., a wholly-owned
+ * subsidiary of IBM. These materials are provided under terms
+ * of a License Agreement between Taligent and Sun. This technology
+ * is protected by multiple US and International patents.
+ *
+ * This notice and attribution to Taligent may not be removed.
+ * Taligent is a registered trademark of Taligent, Inc.
+ */
+
+package build.tools.generatebreakiteratordata;
+
+import java.util.Arrays;
+import java.util.Hashtable;
+
+/**
+ * An object representing a set of characters.  (This is a "set" in the
+ * mathematical sense: an unduplicated list of characters on which set
+ * operations such as union and intersection can be performed.)  The
+ * set information is stored in compressed, optimized form: The object
+ * contains an integer array with an even number of characters.  Each
+ * pair of characters represents a range of characters contained in the set
+ * (a pair of the same character represents a single character).  The
+ * characters are sorted in increasing order.
+ */
+class CharSet {
+    /**
+     * The structure containing the set information.  The characters
+     * in this array are organized into pairs, each pair representing
+     * a range of characters contained in the set
+     */
+    private int[] chars;
+
+    //==========================================================================
+    // parseString() and associated routines
+    //==========================================================================
+    /**
+     * A cache which is used to speed up parseString() whenever it is
+     * used to parse a description that has been parsed before
+     */
+    private static Hashtable<String, CharSet> expressionCache = null;
+
+    /**
+     * Builds a CharSet based on a textual description.  For the syntax of
+     * the description, see the documentation of RuleBasedBreakIterator.
+     * @see java.text.RuleBasedBreakIterator
+     */
+    public static CharSet parseString(String s) {
+        CharSet result = null;
+
+        // if "s" is in the expression cache, pull the result out
+        // of the expresison cache
+        if (expressionCache != null) {
+            result = expressionCache.get(s);
+        }
+
+        // otherwise, use doParseString() to actually parse the string,
+        // and then add a corresponding entry to the expression cache
+        if (result == null) {
+            result = doParseString(s);
+            if (expressionCache == null) {
+                expressionCache = new Hashtable<>();
+            }
+            expressionCache.put(s, result);
+        }
+        result = (CharSet)(result.clone());
+        return result;
+    }
+
+    /**
+     * This function is used by parseString() to actually parse the string
+     */
+    private static CharSet doParseString(String s) {
+        CharSet result = new CharSet();
+        int p = 0;
+
+        boolean haveDash = false;
+        boolean haveTilde = false;
+        boolean wIsReal = false;
+        int w = 0x0000;
+
+        // for each character in the description...
+        while (p < s.length()) {
+            int c = s.codePointAt(p);
+
+            // if it's an opening bracket...
+            if (c == '[') {
+                // flush the single-character cache
+                if (wIsReal) {
+                    result.internalUnion(new CharSet(w));
+                }
+
+                // locate the matching closing bracket
+                int bracketLevel = 1;
+                int q = p + 1;
+                while (bracketLevel != 0) {
+                    // if no matching bracket by end of string then...
+                    if (q >= s.length()) {
+                        throw new IllegalArgumentException("Parse error at position " + p + " in " + s);
+                    }
+                    int ch = s.codePointAt(q);
+                    switch (ch) {
+                    case '\\': // need to step over next character
+                        ch = s.codePointAt(++q);
+                        break;
+                    case '[':
+                        ++bracketLevel;
+                        break;
+                    case ']':
+                        --bracketLevel;
+                        break;
+                    }
+                    q += Character.charCount(ch);
+                }
+                --q;
+
+                // call parseString() recursively to parse the text inside
+                // the brackets, then either add or subtract the result from
+                // our running result depending on whether or not the []
+                // expresison was preceded by a ^
+                if (!haveTilde) {
+                    result.internalUnion(CharSet.parseString(s.substring(p + 1, q)));
+                }
+                else {
+                    result.internalDifference(CharSet.parseString(s.substring(p + 1, q)));
+                }
+                haveTilde = false;
+                haveDash = false;
+                wIsReal = false;
+                p = q + 1;
+            }
+
+            // if the character is a colon...
+            else if (c == ':') {
+                // flush the single-character cache
+                if (wIsReal) {
+                    result.internalUnion(new CharSet(w));
+                }
+
+                // locate the matching colon (and throw an error if there
+                // isn't one)
+                int q = s.indexOf(':', p + 1);
+                if (q == -1) {
+                    throw new IllegalArgumentException("Parse error at position " + p + " in " + s);
+                }
+
+                // use charSetForCategory() to parse the text in the colons,
+                // and either add or substract the result from our running
+                // result depending on whether the :: expression was
+                // preceded by a ^
+                if (!haveTilde) {
+                    result.internalUnion(charSetForCategory(s.substring(p + 1, q)));
+                }
+                else {
+                    result.internalDifference(charSetForCategory(s.substring(p + 1, q)));
+                }
+
+                // reset everything and advance to the next character
+                haveTilde = false;
+                haveDash = false;
+                wIsReal = false;
+                p = q + 1;
+            }
+
+            // if the character is a dash, set an appropriate flag
+            else if (c == '-') {
+                if (wIsReal) {
+                    haveDash = true;
+                }
+                ++p;
+            }
+
+            // if the character is a caret, flush the single-character
+            // cache and set an appropriate flag.  If the set is empty
+            // (i.e., if the expression begins with ^), invert the set
+            // (i.e., set it to include everything).  The idea here is
+            // that a set that includes nothing but ^ expressions
+            // means "everything but these things".
+            else if (c == '^') {
+                if (wIsReal) {
+                    result.internalUnion(new CharSet(w));
+                    wIsReal = false;
+                }
+                haveTilde = true;
+                ++p;
+                if (result.empty()) {
+                    result.internalComplement();
+                }
+            }
+
+            // throw an exception on an illegal character
+            else if (c >= ' ' && c < '\u007f' && !Character.isLetter((char)c)
+                     && !Character.isDigit((char)c) && c != '\\') {
+                throw new IllegalArgumentException("Parse error at position " + p + " in " + s);
+            }
+
+            // otherwise, we end up here...
+            else {
+                // on a backslash, advance to the next character
+                if (c == '\\') {
+                    ++p;
+                }
+
+                // if the preceding character was a dash, this character
+                // defines the end of a range.  Add or subtract that range
+                // from the running result depending on whether or not it
+                // was preceded by a ^
+                if (haveDash) {
+                    if (s.codePointAt(p) < w) {
+                        throw new IllegalArgumentException("U+" +
+                            Integer.toHexString(s.codePointAt(p))
+                            + " is less than U+" + Integer.toHexString(w) + ".  Dash expressions "
+                            + "can't have their endpoints in reverse order.");
+                    }
+
+                    int ch = s.codePointAt(p);
+                    if (!haveTilde) {
+                        result.internalUnion(new CharSet(w, ch));
+                    }
+                    else {
+                        result.internalDifference(new CharSet(w, ch));
+                    }
+                    p += Character.charCount(ch);
+                    haveDash = false;
+                    haveTilde = false;
+                    wIsReal = false;
+                }
+
+                // if the preceding character was a caret, remove this character
+                // from the running result
+                else if (haveTilde) {
+                    w = s.codePointAt(p);
+                    result.internalDifference(new CharSet(w));
+                    p += Character.charCount(w);
+                    haveTilde = false;
+                    wIsReal = false;
+                }
+
+                // otherwise, flush the single-character cache and then
+                // put this character into the cache
+                else if (wIsReal) {
+                    result.internalUnion(new CharSet(w));
+                    w = s.codePointAt(p);
+                    p += Character.charCount(w);
+                    wIsReal = true;
+                } else {
+                    w = s.codePointAt(p);
+                    p += Character.charCount(w);
+                    wIsReal = true;
+                }
+            }
+        }
+
+        // finally, flush the single-character cache one last time
+        if (wIsReal) {
+            result.internalUnion(new CharSet(w));
+        }
+
+        return result;
+    }
+
+    /**
+     * Creates a CharSet containing all the characters in a particular
+     * Unicode category.  The text is either a two-character code from
+     * the Unicode database or a single character that begins one or more
+     * two-character codes.
+     */
+    private static CharSet charSetForCategory(String category) {
+        // throw an exception if we have anything other than one or two
+        // characters inside the colons
+        if (category.length() == 0 || category.length() >= 3) {
+            throw new IllegalArgumentException("Invalid character category: " + category);
+        }
+
+        // if we have two characters, search the category map for that code
+        // and either construct and return a CharSet from the data in the
+        // category map or throw an exception
+        if (category.length() == 2) {
+            for (int i = 0; i < CharacterCategory.categoryNames.length; i++) {
+                if (CharacterCategory.categoryNames[i].equals(category)) {
+                    return new CharSet(CharacterCategory.getCategoryMap(i));
+                }
+            }
+            throw new IllegalArgumentException("Invalid character category: " + category);
+        }
+
+        // if we have one character, search the category map for codes beginning
+        // with that letter, and union together all of the matching sets that
+        // we find (or throw an exception if there are no matches)
+        else if (category.length() == 1) {
+            CharSet result = new CharSet();
+            for (int i = 0; i < CharacterCategory.categoryNames.length; i++) {
+                if (CharacterCategory.categoryNames[i].startsWith(category)) {
+                    result = result.union(new CharSet(CharacterCategory.getCategoryMap(i)));
+                }
+            }
+            if (result.empty()) {
+                throw new IllegalArgumentException("Invalid character category: " + category);
+            }
+            else {
+                return result;
+            }
+        }
+        return new CharSet(); // should never get here, but to make the compiler happy...
+    }
+
+    /**
+     * Returns a copy of CharSet's expression cache and sets CharSet's
+     * expression cache to empty.
+     */
+    public static Hashtable<String, CharSet> releaseExpressionCache() {
+        Hashtable<String, CharSet> result = expressionCache;
+        expressionCache = null;
+        return result;
+    }
+
+    //==========================================================================
+    // CharSet manipulation
+    //==========================================================================
+    /**
+     * Creates an empty CharSet.
+     */
+    public CharSet() {
+        chars = new int[0];
+    }
+
+    /**
+     * Creates a CharSet containing a single character.
+     * @param c The character to put into the CharSet
+     */
+    public CharSet(int c) {
+        chars = new int[2];
+        chars[0] = c;
+        chars[1] = c;
+    }
+
+    /**
+     * Creates a CharSet containing a range of characters.
+     * @param lo The lowest-numbered character to include in the range
+     * @param hi The highest-numbered character to include in the range
+     */
+    public CharSet(int lo, int hi) {
+        chars = new int[2];
+        if (lo <= hi) {
+            chars[0] = lo;
+            chars[1] = hi;
+        }
+        else {
+            chars[0] = hi;
+            chars[1] = lo;
+        }
+    }
+
+    /**
+     * Creates a CharSet, initializing it from the internal storage
+     * of another CharSet (this function performs no error checking
+     * on "chars", so if it's malformed, undefined behavior will result)
+     */
+    private CharSet(int[] chars) {
+        this.chars = chars;
+    }
+
+    /**
+     * Returns a CharSet representing the union of two CharSets.
+     */
+    public CharSet union(CharSet that) {
+        return new CharSet(doUnion(that.chars));
+    }
+
+    /**
+     * Adds the characters in "that" to this CharSet
+     */
+    private void internalUnion(CharSet that) {
+        chars = doUnion(that.chars);
+    }
+
+    /**
+     * The actual implementation of the union functions
+     */
+    private int[] doUnion(int[] c2) {
+        int[] result = new int[chars.length+c2.length];
+
+        int i = 0;
+        int j = 0;
+        int index = 0;
+
+        // consider all the characters in both strings
+        while (i < chars.length && j < c2.length) {
+            int ub;
+
+            // the first character in the result is the lower of the
+            // starting characters of the two strings, and "ub" gets
+            // set to the upper bound of that range
+            if (chars[i] < c2[j]) {
+                result[index++] = chars[i];
+                ub = chars[++i];
+            }
+            else {
+                result[index++] = c2[j];
+                ub = c2[++j];
+            }
+
+            // for as long as one of our two pointers is pointing to a range's
+            // end point, or i is pointing to a character that is less than
+            // "ub" plus one (the "plus one" stitches touching ranges together)...
+            while (i % 2 == 1 ||
+                   j % 2 == 1 ||
+                   (i < chars.length && chars[i] <= ub + 1)) {
+
+                // advance i to the first character that is greater than
+                // "ub" plus one
+                while (i < chars.length && chars[i] <= ub + 1) {
+                    ++i;
+                }
+
+                // if i points to the endpoint of a range, update "ub"
+                // to that character, or if i points to the start of
+                // a range and the endpoint of the preceding range is
+                // greater than "ub", update "up" to _that_ character
+                if (i % 2 == 1) {
+                    ub = chars[i];
+                }
+                else if (i > 0 && chars[i - 1] > ub) {
+                    ub = chars[i - 1];
+                }
+
+                // now advance j to the first character that is greater
+                // that "ub" plus one
+                while (j < c2.length && c2[j] <= ub + 1) {
+                    ++j;
+                }
+
+                // if j points to the endpoint of a range, update "ub"
+                // to that character, or if j points to the start of
+                // a range and the endpoint of the preceding range is
+                // greater than "ub", update "up" to _that_ character
+                if (j % 2 == 1) {
+                    ub = c2[j];
+                }
+                else if (j > 0 && c2[j - 1] > ub) {
+                    ub = c2[j - 1];
+                }
+            }
+            // when we finally fall out of this loop, we will have stitched
+            // together a series of ranges that overlap or touch, i and j
+            // will both point to starting points of ranges, and "ub" will
+            // be the endpoint of the range we're working on.  Write "ub"
+            // to the result
+            result[index++] = ub;
+
+        // loop back around to create the next range in the result
+        }
+
+        // we fall out to here when we've exhausted all the characters in
+        // one of the operands.  We can append all of the remaining characters
+        // in the other operand without doing any extra work.
+        if (i < chars.length) {
+            for (int k = i; k < chars.length; k++) {
+                result[index++] = chars[k];
+            }
+        }
+        if (j < c2.length) {
+            for (int k = j; k < c2.length; k++) {
+                result[index++] = c2[k];
+            }
+        }
+
+        if (result.length > index) {
+            int[] tmpbuf = new int[index];
+            System.arraycopy(result, 0, tmpbuf, 0, index);
+            return tmpbuf;
+        }
+
+        return result;
+    }
+
+    /**
+     * Returns the intersection of two CharSets.
+     */
+    public CharSet intersection(CharSet that) {
+        return new CharSet(doIntersection(that.chars));
+    }
+
+    /**
+     * Removes from this CharSet any characters that aren't also in "that"
+     */
+    private void internalIntersection(CharSet that) {
+        chars = doIntersection(that.chars);
+    }
+
+    /**
+     * The internal implementation of the two intersection functions
+     */
+    private int[] doIntersection(int[] c2) {
+        int[] result = new int[chars.length+c2.length];
+
+        int i = 0;
+        int j = 0;
+        int oldI;
+        int oldJ;
+        int index = 0;
+
+        // iterate until we've exhausted one of the operands
+        while (i < chars.length && j < c2.length) {
+
+            // advance j until it points to a character that is larger than
+            // the one i points to.  If this is the beginning of a one-
+            // character range, advance j to point to the end
+            if (i < chars.length && i % 2 == 0) {
+                while (j < c2.length && c2[j] < chars[i]) {
+                    ++j;
+                }
+                if (j < c2.length && j % 2 == 0 && c2[j] == chars[i]) {
+                    ++j;
+                }
+            }
+
+            // if j points to the endpoint of a range, save the current
+            // value of i, then advance i until it reaches a character
+            // which is larger than the character pointed at
+            // by j.  All of the characters we've advanced over (except
+            // the one currently pointed to by i) are added to the result
+            oldI = i;
+            while (j % 2 == 1 && i < chars.length && chars[i] <= c2[j]) {
+                ++i;
+            }
+            for (int k = oldI; k < i; k++) {
+                result[index++] = chars[k];
+            }
+
+            // if i points to the endpoint of a range, save the current
+            // value of j, then advance j until it reaches a character
+            // which is larger than the character pointed at
+            // by i.  All of the characters we've advanced over (except
+            // the one currently pointed to by i) are added to the result
+            oldJ = j;
+            while (i % 2 == 1 && j < c2.length && c2[j] <= chars[i]) {
+                ++j;
+            }
+            for (int k = oldJ; k < j; k++) {
+                result[index++] = c2[k];
+            }
+
+            // advance i until it points to a character larger than j
+            // If it points at the beginning of a one-character range,
+            // advance it to the end of that range
+            if (j < c2.length && j % 2 == 0) {
+                while (i < chars.length && chars[i] < c2[j]) {
+                    ++i;
+                }
+                if (i < chars.length && i % 2 == 0 && c2[j] == chars[i]) {
+                    ++i;
+                }
+            }
+        }
+
+        if (result.length > index) {
+            int[] tmpbuf = new int[index];
+            System.arraycopy(result, 0, tmpbuf, 0, index);
+            return tmpbuf;
+        }
+
+        return result;
+    }
+
+    /**
+     * Returns a CharSet containing all the characters in "this" that
+     * aren't also in "that"
+     */
+    public CharSet difference(CharSet that) {
+        return new CharSet(doIntersection(that.doComplement()));
+    }
+
+    /**
+     * Removes from "this" all the characters that are also in "that"
+     */
+    private void internalDifference(CharSet that) {
+        chars = doIntersection(that.doComplement());
+    }
+
+    /**
+     * Returns a CharSet containing all the characters which are not
+     * in "this"
+     */
+    public CharSet complement() {
+        return new CharSet(doComplement());
+    }
+
+    /**
+     * Complements "this".  All the characters it contains are removed,
+     * and all the characters it doesn't contain are added.
+     */
+    private void internalComplement() {
+        chars = doComplement();
+    }
+
+    /**
+     * The internal implementation function for the complement routines
+     */
+    private int[] doComplement() {
+        // the complement of an empty CharSet is one containing everything
+        if (empty()) {
+            int[] result = new int[2];
+            result[0] = 0x0000;
+            result[1] = 0x10FFFF;
+            return result;
+        }
+
+        int[] result = new int[chars.length+2];
+
+        int i = 0;
+        int index = 0;
+
+        // the result begins with \u0000 unless the original CharSet does
+        if (chars[0] != 0x0000) {
+            result[index++] = 0x0000;
+        }
+
+        // walk through the characters in this CharSet.  Append a pair of
+        // characters the first of which is one less than the first
+        // character we see and the second of which is one plus the second
+        // character we see (don't write the first character if it's \u0000,
+        // and don't write the second character if it's \uffff.
+        while (i < chars.length) {
+            if (chars[i] != 0x0000) {
+                result[index++] = chars[i] - 1;
+            }
+            if (chars[i + 1] != 0x10FFFF) {
+                result[index++] = chars[i + 1] + 1;
+            }
+            i += 2;
+        }
+
+        // add 0x10ffff to the end of the result, unless it was in
+        // the original set
+        if (chars[i-1] != 0x10FFFF) {
+            result[index++] = 0x10FFFF;
+        }
+
+        if (result.length > index) {
+            int[] tmpbuf = new int[index];
+            System.arraycopy(result, 0, tmpbuf, 0, index);
+            return tmpbuf;
+        }
+
+        return result;
+    }
+
+    /**
+     * Returns true if this CharSet contains the specified character
+     * @param c The character we're testing for set membership
+     */
+    public boolean contains(int c) {
+        // search for the first range endpoint that is greater than or
+        // equal to c
+        int i = 1;
+        while (i < chars.length && chars[i] < c) {
+            i += 2;
+        }
+
+        // if we've walked off the end, we don't contain c
+        if (i == chars.length) {
+            return false;
+        }
+
+        // otherwise, we contain c if the beginning of the range is less
+        // than or equal to c
+        return chars[i - 1] <= c;
+    }
+
+    /**
+     * Returns true if "that" is another instance of CharSet containing
+     * the exact same characters as this one
+     */
+    public boolean equals(Object that) {
+        return (that instanceof CharSet) && Arrays.equals(chars, ((CharSet)that).chars);
+    }
+
+    /**
+     * Returns the hash code for this set of characters
+     */
+    public int hashCode() {
+       return Arrays.hashCode(chars);
+    }
+
+    /**
+     * Creates a new CharSet that is equal to this one
+     */
+    public Object clone() {
+        return new CharSet(chars);
+    }
+
+    /**
+     * Returns true if this CharSet contains no characters
+     */
+    public boolean empty() {
+        return chars.length == 0;
+    }
+
+    /**
+     * Returns a textual representation of this CharSet.  If the result
+     * of calling this function is passed to CharSet.parseString(), it
+     * will produce another CharSet that is equal to this one.
+     */
+    public String toString() {
+        StringBuffer result = new StringBuffer();
+
+        // the result begins with an opening bracket
+        result.append('[');
+
+        // iterate through the ranges in the CharSet
+        for (int i = 0; i < chars.length; i += 2) {
+            // for a range with the same beginning and ending point,
+            // output that character
+            if (chars[i] == chars[i + 1]) {
+                result.append("0x");
+                result.append(Integer.toHexString(chars[i]));
+            }
+
+            // otherwise, output the start and end points of the range
+            // separated by a dash
+            else {
+                result.append("0x");
+                result.append(Integer.toHexString(chars[i]));
+                result.append("-0x");
+                result.append(Integer.toHexString(chars[i + 1]));
+            }
+        }
+
+        // the result ends with a closing bracket
+        result.append(']');
+        return result.toString();
+    }
+
+    /**
+     * Returns an integer array representing the contents of this CharSet
+     * in the same form in which they're stored internally: as pairs
+     * of characters representing the start and end points of ranges
+     */
+    public int[] getRanges() {
+        return chars;
+    }
+
+    /**
+     * Returns an Enumeration that will return the ranges of characters
+     * contained in this CharSet one at a time
+     */
+    public Enumeration getChars() {
+        return new Enumeration(this);
+    }
+
+    //==========================================================================
+    // CharSet.Enumeration
+    //==========================================================================
+
+    /**
+     * An Enumeration that can be used to extract the character ranges
+     * from a CharSet one at a time
+     */
+    public class Enumeration implements java.util.Enumeration<int[]> {
+        /**
+         * Initializes a CharSet.Enumeration
+         */
+        Enumeration(CharSet cs) {
+            this.chars = cs.chars;
+            p = 0;
+        }
+
+        /**
+         * Returns true if the enumeration hasn't yet returned
+         * all the ranges in the CharSet
+         */
+        public boolean hasMoreElements() {
+            return p < chars.length;
+        }
+
+        /**
+         * Returns the next range in the CarSet
+         */
+        public int[] nextElement() {
+            int[] result = new int[2];
+            result[0] = chars[p++];
+            result[1] = chars[p++];
+            return result;
+        }
+
+        int p;
+        int[] chars;
+    }
+}