jdk/src/share/classes/java/text/RBTableBuilder.java
changeset 2 90ce3da70b43
child 5506 202f599c92aa
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/share/classes/java/text/RBTableBuilder.java	Sat Dec 01 00:00:00 2007 +0000
@@ -0,0 +1,618 @@
+/*
+ * Copyright 1999-2005 Sun Microsystems, Inc.  All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Sun designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Sun in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ */
+
+/*
+ * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
+ * (C) Copyright IBM Corp. 1996-1998 - All Rights Reserved
+ *
+ *   The original version of this source code and documentation is copyrighted
+ * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
+ * materials are provided under terms of a License Agreement between Taligent
+ * and Sun. This technology is protected by multiple US and International
+ * patents. This notice and attribution to Taligent may not be removed.
+ *   Taligent is a registered trademark of Taligent, Inc.
+ *
+ */
+
+package java.text;
+
+import java.util.Vector;
+import sun.text.UCompactIntArray;
+import sun.text.IntHashtable;
+import sun.text.ComposedCharIter;
+import sun.text.CollatorUtilities;
+import sun.text.normalizer.NormalizerImpl;
+
+/**
+ * This class contains all the code to parse a RuleBasedCollator pattern
+ * and build a RBCollationTables object from it.  A particular instance
+ * of tis class exists only during the actual build process-- once an
+ * RBCollationTables object has been built, the RBTableBuilder object
+ * goes away.  This object carries all of the state which is only needed
+ * during the build process, plus a "shadow" copy of all of the state
+ * that will go into the tables object itself.  This object communicates
+ * with RBCollationTables through a separate class, RBCollationTables.BuildAPI,
+ * this is an inner class of RBCollationTables and provides a separate
+ * private API for communication with RBTableBuilder.
+ * This class isn't just an inner class of RBCollationTables itself because
+ * of its large size.  For source-code readability, it seemed better for the
+ * builder to have its own source file.
+ */
+final class RBTableBuilder {
+
+    public RBTableBuilder(RBCollationTables.BuildAPI tables) {
+        this.tables = tables;
+    }
+
+    /**
+     * Create a table-based collation object with the given rules.
+     * This is the main function that actually builds the tables and
+     * stores them back in the RBCollationTables object.  It is called
+     * ONLY by the RBCollationTables constructor.
+     * @see java.util.RuleBasedCollator#RuleBasedCollator
+     * @exception ParseException If the rules format is incorrect.
+     */
+
+    public void build(String pattern, int decmp) throws ParseException
+    {
+        boolean isSource = true;
+        int i = 0;
+        String expChars;
+        String groupChars;
+        if (pattern.length() == 0)
+            throw new ParseException("Build rules empty.", 0);
+
+        // This array maps Unicode characters to their collation ordering
+        mapping = new UCompactIntArray((int)RBCollationTables.UNMAPPED);
+        // Normalize the build rules.  Find occurances of all decomposed characters
+        // and normalize the rules before feeding into the builder.  By "normalize",
+        // we mean that all precomposed Unicode characters must be converted into
+        // a base character and one or more combining characters (such as accents).
+        // When there are multiple combining characters attached to a base character,
+        // the combining characters must be in their canonical order
+        //
+        // sherman/Note:
+        //(1)decmp will be NO_DECOMPOSITION only in ko locale to prevent decompose
+        //hangual syllables to jamos, so we can actually just call decompose with
+        //normalizer's IGNORE_HANGUL option turned on
+        //
+        //(2)just call the "special version" in NormalizerImpl directly
+        //pattern = Normalizer.decompose(pattern, false, Normalizer.IGNORE_HANGUL, true);
+        //
+        //Normalizer.Mode mode = CollatorUtilities.toNormalizerMode(decmp);
+        //pattern = Normalizer.normalize(pattern, mode, 0, true);
+
+        pattern = NormalizerImpl.canonicalDecomposeWithSingleQuotation(pattern);
+
+        // Build the merged collation entries
+        // Since rules can be specified in any order in the string
+        // (e.g. "c , C < d , D < e , E .... C < CH")
+        // this splits all of the rules in the string out into separate
+        // objects and then sorts them.  In the above example, it merges the
+        // "C < CH" rule in just before the "C < D" rule.
+        //
+
+        mPattern = new MergeCollation(pattern);
+
+        int order = 0;
+
+        // Now walk though each entry and add it to my own tables
+        for (i = 0; i < mPattern.getCount(); ++i)
+        {
+            PatternEntry entry = mPattern.getItemAt(i);
+            if (entry != null) {
+                groupChars = entry.getChars();
+                if (groupChars.length() > 1) {
+                    switch(groupChars.charAt(groupChars.length()-1)) {
+                    case '@':
+                        frenchSec = true;
+                        groupChars = groupChars.substring(0, groupChars.length()-1);
+                        break;
+                    case '!':
+                        seAsianSwapping = true;
+                        groupChars = groupChars.substring(0, groupChars.length()-1);
+                        break;
+                    }
+                }
+
+                order = increment(entry.getStrength(), order);
+                expChars = entry.getExtension();
+
+                if (expChars.length() != 0) {
+                    addExpandOrder(groupChars, expChars, order);
+                } else if (groupChars.length() > 1) {
+                    char ch = groupChars.charAt(0);
+                    if (Character.isHighSurrogate(ch) && groupChars.length() == 2) {
+                        addOrder(Character.toCodePoint(ch, groupChars.charAt(1)), order);
+                    } else {
+                        addContractOrder(groupChars, order);
+                    }
+                } else {
+                    char ch = groupChars.charAt(0);
+                    addOrder(ch, order);
+                }
+            }
+        }
+        addComposedChars();
+
+        commit();
+        mapping.compact();
+        /*
+        System.out.println("mappingSize=" + mapping.getKSize());
+        for (int j = 0; j < 0xffff; j++) {
+            int value = mapping.elementAt(j);
+            if (value != RBCollationTables.UNMAPPED)
+                System.out.println("index=" + Integer.toString(j, 16)
+                           + ", value=" + Integer.toString(value, 16));
+        }
+        */
+        tables.fillInTables(frenchSec, seAsianSwapping, mapping, contractTable, expandTable,
+                    contractFlags, maxSecOrder, maxTerOrder);
+    }
+
+    /** Add expanding entries for pre-composed unicode characters so that this
+     * collator can be used reasonably well with decomposition turned off.
+     */
+    private void addComposedChars() throws ParseException {
+        // Iterate through all of the pre-composed characters in Unicode
+        ComposedCharIter iter = new ComposedCharIter();
+        int c;
+        while ((c = iter.next()) != ComposedCharIter.DONE) {
+            if (getCharOrder(c) == RBCollationTables.UNMAPPED) {
+                //
+                // We don't already have an ordering for this pre-composed character.
+                //
+                // First, see if the decomposed string is already in our
+                // tables as a single contracting-string ordering.
+                // If so, just map the precomposed character to that order.
+                //
+                // TODO: What we should really be doing here is trying to find the
+                // longest initial substring of the decomposition that is present
+                // in the tables as a contracting character sequence, and find its
+                // ordering.  Then do this recursively with the remaining chars
+                // so that we build a list of orderings, and add that list to
+                // the expansion table.
+                // That would be more correct but also significantly slower, so
+                // I'm not totally sure it's worth doing.
+                //
+                String s = iter.decomposition();
+
+                //sherman/Note: if this is 1 character decomposed string, the
+                //only thing need to do is to check if this decomposed character
+                //has an entry in our order table, this order is not necessary
+                //to be a contraction order, if it does have one, add an entry
+                //for the precomposed character by using the same order, the
+                //previous impl unnecessarily adds a single character expansion
+                //entry.
+                if (s.length() == 1) {
+                    int order = getCharOrder(s.charAt(0));
+                    if (order != RBCollationTables.UNMAPPED) {
+                        addOrder(c, order);
+                    }
+                    continue;
+                } else if (s.length() == 2) {
+                    char ch0 = s.charAt(0);
+                    if (Character.isHighSurrogate(ch0)) {
+                        int order = getCharOrder(s.codePointAt(0));
+                        if (order != RBCollationTables.UNMAPPED) {
+                            addOrder(c, order);
+                        }
+                        continue;
+                    }
+                }
+                int contractOrder = getContractOrder(s);
+                if (contractOrder != RBCollationTables.UNMAPPED) {
+                    addOrder(c, contractOrder);
+                } else {
+                    //
+                    // We don't have a contracting ordering for the entire string
+                    // that results from the decomposition, but if we have orders
+                    // for each individual character, we can add an expanding
+                    // table entry for the pre-composed character
+                    //
+                    boolean allThere = true;
+                    for (int i = 0; i < s.length(); i++) {
+                        if (getCharOrder(s.charAt(i)) == RBCollationTables.UNMAPPED) {
+                            allThere = false;
+                            break;
+                        }
+                    }
+                    if (allThere) {
+                        addExpandOrder(c, s, RBCollationTables.UNMAPPED);
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Look up for unmapped values in the expanded character table.
+     *
+     * When the expanding character tables are built by addExpandOrder,
+     * it doesn't know what the final ordering of each character
+     * in the expansion will be.  Instead, it just puts the raw character
+     * code into the table, adding CHARINDEX as a flag.  Now that we've
+     * finished building the mapping table, we can go back and look up
+     * that character to see what its real collation order is and
+     * stick that into the expansion table.  That lets us avoid doing
+     * a two-stage lookup later.
+     */
+    private final void commit()
+    {
+        if (expandTable != null) {
+            for (int i = 0; i < expandTable.size(); i++) {
+                int[] valueList = (int [])expandTable.elementAt(i);
+                for (int j = 0; j < valueList.length; j++) {
+                    int order = valueList[j];
+                    if (order < RBCollationTables.EXPANDCHARINDEX && order > CHARINDEX) {
+                        // found a expanding character that isn't filled in yet
+                        int ch = order - CHARINDEX;
+
+                        // Get the real values for the non-filled entry
+                        int realValue = getCharOrder(ch);
+
+                        if (realValue == RBCollationTables.UNMAPPED) {
+                            // The real value is still unmapped, maybe it's ignorable
+                            valueList[j] = IGNORABLEMASK & ch;
+                        } else {
+                            // just fill in the value
+                            valueList[j] = realValue;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    /**
+     *  Increment of the last order based on the comparison level.
+     */
+    private final int increment(int aStrength, int lastValue)
+    {
+        switch(aStrength)
+        {
+        case Collator.PRIMARY:
+            // increment priamry order  and mask off secondary and tertiary difference
+            lastValue += PRIMARYORDERINCREMENT;
+            lastValue &= RBCollationTables.PRIMARYORDERMASK;
+            isOverIgnore = true;
+            break;
+        case Collator.SECONDARY:
+            // increment secondary order and mask off tertiary difference
+            lastValue += SECONDARYORDERINCREMENT;
+            lastValue &= RBCollationTables.SECONDARYDIFFERENCEONLY;
+            // record max # of ignorable chars with secondary difference
+            if (!isOverIgnore)
+                maxSecOrder++;
+            break;
+        case Collator.TERTIARY:
+            // increment tertiary order
+            lastValue += TERTIARYORDERINCREMENT;
+            // record max # of ignorable chars with tertiary difference
+            if (!isOverIgnore)
+                maxTerOrder++;
+            break;
+        }
+        return lastValue;
+    }
+
+    /**
+     *  Adds a character and its designated order into the collation table.
+     */
+    private final void addOrder(int ch, int anOrder)
+    {
+        // See if the char already has an order in the mapping table
+        int order = mapping.elementAt(ch);
+
+        if (order >= RBCollationTables.CONTRACTCHARINDEX) {
+            // There's already an entry for this character that points to a contracting
+            // character table.  Instead of adding the character directly to the mapping
+            // table, we must add it to the contract table instead.
+            int length = 1;
+            if (Character.isSupplementaryCodePoint(ch)) {
+                length = Character.toChars(ch, keyBuf, 0);
+            } else {
+                keyBuf[0] = (char)ch;
+            }
+            addContractOrder(new String(keyBuf, 0, length), anOrder);
+        } else {
+            // add the entry to the mapping table,
+            // the same later entry replaces the previous one
+            mapping.setElementAt(ch, anOrder);
+        }
+    }
+
+    private final void addContractOrder(String groupChars, int anOrder) {
+        addContractOrder(groupChars, anOrder, true);
+    }
+
+    /**
+     *  Adds the contracting string into the collation table.
+     */
+    private final void addContractOrder(String groupChars, int anOrder,
+                                          boolean fwd)
+    {
+        if (contractTable == null) {
+            contractTable = new Vector(INITIALTABLESIZE);
+        }
+
+        //initial character
+        int ch = groupChars.codePointAt(0);
+        /*
+        char ch0 = groupChars.charAt(0);
+        int ch = Character.isHighSurrogate(ch0)?
+          Character.toCodePoint(ch0, groupChars.charAt(1)):ch0;
+          */
+        // See if the initial character of the string already has a contract table.
+        int entry = mapping.elementAt(ch);
+        Vector entryTable = getContractValuesImpl(entry - RBCollationTables.CONTRACTCHARINDEX);
+
+        if (entryTable == null) {
+            // We need to create a new table of contract entries for this base char
+            int tableIndex = RBCollationTables.CONTRACTCHARINDEX + contractTable.size();
+            entryTable = new Vector(INITIALTABLESIZE);
+            contractTable.addElement(entryTable);
+
+            // Add the initial character's current ordering first. then
+            // update its mapping to point to this contract table
+            entryTable.addElement(new EntryPair(groupChars.substring(0,Character.charCount(ch)), entry));
+            mapping.setElementAt(ch, tableIndex);
+        }
+
+        // Now add (or replace) this string in the table
+        int index = RBCollationTables.getEntry(entryTable, groupChars, fwd);
+        if (index != RBCollationTables.UNMAPPED) {
+            EntryPair pair = (EntryPair) entryTable.elementAt(index);
+            pair.value = anOrder;
+        } else {
+            EntryPair pair = (EntryPair)entryTable.lastElement();
+
+            // NOTE:  This little bit of logic is here to speed CollationElementIterator
+            // .nextContractChar().  This code ensures that the longest sequence in
+            // this list is always the _last_ one in the list.  This keeps
+            // nextContractChar() from having to search the entire list for the longest
+            // sequence.
+            if (groupChars.length() > pair.entryName.length()) {
+                entryTable.addElement(new EntryPair(groupChars, anOrder, fwd));
+            } else {
+                entryTable.insertElementAt(new EntryPair(groupChars, anOrder,
+                        fwd), entryTable.size() - 1);
+            }
+        }
+
+        // If this was a forward mapping for a contracting string, also add a
+        // reverse mapping for it, so that CollationElementIterator.previous
+        // can work right
+        if (fwd && groupChars.length() > 1) {
+            addContractFlags(groupChars);
+            addContractOrder(new StringBuffer(groupChars).reverse().toString(),
+                             anOrder, false);
+        }
+    }
+
+    /**
+     * If the given string has been specified as a contracting string
+     * in this collation table, return its ordering.
+     * Otherwise return UNMAPPED.
+     */
+    private int getContractOrder(String groupChars)
+    {
+        int result = RBCollationTables.UNMAPPED;
+        if (contractTable != null) {
+            int ch = groupChars.codePointAt(0);
+            /*
+            char ch0 = groupChars.charAt(0);
+            int ch = Character.isHighSurrogate(ch0)?
+              Character.toCodePoint(ch0, groupChars.charAt(1)):ch0;
+              */
+            Vector entryTable = getContractValues(ch);
+            if (entryTable != null) {
+                int index = RBCollationTables.getEntry(entryTable, groupChars, true);
+                if (index != RBCollationTables.UNMAPPED) {
+                    EntryPair pair = (EntryPair) entryTable.elementAt(index);
+                    result = pair.value;
+                }
+            }
+        }
+        return result;
+    }
+
+    private final int getCharOrder(int ch) {
+        int order = mapping.elementAt(ch);
+
+        if (order >= RBCollationTables.CONTRACTCHARINDEX) {
+            Vector groupList = getContractValuesImpl(order - RBCollationTables.CONTRACTCHARINDEX);
+            EntryPair pair = (EntryPair)groupList.firstElement();
+            order = pair.value;
+        }
+        return order;
+    }
+
+    /**
+     *  Get the entry of hash table of the contracting string in the collation
+     *  table.
+     *  @param ch the starting character of the contracting string
+     */
+    private Vector getContractValues(int ch)
+    {
+        int index = mapping.elementAt(ch);
+        return getContractValuesImpl(index - RBCollationTables.CONTRACTCHARINDEX);
+    }
+
+    private Vector getContractValuesImpl(int index)
+    {
+        if (index >= 0)
+        {
+            return (Vector)contractTable.elementAt(index);
+        }
+        else // not found
+        {
+            return null;
+        }
+    }
+
+    /**
+     *  Adds the expanding string into the collation table.
+     */
+    private final void addExpandOrder(String contractChars,
+                                String expandChars,
+                                int anOrder) throws ParseException
+    {
+        // Create an expansion table entry
+        int tableIndex = addExpansion(anOrder, expandChars);
+
+        // And add its index into the main mapping table
+        if (contractChars.length() > 1) {
+            char ch = contractChars.charAt(0);
+            if (Character.isHighSurrogate(ch) && contractChars.length() == 2) {
+                char ch2 = contractChars.charAt(1);
+                if (Character.isLowSurrogate(ch2)) {
+                    //only add into table when it is a legal surrogate
+                    addOrder(Character.toCodePoint(ch, ch2), tableIndex);
+                }
+            } else {
+                addContractOrder(contractChars, tableIndex);
+            }
+        } else {
+            addOrder(contractChars.charAt(0), tableIndex);
+        }
+    }
+
+    private final void addExpandOrder(int ch, String expandChars, int anOrder)
+      throws ParseException
+    {
+        int tableIndex = addExpansion(anOrder, expandChars);
+        addOrder(ch, tableIndex);
+    }
+
+    /**
+     * Create a new entry in the expansion table that contains the orderings
+     * for the given characers.  If anOrder is valid, it is added to the
+     * beginning of the expanded list of orders.
+     */
+    private int addExpansion(int anOrder, String expandChars) {
+        if (expandTable == null) {
+            expandTable = new Vector(INITIALTABLESIZE);
+        }
+
+        // If anOrder is valid, we want to add it at the beginning of the list
+        int offset = (anOrder == RBCollationTables.UNMAPPED) ? 0 : 1;
+
+        int[] valueList = new int[expandChars.length() + offset];
+        if (offset == 1) {
+            valueList[0] = anOrder;
+        }
+
+        int j = offset;
+        for (int i = 0; i < expandChars.length(); i++) {
+            char ch0 = expandChars.charAt(i);
+            char ch1;
+            int ch;
+            if (Character.isHighSurrogate(ch0)) {
+                if (++i == expandChars.length() ||
+                    !Character.isLowSurrogate(ch1=expandChars.charAt(i))) {
+                    //ether we are missing the low surrogate or the next char
+                    //is not a legal low surrogate, so stop loop
+                    break;
+                }
+                ch = Character.toCodePoint(ch0, ch1);
+
+            } else {
+                ch = ch0;
+            }
+
+            int mapValue = getCharOrder(ch);
+
+            if (mapValue != RBCollationTables.UNMAPPED) {
+                valueList[j++] = mapValue;
+            } else {
+                // can't find it in the table, will be filled in by commit().
+                valueList[j++] = CHARINDEX + ch;
+            }
+        }
+        if (j < valueList.length) {
+            //we had at least one supplementary character, the size of valueList
+            //is bigger than it really needs...
+            int[] tmpBuf = new int[j];
+            while (--j >= 0) {
+                tmpBuf[j] = valueList[j];
+            }
+            valueList = tmpBuf;
+        }
+        // Add the expanding char list into the expansion table.
+        int tableIndex = RBCollationTables.EXPANDCHARINDEX + expandTable.size();
+        expandTable.addElement(valueList);
+
+        return tableIndex;
+    }
+
+    private void addContractFlags(String chars) {
+        char c0;
+        int c;
+        int len = chars.length();
+        for (int i = 0; i < len; i++) {
+            c0 = chars.charAt(i);
+            c = Character.isHighSurrogate(c0)
+                          ?Character.toCodePoint(c0, chars.charAt(++i))
+                          :c0;
+            contractFlags.put(c, 1);
+        }
+    }
+
+    // ==============================================================
+    // constants
+    // ==============================================================
+    final static int CHARINDEX = 0x70000000;  // need look up in .commit()
+
+    private final static int IGNORABLEMASK = 0x0000ffff;
+    private final static int PRIMARYORDERINCREMENT = 0x00010000;
+    private final static int SECONDARYORDERINCREMENT = 0x00000100;
+    private final static int TERTIARYORDERINCREMENT = 0x00000001;
+    private final static int INITIALTABLESIZE = 20;
+    private final static int MAXKEYSIZE = 5;
+
+    // ==============================================================
+    // instance variables
+    // ==============================================================
+
+    // variables used by the build process
+    private RBCollationTables.BuildAPI tables = null;
+    private MergeCollation mPattern = null;
+    private boolean isOverIgnore = false;
+    private char[] keyBuf = new char[MAXKEYSIZE];
+    private IntHashtable contractFlags = new IntHashtable(100);
+
+    // "shadow" copies of the instance variables in RBCollationTables
+    // (the values in these variables are copied back into RBCollationTables
+    // at the end of the build process)
+    private boolean frenchSec = false;
+    private boolean seAsianSwapping = false;
+
+    private UCompactIntArray mapping = null;
+    private Vector   contractTable = null;
+    private Vector   expandTable = null;
+
+    private short maxSecOrder = 0;
+    private short maxTerOrder = 0;
+}