--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/share/classes/java/text/RBTableBuilder.java Sat Dec 01 00:00:00 2007 +0000
@@ -0,0 +1,618 @@
+/*
+ * Copyright 1999-2005 Sun Microsystems, Inc. All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Sun designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Sun in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ */
+
+/*
+ * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
+ * (C) Copyright IBM Corp. 1996-1998 - All Rights Reserved
+ *
+ * The original version of this source code and documentation is copyrighted
+ * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
+ * materials are provided under terms of a License Agreement between Taligent
+ * and Sun. This technology is protected by multiple US and International
+ * patents. This notice and attribution to Taligent may not be removed.
+ * Taligent is a registered trademark of Taligent, Inc.
+ *
+ */
+
+package java.text;
+
+import java.util.Vector;
+import sun.text.UCompactIntArray;
+import sun.text.IntHashtable;
+import sun.text.ComposedCharIter;
+import sun.text.CollatorUtilities;
+import sun.text.normalizer.NormalizerImpl;
+
+/**
+ * This class contains all the code to parse a RuleBasedCollator pattern
+ * and build a RBCollationTables object from it. A particular instance
+ * of tis class exists only during the actual build process-- once an
+ * RBCollationTables object has been built, the RBTableBuilder object
+ * goes away. This object carries all of the state which is only needed
+ * during the build process, plus a "shadow" copy of all of the state
+ * that will go into the tables object itself. This object communicates
+ * with RBCollationTables through a separate class, RBCollationTables.BuildAPI,
+ * this is an inner class of RBCollationTables and provides a separate
+ * private API for communication with RBTableBuilder.
+ * This class isn't just an inner class of RBCollationTables itself because
+ * of its large size. For source-code readability, it seemed better for the
+ * builder to have its own source file.
+ */
+final class RBTableBuilder {
+
+ public RBTableBuilder(RBCollationTables.BuildAPI tables) {
+ this.tables = tables;
+ }
+
+ /**
+ * Create a table-based collation object with the given rules.
+ * This is the main function that actually builds the tables and
+ * stores them back in the RBCollationTables object. It is called
+ * ONLY by the RBCollationTables constructor.
+ * @see java.util.RuleBasedCollator#RuleBasedCollator
+ * @exception ParseException If the rules format is incorrect.
+ */
+
+ public void build(String pattern, int decmp) throws ParseException
+ {
+ boolean isSource = true;
+ int i = 0;
+ String expChars;
+ String groupChars;
+ if (pattern.length() == 0)
+ throw new ParseException("Build rules empty.", 0);
+
+ // This array maps Unicode characters to their collation ordering
+ mapping = new UCompactIntArray((int)RBCollationTables.UNMAPPED);
+ // Normalize the build rules. Find occurances of all decomposed characters
+ // and normalize the rules before feeding into the builder. By "normalize",
+ // we mean that all precomposed Unicode characters must be converted into
+ // a base character and one or more combining characters (such as accents).
+ // When there are multiple combining characters attached to a base character,
+ // the combining characters must be in their canonical order
+ //
+ // sherman/Note:
+ //(1)decmp will be NO_DECOMPOSITION only in ko locale to prevent decompose
+ //hangual syllables to jamos, so we can actually just call decompose with
+ //normalizer's IGNORE_HANGUL option turned on
+ //
+ //(2)just call the "special version" in NormalizerImpl directly
+ //pattern = Normalizer.decompose(pattern, false, Normalizer.IGNORE_HANGUL, true);
+ //
+ //Normalizer.Mode mode = CollatorUtilities.toNormalizerMode(decmp);
+ //pattern = Normalizer.normalize(pattern, mode, 0, true);
+
+ pattern = NormalizerImpl.canonicalDecomposeWithSingleQuotation(pattern);
+
+ // Build the merged collation entries
+ // Since rules can be specified in any order in the string
+ // (e.g. "c , C < d , D < e , E .... C < CH")
+ // this splits all of the rules in the string out into separate
+ // objects and then sorts them. In the above example, it merges the
+ // "C < CH" rule in just before the "C < D" rule.
+ //
+
+ mPattern = new MergeCollation(pattern);
+
+ int order = 0;
+
+ // Now walk though each entry and add it to my own tables
+ for (i = 0; i < mPattern.getCount(); ++i)
+ {
+ PatternEntry entry = mPattern.getItemAt(i);
+ if (entry != null) {
+ groupChars = entry.getChars();
+ if (groupChars.length() > 1) {
+ switch(groupChars.charAt(groupChars.length()-1)) {
+ case '@':
+ frenchSec = true;
+ groupChars = groupChars.substring(0, groupChars.length()-1);
+ break;
+ case '!':
+ seAsianSwapping = true;
+ groupChars = groupChars.substring(0, groupChars.length()-1);
+ break;
+ }
+ }
+
+ order = increment(entry.getStrength(), order);
+ expChars = entry.getExtension();
+
+ if (expChars.length() != 0) {
+ addExpandOrder(groupChars, expChars, order);
+ } else if (groupChars.length() > 1) {
+ char ch = groupChars.charAt(0);
+ if (Character.isHighSurrogate(ch) && groupChars.length() == 2) {
+ addOrder(Character.toCodePoint(ch, groupChars.charAt(1)), order);
+ } else {
+ addContractOrder(groupChars, order);
+ }
+ } else {
+ char ch = groupChars.charAt(0);
+ addOrder(ch, order);
+ }
+ }
+ }
+ addComposedChars();
+
+ commit();
+ mapping.compact();
+ /*
+ System.out.println("mappingSize=" + mapping.getKSize());
+ for (int j = 0; j < 0xffff; j++) {
+ int value = mapping.elementAt(j);
+ if (value != RBCollationTables.UNMAPPED)
+ System.out.println("index=" + Integer.toString(j, 16)
+ + ", value=" + Integer.toString(value, 16));
+ }
+ */
+ tables.fillInTables(frenchSec, seAsianSwapping, mapping, contractTable, expandTable,
+ contractFlags, maxSecOrder, maxTerOrder);
+ }
+
+ /** Add expanding entries for pre-composed unicode characters so that this
+ * collator can be used reasonably well with decomposition turned off.
+ */
+ private void addComposedChars() throws ParseException {
+ // Iterate through all of the pre-composed characters in Unicode
+ ComposedCharIter iter = new ComposedCharIter();
+ int c;
+ while ((c = iter.next()) != ComposedCharIter.DONE) {
+ if (getCharOrder(c) == RBCollationTables.UNMAPPED) {
+ //
+ // We don't already have an ordering for this pre-composed character.
+ //
+ // First, see if the decomposed string is already in our
+ // tables as a single contracting-string ordering.
+ // If so, just map the precomposed character to that order.
+ //
+ // TODO: What we should really be doing here is trying to find the
+ // longest initial substring of the decomposition that is present
+ // in the tables as a contracting character sequence, and find its
+ // ordering. Then do this recursively with the remaining chars
+ // so that we build a list of orderings, and add that list to
+ // the expansion table.
+ // That would be more correct but also significantly slower, so
+ // I'm not totally sure it's worth doing.
+ //
+ String s = iter.decomposition();
+
+ //sherman/Note: if this is 1 character decomposed string, the
+ //only thing need to do is to check if this decomposed character
+ //has an entry in our order table, this order is not necessary
+ //to be a contraction order, if it does have one, add an entry
+ //for the precomposed character by using the same order, the
+ //previous impl unnecessarily adds a single character expansion
+ //entry.
+ if (s.length() == 1) {
+ int order = getCharOrder(s.charAt(0));
+ if (order != RBCollationTables.UNMAPPED) {
+ addOrder(c, order);
+ }
+ continue;
+ } else if (s.length() == 2) {
+ char ch0 = s.charAt(0);
+ if (Character.isHighSurrogate(ch0)) {
+ int order = getCharOrder(s.codePointAt(0));
+ if (order != RBCollationTables.UNMAPPED) {
+ addOrder(c, order);
+ }
+ continue;
+ }
+ }
+ int contractOrder = getContractOrder(s);
+ if (contractOrder != RBCollationTables.UNMAPPED) {
+ addOrder(c, contractOrder);
+ } else {
+ //
+ // We don't have a contracting ordering for the entire string
+ // that results from the decomposition, but if we have orders
+ // for each individual character, we can add an expanding
+ // table entry for the pre-composed character
+ //
+ boolean allThere = true;
+ for (int i = 0; i < s.length(); i++) {
+ if (getCharOrder(s.charAt(i)) == RBCollationTables.UNMAPPED) {
+ allThere = false;
+ break;
+ }
+ }
+ if (allThere) {
+ addExpandOrder(c, s, RBCollationTables.UNMAPPED);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Look up for unmapped values in the expanded character table.
+ *
+ * When the expanding character tables are built by addExpandOrder,
+ * it doesn't know what the final ordering of each character
+ * in the expansion will be. Instead, it just puts the raw character
+ * code into the table, adding CHARINDEX as a flag. Now that we've
+ * finished building the mapping table, we can go back and look up
+ * that character to see what its real collation order is and
+ * stick that into the expansion table. That lets us avoid doing
+ * a two-stage lookup later.
+ */
+ private final void commit()
+ {
+ if (expandTable != null) {
+ for (int i = 0; i < expandTable.size(); i++) {
+ int[] valueList = (int [])expandTable.elementAt(i);
+ for (int j = 0; j < valueList.length; j++) {
+ int order = valueList[j];
+ if (order < RBCollationTables.EXPANDCHARINDEX && order > CHARINDEX) {
+ // found a expanding character that isn't filled in yet
+ int ch = order - CHARINDEX;
+
+ // Get the real values for the non-filled entry
+ int realValue = getCharOrder(ch);
+
+ if (realValue == RBCollationTables.UNMAPPED) {
+ // The real value is still unmapped, maybe it's ignorable
+ valueList[j] = IGNORABLEMASK & ch;
+ } else {
+ // just fill in the value
+ valueList[j] = realValue;
+ }
+ }
+ }
+ }
+ }
+ }
+ /**
+ * Increment of the last order based on the comparison level.
+ */
+ private final int increment(int aStrength, int lastValue)
+ {
+ switch(aStrength)
+ {
+ case Collator.PRIMARY:
+ // increment priamry order and mask off secondary and tertiary difference
+ lastValue += PRIMARYORDERINCREMENT;
+ lastValue &= RBCollationTables.PRIMARYORDERMASK;
+ isOverIgnore = true;
+ break;
+ case Collator.SECONDARY:
+ // increment secondary order and mask off tertiary difference
+ lastValue += SECONDARYORDERINCREMENT;
+ lastValue &= RBCollationTables.SECONDARYDIFFERENCEONLY;
+ // record max # of ignorable chars with secondary difference
+ if (!isOverIgnore)
+ maxSecOrder++;
+ break;
+ case Collator.TERTIARY:
+ // increment tertiary order
+ lastValue += TERTIARYORDERINCREMENT;
+ // record max # of ignorable chars with tertiary difference
+ if (!isOverIgnore)
+ maxTerOrder++;
+ break;
+ }
+ return lastValue;
+ }
+
+ /**
+ * Adds a character and its designated order into the collation table.
+ */
+ private final void addOrder(int ch, int anOrder)
+ {
+ // See if the char already has an order in the mapping table
+ int order = mapping.elementAt(ch);
+
+ if (order >= RBCollationTables.CONTRACTCHARINDEX) {
+ // There's already an entry for this character that points to a contracting
+ // character table. Instead of adding the character directly to the mapping
+ // table, we must add it to the contract table instead.
+ int length = 1;
+ if (Character.isSupplementaryCodePoint(ch)) {
+ length = Character.toChars(ch, keyBuf, 0);
+ } else {
+ keyBuf[0] = (char)ch;
+ }
+ addContractOrder(new String(keyBuf, 0, length), anOrder);
+ } else {
+ // add the entry to the mapping table,
+ // the same later entry replaces the previous one
+ mapping.setElementAt(ch, anOrder);
+ }
+ }
+
+ private final void addContractOrder(String groupChars, int anOrder) {
+ addContractOrder(groupChars, anOrder, true);
+ }
+
+ /**
+ * Adds the contracting string into the collation table.
+ */
+ private final void addContractOrder(String groupChars, int anOrder,
+ boolean fwd)
+ {
+ if (contractTable == null) {
+ contractTable = new Vector(INITIALTABLESIZE);
+ }
+
+ //initial character
+ int ch = groupChars.codePointAt(0);
+ /*
+ char ch0 = groupChars.charAt(0);
+ int ch = Character.isHighSurrogate(ch0)?
+ Character.toCodePoint(ch0, groupChars.charAt(1)):ch0;
+ */
+ // See if the initial character of the string already has a contract table.
+ int entry = mapping.elementAt(ch);
+ Vector entryTable = getContractValuesImpl(entry - RBCollationTables.CONTRACTCHARINDEX);
+
+ if (entryTable == null) {
+ // We need to create a new table of contract entries for this base char
+ int tableIndex = RBCollationTables.CONTRACTCHARINDEX + contractTable.size();
+ entryTable = new Vector(INITIALTABLESIZE);
+ contractTable.addElement(entryTable);
+
+ // Add the initial character's current ordering first. then
+ // update its mapping to point to this contract table
+ entryTable.addElement(new EntryPair(groupChars.substring(0,Character.charCount(ch)), entry));
+ mapping.setElementAt(ch, tableIndex);
+ }
+
+ // Now add (or replace) this string in the table
+ int index = RBCollationTables.getEntry(entryTable, groupChars, fwd);
+ if (index != RBCollationTables.UNMAPPED) {
+ EntryPair pair = (EntryPair) entryTable.elementAt(index);
+ pair.value = anOrder;
+ } else {
+ EntryPair pair = (EntryPair)entryTable.lastElement();
+
+ // NOTE: This little bit of logic is here to speed CollationElementIterator
+ // .nextContractChar(). This code ensures that the longest sequence in
+ // this list is always the _last_ one in the list. This keeps
+ // nextContractChar() from having to search the entire list for the longest
+ // sequence.
+ if (groupChars.length() > pair.entryName.length()) {
+ entryTable.addElement(new EntryPair(groupChars, anOrder, fwd));
+ } else {
+ entryTable.insertElementAt(new EntryPair(groupChars, anOrder,
+ fwd), entryTable.size() - 1);
+ }
+ }
+
+ // If this was a forward mapping for a contracting string, also add a
+ // reverse mapping for it, so that CollationElementIterator.previous
+ // can work right
+ if (fwd && groupChars.length() > 1) {
+ addContractFlags(groupChars);
+ addContractOrder(new StringBuffer(groupChars).reverse().toString(),
+ anOrder, false);
+ }
+ }
+
+ /**
+ * If the given string has been specified as a contracting string
+ * in this collation table, return its ordering.
+ * Otherwise return UNMAPPED.
+ */
+ private int getContractOrder(String groupChars)
+ {
+ int result = RBCollationTables.UNMAPPED;
+ if (contractTable != null) {
+ int ch = groupChars.codePointAt(0);
+ /*
+ char ch0 = groupChars.charAt(0);
+ int ch = Character.isHighSurrogate(ch0)?
+ Character.toCodePoint(ch0, groupChars.charAt(1)):ch0;
+ */
+ Vector entryTable = getContractValues(ch);
+ if (entryTable != null) {
+ int index = RBCollationTables.getEntry(entryTable, groupChars, true);
+ if (index != RBCollationTables.UNMAPPED) {
+ EntryPair pair = (EntryPair) entryTable.elementAt(index);
+ result = pair.value;
+ }
+ }
+ }
+ return result;
+ }
+
+ private final int getCharOrder(int ch) {
+ int order = mapping.elementAt(ch);
+
+ if (order >= RBCollationTables.CONTRACTCHARINDEX) {
+ Vector groupList = getContractValuesImpl(order - RBCollationTables.CONTRACTCHARINDEX);
+ EntryPair pair = (EntryPair)groupList.firstElement();
+ order = pair.value;
+ }
+ return order;
+ }
+
+ /**
+ * Get the entry of hash table of the contracting string in the collation
+ * table.
+ * @param ch the starting character of the contracting string
+ */
+ private Vector getContractValues(int ch)
+ {
+ int index = mapping.elementAt(ch);
+ return getContractValuesImpl(index - RBCollationTables.CONTRACTCHARINDEX);
+ }
+
+ private Vector getContractValuesImpl(int index)
+ {
+ if (index >= 0)
+ {
+ return (Vector)contractTable.elementAt(index);
+ }
+ else // not found
+ {
+ return null;
+ }
+ }
+
+ /**
+ * Adds the expanding string into the collation table.
+ */
+ private final void addExpandOrder(String contractChars,
+ String expandChars,
+ int anOrder) throws ParseException
+ {
+ // Create an expansion table entry
+ int tableIndex = addExpansion(anOrder, expandChars);
+
+ // And add its index into the main mapping table
+ if (contractChars.length() > 1) {
+ char ch = contractChars.charAt(0);
+ if (Character.isHighSurrogate(ch) && contractChars.length() == 2) {
+ char ch2 = contractChars.charAt(1);
+ if (Character.isLowSurrogate(ch2)) {
+ //only add into table when it is a legal surrogate
+ addOrder(Character.toCodePoint(ch, ch2), tableIndex);
+ }
+ } else {
+ addContractOrder(contractChars, tableIndex);
+ }
+ } else {
+ addOrder(contractChars.charAt(0), tableIndex);
+ }
+ }
+
+ private final void addExpandOrder(int ch, String expandChars, int anOrder)
+ throws ParseException
+ {
+ int tableIndex = addExpansion(anOrder, expandChars);
+ addOrder(ch, tableIndex);
+ }
+
+ /**
+ * Create a new entry in the expansion table that contains the orderings
+ * for the given characers. If anOrder is valid, it is added to the
+ * beginning of the expanded list of orders.
+ */
+ private int addExpansion(int anOrder, String expandChars) {
+ if (expandTable == null) {
+ expandTable = new Vector(INITIALTABLESIZE);
+ }
+
+ // If anOrder is valid, we want to add it at the beginning of the list
+ int offset = (anOrder == RBCollationTables.UNMAPPED) ? 0 : 1;
+
+ int[] valueList = new int[expandChars.length() + offset];
+ if (offset == 1) {
+ valueList[0] = anOrder;
+ }
+
+ int j = offset;
+ for (int i = 0; i < expandChars.length(); i++) {
+ char ch0 = expandChars.charAt(i);
+ char ch1;
+ int ch;
+ if (Character.isHighSurrogate(ch0)) {
+ if (++i == expandChars.length() ||
+ !Character.isLowSurrogate(ch1=expandChars.charAt(i))) {
+ //ether we are missing the low surrogate or the next char
+ //is not a legal low surrogate, so stop loop
+ break;
+ }
+ ch = Character.toCodePoint(ch0, ch1);
+
+ } else {
+ ch = ch0;
+ }
+
+ int mapValue = getCharOrder(ch);
+
+ if (mapValue != RBCollationTables.UNMAPPED) {
+ valueList[j++] = mapValue;
+ } else {
+ // can't find it in the table, will be filled in by commit().
+ valueList[j++] = CHARINDEX + ch;
+ }
+ }
+ if (j < valueList.length) {
+ //we had at least one supplementary character, the size of valueList
+ //is bigger than it really needs...
+ int[] tmpBuf = new int[j];
+ while (--j >= 0) {
+ tmpBuf[j] = valueList[j];
+ }
+ valueList = tmpBuf;
+ }
+ // Add the expanding char list into the expansion table.
+ int tableIndex = RBCollationTables.EXPANDCHARINDEX + expandTable.size();
+ expandTable.addElement(valueList);
+
+ return tableIndex;
+ }
+
+ private void addContractFlags(String chars) {
+ char c0;
+ int c;
+ int len = chars.length();
+ for (int i = 0; i < len; i++) {
+ c0 = chars.charAt(i);
+ c = Character.isHighSurrogate(c0)
+ ?Character.toCodePoint(c0, chars.charAt(++i))
+ :c0;
+ contractFlags.put(c, 1);
+ }
+ }
+
+ // ==============================================================
+ // constants
+ // ==============================================================
+ final static int CHARINDEX = 0x70000000; // need look up in .commit()
+
+ private final static int IGNORABLEMASK = 0x0000ffff;
+ private final static int PRIMARYORDERINCREMENT = 0x00010000;
+ private final static int SECONDARYORDERINCREMENT = 0x00000100;
+ private final static int TERTIARYORDERINCREMENT = 0x00000001;
+ private final static int INITIALTABLESIZE = 20;
+ private final static int MAXKEYSIZE = 5;
+
+ // ==============================================================
+ // instance variables
+ // ==============================================================
+
+ // variables used by the build process
+ private RBCollationTables.BuildAPI tables = null;
+ private MergeCollation mPattern = null;
+ private boolean isOverIgnore = false;
+ private char[] keyBuf = new char[MAXKEYSIZE];
+ private IntHashtable contractFlags = new IntHashtable(100);
+
+ // "shadow" copies of the instance variables in RBCollationTables
+ // (the values in these variables are copied back into RBCollationTables
+ // at the end of the build process)
+ private boolean frenchSec = false;
+ private boolean seAsianSwapping = false;
+
+ private UCompactIntArray mapping = null;
+ private Vector contractTable = null;
+ private Vector expandTable = null;
+
+ private short maxSecOrder = 0;
+ private short maxTerOrder = 0;
+}