8165804: Revisit the way of loading BreakIterator rules/dictionaries
Reviewed-by: naoto, peytoia, erikj
--- a/jdk/make/gendata/GendataBreakIterator.gmk Mon Oct 24 21:44:33 2016 -0700
+++ b/jdk/make/gendata/GendataBreakIterator.gmk Tue Oct 25 15:43:19 2016 +0900
@@ -55,7 +55,6 @@
$(eval $(call SetupJavaCompilation,BUILD_BREAKITERATOR_LD, \
SETUP := GENERATE_OLDBYTECODE, \
SRC := $(JDK_TOPDIR)/src/jdk.localedata/share/classes, \
- INCLUDES := $(TEXT_PKG_LD), \
INCLUDE_FILES := \
$(TEXT_PKG_LD)/BreakIteratorRules_th.java \
$(TEXT_PKG_LD)/BreakIteratorInfo_th.java, \
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/text/BreakDictionary.java Tue Oct 25 15:43:19 2016 +0900
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ *
+ * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
+ * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
+ *
+ * The original version of this source code and documentation
+ * is copyrighted and owned by Taligent, Inc., a wholly-owned
+ * subsidiary of IBM. These materials are provided under terms
+ * of a License Agreement between Taligent and Sun. This technology
+ * is protected by multiple US and International patents.
+ *
+ * This notice and attribution to Taligent may not be removed.
+ * Taligent is a registered trademark of Taligent, Inc.
+ */
+package sun.text;
+
+import java.nio.BufferUnderflowException;
+import java.nio.ByteBuffer;
+import java.util.MissingResourceException;
+import sun.text.CompactByteArray;
+import sun.text.SupplementaryCharacterData;
+
+/**
+ * This is the class that represents the list of known words used by
+ * DictionaryBasedBreakIterator. The conceptual data structure used
+ * here is a trie: there is a node hanging off the root node for every
+ * letter that can start a word. Each of these nodes has a node hanging
+ * off of it for every letter that can be the second letter of a word
+ * if this node is the first letter, and so on. The trie is represented
+ * as a two-dimensional array that can be treated as a table of state
+ * transitions. Indexes are used to compress this array, taking
+ * advantage of the fact that this array will always be very sparse.
+ */
+class BreakDictionary {
+
+ //=========================================================================
+ // data members
+ //=========================================================================
+
+ /**
+ * The version of the dictionary that was read in.
+ */
+ private static int supportedVersion = 1;
+
+ /**
+ * Map characters to column numbers (columnMap for BMP characters, the other
+ * map for supplementary ones), so the array needs no room for empty columns.
+ */
+ private CompactByteArray columnMap = null;
+ private SupplementaryCharacterData supplementaryCharColumnMap = null;
+
+ /**
+ * The number of actual columns in the table
+ */
+ private int numCols;
+
+ /**
+ * Columns are organized into groups of 32. This says how many
+ * column groups. (We could calculate this, but we store the
+ * value to avoid having to repeatedly calculate it.)
+ */
+ private int numColGroups;
+
+ /**
+ * The actual compressed state table. Each conceptual row represents
+ * a state, and the cells in it contain the row numbers of the states
+ * to transition to for each possible letter. 0 is used to indicate
+ * an illegal combination of letters (i.e., the error state). The
+ * table is compressed by eliminating all the unpopulated (i.e., zero)
+ * cells. Multiple conceptual rows can then be doubled up in a single
+ * physical row by sliding them up and possibly shifting them to one
+ * side or the other so the populated cells don't collide. Indexes
+ * are used to identify unpopulated cells and to locate populated cells.
+ */
+ private short[] table = null;
+
+ /**
+ * This index maps logical row numbers to physical row numbers
+ */
+ private short[] rowIndex = null;
+
+ /**
+ * A bitmap is used to tell which cells in the conceptual table are
+ * populated. This array contains all the unique bit combinations
+ * in that bitmap. If the table is more than 32 columns wide,
+ * successive entries in this array are used for a single row.
+ */
+ private int[] rowIndexFlags = null;
+
+ /**
+ * This index maps from a logical row number into the bitmap table above.
+ * (This keeps us from storing duplicate bitmap combinations.) Since there
+ * are a lot of rows with only one populated cell, instead of wasting space
+ * in the bitmap table, we just store a negative number in this index for
+ * rows with one populated cell. The absolute value of that number is
+ * the column number of the populated cell.
+ */
+ private short[] rowIndexFlagsIndex = null;
+
+ /**
+ * For each logical row, this index contains a constant that is added to
+ * the logical column number to get the physical column number
+ */
+ private byte[] rowIndexShifts = null;
+
+ //=========================================================================
+ // deserialization
+ //=========================================================================
+
+ // Truncated data or a negative length field surfaces as MissingResourceException.
+ BreakDictionary(String dictionaryName, byte[] dictionaryData) {
+ try {
+ setupDictionary(dictionaryName, dictionaryData);
+ } catch (BufferUnderflowException | NegativeArraySizeException ex) {
+ MissingResourceException e = new MissingResourceException(
+ "Corrupted dictionary data", dictionaryName, "");
+ e.initCause(ex);
+ throw e;
+ }
+ }
+
+ private void setupDictionary(String dictionaryName, byte[] dictionaryData) {
+ ByteBuffer bb = ByteBuffer.wrap(dictionaryData);
+
+ // the data is read sequentially, starting with the format version
+ int version = bb.getInt();
+ if (version != supportedVersion) {
+ throw new MissingResourceException("Dictionary version(" + version + ") is unsupported",
+ dictionaryName, "");
+ }
+
+ // Check data size: the length field must account for every remaining byte
+ int len = bb.getInt();
+ if (bb.position() + len != bb.limit()) {
+ throw new MissingResourceException("Dictionary size is wrong: " + bb.limit(),
+ dictionaryName, "");
+ }
+
+ // read in the column map for BMP characters (this is serialized in
+ // its internal form: an index array followed by a data array)
+ len = bb.getInt();
+ short[] temp = new short[len];
+ for (int i = 0; i < len; i++) {
+ temp[i] = bb.getShort();
+ }
+ len = bb.getInt();
+ byte[] temp2 = new byte[len];
+ bb.get(temp2);
+ columnMap = new CompactByteArray(temp, temp2);
+
+ // read in numCols and numColGroups
+ numCols = bb.getInt();
+ numColGroups = bb.getInt();
+
+ // read in the row-number index
+ len = bb.getInt();
+ rowIndex = new short[len];
+ for (int i = 0; i < len; i++) {
+ rowIndex[i] = bb.getShort();
+ }
+
+ // load in the populated-cells bitmap: index first, then bitmap list
+ len = bb.getInt();
+ rowIndexFlagsIndex = new short[len];
+ for (int i = 0; i < len; i++) {
+ rowIndexFlagsIndex[i] = bb.getShort();
+ }
+ len = bb.getInt();
+ rowIndexFlags = new int[len];
+ for (int i = 0; i < len; i++) {
+ rowIndexFlags[i] = bb.getInt();
+ }
+
+ // load in the row-shift index
+ len = bb.getInt();
+ rowIndexShifts = new byte[len];
+ bb.get(rowIndexShifts);
+
+ // load in the actual state table
+ len = bb.getInt();
+ table = new short[len];
+ for (int i = 0; i < len; i++) {
+ table[i] = bb.getShort();
+ }
+
+ // finally, read the raw data for the supplementary-character column map
+ len = bb.getInt();
+ int[] temp3 = new int[len];
+ for (int i = 0; i < len; i++) {
+ temp3[i] = bb.getInt();
+ }
+ assert bb.position() == bb.limit(); // every byte must have been consumed
+
+ supplementaryCharColumnMap = new SupplementaryCharacterData(temp3);
+ }
+
+ //=========================================================================
+ // access to the words
+ //=========================================================================
+
+ /**
+ * Translates a character into its column number through the appropriate
+ * column map and then delegates to getNextState().
+ * @param row The current state
+ * @param ch The character (code point) whose column we're interested in
+ * @return The state that getNextState() reports for that row and column
+ */
+ public final short getNextStateFromCharacter(int row, int ch) {
+ // supplementary code points have a column map of their own
+ if (ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+ return getNextState(row, supplementaryCharColumnMap.getValue(ch));
+ }
+
+ // everything below that is looked up as a char in the BMP column map
+ return getNextState(row, columnMap.elementAt((char)ch));
+ }
+
+ /**
+ * Returns the value in the cell with the specified (logical) row and
+ * column numbers. In DictionaryBasedBreakIterator, the row number is
+ * a state number, the column number is an input, and the return value
+ * is the row number of the new state to transition to. (0 is the
+ * "error" state, and -1 is the "end of word" state in a dictionary)
+ * @param row The row number of the current state
+ * @param col The column number of the input character (0 means "not a
+ * dictionary character")
+ * @return The row number of the new state to transition to
+ */
+ public final short getNextState(int row, int col) {
+ if (cellIsPopulated(row, col)) {
+ // we map from logical to physical row number by looking up the
+ // mapping in rowIndex; we map from logical column number to
+ // physical column number by looking up a shift value for this
+ // logical row and offsetting the logical column number by
+ // the shift amount. Then we can use internalAt() to actually
+ // get the value out of the table.
+ return internalAt(rowIndex[row], col + rowIndexShifts[row]);
+ }
+ else {
+ return 0;
+ }
+ }
+
+ /**
+ * Tells whether the cell at the given (logical) row and column holds
+ * a value
+ */
+ private boolean cellIsPopulated(int row, int col) {
+ short bitmapIndex = rowIndexFlagsIndex[row];
+
+ // A negative entry is not an offset into the bitmap list at all:
+ // its absolute value is the column number of the only populated
+ // cell in the row.
+ if (bitmapIndex < 0) {
+ return col == -bitmapIndex;
+ }
+
+ // Otherwise the entry is the offset of the row's first word in the
+ // bitmap list. Every word covers 32 columns, so a table wider than
+ // that keeps a row's bitmap in successive words of the list:
+ // col >> 5 selects the word and col & 0x1f selects the bit within
+ // that word.
+ int word = rowIndexFlags[bitmapIndex + (col >> 5)];
+ int mask = 1 << (col & 0x1f);
+ return (word & mask) != 0;
+ }
+
+ /**
+ * Reads a cell straight out of the physical table. getNextState() uses
+ * this once it knows the cell is populated.
+ * @param row The PHYSICAL row number of the cell
+ * @param col The PHYSICAL column number of the cell
+ * @return The value stored in the cell
+ */
+ private short internalAt(int row, int col) {
+ // the table is one flat array holding numCols cells per physical row,
+ // so the position of a cell is worked out by hand
+ int position = row * numCols + col;
+ return table[position];
+ }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/text/DictionaryBasedBreakIterator.java Tue Oct 25 15:43:19 2016 +0900
@@ -0,0 +1,526 @@
+/*
+ * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ *
+ * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
+ * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
+ *
+ * The original version of this source code and documentation
+ * is copyrighted and owned by Taligent, Inc., a wholly-owned
+ * subsidiary of IBM. These materials are provided under terms
+ * of a License Agreement between Taligent and Sun. This technology
+ * is protected by multiple US and International patents.
+ *
+ * This notice and attribution to Taligent may not be removed.
+ * Taligent is a registered trademark of Taligent, Inc.
+ */
+
+package sun.text;
+
+import java.text.CharacterIterator;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Stack;
+
+/**
+ * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
+ * to further subdivide ranges of text beyond what is possible using just the
+ * state-table-based algorithm. This is necessary, for example, to handle
+ * word and line breaking in Thai, which doesn't use spaces between words. The
+ * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
+ * up text as far as possible, and then contiguous ranges of letters are
+ * repeatedly compared against a list of known words (i.e., the dictionary)
+ * to divide them up into words.
+ *
+ * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
+ * but adds one more special substitution name: {@code <dictionary>}. This substitution
+ * name is used to identify characters in words in the dictionary. The idea is that
+ * if the iterator passes over a chunk of text that includes two or more characters
+ * in a row that are included in {@code <dictionary>}, it goes back through that range
+ * and derives additional break positions (if possible) using the dictionary.
+ *
+ * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
+ * file. It follows a prescribed search path to locate the dictionary (right now,
+ * it looks for it in /com/ibm/text/resources in each directory in the classpath,
+ * and won't find it in JAR files, but this location is likely to change). The
+ * dictionary file is in a serialized binary format. We have a very primitive (and
+ * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
+ * currently making it public. Contact us for help.
+ */
+public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
+
+ /**
+ * a list of known words that is used to divide up contiguous ranges of letters,
+ * stored in a compressed, indexed, format that offers fast access
+ */
+ private BreakDictionary dictionary;
+
+ /**
+ * a list of flags, indexed by category number, indicating which character
+ * categories are contained in the dictionary file (this is used to determine
+ * which ranges of characters to apply the dictionary to)
+ */
+ private boolean[] categoryFlags;
+
+ /**
+ * a temporary hiding place for the number of dictionary characters in the
+ * last range passed over by next()
+ */
+ private int dictionaryCharCount;
+
+ /**
+ * when a range of characters is divided up using the dictionary, the break
+ * positions that are discovered are stored here, preventing us from having
+ * to use either the dictionary or the state table again until the iterator
+ * leaves this range of text (null whenever no range is cached)
+ */
+ private int[] cachedBreakPositions;
+
+ /**
+ * if cachedBreakPositions is not null, this indicates which item in the
+ * cache the current iteration position refers to
+ */
+ private int positionInCache;
+
+ /**
+ * Constructs a DictionaryBasedBreakIterator.
+ *
+ * @param ruleFile the name of the rule data file
+ * @param ruleData the rule data loaded from the rule data file
+ * @param dictionaryFile the name of the dictionary file
+ * @param dictionaryData the dictionary data loaded from the dictionary file
+ * @throws MissingResourceException if rule data or dictionary initialization failed
+ */
+ public DictionaryBasedBreakIterator(String ruleFile, byte[] ruleData,
+ String dictionaryFile, byte[] dictionaryData) {
+ super(ruleFile, ruleData);
+ byte[] tmp = super.getAdditionalData();
+ if (tmp != null) {
+ prepareCategoryFlags(tmp); // turn the raw bytes into categoryFlags
+ super.setAdditionalData(null); // presumably releases the raw bytes -- confirm in superclass
+ }
+ dictionary = new BreakDictionary(dictionaryFile, dictionaryData);
+ }
+
+ private void prepareCategoryFlags(byte[] data) { // data[i] is the flag byte for category i
+ categoryFlags = new boolean[data.length];
+ for (int i = 0; i < data.length; i++) {
+ categoryFlags[i] = (data[i] == (byte)1); // only a byte of exactly 1 sets the flag
+ }
+ }
+
+ @Override
+ public void setText(CharacterIterator newText) {
+ super.setText(newText);
+ // nothing cached for the previous text applies to the new one
+ cachedBreakPositions = null;
+ positionInCache = dictionaryCharCount = 0;
+ }
+
+ /**
+ * Drops any cached break positions, then lets the inherited first()
+ * move the iteration position to the beginning of the text.
+ * @return The offset reported by the inherited first()
+ */
+ @Override
+ public int first() {
+ // the cache and its counters describe the range being left behind
+ positionInCache = dictionaryCharCount = 0;
+ cachedBreakPositions = null;
+ return super.first();
+ }
+
+ /**
+ * Drops any cached break positions, then lets the inherited last()
+ * move the iteration position to the end of the text.
+ * @return The offset reported by the inherited last()
+ */
+ @Override
+ public int last() {
+ // the cache and its counters describe the range being left behind
+ positionInCache = dictionaryCharCount = 0;
+ cachedBreakPositions = null;
+ return super.last();
+ }
+
+ /**
+ * Advances the iterator one step backwards.
+ * @return The position of the last boundary position before the
+ * current iteration position
+ */
+ @Override
+ public int previous() {
+ CharacterIterator text = getText();
+
+ // if we have cached break positions and we're still in the range
+ // covered by them, just move one step backward in the cache
+ if (cachedBreakPositions != null && positionInCache > 0) {
+ --positionInCache;
+ text.setIndex(cachedBreakPositions[positionInCache]);
+ return cachedBreakPositions[positionInCache];
+ }
+
+ // otherwise, dump the cache and use the inherited previous() method to move
+ // backward. This may fill up the cache with new break positions, in which
+ // case we have to mark our position in the cache
+ else {
+ cachedBreakPositions = null;
+ int result = super.previous();
+ if (cachedBreakPositions != null) {
+ positionInCache = cachedBreakPositions.length - 2;
+ }
+ return result;
+ }
+ }
+
+ /**
+ * Sets the current iteration position to the last boundary position
+ * before the specified position.
+ * @param offset The position to begin searching from
+ * @return The position of the last boundary before "offset"
+ */
+ @Override
+ public int preceding(int offset) {
+ CharacterIterator text = getText();
+ checkOffset(offset, text);
+
+ // if we have no cached break positions, or "offset" is outside the
+ // range covered by the cache, we can just call the inherited routine
+ // (which will eventually call other routines in this class that may
+ // refresh the cache)
+ if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||
+ offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
+ cachedBreakPositions = null;
+ return super.preceding(offset);
+ }
+
+ // on the other hand, if "offset" is within the range covered by the cache,
+ // then all we have to do is search the cache for the last break position
+ // before "offset"
+ else {
+ positionInCache = 0;
+ while (positionInCache < cachedBreakPositions.length
+ && offset > cachedBreakPositions[positionInCache]) {
+ ++positionInCache;
+ }
+ --positionInCache;
+ text.setIndex(cachedBreakPositions[positionInCache]);
+ return text.getIndex();
+ }
+ }
+
+ /**
+ * Sets the current iteration position to the first boundary position after
+ * the specified position.
+ * @param offset The position to begin searching forward from
+ * @return The position of the first boundary after "offset"
+ */
+ @Override
+ public int following(int offset) {
+ CharacterIterator text = getText();
+ checkOffset(offset, text);
+
+ // if we have no cached break positions, or if "offset" is outside the
+ // range covered by the cache, then dump the cache and call our
+ // inherited following() method. This will call other methods in this
+ // class that may refresh the cache.
+ if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||
+ offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
+ cachedBreakPositions = null;
+ return super.following(offset);
+ }
+
+ // on the other hand, if "offset" is within the range covered by the
+ // cache, then just search the cache for the first break position
+ // after "offset"
+ else {
+ positionInCache = 0;
+ while (positionInCache < cachedBreakPositions.length
+ && offset >= cachedBreakPositions[positionInCache]) {
+ ++positionInCache;
+ }
+ text.setIndex(cachedBreakPositions[positionInCache]);
+ return text.getIndex();
+ }
+ }
+
+ /**
+ * This is the implementation function for next(); it answers from the cache when it can.
+ */
+ @Override
+ protected int handleNext() {
+ CharacterIterator text = getText();
+
+ // if there are no cached break positions, or if we've just moved
+ // off the end of the range covered by the cache, we have to dump
+ // and possibly regenerate the cache
+ if (cachedBreakPositions == null ||
+ positionInCache == cachedBreakPositions.length - 1) {
+
+ // start by using the inherited handleNext() to find a tentative return
+ // value. dictionaryCharCount (bumped by our lookupCategory() override)
+ // tells us how many dictionary characters we passed over on the way
+ int startPos = text.getIndex();
+ dictionaryCharCount = 0;
+ int result = super.handleNext();
+
+ // if we passed over more than one dictionary character, then we use
+ // divideUpDictionaryRange() to regenerate the cached break positions
+ // for the new range
+ if (dictionaryCharCount > 1 && result - startPos > 1) {
+ divideUpDictionaryRange(startPos, result);
+ }
+
+ // otherwise, the value we got back from the inherited function
+ // is our return value, and we can dump the cache
+ else {
+ cachedBreakPositions = null;
+ return result;
+ }
+ }
+
+ // if the cache of break positions has been regenerated (or existed all
+ // along), then just advance to the next break position in the cache
+ // and return it
+ if (cachedBreakPositions != null) {
+ ++positionInCache;
+ text.setIndex(cachedBreakPositions[positionInCache]);
+ return cachedBreakPositions[positionInCache];
+ }
+ return -9999; // SHOULD NEVER GET HERE: divideUpDictionaryRange() assigns the cache
+ }
+
+ /**
+ * Finds the category of a character, counting dictionary characters on the way.
+ */
+ @Override
+ protected int lookupCategory(int c) {
+ // The inherited lookupCategory() does the real work. The only reason
+ // for overriding it is to keep track of dictionary characters: when
+ // the category it reports is one of those flagged in categoryFlags,
+ // the dictionary-character count is bumped.
+ int category = super.lookupCategory(c);
+ boolean inDictionary = category != RuleBasedBreakIterator.IGNORE && categoryFlags[category];
+ if (inDictionary) {
+ ++dictionaryCharCount;
+ }
+ return category;
+ }
+
+ /**
+ * This is the function that actually implements the dictionary-based
+ * algorithm. Given the endpoints of a range of text, it uses the
+ * dictionary to determine the positions of any boundaries in this
+ * range and stores them in cachedBreakPositions, so the work is done
+ * only once each time we enter the range.
+ * @param startPos offset where the range begins (becomes cache entry 0)
+ * @param endPos offset where the range ends (becomes the last cache entry)
+ */
+ private void divideUpDictionaryRange(int startPos, int endPos) {
+ CharacterIterator text = getText();
+
+ // the range we're dividing may begin or end with non-dictionary characters
+ // (i.e., for line breaking, we may have leading or trailing punctuation
+ // that needs to be kept with the word). Seek from the beginning of the
+ // range to the first dictionary character
+ text.setIndex(startPos);
+ int c = getCurrent();
+ int category = lookupCategory(c);
+ while (category == IGNORE || !categoryFlags[category]) {
+ c = getNext();
+ category = lookupCategory(c);
+ }
+
+ // initialize. We maintain two stacks: currentBreakPositions contains
+ // the list of break positions that will be returned if we successfully
+ // finish traversing the whole range now. possibleBreakPositions lists
+ // all other possible word ends we've passed along the way. (Whenever
+ // we reach an error [a sequence of characters that can't begin any word
+ // in the dictionary], we back up, possibly delete some breaks from
+ // currentBreakPositions, move a break from possibleBreakPositions
+ // to currentBreakPositions, and start over from there. This process
+ // continues in this way until we either successfully make it all the way
+ // across the range, or exhaust all of our combinations of break
+ // positions.)
+ Stack<Integer> currentBreakPositions = new Stack<>();
+ Stack<Integer> possibleBreakPositions = new Stack<>();
+ List<Integer> wrongBreakPositions = new ArrayList<>();
+
+ // the dictionary is implemented as a trie, which is treated as a state
+ // machine. -1 represents the end of a legal word. Every word in the
+ // dictionary is represented by a path from the root node to -1. A path
+ // that ends in state 0 is an illegal combination of characters.
+ int state = 0;
+
+ // these two variables are used for error handling. We keep track of the
+ // farthest we've gotten through the range being divided, and the combination
+ // of breaks that got us that far. If we use up all possible break
+ // combinations, the text contains an error or a word that's not in the
+ // dictionary. In this case, we "bless" the break positions that got us the
+ // farthest as real break positions, and then start over from scratch with
+ // the character where the error occurred.
+ int farthestEndPoint = text.getIndex();
+ Stack<Integer> bestBreakPositions = null;
+
+ // initialize (we always exit the loop with a break statement)
+ c = getCurrent();
+ while (true) {
+
+ // if we can transition to state "-1" from our current state, we're
+ // on the last character of a legal word. Push that position onto
+ // the possible-break-positions stack
+ if (dictionary.getNextState(state, 0) == -1) {
+ possibleBreakPositions.push(text.getIndex());
+ }
+
+ // look up the new state to transition to in the dictionary
+ state = dictionary.getNextStateFromCharacter(state, c);
+
+ // if the character we're sitting on causes us to transition to
+ // the "end of word" state, then it was a non-dictionary character
+ // and we've successfully traversed the whole range. Drop out
+ // of the loop.
+ if (state == -1) {
+ currentBreakPositions.push(text.getIndex());
+ break;
+ }
+
+ // if the character we're sitting on causes us to transition to
+ // the error state, or if we've gone off the end of the range
+ // without transitioning to the "end of word" state, we've hit
+ // an error...
+ else if (state == 0 || text.getIndex() >= endPos) {
+
+ // if this is the farthest we've gotten, take note of it in
+ // case there's an error in the text
+ if (text.getIndex() > farthestEndPoint) {
+ farthestEndPoint = text.getIndex();
+
+ @SuppressWarnings("unchecked")
+ Stack<Integer> currentBreakPositionsCopy = (Stack<Integer>) currentBreakPositions.clone();
+
+ bestBreakPositions = currentBreakPositionsCopy;
+ }
+
+ // wrongBreakPositions is a list of all break positions
+ // we've tried starting that didn't allow us to traverse
+ // all the way through the text. Every time we pop a
+ // break position off of currentBreakPositions, we put it
+ // into wrongBreakPositions to avoid trying it again later.
+ // If we make it to this spot, we're either going to back
+ // up to a break in possibleBreakPositions and try starting
+ // over from there, or we've exhausted all possible break
+ // positions and are going to do the fallback procedure.
+ // This loop prevents us from messing with anything in
+ // possibleBreakPositions that didn't work as a starting
+ // point the last time we tried it (this is to prevent a bunch of
+ // repetitive checks from slowing down some extreme cases)
+ while (!possibleBreakPositions.isEmpty()
+ && wrongBreakPositions.contains(possibleBreakPositions.peek())) {
+ possibleBreakPositions.pop();
+ }
+
+ // if we've used up all possible break-position combinations, there's
+ // an error or an unknown word in the text. In this case, we start
+ // over, treating the farthest character we've reached as the beginning
+ // of the range, and "blessing" the break positions that got us that
+ // far as real break positions
+ if (possibleBreakPositions.isEmpty()) {
+ if (bestBreakPositions != null) {
+ currentBreakPositions = bestBreakPositions;
+ if (farthestEndPoint < endPos) {
+ text.setIndex(farthestEndPoint + 1);
+ }
+ else {
+ break;
+ }
+ }
+ else {
+ if ((currentBreakPositions.isEmpty() ||
+ currentBreakPositions.peek().intValue() != text.getIndex())
+ && text.getIndex() != startPos) {
+ currentBreakPositions.push(text.getIndex());
+ }
+ getNext();
+ currentBreakPositions.push(text.getIndex());
+ }
+ }
+
+ // if we still have more break positions we can try, then promote the
+ // last break in possibleBreakPositions into currentBreakPositions,
+ // and get rid of all entries in currentBreakPositions that come after
+ // it. Then back up to that position and start over from there (i.e.,
+ // treat that position as the beginning of a new word)
+ else {
+ Integer temp = possibleBreakPositions.pop();
+ Integer temp2 = null;
+ while (!currentBreakPositions.isEmpty() && temp.intValue() <
+ currentBreakPositions.peek().intValue()) {
+ temp2 = currentBreakPositions.pop();
+ wrongBreakPositions.add(temp2);
+ }
+ currentBreakPositions.push(temp);
+ text.setIndex(currentBreakPositions.peek().intValue());
+ }
+
+ // re-sync "c" for the next go-round, and drop out of the loop if
+ // we've made it off the end of the range
+ c = getCurrent();
+ if (text.getIndex() >= endPos) {
+ break;
+ }
+ }
+
+ // if we didn't hit any exceptional conditions on this last iteration,
+ // just advance to the next character and loop
+ else {
+ c = getNext();
+ }
+ }
+
+ // dump the last break position in the list, and replace it with the actual
+ // end of the range (which may be the same character, or may be further on
+ // because the range actually ended with non-dictionary characters we want to
+ // keep with the word)
+ if (!currentBreakPositions.isEmpty()) {
+ currentBreakPositions.pop();
+ }
+ currentBreakPositions.push(endPos);
+
+ // create a regular array to hold the break positions and copy
+ // the break positions from the stack to the array (in addition,
+ // our starting position goes into this array as a break position).
+ // This array becomes the cache of break positions used by next()
+ // and previous(), so this is where we actually refresh the cache.
+ cachedBreakPositions = new int[currentBreakPositions.size() + 1];
+ cachedBreakPositions[0] = startPos;
+
+ for (int i = 0; i < currentBreakPositions.size(); i++) {
+ cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue();
+ }
+ positionInCache = 0;
+ }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/text/RuleBasedBreakIterator.java Tue Oct 25 15:43:19 2016 +0900
@@ -0,0 +1,1144 @@
+/*
+ * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ *
+ * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
+ * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
+ *
+ * The original version of this source code and documentation
+ * is copyrighted and owned by Taligent, Inc., a wholly-owned
+ * subsidiary of IBM. These materials are provided under terms
+ * of a License Agreement between Taligent and Sun. This technology
+ * is protected by multiple US and International patents.
+ *
+ * This notice and attribution to Taligent may not be removed.
+ * Taligent is a registered trademark of Taligent, Inc.
+ */
+
+package sun.text;
+
+import java.nio.BufferUnderflowException;
+import java.nio.ByteBuffer;
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+import java.util.MissingResourceException;
+import sun.text.CompactByteArray;
+import sun.text.SupplementaryCharacterData;
+
+/**
+ * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
+ *
+ * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
+ * and <i>regular expressions.</i></p>
+ *
+ * <p>A substitution rule defines a name that can be used in place of an expression. It
+ * consists of a name, which is a string of characters contained in angle brackets, an equals
+ * sign, and an expression. (There can be no whitespace on either side of the equals sign.)
+ * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
+ * square brackets. A substitution is visible after its definition, and is filled in using
+ * simple textual substitution. Substitution definitions can contain other substitutions, as
+ * long as those substitutions have been defined first. Substitutions are generally used to
+ * make the regular expressions (which can get quite complex) shorter and easier to read.
+ * They typically define either character categories or commonly-used subexpressions.</p>
+ *
+ * <p>There is one special substitution. If the description defines a substitution
+ * called "&lt;ignore&gt;", the expression must be a [] expression, and the
+ * expression defines a set of characters (the "<em>ignore characters</em>") that
+ * will be transparent to the BreakIterator. A sequence of characters will break the
+ * same way it would if any ignore characters it contains are taken out. Break
+ * positions never occur before ignore characters.</p>
+ *
+ * <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
+ * defines a sequence of characters to be kept together. With one significant exception, the
+ * iterator uses a longest-possible-match algorithm when matching text to regular
+ * expressions. The iterator also treats descriptions containing multiple regular expressions
+ * as if they were ORed together (i.e., as if they were separated by |).</p>
+ *
+ * <p>The special characters recognized by the regular-expression parser are as follows:</p>
+ *
+ * <blockquote>
+ * <table border="1" width="100%">
+ * <tr>
+ * <td width="6%">*</td>
+ * <td width="94%">Specifies that the expression preceding the asterisk may occur any number
+ * of times (including not at all).</td>
+ * </tr>
+ * <tr>
+ * <td width="6%">{}</td>
+ * <td width="94%">Encloses a sequence of characters that is optional.</td>
+ * </tr>
+ * <tr>
+ * <td width="6%">()</td>
+ * <td width="94%">Encloses a sequence of characters. If followed by *, the sequence
+ * repeats. Otherwise, the parentheses are just a grouping device and a way to delimit
+ * the ends of expressions containing |.</td>
+ * </tr>
+ * <tr>
+ * <td width="6%">|</td>
+ * <td width="94%">Separates two alternative sequences of characters. Either one
+ * sequence or the other, but not both, matches this expression. The | character can
+ * only occur inside ().</td>
+ * </tr>
+ * <tr>
+ * <td width="6%">.</td>
+ * <td width="94%">Matches any character.</td>
+ * </tr>
+ * <tr>
+ * <td width="6%">*?</td>
+ * <td width="94%">Specifies a non-greedy asterisk. *? works the same way as *, except
+ * when there is overlap between the last group of characters in the expression preceding the
+ * * and the first group of characters following the *. When there is this kind of
+ * overlap, * will match the longest sequence of characters that match the expression before
+ * the *, and *? will match the shortest sequence of characters matching the expression
+ * before the *?. For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text,
+ * "x[xy]*x" will match through to the last x (i.e., "<strong>xxyxyyyxyxyxxyxyx</strong>yy"),
+ * but "x[xy]*?x" will only match the first two xes ("<strong>xx</strong>yxyyyxyxyxxyxyxyy").</td>
+ * </tr>
+ * <tr>
+ * <td width="6%">[]</td>
+ * <td width="94%">Specifies a group of alternative characters. A [] expression will
+ * match any single character that is specified in the [] expression. For more on the
+ * syntax of [] expressions, see below.</td>
+ * </tr>
+ * <tr>
+ * <td width="6%">/</td>
+ * <td width="94%">Specifies where the break position should go if text matches this
+ * expression. (e.g., "[a-z]*&#47;[:Zs:]*[0-9]" will match if the iterator sees a run
+ * of letters, followed by a run of whitespace, followed by a digit, but the break position
+ * will actually go before the whitespace). Expressions that don't contain / put the
+ * break position at the end of the matching text.</td>
+ * </tr>
+ * <tr>
+ * <td width="6%">\</td>
+ * <td width="94%">Escape character. The \ itself is ignored, but causes the next
+ * character to be treated as literal character. This has no effect for many
+ * characters, but for the characters listed above, this deprives them of their special
+ * meaning. (There are no special escape sequences for Unicode characters, or tabs and
+ * newlines; these are all handled by a higher-level protocol. In a Java string,
+ * "\n" will be converted to a literal newline character by the time the
+ * regular-expression parser sees it. Of course, this means that \ sequences that are
+ * visible to the regexp parser must be written as \\ when inside a Java string.) All
+ * characters in the ASCII range except for letters, digits, and control characters are
+ * reserved characters to the parser and must be preceded by \ even if they currently don't
+ * mean anything.</td>
+ * </tr>
+ * <tr>
+ * <td width="6%">!</td>
+ * <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
+ * parser that this expression specifies the backwards-iteration behavior of the iterator,
+ * and not its normal iteration behavior. This is generally only used in situations
+ * where the automatically-generated backwards-iteration behavior doesn't produce
+ * satisfactory results and must be supplemented with extra client-specified rules.</td>
+ * </tr>
+ * <tr>
+ * <td width="6%"><em>(all others)</em></td>
+ * <td width="94%">All other characters are treated as literal characters, which must match
+ * the corresponding character(s) in the text exactly.</td>
+ * </tr>
+ * </table>
+ * </blockquote>
+ *
+ * <p>Within a [] expression, a number of other special characters can be used to specify
+ * groups of characters:</p>
+ *
+ * <blockquote>
+ * <table border="1" width="100%">
+ * <tr>
+ * <td width="6%">-</td>
+ * <td width="94%">Specifies a range of matching characters. For example
+ * "[a-p]" matches all lowercase Latin letters from a to p (inclusive). The -
+ * sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
+ * language's alphabetical order: "[a-z]" doesn't include capital letters, nor does
+ * it include accented letters such as a-umlaut.</td>
+ * </tr>
+ * <tr>
+ * <td width="6%">::</td>
+ * <td width="94%">A pair of colons containing a one- or two-letter code matches all
+ * characters in the corresponding Unicode category. The two-letter codes are the same
+ * as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]"
+ * matches all currency symbols and all math symbols). Specifying a one-letter code is
+ * the same as specifying all two-letter codes that begin with that letter (for example,
+ * "[:L:]" matches all letters, and is equivalent to
+ * "[:Lu::Ll::Lo::Lm::Lt:]"). Anything other than a valid two-letter Unicode
+ * category code or a single letter that begins a Unicode category code is illegal within
+ * colons.</td>
+ * </tr>
+ * <tr>
+ * <td width="6%">[]</td>
+ * <td width="94%">[] expressions can nest. This has no effect, except when used in
+ * conjunction with the ^ token.</td>
+ * </tr>
+ * <tr>
+ * <td width="6%">^</td>
+ * <td width="94%">Excludes the character (or the characters in the [] expression) following
+ * it from the group of characters. For example, "[a-z^p]" matches all Latin
+ * lowercase letters except p. "[:L:^[\u4e00-\u9fff]]" matches all letters
+ * except the Han ideographs.</td>
+ * </tr>
+ * <tr>
+ * <td width="6%"><em>(all others)</em></td>
+ * <td width="94%">All other characters are treated as literal characters. (For
+ * example, "[aeiou]" specifies just the letters a, e, i, o, and u.)</td>
+ * </tr>
+ * </table>
+ * </blockquote>
+ *
+ * <p>For a more complete explanation, see <a
+ * href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>.
+ * For examples, see the resource data (which is annotated).</p>
+ *
+ * @author Richard Gillam
+ */
+public class RuleBasedBreakIterator extends BreakIterator {
+
+ /**
+ * A token used as a character-category value to identify ignore characters
+ */
+ protected static final byte IGNORE = -1;
+
+ /**
+ * The state number of the starting state
+ */
+ private static final short START_STATE = 1;
+
+ /**
+ * The state-transition value indicating "stop"
+ */
+ private static final short STOP_STATE = 0;
+
+ /**
+ * Magic number for the BreakIterator data file format: the bytes "BIdata" plus a NUL.
+ */
+ static final byte[] LABEL = {
+ (byte)'B', (byte)'I', (byte)'d', (byte)'a', (byte)'t', (byte)'a',
+ (byte)'\0'
+ };
+ static final int LABEL_LENGTH = LABEL.length;
+
+ /**
+ * The only rule data format version accepted by validateRuleData().
+ */
+ static final byte supportedVersion = 1;
+
+ /**
+ * The length of the array of indices for BMP characters
+ */
+ private static final int BMP_INDICES_LENGTH = 512;
+
+ /**
+ * Tables that index from character values to character category numbers
+ */
+ private CompactByteArray charCategoryTable = null;
+ private SupplementaryCharacterData supplementaryCharCategoryTable = null;
+
+ /**
+ * The table of state transitions used for forward iteration
+ */
+ private short[] stateTable = null;
+
+ /**
+ * The table of state transitions used to sync up the iterator with the
+ * text in backwards and random-access iteration
+ */
+ private short[] backwardsStateTable = null;
+
+ /**
+ * A list of flags indicating which states in the state table are accepting
+ * ("end") states
+ */
+ private boolean[] endStates = null;
+
+ /**
+ * A list of flags indicating which states in the state table are
+ * lookahead states (states which turn lookahead on and off)
+ */
+ private boolean[] lookaheadStates = null;
+
+ /**
+ * A table for additional data. May be used by a subclass of
+ * RuleBasedBreakIterator.
+ */
+ private byte[] additionalData = null;
+
+ /**
+ * The number of character categories (and, thus, the number of columns in
+ * the state tables)
+ */
+ private int numCategories;
+
+ /**
+ * The character iterator through which this BreakIterator accesses the text
+ */
+ private CharacterIterator text = null;
+
+ /**
+ * The checksum stored in the rule data header (described as a CRC32 of the data body)
+ */
+ private long checksum;
+
+ //=======================================================================
+ // constructors
+ //=======================================================================
+
+ /**
+ * Constructs a RuleBasedBreakIterator from the binary rule data in ruleData.
+ * ruleFile is used only as the name reported in thrown MissingResourceExceptions.
+ * @throws MissingResourceException if the rule data is invalid or corrupted
+ */
+ public RuleBasedBreakIterator(String ruleFile, byte[] ruleData) {
+ ByteBuffer bb = ByteBuffer.wrap(ruleData);
+ try {
+ validateRuleData(ruleFile, bb); // consumes magic number, version and total length
+ setupTables(ruleFile, bb); // consumes the header and the body
+ } catch (BufferUnderflowException bue) { // the data ended before every field was read
+ MissingResourceException e;
+ e = new MissingResourceException("Corrupted rule data file", ruleFile, "");
+ e.initCause(bue);
+ throw e;
+ }
+ }
+
+ /**
+ * Initializes the fields with the given rule data.
+ * The data format is as follows:
+ * <pre>
+ * BreakIteratorData {
+ * u1 magic[7];
+ * u1 version;
+ * u4 totalDataSize;
+ * header_info header;
+ * body value;
+ * }
+ * </pre>
+ * <code>totalDataSize</code> is the summation of the size of
+ * <code>header_info</code> and <code>body</code> in byte count.
+ * <p>
+ * In <code>header</code>, each field except for checksum gives the
+ * length of one table in <code>body</code>. Since <code>BMPindices</code> is
+ * fixed-length data (512 entries), its length isn't included in <code>header</code>.
+ * <code>checksum</code> is a CRC32 value of all in <code>body</code>.
+ * <pre>
+ * header_info {
+ * u4 stateTableLength;
+ * u4 backwardsStateTableLength;
+ * u4 endStatesLength;
+ * u4 lookaheadStatesLength;
+ * u4 BMPdataLength;
+ * u4 nonBMPdataLength;
+ * u4 additionalDataLength;
+ * u8 checksum;
+ * }
+ * </pre>
+ * <p>
+ *
+ * Finally, <code>BMPindices</code> and <code>BMPdata</code> are set to
+ * <code>charCategoryTable</code>. <code>nonBMPdata</code> is set to
+ * <code>supplementaryCharCategoryTable</code>.
+ * <pre>
+ * body {
+ * u2 stateTable[stateTableLength];
+ * u2 backwardsStateTable[backwardsStateTableLength];
+ * u1 endStates[endStatesLength];
+ * u1 lookaheadStates[lookaheadStatesLength];
+ * u2 BMPindices[512];
+ * u1 BMPdata[BMPdataLength];
+ * u4 nonBMPdata[nonBMPdataLength];
+ * u1 additionalData[additionalDataLength];
+ * }
+ * </pre>
+ *
+ * @throws BufferUnderflowException if the end-of-data is reached before
+ * setting up all the tables
+ */
+ private void setupTables(String ruleFile, ByteBuffer bb) { // ruleFile is not used in this method
+ /* Read header_info. */
+ int stateTableLength = bb.getInt();
+ int backwardsStateTableLength = bb.getInt();
+ int endStatesLength = bb.getInt();
+ int lookaheadStatesLength = bb.getInt();
+ int BMPdataLength = bb.getInt();
+ int nonBMPdataLength = bb.getInt();
+ int additionalDataLength = bb.getInt();
+ checksum = bb.getLong(); // stored as read; it is not recomputed or verified here
+
+ /* Read stateTable[numCategories * numRows] */
+ stateTable = new short[stateTableLength];
+ for (int i = 0; i < stateTableLength; i++) {
+ stateTable[i] = bb.getShort();
+ }
+
+ /* Read backwardsStateTable[numCategories * numRows] */
+ backwardsStateTable = new short[backwardsStateTableLength];
+ for (int i = 0; i < backwardsStateTableLength; i++) {
+ backwardsStateTable[i] = bb.getShort();
+ }
+
+ /* Read endStates[numRows] */
+ endStates = new boolean[endStatesLength];
+ for (int i = 0; i < endStatesLength; i++) {
+ endStates[i] = bb.get() == 1; // only the byte value 1 means "true"
+ }
+
+ /* Read lookaheadStates[numRows] */
+ lookaheadStates = new boolean[lookaheadStatesLength];
+ for (int i = 0; i < lookaheadStatesLength; i++) {
+ lookaheadStates[i] = bb.get() == 1; // only the byte value 1 means "true"
+ }
+
+ /* Read a category table and indices for BMP characters. */
+ short[] temp1 = new short[BMP_INDICES_LENGTH]; // BMPindices
+ for (int i = 0; i < BMP_INDICES_LENGTH; i++) {
+ temp1[i] = bb.getShort();
+ }
+ byte[] temp2 = new byte[BMPdataLength]; // BMPdata
+ bb.get(temp2);
+ charCategoryTable = new CompactByteArray(temp1, temp2);
+
+ /* Read a category table for non-BMP characters. */
+ int[] temp3 = new int[nonBMPdataLength];
+ for (int i = 0; i < nonBMPdataLength; i++) {
+ temp3[i] = bb.getInt();
+ }
+ supplementaryCharCategoryTable = new SupplementaryCharacterData(temp3);
+
+ /* Read additional data */
+ if (additionalDataLength > 0) { // additionalData stays null when no additional data is present
+ additionalData = new byte[additionalDataLength];
+ bb.get(additionalData);
+ }
+ assert bb.position() == bb.limit(); // every byte should be consumed; checked only when assertions are enabled
+
+ /* Set numCategories */
+ numCategories = stateTable.length / endStates.length; // NOTE(review): ArithmeticException if endStatesLength is 0
+ }
+
+ /**
+ * Validates the magic number, version, and the length of the given data.
+ *
+ * @throws BufferUnderflowException if the end-of-data is reached while
+ * validating data
+ * @throws MissingResourceException if validation fails
+ */
+ void validateRuleData(String ruleFile, ByteBuffer bb) {
+ /* Verify the magic number. */
+ for (int i = 0; i < LABEL_LENGTH; i++) {
+ if (bb.get() != LABEL[i]) {
+ throw new MissingResourceException("Wrong magic number",
+ ruleFile, "");
+ }
+ }
+
+ /* Verify the version number. */
+ byte version = bb.get();
+ if (version != supportedVersion) {
+ throw new MissingResourceException("Unsupported version(" + version + ")",
+ ruleFile, "");
+ }
+
+ // Check the length of the rest of data
+ int len = bb.getInt();
+ if (bb.position() + len != bb.limit()) { // len must equal the number of bytes left after this field
+ throw new MissingResourceException("Wrong data length",
+ ruleFile, "");
+ }
+ }
+
+ byte[] getAdditionalData() { // may return null: the field is only set when additional data was read or stored
+ return additionalData; // the internal array itself, not a copy
+ }
+
+ void setAdditionalData(byte[] b) { // replaces whatever additional data was held before
+ additionalData = b; // stored by reference, no defensive copy
+ }
+
+ //=======================================================================
+ // boilerplate
+ //=======================================================================
+ /**
+ * Clones this iterator.
+ * @return A newly-constructed RuleBasedBreakIterator with the same
+ * behavior as this one.
+ */
+ @Override
+ public Object clone() {
+ RuleBasedBreakIterator result = (RuleBasedBreakIterator) super.clone();
+ if (text != null) {
+ result.text = (CharacterIterator) text.clone(); // the copy gets its own text iterator object
+ }
+ return result; // no other field is copied explicitly in this method
+ }
+
+ /**
+ * Returns true if "that" can be cast to RuleBasedBreakIterator, has the same
+ * rule data checksum, and iterates over equal text (or both have no text).
+ */
+ @Override
+ public boolean equals(Object that) {
+ try {
+ if (that == null) {
+ return false;
+ }
+
+ RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
+ if (checksum != other.checksum) { // the checksum stands in for comparing the rule tables
+ return false;
+ }
+ if (text == null) {
+ return other.text == null;
+ } else {
+ return text.equals(other.text);
+ }
+ }
+ catch(ClassCastException e) { // e.g. "that" is not a RuleBasedBreakIterator
+ return false;
+ }
+ }
+
+ /**
+ * Returns a string of the form "[checksum=0xHEX]", where HEX is the checksum in hexadecimal
+ */
+ @Override
+ public String toString() {
+ return "[checksum=0x" + Long.toHexString(checksum) + ']';
+ }
+
+ /**
+ * Compute a hashcode for this BreakIterator
+ * @return The checksum narrowed to an int (its low-order 32 bits)
+ */
+ @Override
+ public int hashCode() {
+ return (int)checksum;
+ }
+
+ //=======================================================================
+ // BreakIterator overrides
+ //=======================================================================
+
+ /**
+ * Sets the current iteration position to the beginning of the text.
+ * (i.e., the CharacterIterator's starting offset).
+ * @return The offset of the beginning of the text.
+ */
+ @Override
+ public int first() {
+ CharacterIterator t = getText();
+
+ t.first(); // reposition first, then report the resulting index
+ return t.getIndex();
+ }
+
+ /**
+ * Sets the current iteration position to the end of the text.
+ * (i.e., the CharacterIterator's ending offset).
+ * @return The text's past-the-end offset.
+ */
+ @Override
+ public int last() {
+ CharacterIterator t = getText();
+
+ // t.last() would position on the last character rather than at the
+ // past-the-end offset, so set the index to the end index explicitly
+ t.setIndex(t.getEndIndex());
+ return t.getIndex();
+ }
+
+ /**
+ * Advances the iterator either forward or backward the specified number of steps.
+ * Negative values move backward, and positive values move forward. This is
+ * equivalent to repeatedly calling next() or previous().
+ * @param n The number of steps to move. The sign indicates the direction
+ * (negative is backwards, and positive is forwards).
+ * @return The character offset of the boundary position n boundaries away from
+ * the current one.
+ */
+ @Override
+ public int next(int n) {
+ int result = current(); // returned unchanged when n == 0
+ while (n > 0) {
+ result = handleNext();
+ --n;
+ }
+ while (n < 0) {
+ result = previous();
+ ++n;
+ }
+ return result; // the loops do not stop early, so this is the result of the last step taken
+ }
+
+ /**
+ * Advances the iterator to the next boundary position.
+ * @return The position of the first boundary after this one, as computed by handleNext().
+ */
+ @Override
+ public int next() {
+ return handleNext();
+ }
+
+ private int cachedLastKnownBreak = BreakIterator.DONE; // last result of previous()/following(); reset to DONE by setText()
+
+ /**
+ * Advances the iterator backwards, to the last boundary preceding this one.
+ * @return The last boundary position preceding this one, or BreakIterator.DONE if already at the beginning.
+ */
+ @Override
+ public int previous() {
+ // if we're already sitting at the beginning of the text, return DONE
+ CharacterIterator text = getText();
+ if (current() == text.getBeginIndex()) {
+ return BreakIterator.DONE;
+ }
+
+ // set things up. handlePrevious() will back us up to some valid
+ // break position before the current position (we back our internal
+ // iterator up one step to prevent handlePrevious() from returning
+ // the current position), but not necessarily the last one before
+ // where we started
+ int start = current();
+ int lastResult = cachedLastKnownBreak;
+ if (lastResult >= start || lastResult <= BreakIterator.DONE) { // cached break is unusable: not before start, or unset
+ getPrevious();
+ lastResult = handlePrevious();
+ } else {
+ //it might be better to check if handlePrevious() give us closer
+ //safe value but handlePrevious() is slow too
+ //So, this has to be done carefully
+ text.setIndex(lastResult);
+ }
+ int result = lastResult;
+
+ // iterate forward from the known break position until we pass our
+ // starting point. The last break position before the starting
+ // point is our return value
+ while (result != BreakIterator.DONE && result < start) {
+ lastResult = result;
+ result = handleNext();
+ }
+
+ // set the current iteration position to be the last break position
+ // before where we started, and then return that value
+ text.setIndex(lastResult);
+ cachedLastKnownBreak = lastResult;
+ return lastResult;
+ }
+
+ /**
+ * Moves the text iterator back one code point and returns it (a surrogate pair is returned as one value)
+ */
+ private int getPrevious() {
+ char c2 = text.previous();
+ if (Character.isLowSurrogate(c2) &&
+ text.getIndex() > text.getBeginIndex()) {
+ char c1 = text.previous();
+ if (Character.isHighSurrogate(c1)) {
+ return Character.toCodePoint(c1, c2);
+ } else {
+ text.next(); // unpaired low surrogate: undo the extra step back
+ }
+ }
+ return (int)c2;
+ }
+
+ /**
+ * Returns the code point at the current position (a surrogate pair is returned as one value)
+ */
+ int getCurrent() {
+ char c1 = text.current();
+ if (Character.isHighSurrogate(c1) &&
+ text.getIndex() < text.getEndIndex()) {
+ char c2 = text.next(); // peek at the following char ...
+ text.previous(); // ... and step back again
+ if (Character.isLowSurrogate(c2)) {
+ return Character.toCodePoint(c1, c2);
+ }
+ }
+ return (int)c1;
+ }
+
+ /**
+ * Returns the number of chars (1 or 2) occupied by the code point at the current position.
+ */
+ private int getCurrentCodePointCount() {
+ char c1 = text.current();
+ if (Character.isHighSurrogate(c1) &&
+ text.getIndex() < text.getEndIndex()) {
+ char c2 = text.next(); // peek at the following char ...
+ text.previous(); // ... and step back again
+ if (Character.isLowSurrogate(c2)) {
+ return 2;
+ }
+ }
+ return 1;
+ }
+
+ /**
+ * Advances past the current code point and returns the next one, or DONE if there is none
+ */
+ int getNext() {
+ int index = text.getIndex();
+ int endIndex = text.getEndIndex();
+ if (index == endIndex ||
+ (index += getCurrentCodePointCount()) >= endIndex) {
+ return CharacterIterator.DONE; // note: the iterator position is not changed on this path
+ }
+ text.setIndex(index);
+ return getCurrent();
+ }
+
+ /**
+ * Returns the index just past the code point at the current position, capped at the end index.
+ */
+ private int getNextIndex() {
+ int index = text.getIndex() + getCurrentCodePointCount();
+ int endIndex = text.getEndIndex();
+ if (index > endIndex) {
+ return endIndex;
+ } else {
+ return index;
+ }
+ }
+
+ /**
+ * Throws IllegalArgumentException unless offset lies between the text's begin and end index, both inclusive.
+ */
+ protected static final void checkOffset(int offset, CharacterIterator text) {
+ if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { // offset == end index is accepted
+ throw new IllegalArgumentException("offset out of bounds");
+ }
+ }
+
+ /**
+ * Sets the iterator to refer to the first boundary position following
+ * the specified position.
+ * @param offset The position from which to begin searching for a break position.
+ * @return The position of the first break after the specified position.
+ */
+ @Override
+ public int following(int offset) {
+
+ CharacterIterator text = getText();
+ checkOffset(offset, text);
+
+ // Set our internal iteration position (temporarily)
+ // to the position passed in. If this is the _beginning_ position,
+ // then we can just use next() to get our return value
+ text.setIndex(offset);
+ if (offset == text.getBeginIndex()) {
+ cachedLastKnownBreak = handleNext();
+ return cachedLastKnownBreak;
+ }
+
+ // otherwise, we have to sync up first. Use handlePrevious() to back
+ // us up to a known break position before the specified position (if
+ // we can determine that the specified position is a break position,
+ // we don't back up at all). This may or may not be the last break
+ // position at or before our starting position. Advance forward
+ // from here until we've passed the starting position. The position
+ // we stop on will be the first break position after the specified one.
+ int result = cachedLastKnownBreak;
+ if (result >= offset || result <= BreakIterator.DONE) { // cached break is unusable: not before offset, or unset
+ result = handlePrevious();
+ } else {
+ //it might be better to check if handlePrevious() give us closer
+ //safe value but handlePrevious() is slow too
+ //So, this has to be done carefully
+ text.setIndex(result);
+ }
+ while (result != BreakIterator.DONE && result <= offset) {
+ result = handleNext();
+ }
+ cachedLastKnownBreak = result;
+ return result;
+ }
+
+ /**
+ * Sets the iterator to refer to the last boundary position before the
+ * specified position.
+ * @param offset The position to begin searching for a break from.
+ * @return The position of the last boundary before the starting position.
+ */
+ @Override
+ public int preceding(int offset) {
+ // if we start by updating the current iteration position to the
+ // position specified by the caller, we can just use previous()
+ // to carry out this operation
+ CharacterIterator text = getText();
+ checkOffset(offset, text);
+ text.setIndex(offset);
+ return previous();
+ }
+
+ /**
+ * Returns true if the specified position is a boundary position. Unless
+ * "offset" is the begin index (the iterator is then not moved), this has the side
+ * effect of leaving the iterator at the first boundary position at or after "offset".
+ * @param offset the offset to check.
+ * @return True if "offset" is a boundary position.
+ */
+ @Override
+ public boolean isBoundary(int offset) {
+ CharacterIterator text = getText();
+ checkOffset(offset, text);
+ if (offset == text.getBeginIndex()) { // the beginning of the text always counts as a boundary
+ return true;
+ }
+
+ // to check whether this is a boundary, we can use following() on the
+ // position before the specified one and return true if the position we
+ // get back is the one the user specified
+ else {
+ return following(offset - 1) == offset;
+ }
+ }
+
+ /**
+ * Returns the current iteration position.
+ * @return The current iteration position.
+ */
+ @Override
+ public int current() {
+ return getText().getIndex(); // getText() substitutes an empty iterator when no text has been set
+ }
+
+ /**
+ * Return a CharacterIterator over the text being analyzed. This version
+ * of this method returns the actual CharacterIterator we're using internally.
+ * Changing the state of this iterator can have undefined consequences. If
+ * you need to change it, clone it first.
+ * @return An iterator over the text being analyzed.
+ */
+ @Override
+ public CharacterIterator getText() {
+ // The iterator is initialized pointing to no text at all, so if this
+ // function is called while we're in that state, we have to fudge an
+ // iterator to return.
+ if (text == null) {
+ text = new StringCharacterIterator(""); // created on first use and kept in the field
+ }
+ return text;
+ }
+
+ /**
+ * Set the iterator to analyze a new piece of text. This function resets
+ * the current iteration position to the beginning of the text.
+ * @param newText An iterator over the text to analyze.
+ */
+ @Override
+ public void setText(CharacterIterator newText) {
+ // Test iterator to see if we need to wrap it in a SafeCharIterator.
+ // The correct behavior for CharacterIterators is to allow the
+ // position to be set to the endpoint of the iterator. Many
+ // CharacterIterators do not uphold this, so this is a workaround
+ // to permit them to use this class.
+ int end = newText.getEndIndex(); // a null newText fails here with NullPointerException
+ boolean goodIterator;
+ try {
+ newText.setIndex(end); // some buggy iterators throw an exception here
+ goodIterator = newText.getIndex() == end;
+ }
+ catch(IllegalArgumentException e) {
+ goodIterator = false;
+ }
+
+ if (goodIterator) {
+ text = newText; // used directly, not copied
+ }
+ else {
+ text = new SafeCharIterator(newText);
+ }
+ text.first();
+
+ cachedLastKnownBreak = BreakIterator.DONE; // the cached break belonged to the previous text
+ }
+
+
+ //=======================================================================
+ // implementation
+ //=======================================================================
+
+ /**
+ * This method is the actual implementation of the next() method. All iteration
+ * vectors through here. This method initializes the state machine to state 1
+ * and advances through the text character by character until we reach the end
+ * of the text or the state machine transitions to state 0. We update our return
+ * value every time the state machine passes through a possible end state.
+ */
+ protected int handleNext() {
+ // if we're already at the end of the text, return DONE.
+ CharacterIterator text = getText();
+ if (text.getIndex() == text.getEndIndex()) {
+ return BreakIterator.DONE;
+ }
+
+ // no matter what, we always advance at least one character forward
+ int result = getNextIndex();
+ int lookaheadResult = 0;
+
+ // begin in state 1
+ int state = START_STATE;
+ int category;
+ int c = getCurrent();
+
+ // loop until we reach the end of the text or transition to state 0
+ while (c != CharacterIterator.DONE && state != STOP_STATE) {
+
+ // look up the current character's character category (which tells us
+ // which column in the state table to look at)
+ category = lookupCategory(c);
+
+ // if the character isn't an ignore character, look up a state
+ // transition in the state table
+ if (category != IGNORE) {
+ state = lookupState(state, category);
+ }
+
+ // if the state we've just transitioned to is a lookahead state,
+ // (but not also an end state), save its position. If it's
+ // both a lookahead state and an end state, update the break position
+ // to the last saved lookahead-state position
+ if (lookaheadStates[state]) {
+ if (endStates[state]) {
+ result = lookaheadResult;
+ }
+ else {
+ lookaheadResult = getNextIndex();
+ }
+ }
+
+ // otherwise, if the state we've just transitioned to is an accepting
+ // state, update the break position to be the current iteration position
+ else {
+ if (endStates[state]) {
+ result = getNextIndex();
+ }
+ }
+
+ c = getNext();
+ }
+
+ // if we've run off the end of the text, and the very last character took us into
+ // a lookahead state, advance the break position to the lookahead position
+ // (the theory here is that if there are no characters at all after the lookahead
+ // position, that always matches the lookahead criteria)
+ if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) {
+ result = lookaheadResult;
+ }
+
+ text.setIndex(result); // leave the iterator on the break position being returned
+ return result;
+ }
+
+ /**
+ * Backs the iterator up to a "safe position" in the text: a position that we know,
+ * without any context, must be a break position. Calling methods then iterate
+ * forward from this safe position to the appropriate position to return. (See the
+ * description of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
+ * @return the index of the text iterator once backing up is done
+ */
+ protected int handlePrevious() {
+ CharacterIterator text = getText();
+ int state = START_STATE;
+ int category = 0;
+ int lastCategory = 0;
+ int c = getCurrent();
+
+ // loop until we reach the beginning of the text or transition to STOP_STATE
+ while (c != CharacterIterator.DONE && state != STOP_STATE) {
+
+ // remember the category looked up on the previous pass, then look up
+ // the current character's category
+ lastCategory = category;
+ category = lookupCategory(c);
+
+ // if the current character isn't an ignore character, look up a
+ // state transition in the backwards state table
+ if (category != IGNORE) {
+ state = lookupBackwardState(state, category);
+ }
+
+ // then advance one character backwards
+ c = getPrevious();
+ }
+
+ // if we didn't march off the beginning of the text, we're either one or two
+ // positions away from the real break position. (One because of the call to
+ // getPrevious() at the end of the loop above, and another because the character
+ // that takes us into the stop state will always be the character BEFORE
+ // the break position; only one step is taken when lastCategory is IGNORE.)
+ if (c != CharacterIterator.DONE) {
+ if (lastCategory != IGNORE) {
+ getNext();
+ getNext();
+ }
+ else {
+ getNext();
+ }
+ }
+ return text.getIndex();
+ }
+
+ /**
+  * Maps a code point to its break category (the category used for breaking
+  * purposes, as opposed to its Unicode category).
+  */
+ protected int lookupCategory(int c) {
+     // code points below the supplementary range use one table, all others the other
+     boolean belowSupplementary = c < Character.MIN_SUPPLEMENTARY_CODE_POINT;
+     return belowSupplementary
+             ? charCategoryTable.elementAt((char)c)
+             : supplementaryCharCategoryTable.getValue(c);
+ }
+
+ /**
+ * Given a current state and a character category, looks up the
+ * next state to transition to in the state table.
+ */
+ protected int lookupState(int state, int category) {
+ return stateTable[state * numCategories + category];
+ }
+
+ /**
+ * Given a current state and a character category, looks up the
+ * next state to transition to in the backwards state table.
+ */
+ protected int lookupBackwardState(int state, int category) {
+ return backwardsStateTable[state * numCategories + category];
+ }
+
+ /*
+ * This class exists to work around a bug in incorrect implementations
+ * of CharacterIterator, which incorrectly handle setIndex(endIndex).
+ * This iterator relies only on base.setIndex(n) where n is less than
+ * endIndex.
+ *
+ * One caveat: if the base iterator's begin and end indices change
+ * the change will not be reflected by this wrapper. Does that matter?
+ */
+ // TODO: Review this class to see if it's still required.
+ private static final class SafeCharIterator implements CharacterIterator,
+ Cloneable {
+
+ private CharacterIterator base;
+ private int rangeStart;
+ private int rangeLimit;
+ private int currentIndex;
+
+ SafeCharIterator(CharacterIterator base) {
+ this.base = base;
+ this.rangeStart = base.getBeginIndex();
+ this.rangeLimit = base.getEndIndex();
+ this.currentIndex = base.getIndex();
+ }
+
+ @Override
+ public char first() {
+ return setIndex(rangeStart);
+ }
+
+ @Override
+ public char last() {
+ return setIndex(rangeLimit - 1);
+ }
+
+ @Override
+ public char current() {
+ if (currentIndex < rangeStart || currentIndex >= rangeLimit) {
+ return DONE;
+ }
+ else {
+ return base.setIndex(currentIndex);
+ }
+ }
+
+ @Override
+ public char next() {
+
+ currentIndex++;
+ if (currentIndex >= rangeLimit) {
+ currentIndex = rangeLimit;
+ return DONE;
+ }
+ else {
+ return base.setIndex(currentIndex);
+ }
+ }
+
+ @Override
+ public char previous() {
+
+ currentIndex--;
+ if (currentIndex < rangeStart) {
+ currentIndex = rangeStart;
+ return DONE;
+ }
+ else {
+ return base.setIndex(currentIndex);
+ }
+ }
+
+ @Override
+ public char setIndex(int i) {
+
+ if (i < rangeStart || i > rangeLimit) {
+ throw new IllegalArgumentException("Invalid position");
+ }
+ currentIndex = i;
+ return current();
+ }
+
+ @Override
+ public int getBeginIndex() {
+ return rangeStart;
+ }
+
+ @Override
+ public int getEndIndex() {
+ return rangeLimit;
+ }
+
+ @Override
+ public int getIndex() {
+ return currentIndex;
+ }
+
+ @Override
+ public Object clone() {
+
+ SafeCharIterator copy = null;
+ try {
+ copy = (SafeCharIterator) super.clone();
+ }
+ catch(CloneNotSupportedException e) {
+ throw new Error("Clone not supported: " + e);
+ }
+
+ CharacterIterator copyOfBase = (CharacterIterator) base.clone();
+ copy.base = copyOfBase;
+ return copy;
+ }
+ }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/text/resources/BreakIteratorResources.java Tue Oct 25 15:43:19 2016 +0900
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package sun.text.resources;
+
+import java.util.ResourceBundle;
+import sun.util.resources.BreakIteratorResourceBundle;
+
+public class BreakIteratorResources extends BreakIteratorResourceBundle {
+ @Override
+ protected ResourceBundle getBreakIteratorInfo() {
+ return new BreakIteratorInfo(); // a new bundle instance is created on every call
+ }
+}
--- a/jdk/src/java.base/share/classes/sun/util/locale/provider/BreakDictionary.java Mon Oct 24 21:44:33 2016 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,352 +0,0 @@
-/*
- * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-/*
- *
- * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
- * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
- *
- * The original version of this source code and documentation
- * is copyrighted and owned by Taligent, Inc., a wholly-owned
- * subsidiary of IBM. These materials are provided under terms
- * of a License Agreement between Taligent and Sun. This technology
- * is protected by multiple US and International patents.
- *
- * This notice and attribution to Taligent may not be removed.
- * Taligent is a registered trademark of Taligent, Inc.
- */
-package sun.util.locale.provider;
-
-import java.io.BufferedInputStream;
-import java.io.InputStream;
-import java.io.IOException;
-import java.lang.reflect.Module;
-import java.security.AccessController;
-import java.security.PrivilegedActionException;
-import java.security.PrivilegedExceptionAction;
-import java.util.MissingResourceException;
-import sun.text.CompactByteArray;
-import sun.text.SupplementaryCharacterData;
-
-/**
- * This is the class that represents the list of known words used by
- * DictionaryBasedBreakIterator. The conceptual data structure used
- * here is a trie: there is a node hanging off the root node for every
- * letter that can start a word. Each of these nodes has a node hanging
- * off of it for every letter that can be the second letter of a word
- * if this node is the first letter, and so on. The trie is represented
- * as a two-dimensional array that can be treated as a table of state
- * transitions. Indexes are used to compress this array, taking
- * advantage of the fact that this array will always be very sparse.
- */
-class BreakDictionary {
-
- //=========================================================================
- // data members
- //=========================================================================
-
- /**
- * The version of the dictionary that was read in.
- */
- private static int supportedVersion = 1;
-
- /**
- * Maps from characters to column numbers. The main use of this is to
- * avoid making room in the array for empty columns.
- */
- private CompactByteArray columnMap = null;
- private SupplementaryCharacterData supplementaryCharColumnMap = null;
-
- /**
- * The number of actual columns in the table
- */
- private int numCols;
-
- /**
- * Columns are organized into groups of 32. This says how many
- * column groups. (We could calculate this, but we store the
- * value to avoid having to repeatedly calculate it.)
- */
- private int numColGroups;
-
- /**
- * The actual compressed state table. Each conceptual row represents
- * a state, and the cells in it contain the row numbers of the states
- * to transition to for each possible letter. 0 is used to indicate
- * an illegal combination of letters (i.e., the error state). The
- * table is compressed by eliminating all the unpopulated (i.e., zero)
- * cells. Multiple conceptual rows can then be doubled up in a single
- * physical row by sliding them up and possibly shifting them to one
- * side or the other so the populated cells don't collide. Indexes
- * are used to identify unpopulated cells and to locate populated cells.
- */
- private short[] table = null;
-
- /**
- * This index maps logical row numbers to physical row numbers
- */
- private short[] rowIndex = null;
-
- /**
- * A bitmap is used to tell which cells in the comceptual table are
- * populated. This array contains all the unique bit combinations
- * in that bitmap. If the table is more than 32 columns wide,
- * successive entries in this array are used for a single row.
- */
- private int[] rowIndexFlags = null;
-
- /**
- * This index maps from a logical row number into the bitmap table above.
- * (This keeps us from storing duplicate bitmap combinations.) Since there
- * are a lot of rows with only one populated cell, instead of wasting space
- * in the bitmap table, we just store a negative number in this index for
- * rows with one populated cell. The absolute value of that number is
- * the column number of the populated cell.
- */
- private short[] rowIndexFlagsIndex = null;
-
- /**
- * For each logical row, this index contains a constant that is added to
- * the logical column number to get the physical column number
- */
- private byte[] rowIndexShifts = null;
-
- //=========================================================================
- // deserialization
- //=========================================================================
-
- BreakDictionary(Module module, String dictionaryName)
- throws IOException, MissingResourceException {
-
- readDictionaryFile(module, dictionaryName);
- }
-
- private void readDictionaryFile(final Module module, final String dictionaryName)
- throws IOException, MissingResourceException {
-
- BufferedInputStream in;
- try {
- PrivilegedExceptionAction<BufferedInputStream> pa = () -> {
- String pathName = "jdk.localedata".equals(module.getName()) ?
- "sun/text/resources/ext/" :
- "sun/text/resources/";
- InputStream is = module.getResourceAsStream(pathName + dictionaryName);
- if (is == null) {
- // Try to load the file with "java.base" module instance. Assumption
- // here is that the fall back data files to be read should reside in
- // java.base.
- is = BreakDictionary.class.getModule().getResourceAsStream("sun/text/resources/" + dictionaryName);
- }
-
- return new BufferedInputStream(is);
- };
- in = AccessController.doPrivileged(pa);
- }
- catch (PrivilegedActionException e) {
- throw new InternalError(e.toString(), e);
- }
-
- byte[] buf = new byte[8];
- if (in.read(buf) != 8) {
- throw new MissingResourceException("Wrong data length",
- dictionaryName, "");
- }
-
- // check version
- int version = RuleBasedBreakIterator.getInt(buf, 0);
- if (version != supportedVersion) {
- throw new MissingResourceException("Dictionary version(" + version + ") is unsupported",
- dictionaryName, "");
- }
-
- // get data size
- int len = RuleBasedBreakIterator.getInt(buf, 4);
- buf = new byte[len];
- if (in.read(buf) != len) {
- throw new MissingResourceException("Wrong data length",
- dictionaryName, "");
- }
-
- // close the stream
- in.close();
-
- int l;
- int offset = 0;
-
- // read in the column map for BMP characteres (this is serialized in
- // its internal form: an index array followed by a data array)
- l = RuleBasedBreakIterator.getInt(buf, offset);
- offset += 4;
- short[] temp = new short[l];
- for (int i = 0; i < l; i++, offset+=2) {
- temp[i] = RuleBasedBreakIterator.getShort(buf, offset);
- }
- l = RuleBasedBreakIterator.getInt(buf, offset);
- offset += 4;
- byte[] temp2 = new byte[l];
- for (int i = 0; i < l; i++, offset++) {
- temp2[i] = buf[offset];
- }
- columnMap = new CompactByteArray(temp, temp2);
-
- // read in numCols and numColGroups
- numCols = RuleBasedBreakIterator.getInt(buf, offset);
- offset += 4;
- numColGroups = RuleBasedBreakIterator.getInt(buf, offset);
- offset += 4;
-
- // read in the row-number index
- l = RuleBasedBreakIterator.getInt(buf, offset);
- offset += 4;
- rowIndex = new short[l];
- for (int i = 0; i < l; i++, offset+=2) {
- rowIndex[i] = RuleBasedBreakIterator.getShort(buf, offset);
- }
-
- // load in the populated-cells bitmap: index first, then bitmap list
- l = RuleBasedBreakIterator.getInt(buf, offset);
- offset += 4;
- rowIndexFlagsIndex = new short[l];
- for (int i = 0; i < l; i++, offset+=2) {
- rowIndexFlagsIndex[i] = RuleBasedBreakIterator.getShort(buf, offset);
- }
- l = RuleBasedBreakIterator.getInt(buf, offset);
- offset += 4;
- rowIndexFlags = new int[l];
- for (int i = 0; i < l; i++, offset+=4) {
- rowIndexFlags[i] = RuleBasedBreakIterator.getInt(buf, offset);
- }
-
- // load in the row-shift index
- l = RuleBasedBreakIterator.getInt(buf, offset);
- offset += 4;
- rowIndexShifts = new byte[l];
- for (int i = 0; i < l; i++, offset++) {
- rowIndexShifts[i] = buf[offset];
- }
-
- // load in the actual state table
- l = RuleBasedBreakIterator.getInt(buf, offset);
- offset += 4;
- table = new short[l];
- for (int i = 0; i < l; i++, offset+=2) {
- table[i] = RuleBasedBreakIterator.getShort(buf, offset);
- }
-
- // finally, prepare the column map for supplementary characters
- l = RuleBasedBreakIterator.getInt(buf, offset);
- offset += 4;
- int[] temp3 = new int[l];
- for (int i = 0; i < l; i++, offset+=4) {
- temp3[i] = RuleBasedBreakIterator.getInt(buf, offset);
- }
- supplementaryCharColumnMap = new SupplementaryCharacterData(temp3);
- }
-
- //=========================================================================
- // access to the words
- //=========================================================================
-
- /**
- * Uses the column map to map the character to a column number, then
- * passes the row and column number to getNextState()
- * @param row The current state
- * @param ch The character whose column we're interested in
- * @return The new state to transition to
- */
- public final short getNextStateFromCharacter(int row, int ch) {
- int col;
- if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
- col = columnMap.elementAt((char)ch);
- } else {
- col = supplementaryCharColumnMap.getValue(ch);
- }
- return getNextState(row, col);
- }
-
- /**
- * Returns the value in the cell with the specified (logical) row and
- * column numbers. In DictionaryBasedBreakIterator, the row number is
- * a state number, the column number is an input, and the return value
- * is the row number of the new state to transition to. (0 is the
- * "error" state, and -1 is the "end of word" state in a dictionary)
- * @param row The row number of the current state
- * @param col The column number of the input character (0 means "not a
- * dictionary character")
- * @return The row number of the new state to transition to
- */
- public final short getNextState(int row, int col) {
- if (cellIsPopulated(row, col)) {
- // we map from logical to physical row number by looking up the
- // mapping in rowIndex; we map from logical column number to
- // physical column number by looking up a shift value for this
- // logical row and offsetting the logical column number by
- // the shift amount. Then we can use internalAt() to actually
- // get the value out of the table.
- return internalAt(rowIndex[row], col + rowIndexShifts[row]);
- }
- else {
- return 0;
- }
- }
-
- /**
- * Given (logical) row and column numbers, returns true if the
- * cell in that position is populated
- */
- private boolean cellIsPopulated(int row, int col) {
- // look up the entry in the bitmap index for the specified row.
- // If it's a negative number, it's the column number of the only
- // populated cell in the row
- if (rowIndexFlagsIndex[row] < 0) {
- return col == -rowIndexFlagsIndex[row];
- }
-
- // if it's a positive number, it's the offset of an entry in the bitmap
- // list. If the table is more than 32 columns wide, the bitmap is stored
- // successive entries in the bitmap list, so we have to divide the column
- // number by 32 and offset the number we got out of the index by the result.
- // Once we have the appropriate piece of the bitmap, test the appropriate
- // bit and return the result.
- else {
- int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)];
- return (flags & (1 << (col & 0x1f))) != 0;
- }
- }
-
- /**
- * Implementation of getNextState() when we know the specified cell is
- * populated.
- * @param row The PHYSICAL row number of the cell
- * @param col The PHYSICAL column number of the cell
- * @return The value stored in the cell
- */
- private short internalAt(int row, int col) {
- // the table is a one-dimensional array, so this just does the math necessary
- // to treat it as a two-dimensional array (we don't just use a two-dimensional
- // array because two-dimensional arrays are inefficient in Java)
- return table[row * numCols + col];
- }
-}
--- a/jdk/src/java.base/share/classes/sun/util/locale/provider/BreakIteratorProviderImpl.java Mon Oct 24 21:44:33 2016 -0700
+++ b/jdk/src/java.base/share/classes/sun/util/locale/provider/BreakIteratorProviderImpl.java Tue Oct 25 15:43:19 2016 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -32,6 +32,8 @@
import java.util.MissingResourceException;
import java.util.Objects;
import java.util.Set;
+import sun.text.DictionaryBasedBreakIterator;
+import sun.text.RuleBasedBreakIterator;
/**
* Concrete implementation of the {@link java.text.spi.BreakIteratorProvider
@@ -153,29 +155,31 @@
}
private BreakIterator getBreakInstance(Locale locale,
- int type,
- String dataName,
- String dictionaryName) {
+ int type,
+ String ruleName,
+ String dictionaryName) {
Objects.requireNonNull(locale);
LocaleResources lr = LocaleProviderAdapter.forJRE().getLocaleResources(locale);
String[] classNames = (String[]) lr.getBreakIteratorInfo("BreakIteratorClasses");
- String dataFile = (String) lr.getBreakIteratorInfo(dataName);
+ String ruleFile = (String) lr.getBreakIteratorInfo(ruleName);
+ byte[] ruleData = lr.getBreakIteratorResources(ruleName);
try {
switch (classNames[type]) {
case "RuleBasedBreakIterator":
- return new RuleBasedBreakIterator(
- lr.getBreakIteratorDataModule(), dataFile);
+ return new RuleBasedBreakIterator(ruleFile, ruleData);
+
case "DictionaryBasedBreakIterator":
String dictionaryFile = (String) lr.getBreakIteratorInfo(dictionaryName);
- return new DictionaryBasedBreakIterator(
- lr.getBreakIteratorDataModule(), dataFile, dictionaryFile);
+ byte[] dictionaryData = lr.getBreakIteratorResources(dictionaryName);
+ return new DictionaryBasedBreakIterator(ruleFile, ruleData,
+ dictionaryFile, dictionaryData);
default:
throw new IllegalArgumentException("Invalid break iterator class \"" +
classNames[type] + "\"");
}
- } catch (IOException | MissingResourceException | IllegalArgumentException e) {
+ } catch (MissingResourceException | IllegalArgumentException e) {
throw new InternalError(e.toString(), e);
}
}
--- a/jdk/src/java.base/share/classes/sun/util/locale/provider/DictionaryBasedBreakIterator.java Mon Oct 24 21:44:33 2016 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,524 +0,0 @@
-/*
- * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-/*
- *
- * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
- * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
- *
- * The original version of this source code and documentation
- * is copyrighted and owned by Taligent, Inc., a wholly-owned
- * subsidiary of IBM. These materials are provided under terms
- * of a License Agreement between Taligent and Sun. This technology
- * is protected by multiple US and International patents.
- *
- * This notice and attribution to Taligent may not be removed.
- * Taligent is a registered trademark of Taligent, Inc.
- */
-
-package sun.util.locale.provider;
-
-import java.io.IOException;
-import java.lang.reflect.Module;
-import java.text.CharacterIterator;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Stack;
-
-/**
- * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
- * to further subdivide ranges of text beyond what is possible using just the
- * state-table-based algorithm. This is necessary, for example, to handle
- * word and line breaking in Thai, which doesn't use spaces between words. The
- * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
- * up text as far as possible, and then contiguous ranges of letters are
- * repeatedly compared against a list of known words (i.e., the dictionary)
- * to divide them up into words.
- *
- * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
- * but adds one more special substitution name: <dictionary>. This substitution
- * name is used to identify characters in words in the dictionary. The idea is that
- * if the iterator passes over a chunk of text that includes two or more characters
- * in a row that are included in <dictionary>, it goes back through that range and
- * derives additional break positions (if possible) using the dictionary.
- *
- * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
- * file. It follows a prescribed search path to locate the dictionary (right now,
- * it looks for it in /com/ibm/text/resources in each directory in the classpath,
- * and won't find it in JAR files, but this location is likely to change). The
- * dictionary file is in a serialized binary format. We have a very primitive (and
- * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
- * currently making it public. Contact us for help.
- */
-class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
-
- /**
- * a list of known words that is used to divide up contiguous ranges of letters,
- * stored in a compressed, indexed, format that offers fast access
- */
- private BreakDictionary dictionary;
-
- /**
- * a list of flags indicating which character categories are contained in
- * the dictionary file (this is used to determine which ranges of characters
- * to apply the dictionary to)
- */
- private boolean[] categoryFlags;
-
- /**
- * a temporary hiding place for the number of dictionary characters in the
- * last range passed over by next()
- */
- private int dictionaryCharCount;
-
- /**
- * when a range of characters is divided up using the dictionary, the break
- * positions that are discovered are stored here, preventing us from having
- * to use either the dictionary or the state table again until the iterator
- * leaves this range of text
- */
- private int[] cachedBreakPositions;
-
- /**
- * if cachedBreakPositions is not null, this indicates which item in the
- * cache the current iteration position refers to
- */
- private int positionInCache;
-
- /**
- * Constructs a DictionaryBasedBreakIterator.
- * @param module The module where the dictionary file resides
- * @param dictionaryFilename The filename of the dictionary file to use
- */
- DictionaryBasedBreakIterator(Module module, String dataFile, String dictionaryFile)
- throws IOException {
- super(module, dataFile);
- byte[] tmp = super.getAdditionalData();
- if (tmp != null) {
- prepareCategoryFlags(tmp);
- super.setAdditionalData(null);
- }
- dictionary = new BreakDictionary(module, dictionaryFile);
- }
-
- private void prepareCategoryFlags(byte[] data) {
- categoryFlags = new boolean[data.length];
- for (int i = 0; i < data.length; i++) {
- categoryFlags[i] = (data[i] == (byte)1) ? true : false;
- }
- }
-
- @Override
- public void setText(CharacterIterator newText) {
- super.setText(newText);
- cachedBreakPositions = null;
- dictionaryCharCount = 0;
- positionInCache = 0;
- }
-
- /**
- * Sets the current iteration position to the beginning of the text.
- * (i.e., the CharacterIterator's starting offset).
- * @return The offset of the beginning of the text.
- */
- @Override
- public int first() {
- cachedBreakPositions = null;
- dictionaryCharCount = 0;
- positionInCache = 0;
- return super.first();
- }
-
- /**
- * Sets the current iteration position to the end of the text.
- * (i.e., the CharacterIterator's ending offset).
- * @return The text's past-the-end offset.
- */
- @Override
- public int last() {
- cachedBreakPositions = null;
- dictionaryCharCount = 0;
- positionInCache = 0;
- return super.last();
- }
-
- /**
- * Advances the iterator one step backwards.
- * @return The position of the last boundary position before the
- * current iteration position
- */
- @Override
- public int previous() {
- CharacterIterator text = getText();
-
- // if we have cached break positions and we're still in the range
- // covered by them, just move one step backward in the cache
- if (cachedBreakPositions != null && positionInCache > 0) {
- --positionInCache;
- text.setIndex(cachedBreakPositions[positionInCache]);
- return cachedBreakPositions[positionInCache];
- }
-
- // otherwise, dump the cache and use the inherited previous() method to move
- // backward. This may fill up the cache with new break positions, in which
- // case we have to mark our position in the cache
- else {
- cachedBreakPositions = null;
- int result = super.previous();
- if (cachedBreakPositions != null) {
- positionInCache = cachedBreakPositions.length - 2;
- }
- return result;
- }
- }
-
- /**
- * Sets the current iteration position to the last boundary position
- * before the specified position.
- * @param offset The position to begin searching from
- * @return The position of the last boundary before "offset"
- */
- @Override
- public int preceding(int offset) {
- CharacterIterator text = getText();
- checkOffset(offset, text);
-
- // if we have no cached break positions, or "offset" is outside the
- // range covered by the cache, we can just call the inherited routine
- // (which will eventually call other routines in this class that may
- // refresh the cache)
- if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||
- offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
- cachedBreakPositions = null;
- return super.preceding(offset);
- }
-
- // on the other hand, if "offset" is within the range covered by the cache,
- // then all we have to do is search the cache for the last break position
- // before "offset"
- else {
- positionInCache = 0;
- while (positionInCache < cachedBreakPositions.length
- && offset > cachedBreakPositions[positionInCache]) {
- ++positionInCache;
- }
- --positionInCache;
- text.setIndex(cachedBreakPositions[positionInCache]);
- return text.getIndex();
- }
- }
-
- /**
- * Sets the current iteration position to the first boundary position after
- * the specified position.
- * @param offset The position to begin searching forward from
- * @return The position of the first boundary after "offset"
- */
- @Override
- public int following(int offset) {
- CharacterIterator text = getText();
- checkOffset(offset, text);
-
- // if we have no cached break positions, or if "offset" is outside the
- // range covered by the cache, then dump the cache and call our
- // inherited following() method. This will call other methods in this
- // class that may refresh the cache.
- if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||
- offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
- cachedBreakPositions = null;
- return super.following(offset);
- }
-
- // on the other hand, if "offset" is within the range covered by the
- // cache, then just search the cache for the first break position
- // after "offset"
- else {
- positionInCache = 0;
- while (positionInCache < cachedBreakPositions.length
- && offset >= cachedBreakPositions[positionInCache]) {
- ++positionInCache;
- }
- text.setIndex(cachedBreakPositions[positionInCache]);
- return text.getIndex();
- }
- }
-
- /**
- * This is the implementation function for next().
- */
- @Override
- protected int handleNext() {
- CharacterIterator text = getText();
-
- // if there are no cached break positions, or if we've just moved
- // off the end of the range covered by the cache, we have to dump
- // and possibly regenerate the cache
- if (cachedBreakPositions == null ||
- positionInCache == cachedBreakPositions.length - 1) {
-
- // start by using the inherited handleNext() to find a tentative return
- // value. dictionaryCharCount tells us how many dictionary characters
- // we passed over on our way to the tentative return value
- int startPos = text.getIndex();
- dictionaryCharCount = 0;
- int result = super.handleNext();
-
- // if we passed over more than one dictionary character, then we use
- // divideUpDictionaryRange() to regenerate the cached break positions
- // for the new range
- if (dictionaryCharCount > 1 && result - startPos > 1) {
- divideUpDictionaryRange(startPos, result);
- }
-
- // otherwise, the value we got back from the inherited fuction
- // is our return value, and we can dump the cache
- else {
- cachedBreakPositions = null;
- return result;
- }
- }
-
- // if the cache of break positions has been regenerated (or existed all
- // along), then just advance to the next break position in the cache
- // and return it
- if (cachedBreakPositions != null) {
- ++positionInCache;
- text.setIndex(cachedBreakPositions[positionInCache]);
- return cachedBreakPositions[positionInCache];
- }
- return -9999; // SHOULD NEVER GET HERE!
- }
-
- /**
- * Looks up a character category for a character.
- */
- @Override
- protected int lookupCategory(int c) {
- // this override of lookupCategory() exists only to keep track of whether we've
- // passed over any dictionary characters. It calls the inherited lookupCategory()
- // to do the real work, and then checks whether its return value is one of the
- // categories represented in the dictionary. If it is, bump the dictionary-
- // character count.
- int result = super.lookupCategory(c);
- if (result != RuleBasedBreakIterator.IGNORE && categoryFlags[result]) {
- ++dictionaryCharCount;
- }
- return result;
- }
-
- /**
- * This is the function that actually implements the dictionary-based
- * algorithm. Given the endpoints of a range of text, it uses the
- * dictionary to determine the positions of any boundaries in this
- * range. It stores all the boundary positions it discovers in
- * cachedBreakPositions so that we only have to do this work once
- * for each time we enter the range.
- */
- @SuppressWarnings("unchecked")
- private void divideUpDictionaryRange(int startPos, int endPos) {
- CharacterIterator text = getText();
-
- // the range we're dividing may begin or end with non-dictionary characters
- // (i.e., for line breaking, we may have leading or trailing punctuation
- // that needs to be kept with the word). Seek from the beginning of the
- // range to the first dictionary character
- text.setIndex(startPos);
- int c = getCurrent();
- int category = lookupCategory(c);
- while (category == IGNORE || !categoryFlags[category]) {
- c = getNext();
- category = lookupCategory(c);
- }
-
- // initialize. We maintain two stacks: currentBreakPositions contains
- // the list of break positions that will be returned if we successfully
- // finish traversing the whole range now. possibleBreakPositions lists
- // all other possible word ends we've passed along the way. (Whenever
- // we reach an error [a sequence of characters that can't begin any word
- // in the dictionary], we back up, possibly delete some breaks from
- // currentBreakPositions, move a break from possibleBreakPositions
- // to currentBreakPositions, and start over from there. This process
- // continues in this way until we either successfully make it all the way
- // across the range, or exhaust all of our combinations of break
- // positions.)
- Stack<Integer> currentBreakPositions = new Stack<>();
- Stack<Integer> possibleBreakPositions = new Stack<>();
- List<Integer> wrongBreakPositions = new ArrayList<>();
-
- // the dictionary is implemented as a trie, which is treated as a state
- // machine. -1 represents the end of a legal word. Every word in the
- // dictionary is represented by a path from the root node to -1. A path
- // that ends in state 0 is an illegal combination of characters.
- int state = 0;
-
- // these two variables are used for error handling. We keep track of the
- // farthest we've gotten through the range being divided, and the combination
- // of breaks that got us that far. If we use up all possible break
- // combinations, the text contains an error or a word that's not in the
- // dictionary. In this case, we "bless" the break positions that got us the
- // farthest as real break positions, and then start over from scratch with
- // the character where the error occurred.
- int farthestEndPoint = text.getIndex();
- Stack<Integer> bestBreakPositions = null;
-
- // initialize (we always exit the loop with a break statement)
- c = getCurrent();
- while (true) {
-
- // if we can transition to state "-1" from our current state, we're
- // on the last character of a legal word. Push that position onto
- // the possible-break-positions stack
- if (dictionary.getNextState(state, 0) == -1) {
- possibleBreakPositions.push(text.getIndex());
- }
-
- // look up the new state to transition to in the dictionary
- state = dictionary.getNextStateFromCharacter(state, c);
-
- // if the character we're sitting on causes us to transition to
- // the "end of word" state, then it was a non-dictionary character
- // and we've successfully traversed the whole range. Drop out
- // of the loop.
- if (state == -1) {
- currentBreakPositions.push(text.getIndex());
- break;
- }
-
- // if the character we're sitting on causes us to transition to
- // the error state, or if we've gone off the end of the range
- // without transitioning to the "end of word" state, we've hit
- // an error...
- else if (state == 0 || text.getIndex() >= endPos) {
-
- // if this is the farthest we've gotten, take note of it in
- // case there's an error in the text
- if (text.getIndex() > farthestEndPoint) {
- farthestEndPoint = text.getIndex();
-
- @SuppressWarnings("unchecked")
- Stack<Integer> currentBreakPositionsCopy = (Stack<Integer>) currentBreakPositions.clone();
-
- bestBreakPositions = currentBreakPositionsCopy;
- }
-
- // wrongBreakPositions is a list of all break positions
- // we've tried starting that didn't allow us to traverse
- // all the way through the text. Every time we pop a
- // break position off of currentBreakPositions, we put it
- // into wrongBreakPositions to avoid trying it again later.
- // If we make it to this spot, we're either going to back
- // up to a break in possibleBreakPositions and try starting
- // over from there, or we've exhausted all possible break
- // positions and are going to do the fallback procedure.
- // This loop prevents us from messing with anything in
- // possibleBreakPositions that didn't work as a starting
- // point the last time we tried it (this is to prevent a bunch of
- // repetitive checks from slowing down some extreme cases)
- while (!possibleBreakPositions.isEmpty()
- && wrongBreakPositions.contains(possibleBreakPositions.peek())) {
- possibleBreakPositions.pop();
- }
-
- // if we've used up all possible break-position combinations, there's
- // an error or an unknown word in the text. In this case, we start
- // over, treating the farthest character we've reached as the beginning
- // of the range, and "blessing" the break positions that got us that
- // far as real break positions
- if (possibleBreakPositions.isEmpty()) {
- if (bestBreakPositions != null) {
- currentBreakPositions = bestBreakPositions;
- if (farthestEndPoint < endPos) {
- text.setIndex(farthestEndPoint + 1);
- }
- else {
- break;
- }
- }
- else {
- if ((currentBreakPositions.size() == 0 ||
- currentBreakPositions.peek().intValue() != text.getIndex())
- && text.getIndex() != startPos) {
- currentBreakPositions.push(text.getIndex());
- }
- getNext();
- currentBreakPositions.push(text.getIndex());
- }
- }
-
- // if we still have more break positions we can try, then promote the
- // last break in possibleBreakPositions into currentBreakPositions,
- // and get rid of all entries in currentBreakPositions that come after
- // it. Then back up to that position and start over from there (i.e.,
- // treat that position as the beginning of a new word)
- else {
- Integer temp = possibleBreakPositions.pop();
- Integer temp2 = null;
- while (!currentBreakPositions.isEmpty() && temp.intValue() <
- currentBreakPositions.peek().intValue()) {
- temp2 = currentBreakPositions.pop();
- wrongBreakPositions.add(temp2);
- }
- currentBreakPositions.push(temp);
- text.setIndex(currentBreakPositions.peek().intValue());
- }
-
- // re-sync "c" for the next go-round, and drop out of the loop if
- // we've made it off the end of the range
- c = getCurrent();
- if (text.getIndex() >= endPos) {
- break;
- }
- }
-
- // if we didn't hit any exceptional conditions on this last iteration,
- // just advance to the next character and loop
- else {
- c = getNext();
- }
- }
-
- // dump the last break position in the list, and replace it with the actual
- // end of the range (which may be the same character, or may be further on
- // because the range actually ended with non-dictionary characters we want to
- // keep with the word)
- if (!currentBreakPositions.isEmpty()) {
- currentBreakPositions.pop();
- }
- currentBreakPositions.push(endPos);
-
- // create a regular array to hold the break positions and copy
- // the break positions from the stack to the array (in addition,
- // our starting position goes into this array as a break position).
- // This array becomes the cache of break positions used by next()
- // and previous(), so this is where we actually refresh the cache.
- cachedBreakPositions = new int[currentBreakPositions.size() + 1];
- cachedBreakPositions[0] = startPos;
-
- for (int i = 0; i < currentBreakPositions.size(); i++) {
- cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue();
- }
- positionInCache = 0;
- }
-}
--- a/jdk/src/java.base/share/classes/sun/util/locale/provider/LocaleResources.java Mon Oct 24 21:44:33 2016 -0700
+++ b/jdk/src/java.base/share/classes/sun/util/locale/provider/LocaleResources.java Tue Oct 25 15:43:19 2016 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -42,7 +42,6 @@
import java.lang.ref.ReferenceQueue;
import java.lang.ref.SoftReference;
-import java.lang.reflect.Module;
import java.text.MessageFormat;
import java.util.Calendar;
import java.util.LinkedHashSet;
@@ -113,13 +112,14 @@
if (data == null || ((biInfo = data.get()) == null)) {
biInfo = localeData.getBreakIteratorInfo(locale).getObject(key);
cache.put(cacheKey, new ResourceReference(cacheKey, biInfo, referenceQueue));
- }
+ }
return biInfo;
}
- Module getBreakIteratorDataModule() {
- return localeData.getBreakIteratorInfo(locale).getClass().getModule();
+ @SuppressWarnings("unchecked")
+ byte[] getBreakIteratorResources(String key) {
+ return (byte[]) localeData.getBreakIteratorResources(locale).getObject(key);
}
int getCalendarData(String key) {
--- a/jdk/src/java.base/share/classes/sun/util/locale/provider/RuleBasedBreakIterator.java Mon Oct 24 21:44:33 2016 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,1198 +0,0 @@
-/*
- * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-/*
- *
- * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
- * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
- *
- * The original version of this source code and documentation
- * is copyrighted and owned by Taligent, Inc., a wholly-owned
- * subsidiary of IBM. These materials are provided under terms
- * of a License Agreement between Taligent and Sun. This technology
- * is protected by multiple US and International patents.
- *
- * This notice and attribution to Taligent may not be removed.
- * Taligent is a registered trademark of Taligent, Inc.
- */
-
-package sun.util.locale.provider;
-
-import java.io.BufferedInputStream;
-import java.io.InputStream;
-import java.io.IOException;
-import java.lang.reflect.Module;
-import java.security.AccessController;
-import java.security.PrivilegedActionException;
-import java.security.PrivilegedExceptionAction;
-import java.text.BreakIterator;
-import java.text.CharacterIterator;
-import java.text.StringCharacterIterator;
-import java.util.MissingResourceException;
-import sun.text.CompactByteArray;
-import sun.text.SupplementaryCharacterData;
-
-/**
- * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
- *
- * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
- * and <i>regular expressions.</i></p>
- *
- * <p>A substitution rule defines a name that can be used in place of an expression. It
- * consists of a name, which is a string of characters contained in angle brackets, an equals
- * sign, and an expression. (There can be no whitespace on either side of the equals sign.)
- * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
- * square brackets. A substitution is visible after its definition, and is filled in using
- * simple textual substitution. Substitution definitions can contain other substitutions, as
- * long as those substitutions have been defined first. Substitutions are generally used to
- * make the regular expressions (which can get quite complex) shorted and easier to read.
- * They typically define either character categories or commonly-used subexpressions.</p>
- *
- * <p>There is one special substitution. If the description defines a substitution
- * called "<ignore>", the expression must be a [] expression, and the
- * expression defines a set of characters (the "<em>ignore characters</em>") that
- * will be transparent to the BreakIterator. A sequence of characters will break the
- * same way it would if any ignore characters it contains are taken out. Break
- * positions never occur befoer ignore characters.</p>
- *
- * <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
- * defines a sequence of characters to be kept together. With one significant exception, the
- * iterator uses a longest-possible-match algorithm when matching text to regular
- * expressions. The iterator also treats descriptions containing multiple regular expressions
- * as if they were ORed together (i.e., as if they were separated by |).</p>
- *
- * <p>The special characters recognized by the regular-expression parser are as follows:</p>
- *
- * <blockquote>
- * <table border="1" width="100%">
- * <tr>
- * <td width="6%">*</td>
- * <td width="94%">Specifies that the expression preceding the asterisk may occur any number
- * of times (including not at all).</td>
- * </tr>
- * <tr>
- * <td width="6%">{}</td>
- * <td width="94%">Encloses a sequence of characters that is optional.</td>
- * </tr>
- * <tr>
- * <td width="6%">()</td>
- * <td width="94%">Encloses a sequence of characters. If followed by *, the sequence
- * repeats. Otherwise, the parentheses are just a grouping device and a way to delimit
- * the ends of expressions containing |.</td>
- * </tr>
- * <tr>
- * <td width="6%">|</td>
- * <td width="94%">Separates two alternative sequences of characters. Either one
- * sequence or the other, but not both, matches this expression. The | character can
- * only occur inside ().</td>
- * </tr>
- * <tr>
- * <td width="6%">.</td>
- * <td width="94%">Matches any character.</td>
- * </tr>
- * <tr>
- * <td width="6%">*?</td>
- * <td width="94%">Specifies a non-greedy asterisk. *? works the same way as *, except
- * when there is overlap between the last group of characters in the expression preceding the
- * * and the first group of characters following the *. When there is this kind of
- * overlap, * will match the longest sequence of characters that match the expression before
- * the *, and *? will match the shortest sequence of characters matching the expression
- * before the *?. For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text,
- * "x[xy]*x" will match through to the last x (i.e., "<strong>xxyxyyyxyxyxxyxyx</strong>yy",
- * but "x[xy]*?x" will only match the first two xes ("<strong>xx</strong>yxyyyxyxyxxyxyxyy").</td>
- * </tr>
- * <tr>
- * <td width="6%">[]</td>
- * <td width="94%">Specifies a group of alternative characters. A [] expression will
- * match any single character that is specified in the [] expression. For more on the
- * syntax of [] expressions, see below.</td>
- * </tr>
- * <tr>
- * <td width="6%">/</td>
- * <td width="94%">Specifies where the break position should go if text matches this
- * expression. (e.g., "[a-z]*/[:Zs:]*[1-0]" will match if the iterator sees a run
- * of letters, followed by a run of whitespace, followed by a digit, but the break position
- * will actually go before the whitespace). Expressions that don't contain / put the
- * break position at the end of the matching text.</td>
- * </tr>
- * <tr>
- * <td width="6%">\</td>
- * <td width="94%">Escape character. The \ itself is ignored, but causes the next
- * character to be treated as literal character. This has no effect for many
- * characters, but for the characters listed above, this deprives them of their special
- * meaning. (There are no special escape sequences for Unicode characters, or tabs and
- * newlines; these are all handled by a higher-level protocol. In a Java string,
- * "\n" will be converted to a literal newline character by the time the
- * regular-expression parser sees it. Of course, this means that \ sequences that are
- * visible to the regexp parser must be written as \\ when inside a Java string.) All
- * characters in the ASCII range except for letters, digits, and control characters are
- * reserved characters to the parser and must be preceded by \ even if they currently don't
- * mean anything.</td>
- * </tr>
- * <tr>
- * <td width="6%">!</td>
- * <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
- * parser that this expression specifies the backwards-iteration behavior of the iterator,
- * and not its normal iteration behavior. This is generally only used in situations
- * where the automatically-generated backwards-iteration brhavior doesn't produce
- * satisfactory results and must be supplemented with extra client-specified rules.</td>
- * </tr>
- * <tr>
- * <td width="6%"><em>(all others)</em></td>
- * <td width="94%">All other characters are treated as literal characters, which must match
- * the corresponding character(s) in the text exactly.</td>
- * </tr>
- * </table>
- * </blockquote>
- *
- * <p>Within a [] expression, a number of other special characters can be used to specify
- * groups of characters:</p>
- *
- * <blockquote>
- * <table border="1" width="100%">
- * <tr>
- * <td width="6%">-</td>
- * <td width="94%">Specifies a range of matching characters. For example
- * "[a-p]" matches all lowercase Latin letters from a to p (inclusive). The -
- * sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
- * language's alphabetical order: "[a-z]" doesn't include capital letters, nor does
- * it include accented letters such as a-umlaut.</td>
- * </tr>
- * <tr>
- * <td width="6%">::</td>
- * <td width="94%">A pair of colons containing a one- or two-letter code matches all
- * characters in the corresponding Unicode category. The two-letter codes are the same
- * as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]"
- * matches all currency symbols and all math symbols). Specifying a one-letter code is
- * the same as specifying all two-letter codes that begin with that letter (for example,
- * "[:L:]" matches all letters, and is equivalent to
- * "[:Lu::Ll::Lo::Lm::Lt:]"). Anything other than a valid two-letter Unicode
- * category code or a single letter that begins a Unicode category code is illegal within
- * colons.</td>
- * </tr>
- * <tr>
- * <td width="6%">[]</td>
- * <td width="94%">[] expressions can nest. This has no effect, except when used in
- * conjunction with the ^ token.</td>
- * </tr>
- * <tr>
- * <td width="6%">^</td>
- * <td width="94%">Excludes the character (or the characters in the [] expression) following
- * it from the group of characters. For example, "[a-z^p]" matches all Latin
- * lowercase letters except p. "[:L:^[\u4e00-\u9fff]]" matches all letters
- * except the Han ideographs.</td>
- * </tr>
- * <tr>
- * <td width="6%"><em>(all others)</em></td>
- * <td width="94%">All other characters are treated as literal characters. (For
- * example, "[aeiou]" specifies just the letters a, e, i, o, and u.)</td>
- * </tr>
- * </table>
- * </blockquote>
- *
- * <p>For a more complete explanation, see <a
- * href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>.
- * For examples, see the resource data (which is annotated).</p>
- *
- * @author Richard Gillam
- */
-class RuleBasedBreakIterator extends BreakIterator {
-
- /**
- * A token used as a character-category value to identify ignore characters
- */
- protected static final byte IGNORE = -1;
-
- /**
- * The state number of the starting state
- */
- private static final short START_STATE = 1;
-
- /**
- * The state-transition value indicating "stop"
- */
- private static final short STOP_STATE = 0;
-
- /**
- * Magic number for the BreakIterator data file format.
- */
- static final byte[] LABEL = {
- (byte)'B', (byte)'I', (byte)'d', (byte)'a', (byte)'t', (byte)'a',
- (byte)'\0'
- };
- static final int LABEL_LENGTH = LABEL.length;
-
- /**
- * Version number of the dictionary that was read in.
- */
- static final byte supportedVersion = 1;
-
- /**
- * Header size in byte count
- */
- private static final int HEADER_LENGTH = 36;
-
- /**
- * An array length of indices for BMP characters
- */
- private static final int BMP_INDICES_LENGTH = 512;
-
- /**
- * Tables that indexes from character values to character category numbers
- */
- private CompactByteArray charCategoryTable = null;
- private SupplementaryCharacterData supplementaryCharCategoryTable = null;
-
- /**
- * The table of state transitions used for forward iteration
- */
- private short[] stateTable = null;
-
- /**
- * The table of state transitions used to sync up the iterator with the
- * text in backwards and random-access iteration
- */
- private short[] backwardsStateTable = null;
-
- /**
- * A list of flags indicating which states in the state table are accepting
- * ("end") states
- */
- private boolean[] endStates = null;
-
- /**
- * A list of flags indicating which states in the state table are
- * lookahead states (states which turn lookahead on and off)
- */
- private boolean[] lookaheadStates = null;
-
- /**
- * A table for additional data. May be used by a subclass of
- * RuleBasedBreakIterator.
- */
- private byte[] additionalData = null;
-
- /**
- * The number of character categories (and, thus, the number of columns in
- * the state tables)
- */
- private int numCategories;
-
- /**
- * The character iterator through which this BreakIterator accesses the text
- */
- private CharacterIterator text = null;
-
- /**
- * A CRC32 value of all data in datafile
- */
- private long checksum;
-
- //=======================================================================
- // constructors
- //=======================================================================
-
- /**
- * Constructs a RuleBasedBreakIterator according to the module and the datafile
- * provided.
- */
- RuleBasedBreakIterator(Module module, String datafile)
- throws IOException, MissingResourceException {
- readTables(module, datafile);
- }
-
- /**
- * Read datafile. The datafile's format is as follows:
- * <pre>
- * BreakIteratorData {
- * u1 magic[7];
- * u1 version;
- * u4 totalDataSize;
- * header_info header;
- * body value;
- * }
- * </pre>
- * <code>totalDataSize</code> is the summation of the size of
- * <code>header_info</code> and <code>body</code> in byte count.
- * <p>
- * In <code>header</code>, each field except for checksum implies the
- * length of each field. Since <code>BMPdataLength</code> is a fixed-length
- * data(512 entries), its length isn't included in <code>header</code>.
- * <code>checksum</code> is a CRC32 value of all in <code>body</code>.
- * <pre>
- * header_info {
- * u4 stateTableLength;
- * u4 backwardsStateTableLength;
- * u4 endStatesLength;
- * u4 lookaheadStatesLength;
- * u4 BMPdataLength;
- * u4 nonBMPdataLength;
- * u4 additionalDataLength;
- * u8 checksum;
- * }
- * </pre>
- * <p>
- *
- * Finally, <code>BMPindices</code> and <code>BMPdata</code> are set to
- * <code>charCategoryTable</code>. <code>nonBMPdata</code> is set to
- * <code>supplementaryCharCategoryTable</code>.
- * <pre>
- * body {
- * u2 stateTable[stateTableLength];
- * u2 backwardsStateTable[backwardsStateTableLength];
- * u1 endStates[endStatesLength];
- * u1 lookaheadStates[lookaheadStatesLength];
- * u2 BMPindices[512];
- * u1 BMPdata[BMPdataLength];
- * u4 nonBMPdata[numNonBMPdataLength];
- * u1 additionalData[additionalDataLength];
- * }
- * </pre>
- */
- protected final void readTables(Module module, String datafile)
- throws IOException, MissingResourceException {
-
- byte[] buffer = readFile(module, datafile);
-
- /* Read header_info. */
- int stateTableLength = getInt(buffer, 0);
- int backwardsStateTableLength = getInt(buffer, 4);
- int endStatesLength = getInt(buffer, 8);
- int lookaheadStatesLength = getInt(buffer, 12);
- int BMPdataLength = getInt(buffer, 16);
- int nonBMPdataLength = getInt(buffer, 20);
- int additionalDataLength = getInt(buffer, 24);
- checksum = getLong(buffer, 28);
-
- /* Read stateTable[numCategories * numRows] */
- stateTable = new short[stateTableLength];
- int offset = HEADER_LENGTH;
- for (int i = 0; i < stateTableLength; i++, offset+=2) {
- stateTable[i] = getShort(buffer, offset);
- }
-
- /* Read backwardsStateTable[numCategories * numRows] */
- backwardsStateTable = new short[backwardsStateTableLength];
- for (int i = 0; i < backwardsStateTableLength; i++, offset+=2) {
- backwardsStateTable[i] = getShort(buffer, offset);
- }
-
- /* Read endStates[numRows] */
- endStates = new boolean[endStatesLength];
- for (int i = 0; i < endStatesLength; i++, offset++) {
- endStates[i] = buffer[offset] == 1;
- }
-
- /* Read lookaheadStates[numRows] */
- lookaheadStates = new boolean[lookaheadStatesLength];
- for (int i = 0; i < lookaheadStatesLength; i++, offset++) {
- lookaheadStates[i] = buffer[offset] == 1;
- }
-
- /* Read a category table and indices for BMP characters. */
- short[] temp1 = new short[BMP_INDICES_LENGTH]; // BMPindices
- for (int i = 0; i < BMP_INDICES_LENGTH; i++, offset+=2) {
- temp1[i] = getShort(buffer, offset);
- }
- byte[] temp2 = new byte[BMPdataLength]; // BMPdata
- System.arraycopy(buffer, offset, temp2, 0, BMPdataLength);
- offset += BMPdataLength;
- charCategoryTable = new CompactByteArray(temp1, temp2);
-
- /* Read a category table for non-BMP characters. */
- int[] temp3 = new int[nonBMPdataLength];
- for (int i = 0; i < nonBMPdataLength; i++, offset+=4) {
- temp3[i] = getInt(buffer, offset);
- }
- supplementaryCharCategoryTable = new SupplementaryCharacterData(temp3);
-
- /* Read additional data */
- if (additionalDataLength > 0) {
- additionalData = new byte[additionalDataLength];
- System.arraycopy(buffer, offset, additionalData, 0, additionalDataLength);
- }
-
- /* Set numCategories */
- numCategories = stateTable.length / endStates.length;
- }
-
- protected byte[] readFile(final Module module, final String datafile)
- throws IOException, MissingResourceException {
-
- BufferedInputStream is;
- try {
- PrivilegedExceptionAction<BufferedInputStream> pa = () -> {
- String pathName = "jdk.localedata".equals(module.getName()) ?
- "sun/text/resources/ext/" :
- "sun/text/resources/";
- InputStream in = module.getResourceAsStream(pathName + datafile);
- if (in == null) {
- // Try to load the file with "java.base" module instance. Assumption
- // here is that the fall back data files to be read should reside in
- // java.base.
- in = RuleBasedBreakIterator.class.getModule().getResourceAsStream("sun/text/resources/" + datafile);
- }
-
- return new BufferedInputStream(in);
- };
- is = AccessController.doPrivileged(pa);
- } catch (PrivilegedActionException e) {
- throw new InternalError(e.toString(), e);
- }
-
- int offset = 0;
-
- /* First, read magic, version, and header_info. */
- int len = LABEL_LENGTH + 5;
- byte[] buf = new byte[len];
- if (is.read(buf) != len) {
- throw new MissingResourceException("Wrong header length",
- datafile, "");
- }
-
- /* Validate the magic number. */
- for (int i = 0; i < LABEL_LENGTH; i++, offset++) {
- if (buf[offset] != LABEL[offset]) {
- throw new MissingResourceException("Wrong magic number",
- datafile, "");
- }
- }
-
- /* Validate the version number. */
- if (buf[offset] != supportedVersion) {
- throw new MissingResourceException("Unsupported version(" + buf[offset] + ")",
- datafile, "");
- }
-
- /* Read data: totalDataSize + 8(for checksum) */
- len = getInt(buf, ++offset);
- buf = new byte[len];
- if (is.read(buf) != len) {
- throw new MissingResourceException("Wrong data length",
- datafile, "");
- }
-
- is.close();
-
- return buf;
- }
-
- byte[] getAdditionalData() {
- return additionalData;
- }
-
- void setAdditionalData(byte[] b) {
- additionalData = b;
- }
-
- //=======================================================================
- // boilerplate
- //=======================================================================
- /**
- * Clones this iterator.
- * @return A newly-constructed RuleBasedBreakIterator with the same
- * behavior as this one.
- */
- @Override
- public Object clone() {
- RuleBasedBreakIterator result = (RuleBasedBreakIterator) super.clone();
- if (text != null) {
- result.text = (CharacterIterator) text.clone();
- }
- return result;
- }
-
- /**
- * Returns true if both BreakIterators are of the same class, have the same
- * rules, and iterate over the same text.
- */
- @Override
- public boolean equals(Object that) {
- try {
- if (that == null) {
- return false;
- }
-
- RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
- if (checksum != other.checksum) {
- return false;
- }
- if (text == null) {
- return other.text == null;
- } else {
- return text.equals(other.text);
- }
- }
- catch(ClassCastException e) {
- return false;
- }
- }
-
- /**
- * Returns text
- */
- @Override
- public String toString() {
- return "[checksum=0x" + Long.toHexString(checksum) + ']';
- }
-
- /**
- * Compute a hashcode for this BreakIterator
- * @return A hash code
- */
- @Override
- public int hashCode() {
- return (int)checksum;
- }
-
- //=======================================================================
- // BreakIterator overrides
- //=======================================================================
-
- /**
- * Sets the current iteration position to the beginning of the text.
- * (i.e., the CharacterIterator's starting offset).
- * @return The offset of the beginning of the text.
- */
- @Override
- public int first() {
- CharacterIterator t = getText();
-
- t.first();
- return t.getIndex();
- }
-
- /**
- * Sets the current iteration position to the end of the text.
- * (i.e., the CharacterIterator's ending offset).
- * @return The text's past-the-end offset.
- */
- @Override
- public int last() {
- CharacterIterator t = getText();
-
- // I'm not sure why, but t.last() returns the offset of the last character,
- // rather than the past-the-end offset
- t.setIndex(t.getEndIndex());
- return t.getIndex();
- }
-
- /**
- * Advances the iterator either forward or backward the specified number of steps.
- * Negative values move backward, and positive values move forward. This is
- * equivalent to repeatedly calling next() or previous().
- * @param n The number of steps to move. The sign indicates the direction
- * (negative is backwards, and positive is forwards).
- * @return The character offset of the boundary position n boundaries away from
- * the current one.
- */
- @Override
- public int next(int n) {
- int result = current();
- while (n > 0) {
- result = handleNext();
- --n;
- }
- while (n < 0) {
- result = previous();
- ++n;
- }
- return result;
- }
-
- /**
- * Advances the iterator to the next boundary position.
- * @return The position of the first boundary after this one.
- */
- @Override
- public int next() {
- return handleNext();
- }
-
- private int cachedLastKnownBreak = BreakIterator.DONE;
-
- /**
- * Advances the iterator backwards, to the last boundary preceding this one.
- * @return The position of the last boundary position preceding this one.
- */
- @Override
- public int previous() {
- // if we're already sitting at the beginning of the text, return DONE
- CharacterIterator text = getText();
- if (current() == text.getBeginIndex()) {
- return BreakIterator.DONE;
- }
-
- // set things up. handlePrevious() will back us up to some valid
- // break position before the current position (we back our internal
- // iterator up one step to prevent handlePrevious() from returning
- // the current position), but not necessarily the last one before
- // where we started
- int start = current();
- int lastResult = cachedLastKnownBreak;
- if (lastResult >= start || lastResult <= BreakIterator.DONE) {
- getPrevious();
- lastResult = handlePrevious();
- } else {
- //it might be better to check if handlePrevious() give us closer
- //safe value but handlePrevious() is slow too
- //So, this has to be done carefully
- text.setIndex(lastResult);
- }
- int result = lastResult;
-
- // iterate forward from the known break position until we pass our
- // starting point. The last break position before the starting
- // point is our return value
- while (result != BreakIterator.DONE && result < start) {
- lastResult = result;
- result = handleNext();
- }
-
- // set the current iteration position to be the last break position
- // before where we started, and then return that value
- text.setIndex(lastResult);
- cachedLastKnownBreak = lastResult;
- return lastResult;
- }
-
- /**
- * Returns previous character
- */
- private int getPrevious() {
- char c2 = text.previous();
- if (Character.isLowSurrogate(c2) &&
- text.getIndex() > text.getBeginIndex()) {
- char c1 = text.previous();
- if (Character.isHighSurrogate(c1)) {
- return Character.toCodePoint(c1, c2);
- } else {
- text.next();
- }
- }
- return (int)c2;
- }
-
- /**
- * Returns current character
- */
- int getCurrent() {
- char c1 = text.current();
- if (Character.isHighSurrogate(c1) &&
- text.getIndex() < text.getEndIndex()) {
- char c2 = text.next();
- text.previous();
- if (Character.isLowSurrogate(c2)) {
- return Character.toCodePoint(c1, c2);
- }
- }
- return (int)c1;
- }
-
- /**
- * Returns the count of next character.
- */
- private int getCurrentCodePointCount() {
- char c1 = text.current();
- if (Character.isHighSurrogate(c1) &&
- text.getIndex() < text.getEndIndex()) {
- char c2 = text.next();
- text.previous();
- if (Character.isLowSurrogate(c2)) {
- return 2;
- }
- }
- return 1;
- }
-
- /**
- * Returns next character
- */
- int getNext() {
- int index = text.getIndex();
- int endIndex = text.getEndIndex();
- if (index == endIndex ||
- (index += getCurrentCodePointCount()) >= endIndex) {
- return CharacterIterator.DONE;
- }
- text.setIndex(index);
- return getCurrent();
- }
-
- /**
- * Returns the position of next character.
- */
- private int getNextIndex() {
- int index = text.getIndex() + getCurrentCodePointCount();
- int endIndex = text.getEndIndex();
- if (index > endIndex) {
- return endIndex;
- } else {
- return index;
- }
- }
-
- /**
- * Throw IllegalArgumentException unless begin <= offset < end.
- */
- protected static final void checkOffset(int offset, CharacterIterator text) {
- if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
- throw new IllegalArgumentException("offset out of bounds");
- }
- }
-
- /**
- * Sets the iterator to refer to the first boundary position following
- * the specified position.
- * @offset The position from which to begin searching for a break position.
- * @return The position of the first break after the current position.
- */
- @Override
- public int following(int offset) {
-
- CharacterIterator text = getText();
- checkOffset(offset, text);
-
- // Set our internal iteration position (temporarily)
- // to the position passed in. If this is the _beginning_ position,
- // then we can just use next() to get our return value
- text.setIndex(offset);
- if (offset == text.getBeginIndex()) {
- cachedLastKnownBreak = handleNext();
- return cachedLastKnownBreak;
- }
-
- // otherwise, we have to sync up first. Use handlePrevious() to back
- // us up to a known break position before the specified position (if
- // we can determine that the specified position is a break position,
- // we don't back up at all). This may or may not be the last break
- // position at or before our starting position. Advance forward
- // from here until we've passed the starting position. The position
- // we stop on will be the first break position after the specified one.
- int result = cachedLastKnownBreak;
- if (result >= offset || result <= BreakIterator.DONE) {
- result = handlePrevious();
- } else {
- //it might be better to check if handlePrevious() give us closer
- //safe value but handlePrevious() is slow too
- //So, this has to be done carefully
- text.setIndex(result);
- }
- while (result != BreakIterator.DONE && result <= offset) {
- result = handleNext();
- }
- cachedLastKnownBreak = result;
- return result;
- }
-
- /**
- * Sets the iterator to refer to the last boundary position before the
- * specified position.
- * @offset The position to begin searching for a break from.
- * @return The position of the last boundary before the starting position.
- */
- @Override
- public int preceding(int offset) {
- // if we start by updating the current iteration position to the
- // position specified by the caller, we can just use previous()
- // to carry out this operation
- CharacterIterator text = getText();
- checkOffset(offset, text);
- text.setIndex(offset);
- return previous();
- }
-
- /**
- * Returns true if the specified position is a boundary position. As a side
- * effect, leaves the iterator pointing to the first boundary position at
- * or after "offset".
- * @param offset the offset to check.
- * @return True if "offset" is a boundary position.
- */
- @Override
- public boolean isBoundary(int offset) {
- CharacterIterator text = getText();
- checkOffset(offset, text);
- if (offset == text.getBeginIndex()) {
- return true;
- }
-
- // to check whether this is a boundary, we can use following() on the
- // position before the specified one and return true if the position we
- // get back is the one the user specified
- else {
- return following(offset - 1) == offset;
- }
- }
-
- /**
- * Returns the current iteration position.
- * @return The current iteration position.
- */
- @Override
- public int current() {
- return getText().getIndex();
- }
-
- /**
- * Return a CharacterIterator over the text being analyzed. This version
- * of this method returns the actual CharacterIterator we're using internally.
- * Changing the state of this iterator can have undefined consequences. If
- * you need to change it, clone it first.
- * @return An iterator over the text being analyzed.
- */
- @Override
- public CharacterIterator getText() {
- // The iterator is initialized pointing to no text at all, so if this
- // function is called while we're in that state, we have to fudge an
- // iterator to return.
- if (text == null) {
- text = new StringCharacterIterator("");
- }
- return text;
- }
-
- /**
- * Set the iterator to analyze a new piece of text. This function resets
- * the current iteration position to the beginning of the text.
- * @param newText An iterator over the text to analyze.
- */
- @Override
- public void setText(CharacterIterator newText) {
- // Test iterator to see if we need to wrap it in a SafeCharIterator.
- // The correct behavior for CharacterIterators is to allow the
- // position to be set to the endpoint of the iterator. Many
- // CharacterIterators do not uphold this, so this is a workaround
- // to permit them to use this class.
- int end = newText.getEndIndex();
- boolean goodIterator;
- try {
- newText.setIndex(end); // some buggy iterators throw an exception here
- goodIterator = newText.getIndex() == end;
- }
- catch(IllegalArgumentException e) {
- goodIterator = false;
- }
-
- if (goodIterator) {
- text = newText;
- }
- else {
- text = new SafeCharIterator(newText);
- }
- text.first();
-
- cachedLastKnownBreak = BreakIterator.DONE;
- }
-
-
- //=======================================================================
- // implementation
- //=======================================================================
-
- /**
- * This method is the actual implementation of the next() method. All iteration
- * vectors through here. This method initializes the state machine to state 1
- * and advances through the text character by character until we reach the end
- * of the text or the state machine transitions to state 0. We update our return
- * value every time the state machine passes through a possible end state.
- */
- protected int handleNext() {
- // if we're already at the end of the text, return DONE.
- CharacterIterator text = getText();
- if (text.getIndex() == text.getEndIndex()) {
- return BreakIterator.DONE;
- }
-
- // no matter what, we always advance at least one character forward
- int result = getNextIndex();
- int lookaheadResult = 0;
-
- // begin in state 1
- int state = START_STATE;
- int category;
- int c = getCurrent();
-
- // loop until we reach the end of the text or transition to state 0
- while (c != CharacterIterator.DONE && state != STOP_STATE) {
-
- // look up the current character's character category (which tells us
- // which column in the state table to look at)
- category = lookupCategory(c);
-
- // if the character isn't an ignore character, look up a state
- // transition in the state table
- if (category != IGNORE) {
- state = lookupState(state, category);
- }
-
- // if the state we've just transitioned to is a lookahead state,
- // (but not also an end state), save its position. If it's
- // both a lookahead state and an end state, update the break position
- // to the last saved lookup-state position
- if (lookaheadStates[state]) {
- if (endStates[state]) {
- result = lookaheadResult;
- }
- else {
- lookaheadResult = getNextIndex();
- }
- }
-
- // otherwise, if the state we've just transitioned to is an accepting
- // state, update the break position to be the current iteration position
- else {
- if (endStates[state]) {
- result = getNextIndex();
- }
- }
-
- c = getNext();
- }
-
- // if we've run off the end of the text, and the very last character took us into
- // a lookahead state, advance the break position to the lookahead position
- // (the theory here is that if there are no characters at all after the lookahead
- // position, that always matches the lookahead criteria)
- if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) {
- result = lookaheadResult;
- }
-
- text.setIndex(result);
- return result;
- }
-
- /**
- * This method backs the iterator back up to a "safe position" in the text.
- * This is a position that we know, without any context, must be a break position.
- * The various calling methods then iterate forward from this safe position to
- * the appropriate position to return. (For more information, see the description
- * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
- */
- protected int handlePrevious() {
- CharacterIterator text = getText();
- int state = START_STATE;
- int category = 0;
- int lastCategory = 0;
- int c = getCurrent();
-
- // loop until we reach the beginning of the text or transition to state 0
- while (c != CharacterIterator.DONE && state != STOP_STATE) {
-
- // save the last character's category and look up the current
- // character's category
- lastCategory = category;
- category = lookupCategory(c);
-
- // if the current character isn't an ignore character, look up a
- // state transition in the backwards state table
- if (category != IGNORE) {
- state = lookupBackwardState(state, category);
- }
-
- // then advance one character backwards
- c = getPrevious();
- }
-
- // if we didn't march off the beginning of the text, we're either one or two
- // positions away from the real break position. (One because of the call to
- // previous() at the end of the loop above, and another because the character
- // that takes us into the stop state will always be the character BEFORE
- // the break position.)
- if (c != CharacterIterator.DONE) {
- if (lastCategory != IGNORE) {
- getNext();
- getNext();
- }
- else {
- getNext();
- }
- }
- return text.getIndex();
- }
-
- /**
- * Looks up a character's category (i.e., its category for breaking purposes,
- * not its Unicode category)
- */
- protected int lookupCategory(int c) {
- if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
- return charCategoryTable.elementAt((char)c);
- } else {
- return supplementaryCharCategoryTable.getValue(c);
- }
- }
-
- /**
- * Given a current state and a character category, looks up the
- * next state to transition to in the state table.
- */
- protected int lookupState(int state, int category) {
- return stateTable[state * numCategories + category];
- }
-
- /**
- * Given a current state and a character category, looks up the
- * next state to transition to in the backwards state table.
- */
- protected int lookupBackwardState(int state, int category) {
- return backwardsStateTable[state * numCategories + category];
- }
-
- static long getLong(byte[] buf, int offset) {
- long num = buf[offset]&0xFF;
- for (int i = 1; i < 8; i++) {
- num = num<<8 | (buf[offset+i]&0xFF);
- }
- return num;
- }
-
- static int getInt(byte[] buf, int offset) {
- int num = buf[offset]&0xFF;
- for (int i = 1; i < 4; i++) {
- num = num<<8 | (buf[offset+i]&0xFF);
- }
- return num;
- }
-
- static short getShort(byte[] buf, int offset) {
- short num = (short)(buf[offset]&0xFF);
- num = (short)(num<<8 | (buf[offset+1]&0xFF));
- return num;
- }
-
- /*
- * This class exists to work around a bug in incorrect implementations
- * of CharacterIterator, which incorrectly handle setIndex(endIndex).
- * This iterator relies only on base.setIndex(n) where n is less than
- * endIndex.
- *
- * One caveat: if the base iterator's begin and end indices change
- * the change will not be reflected by this wrapper. Does that matter?
- */
- // TODO: Review this class to see if it's still required.
- private static final class SafeCharIterator implements CharacterIterator,
- Cloneable {
-
- private CharacterIterator base;
- private int rangeStart;
- private int rangeLimit;
- private int currentIndex;
-
- SafeCharIterator(CharacterIterator base) {
- this.base = base;
- this.rangeStart = base.getBeginIndex();
- this.rangeLimit = base.getEndIndex();
- this.currentIndex = base.getIndex();
- }
-
- @Override
- public char first() {
- return setIndex(rangeStart);
- }
-
- @Override
- public char last() {
- return setIndex(rangeLimit - 1);
- }
-
- @Override
- public char current() {
- if (currentIndex < rangeStart || currentIndex >= rangeLimit) {
- return DONE;
- }
- else {
- return base.setIndex(currentIndex);
- }
- }
-
- @Override
- public char next() {
-
- currentIndex++;
- if (currentIndex >= rangeLimit) {
- currentIndex = rangeLimit;
- return DONE;
- }
- else {
- return base.setIndex(currentIndex);
- }
- }
-
- @Override
- public char previous() {
-
- currentIndex--;
- if (currentIndex < rangeStart) {
- currentIndex = rangeStart;
- return DONE;
- }
- else {
- return base.setIndex(currentIndex);
- }
- }
-
- @Override
- public char setIndex(int i) {
-
- if (i < rangeStart || i > rangeLimit) {
- throw new IllegalArgumentException("Invalid position");
- }
- currentIndex = i;
- return current();
- }
-
- @Override
- public int getBeginIndex() {
- return rangeStart;
- }
-
- @Override
- public int getEndIndex() {
- return rangeLimit;
- }
-
- @Override
- public int getIndex() {
- return currentIndex;
- }
-
- @Override
- public Object clone() {
-
- SafeCharIterator copy = null;
- try {
- copy = (SafeCharIterator) super.clone();
- }
- catch(CloneNotSupportedException e) {
- throw new Error("Clone not supported: " + e);
- }
-
- CharacterIterator copyOfBase = (CharacterIterator) base.clone();
- copy.base = copyOfBase;
- return copy;
- }
- }
-}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/util/resources/BreakIteratorResourceBundle.java Tue Oct 25 15:43:19 2016 +0900
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package sun.util.resources;
+
+import java.io.InputStream;
+import java.security.AccessController;
+import java.security.PrivilegedActionException;
+import java.security.PrivilegedExceptionAction;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.ResourceBundle;
+import java.util.Set;
+
+/**
+ * Abstract base class for resource bundles that load {@code BreakIterator}
+ * data (rules or dictionaries) from the module that contains the concrete
+ * subclass. A subclass must implement {@link #getBreakIteratorInfo()} to
+ * return the corresponding {@code BreakIteratorInfo} bundle; the name of the
+ * data file for a key is taken from that bundle.
+ *
+ * <p>For example, if the given key is "WordDictionary" and the locale is "th",
+ * the data name is the value of that key in {@code BreakIteratorInfo_th},
+ * i.e. "thai_dict". The {@code thai_dict} resource is then loaded, as a
+ * {@code byte[]}, from the package and module of the implementation class.
+ */
+public abstract class BreakIteratorResourceBundle extends ResourceBundle {
+    // If any keys that are not for data names are added to BreakIteratorInfo*,
+    // those keys must be added to NON_DATA_KEYS.
+    private static final Set<String> NON_DATA_KEYS = Set.of("BreakIteratorClasses");
+
+    // Lazily computed, unmodifiable set of data keys; see handleKeySet().
+    private volatile Set<String> keys;
+
+    /**
+     * Returns an instance of the corresponding {@code BreakIteratorInfo} (basename).
+     * The returned instance must not have a parent bundle.
+     */
+    protected abstract ResourceBundle getBreakIteratorInfo();
+
+    /**
+     * Returns the contents of the data file named by {@code key} as a
+     * {@code byte[]}, or {@code null} if {@code key} is not a data key of the
+     * {@code BreakIteratorInfo} bundle.
+     *
+     * @throws InternalError if the data file is missing or cannot be read
+     */
+    @Override
+    protected Object handleGetObject(String key) {
+        if (NON_DATA_KEYS.contains(key)) {
+            return null;
+        }
+        ResourceBundle info = getBreakIteratorInfo();
+        if (!info.containsKey(key)) {
+            return null;
+        }
+        String path = getClass().getPackage().getName().replace('.', '/')
+                      + '/' + info.getString(key);
+        try (InputStream is = getResourceAsStream(path)) {
+            if (is == null) {
+                // Module.getResourceAsStream() returns null for a missing
+                // resource; report that directly instead of via an NPE.
+                throw new InternalError("Can't load " + path
+                                        + ": resource not found");
+            }
+            return is.readAllBytes();
+        } catch (Exception e) {
+            throw new InternalError("Can't load " + path, e);
+        }
+    }
+
+    /**
+     * Opens the named resource in the module of the runtime class, inside a
+     * doPrivileged block. Returns {@code null} if the resource is not found.
+     */
+    private InputStream getResourceAsStream(String path) throws Exception {
+        PrivilegedExceptionAction<InputStream> pa;
+        pa = () -> getClass().getModule().getResourceAsStream(path);
+        InputStream is;
+        try {
+            is = AccessController.doPrivileged(pa);
+        } catch (PrivilegedActionException e) {
+            throw e.getException();
+        }
+        return is;
+    }
+
+    @Override
+    public Enumeration<String> getKeys() {
+        return Collections.enumeration(keySet());
+    }
+
+    /**
+     * Returns the keys of the {@code BreakIteratorInfo} bundle minus the
+     * non-data keys. The set is computed once and is unmodifiable.
+     */
+    @Override
+    protected Set<String> handleKeySet() {
+        Set<String> k = keys;
+        if (k == null) {
+            // Work on a private copy so that the result does not depend on
+            // info.keySet() returning a fresh, mutable set.
+            Set<String> s = new HashSet<>(getBreakIteratorInfo().keySet());
+            s.removeAll(NON_DATA_KEYS);
+            synchronized (this) {
+                if (keys == null) {
+                    keys = Collections.unmodifiableSet(s);
+                }
+                k = keys;
+            }
+        }
+        return k;
+    }
+}
--- a/jdk/src/java.base/share/classes/sun/util/resources/LocaleData.java Mon Oct 24 21:44:33 2016 -0700
+++ b/jdk/src/java.base/share/classes/sun/util/resources/LocaleData.java Tue Oct 25 15:43:19 2016 +0900
@@ -123,6 +123,14 @@
}
/**
+ * Gets a break iterator resources resource bundle, using
+ * privileges to allow accessing a sun.* package.
+ */
+ public ResourceBundle getBreakIteratorResources(Locale locale) {
+ return getBundle(type.getTextResourcesPackage() + ".BreakIteratorResources", locale);
+ }
+
+ /**
* Gets a collation data resource bundle, using privileges
* to allow accessing a sun.* package.
*/
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/jdk.localedata/share/classes/sun/text/resources/ext/BreakIteratorResources_th.java Tue Oct 25 15:43:19 2016 +0900
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package sun.text.resources.ext;
+
+import java.util.ResourceBundle;
+import sun.util.resources.BreakIteratorResourceBundle;
+
+public class BreakIteratorResources_th extends BreakIteratorResourceBundle {
+ @Override
+ protected ResourceBundle getBreakIteratorInfo() {
+ return new BreakIteratorInfo_th();
+ }
+}
--- a/jdk/test/java/util/PluggableLocale/BreakIteratorProviderTest.java Mon Oct 24 21:44:33 2016 -0700
+++ b/jdk/test/java/util/PluggableLocale/BreakIteratorProviderTest.java Tue Oct 25 15:43:19 2016 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2016, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -89,7 +89,7 @@
String[] jresResult = new String[4];
if (jreSupportsLocale) {
for (int i = 0; i < 4; i++) {
- jresResult[i] = "sun.util.locale.provider."+classNames[i];
+ jresResult[i] = "sun.text." + classNames[i];
}
}
--- a/jdk/test/java/util/PluggableLocale/BreakIteratorProviderTest.sh Mon Oct 24 21:44:33 2016 -0700
+++ b/jdk/test/java/util/PluggableLocale/BreakIteratorProviderTest.sh Tue Oct 25 15:43:19 2016 +0900
@@ -1,6 +1,6 @@
#!/bin/sh
#
-# Copyright (c) 2007, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2007, 2016, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
@@ -23,6 +23,6 @@
#
#
# @test
-# @bug 4052440 8062588
+# @bug 4052440 8062588 8165804
# @summary BreakIteratorProvider tests
# @run shell ExecTest.sh foo BreakIteratorProviderTest
--- a/jdk/test/tools/jlink/plugins/IncludeLocalesPluginTest.java Mon Oct 24 21:44:33 2016 -0700
+++ b/jdk/test/tools/jlink/plugins/IncludeLocalesPluginTest.java Tue Oct 25 15:43:19 2016 +0900
@@ -40,7 +40,7 @@
/*
* @test
- * @bug 8152143 8152704 8155649
+ * @bug 8152143 8152704 8155649 8165804
* @summary IncludeLocalesPlugin tests
* @author Naoto Sato
* @library ../../lib
@@ -236,6 +236,7 @@
"/jdk.localedata/sun/text/resources/ext/thai_dict",
"/jdk.localedata/sun/text/resources/ext/WordBreakIteratorData_th",
"/jdk.localedata/sun/text/resources/ext/BreakIteratorInfo_th.class",
+ "/jdk.localedata/sun/text/resources/ext/BreakIteratorResources_th.class",
"/jdk.localedata/sun/text/resources/ext/FormatData_en_GB.class",
"/jdk.localedata/sun/text/resources/ext/FormatData_ja.class",
"/jdk.localedata/sun/text/resources/ext/FormatData_th.class",
@@ -261,6 +262,7 @@
"/jdk.localedata/sun/text/resources/ext/thai_dict",
"/jdk.localedata/sun/text/resources/ext/WordBreakIteratorData_th",
"/jdk.localedata/sun/text/resources/ext/BreakIteratorInfo_th.class",
+ "/jdk.localedata/sun/text/resources/ext/BreakIteratorResources_th.class",
"/jdk.localedata/sun/text/resources/ext/FormatData_th.class"),
List.of(
"/jdk.localedata/sun/text/resources/ext/FormatData_en_GB.class",