--- a/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerDataReader.java Thu Jul 16 19:31:01 2015 -0700
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,389 +0,0 @@
-/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
-
-package sun.text.normalizer;
-
-import java.io.DataInputStream;
-import java.io.InputStream;
-import java.io.IOException;
-
-/**
- * @author Ram Viswanadha
- */
-
- /*
- * Description of the format of unorm.icu version 2.1.
- *
- * Main change from version 1 to version 2:
- * Use of new, common Trie instead of normalization-specific tries.
- * Change to version 2.1: add third/auxiliary trie with associated data.
- *
- * For more details of how to use the data structures see the code
- * in unorm.cpp (runtime normalization code) and
- * in gennorm.c and gennorm/store.c (build-time data generation).
- *
- * For the serialized format of Trie see Trie.c/TrieHeader.
- *
- * - Overall partition
- *
- * unorm.icu customarily begins with a UDataInfo structure, see udata.h and .c.
- * After that there are the following structures:
- *
- * int indexes[INDEX_TOP]; -- INDEX_TOP=32, see enum in this file
- *
- * Trie normTrie; -- size in bytes=indexes[INDEX_TRIE_SIZE]
- *
- * char extraData[extraDataTop]; -- extraDataTop=indexes[INDEX_UCHAR_COUNT]
- * extraData[0] contains the number of units for
- * FC_NFKC_Closure (formatVersion>=2.1)
- *
- * char combiningTable[combiningTableTop]; -- combiningTableTop=indexes[INDEX_COMBINE_DATA_COUNT]
- * combiningTableTop may include one 16-bit padding unit
- * to make sure that fcdTrie is 32-bit-aligned
- *
- * Trie fcdTrie; -- size in bytes=indexes[INDEX_FCD_TRIE_SIZE]
- *
- * Trie auxTrie; -- size in bytes=indexes[INDEX_AUX_TRIE_SIZE]
- *
- *
- * The indexes array contains lengths and sizes of the following arrays and structures
- * as well as the following values:
- * indexes[INDEX_COMBINE_FWD_COUNT]=combineFwdTop
- * -- one more than the highest combining index computed for forward-only-combining characters
- * indexes[INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop
- * -- number of combining indexes computed for both-ways-combining characters
- * indexes[INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop
- * -- number of combining indexes computed for backward-only-combining characters
- *
- * indexes[INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD })
- * -- first code point with a quick check NF* value of NO/MAYBE
- *
- *
- * - Tries
- *
- * The main structures are two Trie tables ("compact arrays"),
- * each with one index array and one data array.
- * See Trie.h and Trie.c.
- *
- *
- * - Tries in unorm.icu
- *
- * The first trie (normTrie above)
- * provides data for the NF* quick checks and normalization.
- * The second trie (fcdTrie above) provides data just for FCD checks.
- *
- *
- * - norm32 data words from the first trie
- *
- * The norm32Table contains one 32-bit word "norm32" per code point.
- * It contains the following bit fields:
- * 31..16 extra data index, EXTRA_SHIFT is used to shift this field down
- * if this index is <EXTRA_INDEX_TOP then it is an index into
- * extraData[] where variable-length normalization data for this
- * code point is found
- * if this index is <EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
- * then this is a norm32 for a leading surrogate, and the index
- * value is used together with the following trailing surrogate
- * code unit in the second trie access
- * if this index is >=EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
- * then this is a norm32 for a "special" character,
- * i.e., the character is a Hangul syllable or a Jamo
- * see EXTRA_HANGUL etc.
- * generally, instead of extracting this index from the norm32 and
- * comparing it with the above constants,
- * the normalization code compares the entire norm32 value
- * with MIN_SPECIAL, SURROGATES_TOP, MIN_HANGUL etc.
- *
- * 15..8 combining class (cc) according to UnicodeData.txt
- *
- * 7..6 COMBINES_ANY flags, used in composition to see if a character
- * combines with any following or preceding character(s)
- * at all
- * 7 COMBINES_BACK
- * 6 COMBINES_FWD
- *
- * 5..0 quick check flags, set for "no" or "maybe", with separate flags for
- * each normalization form
- * the higher bits are "maybe" flags; for NF*D there are no such flags
- * the lower bits are "no" flags for all forms, in the same order
- * as the "maybe" flags,
- * which is (MSB to LSB): NFKD NFD NFKC NFC
- * 5..4 QC_ANY_MAYBE
- * 3..0 QC_ANY_NO
- * see further related constants
- *
- *
- * - Extra data per code point
- *
- * "Extra data" is referenced by the index in norm32.
- * It is variable-length data. It is present only when a given character needs
- * it, and then only those parts of it that the character needs are stored.
- * The norm32 extra data index is added to the beginning of extraData[]
- * to get to a vector of 16-bit words with data at the following offsets:
- *
- * [-1] Combining index for composition.
- * Stored only if norm32&COMBINES_ANY .
- * [0] Lengths of the canonical and compatibility decomposition strings.
- * Stored only if there are decompositions, i.e.,
- * if norm32&(QC_NFD|QC_NFKD)
- * High byte: length of NFKD, or 0 if none
- * Low byte: length of NFD, or 0 if none
- * Each length byte also has another flag:
- * Bit 7 of a length byte is set if there are non-zero
- * combining classes (cc's) associated with the respective
- * decomposition. If this flag is set, then the decomposition
- * is preceded by a 16-bit word that contains the
- * leading and trailing cc's.
- * Bits 6..0 of a length byte are the length of the
- * decomposition string, not counting the cc word.
- * [1..n] NFD
- * [n+1..] NFKD
- *
- * Each of the two decompositions consists of up to two parts:
- * - The 16-bit words with the leading and trailing cc's.
- * This is only stored if bit 7 of the corresponding length byte
- * is set. In this case, at least one of the cc's is not zero.
- * High byte: leading cc==cc of the first code point in the decomposition string
- * Low byte: trailing cc==cc of the last code point in the decomposition string
- * - The decomposition string in UTF-16, with length code units.
- *
- *
- * - Combining indexes and combiningTable[]
- *
- * Combining indexes are stored at the [-1] offset of the extra data
- * if the character combines forward or backward with any other characters.
- * They are used for (re)composition in NF*C.
- * Values of combining indexes are arranged according to whether a character
- * combines forward, backward, or both ways:
- * forward-only < both ways < backward-only
- *
- * The index values for forward-only and both-ways combining characters
- * are indexes into the combiningTable[].
- * The index values for backward-only combining characters are simply
- * incremented from the preceding index values to be unique.
- *
- * In the combiningTable[], a variable-length list
- * of variable-length (back-index, code point) pair entries is stored
- * for each forward-combining character.
- *
- * These back-indexes are the combining indexes of both-ways or backward-only
- * combining characters that the forward-combining character combines with.
- *
- * Each list is sorted in ascending order of back-indexes.
- * Each list is terminated with the last back-index having bit 15 set.
- *
- * Each pair (back-index, code point) takes up either 2 or 3
- * 16-bit words.
- * The first word of a list entry is the back-index, with its bit 15 set if
- * this is the last pair in the list.
- *
- * The second word contains flags in bits 15..13 that determine
- * if there is a third word and how the combined character is encoded:
- * 15 set if there is a third word in this list entry
- * 14 set if the result is a supplementary character
- * 13 set if the result itself combines forward
- *
- * According to these bits 15..14 of the second word,
- * the result character is encoded as follows:
- * 00 or 01 The result is <=0x1fff and stored in bits 12..0 of
- * the second word.
- * 10 The result is 0x2000..0xffff and stored in the third word.
- * Bits 12..0 of the second word are not used.
- * 11 The result is a supplementary character.
- * Bits 9..0 of the leading surrogate are in bits 9..0 of
- * the second word.
- * Add 0xd800 to these bits to get the complete surrogate.
- * Bits 12..10 of the second word are not used.
- * The trailing surrogate is stored in the third word.
- *
- *
- * - FCD trie
- *
- * The FCD trie is very simple.
- * It is a folded trie with 16-bit data words.
- * In each word, the high byte contains the leading cc of the character,
- * and the low byte contains the trailing cc of the character.
- * These cc's are the cc's of the first and last code points in the
- * canonical decomposition of the character.
- *
- * Since all 16 bits are used for cc's, lead surrogates must be tested
- * by checking the code unit instead of the trie data.
- * This is done only if the 16-bit data word is not zero.
- * If the code unit is a leading surrogate and the data word is not zero,
- * then instead of cc's it contains the offset for the second trie lookup.
- *
- *
- * - Auxiliary trie and data
- *
- *
- * The auxiliary 16-bit trie contains data for additional properties.
- * Bits
- * 15..13 reserved
- * 12 not NFC_Skippable (f) (formatVersion>=2.2)
- * 11 flag: not a safe starter for canonical closure
- * 10 composition exclusion
- * 9.. 0 index into extraData[] to FC_NFKC_Closure string
- * (not for lead surrogate),
- * or lead surrogate offset (for lead surrogate, if 9..0 not zero)
- *
- * Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
- * (used in NormalizerTransliterator)
- *
- * A skippable character is
- * a) unassigned, or ALL of the following:
- * b) of combining class 0.
- * c) not decomposed by this normalization form.
- * AND if NFC or NFKC,
- * d) can never compose with a previous character.
- * e) can never compose with a following character.
- * f) can never change if another character is added.
- * Example: a-breve might satisfy all but f, but if you
- * add an ogonek it changes to a-ogonek + breve
- *
- * a)..e) must be tested from norm32.
- * Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
- * into the auxiliary trie.
- * The same bit is used for NFC and NFKC; (c) differs for them.
- * As usual, we build the "not skippable" flags so that unassigned
- * code points get a 0 bit.
- * This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
- * Test Hangul LV syllables entirely in code.
- *
- *
- * - FC_NFKC_Closure strings in extraData[]
- *
- * Strings are either stored as a single code unit or as the length
- * followed by that many units.
- *
- */
-final class NormalizerDataReader implements ICUBinary.Authenticate {
- // Sequential reader for ICU normalization data: the constructor handles the header, readIndexes reads ints, read fills the data arrays.
- /**
- * <p>Protected constructor: passes the stream to ICUBinary.readHeader (with this object as the Authenticate callback), keeps the returned bytes as the Unicode version, and wraps the stream in a DataInputStream.</p>
- * @param inputStream ICU data file input stream (presumably unorm.icu, as named in the doc of read(); this line used to say uprop.dat -- TODO confirm)
- * @exception IOException thrown if the data file fails authentication
- * @draft 2.1
- */
- protected NormalizerDataReader(InputStream inputStream)
- throws IOException{
- // The same stream is handed to readHeader first and only then wrapped, so later reads continue wherever readHeader stopped.
- unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
- dataInputStream = new DataInputStream(inputStream);
- }
-
- // protected methods -------------------------------------------------
- // Reads 'length' consecutive 32-bit ints with DataInputStream.readInt (big-endian) and returns them in a new array.
- protected int[] readIndexes(int length)throws IOException{
- int[] indexes = new int[length];
- // Read the indexes one int at a time; readInt throws EOFException if the stream ends early
- for (int i = 0; i <length ; i++) {
- indexes[i] = dataInputStream.readInt();
- }
- return indexes;
- }
- /**
- * <p>Reads the remaining blocks of unorm.icu, in stream order, into the caller-supplied
- * arrays (for storage in NormalizerImpl); each array's length determines how much is read.</p>
- * @param normBytes filled completely with the bytes that make up the normTrie (read first)
- * @param fcdBytes filled completely with the bytes of the fcdTrie (read fourth)
- * @param auxBytes filled completely with the bytes of the auxTrie (read last)
- * @param extraData filled with extraData.length 16-bit units via readChar (read second)
- * @param combiningTable filled with combiningTable.length 16-bit units via readChar (read third)
- * @exception IOException thrown when data reading fails
- * @draft 2.1
- */
- protected void read(byte[] normBytes, byte[] fcdBytes, byte[] auxBytes,
- char[] extraData, char[] combiningTable)
- throws IOException{
-
- //Read the bytes that make up the normTrie
- dataInputStream.readFully(normBytes);
-
- //normTrieStream= new ByteArrayInputStream(normBytes);
-
- //Read the extra data
- for(int i=0;i<extraData.length;i++){
- extraData[i]=dataInputStream.readChar();
- }
-
- //Read the combining table (16-bit units, read the same way as the extra data)
- for(int i=0; i<combiningTable.length; i++){
- combiningTable[i]=dataInputStream.readChar();
- }
-
- //Read the fcdTrie
- dataInputStream.readFully(fcdBytes);
-
-
- //Read the AuxTrie
- dataInputStream.readFully(auxBytes);
- }
- // Returns the DATA_FORMAT_VERSION array itself (no copy is made).
- public byte[] getDataFormatVersion(){
- return DATA_FORMAT_VERSION;
- }
- // Accepts a version whose bytes 0, 2 and 3 equal those of DATA_FORMAT_VERSION; byte 1 is not compared.
- public boolean isDataVersionAcceptable(byte version[])
- {
- return version[0] == DATA_FORMAT_VERSION[0]
- && version[2] == DATA_FORMAT_VERSION[2]
- && version[3] == DATA_FORMAT_VERSION[3];
- }
- // Returns the byte array that ICUBinary.readHeader returned in the constructor (no copy is made).
- public byte[] getUnicodeVersion(){
- return unicodeVersion;
- }
- // private data members -------------------------------------------------
-
-
- /**
- * ICU data file input stream, wrapped for typed reads (readInt, readChar, readFully)
- */
- private DataInputStream dataInputStream;
- // Value returned by ICUBinary.readHeader; exposed through getUnicodeVersion()
- private byte[] unicodeVersion;
-
- /**
- * Format identifier (DATA_FORMAT_ID, passed to ICUBinary.readHeader) and the file format
- * version that this class understands. No guarantees are made if an older version is used;
- * see store.c of gennorm for more information and values
- */
- private static final byte DATA_FORMAT_ID[] = {(byte)0x4E, (byte)0x6F,
- (byte)0x72, (byte)0x6D}; // ASCII "Norm"
- private static final byte DATA_FORMAT_VERSION[] = {(byte)0x2, (byte)0x2,
- (byte)0x5, (byte)0x2}; // 2.2.5.2
-
-}