jdk/src/share/classes/java/text/RuleBasedBreakIterator.java
author alanb
Thu, 18 Aug 2011 16:47:20 +0100
changeset 10347 1c9efe1ec7d3
parent 7668 d4a77089c587
child 10419 12c063b39232
permissions -rw-r--r--
7015589: (spec) BufferedWriter.close leaves stream open if close of underlying Writer fails Reviewed-by: forax, mduigou
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
     1
/*
7668
d4a77089c587 6962318: Update copyright year
ohair
parents: 5506
diff changeset
     2
 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
     3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
90ce3da70b43 Initial load
duke
parents:
diff changeset
     4
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
     5
 * This code is free software; you can redistribute it and/or modify it
90ce3da70b43 Initial load
duke
parents:
diff changeset
     6
 * under the terms of the GNU General Public License version 2 only, as
5506
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 4844
diff changeset
     7
 * published by the Free Software Foundation.  Oracle designates this
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
     8
 * particular file as subject to the "Classpath" exception as provided
5506
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 4844
diff changeset
     9
 * by Oracle in the LICENSE file that accompanied this code.
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    10
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    11
 * This code is distributed in the hope that it will be useful, but WITHOUT
90ce3da70b43 Initial load
duke
parents:
diff changeset
    12
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
90ce3da70b43 Initial load
duke
parents:
diff changeset
    13
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
90ce3da70b43 Initial load
duke
parents:
diff changeset
    14
 * version 2 for more details (a copy is included in the LICENSE file that
90ce3da70b43 Initial load
duke
parents:
diff changeset
    15
 * accompanied this code).
90ce3da70b43 Initial load
duke
parents:
diff changeset
    16
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    17
 * You should have received a copy of the GNU General Public License version
90ce3da70b43 Initial load
duke
parents:
diff changeset
    18
 * 2 along with this work; if not, write to the Free Software Foundation,
90ce3da70b43 Initial load
duke
parents:
diff changeset
    19
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    20
 *
5506
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 4844
diff changeset
    21
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 4844
diff changeset
    22
 * or visit www.oracle.com if you need additional information or have any
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 4844
diff changeset
    23
 * questions.
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    24
 */
90ce3da70b43 Initial load
duke
parents:
diff changeset
    25
90ce3da70b43 Initial load
duke
parents:
diff changeset
    26
/*
90ce3da70b43 Initial load
duke
parents:
diff changeset
    27
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    28
 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
90ce3da70b43 Initial load
duke
parents:
diff changeset
    29
 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
90ce3da70b43 Initial load
duke
parents:
diff changeset
    30
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    31
 * The original version of this source code and documentation
90ce3da70b43 Initial load
duke
parents:
diff changeset
    32
 * is copyrighted and owned by Taligent, Inc., a wholly-owned
90ce3da70b43 Initial load
duke
parents:
diff changeset
    33
 * subsidiary of IBM. These materials are provided under terms
90ce3da70b43 Initial load
duke
parents:
diff changeset
    34
 * of a License Agreement between Taligent and Sun. This technology
90ce3da70b43 Initial load
duke
parents:
diff changeset
    35
 * is protected by multiple US and International patents.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    36
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    37
 * This notice and attribution to Taligent may not be removed.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    38
 * Taligent is a registered trademark of Taligent, Inc.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    39
 */
90ce3da70b43 Initial load
duke
parents:
diff changeset
    40
90ce3da70b43 Initial load
duke
parents:
diff changeset
    41
package java.text;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    42
90ce3da70b43 Initial load
duke
parents:
diff changeset
    43
import java.io.BufferedInputStream;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    44
import java.io.IOException;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    45
import java.security.AccessController;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    46
import java.security.PrivilegedActionException;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    47
import java.security.PrivilegedExceptionAction;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    48
import java.util.Vector;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    49
import java.util.Stack;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    50
import java.util.Hashtable;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    51
import java.util.Enumeration;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    52
import java.util.MissingResourceException;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    53
import java.text.CharacterIterator;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    54
import java.text.StringCharacterIterator;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    55
import sun.text.CompactByteArray;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    56
import sun.text.SupplementaryCharacterData;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    57
90ce3da70b43 Initial load
duke
parents:
diff changeset
    58
/**
90ce3da70b43 Initial load
duke
parents:
diff changeset
    59
 * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    60
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    61
 * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    62
 * and <i>regular expressions.</i></p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    63
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    64
 * <p>A substitution rule defines a name that can be used in place of an expression. It
90ce3da70b43 Initial load
duke
parents:
diff changeset
    65
 * consists of a name, which is a string of characters contained in angle brackets, an equals
90ce3da70b43 Initial load
duke
parents:
diff changeset
    66
 * sign, and an expression. (There can be no whitespace on either side of the equals sign.)
90ce3da70b43 Initial load
duke
parents:
diff changeset
    67
 * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
90ce3da70b43 Initial load
duke
parents:
diff changeset
    68
 * square brackets. A substitution is visible after its definition, and is filled in using
90ce3da70b43 Initial load
duke
parents:
diff changeset
    69
 * simple textual substitution. Substitution definitions can contain other substitutions, as
90ce3da70b43 Initial load
duke
parents:
diff changeset
    70
 * long as those substitutions have been defined first. Substitutions are generally used to
90ce3da70b43 Initial load
duke
parents:
diff changeset
    71
 * make the regular expressions (which can get quite complex) shorter and easier to read.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    72
 * They typically define either character categories or commonly-used subexpressions.</p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    73
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    74
 * <p>There is one special substitution.&nbsp; If the description defines a substitution
90ce3da70b43 Initial load
duke
parents:
diff changeset
    75
 * called &quot;&lt;ignore&gt;&quot;, the expression must be a [] expression, and the
90ce3da70b43 Initial load
duke
parents:
diff changeset
    76
 * expression defines a set of characters (the &quot;<em>ignore characters</em>&quot;) that
90ce3da70b43 Initial load
duke
parents:
diff changeset
    77
 * will be transparent to the BreakIterator.&nbsp; A sequence of characters will break the
90ce3da70b43 Initial load
duke
parents:
diff changeset
    78
 * same way it would if any ignore characters it contains are taken out.&nbsp; Break
90ce3da70b43 Initial load
duke
parents:
diff changeset
    79
 * positions never occur before ignore characters.</p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    80
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    81
 * <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
90ce3da70b43 Initial load
duke
parents:
diff changeset
    82
 * defines a sequence of characters to be kept together. With one significant exception, the
90ce3da70b43 Initial load
duke
parents:
diff changeset
    83
 * iterator uses a longest-possible-match algorithm when matching text to regular
90ce3da70b43 Initial load
duke
parents:
diff changeset
    84
 * expressions. The iterator also treats descriptions containing multiple regular expressions
90ce3da70b43 Initial load
duke
parents:
diff changeset
    85
 * as if they were ORed together (i.e., as if they were separated by |).</p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    86
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    87
 * <p>The special characters recognized by the regular-expression parser are as follows:</p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    88
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    89
 * <blockquote>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    90
 *   <table border="1" width="100%">
90ce3da70b43 Initial load
duke
parents:
diff changeset
    91
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    92
 *       <td width="6%">*</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    93
 *       <td width="94%">Specifies that the expression preceding the asterisk may occur any number
90ce3da70b43 Initial load
duke
parents:
diff changeset
    94
 *       of times (including not at all).</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    95
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    96
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    97
 *       <td width="6%">{}</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    98
 *       <td width="94%">Encloses a sequence of characters that is optional.</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
    99
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   100
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   101
 *       <td width="6%">()</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   102
 *       <td width="94%">Encloses a sequence of characters.&nbsp; If followed by *, the sequence
90ce3da70b43 Initial load
duke
parents:
diff changeset
   103
 *       repeats.&nbsp; Otherwise, the parentheses are just a grouping device and a way to delimit
90ce3da70b43 Initial load
duke
parents:
diff changeset
   104
 *       the ends of expressions containing |.</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   105
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   106
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   107
 *       <td width="6%">|</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   108
 *       <td width="94%">Separates two alternative sequences of characters.&nbsp; Either one
90ce3da70b43 Initial load
duke
parents:
diff changeset
   109
 *       sequence or the other, but not both, matches this expression.&nbsp; The | character can
90ce3da70b43 Initial load
duke
parents:
diff changeset
   110
 *       only occur inside ().</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   111
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   112
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   113
 *       <td width="6%">.</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   114
 *       <td width="94%">Matches any character.</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   115
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   116
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   117
 *       <td width="6%">*?</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   118
 *       <td width="94%">Specifies a non-greedy asterisk.&nbsp; *? works the same way as *, except
90ce3da70b43 Initial load
duke
parents:
diff changeset
   119
 *       when there is overlap between the last group of characters in the expression preceding the
90ce3da70b43 Initial load
duke
parents:
diff changeset
   120
 *       * and the first group of characters following the *.&nbsp; When there is this kind of
90ce3da70b43 Initial load
duke
parents:
diff changeset
   121
 *       overlap, * will match the longest sequence of characters that match the expression before
90ce3da70b43 Initial load
duke
parents:
diff changeset
   122
 *       the *, and *? will match the shortest sequence of characters matching the expression
90ce3da70b43 Initial load
duke
parents:
diff changeset
   123
 *       before the *?.&nbsp; For example, if you have &quot;xxyxyyyxyxyxxyxyxyy&quot; in the text,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   124
 *       &quot;x[xy]*x&quot; will match through to the last x (i.e., &quot;<strong>xxyxyyyxyxyxxyxyx</strong>yy&quot;),
90ce3da70b43 Initial load
duke
parents:
diff changeset
   125
 *       but &quot;x[xy]*?x&quot; will only match the first two xes (&quot;<strong>xx</strong>yxyyyxyxyxxyxyxyy&quot;).</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   126
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   127
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   128
 *       <td width="6%">[]</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   129
 *       <td width="94%">Specifies a group of alternative characters.&nbsp; A [] expression will
90ce3da70b43 Initial load
duke
parents:
diff changeset
   130
 *       match any single character that is specified in the [] expression.&nbsp; For more on the
90ce3da70b43 Initial load
duke
parents:
diff changeset
   131
 *       syntax of [] expressions, see below.</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   132
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   133
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   134
 *       <td width="6%">/</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   135
 *       <td width="94%">Specifies where the break position should go if text matches this
90ce3da70b43 Initial load
duke
parents:
diff changeset
   136
 *       expression.&nbsp; (e.g., &quot;[a-z]&#42;/[:Zs:]*[0-9]&quot; will match if the iterator sees a run
90ce3da70b43 Initial load
duke
parents:
diff changeset
   137
 *       of letters, followed by a run of whitespace, followed by a digit, but the break position
90ce3da70b43 Initial load
duke
parents:
diff changeset
   138
 *       will actually go before the whitespace).&nbsp; Expressions that don't contain / put the
90ce3da70b43 Initial load
duke
parents:
diff changeset
   139
 *       break position at the end of the matching text.</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   140
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   141
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   142
 *       <td width="6%">\</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   143
 *       <td width="94%">Escape character.&nbsp; The \ itself is ignored, but causes the next
90ce3da70b43 Initial load
duke
parents:
diff changeset
   144
 *       character to be treated as a literal character.&nbsp; This has no effect for many
90ce3da70b43 Initial load
duke
parents:
diff changeset
   145
 *       characters, but for the characters listed above, this deprives them of their special
90ce3da70b43 Initial load
duke
parents:
diff changeset
   146
 *       meaning.&nbsp; (There are no special escape sequences for Unicode characters, or tabs and
90ce3da70b43 Initial load
duke
parents:
diff changeset
   147
 *       newlines; these are all handled by a higher-level protocol.&nbsp; In a Java string,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   148
 *       &quot;\n&quot; will be converted to a literal newline character by the time the
90ce3da70b43 Initial load
duke
parents:
diff changeset
   149
 *       regular-expression parser sees it.&nbsp; Of course, this means that \ sequences that are
90ce3da70b43 Initial load
duke
parents:
diff changeset
   150
 *       visible to the regexp parser must be written as \\ when inside a Java string.)&nbsp; All
90ce3da70b43 Initial load
duke
parents:
diff changeset
   151
 *       characters in the ASCII range except for letters, digits, and control characters are
90ce3da70b43 Initial load
duke
parents:
diff changeset
   152
 *       reserved characters to the parser and must be preceded by \ even if they currently don't
90ce3da70b43 Initial load
duke
parents:
diff changeset
   153
 *       mean anything.</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   154
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   155
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   156
 *       <td width="6%">!</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   157
 *       <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
90ce3da70b43 Initial load
duke
parents:
diff changeset
   158
 *       parser that this expression specifies the backwards-iteration behavior of the iterator,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   159
 *       and not its normal iteration behavior.&nbsp; This is generally only used in situations
90ce3da70b43 Initial load
duke
parents:
diff changeset
   160
 *       where the automatically-generated backwards-iteration behavior doesn't produce
90ce3da70b43 Initial load
duke
parents:
diff changeset
   161
 *       satisfactory results and must be supplemented with extra client-specified rules.</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   162
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   163
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   164
 *       <td width="6%"><em>(all others)</em></td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   165
 *       <td width="94%">All other characters are treated as literal characters, which must match
90ce3da70b43 Initial load
duke
parents:
diff changeset
   166
 *       the corresponding character(s) in the text exactly.</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   167
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   168
 *   </table>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   169
 * </blockquote>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   170
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
   171
 * <p>Within a [] expression, a number of other special characters can be used to specify
90ce3da70b43 Initial load
duke
parents:
diff changeset
   172
 * groups of characters:</p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   173
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
   174
 * <blockquote>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   175
 *   <table border="1" width="100%">
90ce3da70b43 Initial load
duke
parents:
diff changeset
   176
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   177
 *       <td width="6%">-</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   178
 *       <td width="94%">Specifies a range of matching characters.&nbsp; For example
90ce3da70b43 Initial load
duke
parents:
diff changeset
   179
 *       &quot;[a-p]&quot; matches all lowercase Latin letters from a to p (inclusive).&nbsp; The -
90ce3da70b43 Initial load
duke
parents:
diff changeset
   180
 *       sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
90ce3da70b43 Initial load
duke
parents:
diff changeset
   181
 *       language's alphabetical order: &quot;[a-z]&quot; doesn't include capital letters, nor does
90ce3da70b43 Initial load
duke
parents:
diff changeset
   182
 *       it include accented letters such as a-umlaut.</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   183
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   184
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   185
 *       <td width="6%">::</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   186
 *       <td width="94%">A pair of colons containing a one- or two-letter code matches all
90ce3da70b43 Initial load
duke
parents:
diff changeset
   187
 *       characters in the corresponding Unicode category.&nbsp; The two-letter codes are the same
90ce3da70b43 Initial load
duke
parents:
diff changeset
   188
 *       as the two-letter codes in the Unicode database (for example, &quot;[:Sc::Sm:]&quot;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   189
 *       matches all currency symbols and all math symbols).&nbsp; Specifying a one-letter code is
90ce3da70b43 Initial load
duke
parents:
diff changeset
   190
 *       the same as specifying all two-letter codes that begin with that letter (for example,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   191
 *       &quot;[:L:]&quot; matches all letters, and is equivalent to
90ce3da70b43 Initial load
duke
parents:
diff changeset
   192
 *       &quot;[:Lu::Ll::Lo::Lm::Lt:]&quot;).&nbsp; Anything other than a valid two-letter Unicode
90ce3da70b43 Initial load
duke
parents:
diff changeset
   193
 *       category code or a single letter that begins a Unicode category code is illegal within
90ce3da70b43 Initial load
duke
parents:
diff changeset
   194
 *       colons.</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   195
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   196
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   197
 *       <td width="6%">[]</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   198
 *       <td width="94%">[] expressions can nest.&nbsp; This has no effect, except when used in
90ce3da70b43 Initial load
duke
parents:
diff changeset
   199
 *       conjunction with the ^ token.</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   200
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   201
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   202
 *       <td width="6%">^</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   203
 *       <td width="94%">Excludes the character (or the characters in the [] expression) following
90ce3da70b43 Initial load
duke
parents:
diff changeset
   204
 *       it from the group of characters.&nbsp; For example, &quot;[a-z^p]&quot; matches all Latin
90ce3da70b43 Initial load
duke
parents:
diff changeset
   205
 *       lowercase letters except p.&nbsp; &quot;[:L:^[&#92;u4e00-&#92;u9fff]]&quot; matches all letters
90ce3da70b43 Initial load
duke
parents:
diff changeset
   206
 *       except the Han ideographs.</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   207
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   208
 *     <tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   209
 *       <td width="6%"><em>(all others)</em></td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   210
 *       <td width="94%">All other characters are treated as literal characters.&nbsp; (For
90ce3da70b43 Initial load
duke
parents:
diff changeset
   211
 *       example, &quot;[aeiou]&quot; specifies just the letters a, e, i, o, and u.)</td>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   212
 *     </tr>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   213
 *   </table>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   214
 * </blockquote>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   215
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
   216
 * <p>For a more complete explanation, see <a
90ce3da70b43 Initial load
duke
parents:
diff changeset
   217
 * href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   218
 * &nbsp; For examples, see the resource data (which is annotated).</p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   219
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
   220
 * @author Richard Gillam
90ce3da70b43 Initial load
duke
parents:
diff changeset
   221
 */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   222
class RuleBasedBreakIterator extends BreakIterator {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   223
90ce3da70b43 Initial load
duke
parents:
diff changeset
   224
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   225
     * A token used as a character-category value to identify ignore characters
90ce3da70b43 Initial load
duke
parents:
diff changeset
   226
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   227
    protected static final byte IGNORE = -1;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   228
90ce3da70b43 Initial load
duke
parents:
diff changeset
   229
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   230
     * The state number of the starting state
90ce3da70b43 Initial load
duke
parents:
diff changeset
   231
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   232
    private static final short START_STATE = 1;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   233
90ce3da70b43 Initial load
duke
parents:
diff changeset
   234
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   235
     * The state-transition value indicating "stop"
90ce3da70b43 Initial load
duke
parents:
diff changeset
   236
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   237
    private static final short STOP_STATE = 0;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   238
90ce3da70b43 Initial load
duke
parents:
diff changeset
   239
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   240
     * Magic number for the BreakIterator data file format.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   241
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   242
    static final byte[] LABEL = {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   243
        (byte)'B', (byte)'I', (byte)'d', (byte)'a', (byte)'t', (byte)'a',
90ce3da70b43 Initial load
duke
parents:
diff changeset
   244
        (byte)'\0'
90ce3da70b43 Initial load
duke
parents:
diff changeset
   245
    };
90ce3da70b43 Initial load
duke
parents:
diff changeset
   246
    static final int    LABEL_LENGTH = LABEL.length;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   247
90ce3da70b43 Initial load
duke
parents:
diff changeset
   248
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   249
     * Version number of the dictionary that was read in.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   250
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   251
    static final byte supportedVersion = 1;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   252
90ce3da70b43 Initial load
duke
parents:
diff changeset
   253
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   254
     * Header size in byte count
90ce3da70b43 Initial load
duke
parents:
diff changeset
   255
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   256
    private static final int HEADER_LENGTH = 36;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   257
90ce3da70b43 Initial load
duke
parents:
diff changeset
   258
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   259
     * An array length of indices for BMP characters
90ce3da70b43 Initial load
duke
parents:
diff changeset
   260
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   261
    private static final int BMP_INDICES_LENGTH = 512;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   262
90ce3da70b43 Initial load
duke
parents:
diff changeset
   263
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   264
     * Tables that index from character values to character category numbers
90ce3da70b43 Initial load
duke
parents:
diff changeset
   265
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   266
    private CompactByteArray charCategoryTable = null;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   267
    private SupplementaryCharacterData supplementaryCharCategoryTable = null;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   268
90ce3da70b43 Initial load
duke
parents:
diff changeset
   269
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   270
     * The table of state transitions used for forward iteration
90ce3da70b43 Initial load
duke
parents:
diff changeset
   271
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   272
    private short[] stateTable = null;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   273
90ce3da70b43 Initial load
duke
parents:
diff changeset
   274
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   275
     * The table of state transitions used to sync up the iterator with the
90ce3da70b43 Initial load
duke
parents:
diff changeset
   276
     * text in backwards and random-access iteration
90ce3da70b43 Initial load
duke
parents:
diff changeset
   277
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   278
    private short[] backwardsStateTable = null;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   279
90ce3da70b43 Initial load
duke
parents:
diff changeset
   280
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   281
     * A list of flags indicating which states in the state table are accepting
90ce3da70b43 Initial load
duke
parents:
diff changeset
   282
     * ("end") states
90ce3da70b43 Initial load
duke
parents:
diff changeset
   283
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   284
    private boolean[] endStates = null;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   285
90ce3da70b43 Initial load
duke
parents:
diff changeset
   286
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   287
     * A list of flags indicating which states in the state table are
90ce3da70b43 Initial load
duke
parents:
diff changeset
   288
     * lookahead states (states which turn lookahead on and off)
90ce3da70b43 Initial load
duke
parents:
diff changeset
   289
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   290
    private boolean[] lookaheadStates = null;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   291
90ce3da70b43 Initial load
duke
parents:
diff changeset
   292
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   293
     * A table for additional data. May be used by a subclass of
90ce3da70b43 Initial load
duke
parents:
diff changeset
   294
     * RuleBasedBreakIterator.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   295
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   296
    private byte[] additionalData = null;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   297
90ce3da70b43 Initial load
duke
parents:
diff changeset
   298
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   299
     * The number of character categories (and, thus, the number of columns in
90ce3da70b43 Initial load
duke
parents:
diff changeset
   300
     * the state tables)
90ce3da70b43 Initial load
duke
parents:
diff changeset
   301
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   302
    private int numCategories;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   303
90ce3da70b43 Initial load
duke
parents:
diff changeset
   304
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   305
     * The character iterator through which this BreakIterator accesses the text
90ce3da70b43 Initial load
duke
parents:
diff changeset
   306
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   307
    private CharacterIterator text = null;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   308
90ce3da70b43 Initial load
duke
parents:
diff changeset
   309
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   310
     * A CRC32 value of all data in datafile
90ce3da70b43 Initial load
duke
parents:
diff changeset
   311
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   312
    private long checksum;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   313
90ce3da70b43 Initial load
duke
parents:
diff changeset
   314
    //=======================================================================
90ce3da70b43 Initial load
duke
parents:
diff changeset
   315
    // constructors
90ce3da70b43 Initial load
duke
parents:
diff changeset
   316
    //=======================================================================
90ce3da70b43 Initial load
duke
parents:
diff changeset
   317
90ce3da70b43 Initial load
duke
parents:
diff changeset
   318
    /**
     * Creates a RuleBasedBreakIterator whose tables are loaded from the
     * given datafile. All loading is delegated to
     * <code>readTables(String)</code>.
     *
     * @param datafile name of the datafile to load
     * @throws IOException propagated from <code>readTables</code>
     * @throws MissingResourceException propagated from
     *         <code>readTables</code>
     */
    public RuleBasedBreakIterator(String datafile)
            throws IOException, MissingResourceException {
        this.readTables(datafile);
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   326
90ce3da70b43 Initial load
duke
parents:
diff changeset
   327
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   328
     * Read datafile. The datafile's format is as follows:
90ce3da70b43 Initial load
duke
parents:
diff changeset
   329
     * <pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   330
     *   BreakIteratorData {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   331
     *       u1           magic[7];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   332
     *       u1           version;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   333
     *       u4           totalDataSize;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   334
     *       header_info  header;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   335
     *       body         value;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   336
     *   }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   337
     * </pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   338
     * <code>totalDataSize</code> is the summation of the size of
90ce3da70b43 Initial load
duke
parents:
diff changeset
   339
     * <code>header_info</code> and <code>body</code> in byte count.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   340
     * <p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   341
     * In <code>header</code>, each field except for checksum implies the
90ce3da70b43 Initial load
duke
parents:
diff changeset
   342
     * length of each field. Since <code>BMPdataLength</code> is a fixed-length
90ce3da70b43 Initial load
duke
parents:
diff changeset
   343
     *  data(512 entries), its length isn't included in <code>header</code>.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   344
     * <code>checksum</code> is a CRC32 value of all in <code>body</code>.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   345
     * <pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   346
     *   header_info {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   347
     *       u4           stateTableLength;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   348
     *       u4           backwardsStateTableLength;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   349
     *       u4           endStatesLength;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   350
     *       u4           lookaheadStatesLength;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   351
     *       u4           BMPdataLength;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   352
     *       u4           nonBMPdataLength;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   353
     *       u4           additionalDataLength;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   354
     *       u8           checksum;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   355
     *   }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   356
     * </pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   357
     * <p>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   358
     *
90ce3da70b43 Initial load
duke
parents:
diff changeset
   359
     * Finally, <code>BMPindices</code> and <code>BMPdata</code> are set to
90ce3da70b43 Initial load
duke
parents:
diff changeset
   360
     * <code>charCategoryTable</code>. <code>nonBMPdata</code> is set to
90ce3da70b43 Initial load
duke
parents:
diff changeset
   361
     * <code>supplementaryCharCategoryTable</code>.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   362
     * <pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   363
     *   body {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   364
     *       u2           stateTable[stateTableLength];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   365
     *       u2           backwardsStateTable[backwardsStateTableLength];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   366
     *       u1           endStates[endStatesLength];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   367
     *       u1           lookaheadStates[lookaheadStatesLength];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   368
     *       u2           BMPindices[512];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   369
     *       u1           BMPdata[BMPdataLength];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   370
     *       u4           nonBMPdata[numNonBMPdataLength];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   371
     *       u1           additionalData[additionalDataLength];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   372
     *   }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   373
     * </pre>
90ce3da70b43 Initial load
duke
parents:
diff changeset
   374
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   375
    protected void readTables(String datafile)
90ce3da70b43 Initial load
duke
parents:
diff changeset
   376
        throws IOException, MissingResourceException {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   377
90ce3da70b43 Initial load
duke
parents:
diff changeset
   378
        byte[] buffer = readFile(datafile);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   379
90ce3da70b43 Initial load
duke
parents:
diff changeset
   380
        /* Read header_info. */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   381
        int stateTableLength = BreakIterator.getInt(buffer, 0);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   382
        int backwardsStateTableLength = BreakIterator.getInt(buffer, 4);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   383
        int endStatesLength = BreakIterator.getInt(buffer, 8);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   384
        int lookaheadStatesLength = BreakIterator.getInt(buffer, 12);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   385
        int BMPdataLength = BreakIterator.getInt(buffer, 16);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   386
        int nonBMPdataLength = BreakIterator.getInt(buffer, 20);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   387
        int additionalDataLength = BreakIterator.getInt(buffer, 24);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   388
        checksum = BreakIterator.getLong(buffer, 28);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   389
90ce3da70b43 Initial load
duke
parents:
diff changeset
   390
        /* Read stateTable[numCategories * numRows] */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   391
        stateTable = new short[stateTableLength];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   392
        int offset = HEADER_LENGTH;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   393
        for (int i = 0; i < stateTableLength; i++, offset+=2) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   394
           stateTable[i] = BreakIterator.getShort(buffer, offset);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   395
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   396
90ce3da70b43 Initial load
duke
parents:
diff changeset
   397
        /* Read backwardsStateTable[numCategories * numRows] */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   398
        backwardsStateTable = new short[backwardsStateTableLength];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   399
        for (int i = 0; i < backwardsStateTableLength; i++, offset+=2) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   400
           backwardsStateTable[i] = BreakIterator.getShort(buffer, offset);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   401
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   402
90ce3da70b43 Initial load
duke
parents:
diff changeset
   403
        /* Read endStates[numRows] */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   404
        endStates = new boolean[endStatesLength];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   405
        for (int i = 0; i < endStatesLength; i++, offset++) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   406
           endStates[i] = buffer[offset] == 1;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   407
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   408
90ce3da70b43 Initial load
duke
parents:
diff changeset
   409
        /* Read lookaheadStates[numRows] */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   410
        lookaheadStates = new boolean[lookaheadStatesLength];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   411
        for (int i = 0; i < lookaheadStatesLength; i++, offset++) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   412
           lookaheadStates[i] = buffer[offset] == 1;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   413
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   414
90ce3da70b43 Initial load
duke
parents:
diff changeset
   415
        /* Read a category table and indices for BMP characters. */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   416
        short[] temp1 = new short[BMP_INDICES_LENGTH];  // BMPindices
90ce3da70b43 Initial load
duke
parents:
diff changeset
   417
        for (int i = 0; i < BMP_INDICES_LENGTH; i++, offset+=2) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   418
            temp1[i] = BreakIterator.getShort(buffer, offset);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   419
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   420
        byte[] temp2 = new byte[BMPdataLength];  // BMPdata
90ce3da70b43 Initial load
duke
parents:
diff changeset
   421
        System.arraycopy(buffer, offset, temp2, 0, BMPdataLength);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   422
        offset += BMPdataLength;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   423
        charCategoryTable = new CompactByteArray(temp1, temp2);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   424
90ce3da70b43 Initial load
duke
parents:
diff changeset
   425
        /* Read a category table for non-BMP characters. */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   426
        int[] temp3 = new int[nonBMPdataLength];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   427
        for (int i = 0; i < nonBMPdataLength; i++, offset+=4) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   428
            temp3[i] = BreakIterator.getInt(buffer, offset);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   429
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   430
        supplementaryCharCategoryTable = new SupplementaryCharacterData(temp3);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   431
90ce3da70b43 Initial load
duke
parents:
diff changeset
   432
        /* Read additional data */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   433
        if (additionalDataLength > 0) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   434
            additionalData = new byte[additionalDataLength];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   435
            System.arraycopy(buffer, offset, additionalData, 0, additionalDataLength);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   436
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   437
90ce3da70b43 Initial load
duke
parents:
diff changeset
   438
        /* Set numCategories */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   439
        numCategories = stateTable.length / endStates.length;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   440
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   441
90ce3da70b43 Initial load
duke
parents:
diff changeset
   442
    protected byte[] readFile(final String datafile)
90ce3da70b43 Initial load
duke
parents:
diff changeset
   443
        throws IOException, MissingResourceException {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   444
90ce3da70b43 Initial load
duke
parents:
diff changeset
   445
        BufferedInputStream is;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   446
        try {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   447
            is = (BufferedInputStream)AccessController.doPrivileged(
90ce3da70b43 Initial load
duke
parents:
diff changeset
   448
                new PrivilegedExceptionAction() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   449
                    public Object run() throws Exception {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   450
                        return new BufferedInputStream(getClass().getResourceAsStream("/sun/text/resources/" + datafile));
90ce3da70b43 Initial load
duke
parents:
diff changeset
   451
                    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   452
                }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   453
            );
90ce3da70b43 Initial load
duke
parents:
diff changeset
   454
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   455
        catch (PrivilegedActionException e) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   456
            throw new InternalError(e.toString());
90ce3da70b43 Initial load
duke
parents:
diff changeset
   457
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   458
90ce3da70b43 Initial load
duke
parents:
diff changeset
   459
        int offset = 0;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   460
90ce3da70b43 Initial load
duke
parents:
diff changeset
   461
        /* First, read magic, version, and header_info. */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   462
        int len = LABEL_LENGTH + 5;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   463
        byte[] buf = new byte[len];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   464
        if (is.read(buf) != len) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   465
            throw new MissingResourceException("Wrong header length",
90ce3da70b43 Initial load
duke
parents:
diff changeset
   466
                                               datafile, "");
90ce3da70b43 Initial load
duke
parents:
diff changeset
   467
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   468
90ce3da70b43 Initial load
duke
parents:
diff changeset
   469
        /* Validate the magic number. */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   470
        for (int i = 0; i < LABEL_LENGTH; i++, offset++) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   471
            if (buf[offset] != LABEL[offset]) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   472
                throw new MissingResourceException("Wrong magic number",
90ce3da70b43 Initial load
duke
parents:
diff changeset
   473
                                                   datafile, "");
90ce3da70b43 Initial load
duke
parents:
diff changeset
   474
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   475
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   476
90ce3da70b43 Initial load
duke
parents:
diff changeset
   477
        /* Validate the version number. */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   478
        if (buf[offset] != supportedVersion) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   479
            throw new MissingResourceException("Unsupported version(" + buf[offset] + ")",
90ce3da70b43 Initial load
duke
parents:
diff changeset
   480
                                               datafile, "");
90ce3da70b43 Initial load
duke
parents:
diff changeset
   481
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   482
90ce3da70b43 Initial load
duke
parents:
diff changeset
   483
        /* Read data: totalDataSize + 8(for checksum) */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   484
        len = BreakIterator.getInt(buf, ++offset);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   485
        buf = new byte[len];
90ce3da70b43 Initial load
duke
parents:
diff changeset
   486
        if (is.read(buf) != len) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   487
            throw new MissingResourceException("Wrong data length",
90ce3da70b43 Initial load
duke
parents:
diff changeset
   488
                                               datafile, "");
90ce3da70b43 Initial load
duke
parents:
diff changeset
   489
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   490
90ce3da70b43 Initial load
duke
parents:
diff changeset
   491
        is.close();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   492
90ce3da70b43 Initial load
duke
parents:
diff changeset
   493
        return buf;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   494
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   495
90ce3da70b43 Initial load
duke
parents:
diff changeset
   496
    byte[] getAdditionalData() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   497
        return additionalData;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   498
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   499
90ce3da70b43 Initial load
duke
parents:
diff changeset
   500
    void setAdditionalData(byte[] b) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   501
        additionalData = b;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   502
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   503
90ce3da70b43 Initial load
duke
parents:
diff changeset
   504
    //=======================================================================
90ce3da70b43 Initial load
duke
parents:
diff changeset
   505
    // boilerplate
90ce3da70b43 Initial load
duke
parents:
diff changeset
   506
    //=======================================================================
90ce3da70b43 Initial load
duke
parents:
diff changeset
   507
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   508
     * Clones this iterator.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   509
     * @return A newly-constructed RuleBasedBreakIterator with the same
90ce3da70b43 Initial load
duke
parents:
diff changeset
   510
     * behavior as this one.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   511
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   512
    public Object clone() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   513
        RuleBasedBreakIterator result = (RuleBasedBreakIterator) super.clone();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   514
        if (text != null) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   515
            result.text = (CharacterIterator) text.clone();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   516
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   517
        return result;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   518
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   519
90ce3da70b43 Initial load
duke
parents:
diff changeset
   520
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   521
     * Returns true if both BreakIterators are of the same class, have the same
90ce3da70b43 Initial load
duke
parents:
diff changeset
   522
     * rules, and iterate over the same text.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   523
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   524
    public boolean equals(Object that) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   525
        try {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   526
            if (that == null) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   527
                return false;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   528
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   529
90ce3da70b43 Initial load
duke
parents:
diff changeset
   530
            RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   531
            if (checksum != other.checksum) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   532
                return false;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   533
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   534
            if (text == null) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   535
                return other.text == null;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   536
            } else {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   537
                return text.equals(other.text);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   538
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   539
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   540
        catch(ClassCastException e) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   541
            return false;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   542
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   543
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   544
90ce3da70b43 Initial load
duke
parents:
diff changeset
   545
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   546
     * Returns text
90ce3da70b43 Initial load
duke
parents:
diff changeset
   547
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   548
    public String toString() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   549
        StringBuffer sb = new StringBuffer();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   550
        sb.append('[');
90ce3da70b43 Initial load
duke
parents:
diff changeset
   551
        sb.append("checksum=0x" + Long.toHexString(checksum));
90ce3da70b43 Initial load
duke
parents:
diff changeset
   552
        sb.append(']');
90ce3da70b43 Initial load
duke
parents:
diff changeset
   553
        return sb.toString();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   554
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   555
90ce3da70b43 Initial load
duke
parents:
diff changeset
   556
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   557
     * Compute a hashcode for this BreakIterator
90ce3da70b43 Initial load
duke
parents:
diff changeset
   558
     * @return A hash code
90ce3da70b43 Initial load
duke
parents:
diff changeset
   559
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   560
    public int hashCode() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   561
        return (int)checksum;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   562
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   563
90ce3da70b43 Initial load
duke
parents:
diff changeset
   564
    //=======================================================================
90ce3da70b43 Initial load
duke
parents:
diff changeset
   565
    // BreakIterator overrides
90ce3da70b43 Initial load
duke
parents:
diff changeset
   566
    //=======================================================================
90ce3da70b43 Initial load
duke
parents:
diff changeset
   567
90ce3da70b43 Initial load
duke
parents:
diff changeset
   568
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   569
     * Sets the current iteration position to the beginning of the text.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   570
     * (i.e., the CharacterIterator's starting offset).
90ce3da70b43 Initial load
duke
parents:
diff changeset
   571
     * @return The offset of the beginning of the text.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   572
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   573
    public int first() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   574
        CharacterIterator t = getText();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   575
90ce3da70b43 Initial load
duke
parents:
diff changeset
   576
        t.first();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   577
        return t.getIndex();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   578
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   579
90ce3da70b43 Initial load
duke
parents:
diff changeset
   580
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   581
     * Sets the current iteration position to the end of the text.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   582
     * (i.e., the CharacterIterator's ending offset).
90ce3da70b43 Initial load
duke
parents:
diff changeset
   583
     * @return The text's past-the-end offset.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   584
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   585
    public int last() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   586
        CharacterIterator t = getText();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   587
90ce3da70b43 Initial load
duke
parents:
diff changeset
   588
        // I'm not sure why, but t.last() returns the offset of the last character,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   589
        // rather than the past-the-end offset
90ce3da70b43 Initial load
duke
parents:
diff changeset
   590
        t.setIndex(t.getEndIndex());
90ce3da70b43 Initial load
duke
parents:
diff changeset
   591
        return t.getIndex();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   592
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   593
90ce3da70b43 Initial load
duke
parents:
diff changeset
   594
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   595
     * Advances the iterator either forward or backward the specified number of steps.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   596
     * Negative values move backward, and positive values move forward.  This is
90ce3da70b43 Initial load
duke
parents:
diff changeset
   597
     * equivalent to repeatedly calling next() or previous().
90ce3da70b43 Initial load
duke
parents:
diff changeset
   598
     * @param n The number of steps to move.  The sign indicates the direction
90ce3da70b43 Initial load
duke
parents:
diff changeset
   599
     * (negative is backwards, and positive is forwards).
90ce3da70b43 Initial load
duke
parents:
diff changeset
   600
     * @return The character offset of the boundary position n boundaries away from
90ce3da70b43 Initial load
duke
parents:
diff changeset
   601
     * the current one.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   602
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   603
    public int next(int n) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   604
        int result = current();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   605
        while (n > 0) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   606
            result = handleNext();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   607
            --n;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   608
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   609
        while (n < 0) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   610
            result = previous();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   611
            ++n;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   612
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   613
        return result;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   614
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   615
90ce3da70b43 Initial load
duke
parents:
diff changeset
   616
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   617
     * Advances the iterator to the next boundary position.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   618
     * @return The position of the first boundary after this one.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   619
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   620
    public int next() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   621
        return handleNext();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   622
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   623
4844
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   624
    // Boundary most recently returned by previous() or following().  Those two
    // methods restart their forward scan from it when it lies before the
    // position of interest, instead of calling handlePrevious().
    // BreakIterator.DONE means nothing usable is cached; setText() resets it.
    private int cachedLastKnownBreak = BreakIterator.DONE;
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   625
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   626
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   627
     * Advances the iterator backwards, to the last boundary preceding this one.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   628
     * @return The position of the last boundary position preceding this one.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   629
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   630
    public int previous() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   631
        // if we're already sitting at the beginning of the text, return DONE
90ce3da70b43 Initial load
duke
parents:
diff changeset
   632
        CharacterIterator text = getText();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   633
        if (current() == text.getBeginIndex()) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   634
            return BreakIterator.DONE;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   635
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   636
90ce3da70b43 Initial load
duke
parents:
diff changeset
   637
        // set things up.  handlePrevious() will back us up to some valid
90ce3da70b43 Initial load
duke
parents:
diff changeset
   638
        // break position before the current position (we back our internal
90ce3da70b43 Initial load
duke
parents:
diff changeset
   639
        // iterator up one step to prevent handlePrevious() from returning
90ce3da70b43 Initial load
duke
parents:
diff changeset
   640
        // the current position), but not necessarily the last one before
90ce3da70b43 Initial load
duke
parents:
diff changeset
   641
        // where we started
90ce3da70b43 Initial load
duke
parents:
diff changeset
   642
        int start = current();
4844
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   643
        int lastResult = cachedLastKnownBreak;
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   644
        if (lastResult >= start || lastResult <= BreakIterator.DONE) {
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   645
            getPrevious();
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   646
            lastResult = handlePrevious();
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   647
        } else {
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   648
            //it might be better to check if handlePrevious() give us closer
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   649
            //safe value but handlePrevious() is slow too
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   650
            //So, this has to be done carefully
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   651
            text.setIndex(lastResult);
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   652
        }
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   653
        int result = lastResult;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   654
90ce3da70b43 Initial load
duke
parents:
diff changeset
   655
        // iterate forward from the known break position until we pass our
90ce3da70b43 Initial load
duke
parents:
diff changeset
   656
        // starting point.  The last break position before the starting
90ce3da70b43 Initial load
duke
parents:
diff changeset
   657
        // point is our return value
90ce3da70b43 Initial load
duke
parents:
diff changeset
   658
        while (result != BreakIterator.DONE && result < start) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   659
            lastResult = result;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   660
            result = handleNext();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   661
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   662
90ce3da70b43 Initial load
duke
parents:
diff changeset
   663
        // set the current iteration position to be the last break position
90ce3da70b43 Initial load
duke
parents:
diff changeset
   664
        // before where we started, and then return that value
90ce3da70b43 Initial load
duke
parents:
diff changeset
   665
        text.setIndex(lastResult);
4844
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   666
        cachedLastKnownBreak = lastResult;
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   667
        return lastResult;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   668
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   669
90ce3da70b43 Initial load
duke
parents:
diff changeset
   670
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   671
     * Returns previous character
90ce3da70b43 Initial load
duke
parents:
diff changeset
   672
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   673
    private int getPrevious() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   674
        char c2 = text.previous();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   675
        if (Character.isLowSurrogate(c2) &&
90ce3da70b43 Initial load
duke
parents:
diff changeset
   676
            text.getIndex() > text.getBeginIndex()) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   677
            char c1 = text.previous();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   678
            if (Character.isHighSurrogate(c1)) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   679
                return Character.toCodePoint(c1, c2);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   680
            } else {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   681
                text.next();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   682
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   683
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   684
        return (int)c2;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   685
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   686
90ce3da70b43 Initial load
duke
parents:
diff changeset
   687
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   688
     * Returns current character
90ce3da70b43 Initial load
duke
parents:
diff changeset
   689
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   690
    int getCurrent() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   691
        char c1 = text.current();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   692
        if (Character.isHighSurrogate(c1) &&
90ce3da70b43 Initial load
duke
parents:
diff changeset
   693
            text.getIndex() < text.getEndIndex()) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   694
            char c2 = text.next();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   695
            text.previous();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   696
            if (Character.isLowSurrogate(c2)) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   697
                return Character.toCodePoint(c1, c2);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   698
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   699
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   700
        return (int)c1;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   701
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   702
90ce3da70b43 Initial load
duke
parents:
diff changeset
   703
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   704
     * Returns the count of next character.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   705
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   706
    private int getCurrentCodePointCount() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   707
        char c1 = text.current();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   708
        if (Character.isHighSurrogate(c1) &&
90ce3da70b43 Initial load
duke
parents:
diff changeset
   709
            text.getIndex() < text.getEndIndex()) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   710
            char c2 = text.next();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   711
            text.previous();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   712
            if (Character.isLowSurrogate(c2)) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   713
                return 2;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   714
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   715
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   716
        return 1;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   717
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   718
90ce3da70b43 Initial load
duke
parents:
diff changeset
   719
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   720
     * Returns next character
90ce3da70b43 Initial load
duke
parents:
diff changeset
   721
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   722
    int getNext() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   723
        int index = text.getIndex();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   724
        int endIndex = text.getEndIndex();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   725
        if (index == endIndex ||
90ce3da70b43 Initial load
duke
parents:
diff changeset
   726
            (index = index + getCurrentCodePointCount()) >= endIndex) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   727
            return CharacterIterator.DONE;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   728
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   729
        text.setIndex(index);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   730
        return getCurrent();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   731
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   732
90ce3da70b43 Initial load
duke
parents:
diff changeset
   733
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   734
     * Returns the position of next character.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   735
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   736
    private int getNextIndex() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   737
        int index = text.getIndex() + getCurrentCodePointCount();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   738
        int endIndex = text.getEndIndex();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   739
        if (index > endIndex) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   740
            return endIndex;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   741
        } else {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   742
            return index;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   743
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   744
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   745
90ce3da70b43 Initial load
duke
parents:
diff changeset
   746
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   747
     * Throw IllegalArgumentException unless begin <= offset < end.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   748
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   749
    protected static final void checkOffset(int offset, CharacterIterator text) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   750
        if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   751
            throw new IllegalArgumentException("offset out of bounds");
90ce3da70b43 Initial load
duke
parents:
diff changeset
   752
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   753
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   754
90ce3da70b43 Initial load
duke
parents:
diff changeset
   755
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   756
     * Sets the iterator to refer to the first boundary position following
90ce3da70b43 Initial load
duke
parents:
diff changeset
   757
     * the specified position.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   758
     * @offset The position from which to begin searching for a break position.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   759
     * @return The position of the first break after the current position.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   760
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   761
    public int following(int offset) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   762
90ce3da70b43 Initial load
duke
parents:
diff changeset
   763
        CharacterIterator text = getText();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   764
        checkOffset(offset, text);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   765
90ce3da70b43 Initial load
duke
parents:
diff changeset
   766
        // Set our internal iteration position (temporarily)
90ce3da70b43 Initial load
duke
parents:
diff changeset
   767
        // to the position passed in.  If this is the _beginning_ position,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   768
        // then we can just use next() to get our return value
90ce3da70b43 Initial load
duke
parents:
diff changeset
   769
        text.setIndex(offset);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   770
        if (offset == text.getBeginIndex()) {
4844
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   771
            cachedLastKnownBreak = handleNext();
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   772
            return cachedLastKnownBreak;
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   773
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   774
90ce3da70b43 Initial load
duke
parents:
diff changeset
   775
        // otherwise, we have to sync up first.  Use handlePrevious() to back
90ce3da70b43 Initial load
duke
parents:
diff changeset
   776
        // us up to a known break position before the specified position (if
90ce3da70b43 Initial load
duke
parents:
diff changeset
   777
        // we can determine that the specified position is a break position,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   778
        // we don't back up at all).  This may or may not be the last break
90ce3da70b43 Initial load
duke
parents:
diff changeset
   779
        // position at or before our starting position.  Advance forward
90ce3da70b43 Initial load
duke
parents:
diff changeset
   780
        // from here until we've passed the starting position.  The position
90ce3da70b43 Initial load
duke
parents:
diff changeset
   781
        // we stop on will be the first break position after the specified one.
4844
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   782
        int result = cachedLastKnownBreak;
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   783
        if (result >= offset || result <= BreakIterator.DONE) {
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   784
            result = handlePrevious();
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   785
        } else {
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   786
            //it might be better to check if handlePrevious() give us closer
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   787
            //safe value but handlePrevious() is slow too
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   788
            //So, this has to be done carefully
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   789
            text.setIndex(result);
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   790
        }
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   791
        while (result != BreakIterator.DONE && result <= offset) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   792
            result = handleNext();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   793
        }
4844
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   794
        cachedLastKnownBreak = result;
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   795
        return result;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   796
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   797
90ce3da70b43 Initial load
duke
parents:
diff changeset
   798
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   799
     * Sets the iterator to refer to the last boundary position before the
90ce3da70b43 Initial load
duke
parents:
diff changeset
   800
     * specified position.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   801
     * @offset The position to begin searching for a break from.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   802
     * @return The position of the last boundary before the starting position.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   803
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   804
    public int preceding(int offset) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   805
        // if we start by updating the current iteration position to the
90ce3da70b43 Initial load
duke
parents:
diff changeset
   806
        // position specified by the caller, we can just use previous()
90ce3da70b43 Initial load
duke
parents:
diff changeset
   807
        // to carry out this operation
90ce3da70b43 Initial load
duke
parents:
diff changeset
   808
        CharacterIterator text = getText();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   809
        checkOffset(offset, text);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   810
        text.setIndex(offset);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   811
        return previous();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   812
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   813
90ce3da70b43 Initial load
duke
parents:
diff changeset
   814
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   815
     * Returns true if the specfied position is a boundary position.  As a side
90ce3da70b43 Initial load
duke
parents:
diff changeset
   816
     * effect, leaves the iterator pointing to the first boundary position at
90ce3da70b43 Initial load
duke
parents:
diff changeset
   817
     * or after "offset".
90ce3da70b43 Initial load
duke
parents:
diff changeset
   818
     * @param offset the offset to check.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   819
     * @return True if "offset" is a boundary position.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   820
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   821
    public boolean isBoundary(int offset) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   822
        CharacterIterator text = getText();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   823
        checkOffset(offset, text);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   824
        if (offset == text.getBeginIndex()) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   825
            return true;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   826
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   827
90ce3da70b43 Initial load
duke
parents:
diff changeset
   828
        // to check whether this is a boundary, we can use following() on the
90ce3da70b43 Initial load
duke
parents:
diff changeset
   829
        // position before the specified one and return true if the position we
90ce3da70b43 Initial load
duke
parents:
diff changeset
   830
        // get back is the one the user specified
90ce3da70b43 Initial load
duke
parents:
diff changeset
   831
        else {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   832
            return following(offset - 1) == offset;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   833
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   834
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   835
90ce3da70b43 Initial load
duke
parents:
diff changeset
   836
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   837
     * Returns the current iteration position.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   838
     * @return The current iteration position.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   839
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   840
    public int current() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   841
        return getText().getIndex();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   842
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   843
90ce3da70b43 Initial load
duke
parents:
diff changeset
   844
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   845
     * Return a CharacterIterator over the text being analyzed.  This version
90ce3da70b43 Initial load
duke
parents:
diff changeset
   846
     * of this method returns the actual CharacterIterator we're using internally.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   847
     * Changing the state of this iterator can have undefined consequences.  If
90ce3da70b43 Initial load
duke
parents:
diff changeset
   848
     * you need to change it, clone it first.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   849
     * @return An iterator over the text being analyzed.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   850
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   851
    public CharacterIterator getText() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   852
        // The iterator is initialized pointing to no text at all, so if this
90ce3da70b43 Initial load
duke
parents:
diff changeset
   853
        // function is called while we're in that state, we have to fudge an
90ce3da70b43 Initial load
duke
parents:
diff changeset
   854
        // iterator to return.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   855
        if (text == null) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   856
            text = new StringCharacterIterator("");
90ce3da70b43 Initial load
duke
parents:
diff changeset
   857
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   858
        return text;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   859
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   860
90ce3da70b43 Initial load
duke
parents:
diff changeset
   861
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   862
     * Set the iterator to analyze a new piece of text.  This function resets
90ce3da70b43 Initial load
duke
parents:
diff changeset
   863
     * the current iteration position to the beginning of the text.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   864
     * @param newText An iterator over the text to analyze.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   865
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   866
    public void setText(CharacterIterator newText) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   867
        // Test iterator to see if we need to wrap it in a SafeCharIterator.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   868
        // The correct behavior for CharacterIterators is to allow the
90ce3da70b43 Initial load
duke
parents:
diff changeset
   869
        // position to be set to the endpoint of the iterator.  Many
90ce3da70b43 Initial load
duke
parents:
diff changeset
   870
        // CharacterIterators do not uphold this, so this is a workaround
90ce3da70b43 Initial load
duke
parents:
diff changeset
   871
        // to permit them to use this class.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   872
        int end = newText.getEndIndex();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   873
        boolean goodIterator;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   874
        try {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   875
            newText.setIndex(end);  // some buggy iterators throw an exception here
90ce3da70b43 Initial load
duke
parents:
diff changeset
   876
            goodIterator = newText.getIndex() == end;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   877
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   878
        catch(IllegalArgumentException e) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   879
            goodIterator = false;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   880
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   881
90ce3da70b43 Initial load
duke
parents:
diff changeset
   882
        if (goodIterator) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   883
            text = newText;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   884
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   885
        else {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   886
            text = new SafeCharIterator(newText);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   887
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   888
        text.first();
4844
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   889
68e39b0ed557 6868503: RuleBasedBreakIterator is inefficient
peytoia
parents: 2
diff changeset
   890
        cachedLastKnownBreak = BreakIterator.DONE;
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   891
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   892
90ce3da70b43 Initial load
duke
parents:
diff changeset
   893
90ce3da70b43 Initial load
duke
parents:
diff changeset
   894
    //=======================================================================
90ce3da70b43 Initial load
duke
parents:
diff changeset
   895
    // implementation
90ce3da70b43 Initial load
duke
parents:
diff changeset
   896
    //=======================================================================
90ce3da70b43 Initial load
duke
parents:
diff changeset
   897
90ce3da70b43 Initial load
duke
parents:
diff changeset
   898
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   899
     * This method is the actual implementation of the next() method.  All iteration
90ce3da70b43 Initial load
duke
parents:
diff changeset
   900
     * vectors through here.  This method initializes the state machine to state 1
90ce3da70b43 Initial load
duke
parents:
diff changeset
   901
     * and advances through the text character by character until we reach the end
90ce3da70b43 Initial load
duke
parents:
diff changeset
   902
     * of the text or the state machine transitions to state 0.  We update our return
90ce3da70b43 Initial load
duke
parents:
diff changeset
   903
     * value every time the state machine passes through a possible end state.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   904
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   905
    protected int handleNext() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   906
        // if we're already at the end of the text, return DONE.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   907
        CharacterIterator text = getText();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   908
        if (text.getIndex() == text.getEndIndex()) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   909
            return BreakIterator.DONE;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   910
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   911
90ce3da70b43 Initial load
duke
parents:
diff changeset
   912
        // no matter what, we always advance at least one character forward
90ce3da70b43 Initial load
duke
parents:
diff changeset
   913
        int result = getNextIndex();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   914
        int lookaheadResult = 0;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   915
90ce3da70b43 Initial load
duke
parents:
diff changeset
   916
        // begin in state 1
90ce3da70b43 Initial load
duke
parents:
diff changeset
   917
        int state = START_STATE;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   918
        int category;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   919
        int c = getCurrent();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   920
90ce3da70b43 Initial load
duke
parents:
diff changeset
   921
        // loop until we reach the end of the text or transition to state 0
90ce3da70b43 Initial load
duke
parents:
diff changeset
   922
        while (c != CharacterIterator.DONE && state != STOP_STATE) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   923
90ce3da70b43 Initial load
duke
parents:
diff changeset
   924
            // look up the current character's character category (which tells us
90ce3da70b43 Initial load
duke
parents:
diff changeset
   925
            // which column in the state table to look at)
90ce3da70b43 Initial load
duke
parents:
diff changeset
   926
            category = lookupCategory(c);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   927
90ce3da70b43 Initial load
duke
parents:
diff changeset
   928
            // if the character isn't an ignore character, look up a state
90ce3da70b43 Initial load
duke
parents:
diff changeset
   929
            // transition in the state table
90ce3da70b43 Initial load
duke
parents:
diff changeset
   930
            if (category != IGNORE) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   931
                state = lookupState(state, category);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   932
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   933
90ce3da70b43 Initial load
duke
parents:
diff changeset
   934
            // if the state we've just transitioned to is a lookahead state,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   935
            // (but not also an end state), save its position.  If it's
90ce3da70b43 Initial load
duke
parents:
diff changeset
   936
            // both a lookahead state and an end state, update the break position
90ce3da70b43 Initial load
duke
parents:
diff changeset
   937
            // to the last saved lookup-state position
90ce3da70b43 Initial load
duke
parents:
diff changeset
   938
            if (lookaheadStates[state]) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   939
                if (endStates[state]) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   940
                    result = lookaheadResult;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   941
                }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   942
                else {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   943
                    lookaheadResult = getNextIndex();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   944
                }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   945
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   946
90ce3da70b43 Initial load
duke
parents:
diff changeset
   947
            // otherwise, if the state we've just transitioned to is an accepting
90ce3da70b43 Initial load
duke
parents:
diff changeset
   948
            // state, update the break position to be the current iteration position
90ce3da70b43 Initial load
duke
parents:
diff changeset
   949
            else {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   950
                if (endStates[state]) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   951
                    result = getNextIndex();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   952
                }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   953
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   954
90ce3da70b43 Initial load
duke
parents:
diff changeset
   955
            c = getNext();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   956
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   957
90ce3da70b43 Initial load
duke
parents:
diff changeset
   958
        // if we've run off the end of the text, and the very last character took us into
90ce3da70b43 Initial load
duke
parents:
diff changeset
   959
        // a lookahead state, advance the break position to the lookahead position
90ce3da70b43 Initial load
duke
parents:
diff changeset
   960
        // (the theory here is that if there are no characters at all after the lookahead
90ce3da70b43 Initial load
duke
parents:
diff changeset
   961
        // position, that always matches the lookahead criteria)
90ce3da70b43 Initial load
duke
parents:
diff changeset
   962
        if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   963
            result = lookaheadResult;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   964
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   965
90ce3da70b43 Initial load
duke
parents:
diff changeset
   966
        text.setIndex(result);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   967
        return result;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   968
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   969
90ce3da70b43 Initial load
duke
parents:
diff changeset
   970
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
   971
     * This method backs the iterator back up to a "safe position" in the text.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   972
     * This is a position that we know, without any context, must be a break position.
90ce3da70b43 Initial load
duke
parents:
diff changeset
   973
     * The various calling methods then iterate forward from this safe position to
90ce3da70b43 Initial load
duke
parents:
diff changeset
   974
     * the appropriate position to return.  (For more information, see the description
90ce3da70b43 Initial load
duke
parents:
diff changeset
   975
     * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
90ce3da70b43 Initial load
duke
parents:
diff changeset
   976
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
   977
    protected int handlePrevious() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   978
        CharacterIterator text = getText();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   979
        int state = START_STATE;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   980
        int category = 0;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   981
        int lastCategory = 0;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   982
        int c = getCurrent();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   983
90ce3da70b43 Initial load
duke
parents:
diff changeset
   984
        // loop until we reach the beginning of the text or transition to state 0
90ce3da70b43 Initial load
duke
parents:
diff changeset
   985
        while (c != CharacterIterator.DONE && state != STOP_STATE) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   986
90ce3da70b43 Initial load
duke
parents:
diff changeset
   987
            // save the last character's category and look up the current
90ce3da70b43 Initial load
duke
parents:
diff changeset
   988
            // character's category
90ce3da70b43 Initial load
duke
parents:
diff changeset
   989
            lastCategory = category;
90ce3da70b43 Initial load
duke
parents:
diff changeset
   990
            category = lookupCategory(c);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   991
90ce3da70b43 Initial load
duke
parents:
diff changeset
   992
            // if the current character isn't an ignore character, look up a
90ce3da70b43 Initial load
duke
parents:
diff changeset
   993
            // state transition in the backwards state table
90ce3da70b43 Initial load
duke
parents:
diff changeset
   994
            if (category != IGNORE) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   995
                state = lookupBackwardState(state, category);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   996
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   997
90ce3da70b43 Initial load
duke
parents:
diff changeset
   998
            // then advance one character backwards
90ce3da70b43 Initial load
duke
parents:
diff changeset
   999
            c = getPrevious();
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1000
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1001
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1002
        // if we didn't march off the beginning of the text, we're either one or two
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1003
        // positions away from the real break position.  (One because of the call to
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1004
        // previous() at the end of the loop above, and another because the character
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1005
        // that takes us into the stop state will always be the character BEFORE
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1006
        // the break position.)
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1007
        if (c != CharacterIterator.DONE) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1008
            if (lastCategory != IGNORE) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1009
                getNext();
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1010
                getNext();
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1011
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1012
            else {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1013
                getNext();
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1014
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1015
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1016
        return text.getIndex();
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1017
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1018
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1019
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1020
     * Looks up a character's category (i.e., its category for breaking purposes,
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1021
     * not its Unicode category)
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1022
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1023
    protected int lookupCategory(int c) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1024
        if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1025
            return charCategoryTable.elementAt((char)c);
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1026
        } else {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1027
            return supplementaryCharCategoryTable.getValue(c);
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1028
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1029
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1030
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1031
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1032
     * Given a current state and a character category, looks up the
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1033
     * next state to transition to in the state table.
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1034
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1035
    protected int lookupState(int state, int category) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1036
        return stateTable[state * numCategories + category];
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1037
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1038
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1039
    /**
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1040
     * Given a current state and a character category, looks up the
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1041
     * next state to transition to in the backwards state table.
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1042
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1043
    protected int lookupBackwardState(int state, int category) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1044
        return backwardsStateTable[state * numCategories + category];
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1045
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1046
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1047
    /*
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1048
     * This class exists to work around a bug in incorrect implementations
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1049
     * of CharacterIterator, which incorrectly handle setIndex(endIndex).
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1050
     * This iterator relies only on base.setIndex(n) where n is less than
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1051
     * endIndex.
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1052
     *
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1053
     * One caveat:  if the base iterator's begin and end indices change
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1054
     * the change will not be reflected by this wrapper.  Does that matter?
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1055
     */
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1056
    private static final class SafeCharIterator implements CharacterIterator,
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1057
                                                           Cloneable {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1058
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1059
        private CharacterIterator base;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1060
        private int rangeStart;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1061
        private int rangeLimit;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1062
        private int currentIndex;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1063
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1064
        SafeCharIterator(CharacterIterator base) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1065
            this.base = base;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1066
            this.rangeStart = base.getBeginIndex();
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1067
            this.rangeLimit = base.getEndIndex();
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1068
            this.currentIndex = base.getIndex();
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1069
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1070
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1071
        public char first() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1072
            return setIndex(rangeStart);
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1073
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1074
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1075
        public char last() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1076
            return setIndex(rangeLimit - 1);
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1077
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1078
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1079
        public char current() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1080
            if (currentIndex < rangeStart || currentIndex >= rangeLimit) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1081
                return DONE;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1082
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1083
            else {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1084
                return base.setIndex(currentIndex);
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1085
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1086
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1087
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1088
        public char next() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1089
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1090
            currentIndex++;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1091
            if (currentIndex >= rangeLimit) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1092
                currentIndex = rangeLimit;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1093
                return DONE;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1094
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1095
            else {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1096
                return base.setIndex(currentIndex);
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1097
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1098
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1099
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1100
        public char previous() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1101
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1102
            currentIndex--;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1103
            if (currentIndex < rangeStart) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1104
                currentIndex = rangeStart;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1105
                return DONE;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1106
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1107
            else {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1108
                return base.setIndex(currentIndex);
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1109
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1110
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1111
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1112
        public char setIndex(int i) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1113
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1114
            if (i < rangeStart || i > rangeLimit) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1115
                throw new IllegalArgumentException("Invalid position");
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1116
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1117
            currentIndex = i;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1118
            return current();
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1119
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1120
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1121
        public int getBeginIndex() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1122
            return rangeStart;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1123
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1124
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1125
        public int getEndIndex() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1126
            return rangeLimit;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1127
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1128
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1129
        public int getIndex() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1130
            return currentIndex;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1131
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1132
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1133
        public Object clone() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1134
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1135
            SafeCharIterator copy = null;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1136
            try {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1137
                copy = (SafeCharIterator) super.clone();
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1138
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1139
            catch(CloneNotSupportedException e) {
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1140
                throw new Error("Clone not supported: " + e);
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1141
            }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1142
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1143
            CharacterIterator copyOfBase = (CharacterIterator) base.clone();
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1144
            copy.base = copyOfBase;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1145
            return copy;
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1146
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1147
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
  1148
}