author | rfield |
Mon, 13 Feb 2017 08:50:26 -0800 | |
changeset 43856 | fcdebb803c62 |
parent 23010 | 6dadb192ad81 |
permissions | -rw-r--r-- |
2 | 1 |
/* |
23010
6dadb192ad81
8029235: Update copyright year to match last edit in jdk8 jdk repository for 2013
lana
parents:
21805
diff
changeset
|
2 |
* Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved. |
2 | 3 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 |
* |
|
5 |
* This code is free software; you can redistribute it and/or modify it |
|
6 |
* under the terms of the GNU General Public License version 2 only, as |
|
5506 | 7 |
* published by the Free Software Foundation. Oracle designates this |
2 | 8 |
* particular file as subject to the "Classpath" exception as provided |
5506 | 9 |
* by Oracle in the LICENSE file that accompanied this code. |
2 | 10 |
* |
11 |
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
12 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
13 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
14 |
* version 2 for more details (a copy is included in the LICENSE file that |
|
15 |
* accompanied this code). |
|
16 |
* |
|
17 |
* You should have received a copy of the GNU General Public License version |
|
18 |
* 2 along with this work; if not, write to the Free Software Foundation, |
|
19 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
20 |
* |
|
5506 | 21 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
22 |
* or visit www.oracle.com if you need additional information or have any |
|
23 |
* questions. |
|
2 | 24 |
*/ |
25 |
||
26 |
package build.tools.generatebreakiteratordata; |
|
27 |
||
28 |
import java.io.*; |
|
29 |
import java.util.Enumeration; |
|
30 |
import java.util.Hashtable; |
|
31 |
import java.util.Stack; |
|
32 |
import java.util.Vector; |
|
33 |
import java.util.zip.CRC32; |
|
34 |
import sun.text.CompactByteArray; |
|
35 |
||
36 |
/** |
|
37 |
* This class has the job of constructing a RuleBasedBreakIterator from a |
|
38 |
* textual description. A Builder is constructed by GenerateBreakIteratorData, |
|
39 |
* which uses it to construct the iterator itself and then throws it away. |
|
40 |
* <p>The construction logic is separated out into its own class for two primary |
|
41 |
* reasons: |
|
42 |
* <ul> |
|
43 |
* <li>The construction logic is quite sophisticated and large. Separating |
|
44 |
* it out into its own class means the code must only be loaded into memory |
|
45 |
* while a RuleBasedBreakIterator is being constructed, and can be purged after |
|
46 |
* that. |
|
47 |
* <li>There is a fair amount of state that must be maintained throughout the |
|
48 |
* construction process that is not needed by the iterator after construction. |
|
49 |
* Separating this state out into another class prevents all of the functions |
|
50 |
* that construct the iterator from having to have really long parameter lists, |
|
51 |
* (hopefully) contributing to readability and maintainability. |
|
52 |
* </ul> |
|
53 |
* <p> |
|
54 |
* It'd be really nice if this could be an independent class rather than an |
|
55 |
* inner class, because that would shorten the source file considerably, but |
|
56 |
* making Builder an inner class of RuleBasedBreakIterator allows it direct |
|
57 |
* access to RuleBasedBreakIterator's private members, which saves us from |
|
58 |
* having to provide some kind of "back door" to the Builder class that could |
|
59 |
* then also be used by other classes. |
|
60 |
*/ |
|
61 |
class RuleBasedBreakIteratorBuilder { |
|
62 |
||
63 |
/** |
|
64 |
* A token used as a character-category value to identify ignore characters |
|
65 |
*/ |
|
66 |
protected static final byte IGNORE = -1; |
|
67 |
||
68 |
/** |
|
69 |
* Tables that indexes from character values to character category numbers |
|
70 |
*/ |
|
71 |
private CompactByteArray charCategoryTable = null; |
|
72 |
private SupplementaryCharacterData supplementaryCharCategoryTable = null; |
|
73 |
||
74 |
/** |
|
75 |
* The table of state transitions used for forward iteration |
|
76 |
*/ |
|
77 |
private short[] stateTable = null; |
|
78 |
||
79 |
/** |
|
80 |
* The table of state transitions used to sync up the iterator with the |
|
81 |
* text in backwards and random-access iteration |
|
82 |
*/ |
|
83 |
private short[] backwardsStateTable = null; |
|
84 |
||
85 |
/** |
|
86 |
* A list of flags indicating which states in the state table are accepting |
|
87 |
* ("end") states |
|
88 |
*/ |
|
89 |
private boolean[] endStates = null; |
|
90 |
||
91 |
/** |
|
92 |
* A list of flags indicating which states in the state table are |
|
93 |
* lookahead states (states which turn lookahead on and off) |
|
94 |
*/ |
|
95 |
private boolean[] lookaheadStates = null; |
|
96 |
||
97 |
/** |
|
98 |
* A table for additional data. May be used by a subclass of |
|
99 |
* RuleBasedBreakIterator. |
|
100 |
*/ |
|
101 |
private byte[] additionalData = null; |
|
102 |
||
103 |
/** |
|
104 |
* The number of character categories (and, thus, the number of columns in |
|
105 |
* the state tables) |
|
106 |
*/ |
|
107 |
private int numCategories; |
|
108 |
||
109 |
/** |
|
110 |
* A temporary holding place used for calculating the character categories. |
|
111 |
* This object contains CharSet objects. |
|
112 |
*/ |
|
10110 | 113 |
protected Vector<CharSet> categories = null; |
2 | 114 |
|
115 |
/** |
|
116 |
* A table used to map parts of regexp text to lists of character |
|
117 |
* categories, rather than having to figure them out from scratch each time |
|
118 |
*/ |
|
10110 | 119 |
protected Hashtable<String, Object> expressions = null; |
2 | 120 |
|
121 |
/** |
|
122 |
* A temporary holding place for the list of ignore characters |
|
123 |
*/ |
|
124 |
protected CharSet ignoreChars = null; |
|
125 |
||
126 |
/** |
|
127 |
* A temporary holding place where the forward state table is built |
|
128 |
*/ |
|
10110 | 129 |
protected Vector<short[]> tempStateTable = null; |
2 | 130 |
|
131 |
/** |
|
132 |
* A list of all the states that have to be filled in with transitions to |
|
133 |
* the next state that is created. Used when building the state table from |
|
134 |
* the regular expressions. |
|
135 |
*/ |
|
10110 | 136 |
protected Vector<Integer> decisionPointList = null; |
2 | 137 |
|
138 |
/** |
|
139 |
* A stack for holding decision point lists. This is used to handle nested |
|
140 |
* parentheses and braces in regexps. |
|
141 |
*/ |
|
10110 | 142 |
protected Stack<Vector<Integer>> decisionPointStack = null; |
2 | 143 |
|
144 |
/** |
|
145 |
* A list of states that loop back on themselves. Used to handle .*? |
|
146 |
*/ |
|
10110 | 147 |
protected Vector<Integer> loopingStates = null; |
2 | 148 |
|
149 |
/** |
|
150 |
* Looping states actually have to be backfilled later in the process |
|
151 |
* than everything else. This is where a the list of states to backfill |
|
152 |
* is accumulated. This is also used to handle .*? |
|
153 |
*/ |
|
10110 | 154 |
protected Vector<Integer> statesToBackfill = null; |
2 | 155 |
|
156 |
/** |
|
157 |
* A list mapping pairs of state numbers for states that are to be combined |
|
158 |
* to the state number of the state representing their combination. Used |
|
159 |
* in the process of making the state table deterministic to prevent |
|
160 |
* infinite recursion. |
|
161 |
*/ |
|
10110 | 162 |
protected Vector<int[]> mergeList = null; |
2 | 163 |
|
164 |
/** |
|
165 |
* A flag that is used to indicate when the list of looping states can |
|
166 |
* be reset. |
|
167 |
*/ |
|
168 |
protected boolean clearLoopingStates = false; |
|
169 |
||
170 |
/** |
|
171 |
* A bit mask used to indicate a bit in the table's flags column that marks |
|
172 |
* a state as an accepting state. |
|
173 |
*/ |
|
174 |
protected static final int END_STATE_FLAG = 0x8000; |
|
175 |
||
176 |
/** |
|
177 |
* A bit mask used to indicate a bit in the table's flags column that marks |
|
178 |
* a state as one the builder shouldn't loop to any looping states |
|
179 |
*/ |
|
180 |
protected static final int DONT_LOOP_FLAG = 0x4000; |
|
181 |
||
182 |
/** |
|
183 |
* A bit mask used to indicate a bit in the table's flags column that marks |
|
184 |
* a state as a lookahead state. |
|
185 |
*/ |
|
186 |
protected static final int LOOKAHEAD_STATE_FLAG = 0x2000; |
|
187 |
||
188 |
/** |
|
189 |
* A bit mask representing the union of the mask values listed above. |
|
190 |
* Used for clearing or masking off the flag bits. |
|
191 |
*/ |
|
192 |
protected static final int ALL_FLAGS = END_STATE_FLAG |
|
193 |
| LOOKAHEAD_STATE_FLAG |
|
194 |
| DONT_LOOP_FLAG; |
|
195 |
||
196 |
/** |
|
197 |
* This is the main function for setting up the BreakIterator's tables. It |
|
198 |
* just vectors different parts of the job off to other functions. |
|
199 |
*/ |
|
200 |
public RuleBasedBreakIteratorBuilder(String description) { |
|
10110 | 201 |
Vector<String> tempRuleList = buildRuleList(description); |
2 | 202 |
buildCharCategories(tempRuleList); |
203 |
buildStateTable(tempRuleList); |
|
204 |
buildBackwardsStateTable(tempRuleList); |
|
205 |
} |
|
206 |
||
207 |
/** |
|
208 |
* Thus function has three main purposes: |
|
209 |
* <ul><li>Perform general syntax checking on the description, so the rest |
|
210 |
* of the build code can assume that it's parsing a legal description. |
|
211 |
* <li>Split the description into separate rules |
|
212 |
* <li>Perform variable-name substitutions (so that no one else sees |
|
213 |
* variable names) |
|
214 |
* </ul> |
|
215 |
*/ |
|
10110 | 216 |
private Vector<String> buildRuleList(String description) { |
2 | 217 |
// invariants: |
218 |
// - parentheses must be balanced: ()[]{}<> |
|
219 |
// - nothing can be nested inside <> |
|
220 |
// - nothing can be nested inside [] except more []s |
|
221 |
// - pairs of ()[]{}<> must not be empty |
|
222 |
// - ; can only occur at the outer level |
|
223 |
// - | can only appear inside () |
|
224 |
// - only one = or / can occur in a single rule |
|
225 |
// - = and / cannot both occur in the same rule |
|
226 |
// - <> can only occur on the left side of a = expression |
|
227 |
// (because we'll perform substitutions to eliminate them other places) |
|
228 |
// - the left-hand side of a = expression can only be a single character |
|
229 |
// (possibly with \) or text inside <> |
|
230 |
// - the right-hand side of a = expression must be enclosed in [] or () |
|
231 |
// - * may not occur at the beginning of a rule, nor may it follow |
|
232 |
// =, /, (, (, |, }, ;, or * |
|
233 |
// - ? may only follow * |
|
234 |
// - the rule list must contain at least one / rule |
|
235 |
// - no rule may be empty |
|
236 |
// - all printing characters in the ASCII range except letters and digits |
|
237 |
// are reserved and must be preceded by \ |
|
238 |
// - ! may only occur at the beginning of a rule |
|
239 |
||
240 |
// set up a vector to contain the broken-up description (each entry in the |
|
241 |
// vector is a separate rule) and a stack for keeping track of opening |
|
242 |
// punctuation |
|
10110 | 243 |
Vector<String> tempRuleList = new Vector<>(); |
244 |
Stack<Character> parenStack = new Stack<>(); |
|
2 | 245 |
|
246 |
int p = 0; |
|
247 |
int ruleStart = 0; |
|
248 |
int c = '\u0000'; |
|
249 |
int lastC = '\u0000'; |
|
250 |
int lastOpen = '\u0000'; |
|
251 |
boolean haveEquals = false; |
|
252 |
boolean havePipe = false; |
|
253 |
boolean sawVarName = false; |
|
254 |
final String charsThatCantPrecedeAsterisk = "=/{(|}*;\u0000"; |
|
255 |
||
256 |
// if the description doesn't end with a semicolon, tack a semicolon onto the end |
|
257 |
if (description.length() != 0 && |
|
258 |
description.codePointAt(description.length() - 1) != ';') { |
|
259 |
description = description + ";"; |
|
260 |
} |
|
261 |
||
262 |
// for each character, do... |
|
263 |
while (p < description.length()) { |
|
264 |
c = description.codePointAt(p); |
|
265 |
||
266 |
switch (c) { |
|
267 |
// if the character is a backslash, skip the character that follows it |
|
268 |
// (it'll get treated as a literal character) |
|
269 |
case '\\': |
|
270 |
++p; |
|
271 |
break; |
|
272 |
||
273 |
// if the character is opening punctuation, verify that no nesting |
|
274 |
// rules are broken, and push the character onto the stack |
|
275 |
case '{': |
|
276 |
case '<': |
|
277 |
case '[': |
|
278 |
case '(': |
|
279 |
if (lastOpen == '<') { |
|
280 |
error("Can't nest brackets inside <>", p, description); |
|
281 |
} |
|
282 |
if (lastOpen == '[' && c != '[') { |
|
283 |
error("Can't nest anything in [] but []", p, description); |
|
284 |
} |
|
285 |
||
286 |
// if we see < anywhere except on the left-hand side of =, |
|
287 |
// we must be seeing a variable name that was never defined |
|
288 |
if (c == '<' && (haveEquals || havePipe)) { |
|
289 |
error("Unknown variable name", p, description); |
|
290 |
} |
|
291 |
||
292 |
lastOpen = c; |
|
293 |
parenStack.push(new Character((char)c)); |
|
294 |
if (c == '<') { |
|
295 |
sawVarName = true; |
|
296 |
} |
|
297 |
break; |
|
298 |
||
299 |
// if the character is closing punctuation, verify that it matches the |
|
300 |
// last opening punctuation we saw, and that the brackets contain |
|
301 |
// something, then pop the stack |
|
302 |
case '}': |
|
303 |
case '>': |
|
304 |
case ']': |
|
305 |
case ')': |
|
306 |
char expectedClose = '\u0000'; |
|
307 |
switch (lastOpen) { |
|
308 |
case '{': |
|
309 |
expectedClose = '}'; |
|
310 |
break; |
|
311 |
case '[': |
|
312 |
expectedClose = ']'; |
|
313 |
break; |
|
314 |
case '(': |
|
315 |
expectedClose = ')'; |
|
316 |
break; |
|
317 |
case '<': |
|
318 |
expectedClose = '>'; |
|
319 |
break; |
|
320 |
} |
|
321 |
if (c != expectedClose) { |
|
322 |
error("Unbalanced parentheses", p, description); |
|
323 |
} |
|
324 |
if (lastC == lastOpen) { |
|
325 |
error("Parens don't contain anything", p, description); |
|
326 |
} |
|
327 |
parenStack.pop(); |
|
328 |
if (!parenStack.empty()) { |
|
10110 | 329 |
lastOpen = parenStack.peek().charValue(); |
2 | 330 |
} |
331 |
else { |
|
332 |
lastOpen = '\u0000'; |
|
333 |
} |
|
334 |
||
335 |
break; |
|
336 |
||
337 |
// if the character is an asterisk, make sure it occurs in a place |
|
338 |
// where an asterisk can legally go |
|
339 |
case '*': |
|
340 |
if (charsThatCantPrecedeAsterisk.indexOf(lastC) != -1) { |
|
341 |
error("Misplaced asterisk", p, description); |
|
342 |
} |
|
343 |
break; |
|
344 |
||
345 |
// if the character is a question mark, make sure it follows an asterisk |
|
346 |
case '?': |
|
347 |
if (lastC != '*') { |
|
348 |
error("Misplaced ?", p, description); |
|
349 |
} |
|
350 |
break; |
|
351 |
||
352 |
// if the character is an equals sign, make sure we haven't seen another |
|
353 |
// equals sign or a slash yet |
|
354 |
case '=': |
|
355 |
if (haveEquals || havePipe) { |
|
356 |
error("More than one = or / in rule", p, description); |
|
357 |
} |
|
358 |
haveEquals = true; |
|
359 |
break; |
|
360 |
||
361 |
// if the character is a slash, make sure we haven't seen another slash |
|
362 |
// or an equals sign yet |
|
363 |
case '/': |
|
364 |
if (haveEquals || havePipe) { |
|
365 |
error("More than one = or / in rule", p, description); |
|
366 |
} |
|
367 |
if (sawVarName) { |
|
368 |
error("Unknown variable name", p, description); |
|
369 |
} |
|
370 |
havePipe = true; |
|
371 |
break; |
|
372 |
||
373 |
// if the character is an exclamation point, make sure it occurs only |
|
374 |
// at the beginning of a rule |
|
375 |
case '!': |
|
376 |
if (lastC != ';' && lastC != '\u0000') { |
|
377 |
error("! can only occur at the beginning of a rule", p, description); |
|
378 |
} |
|
379 |
break; |
|
380 |
||
381 |
// we don't have to do anything special on a period |
|
382 |
case '.': |
|
383 |
break; |
|
384 |
||
385 |
// if the character is a syntax character that can only occur |
|
386 |
// inside [], make sure that it does in fact only occur inside []. |
|
387 |
case '^': |
|
388 |
case '-': |
|
389 |
case ':': |
|
390 |
if (lastOpen != '[' && lastOpen != '<') { |
|
391 |
error("Illegal character", p, description); |
|
392 |
} |
|
393 |
break; |
|
394 |
||
395 |
// if the character is a semicolon, do the following... |
|
396 |
case ';': |
|
397 |
// make sure the rule contains something and that there are no |
|
398 |
// unbalanced parentheses or brackets |
|
399 |
if (lastC == ';' || lastC == '\u0000') { |
|
400 |
error("Empty rule", p, description); |
|
401 |
} |
|
402 |
if (!parenStack.empty()) { |
|
403 |
error("Unbalanced parenheses", p, description); |
|
404 |
} |
|
405 |
||
406 |
if (parenStack.empty()) { |
|
407 |
// if the rule contained an = sign, call processSubstitution() |
|
408 |
// to replace the substitution name with the substitution text |
|
409 |
// wherever it appears in the description |
|
410 |
if (haveEquals) { |
|
411 |
description = processSubstitution(description.substring(ruleStart, |
|
412 |
p), description, p + 1); |
|
413 |
} |
|
414 |
else { |
|
415 |
// otherwise, check to make sure the rule doesn't reference |
|
416 |
// any undefined substitutions |
|
417 |
if (sawVarName) { |
|
418 |
error("Unknown variable name", p, description); |
|
419 |
} |
|
420 |
||
421 |
// then add it to tempRuleList |
|
422 |
tempRuleList.addElement(description.substring(ruleStart, p)); |
|
423 |
} |
|
424 |
||
425 |
// and reset everything to process the next rule |
|
426 |
ruleStart = p + 1; |
|
427 |
haveEquals = havePipe = sawVarName = false; |
|
428 |
} |
|
429 |
break; |
|
430 |
||
431 |
// if the character is a vertical bar, check to make sure that it |
|
432 |
// occurs inside a () expression and that the character that precedes |
|
433 |
// it isn't also a vertical bar |
|
434 |
case '|': |
|
435 |
if (lastC == '|') { |
|
436 |
error("Empty alternative", p, description); |
|
437 |
} |
|
438 |
if (parenStack.empty() || lastOpen != '(') { |
|
439 |
error("Misplaced |", p, description); |
|
440 |
} |
|
441 |
break; |
|
442 |
||
443 |
// if the character is anything else (escaped characters are |
|
444 |
// skipped and don't make it here), it's an error |
|
445 |
default: |
|
446 |
if (c >= ' ' && c < '\u007f' && !Character.isLetter((char)c) |
|
447 |
&& !Character.isDigit((char)c)) { |
|
448 |
error("Illegal character", p, description); |
|
449 |
} |
|
450 |
if (c >= Character.MIN_SUPPLEMENTARY_CODE_POINT) { |
|
451 |
++p; |
|
452 |
} |
|
453 |
break; |
|
454 |
} |
|
455 |
lastC = c; |
|
456 |
++p; |
|
457 |
} |
|
458 |
if (tempRuleList.size() == 0) { |
|
459 |
error("No valid rules in description", p, description); |
|
460 |
} |
|
461 |
return tempRuleList; |
|
462 |
} |
|
463 |
||
464 |
/** |
|
465 |
* This function performs variable-name substitutions. First it does syntax |
|
466 |
* checking on the variable-name definition. If it's syntactically valid, it |
|
467 |
* then goes through the remainder of the description and does a simple |
|
468 |
* find-and-replace of the variable name with its text. (The variable text |
|
469 |
* must be enclosed in either [] or () for this to work.) |
|
470 |
*/ |
|
471 |
protected String processSubstitution(String substitutionRule, String description, |
|
472 |
int startPos) { |
|
473 |
// isolate out the text on either side of the equals sign |
|
474 |
String replace; |
|
475 |
String replaceWith; |
|
476 |
int equalPos = substitutionRule.indexOf('='); |
|
477 |
replace = substitutionRule.substring(0, equalPos); |
|
478 |
replaceWith = substitutionRule.substring(equalPos + 1); |
|
479 |
||
480 |
// check to see whether the substitution name is something we've declared |
|
481 |
// to be "special". For RuleBasedBreakIterator itself, this is "<ignore>". |
|
482 |
// This function takes care of any extra processing that has to be done |
|
483 |
// with "special" substitution names. |
|
484 |
handleSpecialSubstitution(replace, replaceWith, startPos, description); |
|
485 |
||
486 |
// perform various other syntax checks on the rule |
|
487 |
if (replaceWith.length() == 0) { |
|
488 |
error("Nothing on right-hand side of =", startPos, description); |
|
489 |
} |
|
490 |
if (replace.length() == 0) { |
|
491 |
error("Nothing on left-hand side of =", startPos, description); |
|
492 |
} |
|
493 |
if (replace.length() == 2 && replace.charAt(0) != '\\') { |
|
494 |
error("Illegal left-hand side for =", startPos, description); |
|
495 |
} |
|
496 |
if (replace.length() >= 3 && replace.charAt(0) != '<' && |
|
497 |
replace.codePointBefore(equalPos) != '>') { |
|
498 |
error("Illegal left-hand side for =", startPos, description); |
|
499 |
} |
|
500 |
if (!(replaceWith.charAt(0) == '[' && |
|
501 |
replaceWith.charAt(replaceWith.length() - 1) == ']') && |
|
502 |
!(replaceWith.charAt(0) == '(' && |
|
503 |
replaceWith.charAt(replaceWith.length() - 1) == ')')) { |
|
504 |
error("Illegal right-hand side for =", startPos, description); |
|
505 |
} |
|
506 |
||
507 |
// now go through the rest of the description (which hasn't been broken up |
|
508 |
// into separate rules yet) and replace every occurrence of the |
|
509 |
// substitution name with the substitution body |
|
510 |
StringBuffer result = new StringBuffer(); |
|
511 |
result.append(description.substring(0, startPos)); |
|
512 |
int lastPos = startPos; |
|
513 |
int pos = description.indexOf(replace, startPos); |
|
514 |
while (pos != -1) { |
|
515 |
result.append(description.substring(lastPos, pos)); |
|
516 |
result.append(replaceWith); |
|
517 |
lastPos = pos + replace.length(); |
|
518 |
pos = description.indexOf(replace, lastPos); |
|
519 |
} |
|
520 |
result.append(description.substring(lastPos)); |
|
521 |
return result.toString(); |
|
522 |
} |
|
523 |
||
524 |
/** |
|
525 |
* This function defines a protocol for handling substitution names that |
|
526 |
* are "special," i.e., that have some property beyond just being |
|
527 |
* substitutions. At the RuleBasedBreakIterator level, we have one |
|
528 |
* special substitution name, "<ignore>". Subclasses can override this |
|
529 |
* function to add more. Any special processing that has to go on beyond |
|
530 |
* that which is done by the normal substitution-processing code is done |
|
531 |
* here. |
|
532 |
*/ |
|
533 |
protected void handleSpecialSubstitution(String replace, String replaceWith, |
|
534 |
int startPos, String description) { |
|
535 |
// if we get a definition for a substitution called "ignore", it defines |
|
536 |
// the ignore characters for the iterator. Check to make sure the expression |
|
537 |
// is a [] expression, and if it is, parse it and store the characters off |
|
538 |
// to the side. |
|
539 |
if (replace.equals("<ignore>")) { |
|
540 |
if (replaceWith.charAt(0) == '(') { |
|
541 |
error("Ignore group can't be enclosed in (", startPos, description); |
|
542 |
} |
|
543 |
ignoreChars = CharSet.parseString(replaceWith); |
|
544 |
} |
|
545 |
} |
|
546 |
||
547 |
/** |
|
548 |
* This function builds the character category table. On entry, |
|
549 |
* tempRuleList is a vector of break rules that has had variable names substituted. |
|
550 |
* On exit, the charCategoryTable data member has been initialized to hold the |
|
551 |
* character category table, and tempRuleList's rules have been munged to contain |
|
552 |
* character category numbers everywhere a literal character or a [] expression |
|
553 |
* originally occurred. |
|
554 |
*/ |
|
10110 | 555 |
@SuppressWarnings("fallthrough") |
556 |
protected void buildCharCategories(Vector<String> tempRuleList) { |
|
2 | 557 |
int bracketLevel = 0; |
558 |
int p = 0; |
|
559 |
int lineNum = 0; |
|
560 |
||
561 |
// build hash table of every literal character or [] expression in the rule list |
|
562 |
// and use CharSet.parseString() to derive a CharSet object representing the |
|
563 |
// characters each refers to |
|
10110 | 564 |
expressions = new Hashtable<>(); |
2 | 565 |
while (lineNum < tempRuleList.size()) { |
10110 | 566 |
String line = tempRuleList.elementAt(lineNum); |
2 | 567 |
p = 0; |
568 |
while (p < line.length()) { |
|
569 |
int c = line.codePointAt(p); |
|
570 |
switch (c) { |
|
571 |
// skip over all syntax characters except [ |
|
572 |
case '{': case '}': case '(': case ')': case '*': case '.': |
|
573 |
case '/': case '|': case ';': case '?': case '!': |
|
574 |
break; |
|
575 |
||
576 |
// for [, find the matching ] (taking nested [] pairs into account) |
|
577 |
// and add the whole expression to the expression list |
|
578 |
case '[': |
|
579 |
int q = p + 1; |
|
580 |
++bracketLevel; |
|
581 |
while (q < line.length() && bracketLevel != 0) { |
|
582 |
c = line.codePointAt(q); |
|
583 |
switch (c) { |
|
584 |
case '\\': |
|
585 |
q++; |
|
586 |
break; |
|
587 |
case '[': |
|
588 |
++bracketLevel; |
|
589 |
break; |
|
590 |
case ']': |
|
591 |
--bracketLevel; |
|
592 |
break; |
|
593 |
} |
|
594 |
q = q + Character.charCount(c); |
|
595 |
} |
|
596 |
if (expressions.get(line.substring(p, q)) == null) { |
|
597 |
expressions.put(line.substring(p, q), CharSet.parseString(line.substring(p, q))); |
|
598 |
} |
|
599 |
p = q - 1; |
|
600 |
break; |
|
601 |
||
602 |
// for \ sequences, just move to the next character and treat |
|
603 |
// it as a single character |
|
604 |
case '\\': |
|
605 |
++p; |
|
606 |
c = line.codePointAt(p); |
|
607 |
// DON'T break; fall through into "default" clause |
|
608 |
||
609 |
// for an isolated single character, add it to the expression list |
|
610 |
default: |
|
611 |
expressions.put(line.substring(p, p + 1), CharSet.parseString(line.substring(p, p + 1))); |
|
612 |
break; |
|
613 |
} |
|
614 |
p += Character.charCount(line.codePointAt(p)); |
|
615 |
} |
|
616 |
++lineNum; |
|
617 |
} |
|
618 |
// dump CharSet's internal expression cache |
|
619 |
CharSet.releaseExpressionCache(); |
|
620 |
||
621 |
// create the temporary category table (which is a vector of CharSet objects) |
|
10110 | 622 |
categories = new Vector<>(); |
2 | 623 |
if (ignoreChars != null) { |
624 |
categories.addElement(ignoreChars); |
|
625 |
} |
|
626 |
else { |
|
627 |
categories.addElement(new CharSet()); |
|
628 |
} |
|
629 |
ignoreChars = null; |
|
630 |
||
631 |
// this is a hook to allow subclasses to add categories on their own |
|
632 |
mungeExpressionList(expressions); |
|
633 |
||
634 |
// Derive the character categories. Go through the existing character categories |
|
635 |
// looking for overlap. Any time there's overlap, we create a new character |
|
636 |
// category for the characters that overlapped and remove them from their original |
|
637 |
// category. At the end, any characters that are left in the expression haven't |
|
638 |
// been mentioned in any category, so another new category is created for them. |
|
639 |
// For example, if the first expression is [abc], then a, b, and c will be placed |
|
640 |
// into a single character category. If the next expression is [bcd], we will first |
|
641 |
// remove b and c from their existing category (leaving a behind), create a new |
|
642 |
// category for b and c, and then create another new category for d (which hadn't |
|
643 |
// been mentioned in the previous expression). |
|
644 |
// At no time should a character ever occur in more than one character category. |
|
645 |
||
646 |
// for each expression in the expressions list, do... |
|
10110 | 647 |
for (Enumeration<Object> iter = expressions.elements(); iter.hasMoreElements(); ) { |
2 | 648 |
// initialize the working char set to the chars in the current expression |
649 |
CharSet e = (CharSet)iter.nextElement(); |
|
650 |
||
651 |
// for each category in the category list, do... |
|
652 |
for (int j = categories.size() - 1; !e.empty() && j > 0; j--) { |
|
653 |
||
654 |
// if there's overlap between the current working set of chars |
|
655 |
// and the current category... |
|
10110 | 656 |
CharSet that = categories.elementAt(j); |
2 | 657 |
if (!that.intersection(e).empty()) { |
658 |
||
659 |
// add a new category for the characters that were in the |
|
660 |
// current category but not in the working char set |
|
661 |
CharSet temp = that.difference(e); |
|
662 |
if (!temp.empty()) { |
|
663 |
categories.addElement(temp); |
|
664 |
} |
|
665 |
||
666 |
// remove those characters from the working char set and replace |
|
667 |
// the current category with the characters that it did |
|
668 |
// have in common with the current working char set |
|
669 |
temp = e.intersection(that); |
|
670 |
e = e.difference(that); |
|
671 |
if (!temp.equals(that)) { |
|
672 |
categories.setElementAt(temp, j); |
|
673 |
} |
|
674 |
} |
|
675 |
} |
|
676 |
||
677 |
// if there are still characters left in the working char set, |
|
678 |
// add a new category containing them |
|
679 |
if (!e.empty()) { |
|
680 |
categories.addElement(e); |
|
681 |
} |
|
682 |
} |
|
683 |
||
684 |
// we have the ignore characters stored in position 0. Make an extra pass through |
|
685 |
// the character category list and remove anything from the ignore list that shows |
|
686 |
// up in some other category |
|
687 |
CharSet allChars = new CharSet(); |
|
688 |
for (int i = 1; i < categories.size(); i++) { |
|
10110 | 689 |
allChars = allChars.union(categories.elementAt(i)); |
2 | 690 |
} |
10110 | 691 |
CharSet ignoreChars = categories.elementAt(0); |
2 | 692 |
ignoreChars = ignoreChars.difference(allChars); |
693 |
categories.setElementAt(ignoreChars, 0); |
|
694 |
||
695 |
// now that we've derived the character categories, go back through the expression |
|
696 |
// list and replace each CharSet object with a String that represents the |
|
697 |
// character categories that expression refers to. The String is encoded: each |
|
698 |
// character is a character category number (plus 0x100 to avoid confusing them |
|
699 |
// with syntax characters in the rule grammar) |
|
10110 | 700 |
|
701 |
for (Enumeration<String> iter = expressions.keys(); iter.hasMoreElements(); ) { |
|
702 |
String key = iter.nextElement(); |
|
2 | 703 |
CharSet cs = (CharSet)expressions.get(key); |
704 |
StringBuffer cats = new StringBuffer(); |
|
705 |
||
706 |
// for each category... |
|
707 |
for (int j = 0; j < categories.size(); j++) { |
|
708 |
||
709 |
// if the current expression contains characters in that category... |
|
10110 | 710 |
CharSet temp = cs.intersection(categories.elementAt(j)); |
2 | 711 |
if (!temp.empty()) { |
712 |
||
713 |
// then add the encoded category number to the String for this |
|
714 |
// expression |
|
715 |
cats.append((char)(0x100 + j)); |
|
716 |
if (temp.equals(cs)) { |
|
717 |
break; |
|
718 |
} |
|
719 |
} |
|
720 |
} |
|
721 |
||
722 |
// once we've finished building the encoded String for this expression, |
|
723 |
// replace the CharSet object with it |
|
724 |
expressions.put(key, cats.toString()); |
|
725 |
} |
|
726 |
||
727 |
// and finally, we turn the temporary category table into a permanent category |
|
728 |
// table, which is a CompactByteArray. (we skip category 0, which by definition |
|
729 |
// refers to all characters not mentioned specifically in the rules) |
|
730 |
charCategoryTable = new CompactByteArray((byte)0); |
|
731 |
supplementaryCharCategoryTable = new SupplementaryCharacterData((byte)0); |
|
732 |
||
733 |
// for each category... |
|
734 |
for (int i = 0; i < categories.size(); i++) { |
|
10110 | 735 |
CharSet chars = categories.elementAt(i); |
2 | 736 |
|
737 |
// go through the character ranges in the category one by one... |
|
10110 | 738 |
Enumeration<int[]> enum_ = chars.getChars(); |
2 | 739 |
while (enum_.hasMoreElements()) { |
10110 | 740 |
int[] range = enum_.nextElement(); |
2 | 741 |
|
742 |
// and set the corresponding elements in the CompactArray accordingly |
|
743 |
if (i != 0) { |
|
744 |
if (range[0] < Character.MIN_SUPPLEMENTARY_CODE_POINT) { |
|
745 |
if (range[1] < Character.MIN_SUPPLEMENTARY_CODE_POINT) { |
|
746 |
charCategoryTable.setElementAt((char)range[0], (char)range[1], (byte)i); |
|
747 |
} else { |
|
748 |
charCategoryTable.setElementAt((char)range[0], (char)0xFFFF, (byte)i); |
|
749 |
supplementaryCharCategoryTable.appendElement(Character.MIN_SUPPLEMENTARY_CODE_POINT, range[1], (byte)i); |
|
750 |
} |
|
751 |
} else { |
|
752 |
supplementaryCharCategoryTable.appendElement(range[0], range[1], (byte)i); |
|
753 |
} |
|
754 |
} |
|
755 |
||
756 |
// (category 0 is special-- it's the hiding place for the ignore |
|
757 |
// characters, whose real category number in the CompactArray is |
|
758 |
// -1 [this is because category 0 contains all characters not |
|
759 |
// specifically mentioned anywhere in the rules] ) |
|
760 |
else { |
|
761 |
if (range[0] < Character.MIN_SUPPLEMENTARY_CODE_POINT) { |
|
762 |
if (range[1] < Character.MIN_SUPPLEMENTARY_CODE_POINT) { |
|
763 |
charCategoryTable.setElementAt((char)range[0], (char)range[1], IGNORE); |
|
764 |
} else { |
|
765 |
charCategoryTable.setElementAt((char)range[0], (char)0xFFFF, IGNORE); |
|
766 |
supplementaryCharCategoryTable.appendElement(Character.MIN_SUPPLEMENTARY_CODE_POINT, range[1], IGNORE); |
|
767 |
} |
|
768 |
} else { |
|
769 |
supplementaryCharCategoryTable.appendElement(range[0], range[1], IGNORE); |
|
770 |
} |
|
771 |
} |
|
772 |
} |
|
773 |
} |
|
774 |
||
775 |
// once we've populated the CompactArray, compact it |
|
776 |
charCategoryTable.compact(); |
|
777 |
||
778 |
// And, complete the category table for supplementary characters |
|
779 |
supplementaryCharCategoryTable.complete(); |
|
780 |
||
781 |
// initialize numCategories |
|
782 |
numCategories = categories.size(); |
|
783 |
} |
|
784 |
||
10110 | 785 |
protected void mungeExpressionList(Hashtable<String, Object> expressions) { |
2 | 786 |
// empty in the parent class. This function provides a hook for subclasses |
787 |
// to mess with the character category table. |
|
788 |
} |
|
789 |
||
790 |
/** |
|
791 |
* This is the function that builds the forward state table. Most of the real |
|
792 |
* work is done in parseRule(), which is called once for each rule in the |
|
793 |
* description. |
|
794 |
*/ |
|
10110 | 795 |
private void buildStateTable(Vector<String> tempRuleList) { |
2 | 796 |
// initialize our temporary state table, and fill it with two states: |
797 |
// state 0 is a dummy state that allows state 1 to be the starting state |
|
798 |
// and 0 to represent "stop". State 1 is added here to seed things |
|
799 |
// before we start parsing |
|
10110 | 800 |
tempStateTable = new Vector<>(); |
2 | 801 |
tempStateTable.addElement(new short[numCategories + 1]); |
802 |
tempStateTable.addElement(new short[numCategories + 1]); |
|
803 |
||
804 |
// call parseRule() for every rule in the rule list (except those which |
|
805 |
// start with !, which are actually backwards-iteration rules) |
|
806 |
for (int i = 0; i < tempRuleList.size(); i++) { |
|
10110 | 807 |
String rule = tempRuleList.elementAt(i); |
2 | 808 |
if (rule.charAt(0) != '!') { |
809 |
parseRule(rule, true); |
|
810 |
} |
|
811 |
} |
|
812 |
||
813 |
// finally, use finishBuildingStateTable() to minimize the number of |
|
814 |
// states in the table and perform some other cleanup work |
|
815 |
finishBuildingStateTable(true); |
|
816 |
} |
|
817 |
||
818 |
/** |
|
819 |
* This is where most of the work really happens. This routine parses a single |
|
820 |
* rule in the rule description, adding and modifying states in the state |
|
821 |
* table according to the new expression. The state table is kept deterministic |
|
822 |
* throughout the whole operation, although some ugly postprocessing is needed |
|
823 |
* to handle the *? token. |
|
824 |
*/ |
|
825 |
private void parseRule(String rule, boolean forward) { |
|
826 |
// algorithm notes: |
|
827 |
// - The basic idea here is to read successive character-category groups |
|
828 |
// from the input string. For each group, you create a state and point |
|
829 |
// the appropriate entries in the previous state to it. This produces a |
|
830 |
// straight line from the start state to the end state. The {}, *, and (|) |
|
831 |
// idioms produce branches in this straight line. These branches (states |
|
832 |
// that can transition to more than one other state) are called "decision |
|
833 |
// points." A list of decision points is kept. This contains a list of |
|
834 |
// all states that can transition to the next state to be created. For a |
|
835 |
// straight line progression, the only thing in the decision-point list is |
|
836 |
// the current state. But if there's a branch, the decision-point list |
|
837 |
// will contain all of the beginning points of the branch when the next |
|
838 |
// state to be created represents the end point of the branch. A stack is |
|
839 |
// used to save decision point lists in the presence of nested parentheses |
|
840 |
// and the like. For example, when a { is encountered, the current decision |
|
841 |
// point list is saved on the stack and restored when the corresponding } |
|
842 |
// is encountered. This way, after the } is read, the decision point list |
|
843 |
// will contain both the state right before the } _and_ the state before |
|
844 |
// the whole {} expression. Both of these states can transition to the next |
|
845 |
// state after the {} expression. |
|
846 |
// - one complication arises when we have to stamp a transition value into |
|
847 |
// an array cell that already contains one. The updateStateTable() and |
|
848 |
// mergeStates() functions handle this case. Their basic approach is to |
|
849 |
// create a new state that combines the two states that conflict and point |
|
850 |
// at it when necessary. This happens recursively, so if the merged states |
|
851 |
// also conflict, they're resolved in the same way, and so on. There are |
|
852 |
// a number of tests aimed at preventing infinite recursion. |
|
853 |
// - another complication arises with repeating characters. It's somewhat |
|
854 |
// ambiguous whether the user wants a greedy or non-greedy match in these cases. |
|
855 |
// (e.g., whether "[a-z]*abc" means the SHORTEST sequence of letters ending in |
|
856 |
// "abc" or the LONGEST sequence of letters ending in "abc". We've adopted |
|
857 |
// the *? to mean "shortest" and * by itself to mean "longest". (You get the |
|
858 |
// same result with both if there's no overlap between the repeating character |
|
859 |
// group and the group immediately following it.) Handling the *? token is |
|
860 |
// rather complicated and involves keeping track of whether a state needs to |
|
861 |
// be merged (as described above) or merely overwritten when you update one of |
|
862 |
// its cells, and copying the contents of a state that loops with a *? token |
|
863 |
// into some of the states that follow it after the rest of the table-building |
|
864 |
// process is complete ("backfilling"). |
|
865 |
// implementation notes: |
|
866 |
// - This function assumes syntax checking has been performed on the input string |
|
867 |
// prior to its being passed in here. It assumes that parentheses are |
|
868 |
// balanced, all literal characters are enclosed in [] and turned into category |
|
869 |
// numbers, that there are no illegal characters or character sequences, and so |
|
870 |
// on. Violation of these invariants will lead to undefined behavior. |
|
871 |
// - It'd probably be better to use linked lists rather than Vector and Stack |
|
872 |
// to maintain the decision point list and stack. I went for simplicity in |
|
873 |
// this initial implementation. If performance is critical enough, we can go |
|
874 |
// back and fix this later. |
|
875 |
// -There are a number of important limitations on the *? token. It does not work |
|
876 |
// right when followed by a repeating character sequence (e.g., ".*?(abc)*") |
|
877 |
// (although it does work right when followed by a single repeating character). |
|
878 |
// It will not always work right when nested in parentheses or braces (although |
|
879 |
// sometimes it will). It also will not work right if the group of repeating |
|
880 |
// characters and the group of characters that follows overlap partially |
|
881 |
// (e.g., "[a-g]*?[e-j]"). None of these capabilites was deemed necessary for |
|
882 |
// describing breaking rules we know about, so we left them out for |
|
883 |
// expeditiousness. |
|
884 |
// - Rules such as "[a-z]*?abc;" will be treated the same as "[a-z]*?aa*bc;"-- |
|
885 |
// that is, if the string ends in "aaaabc", the break will go before the first |
|
886 |
// "a" rather than the last one. Both of these are limitations in the design |
|
887 |
// of RuleBasedBreakIterator and not limitations of the rule parser. |
|
888 |
||
889 |
int p = 0; |
|
890 |
int currentState = 1; // don't use state number 0; 0 means "stop" |
|
891 |
int lastState = currentState; |
|
892 |
String pendingChars = ""; |
|
893 |
||
10110 | 894 |
decisionPointStack = new Stack<>(); |
895 |
decisionPointList = new Vector<>(); |
|
896 |
loopingStates = new Vector<>(); |
|
897 |
statesToBackfill = new Vector<>(); |
|
2 | 898 |
|
899 |
short[] state; |
|
900 |
boolean sawEarlyBreak = false; |
|
901 |
||
902 |
// if we're adding rules to the backward state table, mark the initial state |
|
903 |
// as a looping state |
|
904 |
if (!forward) { |
|
905 |
loopingStates.addElement(new Integer(1)); |
|
906 |
} |
|
907 |
||
908 |
// put the current state on the decision point list before we start |
|
909 |
decisionPointList.addElement(new Integer(currentState)); // we want currentState to |
|
910 |
// be 1 here... |
|
911 |
currentState = tempStateTable.size() - 1; // but after that, we want it to be |
|
912 |
// 1 less than the state number of the next state |
|
913 |
while (p < rule.length()) { |
|
914 |
int c = rule.codePointAt(p); |
|
915 |
clearLoopingStates = false; |
|
916 |
||
917 |
// this section handles literal characters, escaped characters (which are |
|
918 |
// effectively literal characters too), the . token, and [] expressions |
|
919 |
if (c == '[' |
|
920 |
|| c == '\\' |
|
921 |
|| Character.isLetter(c) |
|
922 |
|| Character.isDigit(c) |
|
923 |
|| c < ' ' |
|
924 |
|| c == '.' |
|
925 |
|| c >= '\u007f') { |
|
926 |
||
927 |
// if we're not on a period, isolate the expression and look up |
|
928 |
// the corresponding category list |
|
929 |
if (c != '.') { |
|
930 |
int q = p; |
|
931 |
||
932 |
// if we're on a backslash, the expression is the character |
|
933 |
// after the backslash |
|
934 |
if (c == '\\') { |
|
935 |
q = p + 2; |
|
936 |
++p; |
|
937 |
} |
|
938 |
||
939 |
// if we're on an opening bracket, scan to the closing bracket |
|
940 |
// to isolate the expression |
|
941 |
else if (c == '[') { |
|
942 |
int bracketLevel = 1; |
|
943 |
||
944 |
q += Character.charCount(rule.codePointAt(q)); |
|
945 |
while (bracketLevel > 0) { |
|
946 |
c = rule.codePointAt(q); |
|
947 |
if (c == '[') { |
|
948 |
++bracketLevel; |
|
949 |
} |
|
950 |
else if (c == ']') { |
|
951 |
--bracketLevel; |
|
952 |
} |
|
953 |
else if (c == '\\') { |
|
954 |
c = rule.codePointAt(++q); |
|
955 |
} |
|
956 |
q += Character.charCount(c); |
|
957 |
} |
|
958 |
} |
|
959 |
||
960 |
// otherwise, the expression is just the character itself |
|
961 |
else { |
|
962 |
q = p + Character.charCount(c); |
|
963 |
} |
|
964 |
||
965 |
// look up the category list for the expression and store it |
|
966 |
// in pendingChars |
|
967 |
pendingChars = (String)expressions.get(rule.substring(p, q)); |
|
968 |
||
969 |
// advance the current position past the expression |
|
970 |
p = q - Character.charCount(rule.codePointBefore(q)); |
|
971 |
} |
|
972 |
||
973 |
// if the character we're on is a period, we end up down here |
|
974 |
else { |
|
10110 | 975 |
int rowNum = decisionPointList.lastElement().intValue(); |
976 |
state = tempStateTable.elementAt(rowNum); |
|
2 | 977 |
|
978 |
// if the period is followed by an asterisk, then just set the current |
|
979 |
// state to loop back on itself |
|
980 |
if (p + 1 < rule.length() && rule.charAt(p + 1) == '*' && state[0] != 0) { |
|
981 |
decisionPointList.addElement(new Integer(state[0])); |
|
982 |
pendingChars = ""; |
|
983 |
++p; |
|
984 |
} |
|
985 |
||
986 |
// otherwise, fabricate a category list ("pendingChars") with |
|
987 |
// every category in it |
|
988 |
else { |
|
989 |
StringBuffer temp = new StringBuffer(); |
|
990 |
for (int i = 0; i < numCategories; i++) |
|
991 |
temp.append((char)(i + 0x100)); |
|
992 |
pendingChars = temp.toString(); |
|
993 |
} |
|
994 |
} |
|
995 |
||
996 |
// we'll end up in here for all expressions except for .*, which is |
|
997 |
// special-cased above |
|
998 |
if (pendingChars.length() != 0) { |
|
999 |
||
1000 |
// if the expression is followed by an asterisk, then push a copy |
|
1001 |
// of the current desicion point list onto the stack (this is |
|
1002 |
// the same thing we do on an opening brace) |
|
1003 |
if (p + 1 < rule.length() && rule.charAt(p + 1) == '*') { |
|
10110 | 1004 |
@SuppressWarnings("unchecked") |
1005 |
Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone(); |
|
1006 |
decisionPointStack.push(clone); |
|
2 | 1007 |
} |
1008 |
||
1009 |
// create a new state, add it to the list of states to backfill |
|
1010 |
// if we have looping states to worry about, set its "don't make |
|
1011 |
// me an accepting state" flag if we've seen a slash, and add |
|
1012 |
// it to the end of the state table |
|
1013 |
int newState = tempStateTable.size(); |
|
1014 |
if (loopingStates.size() != 0) { |
|
1015 |
statesToBackfill.addElement(new Integer(newState)); |
|
1016 |
} |
|
1017 |
state = new short[numCategories + 1]; |
|
1018 |
if (sawEarlyBreak) { |
|
1019 |
state[numCategories] = DONT_LOOP_FLAG; |
|
1020 |
} |
|
1021 |
tempStateTable.addElement(state); |
|
1022 |
||
1023 |
// update everybody in the decision point list to point to |
|
1024 |
// the new state (this also performs all the reconciliation |
|
1025 |
// needed to make the table deterministic), then clear the |
|
1026 |
// decision point list |
|
1027 |
updateStateTable(decisionPointList, pendingChars, (short)newState); |
|
1028 |
decisionPointList.removeAllElements(); |
|
1029 |
||
1030 |
// add all states created since the last literal character we've |
|
1031 |
// seen to the decision point list |
|
1032 |
lastState = currentState; |
|
1033 |
do { |
|
1034 |
++currentState; |
|
1035 |
decisionPointList.addElement(new Integer(currentState)); |
|
1036 |
} while (currentState + 1 < tempStateTable.size()); |
|
1037 |
} |
|
1038 |
} |
|
1039 |
||
1040 |
// a { marks the beginning of an optional run of characters. Push a |
|
1041 |
// copy of the current decision point list onto the stack. This saves |
|
1042 |
// it, preventing it from being affected by whatever's inside the parentheses. |
|
1043 |
// This decision point list is restored when a } is encountered. |
|
1044 |
else if (c == '{') { |
|
10110 | 1045 |
@SuppressWarnings("unchecked") |
1046 |
Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone(); |
|
1047 |
decisionPointStack.push(clone); |
|
2 | 1048 |
} |
1049 |
||
1050 |
// a } marks the end of an optional run of characters. Pop the last decision |
|
1051 |
// point list off the stack and merge it with the current decision point list. |
|
1052 |
// a * denotes a repeating character or group (* after () is handled separately |
|
1053 |
// below). In addition to restoring the decision point list, modify the |
|
1054 |
// current state to point to itself on the appropriate character categories. |
|
1055 |
else if (c == '}' || c == '*') { |
|
1056 |
// when there's a *, update the current state to loop back on itself |
|
1057 |
// on the character categories that caused us to enter this state |
|
1058 |
if (c == '*') { |
|
1059 |
for (int i = lastState + 1; i < tempStateTable.size(); i++) { |
|
10110 | 1060 |
Vector<Integer> temp = new Vector<>(); |
2 | 1061 |
temp.addElement(new Integer(i)); |
1062 |
updateStateTable(temp, pendingChars, (short)(lastState + 1)); |
|
1063 |
} |
|
1064 |
} |
|
1065 |
||
1066 |
// pop the top element off the decision point stack and merge |
|
1067 |
// it with the current decision point list (this causes the divergent |
|
1068 |
// paths through the state table to come together again on the next |
|
1069 |
// new state) |
|
10110 | 1070 |
Vector<Integer> temp = decisionPointStack.pop(); |
2 | 1071 |
for (int i = 0; i < decisionPointList.size(); i++) |
1072 |
temp.addElement(decisionPointList.elementAt(i)); |
|
1073 |
decisionPointList = temp; |
|
1074 |
} |
|
1075 |
||
1076 |
// a ? after a * modifies the behavior of * in cases where there is overlap |
|
1077 |
// between the set of characters that repeat and the characters which follow. |
|
1078 |
// Without the ?, all states following the repeating state, up to a state which |
|
1079 |
// is reached by a character that doesn't overlap, will loop back into the |
|
1080 |
// repeating state. With the ?, the mark states following the *? DON'T loop |
|
1081 |
// back into the repeating state. Thus, "[a-z]*xyz" will match the longest |
|
1082 |
// sequence of letters that ends in "xyz," while "[a-z]*? will match the |
|
1083 |
// _shortest_ sequence of letters that ends in "xyz". |
|
1084 |
// We use extra bookkeeping to achieve this effect, since everything else works |
|
1085 |
// according to the "longest possible match" principle. The basic principle |
|
1086 |
// is that transitions out of a looping state are written in over the looping |
|
1087 |
// value instead of being reconciled, and that we copy the contents of the |
|
1088 |
// looping state into empty cells of all non-terminal states that follow the |
|
1089 |
// looping state. |
|
1090 |
else if (c == '?') { |
|
1091 |
setLoopingStates(decisionPointList, decisionPointList); |
|
1092 |
} |
|
1093 |
||
1094 |
// a ( marks the beginning of a sequence of characters. Parentheses can either |
|
1095 |
// contain several alternative character sequences (i.e., "(ab|cd|ef)"), or |
|
1096 |
// they can contain a sequence of characters that can repeat (i.e., "(abc)*"). Thus, |
|
1097 |
// A () group can have multiple entry and exit points. To keep track of this, |
|
1098 |
// we reserve TWO spots on the decision-point stack. The top of the stack is |
|
1099 |
// the list of exit points, which becomes the current decision point list when |
|
1100 |
// the ) is reached. The next entry down is the decision point list at the |
|
1101 |
// beginning of the (), which becomes the current decision point list at every |
|
1102 |
// entry point. |
|
1103 |
// In addition to keeping track of the exit points and the active decision |
|
1104 |
// points before the ( (i.e., the places from which the () can be entered), |
|
1105 |
// we need to keep track of the entry points in case the expression loops |
|
1106 |
// (i.e., is followed by *). We do that by creating a dummy state in the |
|
1107 |
// state table and adding it to the decision point list (BEFORE it's duplicated |
|
1108 |
// on the stack). Nobody points to this state, so it'll get optimized out |
|
1109 |
// at the end. It exists only to hold the entry points in case the () |
|
1110 |
// expression loops. |
|
1111 |
else if (c == '(') { |
|
1112 |
||
1113 |
// add a new state to the state table to hold the entry points into |
|
1114 |
// the () expression |
|
1115 |
tempStateTable.addElement(new short[numCategories + 1]); |
|
1116 |
||
1117 |
// we have to adjust lastState and currentState to account for the |
|
1118 |
// new dummy state |
|
1119 |
lastState = currentState; |
|
1120 |
++currentState; |
|
1121 |
||
1122 |
// add the current state to the decision point list (add it at the |
|
1123 |
// BEGINNING so we can find it later) |
|
1124 |
decisionPointList.insertElementAt(new Integer(currentState), 0); |
|
1125 |
||
1126 |
// finally, push a copy of the current decision point list onto the |
|
1127 |
// stack (this keeps track of the active decision point list before |
|
1128 |
// the () expression), followed by an empty decision point list |
|
1129 |
// (this will hold the exit points) |
|
10110 | 1130 |
@SuppressWarnings("unchecked") |
1131 |
Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone(); |
|
1132 |
decisionPointStack.push(clone); |
|
1133 |
decisionPointStack.push(new Vector<Integer>()); |
|
2 | 1134 |
} |
1135 |
||
1136 |
// a | separates alternative character sequences in a () expression. When |
|
1137 |
// a | is encountered, we add the current decision point list to the exit-point |
|
1138 |
// list, and restore the decision point list to its state prior to the (. |
|
1139 |
else if (c == '|') { |
|
1140 |
||
1141 |
// pick out the top two decision point lists on the stack |
|
10110 | 1142 |
Vector<Integer> oneDown = decisionPointStack.pop(); |
1143 |
Vector<Integer> twoDown = decisionPointStack.peek(); |
|
2 | 1144 |
decisionPointStack.push(oneDown); |
1145 |
||
1146 |
// append the current decision point list to the list below it |
|
1147 |
// on the stack (the list of exit points), and restore the |
|
1148 |
// current decision point list to its state before the () expression |
|
1149 |
for (int i = 0; i < decisionPointList.size(); i++) |
|
1150 |
oneDown.addElement(decisionPointList.elementAt(i)); |
|
10110 | 1151 |
@SuppressWarnings("unchecked") |
1152 |
Vector<Integer> clone = (Vector<Integer>)twoDown.clone(); |
|
1153 |
decisionPointList = clone; |
|
2 | 1154 |
} |
1155 |
||
1156 |
// a ) marks the end of a sequence of characters. We do one of two things |
|
1157 |
// depending on whether the sequence repeats (i.e., whether the ) is followed |
|
1158 |
// by *): If the sequence doesn't repeat, then the exit-point list is merged |
|
1159 |
// with the current decision point list and the decision point list from before |
|
1160 |
// the () is thrown away. If the sequence does repeat, then we fish out the |
|
1161 |
// state we were in before the ( and copy all of its forward transitions |
|
1162 |
// (i.e., every transition added by the () expression) into every state in the |
|
1163 |
// exit-point list and the current decision point list. The current decision |
|
1164 |
// point list is then merged with both the exit-point list AND the saved version |
|
1165 |
// of the decision point list from before the (). Then we throw out the *. |
|
1166 |
else if (c == ')') { |
|
1167 |
||
1168 |
// pull the exit point list off the stack, merge it with the current |
|
1169 |
// decision point list, and make the merged version the current |
|
1170 |
// decision point list |
|
10110 | 1171 |
Vector<Integer> exitPoints = decisionPointStack.pop(); |
2 | 1172 |
for (int i = 0; i < decisionPointList.size(); i++) |
1173 |
exitPoints.addElement(decisionPointList.elementAt(i)); |
|
1174 |
decisionPointList = exitPoints; |
|
1175 |
||
1176 |
// if the ) isn't followed by a *, then all we have to do is throw |
|
1177 |
// away the other list on the decision point stack, and we're done |
|
1178 |
if (p + 1 >= rule.length() || rule.charAt(p + 1) != '*') { |
|
1179 |
decisionPointStack.pop(); |
|
1180 |
} |
|
1181 |
||
1182 |
// but if the sequence repeats, we have a lot more work to do... |
|
1183 |
else { |
|
1184 |
||
1185 |
// now exitPoints and decisionPointList have to point to equivalent |
|
1186 |
// vectors, but not the SAME vector |
|
10110 | 1187 |
@SuppressWarnings("unchecked") |
1188 |
Vector<Integer> clone = (Vector<Integer>)decisionPointList.clone(); |
|
1189 |
exitPoints = clone; |
|
2 | 1190 |
|
1191 |
// pop the original decision point list off the stack |
|
10110 | 1192 |
Vector<Integer> temp = decisionPointStack.pop(); |
2 | 1193 |
|
1194 |
// we squirreled away the row number of our entry point list |
|
1195 |
// at the beginning of the original decision point list. Fish |
|
1196 |
// that state number out and retrieve the entry point list |
|
10110 | 1197 |
int tempStateNum = temp.firstElement().intValue(); |
1198 |
short[] tempState = tempStateTable.elementAt(tempStateNum); |
|
2 | 1199 |
|
1200 |
// merge the original decision point list with the current |
|
1201 |
// decision point list |
|
1202 |
for (int i = 0; i < decisionPointList.size(); i++) |
|
1203 |
temp.addElement(decisionPointList.elementAt(i)); |
|
1204 |
decisionPointList = temp; |
|
1205 |
||
1206 |
// finally, copy every forward reference from the entry point |
|
1207 |
// list into every state in the new decision point list |
|
1208 |
for (int i = 0; i < tempState.length; i++) { |
|
1209 |
if (tempState[i] > tempStateNum) { |
|
1210 |
updateStateTable(exitPoints, |
|
1211 |
new Character((char)(i + 0x100)).toString(), |
|
1212 |
tempState[i]); |
|
1213 |
} |
|
1214 |
} |
|
1215 |
||
1216 |
// update lastState and currentState, and throw away the * |
|
1217 |
lastState = currentState; |
|
1218 |
currentState = tempStateTable.size() - 1; |
|
1219 |
++p; |
|
1220 |
} |
|
1221 |
} |
|
1222 |
||
1223 |
// a / marks the position where the break is to go if the character sequence |
|
1224 |
// matches this rule. We update the flag word of every state on the decision |
|
1225 |
// point list to mark them as ending states, and take note of the fact that |
|
1226 |
// we've seen the slash |
|
1227 |
else if (c == '/') { |
|
1228 |
sawEarlyBreak = true; |
|
1229 |
for (int i = 0; i < decisionPointList.size(); i++) { |
|
10110 | 1230 |
state = tempStateTable.elementAt(decisionPointList. |
1231 |
elementAt(i).intValue()); |
|
2 | 1232 |
state[numCategories] |= LOOKAHEAD_STATE_FLAG; |
1233 |
} |
|
1234 |
} |
|
1235 |
||
1236 |
// if we get here without executing any of the above clauses, we have a |
|
1237 |
// syntax error. However, for now we just ignore the offending character |
|
1238 |
// and move on |
|
1239 |
||
1240 |
// clearLoopingStates is a signal back from updateStateTable() that we've |
|
1241 |
// transitioned to a state that won't loop back to the current looping |
|
1242 |
// state. (In other words, we've gotten to a point where we can no longer |
|
1243 |
// go back into a *? we saw earlier.) Clear out the list of looping states |
|
1244 |
// and backfill any states that need to be backfilled. |
|
1245 |
if (clearLoopingStates) { |
|
1246 |
setLoopingStates(null, decisionPointList); |
|
1247 |
} |
|
1248 |
||
1249 |
// advance to the next character, now that we've processed the current |
|
1250 |
// character |
|
1251 |
p += Character.charCount(c); |
|
1252 |
} |
|
1253 |
||
1254 |
// this takes care of backfilling any states that still need to be backfilled |
|
1255 |
setLoopingStates(null, decisionPointList); |
|
1256 |
||
1257 |
// when we reach the end of the string, we do a postprocessing step to mark the |
|
1258 |
// end states. The decision point list contains every state that can transition |
|
1259 |
// to the end state-- that is, every state that is the last state in a sequence |
|
1260 |
// that matches the rule. All of these states are considered "mark states" |
|
1261 |
// or "accepting states"-- that is, states that cause the position returned from |
|
1262 |
// next() to be updated. A mark state represents a possible break position. |
|
1263 |
// This allows us to look ahead and remember how far the rule matched |
|
1264 |
// before following the new branch (see next() for more information). |
|
1265 |
// The temporary state table has an extra "flag column" at the end where this |
|
1266 |
// information is stored. We mark the end states by setting a flag in their |
|
1267 |
// flag column. |
|
1268 |
// Now if we saw the / in the rule, then everything after it is lookahead |
|
1269 |
// material and the break really goes where the slash is. In this case, |
|
1270 |
// we mark these states as BOTH accepting states and lookahead states. This |
|
1271 |
// signals that these states cause the break position to be updated to the |
|
1272 |
// position of the slash rather than the current break position. |
|
1273 |
for (int i = 0; i < decisionPointList.size(); i++) { |
|
10110 | 1274 |
int rowNum = decisionPointList.elementAt(i).intValue(); |
1275 |
state = tempStateTable.elementAt(rowNum); |
|
2 | 1276 |
state[numCategories] |= END_STATE_FLAG; |
1277 |
if (sawEarlyBreak) { |
|
1278 |
state[numCategories] |= LOOKAHEAD_STATE_FLAG; |
|
1279 |
} |
|
1280 |
} |
|
1281 |
} |
|
1282 |
||
1283 |
||
1284 |
/** |
|
1285 |
* Update entries in the state table, and merge states when necessary to keep |
|
1286 |
* the table deterministic. |
|
1287 |
* @param rows The list of rows that need updating (the decision point list) |
|
1288 |
* @param pendingChars A character category list, encoded in a String. This is the |
|
1289 |
* list of the columns that need updating. |
|
1290 |
* @param newValue Update the cells specfied above to contain this value |
|
1291 |
*/ |
|
10110 | 1292 |
private void updateStateTable(Vector<Integer> rows, |
2 | 1293 |
String pendingChars, |
1294 |
short newValue) { |
|
1295 |
// create a dummy state that has the specified row number (newValue) in |
|
1296 |
// the cells that need to be updated (those specified by pendingChars) |
|
1297 |
// and 0 in the other cells |
|
1298 |
short[] newValues = new short[numCategories + 1]; |
|
1299 |
for (int i = 0; i < pendingChars.length(); i++) |
|
1300 |
newValues[(int)(pendingChars.charAt(i)) - 0x100] = newValue; |
|
1301 |
||
1302 |
// go through the list of rows to update, and update them by calling |
|
1303 |
// mergeStates() to merge them the the dummy state we created |
|
1304 |
for (int i = 0; i < rows.size(); i++) { |
|
10110 | 1305 |
mergeStates(rows.elementAt(i).intValue(), newValues, rows); |
2 | 1306 |
} |
1307 |
} |
|
1308 |
||
1309 |
/** |
|
1310 |
* The real work of making the state table deterministic happens here. This function |
|
1311 |
* merges a state in the state table (specified by rowNum) with a state that is |
|
1312 |
* passed in (newValues). The basic process is to copy the nonzero cells in newStates |
|
1313 |
* into the state in the state table (we'll call that oldValues). If there's a |
|
1314 |
* collision (i.e., if the same cell has a nonzero value in both states, and it's |
|
1315 |
* not the SAME value), then we have to reconcile the collision. We do this by |
|
1316 |
* creating a new state, adding it to the end of the state table, and using this |
|
1317 |
* function recursively to merge the original two states into a single, combined |
|
1318 |
* state. This process may happen recursively (i.e., each successive level may |
|
1319 |
* involve collisions). To prevent infinite recursion, we keep a log of merge |
|
1320 |
* operations. Any time we're merging two states we've merged before, we can just |
|
1321 |
* supply the row number for the result of that merge operation rather than creating |
|
1322 |
* a new state just like it. |
|
1323 |
* @param rowNum The row number in the state table of the state to be updated |
|
1324 |
* @param newValues The state to merge it with. |
|
1325 |
* @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable() |
|
1326 |
* (itself a copy of the decision point list from parseRule()). Newly-created |
|
1327 |
* states get added to the decision point list if their "parents" were on it. |
|
1328 |
*/ |
|
1329 |
private void mergeStates(int rowNum, |
|
1330 |
short[] newValues, |
|
10110 | 1331 |
Vector<Integer> rowsBeingUpdated) { |
1332 |
short[] oldValues = tempStateTable.elementAt(rowNum); |
|
2 | 1333 |
boolean isLoopingState = loopingStates.contains(new Integer(rowNum)); |
1334 |
||
1335 |
// for each of the cells in the rows we're reconciling, do... |
|
1336 |
for (int i = 0; i < oldValues.length; i++) { |
|
1337 |
||
1338 |
// if they contain the same value, we don't have to do anything |
|
1339 |
if (oldValues[i] == newValues[i]) { |
|
1340 |
continue; |
|
1341 |
} |
|
1342 |
||
1343 |
// if oldValues is a looping state and the state the current cell points to |
|
1344 |
// is too, then we can just stomp over the current value of that cell (and |
|
1345 |
// set the clear-looping-states flag if necessary) |
|
1346 |
else if (isLoopingState && loopingStates.contains(new Integer(oldValues[i]))) { |
|
1347 |
if (newValues[i] != 0) { |
|
1348 |
if (oldValues[i] == 0) { |
|
1349 |
clearLoopingStates = true; |
|
1350 |
} |
|
1351 |
oldValues[i] = newValues[i]; |
|
1352 |
} |
|
1353 |
} |
|
1354 |
||
1355 |
// if the current cell in oldValues is 0, copy in the corresponding value |
|
1356 |
// from newValues |
|
1357 |
else if (oldValues[i] == 0) { |
|
1358 |
oldValues[i] = newValues[i]; |
|
1359 |
} |
|
1360 |
||
1361 |
// the last column of each row is the flag column. Take care to merge the |
|
1362 |
// flag words correctly |
|
1363 |
else if (i == numCategories) { |
|
1364 |
oldValues[i] = (short)((newValues[i] & ALL_FLAGS) | oldValues[i]); |
|
1365 |
} |
|
1366 |
||
1367 |
// if both newValues and oldValues have a nonzero value in the current |
|
1368 |
// cell, and it isn't the same value both places... |
|
1369 |
else if (oldValues[i] != 0 && newValues[i] != 0) { |
|
1370 |
||
1371 |
// look up this pair of cell values in the merge list. If it's |
|
1372 |
// found, update the cell in oldValues to point to the merged state |
|
1373 |
int combinedRowNum = searchMergeList(oldValues[i], newValues[i]); |
|
1374 |
if (combinedRowNum != 0) { |
|
1375 |
oldValues[i] = (short)combinedRowNum; |
|
1376 |
} |
|
1377 |
||
1378 |
// otherwise, we have to reconcile them... |
|
1379 |
else { |
|
1380 |
// copy our row numbers into variables to make things easier |
|
1381 |
int oldRowNum = oldValues[i]; |
|
1382 |
int newRowNum = newValues[i]; |
|
1383 |
combinedRowNum = tempStateTable.size(); |
|
1384 |
||
1385 |
// add this pair of row numbers to the merge list (create it first |
|
1386 |
// if we haven't created the merge list yet) |
|
1387 |
if (mergeList == null) { |
|
10110 | 1388 |
mergeList = new Vector<>(); |
2 | 1389 |
} |
1390 |
mergeList.addElement(new int[] { oldRowNum, newRowNum, combinedRowNum }); |
|
1391 |
||
1392 |
// create a new row to represent the merged state, and copy the |
|
1393 |
// contents of oldRow into it, then add it to the end of the |
|
1394 |
// state table and update the original row (oldValues) to point |
|
1395 |
// to the new, merged, state |
|
1396 |
short[] newRow = new short[numCategories + 1]; |
|
10110 | 1397 |
short[] oldRow = tempStateTable.elementAt(oldRowNum); |
2 | 1398 |
System.arraycopy(oldRow, 0, newRow, 0, numCategories + 1); |
1399 |
tempStateTable.addElement(newRow); |
|
1400 |
oldValues[i] = (short)combinedRowNum; |
|
1401 |
||
1402 |
// if the decision point list contains either of the parent rows, |
|
1403 |
// update it to include the new row as well |
|
1404 |
if ((decisionPointList.contains(new Integer(oldRowNum)) |
|
1405 |
|| decisionPointList.contains(new Integer(newRowNum))) |
|
1406 |
&& !decisionPointList.contains(new Integer(combinedRowNum)) |
|
1407 |
) { |
|
1408 |
decisionPointList.addElement(new Integer(combinedRowNum)); |
|
1409 |
} |
|
1410 |
||
1411 |
// do the same thing with the list of rows being updated |
|
1412 |
if ((rowsBeingUpdated.contains(new Integer(oldRowNum)) |
|
1413 |
|| rowsBeingUpdated.contains(new Integer(newRowNum))) |
|
1414 |
&& !rowsBeingUpdated.contains(new Integer(combinedRowNum)) |
|
1415 |
) { |
|
1416 |
decisionPointList.addElement(new Integer(combinedRowNum)); |
|
1417 |
} |
|
1418 |
// now (groan) do the same thing for all the entries on the |
|
1419 |
// decision point stack |
|
1420 |
for (int k = 0; k < decisionPointStack.size(); k++) { |
|
10110 | 1421 |
Vector<Integer> dpl = decisionPointStack.elementAt(k); |
2 | 1422 |
if ((dpl.contains(new Integer(oldRowNum)) |
1423 |
|| dpl.contains(new Integer(newRowNum))) |
|
1424 |
&& !dpl.contains(new Integer(combinedRowNum)) |
|
1425 |
) { |
|
1426 |
dpl.addElement(new Integer(combinedRowNum)); |
|
1427 |
} |
|
1428 |
} |
|
1429 |
||
1430 |
// FINALLY (puff puff puff), call mergeStates() recursively to copy |
|
1431 |
// the row referred to by newValues into the new row and resolve any |
|
1432 |
// conflicts that come up at that level |
|
10110 | 1433 |
mergeStates(combinedRowNum, tempStateTable.elementAt( |
1434 |
newValues[i]), rowsBeingUpdated); |
|
2 | 1435 |
} |
1436 |
} |
|
1437 |
} |
|
1438 |
return; |
|
1439 |
} |
|
1440 |
||
1441 |
/** |
|
1442 |
* The merge list is a list of pairs of rows that have been merged somewhere in |
|
1443 |
* the process of building this state table, along with the row number of the |
|
1444 |
* row containing the merged state. This function looks up a pair of row numbers |
|
1445 |
* and returns the row number of the row they combine into. (It returns 0 if |
|
1446 |
* this pair of rows isn't in the merge list.) |
|
1447 |
*/ |
|
1448 |
private int searchMergeList(int a, int b) { |
|
1449 |
// if there is no merge list, there obviously isn't anything in it |
|
1450 |
if (mergeList == null) { |
|
1451 |
return 0; |
|
1452 |
} |
|
1453 |
||
1454 |
// otherwise, for each element in the merge list... |
|
1455 |
else { |
|
1456 |
int[] entry; |
|
1457 |
for (int i = 0; i < mergeList.size(); i++) { |
|
10110 | 1458 |
entry = mergeList.elementAt(i); |
2 | 1459 |
|
1460 |
// we have a hit if the two row numbers match the two row numbers |
|
1461 |
// in the beginning of the entry (the two that combine), in either |
|
1462 |
// order |
|
1463 |
if ((entry[0] == a && entry[1] == b) || (entry[0] == b && entry[1] == a)) { |
|
1464 |
return entry[2]; |
|
1465 |
} |
|
1466 |
||
1467 |
// we also have a hit if one of the two row numbers matches the marged |
|
1468 |
// row number and the other one matches one of the original row numbers |
|
1469 |
if ((entry[2] == a && (entry[0] == b || entry[1] == b))) { |
|
1470 |
return entry[2]; |
|
1471 |
} |
|
1472 |
if ((entry[2] == b && (entry[0] == a || entry[1] == a))) { |
|
1473 |
return entry[2]; |
|
1474 |
} |
|
1475 |
} |
|
1476 |
return 0; |
|
1477 |
} |
|
1478 |
} |
|
1479 |
||
1480 |
/** |
|
1481 |
* This function is used to update the list of current loooping states (i.e., |
|
1482 |
* states that are controlled by a *? construct). It backfills values from |
|
1483 |
* the looping states into unpopulated cells of the states that are currently |
|
1484 |
* marked for backfilling, and then updates the list of looping states to be |
|
1485 |
* the new list |
|
1486 |
* @param newLoopingStates The list of new looping states |
|
1487 |
* @param endStates The list of states to treat as end states (states that |
|
1488 |
* can exit the loop). |
|
1489 |
*/ |
|
10110 | 1490 |
private void setLoopingStates(Vector<Integer> newLoopingStates, |
1491 |
Vector<Integer> endStates) { |
|
2 | 1492 |
|
1493 |
// if the current list of looping states isn't empty, we have to backfill |
|
1494 |
// values from the looping states into the states that are waiting to be |
|
1495 |
// backfilled |
|
1496 |
if (!loopingStates.isEmpty()) { |
|
10110 | 1497 |
int loopingState = loopingStates.lastElement().intValue(); |
2 | 1498 |
int rowNum; |
1499 |
||
1500 |
// don't backfill into an end state OR any state reachable from an end state |
|
1501 |
// (since the search for reachable states is recursive, it's split out into |
|
1502 |
// a separate function, eliminateBackfillStates(), below) |
|
1503 |
for (int i = 0; i < endStates.size(); i++) { |
|
10110 | 1504 |
eliminateBackfillStates(endStates.elementAt(i).intValue()); |
2 | 1505 |
} |
1506 |
||
1507 |
// we DON'T actually backfill the states that need to be backfilled here. |
|
1508 |
// Instead, we MARK them for backfilling. The reason for this is that if |
|
1509 |
// there are multiple rules in the state-table description, the looping |
|
1510 |
// states may have some of their values changed by a succeeding rule, and |
|
1511 |
// this wouldn't be reflected in the backfilled states. We mark a state |
|
1512 |
// for backfilling by putting the row number of the state to copy from |
|
1513 |
// into the flag cell at the end of the row |
|
1514 |
for (int i = 0; i < statesToBackfill.size(); i++) { |
|
10110 | 1515 |
rowNum = statesToBackfill.elementAt(i).intValue(); |
1516 |
short[] state = tempStateTable.elementAt(rowNum); |
|
2 | 1517 |
state[numCategories] = |
1518 |
(short)((state[numCategories] & ALL_FLAGS) | loopingState); |
|
1519 |
} |
|
1520 |
statesToBackfill.removeAllElements(); |
|
1521 |
loopingStates.removeAllElements(); |
|
1522 |
} |
|
1523 |
||
1524 |
if (newLoopingStates != null) { |
|
10110 | 1525 |
@SuppressWarnings("unchecked") |
1526 |
Vector<Integer> clone = (Vector<Integer>)newLoopingStates.clone(); |
|
1527 |
loopingStates = clone; |
|
2 | 1528 |
} |
1529 |
} |
|
1530 |
||
1531 |
/** |
|
1532 |
* This removes "ending states" and states reachable from them from the |
|
1533 |
* list of states to backfill. |
|
1534 |
* @param The row number of the state to remove from the backfill list |
|
1535 |
*/ |
|
1536 |
private void eliminateBackfillStates(int baseState) { |
|
1537 |
||
1538 |
// don't do anything unless this state is actually in the backfill list... |
|
1539 |
if (statesToBackfill.contains(new Integer(baseState))) { |
|
1540 |
||
1541 |
// if it is, take it out |
|
1542 |
statesToBackfill.removeElement(new Integer(baseState)); |
|
1543 |
||
1544 |
// then go through and recursively call this function for every |
|
1545 |
// state that the base state points to |
|
10110 | 1546 |
short[] state = tempStateTable.elementAt(baseState); |
2 | 1547 |
for (int i = 0; i < numCategories; i++) { |
1548 |
if (state[i] != 0) { |
|
1549 |
eliminateBackfillStates(state[i]); |
|
1550 |
} |
|
1551 |
} |
|
1552 |
} |
|
1553 |
} |
|
1554 |
||
1555 |
/** |
|
1556 |
* This function completes the backfilling process by actually doing the |
|
1557 |
* backfilling on the states that are marked for it |
|
1558 |
*/ |
|
1559 |
private void backfillLoopingStates() { |
|
1560 |
short[] state; |
|
1561 |
short[] loopingState = null; |
|
1562 |
int loopingStateRowNum = 0; |
|
1563 |
int fromState; |
|
1564 |
||
1565 |
// for each state in the state table... |
|
1566 |
for (int i = 0; i < tempStateTable.size(); i++) { |
|
10110 | 1567 |
state = tempStateTable.elementAt(i); |
2 | 1568 |
|
1569 |
// check the state's flag word to see if it's marked for backfilling |
|
1570 |
// (it's marked for backfilling if any bits other than the two high-order |
|
1571 |
// bits are set-- if they are, then the flag word, minus the two high bits, |
|
1572 |
// is the row number to copy from) |
|
1573 |
fromState = state[numCategories] & ~ALL_FLAGS; |
|
1574 |
if (fromState > 0) { |
|
1575 |
||
1576 |
// load up the state to copy from (if we haven't already) |
|
1577 |
if (fromState != loopingStateRowNum) { |
|
1578 |
loopingStateRowNum = fromState; |
|
10110 | 1579 |
loopingState = tempStateTable.elementAt(loopingStateRowNum); |
2 | 1580 |
} |
1581 |
||
1582 |
// clear out the backfill part of the flag word |
|
1583 |
state[numCategories] &= ALL_FLAGS; |
|
1584 |
||
1585 |
// then fill all zero cells in the current state with values |
|
1586 |
// from the corresponding cells of the fromState |
|
1587 |
for (int j = 0; j < state.length; j++) { |
|
1588 |
if (state[j] == 0) { |
|
1589 |
state[j] = loopingState[j]; |
|
1590 |
} |
|
1591 |
else if (state[j] == DONT_LOOP_FLAG) { |
|
1592 |
state[j] = 0; |
|
1593 |
} |
|
1594 |
} |
|
1595 |
} |
|
1596 |
} |
|
1597 |
} |
|
1598 |
||
1599 |
/** |
|
1600 |
* This function completes the state-table-building process by doing several |
|
1601 |
* postprocessing steps and copying everything into its final resting place |
|
1602 |
* in the iterator itself |
|
1603 |
* @param forward True if we're working on the forward state table |
|
1604 |
*/ |
|
1605 |
private void finishBuildingStateTable(boolean forward) { |
|
1606 |
// start by backfilling the looping states |
|
1607 |
backfillLoopingStates(); |
|
1608 |
||
1609 |
int[] rowNumMap = new int[tempStateTable.size()]; |
|
10110 | 1610 |
Stack<Integer> rowsToFollow = new Stack<>(); |
2 | 1611 |
rowsToFollow.push(new Integer(1)); |
1612 |
rowNumMap[1] = 1; |
|
1613 |
||
1614 |
// determine which states are no longer reachable from the start state |
|
1615 |
// (the reachable states will have their row numbers in the row number |
|
1616 |
// map, and the nonreachable states will have zero in the row number map) |
|
1617 |
while (rowsToFollow.size() != 0) { |
|
10110 | 1618 |
int rowNum = rowsToFollow.pop().intValue(); |
1619 |
short[] row = tempStateTable.elementAt(rowNum); |
|
2 | 1620 |
|
1621 |
for (int i = 0; i < numCategories; i++) { |
|
1622 |
if (row[i] != 0) { |
|
1623 |
if (rowNumMap[row[i]] == 0) { |
|
1624 |
rowNumMap[row[i]] = row[i]; |
|
1625 |
rowsToFollow.push(new Integer(row[i])); |
|
1626 |
} |
|
1627 |
} |
|
1628 |
} |
|
1629 |
} |
|
1630 |
||
1631 |
boolean madeChange; |
|
1632 |
int newRowNum; |
|
1633 |
||
1634 |
// algorithm for minimizing the number of states in the table adapted from |
|
1635 |
// Aho & Ullman, "Principles of Compiler Design" |
|
1636 |
// The basic idea here is to organize the states into classes. When we're done, |
|
1637 |
// all states in the same class can be considered identical and all but one eliminated. |
|
1638 |
||
1639 |
// initially assign states to classes based on the number of populated cells they |
|
1640 |
// contain (the class number is the number of populated cells) |
|
1641 |
int[] stateClasses = new int[tempStateTable.size()]; |
|
1642 |
int nextClass = numCategories + 1; |
|
1643 |
short[] state1, state2; |
|
1644 |
for (int i = 1; i < stateClasses.length; i++) { |
|
1645 |
if (rowNumMap[i] == 0) { |
|
1646 |
continue; |
|
1647 |
} |
|
10110 | 1648 |
state1 = tempStateTable.elementAt(i); |
2 | 1649 |
for (int j = 0; j < numCategories; j++) { |
1650 |
if (state1[j] != 0) { |
|
1651 |
++stateClasses[i]; |
|
1652 |
} |
|
1653 |
} |
|
1654 |
if (stateClasses[i] == 0) { |
|
1655 |
stateClasses[i] = nextClass; |
|
1656 |
} |
|
1657 |
} |
|
1658 |
++nextClass; |
|
1659 |
||
1660 |
// then, for each class, elect the first member of that class as that class's |
|
1661 |
// "representative". For each member of the class, compare it to the "representative." |
|
1662 |
// If there's a column position where the state being tested transitions to a |
|
1663 |
// state in a DIFFERENT class from the class where the "representative" transitions, |
|
1664 |
// then move the state into a new class. Repeat this process until no new classes |
|
1665 |
// are created. |
|
1666 |
int currentClass; |
|
1667 |
int lastClass; |
|
1668 |
boolean split; |
|
1669 |
||
1670 |
do { |
|
1671 |
currentClass = 1; |
|
1672 |
lastClass = nextClass; |
|
1673 |
while (currentClass < nextClass) { |
|
1674 |
split = false; |
|
1675 |
state1 = state2 = null; |
|
1676 |
for (int i = 0; i < stateClasses.length; i++) { |
|
1677 |
if (stateClasses[i] == currentClass) { |
|
1678 |
if (state1 == null) { |
|
10110 | 1679 |
state1 = tempStateTable.elementAt(i); |
2 | 1680 |
} |
1681 |
else { |
|
10110 | 1682 |
state2 = tempStateTable.elementAt(i); |
2 | 1683 |
for (int j = 0; j < state2.length; j++) { |
1684 |
if ((j == numCategories && state1[j] != state2[j] && forward) |
|
1685 |
|| (j != numCategories && stateClasses[state1[j]] |
|
1686 |
!= stateClasses[state2[j]])) { |
|
1687 |
stateClasses[i] = nextClass; |
|
1688 |
split = true; |
|
1689 |
break; |
|
1690 |
} |
|
1691 |
} |
|
1692 |
} |
|
1693 |
} |
|
1694 |
} |
|
1695 |
if (split) { |
|
1696 |
++nextClass; |
|
1697 |
} |
|
1698 |
++currentClass; |
|
1699 |
} |
|
1700 |
} while (lastClass != nextClass); |
|
1701 |
||
1702 |
// at this point, all of the states in a class except the first one (the |
|
1703 |
//"representative") can be eliminated, so update the row-number map accordingly |
|
1704 |
int[] representatives = new int[nextClass]; |
|
1705 |
for (int i = 1; i < stateClasses.length; i++) |
|
1706 |
if (representatives[stateClasses[i]] == 0) { |
|
1707 |
representatives[stateClasses[i]] = i; |
|
1708 |
} |
|
1709 |
else { |
|
1710 |
rowNumMap[i] = representatives[stateClasses[i]]; |
|
1711 |
} |
|
1712 |
||
1713 |
// renumber all remaining rows... |
|
1714 |
// first drop all that are either unreferenced or not a class representative |
|
1715 |
for (int i = 1; i < rowNumMap.length; i++) { |
|
1716 |
if (rowNumMap[i] != i) { |
|
1717 |
tempStateTable.setElementAt(null, i); |
|
1718 |
} |
|
1719 |
} |
|
1720 |
||
1721 |
// then calculate everybody's new row number and update the row |
|
1722 |
// number map appropriately (the first pass updates the row numbers |
|
1723 |
// of all the class representatives [the rows we're keeping], and the |
|
1724 |
// second pass updates the cross references for all the rows that |
|
1725 |
// are being deleted) |
|
1726 |
newRowNum = 1; |
|
1727 |
for (int i = 1; i < rowNumMap.length; i++) { |
|
1728 |
if (tempStateTable.elementAt(i) != null) { |
|
1729 |
rowNumMap[i] = newRowNum++; |
|
1730 |
} |
|
1731 |
} |
|
1732 |
for (int i = 1; i < rowNumMap.length; i++) { |
|
1733 |
if (tempStateTable.elementAt(i) == null) { |
|
1734 |
rowNumMap[i] = rowNumMap[rowNumMap[i]]; |
|
1735 |
} |
|
1736 |
} |
|
1737 |
||
1738 |
// allocate the permanent state table, and copy the remaining rows into it |
|
1739 |
// (adjusting all the cell values, of course) |
|
1740 |
||
1741 |
// this section does that for the forward state table |
|
1742 |
if (forward) { |
|
1743 |
endStates = new boolean[newRowNum]; |
|
1744 |
lookaheadStates = new boolean[newRowNum]; |
|
1745 |
stateTable = new short[newRowNum * numCategories]; |
|
1746 |
int p = 0; |
|
1747 |
int p2 = 0; |
|
1748 |
for (int i = 0; i < tempStateTable.size(); i++) { |
|
10110 | 1749 |
short[] row = tempStateTable.elementAt(i); |
2 | 1750 |
if (row == null) { |
1751 |
continue; |
|
1752 |
} |
|
1753 |
for (int j = 0; j < numCategories; j++) { |
|
1754 |
stateTable[p] = (short)(rowNumMap[row[j]]); |
|
1755 |
++p; |
|
1756 |
} |
|
1757 |
endStates[p2] = ((row[numCategories] & END_STATE_FLAG) != 0); |
|
1758 |
lookaheadStates[p2] = ((row[numCategories] & LOOKAHEAD_STATE_FLAG) != 0); |
|
1759 |
++p2; |
|
1760 |
} |
|
1761 |
} |
|
1762 |
||
1763 |
// and this section does it for the backward state table |
|
1764 |
else { |
|
1765 |
backwardsStateTable = new short[newRowNum * numCategories]; |
|
1766 |
int p = 0; |
|
1767 |
for (int i = 0; i < tempStateTable.size(); i++) { |
|
10110 | 1768 |
short[] row = tempStateTable.elementAt(i); |
2 | 1769 |
if (row == null) { |
1770 |
continue; |
|
1771 |
} |
|
1772 |
for (int j = 0; j < numCategories; j++) { |
|
1773 |
backwardsStateTable[p] = (short)(rowNumMap[row[j]]); |
|
1774 |
++p; |
|
1775 |
} |
|
1776 |
} |
|
1777 |
} |
|
1778 |
} |
|
1779 |
||
1780 |
/** |
|
1781 |
* This function builds the backward state table from the forward state |
|
1782 |
* table and any additional rules (identified by the ! on the front) |
|
1783 |
* supplied in the description |
|
1784 |
*/ |
|
10110 | 1785 |
private void buildBackwardsStateTable(Vector<String> tempRuleList) { |
2 | 1786 |
|
1787 |
// create the temporary state table and seed it with two rows (row 0 |
|
1788 |
// isn't used for anything, and we have to create row 1 (the initial |
|
1789 |
// state) before we can do anything else |
|
10110 | 1790 |
tempStateTable = new Vector<>(); |
2 | 1791 |
tempStateTable.addElement(new short[numCategories + 1]); |
1792 |
tempStateTable.addElement(new short[numCategories + 1]); |
|
1793 |
||
1794 |
// although the backwards state table is built automatically from the forward |
|
1795 |
// state table, there are some situations (the default sentence-break rules, |
|
1796 |
// for example) where this doesn't yield enough stop states, causing a dramatic |
|
1797 |
// drop in performance. To help with these cases, the user may supply |
|
1798 |
// supplemental rules that are added to the backward state table. These have |
|
1799 |
// the same syntax as the normal break rules, but begin with '!' to distinguish |
|
1800 |
// them from normal break rules |
|
1801 |
for (int i = 0; i < tempRuleList.size(); i++) { |
|
10110 | 1802 |
String rule = tempRuleList.elementAt(i); |
2 | 1803 |
if (rule.charAt(0) == '!') { |
1804 |
parseRule(rule.substring(1), false); |
|
1805 |
} |
|
1806 |
} |
|
1807 |
backfillLoopingStates(); |
|
1808 |
||
1809 |
// Backwards iteration is qualitatively different from forwards iteration. |
|
1810 |
// This is because backwards iteration has to be made to operate from no context |
|
1811 |
// at all-- the user should be able to ask BreakIterator for the break position |
|
1812 |
// immediately on either side of some arbitrary offset in the text. The |
|
1813 |
// forward iteration table doesn't let us do that-- it assumes complete |
|
1814 |
// information on the context, which means starting from the beginning of the |
|
1815 |
// document. |
|
1816 |
// The way we do backward and random-access iteration is to back up from the |
|
1817 |
// current (or user-specified) position until we see something we're sure is |
|
1818 |
// a break position (it may not be the last break position immediately |
|
1819 |
// preceding our starting point, however). Then we roll forward from there to |
|
1820 |
// locate the actual break position we're after. |
|
1821 |
// This means that the backwards state table doesn't have to identify every |
|
1822 |
// break position, allowing the building algorithm to be much simpler. Here, |
|
1823 |
// we use a "pairs" approach, scanning the forward-iteration state table for |
|
1824 |
// pairs of character categories we ALWAYS break between, and building a state |
|
1825 |
// table from that information. No context is required-- all this state table |
|
1826 |
// looks at is a pair of adjacent characters. |
|
1827 |
||
1828 |
// It's possible that the user has supplied supplementary rules (see above). |
|
1829 |
// This has to be done first to keep parseRule() and friends from becoming |
|
1830 |
// EVEN MORE complicated. The automatically-generated states are appended |
|
1831 |
// onto the end of the state table, and then the two sets of rules are |
|
1832 |
// stitched together at the end. Take note of the row number of the |
|
1833 |
// first row of the auromatically-generated part. |
|
1834 |
int backTableOffset = tempStateTable.size(); |
|
1835 |
if (backTableOffset > 2) { |
|
1836 |
++backTableOffset; |
|
1837 |
} |
|
1838 |
||
1839 |
// the automatically-generated part of the table models a two-dimensional |
|
1840 |
// array where the two dimensions represent the two characters we're currently |
|
1841 |
// looking at. To model this as a state table, we actually need one additional |
|
1842 |
// row to represent the initial state. It gets populated with the row numbers |
|
1843 |
// of the other rows (in order). |
|
1844 |
for (int i = 0; i < numCategories + 1; i++) |
|
1845 |
tempStateTable.addElement(new short[numCategories + 1]); |
|
1846 |
||
10110 | 1847 |
short[] state = tempStateTable.elementAt(backTableOffset - 1); |
2 | 1848 |
for (int i = 0; i < numCategories; i++) |
1849 |
state[i] = (short)(i + backTableOffset); |
|
1850 |
||
1851 |
// scavenge the forward state table for pairs of character categories |
|
1852 |
// that always have a break between them. The algorithm is as follows: |
|
1853 |
// Look down each column in the state table. For each nonzero cell in |
|
1854 |
// that column, look up the row it points to. For each nonzero cell in |
|
1855 |
// that row, populate a cell in the backwards state table: the row number |
|
1856 |
// of that cell is the number of the column we were scanning (plus the |
|
1857 |
// offset that locates this sub-table), and the column number of that cell |
|
1858 |
// is the column number of the nonzero cell we just found. This cell is |
|
1859 |
// populated with its own column number (adjusted according to the actual |
|
1860 |
// location of the sub-table). This process will produce a state table |
|
1861 |
// whose behavior is the same as looking up successive pairs of characters |
|
1862 |
// in an array of Booleans to determine whether there is a break. |
|
1863 |
int numRows = stateTable.length / numCategories; |
|
1864 |
for (int column = 0; column < numCategories; column++) { |
|
1865 |
for (int row = 0; row < numRows; row++) { |
|
1866 |
int nextRow = lookupState(row, column); |
|
1867 |
if (nextRow != 0) { |
|
1868 |
for (int nextColumn = 0; nextColumn < numCategories; nextColumn++) { |
|
1869 |
int cellValue = lookupState(nextRow, nextColumn); |
|
1870 |
if (cellValue != 0) { |
|
10110 | 1871 |
state = tempStateTable.elementAt(nextColumn + |
2 | 1872 |
backTableOffset); |
1873 |
state[column] = (short)(column + backTableOffset); |
|
1874 |
} |
|
1875 |
} |
|
1876 |
} |
|
1877 |
} |
|
1878 |
} |
|
1879 |
||
1880 |
// if the user specified some backward-iteration rules with the ! token, |
|
1881 |
// we have to merge the resulting state table with the auto-generated one |
|
1882 |
// above. First copy the populated cells from row 1 over the populated |
|
1883 |
// cells in the auto-generated table. Then copy values from row 1 of the |
|
1884 |
// auto-generated table into all of the the unpopulated cells of the |
|
1885 |
// rule-based table. |
|
1886 |
if (backTableOffset > 1) { |
|
1887 |
||
1888 |
// for every row in the auto-generated sub-table, if a cell is |
|
1889 |
// populated that is also populated in row 1 of the rule-based |
|
1890 |
// sub-table, copy the value from row 1 over the value in the |
|
1891 |
// auto-generated sub-table |
|
10110 | 1892 |
state = tempStateTable.elementAt(1); |
2 | 1893 |
for (int i = backTableOffset - 1; i < tempStateTable.size(); i++) { |
10110 | 1894 |
short[] state2 = tempStateTable.elementAt(i); |
2 | 1895 |
for (int j = 0; j < numCategories; j++) { |
1896 |
if (state[j] != 0 && state2[j] != 0) { |
|
1897 |
state2[j] = state[j]; |
|
1898 |
} |
|
1899 |
} |
|
1900 |
} |
|
1901 |
||
1902 |
// now, for every row in the rule-based sub-table that is not |
|
1903 |
// an end state, fill in all unpopulated cells with the values |
|
1904 |
// of the corresponding cells in the first row of the auto- |
|
1905 |
// generated sub-table. |
|
10110 | 1906 |
state = tempStateTable.elementAt(backTableOffset - 1); |
2 | 1907 |
for (int i = 1; i < backTableOffset - 1; i++) { |
10110 | 1908 |
short[] state2 = tempStateTable.elementAt(i); |
2 | 1909 |
if ((state2[numCategories] & END_STATE_FLAG) == 0) { |
1910 |
for (int j = 0; j < numCategories; j++) { |
|
1911 |
if (state2[j] == 0) { |
|
1912 |
state2[j] = state[j]; |
|
1913 |
} |
|
1914 |
} |
|
1915 |
} |
|
1916 |
} |
|
1917 |
} |
|
1918 |
||
1919 |
// finally, clean everything up and copy it into the actual BreakIterator |
|
1920 |
// by calling finishBuildingStateTable() |
|
1921 |
finishBuildingStateTable(false); |
|
1922 |
} |
|
1923 |
||
1924 |
/** |
|
1925 |
* Given a current state and a character category, looks up the |
|
1926 |
* next state to transition to in the state table. |
|
1927 |
*/ |
|
1928 |
protected int lookupState(int state, int category) { |
|
1929 |
return stateTable[state * numCategories + category]; |
|
1930 |
} |
|
1931 |
||
1932 |
/** |
|
1933 |
* Throws an IllegalArgumentException representing a syntax error in the rule |
|
1934 |
* description. The exception's message contains some debugging information. |
|
1935 |
* @param message A message describing the problem |
|
1936 |
* @param position The position in the description where the problem was |
|
1937 |
* discovered |
|
1938 |
* @param context The string containing the error |
|
1939 |
*/ |
|
1940 |
protected void error(String message, int position, String context) { |
|
1941 |
throw new IllegalArgumentException("Parse error at position (" + position + "): " + message + "\n" + |
|
1942 |
context.substring(0, position) + " -here- " + context.substring(position)); |
|
1943 |
} |
|
1944 |
||
1945 |
void makeFile(String filename) { |
|
1946 |
writeTables(filename); |
|
1947 |
} |
|
1948 |
||
1949 |
/** |
|
1950 |
* Magic number for the BreakIterator data file format. |
|
1951 |
*/ |
|
1952 |
private static final byte[] LABEL = { |
|
1953 |
(byte)'B', (byte)'I', (byte)'d', (byte)'a', (byte)'t', (byte)'a', |
|
1954 |
(byte)'\0' |
|
1955 |
}; |
|
1956 |
||
1957 |
/** |
|
1958 |
* Version number of the dictionary that was read in. |
|
1959 |
*/ |
|
1960 |
private static final byte[] supportedVersion = { (byte)1 }; |
|
1961 |
||
1962 |
/** |
|
1963 |
* Header size in byte count |
|
1964 |
*/ |
|
1965 |
private static final int HEADER_LENGTH = 36; |
|
1966 |
||
1967 |
/** |
|
1968 |
* Array length of indices for BMP characters |
|
1969 |
*/ |
|
1970 |
private static final int BMP_INDICES_LENGTH = 512; |
|
1971 |
||
1972 |
/** |
|
1973 |
* Read datafile. The datafile's format is as follows: |
|
1974 |
* <pre> |
|
1975 |
* BreakIteratorData { |
|
1976 |
* u1 magic[7]; |
|
1977 |
* u1 version; |
|
1978 |
* u4 totalDataSize; |
|
1979 |
* header_info header; |
|
1980 |
* body value; |
|
1981 |
* } |
|
1982 |
* </pre> |
|
1983 |
* <code>totalDataSize</code> is the summation of the size of |
|
1984 |
* <code>header_info</code> and <code>body</code> in byte count. |
|
1985 |
* <p> |
|
1986 |
* In <code>header</code>, each field except for checksum implies the |
|
1987 |
* length of each field. Since <code>BMPdataLength</code> is a fixed-length |
|
1988 |
* data(512 entries), its length isn't included in <code>header</code>. |
|
1989 |
* <code>checksum</code> is a CRC32 value of all in <code>body</code>. |
|
1990 |
* <pre> |
|
1991 |
* header_info { |
|
1992 |
* u4 stateTableLength; |
|
1993 |
* u4 backwardsStateTableLength; |
|
1994 |
* u4 endStatesLength; |
|
1995 |
* u4 lookaheadStatesLength; |
|
1996 |
* u4 BMPdataLength; |
|
1997 |
* u4 nonBMPdataLength; |
|
1998 |
* u4 additionalDataLength; |
|
1999 |
* u8 checksum; |
|
2000 |
* } |
|
2001 |
* </pre> |
|
2002 |
* <p> |
|
2003 |
* |
|
2004 |
* Finally, <code>BMPindices</code> and <code>BMPdata</code> are set to |
|
2005 |
* <code>charCategoryTable</code>. <code>nonBMPdata</code> is set to |
|
2006 |
* <code>supplementaryCharCategoryTable</code>. |
|
2007 |
* <pre> |
|
2008 |
* body { |
|
2009 |
* u2 stateTable[stateTableLength]; |
|
2010 |
* u2 backwardsStateTable[backwardsStateTableLength]; |
|
2011 |
* u1 endStates[endStatesLength]; |
|
2012 |
* u1 lookaheadStates[lookaheadStatesLength]; |
|
2013 |
* u2 BMPindices[512]; |
|
2014 |
* u1 BMPdata[BMPdataLength]; |
|
2015 |
* u4 nonBMPdata[numNonBMPdataLength]; |
|
2016 |
* u1 additionalData[additionalDataLength]; |
|
2017 |
* } |
|
2018 |
* </pre> |
|
2019 |
*/ |
|
2020 |
protected void writeTables(String datafile) { |
|
2021 |
final String filename; |
|
2022 |
final String outputDir; |
|
2023 |
String tmpbuf = GenerateBreakIteratorData.getOutputDirectory(); |
|
2024 |
||
2025 |
if (tmpbuf.equals("")) { |
|
2026 |
filename = datafile; |
|
2027 |
outputDir = ""; |
|
2028 |
} else { |
|
2029 |
char sep = File.separatorChar; |
|
2030 |
if (sep == '/') { |
|
2031 |
outputDir = tmpbuf; |
|
2032 |
} else if (sep == '\\') { |
|
2033 |
outputDir = tmpbuf.replaceAll("/", "\\\\"); |
|
2034 |
} else { |
|
2035 |
outputDir = tmpbuf.replaceAll("/", String.valueOf(sep)); |
|
2036 |
} |
|
2037 |
||
2038 |
filename = outputDir + sep + datafile; |
|
2039 |
} |
|
2040 |
||
2041 |
try { |
|
2042 |
if (!outputDir.equals("")) { |
|
2043 |
new File(outputDir).mkdirs(); |
|
2044 |
} |
|
2045 |
BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(filename)); |
|
2046 |
||
2047 |
byte[] BMPdata = charCategoryTable.getStringArray(); |
|
2048 |
short[] BMPindices = charCategoryTable.getIndexArray(); |
|
2049 |
int[] nonBMPdata = supplementaryCharCategoryTable.getArray(); |
|
2050 |
||
2051 |
if (BMPdata.length <= 0) { |
|
2052 |
throw new InternalError("Wrong BMP data length(" + BMPdata.length + ")"); |
|
2053 |
} |
|
2054 |
if (BMPindices.length != BMP_INDICES_LENGTH) { |
|
2055 |
throw new InternalError("Wrong BMP indices length(" + BMPindices.length + ")"); |
|
2056 |
} |
|
2057 |
if (nonBMPdata.length <= 0) { |
|
2058 |
throw new InternalError("Wrong non-BMP data length(" + nonBMPdata.length + ")"); |
|
2059 |
} |
|
2060 |
||
2061 |
int len; |
|
2062 |
||
2063 |
/* Compute checksum */ |
|
2064 |
CRC32 crc32 = new CRC32(); |
|
2065 |
len = stateTable.length; |
|
2066 |
for (int i = 0; i < len; i++) { |
|
2067 |
crc32.update(stateTable[i]); |
|
2068 |
} |
|
2069 |
len = backwardsStateTable.length; |
|
2070 |
for (int i = 0; i < len; i++) { |
|
2071 |
crc32.update(backwardsStateTable[i]); |
|
2072 |
} |
|
2073 |
crc32.update(toByteArray(endStates)); |
|
2074 |
crc32.update(toByteArray(lookaheadStates)); |
|
2075 |
for (int i = 0; i < BMP_INDICES_LENGTH; i++) { |
|
2076 |
crc32.update(BMPindices[i]); |
|
2077 |
} |
|
2078 |
crc32.update(BMPdata); |
|
2079 |
len = nonBMPdata.length; |
|
2080 |
for (int i = 0; i < len; i++) { |
|
2081 |
crc32.update(nonBMPdata[i]); |
|
2082 |
} |
|
2083 |
if (additionalData != null) { |
|
2084 |
len = additionalData.length; |
|
2085 |
for (int i = 0; i < len; i++) { |
|
2086 |
crc32.update(additionalData[i]); |
|
2087 |
} |
|
2088 |
} |
|
2089 |
||
2090 |
/* First, write magic, version, and totalDataSize. */ |
|
2091 |
len = HEADER_LENGTH + |
|
2092 |
(stateTable.length + backwardsStateTable.length) * 2 + |
|
2093 |
endStates.length + lookaheadStates.length + 1024 + |
|
2094 |
BMPdata.length + nonBMPdata.length * 4 + |
|
2095 |
((additionalData == null) ? 0 : additionalData.length); |
|
2096 |
out.write(LABEL); |
|
2097 |
out.write(supportedVersion); |
|
2098 |
out.write(toByteArray(len)); |
|
2099 |
||
2100 |
/* Write header_info. */ |
|
2101 |
out.write(toByteArray(stateTable.length)); |
|
2102 |
out.write(toByteArray(backwardsStateTable.length)); |
|
2103 |
out.write(toByteArray(endStates.length)); |
|
2104 |
out.write(toByteArray(lookaheadStates.length)); |
|
2105 |
out.write(toByteArray(BMPdata.length)); |
|
2106 |
out.write(toByteArray(nonBMPdata.length)); |
|
2107 |
if (additionalData == null) { |
|
2108 |
out.write(toByteArray(0)); |
|
2109 |
} else { |
|
2110 |
out.write(toByteArray(additionalData.length)); |
|
2111 |
} |
|
2112 |
out.write(toByteArray(crc32.getValue())); |
|
2113 |
||
2114 |
/* Write stateTable[numCategories * numRows] */ |
|
2115 |
len = stateTable.length; |
|
2116 |
for (int i = 0; i < len; i++) { |
|
2117 |
out.write(toByteArray(stateTable[i])); |
|
2118 |
} |
|
2119 |
||
2120 |
/* Write backwardsStateTable[numCategories * numRows] */ |
|
2121 |
len = backwardsStateTable.length; |
|
2122 |
for (int i = 0; i < len; i++) { |
|
2123 |
out.write(toByteArray(backwardsStateTable[i])); |
|
2124 |
} |
|
2125 |
||
2126 |
/* Write endStates[numRows] */ |
|
2127 |
out.write(toByteArray(endStates)); |
|
2128 |
||
2129 |
/* Write lookaheadStates[numRows] */ |
|
2130 |
out.write(toByteArray(lookaheadStates)); |
|
2131 |
||
2132 |
for (int i = 0; i < BMP_INDICES_LENGTH; i++) { |
|
2133 |
out.write(toByteArray(BMPindices[i])); |
|
2134 |
} |
|
2135 |
BMPindices = null; |
|
2136 |
out.write(BMPdata); |
|
2137 |
BMPdata = null; |
|
2138 |
||
2139 |
/* Write a category table for non-BMP characters. */ |
|
2140 |
len = nonBMPdata.length; |
|
2141 |
for (int i = 0; i < len; i++) { |
|
2142 |
out.write(toByteArray(nonBMPdata[i])); |
|
2143 |
} |
|
2144 |
nonBMPdata = null; |
|
2145 |
||
2146 |
/* Write additional data */ |
|
2147 |
if (additionalData != null) { |
|
2148 |
out.write(additionalData); |
|
2149 |
} |
|
2150 |
||
2151 |
out.close(); |
|
2152 |
} |
|
2153 |
catch (Exception e) { |
|
2154 |
throw new InternalError(e.toString()); |
|
2155 |
} |
|
2156 |
} |
|
2157 |
||
2158 |
byte[] toByteArray(short val) { |
|
2159 |
byte[] buf = new byte[2]; |
|
2160 |
buf[0] = (byte)((val>>>8) & 0xFF); |
|
2161 |
buf[1] = (byte)(val & 0xFF); |
|
2162 |
return buf; |
|
2163 |
} |
|
2164 |
||
2165 |
byte[] toByteArray(int val) { |
|
2166 |
byte[] buf = new byte[4]; |
|
2167 |
buf[0] = (byte)((val>>>24) & 0xFF); |
|
2168 |
buf[1] = (byte)((val>>>16) & 0xFF); |
|
2169 |
buf[2] = (byte)((val>>>8) & 0xFF); |
|
2170 |
buf[3] = (byte)(val & 0xFF); |
|
2171 |
return buf; |
|
2172 |
} |
|
2173 |
||
2174 |
byte[] toByteArray(long val) { |
|
2175 |
byte[] buf = new byte[8]; |
|
2176 |
buf[0] = (byte)((val>>>56) & 0xff); |
|
2177 |
buf[1] = (byte)((val>>>48) & 0xff); |
|
2178 |
buf[2] = (byte)((val>>>40) & 0xff); |
|
2179 |
buf[3] = (byte)((val>>>32) & 0xff); |
|
2180 |
buf[4] = (byte)((val>>>24) & 0xff); |
|
2181 |
buf[5] = (byte)((val>>>16) & 0xff); |
|
2182 |
buf[6] = (byte)((val>>>8) & 0xff); |
|
2183 |
buf[7] = (byte)(val & 0xff); |
|
2184 |
return buf; |
|
2185 |
} |
|
2186 |
||
2187 |
byte[] toByteArray(boolean[] data) { |
|
2188 |
byte[] buf = new byte[data.length]; |
|
2189 |
for (int i = 0; i < data.length; i++) { |
|
2190 |
buf[i] = data[i] ? (byte)1 : (byte)0; |
|
2191 |
} |
|
2192 |
return buf; |
|
2193 |
} |
|
2194 |
||
2195 |
void setAdditionalData(byte[] data) { |
|
2196 |
additionalData = data; |
|
2197 |
} |
|
2198 |
} |