jdk-sandbox: jdk/src/share/classes/sun/text/resources/BreakIteratorRules.java@202f599c92aa (annotated)

2 90ce3da70b43 Initial load duke parents: diff changeset	1	/*
5506 202f599c92aa 6943119: Rebrand source copyright notices ohair parents: 2 diff changeset	2	* Copyright (c) 1999, 2007, Oracle and/or its affiliates. All rights reserved.
2 90ce3da70b43 Initial load duke parents: diff changeset	3	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
90ce3da70b43 Initial load duke parents: diff changeset	4	*
90ce3da70b43 Initial load duke parents: diff changeset	5	* This code is free software; you can redistribute it and/or modify it
90ce3da70b43 Initial load duke parents: diff changeset	6	* under the terms of the GNU General Public License version 2 only, as
5506 202f599c92aa 6943119: Rebrand source copyright notices ohair parents: 2 diff changeset	7	* published by the Free Software Foundation. Oracle designates this
2 90ce3da70b43 Initial load duke parents: diff changeset	8	* particular file as subject to the "Classpath" exception as provided
5506 202f599c92aa 6943119: Rebrand source copyright notices ohair parents: 2 diff changeset	9	* by Oracle in the LICENSE file that accompanied this code.
2 90ce3da70b43 Initial load duke parents: diff changeset	10	*
90ce3da70b43 Initial load duke parents: diff changeset	11	* This code is distributed in the hope that it will be useful, but WITHOUT
90ce3da70b43 Initial load duke parents: diff changeset	12	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
90ce3da70b43 Initial load duke parents: diff changeset	13	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
90ce3da70b43 Initial load duke parents: diff changeset	14	* version 2 for more details (a copy is included in the LICENSE file that
90ce3da70b43 Initial load duke parents: diff changeset	15	* accompanied this code).
90ce3da70b43 Initial load duke parents: diff changeset	16	*
90ce3da70b43 Initial load duke parents: diff changeset	17	* You should have received a copy of the GNU General Public License version
90ce3da70b43 Initial load duke parents: diff changeset	18	* 2 along with this work; if not, write to the Free Software Foundation,
90ce3da70b43 Initial load duke parents: diff changeset	19	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
90ce3da70b43 Initial load duke parents: diff changeset	20	*
5506 202f599c92aa 6943119: Rebrand source copyright notices ohair parents: 2 diff changeset	21	* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
202f599c92aa 6943119: Rebrand source copyright notices ohair parents: 2 diff changeset	22	* or visit www.oracle.com if you need additional information or have any
202f599c92aa 6943119: Rebrand source copyright notices ohair parents: 2 diff changeset	23	* questions.
2 90ce3da70b43 Initial load duke parents: diff changeset	24	*/
90ce3da70b43 Initial load duke parents: diff changeset	25
90ce3da70b43 Initial load duke parents: diff changeset	26	/*
90ce3da70b43 Initial load duke parents: diff changeset	27	*/
90ce3da70b43 Initial load duke parents: diff changeset	28
90ce3da70b43 Initial load duke parents: diff changeset	29	/*
90ce3da70b43 Initial load duke parents: diff changeset	30	* Licensed Materials - Property of IBM
90ce3da70b43 Initial load duke parents: diff changeset	31	*
90ce3da70b43 Initial load duke parents: diff changeset	32	* (C) Copyright IBM Corp. 1999 All Rights Reserved.
90ce3da70b43 Initial load duke parents: diff changeset	33	* (C) IBM Corp. 1997-1998. All Rights Reserved.
90ce3da70b43 Initial load duke parents: diff changeset	34	*
90ce3da70b43 Initial load duke parents: diff changeset	35	* The program is provided "as is" without any warranty express or
90ce3da70b43 Initial load duke parents: diff changeset	36	* implied, including the warranty of non-infringement and the implied
90ce3da70b43 Initial load duke parents: diff changeset	37	* warranties of merchantibility and fitness for a particular purpose.
90ce3da70b43 Initial load duke parents: diff changeset	38	* IBM will not be liable for any damages suffered by you as a result
90ce3da70b43 Initial load duke parents: diff changeset	39	* of using the Program. In no event will IBM be liable for any
90ce3da70b43 Initial load duke parents: diff changeset	40	* special, indirect or consequential damages or lost profits even if
90ce3da70b43 Initial load duke parents: diff changeset	41	* IBM has been advised of the possibility of their occurrence. IBM
90ce3da70b43 Initial load duke parents: diff changeset	42	* will not be liable for any third party claims against you.
90ce3da70b43 Initial load duke parents: diff changeset	43	*/
90ce3da70b43 Initial load duke parents: diff changeset	44
90ce3da70b43 Initial load duke parents: diff changeset	45	package sun.text.resources;
90ce3da70b43 Initial load duke parents: diff changeset	46
90ce3da70b43 Initial load duke parents: diff changeset	47	import java.util.ListResourceBundle;
90ce3da70b43 Initial load duke parents: diff changeset	48
90ce3da70b43 Initial load duke parents: diff changeset	49	/**
90ce3da70b43 Initial load duke parents: diff changeset	50	* Default break-iterator rules. These rules are more or less general for
90ce3da70b43 Initial load duke parents: diff changeset	51	* all locales, although there are probably a few we're missing. The
90ce3da70b43 Initial load duke parents: diff changeset	52	* behavior currently mimics the behavior of BreakIterator in JDK 1.2.
90ce3da70b43 Initial load duke parents: diff changeset	53	* There are known deficiencies in this behavior, including the fact that
90ce3da70b43 Initial load duke parents: diff changeset	54	* the logic for handling CJK characters works for Japanese but not for
90ce3da70b43 Initial load duke parents: diff changeset	55	* Chinese, and that we don't currently have an appropriate locale for
90ce3da70b43 Initial load duke parents: diff changeset	56	* Thai. The resources will eventually be updated to fix these problems.
90ce3da70b43 Initial load duke parents: diff changeset	57	*/
90ce3da70b43 Initial load duke parents: diff changeset	58
90ce3da70b43 Initial load duke parents: diff changeset	59	/* Modified for Hindi 3/1/99. */
90ce3da70b43 Initial load duke parents: diff changeset	60
90ce3da70b43 Initial load duke parents: diff changeset	61	/*
90ce3da70b43 Initial load duke parents: diff changeset	62	* Since JDK 1.5.0, this file no longer goes to runtime and is used at J2SE
90ce3da70b43 Initial load duke parents: diff changeset	63	* build phase in order to create [Character\|Word\|Line\|Sentence]BreakIteratorData
90ce3da70b43 Initial load duke parents: diff changeset	64	* files which are used at runtime instead.
90ce3da70b43 Initial load duke parents: diff changeset	65	*/
90ce3da70b43 Initial load duke parents: diff changeset	66
90ce3da70b43 Initial load duke parents: diff changeset	67	public class BreakIteratorRules extends ListResourceBundle {
90ce3da70b43 Initial load duke parents: diff changeset	68	protected final Object[][] getContents() {
90ce3da70b43 Initial load duke parents: diff changeset	69	return new Object[][] {
90ce3da70b43 Initial load duke parents: diff changeset	70	// rules describing how to break between logical characters
90ce3da70b43 Initial load duke parents: diff changeset	71	{ "CharacterBreakRules",
90ce3da70b43 Initial load duke parents: diff changeset	72
90ce3da70b43 Initial load duke parents: diff changeset	73	// ignore non-spacing marks and enclosing marks (since we never
90ce3da70b43 Initial load duke parents: diff changeset	74	// put a break before ignore characters, this keeps combining
90ce3da70b43 Initial load duke parents: diff changeset	75	// accents with the base characters they modify)
90ce3da70b43 Initial load duke parents: diff changeset	76	"<enclosing>=[:Mn::Me:];"
90ce3da70b43 Initial load duke parents: diff changeset	77
90ce3da70b43 Initial load duke parents: diff changeset	78	// other category definitions
90ce3da70b43 Initial load duke parents: diff changeset	79	+ "<choseong>=[\u1100-\u115f];"
90ce3da70b43 Initial load duke parents: diff changeset	80	+ "<jungseong>=[\u1160-\u11a7];"
90ce3da70b43 Initial load duke parents: diff changeset	81	+ "<jongseong>=[\u11a8-\u11ff];"
90ce3da70b43 Initial load duke parents: diff changeset	82	+ "<surr-hi>=[\ud800-\udbff];"
90ce3da70b43 Initial load duke parents: diff changeset	83	+ "<surr-lo>=[\udc00-\udfff];"
90ce3da70b43 Initial load duke parents: diff changeset	84
90ce3da70b43 Initial load duke parents: diff changeset	85	// break after every character, except as follows:
90ce3da70b43 Initial load duke parents: diff changeset	86	+ ".;"
90ce3da70b43 Initial load duke parents: diff changeset	87
90ce3da70b43 Initial load duke parents: diff changeset	88	// keep base and combining characters together
90ce3da70b43 Initial load duke parents: diff changeset	89	+ "<base>=[^<enclosing>^[:Cc::Cf::Zl::Zp:]];"
90ce3da70b43 Initial load duke parents: diff changeset	90	+ "<base><enclosing><enclosing>*;"
90ce3da70b43 Initial load duke parents: diff changeset	91
90ce3da70b43 Initial load duke parents: diff changeset	92	// keep CRLF sequences together
90ce3da70b43 Initial load duke parents: diff changeset	93	+ "\r\n;"
90ce3da70b43 Initial load duke parents: diff changeset	94
90ce3da70b43 Initial load duke parents: diff changeset	95	// keep surrogate pairs together
90ce3da70b43 Initial load duke parents: diff changeset	96	+ "<surr-hi><surr-lo>;"
90ce3da70b43 Initial load duke parents: diff changeset	97
90ce3da70b43 Initial load duke parents: diff changeset	98	// keep Hangul syllables spelled out using conjoining jamo together
90ce3da70b43 Initial load duke parents: diff changeset	99	+ "<choseong><jungseong><jongseong>*;"
90ce3da70b43 Initial load duke parents: diff changeset	100
90ce3da70b43 Initial load duke parents: diff changeset	101	// various additions for Hindi support
90ce3da70b43 Initial load duke parents: diff changeset	102	+ "<nukta>=[\u093c];"
90ce3da70b43 Initial load duke parents: diff changeset	103	+ "<danda>=[\u0964\u0965];"
90ce3da70b43 Initial load duke parents: diff changeset	104	+ "<virama>=[\u094d];"
90ce3da70b43 Initial load duke parents: diff changeset	105	+ "<devVowelSign>=[\u093e-\u094c\u0962\u0963];"
90ce3da70b43 Initial load duke parents: diff changeset	106	+ "<devConsonant>=[\u0915-\u0939];"
90ce3da70b43 Initial load duke parents: diff changeset	107	+ "<devNuktaConsonant>=[\u0958-\u095f];"
90ce3da70b43 Initial load duke parents: diff changeset	108	+ "<devCharEnd>=[\u0902\u0903\u0951-\u0954];"
90ce3da70b43 Initial load duke parents: diff changeset	109	+ "<devCAMN>=(<devConsonant>{<nukta>});"
90ce3da70b43 Initial load duke parents: diff changeset	110	+ "<devConsonant1>=(<devNuktaConsonant>\|<devCAMN>);"
90ce3da70b43 Initial load duke parents: diff changeset	111	+ "<zwj>=[\u200d];"
90ce3da70b43 Initial load duke parents: diff changeset	112	+ "<devConjunct>=({<devConsonant1><virama>{<zwj>}}<devConsonant1>);"
90ce3da70b43 Initial load duke parents: diff changeset	113	+ "<devConjunct>{<devVowelSign>}{<devCharEnd>};"
90ce3da70b43 Initial load duke parents: diff changeset	114	+ "<danda><nukta>;"
90ce3da70b43 Initial load duke parents: diff changeset	115	},
90ce3da70b43 Initial load duke parents: diff changeset	116
90ce3da70b43 Initial load duke parents: diff changeset	117	// default rules for finding word boundaries
90ce3da70b43 Initial load duke parents: diff changeset	118	{ "WordBreakRules",
90ce3da70b43 Initial load duke parents: diff changeset	119	// ignore non-spacing marks, enclosing marks, and format characters,
90ce3da70b43 Initial load duke parents: diff changeset	120	// all of which should not influence the algorithm
90ce3da70b43 Initial load duke parents: diff changeset	121	//"<ignore>=[:Mn::Me::Cf:];"
90ce3da70b43 Initial load duke parents: diff changeset	122	"<ignore>=[:Cf:];"
90ce3da70b43 Initial load duke parents: diff changeset	123
90ce3da70b43 Initial load duke parents: diff changeset	124	+ "<enclosing>=[:Mn::Me:];"
90ce3da70b43 Initial load duke parents: diff changeset	125
90ce3da70b43 Initial load duke parents: diff changeset	126	// Hindi phrase separator, kanji, katakana, hiragana, CJK diacriticals,
90ce3da70b43 Initial load duke parents: diff changeset	127	// other letters, and digits
90ce3da70b43 Initial load duke parents: diff changeset	128	+ "<danda>=[\u0964\u0965];"
90ce3da70b43 Initial load duke parents: diff changeset	129	+ "<kanji>=[\u3005\u4e00-\u9fa5\uf900-\ufa2d];"
90ce3da70b43 Initial load duke parents: diff changeset	130	+ "<kata>=[\u30a1-\u30fa\u30fd\u30fe];"
90ce3da70b43 Initial load duke parents: diff changeset	131	+ "<hira>=[\u3041-\u3094\u309d\u309e];"
90ce3da70b43 Initial load duke parents: diff changeset	132	+ "<cjk-diacrit>=[\u3099-\u309c\u30fb\u30fc];"
90ce3da70b43 Initial load duke parents: diff changeset	133	+ "<letter-base>=[:L::Mc:^[<kanji><kata><hira><cjk-diacrit>]];"
90ce3da70b43 Initial load duke parents: diff changeset	134	+ "<let>=(<letter-base><enclosing>*);"
90ce3da70b43 Initial load duke parents: diff changeset	135	+ "<digit-base>=[:N:];"
90ce3da70b43 Initial load duke parents: diff changeset	136	+ "<dgt>=(<digit-base><enclosing>*);"
90ce3da70b43 Initial load duke parents: diff changeset	137
90ce3da70b43 Initial load duke parents: diff changeset	138	// punctuation that can occur in the middle of a word: currently
90ce3da70b43 Initial load duke parents: diff changeset	139	// dashes, apostrophes, quotation marks, and periods
90ce3da70b43 Initial load duke parents: diff changeset	140	+ "<mid-word>=[:Pd::Pc:\u00ad\u2027\\\"\\\'\\.];"
90ce3da70b43 Initial load duke parents: diff changeset	141
90ce3da70b43 Initial load duke parents: diff changeset	142	// punctuation that can occur in the middle of a number: currently
90ce3da70b43 Initial load duke parents: diff changeset	143	// apostrophes, quotation marks, periods, commas, and the Arabic
90ce3da70b43 Initial load duke parents: diff changeset	144	// decimal point
90ce3da70b43 Initial load duke parents: diff changeset	145	+ "<mid-num>=[\\\"\\\'\\,\u066b\\.];"
90ce3da70b43 Initial load duke parents: diff changeset	146
90ce3da70b43 Initial load duke parents: diff changeset	147	// punctuation that can occur at the beginning of a number: currently
90ce3da70b43 Initial load duke parents: diff changeset	148	// the period, the number sign, and all currency symbols except the cents sign
90ce3da70b43 Initial load duke parents: diff changeset	149	+ "<pre-num>=[:Sc:\\#\\.^\u00a2];"
90ce3da70b43 Initial load duke parents: diff changeset	150
90ce3da70b43 Initial load duke parents: diff changeset	151	// punctuation that can occur at the end of a number: currently
90ce3da70b43 Initial load duke parents: diff changeset	152	// the percent, per-thousand, per-ten-thousand, and Arabic percent
90ce3da70b43 Initial load duke parents: diff changeset	153	// signs, the cents sign, and the ampersand
90ce3da70b43 Initial load duke parents: diff changeset	154	+ "<post-num>=[\\%\\&\u00a2\u066a\u2030\u2031];"
90ce3da70b43 Initial load duke parents: diff changeset	155
90ce3da70b43 Initial load duke parents: diff changeset	156	// line separators: currently LF, FF, PS, and LS
90ce3da70b43 Initial load duke parents: diff changeset	157	+ "<ls>=[\n\u000c\u2028\u2029];"
90ce3da70b43 Initial load duke parents: diff changeset	158
90ce3da70b43 Initial load duke parents: diff changeset	159	// whitespace: all space separators and the tab character
90ce3da70b43 Initial load duke parents: diff changeset	160	+ "<ws-base>=[:Zs:\t];"
90ce3da70b43 Initial load duke parents: diff changeset	161	+ "<ws>=(<ws-base><enclosing>*);"
90ce3da70b43 Initial load duke parents: diff changeset	162
90ce3da70b43 Initial load duke parents: diff changeset	163	// a word is a sequence of letters that may contain internal
90ce3da70b43 Initial load duke parents: diff changeset	164	// punctuation, as long as it begins and ends with a letter and
90ce3da70b43 Initial load duke parents: diff changeset	165	// never contains two punctuation marks in a row
90ce3da70b43 Initial load duke parents: diff changeset	166	+ "<word>=((<let><let>(<mid-word><let><let>)*){<danda>});"
90ce3da70b43 Initial load duke parents: diff changeset	167
90ce3da70b43 Initial load duke parents: diff changeset	168	// a number is a sequence of digits that may contain internal
90ce3da70b43 Initial load duke parents: diff changeset	169	// punctuation, as long as it begins and ends with a digit and
90ce3da70b43 Initial load duke parents: diff changeset	170	// never contains two punctuation marks in a row.
90ce3da70b43 Initial load duke parents: diff changeset	171	+ "<number>=(<dgt><dgt>(<mid-num><dgt><dgt>)*);"
90ce3da70b43 Initial load duke parents: diff changeset	172
90ce3da70b43 Initial load duke parents: diff changeset	173	// break after every character, with the following exceptions
90ce3da70b43 Initial load duke parents: diff changeset	174	// (this will cause punctuation marks that aren't considered
90ce3da70b43 Initial load duke parents: diff changeset	175	// part of words or numbers to be treated as words unto themselves)
90ce3da70b43 Initial load duke parents: diff changeset	176	+ ".;"
90ce3da70b43 Initial load duke parents: diff changeset	177
90ce3da70b43 Initial load duke parents: diff changeset	178	// keep together any sequence of contiguous words and numbers
90ce3da70b43 Initial load duke parents: diff changeset	179	// (including just one of either), plus an optional trailing
90ce3da70b43 Initial load duke parents: diff changeset	180	// number-suffix character
90ce3da70b43 Initial load duke parents: diff changeset	181	+ "{<word>}(<number><word>)*{<number>{<post-num>}};"
90ce3da70b43 Initial load duke parents: diff changeset	182
90ce3da70b43 Initial load duke parents: diff changeset	183	// keep together any sequence of contiguous words and numbers
90ce3da70b43 Initial load duke parents: diff changeset	184	// that starts with a number-prefix character and a number,
90ce3da70b43 Initial load duke parents: diff changeset	185	// and may end with a number-suffix character
90ce3da70b43 Initial load duke parents: diff changeset	186	+ "<pre-num>(<number><word>)*{<number>{<post-num>}};"
90ce3da70b43 Initial load duke parents: diff changeset	187
90ce3da70b43 Initial load duke parents: diff changeset	188	// keep together runs of whitespace (optionally with a single trailing
90ce3da70b43 Initial load duke parents: diff changeset	189	// line separator or CRLF sequence)
90ce3da70b43 Initial load duke parents: diff changeset	190	+ "<ws>*{\r}{<ls>};"
90ce3da70b43 Initial load duke parents: diff changeset	191
90ce3da70b43 Initial load duke parents: diff changeset	192	// keep together runs of Katakana and CJK diacritical marks
90ce3da70b43 Initial load duke parents: diff changeset	193	+ "[<kata><cjk-diacrit>]*;"
90ce3da70b43 Initial load duke parents: diff changeset	194
90ce3da70b43 Initial load duke parents: diff changeset	195	// keep together runs of Hiragana and CJK diacritical marks
90ce3da70b43 Initial load duke parents: diff changeset	196	+ "[<hira><cjk-diacrit>]*;"
90ce3da70b43 Initial load duke parents: diff changeset	197
90ce3da70b43 Initial load duke parents: diff changeset	198	// keep together runs of Kanji
90ce3da70b43 Initial load duke parents: diff changeset	199	+ "<kanji>*;"
90ce3da70b43 Initial load duke parents: diff changeset	200
90ce3da70b43 Initial load duke parents: diff changeset	201	// keep together anything else and an enclosing mark
90ce3da70b43 Initial load duke parents: diff changeset	202	+ "<base>=[^<enclosing>^[:Cc::Cf::Zl::Zp:]];"
90ce3da70b43 Initial load duke parents: diff changeset	203	+ "<base><enclosing><enclosing>*;"
90ce3da70b43 Initial load duke parents: diff changeset	204	},
90ce3da70b43 Initial load duke parents: diff changeset	205
90ce3da70b43 Initial load duke parents: diff changeset	206	// default rules for determining legal line-breaking positions
90ce3da70b43 Initial load duke parents: diff changeset	207	{ "LineBreakRules",
90ce3da70b43 Initial load duke parents: diff changeset	208	// characters that always cause a break: ETX, tab, LF, FF, LS, and PS
90ce3da70b43 Initial load duke parents: diff changeset	209	"<break>=[\u0003\t\n\f\u2028\u2029];"
90ce3da70b43 Initial load duke parents: diff changeset	210
90ce3da70b43 Initial load duke parents: diff changeset	211	// ignore format characters and control characters EXCEPT for breaking chars
90ce3da70b43 Initial load duke parents: diff changeset	212	+ "<ignore>=[:Cf:[:Cc:^[<break>\r]]];"
90ce3da70b43 Initial load duke parents: diff changeset	213
90ce3da70b43 Initial load duke parents: diff changeset	214	// enclosing marks
90ce3da70b43 Initial load duke parents: diff changeset	215	+ "<enclosing>=[:Mn::Me:];"
90ce3da70b43 Initial load duke parents: diff changeset	216
90ce3da70b43 Initial load duke parents: diff changeset	217	// Hindi phrase separators
90ce3da70b43 Initial load duke parents: diff changeset	218	+ "<danda>=[\u0964\u0965];"
90ce3da70b43 Initial load duke parents: diff changeset	219
90ce3da70b43 Initial load duke parents: diff changeset	220	// characters that always prevent a break: the non-breaking space
90ce3da70b43 Initial load duke parents: diff changeset	221	// and similar characters
90ce3da70b43 Initial load duke parents: diff changeset	222	+ "<glue>=[\u00a0\u0f0c\u2007\u2011\u202f\ufeff];"
90ce3da70b43 Initial load duke parents: diff changeset	223
90ce3da70b43 Initial load duke parents: diff changeset	224	// whitespace: space separators and control characters, except for
90ce3da70b43 Initial load duke parents: diff changeset	225	// CR and the other characters mentioned above
90ce3da70b43 Initial load duke parents: diff changeset	226	+ "<space>=[:Zs::Cc:^[<glue><break>\r]];"
90ce3da70b43 Initial load duke parents: diff changeset	227
90ce3da70b43 Initial load duke parents: diff changeset	228	// dashes: dash punctuation and the discretionary hyphen, except for
90ce3da70b43 Initial load duke parents: diff changeset	229	// non-breaking hyphens
90ce3da70b43 Initial load duke parents: diff changeset	230	+ "<dash>=[:Pd:\u00ad^<glue>];"
90ce3da70b43 Initial load duke parents: diff changeset	231
90ce3da70b43 Initial load duke parents: diff changeset	232	// characters that stick to a word if they precede it: currency symbols
90ce3da70b43 Initial load duke parents: diff changeset	233	// (except the cents sign) and starting punctuation
90ce3da70b43 Initial load duke parents: diff changeset	234	+ "<pre-word>=[:Sc::Ps::Pi:^[\u00a2]\\\"\\\'];"
90ce3da70b43 Initial load duke parents: diff changeset	235
90ce3da70b43 Initial load duke parents: diff changeset	236	// characters that stick to a word if they follow it: ending punctuation,
90ce3da70b43 Initial load duke parents: diff changeset	237	// other punctuation that usually occurs at the end of a sentence,
90ce3da70b43 Initial load duke parents: diff changeset	238	// small Kana characters, some CJK diacritics, etc.
90ce3da70b43 Initial load duke parents: diff changeset	239	+ "<post-word>=[\\\":Pe::Pf:\\!\\%\\.\\,\\:\\;\\?\u00a2\u00b0\u066a\u2030-\u2034\u2103"
90ce3da70b43 Initial load duke parents: diff changeset	240	+ "\u2105\u2109\u3001\u3002\u3005\u3041\u3043\u3045\u3047\u3049\u3063"
90ce3da70b43 Initial load duke parents: diff changeset	241	+ "\u3083\u3085\u3087\u308e\u3099-\u309e\u30a1\u30a3\u30a5\u30a7\u30a9"
90ce3da70b43 Initial load duke parents: diff changeset	242	+ "\u30c3\u30e3\u30e5\u30e7\u30ee\u30f5\u30f6\u30fc-\u30fe\uff01\uff05"
90ce3da70b43 Initial load duke parents: diff changeset	243	+ "\uff0c\uff0e\uff1a\uff1b\uff1f];"
90ce3da70b43 Initial load duke parents: diff changeset	244
90ce3da70b43 Initial load duke parents: diff changeset	245	// Kanji: actually includes Kanji,Kana and Hangul syllables,
90ce3da70b43 Initial load duke parents: diff changeset	246	// except for small Kana and CJK diacritics
90ce3da70b43 Initial load duke parents: diff changeset	247	+ "<kanji>=[\u4e00-\u9fa5\uac00-\ud7a3\uf900-\ufa2d\ufa30-\ufa6a\u3041-\u3094\u30a1-\u30fa^[<post-word><ignore>]];"
90ce3da70b43 Initial load duke parents: diff changeset	248
90ce3da70b43 Initial load duke parents: diff changeset	249	// digits
90ce3da70b43 Initial load duke parents: diff changeset	250	+ "<digit>=[:Nd::No:];"
90ce3da70b43 Initial load duke parents: diff changeset	251
90ce3da70b43 Initial load duke parents: diff changeset	252	// punctuation that can occur in the middle of a number: periods and commas
90ce3da70b43 Initial load duke parents: diff changeset	253	+ "<mid-num>=[\\.\\,];"
90ce3da70b43 Initial load duke parents: diff changeset	254
90ce3da70b43 Initial load duke parents: diff changeset	255	// everything not mentioned above
90ce3da70b43 Initial load duke parents: diff changeset	256	+ "<char>=[^[<break><space><dash><kanji><glue><ignore><pre-word><post-word><mid-num>\r<danda>]];"
90ce3da70b43 Initial load duke parents: diff changeset	257
90ce3da70b43 Initial load duke parents: diff changeset	258	// a "number" is a run of prefix characters and dashes, followed by one or
90ce3da70b43 Initial load duke parents: diff changeset	259	// more digits with isolated number-punctuation characters interspersed
90ce3da70b43 Initial load duke parents: diff changeset	260	+ "<number>=([<pre-word><dash>]<digit><digit>(<mid-num><digit><digit>));"
90ce3da70b43 Initial load duke parents: diff changeset	261
90ce3da70b43 Initial load duke parents: diff changeset	262	// the basic core of a word can be either a "number" as defined above, a single
90ce3da70b43 Initial load duke parents: diff changeset	263	// "Kanji" character, or a run of any number of not-explicitly-mentioned
90ce3da70b43 Initial load duke parents: diff changeset	264	// characters (this includes Latin letters)
90ce3da70b43 Initial load duke parents: diff changeset	265	+ "<word-core>=(<char>*\|<kanji>\|<number>);"
90ce3da70b43 Initial load duke parents: diff changeset	266
90ce3da70b43 Initial load duke parents: diff changeset	267	// a word may end with an optional suffix that can be either a run of one or
90ce3da70b43 Initial load duke parents: diff changeset	268	// more dashes or a run of word-suffix characters
90ce3da70b43 Initial load duke parents: diff changeset	269	+ "<word-suffix>=((<dash><dash>\|<post-word>));"
90ce3da70b43 Initial load duke parents: diff changeset	270
90ce3da70b43 Initial load duke parents: diff changeset	271	// a word, thus, is an optional run of word-prefix characters, followed by
90ce3da70b43 Initial load duke parents: diff changeset	272	// a word core and a word suffix (the syntax of <word-core> and <word-suffix>
90ce3da70b43 Initial load duke parents: diff changeset	273	// actually allows either of them to match the empty string, putting a break
90ce3da70b43 Initial load duke parents: diff changeset	274	// between things like ")(" or "aaa(aaa"
90ce3da70b43 Initial load duke parents: diff changeset	275	+ "<word>=(<pre-word>*<word-core><word-suffix>);"
90ce3da70b43 Initial load duke parents: diff changeset	276
90ce3da70b43 Initial load duke parents: diff changeset	277	+ "<hack1>=[\\(];"
90ce3da70b43 Initial load duke parents: diff changeset	278	+ "<hack2>=[\\)];"
90ce3da70b43 Initial load duke parents: diff changeset	279	+ "<hack3>=[\\$\\'];"
90ce3da70b43 Initial load duke parents: diff changeset	280
90ce3da70b43 Initial load duke parents: diff changeset	281	// finally, the rule that does the work: Keep together any run of words that
90ce3da70b43 Initial load duke parents: diff changeset	282	// are joined by runs of one or more non-spacing marks. Also keep a trailing
90ce3da70b43 Initial load duke parents: diff changeset	283	// line-break character or CRLF combination with the word. (line separators
90ce3da70b43 Initial load duke parents: diff changeset	284	// "win" over nbsp's)
90ce3da70b43 Initial load duke parents: diff changeset	285	+ "<word>(((<space><glue><glue>{<space>})\|<hack3>)<word>)<space>{<enclosing>}{<hack1><hack2><post-word>}{<enclosing>*}{\r}{<break>};"
90ce3da70b43 Initial load duke parents: diff changeset	286	+ "\r<break>;"
90ce3da70b43 Initial load duke parents: diff changeset	287	},
90ce3da70b43 Initial load duke parents: diff changeset	288
90ce3da70b43 Initial load duke parents: diff changeset	289	// default rules for finding sentence boundaries
90ce3da70b43 Initial load duke parents: diff changeset	290	{ "SentenceBreakRules",
90ce3da70b43 Initial load duke parents: diff changeset	291	// ignore non-spacing marks, enclosing marks, and format characters
90ce3da70b43 Initial load duke parents: diff changeset	292	"<ignore>=[:Mn::Me::Cf:];"
90ce3da70b43 Initial load duke parents: diff changeset	293
90ce3da70b43 Initial load duke parents: diff changeset	294	// letters
90ce3da70b43 Initial load duke parents: diff changeset	295	+ "<letter>=[:L:];"
90ce3da70b43 Initial load duke parents: diff changeset	296
90ce3da70b43 Initial load duke parents: diff changeset	297	// lowercase letters
90ce3da70b43 Initial load duke parents: diff changeset	298	+ "<lc>=[:Ll:];"
90ce3da70b43 Initial load duke parents: diff changeset	299
90ce3da70b43 Initial load duke parents: diff changeset	300	// uppercase letters
90ce3da70b43 Initial load duke parents: diff changeset	301	+ "<uc>=[:Lu:];"
90ce3da70b43 Initial load duke parents: diff changeset	302
90ce3da70b43 Initial load duke parents: diff changeset	303	// NOT lowercase letters
90ce3da70b43 Initial load duke parents: diff changeset	304	+ "<notlc>=[<letter>^<lc>];"
90ce3da70b43 Initial load duke parents: diff changeset	305
90ce3da70b43 Initial load duke parents: diff changeset	306	// whitespace (line separators are treated as whitespace)
90ce3da70b43 Initial load duke parents: diff changeset	307	+ "<space>=[\t\r\f\n\u2028:Zs:];"
90ce3da70b43 Initial load duke parents: diff changeset	308
90ce3da70b43 Initial load duke parents: diff changeset	309	// punctuation which may occur at the beginning of a sentence: "starting
90ce3da70b43 Initial load duke parents: diff changeset	310	// punctuation" and quotation marks
90ce3da70b43 Initial load duke parents: diff changeset	311	+ "<start-punctuation>=[:Ps::Pi:\\\"\\\'];"
90ce3da70b43 Initial load duke parents: diff changeset	312
90ce3da70b43 Initial load duke parents: diff changeset	313	// punctuation which may occur at the end of a sentence: "ending punctuation"
90ce3da70b43 Initial load duke parents: diff changeset	314	// and quotation marks
90ce3da70b43 Initial load duke parents: diff changeset	315	+ "<end>=[:Pe::Pf:\\\"\\\'];"
90ce3da70b43 Initial load duke parents: diff changeset	316
90ce3da70b43 Initial load duke parents: diff changeset	317	// digits
90ce3da70b43 Initial load duke parents: diff changeset	318	+ "<digit>=[:N:];"
90ce3da70b43 Initial load duke parents: diff changeset	319
90ce3da70b43 Initial load duke parents: diff changeset	320	// characters that unambiguously signal the end of a sentence
90ce3da70b43 Initial load duke parents: diff changeset	321	+ "<term>=[\\!\\?\u3002\uff01\uff1f];"
90ce3da70b43 Initial load duke parents: diff changeset	322
90ce3da70b43 Initial load duke parents: diff changeset	323	// periods, which MAY signal the end of a sentence
90ce3da70b43 Initial load duke parents: diff changeset	324	+ "<period>=[\\.\uff0e];"
90ce3da70b43 Initial load duke parents: diff changeset	325
90ce3da70b43 Initial load duke parents: diff changeset	326	// characters that may occur at the beginning of a sentence: basically anything
90ce3da70b43 Initial load duke parents: diff changeset	327	// not mentioned above (letters and digits are specifically excluded)
90ce3da70b43 Initial load duke parents: diff changeset	328	+ "<sent-start>=[^[:L:<space><start-punctuation><end><digit><term><period>\u2029<ignore>]];"
90ce3da70b43 Initial load duke parents: diff changeset	329
90ce3da70b43 Initial load duke parents: diff changeset	330	// Hindi phrase separator
90ce3da70b43 Initial load duke parents: diff changeset	331	+ "<danda>=[\u0964\u0965];"
90ce3da70b43 Initial load duke parents: diff changeset	332
90ce3da70b43 Initial load duke parents: diff changeset	333	// always break sentences after paragraph separators
90ce3da70b43 Initial load duke parents: diff changeset	334	+ ".*?{\u2029};"
90ce3da70b43 Initial load duke parents: diff changeset	335
90ce3da70b43 Initial load duke parents: diff changeset	336	// always break after a danda, if it's followed by whitespace
90ce3da70b43 Initial load duke parents: diff changeset	337	+ ".?<danda><space>;"
90ce3da70b43 Initial load duke parents: diff changeset	338
90ce3da70b43 Initial load duke parents: diff changeset	339	// if you see a period, skip over additional periods and ending punctuation
90ce3da70b43 Initial load duke parents: diff changeset	340	// and if the next character is a paragraph separator, break after the
90ce3da70b43 Initial load duke parents: diff changeset	341	// paragraph separator
90ce3da70b43 Initial load duke parents: diff changeset	342	//+ ".?<period>[<period><end>]<space>*\u2029;"
90ce3da70b43 Initial load duke parents: diff changeset	343	//+ ".?[<period><end>]<space>*\u2029;"
90ce3da70b43 Initial load duke parents: diff changeset	344
90ce3da70b43 Initial load duke parents: diff changeset	345	// if you see a period, skip over additional periods and ending punctuation,
90ce3da70b43 Initial load duke parents: diff changeset	346	// followed by optional whitespace, followed by optional starting punctuation,
90ce3da70b43 Initial load duke parents: diff changeset	347	// and if the next character is something that can start a sentence
90ce3da70b43 Initial load duke parents: diff changeset	348	// (basically, a capital letter), then put the sentence break between the
90ce3da70b43 Initial load duke parents: diff changeset	349	// whitespace and the opening punctuation
90ce3da70b43 Initial load duke parents: diff changeset	350	+ ".?<period>[<period><end>]<space><space>*/<notlc>;"
90ce3da70b43 Initial load duke parents: diff changeset	351	+ ".?<period>[<period><end>]<space>/[<start-punctuation><sent-start>][<start-punctuation><sent-start>]<letter>;"
90ce3da70b43 Initial load duke parents: diff changeset	352
90ce3da70b43 Initial load duke parents: diff changeset	353	// if you see a sentence-terminating character, skip over any additional
90ce3da70b43 Initial load duke parents: diff changeset	354	// terminators, periods, or ending punctuation, followed by any whitespace,
90ce3da70b43 Initial load duke parents: diff changeset	355	// followed by a SINGLE optional paragraph separator, and put the break there
90ce3da70b43 Initial load duke parents: diff changeset	356	+ ".?<term>[<term><period><end>]<space>*{\u2029};"
90ce3da70b43 Initial load duke parents: diff changeset	357
90ce3da70b43 Initial load duke parents: diff changeset	358	// The following rules are here to aid in backwards iteration. The automatically
90ce3da70b43 Initial load duke parents: diff changeset	359	// generated backwards state table will rewind to the beginning of the
90ce3da70b43 Initial load duke parents: diff changeset	360	// paragraph all the time (or all the way to the beginning of the document
90ce3da70b43 Initial load duke parents: diff changeset	361	// if the document doesn't use the Unicode PS character) because the only
90ce3da70b43 Initial load duke parents: diff changeset	362	// unambiguous character pairs are those involving paragraph separators.
90ce3da70b43 Initial load duke parents: diff changeset	363	// These specify a few more unambiguous breaking situations.
90ce3da70b43 Initial load duke parents: diff changeset	364
90ce3da70b43 Initial load duke parents: diff changeset	365	// if you see a sentence-starting character, followed by starting punctuation
90ce3da70b43 Initial load duke parents: diff changeset	366	// (remember, we're iterating backwards), followed by an optional run of
90ce3da70b43 Initial load duke parents: diff changeset	367	// whitespace, followed by an optional run of ending punctuation, followed
90ce3da70b43 Initial load duke parents: diff changeset	368	// by a period, this is a safe place to turn around
90ce3da70b43 Initial load duke parents: diff changeset	369	+ "!<sent-start><start-punctuation><space><end>*<period>;"
90ce3da70b43 Initial load duke parents: diff changeset	370
90ce3da70b43 Initial load duke parents: diff changeset	371	// if you see a sentence-starting character, a lowercase letter, or a digit, followed by an optional run of
90ce3da70b43 Initial load duke parents: diff changeset	372	// starting punctuation, followed by an optional run of whitespace,
90ce3da70b43 Initial load duke parents: diff changeset	373	// followed by an optional run of ending punctuation, followed by
90ce3da70b43 Initial load duke parents: diff changeset	374	// a sentence terminator, this is a safe place to turn around
90ce3da70b43 Initial load duke parents: diff changeset	375	+ "![<sent-start><lc><digit>]<start-punctuation><space><end>*<term>;"
90ce3da70b43 Initial load duke parents: diff changeset	376	}
90ce3da70b43 Initial load duke parents: diff changeset	377	};
90ce3da70b43 Initial load duke parents: diff changeset	378	}
90ce3da70b43 Initial load duke parents: diff changeset	379	}

author	ohair
	Tue, 25 May 2010 15:58:33 -0700
changeset 5506	202f599c92aa
parent 2	90ce3da70b43
permissions	-rw-r--r--