2
|
1 |
/*
|
|
2 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
3 |
*
|
|
4 |
* This code is free software; you can redistribute it and/or modify it
|
|
5 |
* under the terms of the GNU General Public License version 2 only, as
|
|
6 |
* published by the Free Software Foundation. Sun designates this
|
|
7 |
* particular file as subject to the "Classpath" exception as provided
|
|
8 |
* by Sun in the LICENSE file that accompanied this code.
|
|
9 |
*
|
|
10 |
* This code is distributed in the hope that it will be useful, but WITHOUT
|
|
11 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
12 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
13 |
* version 2 for more details (a copy is included in the LICENSE file that
|
|
14 |
* accompanied this code).
|
|
15 |
*
|
|
16 |
* You should have received a copy of the GNU General Public License version
|
|
17 |
* 2 along with this work; if not, write to the Free Software Foundation,
|
|
18 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
19 |
*
|
|
20 |
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
|
|
21 |
* CA 95054 USA or visit www.sun.com if you need additional information or
|
|
22 |
* have any questions.
|
|
23 |
*
|
|
24 |
*/
|
|
25 |
|
|
26 |
/*
|
|
27 |
*
|
|
28 |
* (C) Copyright IBM Corp. 1998-2005 - All Rights Reserved
|
|
29 |
*
|
|
30 |
* This file is a modification of the ICU file IndicReordering.h
|
|
31 |
* by Jens Herden and Javier Sola for Khmer language
|
|
32 |
*
|
|
33 |
*/
|
|
34 |
|
|
35 |
#ifndef __KHMERREORDERING_H
|
|
36 |
#define __KHMERREORDERING_H
|
|
37 |
|
|
38 |
#include "LETypes.h"
|
|
39 |
#include "OpenTypeTables.h"
|
|
40 |
|
|
41 |
class LEGlyphStorage;
|
|
42 |
|
|
43 |
// Vocabulary
|
|
44 |
|
|
45 |
// Base ->
|
|
46 |
// A consonant or an independent vowel in its full (not
|
|
47 |
// subscript) form. It is the center of the syllable, it can be
|
|
48 |
// surrounded by coeng (subscript) consonants, vowels, split
|
|
49 |
// vowels, signs... but there is only one base in a syllable, it
|
|
50 |
// has to be coded as the first character of the syllable.
|
|
51 |
// split vowel ->
|
|
52 |
// vowel that has two parts placed separately (e.g. Before and
|
|
53 |
// after the consonant). Khmer language has five of them. Khmer
|
|
54 |
// split vowels either have one part before the base and one after
|
|
55 |
// the base or they have a part before the base and a part above
|
|
56 |
// the base. The first part of all Khmer split vowels is the same
|
|
57 |
// character, identical to the glyph of Khmer dependent vowel SRA
|
|
58 |
// EI
|
|
59 |
// coeng ->
|
|
60 |
// modifier used in Khmer to construct coeng (subscript)
|
|
61 |
// consonants differently than Indian languages: the coeng
|
|
62 |
// modifies the consonant that follows it, not the one preceding
|
|
63 |
// it. Each consonant has two forms, the base form and the
|
|
64 |
// subscript form; the base form is the normal one (using the
|
|
65 |
// consonants code-point), the subscript form is displayed when
|
|
66 |
// the combination coeng + consonant is encountered.
|
|
67 |
// Consonant of type 1 ->
|
|
68 |
// A consonant whose subscript form only occupies space
|
|
69 |
// under a base consonant
|
|
70 |
// Consonant of type 2 ->
|
|
71 |
// Its subscript form occupies space under and before the base
|
|
72 |
// (only one, RO)
|
|
73 |
// Consonant of Type 3 ->
|
|
74 |
// Its subscript form occupies space under and after the base
|
|
75 |
// (KHO, CHHO, THHO, BA, YO, SA)
|
|
76 |
// Consonant shifter ->
|
|
77 |
// Khmer has two series of consonants. The same dependent vowel has
|
|
78 |
// different sounds if it is attached to a consonant of the first
|
|
79 |
// series or a consonant of the second series. Most consonants have
|
|
80 |
// an equivalent in the other series, but some of them exist only
|
|
81 |
// in one series (for example SA). If we want to use the consonant
|
|
82 |
// SA with a vowel sound that can only be done with a vowel sound
|
|
83 |
// that corresponds to a vowel accompanying a consonant of the
|
|
84 |
// other series, then we need to use a consonant shifter: TRIISAP
|
|
85 |
// or MUSIKATOAN (x17CA and x17C9, respectively). TRIISAP changes a first series
|
|
86 |
// consonant to second series sound and MUSIKATOAN a second series
|
|
87 |
// consonant to have a first series vowel sound. Consonant
|
|
88 |
// shifters are both normally superscript marks, but, when they are
|
|
89 |
// followed by a superscript, they change shape and take the form
|
|
90 |
// of subscript dependent vowel SRA U. If they are in the same
|
|
91 |
// syllable as a coeng consonant, Unicode 3.0 says that they
|
|
92 |
// should be typed before the coeng. Unicode 4.0 breaks the
|
|
93 |
// standard and says that it should be placed after the coeng
|
|
94 |
// consonant.
|
|
95 |
// Dependent vowel ->
|
|
96 |
// In Khmer, dependent vowels can be placed above, below, before or
|
|
97 |
// after the base. Each vowel has its own position. Only one vowel
|
|
98 |
// per syllable is allowed.
|
|
99 |
// Signs ->
|
|
100 |
// Khmer has above signs and post signs. Only one above sign
|
|
101 |
// and/or one post sign are allowed in a syllable.
|
|
102 |
//
|
|
103 |
|
|
104 |
// This list must include all types of components that can be used
|
|
105 |
// inside a syllable
|
|
106 |
struct KhmerClassTable
|
|
107 |
{
|
|
108 |
// order is important here! This order must be the same that is
|
|
109 |
// found in each horizontal line in the statetable for Khmer (file
|
|
110 |
// KhmerReordering.cpp).
|
|
111 |
enum CharClassValues
|
|
112 |
{
|
|
113 |
CC_RESERVED = 0,
|
|
114 |
CC_CONSONANT = 1, // consonant of type 1 or independent vowel
|
|
115 |
CC_CONSONANT2 = 2, // Consonant of type 2
|
|
116 |
CC_CONSONANT3 = 3, // Consonant of type 3
|
|
117 |
CC_ZERO_WIDTH_NJ_MARK = 4, // Zero Width non joiner character (0x200C)
|
|
118 |
CC_CONSONANT_SHIFTER = 5,
|
|
119 |
CC_ROBAT = 6, // Khmer special diacritic accent
|
|
120 |
// -treated differently in state table
|
|
121 |
CC_COENG = 7, // Subscript consonant combining character
|
|
122 |
CC_DEPENDENT_VOWEL = 8,
|
|
123 |
CC_SIGN_ABOVE = 9,
|
|
124 |
CC_SIGN_AFTER = 10,
|
|
125 |
CC_ZERO_WIDTH_J_MARK = 11, // Zero width joiner character
|
|
126 |
CC_COUNT = 12 // This is the number of character classes
|
|
127 |
};
|
|
128 |
|
|
129 |
enum CharClassFlags
|
|
130 |
{
|
|
131 |
CF_CLASS_MASK = 0x0000FFFF,
|
|
132 |
|
|
133 |
CF_CONSONANT = 0x01000000, // flag to speed up comparing
|
|
134 |
CF_SPLIT_VOWEL = 0x02000000, // flag for a split vowel -> the first part
|
|
135 |
// is added in front of the syllable
|
|
136 |
CF_DOTTED_CIRCLE = 0x04000000, // add a dotted circle if a character with
|
|
137 |
// this flag is the first in a syllable
|
|
138 |
CF_COENG = 0x08000000, // flag to speed up comparing
|
|
139 |
CF_SHIFTER = 0x10000000, // flag to speed up comparing
|
|
140 |
CF_ABOVE_VOWEL = 0x20000000, // flag to speed up comparing
|
|
141 |
|
|
142 |
// position flags
|
|
143 |
CF_POS_BEFORE = 0x00080000,
|
|
144 |
CF_POS_BELOW = 0x00040000,
|
|
145 |
CF_POS_ABOVE = 0x00020000,
|
|
146 |
CF_POS_AFTER = 0x00010000,
|
|
147 |
CF_POS_MASK = 0x000f0000
|
|
148 |
};
|
|
149 |
|
|
150 |
typedef le_uint32 CharClass;
|
|
151 |
|
|
152 |
typedef le_int32 ScriptFlags;
|
|
153 |
|
|
154 |
LEUnicode firstChar; // for Khmer this will become x1780
|
|
155 |
LEUnicode lastChar; // and this x17DF
|
|
156 |
const CharClass *classTable;
|
|
157 |
|
|
158 |
CharClass getCharClass(LEUnicode ch) const;
|
|
159 |
|
|
160 |
static const KhmerClassTable *getKhmerClassTable();
|
|
161 |
};
|
|
162 |
|
|
163 |
|
|
164 |
class KhmerReordering {
|
|
165 |
public:
|
|
166 |
static le_int32 reorder(const LEUnicode *theChars, le_int32 charCount,
|
|
167 |
le_int32 scriptCode, LEUnicode *outChars, LEGlyphStorage &glyphStorage);
|
|
168 |
|
|
169 |
static const FeatureMap *getFeatureMap(le_int32 &count);
|
|
170 |
|
|
171 |
private:
|
|
172 |
// do not instantiate
|
|
173 |
KhmerReordering();
|
|
174 |
|
|
175 |
static le_int32 findSyllable(const KhmerClassTable *classTable,
|
|
176 |
const LEUnicode *chars, le_int32 prev, le_int32 charCount);
|
|
177 |
};
|
|
178 |
|
|
179 |
#endif
|