1 /* |
|
2 * Copyright © 2011,2012,2014 Google, Inc. |
|
3 * |
|
4 * This is part of HarfBuzz, a text shaping library. |
|
5 * |
|
6 * Permission is hereby granted, without written agreement and without |
|
7 * license or royalty fees, to use, copy, modify, and distribute this |
|
8 * software and its documentation for any purpose, provided that the |
|
9 * above copyright notice and the following two paragraphs appear in |
|
10 * all copies of this software. |
|
11 * |
|
12 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR |
|
13 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
|
14 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN |
|
15 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH |
|
16 * DAMAGE. |
|
17 * |
|
18 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, |
|
19 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
|
20 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS |
|
21 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO |
|
22 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. |
|
23 * |
|
24 * Google Author(s): Behdad Esfahbod |
|
25 */ |
|
26 |
|
27 #ifndef HB_UTF_PRIVATE_HH |
|
28 #define HB_UTF_PRIVATE_HH |
|
29 |
|
30 #include "hb-private.hh" |
|
31 |
|
32 |
|
33 struct hb_utf8_t |
|
34 { |
|
35 typedef uint8_t codepoint_t; |
|
36 |
|
37 static inline const uint8_t * |
|
38 next (const uint8_t *text, |
|
39 const uint8_t *end, |
|
40 hb_codepoint_t *unicode, |
|
41 hb_codepoint_t replacement) |
|
42 { |
|
43 /* Written to only accept well-formed sequences. |
|
44 * Based on ideas from ICU's U8_NEXT. |
|
45 * Generates one "replacement" for each ill-formed byte. */ |
|
46 |
|
47 hb_codepoint_t c = *text++; |
|
48 |
|
49 if (c > 0x7Fu) |
|
50 { |
|
51 if (hb_in_range<hb_codepoint_t> (c, 0xC2u, 0xDFu)) /* Two-byte */ |
|
52 { |
|
53 unsigned int t1; |
|
54 if (likely (text < end && |
|
55 (t1 = text[0] - 0x80u) <= 0x3Fu)) |
|
56 { |
|
57 c = ((c&0x1Fu)<<6) | t1; |
|
58 text++; |
|
59 } |
|
60 else |
|
61 goto error; |
|
62 } |
|
63 else if (hb_in_range<hb_codepoint_t> (c, 0xE0u, 0xEFu)) /* Three-byte */ |
|
64 { |
|
65 unsigned int t1, t2; |
|
66 if (likely (1 < end - text && |
|
67 (t1 = text[0] - 0x80u) <= 0x3Fu && |
|
68 (t2 = text[1] - 0x80u) <= 0x3Fu)) |
|
69 { |
|
70 c = ((c&0xFu)<<12) | (t1<<6) | t2; |
|
71 if (unlikely (c < 0x0800u || hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
|
72 goto error; |
|
73 text += 2; |
|
74 } |
|
75 else |
|
76 goto error; |
|
77 } |
|
78 else if (hb_in_range<hb_codepoint_t> (c, 0xF0u, 0xF4u)) /* Four-byte */ |
|
79 { |
|
80 unsigned int t1, t2, t3; |
|
81 if (likely (2 < end - text && |
|
82 (t1 = text[0] - 0x80u) <= 0x3Fu && |
|
83 (t2 = text[1] - 0x80u) <= 0x3Fu && |
|
84 (t3 = text[2] - 0x80u) <= 0x3Fu)) |
|
85 { |
|
86 c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; |
|
87 if (unlikely (!hb_in_range<hb_codepoint_t> (c, 0x10000u, 0x10FFFFu))) |
|
88 goto error; |
|
89 text += 3; |
|
90 } |
|
91 else |
|
92 goto error; |
|
93 } |
|
94 else |
|
95 goto error; |
|
96 } |
|
97 |
|
98 *unicode = c; |
|
99 return text; |
|
100 |
|
101 error: |
|
102 *unicode = replacement; |
|
103 return text; |
|
104 } |
|
105 |
|
106 static inline const uint8_t * |
|
107 prev (const uint8_t *text, |
|
108 const uint8_t *start, |
|
109 hb_codepoint_t *unicode, |
|
110 hb_codepoint_t replacement) |
|
111 { |
|
112 const uint8_t *end = text--; |
|
113 while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) |
|
114 text--; |
|
115 |
|
116 if (likely (next (text, end, unicode, replacement) == end)) |
|
117 return text; |
|
118 |
|
119 *unicode = replacement; |
|
120 return end - 1; |
|
121 } |
|
122 |
|
123 static inline unsigned int |
|
124 strlen (const uint8_t *text) |
|
125 { |
|
126 return ::strlen ((const char *) text); |
|
127 } |
|
128 }; |
|
129 |
|
130 |
|
131 struct hb_utf16_t |
|
132 { |
|
133 typedef uint16_t codepoint_t; |
|
134 |
|
135 static inline const uint16_t * |
|
136 next (const uint16_t *text, |
|
137 const uint16_t *end, |
|
138 hb_codepoint_t *unicode, |
|
139 hb_codepoint_t replacement) |
|
140 { |
|
141 hb_codepoint_t c = *text++; |
|
142 |
|
143 if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
|
144 { |
|
145 *unicode = c; |
|
146 return text; |
|
147 } |
|
148 |
|
149 if (likely (c <= 0xDBFFu && text < end)) |
|
150 { |
|
151 /* High-surrogate in c */ |
|
152 hb_codepoint_t l = *text; |
|
153 if (likely (hb_in_range<hb_codepoint_t> (l, 0xDC00u, 0xDFFFu))) |
|
154 { |
|
155 /* Low-surrogate in l */ |
|
156 *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); |
|
157 text++; |
|
158 return text; |
|
159 } |
|
160 } |
|
161 |
|
162 /* Lonely / out-of-order surrogate. */ |
|
163 *unicode = replacement; |
|
164 return text; |
|
165 } |
|
166 |
|
167 static inline const uint16_t * |
|
168 prev (const uint16_t *text, |
|
169 const uint16_t *start, |
|
170 hb_codepoint_t *unicode, |
|
171 hb_codepoint_t replacement) |
|
172 { |
|
173 hb_codepoint_t c = *--text; |
|
174 |
|
175 if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
|
176 { |
|
177 *unicode = c; |
|
178 return text; |
|
179 } |
|
180 |
|
181 if (likely (c >= 0xDC00u && start < text)) |
|
182 { |
|
183 /* Low-surrogate in c */ |
|
184 hb_codepoint_t h = text[-1]; |
|
185 if (likely (hb_in_range<hb_codepoint_t> (h, 0xD800u, 0xDBFFu))) |
|
186 { |
|
187 /* High-surrogate in h */ |
|
188 *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u); |
|
189 text--; |
|
190 return text; |
|
191 } |
|
192 } |
|
193 |
|
194 /* Lonely / out-of-order surrogate. */ |
|
195 *unicode = replacement; |
|
196 return text; |
|
197 } |
|
198 |
|
199 |
|
200 static inline unsigned int |
|
201 strlen (const uint16_t *text) |
|
202 { |
|
203 unsigned int l = 0; |
|
204 while (*text++) l++; |
|
205 return l; |
|
206 } |
|
207 }; |
|
208 |
|
209 |
|
210 template <bool validate=true> |
|
211 struct hb_utf32_t |
|
212 { |
|
213 typedef uint32_t codepoint_t; |
|
214 |
|
215 static inline const uint32_t * |
|
216 next (const uint32_t *text, |
|
217 const uint32_t *end HB_UNUSED, |
|
218 hb_codepoint_t *unicode, |
|
219 hb_codepoint_t replacement) |
|
220 { |
|
221 hb_codepoint_t c = *unicode = *text++; |
|
222 if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) |
|
223 *unicode = replacement; |
|
224 return text; |
|
225 } |
|
226 |
|
227 static inline const uint32_t * |
|
228 prev (const uint32_t *text, |
|
229 const uint32_t *start HB_UNUSED, |
|
230 hb_codepoint_t *unicode, |
|
231 hb_codepoint_t replacement) |
|
232 { |
|
233 hb_codepoint_t c = *unicode = *--text; |
|
234 if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) |
|
235 *unicode = replacement; |
|
236 return text; |
|
237 } |
|
238 |
|
239 static inline unsigned int |
|
240 strlen (const uint32_t *text) |
|
241 { |
|
242 unsigned int l = 0; |
|
243 while (*text++) l++; |
|
244 return l; |
|
245 } |
|
246 }; |
|
247 |
|
248 |
|
249 struct hb_latin1_t |
|
250 { |
|
251 typedef uint8_t codepoint_t; |
|
252 |
|
253 static inline const uint8_t * |
|
254 next (const uint8_t *text, |
|
255 const uint8_t *end HB_UNUSED, |
|
256 hb_codepoint_t *unicode, |
|
257 hb_codepoint_t replacement HB_UNUSED) |
|
258 { |
|
259 *unicode = *text++; |
|
260 return text; |
|
261 } |
|
262 |
|
263 static inline const uint8_t * |
|
264 prev (const uint8_t *text, |
|
265 const uint8_t *start HB_UNUSED, |
|
266 hb_codepoint_t *unicode, |
|
267 hb_codepoint_t replacement) |
|
268 { |
|
269 *unicode = *--text; |
|
270 return text; |
|
271 } |
|
272 |
|
273 static inline unsigned int |
|
274 strlen (const uint8_t *text) |
|
275 { |
|
276 unsigned int l = 0; |
|
277 while (*text++) l++; |
|
278 return l; |
|
279 } |
|
280 }; |
|
281 |
|
282 #endif /* HB_UTF_PRIVATE_HH */ |
|