character.c - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
character.c (18706B) | |
--- | |
1 #include <stdio.h> | |
2 | |
3 /* See LICENSE file for copyright and license details. */ | |
4 #include <limits.h> | |
5 #include <stdbool.h> | |
6 #include <stddef.h> | |
7 | |
8 #include "../gen/character.h" | |
9 #include "../grapheme.h" | |
10 #include "util.h" | |
11 | |
/*
 * Unpacked form of the opaque 16-bit state word that callers of
 * grapheme_is_character_break() carry between invocations.
 */
struct character_break_state {
	/* break property of the previous right-hand codepoint */
	uint_least8_t prop;
	/* true once prop holds a value, i.e. after the first stateful call */
	bool prop_set;
	/* running flag consulted by the GB11 lookup tables */
	bool gb11_flag;
	/* running flag consulted by the GB12/GB13 lookup tables */
	bool gb12_13_flag;
	/* progress (0..3) through the GB9c indic conjunct prefix */
	uint_least8_t gb9c_level;
};
19 | |
20 static const uint_least32_t dont_break[NUM_CHAR_BREAK_PROPS] = { | |
21 [CHAR_BREAK_PROP_OTHER] = | |
22 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
23 UINT32_C(1) | |
24 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
25 UINT32_C(1) | |
26 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
27 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
28 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
29 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
30 [CHAR_BREAK_PROP_ICB_CONSONANT] = | |
31 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
32 UINT32_C(1) | |
33 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
34 UINT32_C(1) | |
35 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
36 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
37 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
38 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
39 [CHAR_BREAK_PROP_ICB_EXTEND] = | |
40 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
41 UINT32_C(1) | |
42 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
43 UINT32_C(1) | |
44 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
45 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
46 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
47 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
48 [CHAR_BREAK_PROP_ICB_LINKER] = | |
49 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
50 UINT32_C(1) | |
51 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
52 UINT32_C(1) | |
53 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
54 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
55 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
56 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
57 [CHAR_BREAK_PROP_CR] = UINT32_C(1) << CHAR_BREAK_PROP_LF, /* … | |
58 [CHAR_BREAK_PROP_EXTEND] = | |
59 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
60 UINT32_C(1) | |
61 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
62 UINT32_C(1) | |
63 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
64 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
65 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
66 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
67 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND] = | |
68 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
69 UINT32_C(1) | |
70 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
71 UINT32_C(1) | |
72 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
73 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
74 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
75 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
76 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER] = | |
77 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
78 UINT32_C(1) | |
79 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
80 UINT32_C(1) | |
81 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
82 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
83 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
84 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
85 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = | |
86 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
87 UINT32_C(1) | |
88 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
89 UINT32_C(1) | |
90 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
91 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
92 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
93 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
94 [CHAR_BREAK_PROP_HANGUL_L] = | |
95 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */ | |
96 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */ | |
97 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */ | |
98 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */ | |
99 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
100 UINT32_C(1) | |
101 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
102 UINT32_C(1) | |
103 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
104 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
105 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
106 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
107 [CHAR_BREAK_PROP_HANGUL_V] = | |
108 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ | |
109 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ | |
110 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
111 UINT32_C(1) | |
112 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
113 UINT32_C(1) | |
114 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
115 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
116 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
117 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
118 [CHAR_BREAK_PROP_HANGUL_T] = | |
119 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ | |
120 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
121 UINT32_C(1) | |
122 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
123 UINT32_C(1) | |
124 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
125 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
126 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
127 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
128 [CHAR_BREAK_PROP_HANGUL_LV] = | |
129 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ | |
130 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ | |
131 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
132 UINT32_C(1) | |
133 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
134 UINT32_C(1) | |
135 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
136 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
137 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
138 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
139 [CHAR_BREAK_PROP_HANGUL_LVT] = | |
140 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ | |
141 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
142 UINT32_C(1) | |
143 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
144 UINT32_C(1) | |
145 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
146 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
147 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
148 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
149 [CHAR_BREAK_PROP_PREPEND] = | |
150 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
151 UINT32_C(1) | |
152 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
153 UINT32_C(1) | |
154 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
155 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
156 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
157 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* … | |
158 (UINT32_C(0xFFFFFFFF) & | |
159 ~(UINT32_C(1) << CHAR_BREAK_PROP_CR | | |
160 UINT32_C(1) << CHAR_BREAK_PROP_LF | | |
161 UINT32_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */ | |
162 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = | |
163 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
164 UINT32_C(1) | |
165 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
166 UINT32_C(1) | |
167 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
168 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
169 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
170 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
171 [CHAR_BREAK_PROP_SPACINGMARK] = | |
172 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
173 UINT32_C(1) | |
174 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
175 UINT32_C(1) | |
176 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
177 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
178 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
179 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
180 [CHAR_BREAK_PROP_ZWJ] = | |
181 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
182 UINT32_C(1) | |
183 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
184 UINT32_C(1) | |
185 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
186 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
187 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
188 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
189 [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND] = | |
190 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
191 UINT32_C(1) | |
192 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
193 UINT32_C(1) | |
194 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
195 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
196 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
197 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
198 | |
199 }; | |
200 static const uint_least32_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] =… | |
201 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = | |
202 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
203 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
204 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* … | |
205 UINT32_C(1) | |
206 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
207 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, /… | |
208 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = | |
209 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, | |
210 [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] = | |
211 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, | |
212 [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] = | |
213 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | | |
214 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | | |
215 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | | |
216 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
217 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, | |
218 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] = | |
219 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | | |
220 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | | |
221 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | | |
222 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
223 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, | |
224 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER + NUM_CHAR_BREAK_PROPS] = | |
225 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | | |
226 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | | |
227 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | | |
228 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
229 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, | |
230 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] = | |
231 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
232 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | | |
233 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | | |
234 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | | |
235 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, | |
236 }; | |
237 static const uint_least32_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = { | |
238 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = | |
239 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, | |
240 [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] = | |
241 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, | |
242 }; | |
243 static const uint_least32_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS… | |
244 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = | |
245 UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, | |
246 }; | |
247 static const uint_least32_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS]… | |
248 [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] = | |
249 UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, | |
250 }; | |
251 | |
252 static inline enum char_break_property | |
253 get_break_prop(uint_least32_t cp) | |
254 { | |
255 if (likely(cp <= UINT32_C(0x10FFFF))) { | |
256 return (enum char_break_property) | |
257 char_break_minor[char_break_major[cp >> 8] + | |
258 (cp & 0xFF)]; | |
259 } else { | |
260 return CHAR_BREAK_PROP_OTHER; | |
261 } | |
262 } | |
263 | |
264 static inline void | |
265 state_serialize(const struct character_break_state *in, uint_least16_t *… | |
266 { | |
267 *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 b… | |
268 (uint_least16_t)(((uint_least16_t)(in->prop_set)) | |
269 << 8) | /* 9th bit */ | |
270 (uint_least16_t)(((uint_least16_t)(in->gb11_flag)) | |
271 << 9) | /* 10th bit */ | |
272 (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) | |
273 << 10) | /* 11th bit */ | |
274 (uint_least16_t)(((uint_least16_t)(in->gb9c_level & 0x3)) | |
275 << 11); /* 12th and 13th bit */ | |
276 } | |
277 | |
278 static inline void | |
279 state_deserialize(uint_least16_t in, struct character_break_state *out) | |
280 { | |
281 out->prop = in & UINT8_C(0xFF); | |
282 out->prop_set = in & (UINT16_C(1) << 8); | |
283 out->gb11_flag = in & (UINT16_C(1) << 9); | |
284 out->gb12_13_flag = in & (UINT16_C(1) << 10); | |
285 out->gb9c_level = (uint_least8_t)(in >> 11) & UINT8_C(0x3); | |
286 } | |
287 | |
288 bool | |
289 grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, | |
290 uint_least16_t *s) | |
291 { | |
292 struct character_break_state state; | |
293 enum char_break_property cp0_prop, cp1_prop; | |
294 bool notbreak = false; | |
295 | |
296 if (likely(s)) { | |
297 state_deserialize(*s, &state); | |
298 | |
299 if (likely(state.prop_set)) { | |
300 cp0_prop = state.prop; | |
301 } else { | |
302 cp0_prop = get_break_prop(cp0); | |
303 } | |
304 cp1_prop = get_break_prop(cp1); | |
305 | |
306 /* preserve prop of right codepoint for next iteration */ | |
307 state.prop = (uint_least8_t)cp1_prop; | |
308 state.prop_set = true; | |
309 | |
310 /* update flags */ | |
311 state.gb11_flag = | |
312 flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS… | |
313 state.gb11_f… | |
314 UINT32_C(1) << cp1_prop; | |
315 state.gb12_13_flag = | |
316 flag_update_gb12_13[cp0_prop + | |
317 NUM_CHAR_BREAK_PROPS * | |
318 state.gb12_13_flag] & | |
319 UINT32_C(1) << cp1_prop; | |
320 | |
321 /* | |
322 * update GB9c state, which deals with indic conjunct br… | |
323 * We want to detect the following prefix: | |
324 * | |
325 * ICB_CONSONANT | |
326 * [ICB_EXTEND ICB_LINKER]* | |
327 * ICB_LINKER | |
328 * [ICB_EXTEND ICB_LINKER]* | |
329 * | |
330 * This representation is not ideal: In reality, what is | |
331 * meant is that the prefix is a sequence of [ICB_EXTEND | |
332 * ICB_LINKER]*, following an ICB_CONSONANT, that contai… | |
333 * least one ICB_LINKER. We thus use the following equiv… | |
334 * representation that allows us to store the levels 0..… | |
335 * bits. | |
336 * | |
337 * ICB_CONSONANT -- Level 1 | |
338 * ICB_EXTEND* -- Level 2 | |
339 * ICB_LINKER -- Level 3 | |
340 * [ICB_EXTEND ICB_LINKER]* -- Level 3 | |
341 * | |
342 * The following chain of if-else-blocks is a bit redund… | |
343 * of course could be optimised, but this is kept as is … | |
344 * best readability. | |
345 */ | |
346 if (state.gb9c_level == 0 && | |
347 cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) { | |
348 /* the sequence has begun */ | |
349 state.gb9c_level = 1; | |
350 } else if ((state.gb9c_level == 1 || state.gb9c_level ==… | |
351 (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND || | |
352 cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXT… | |
353 cp0_prop == | |
354 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTE… | |
355 /* | |
356 * either the level is 1 and thus the ICB conson… | |
357 * followed by an ICB extend, where we jump | |
358 * to level 2, or we are at level 2 and just wit… | |
359 * more ICB extends, staying at level 2. | |
360 */ | |
361 state.gb9c_level = 2; | |
362 } else if ((state.gb9c_level == 1 || state.gb9c_level ==… | |
363 (cp0_prop == CHAR_BREAK_PROP_ICB_LINKER || | |
364 cp0_prop == | |
365 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINK… | |
366 /* | |
367 * witnessing an ICB linker directly lifts us up… | |
368 * level 3 | |
369 */ | |
370 state.gb9c_level = 3; | |
371 } else if (state.gb9c_level == 3 && | |
372 (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND || | |
373 cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXT… | |
374 cp0_prop == | |
375 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTE… | |
376 cp0_prop == CHAR_BREAK_PROP_ICB_LINKER || | |
377 cp0_prop == | |
378 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINK… | |
379 /* | |
380 * we stay at level 3 when we observe either ICB | |
381 * extends or linkers | |
382 */ | |
383 state.gb9c_level = 3; | |
384 } else { | |
385 /* | |
386 * the sequence has collapsed, but it could be | |
387 * that the left property is ICB consonant, which | |
388 * means that we jump right back to level 1 inst… | |
389 * of 0 | |
390 */ | |
391 if (cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) { | |
392 state.gb9c_level = 1; | |
393 } else { | |
394 state.gb9c_level = 0; | |
395 } | |
396 } | |
397 | |
398 /* | |
399 * Apply grapheme cluster breaking algorithm (UAX #29), … | |
400 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Bou… | |
401 */ | |
402 notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_p… | |
403 (state.gb9c_level == 3 && | |
404 cp1_prop == CHAR_BREAK_PROP_ICB_CONSONANT) || | |
405 (dont_break_gb11[cp0_prop + | |
406 state.gb11_flag * | |
407 NUM_CHAR_BREAK_PROPS… | |
408 (UINT32_C(1) << cp1_prop)) || | |
409 (dont_break_gb12_13[cp0_prop + | |
410 state.gb12_13_flag * | |
411 NUM_CHAR_BREAK_PR… | |
412 (UINT32_C(1) << cp1_prop)); | |
413 | |
414 /* update or reset flags (when we have a break) */ | |
415 if (likely(!notbreak)) { | |
416 state.gb11_flag = state.gb12_13_flag = false; | |
417 } | |
418 | |
419 state_serialize(&state, s); | |
420 } else { | |
421 cp0_prop = get_break_prop(cp0); | |
422 cp1_prop = get_break_prop(cp1); | |
423 | |
424 /* | |
425 * Apply grapheme cluster breaking algorithm (UAX #29), … | |
426 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Bou… | |
427 * | |
428 * Given we have no state, this behaves as if the state-… | |
429 * were all set to false | |
430 */ | |
431 notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_p… | |
432 (dont_break_gb11[cp0_prop] & | |
433 (UINT32_C(1) << cp1_prop)) || | |
434 (dont_break_gb12_13[cp0_prop] & | |
435 (UINT32_C(1) << cp1_prop)); | |
436 } | |
437 | |
438 return !notbreak; | |
439 } | |
440 | |
441 static size_t | |
442 next_character_break(HERODOTUS_READER *r) | |
443 { | |
444 uint_least16_t state = 0; | |
445 uint_least32_t cp0 = 0, cp1 = 0; | |
446 | |
447 for (herodotus_read_codepoint(r, true, &cp0); | |
448 herodotus_read_codepoint(r, false, &cp1) == | |
449 HERODOTUS_STATUS_SUCCESS; | |
450 herodotus_read_codepoint(r, true, &cp0)) { | |
451 if (grapheme_is_character_break(cp0, cp1, &state)) { | |
452 break; | |
453 } | |
454 } | |
455 | |
456 return herodotus_reader_number_read(r); | |
457 } | |
458 | |
459 size_t | |
460 grapheme_next_character_break(const uint_least32_t *str, size_t len) | |
461 { | |
462 HERODOTUS_READER r; | |
463 | |
464 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); | |
465 | |
466 return next_character_break(&r); | |
467 } | |
468 | |
469 size_t | |
470 grapheme_next_character_break_utf8(const char *str, size_t len) | |
471 { | |
472 HERODOTUS_READER r; | |
473 | |
474 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); | |
475 | |
476 return next_character_break(&r); | |
477 } |