Introduction
Introduction Statistics Contact Development Disclaimer Help
character.c - libgrapheme - unicode string library
git clone git://git.suckless.org/libgrapheme
Log
Files
Refs
README
LICENSE
---
character.c (18706B)
---
1 #include <stdio.h>
2
3 /* See LICENSE file for copyright and license details. */
4 #include <limits.h>
5 #include <stdbool.h>
6 #include <stddef.h>
7
8 #include "../gen/character.h"
9 #include "../grapheme.h"
10 #include "util.h"
11
/*
 * Unpacked working form of the caller-held 16-bit break state.
 * state_serialize() and state_deserialize() convert between this
 * struct and its packed uint_least16_t representation.
 */
struct character_break_state {
	/* break property remembered from the previous right-hand codepoint */
	uint_least8_t prop;
	/* true once prop holds a remembered property */
	bool prop_set;
	/* progress flag for rule GB11, driven by flag_update_gb11 */
	bool gb11_flag;
	/* progress flag for rules GB12/GB13, driven by flag_update_gb12_13 */
	bool gb12_13_flag;
	/* progress level (0..3) of the GB9c prefix; packed into two bits */
	uint_least8_t gb9c_level;
};
19
20 static const uint_least32_t dont_break[NUM_CHAR_BREAK_PROPS] = {
21 [CHAR_BREAK_PROP_OTHER] =
22 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
23 UINT32_C(1)
24 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
25 UINT32_C(1)
26 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
27 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
28 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
29 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
30 [CHAR_BREAK_PROP_ICB_CONSONANT] =
31 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
32 UINT32_C(1)
33 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
34 UINT32_C(1)
35 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
36 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
37 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
38 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
39 [CHAR_BREAK_PROP_ICB_EXTEND] =
40 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
41 UINT32_C(1)
42 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
43 UINT32_C(1)
44 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
45 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
46 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
47 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
48 [CHAR_BREAK_PROP_ICB_LINKER] =
49 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
50 UINT32_C(1)
51 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
52 UINT32_C(1)
53 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
54 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
55 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
56 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
57 [CHAR_BREAK_PROP_CR] = UINT32_C(1) << CHAR_BREAK_PROP_LF, /* …
58 [CHAR_BREAK_PROP_EXTEND] =
59 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
60 UINT32_C(1)
61 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
62 UINT32_C(1)
63 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
64 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
65 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
66 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
67 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND] =
68 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
69 UINT32_C(1)
70 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
71 UINT32_C(1)
72 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
73 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
74 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
75 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
76 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER] =
77 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
78 UINT32_C(1)
79 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
80 UINT32_C(1)
81 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
82 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
83 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
84 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
85 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
86 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
87 UINT32_C(1)
88 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
89 UINT32_C(1)
90 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
91 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
92 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
93 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
94 [CHAR_BREAK_PROP_HANGUL_L] =
95 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */
96 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */
97 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */
98 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */
99 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
100 UINT32_C(1)
101 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
102 UINT32_C(1)
103 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
104 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
105 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
106 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
107 [CHAR_BREAK_PROP_HANGUL_V] =
108 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
109 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
110 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
111 UINT32_C(1)
112 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
113 UINT32_C(1)
114 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
115 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
116 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
117 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
118 [CHAR_BREAK_PROP_HANGUL_T] =
119 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
120 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
121 UINT32_C(1)
122 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
123 UINT32_C(1)
124 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
125 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
126 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
127 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
128 [CHAR_BREAK_PROP_HANGUL_LV] =
129 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
130 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
131 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
132 UINT32_C(1)
133 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
134 UINT32_C(1)
135 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
136 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
137 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
138 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
139 [CHAR_BREAK_PROP_HANGUL_LVT] =
140 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
141 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
142 UINT32_C(1)
143 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
144 UINT32_C(1)
145 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
146 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
147 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
148 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
149 [CHAR_BREAK_PROP_PREPEND] =
150 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
151 UINT32_C(1)
152 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
153 UINT32_C(1)
154 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
155 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
156 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
157 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* …
158 (UINT32_C(0xFFFFFFFF) &
159 ~(UINT32_C(1) << CHAR_BREAK_PROP_CR |
160 UINT32_C(1) << CHAR_BREAK_PROP_LF |
161 UINT32_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */
162 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
163 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
164 UINT32_C(1)
165 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
166 UINT32_C(1)
167 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
168 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
169 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
170 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
171 [CHAR_BREAK_PROP_SPACINGMARK] =
172 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
173 UINT32_C(1)
174 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
175 UINT32_C(1)
176 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
177 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
178 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
179 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
180 [CHAR_BREAK_PROP_ZWJ] =
181 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
182 UINT32_C(1)
183 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
184 UINT32_C(1)
185 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
186 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
187 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
188 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
189 [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND] =
190 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
191 UINT32_C(1)
192 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
193 UINT32_C(1)
194 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* …
195 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* …
196 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
197 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* …
198
199 };
200 static const uint_least32_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] =…
201 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
202 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
203 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* …
204 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* …
205 UINT32_C(1)
206 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G…
207 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, /…
208 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
209 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
210 [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
211 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
212 [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
213 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
214 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
215 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
216 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
217 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
218 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
219 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
220 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
221 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
222 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
223 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
224 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER + NUM_CHAR_BREAK_PROPS] =
225 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
226 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
227 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
228 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
229 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
230 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
231 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
232 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND |
233 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
234 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
235 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER,
236 };
237 static const uint_least32_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
238 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
239 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
240 [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
241 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
242 };
243 static const uint_least32_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS…
244 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
245 UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
246 };
247 static const uint_least32_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS]…
248 [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
249 UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
250 };
251
252 static inline enum char_break_property
253 get_break_prop(uint_least32_t cp)
254 {
255 if (likely(cp <= UINT32_C(0x10FFFF))) {
256 return (enum char_break_property)
257 char_break_minor[char_break_major[cp >> 8] +
258 (cp & 0xFF)];
259 } else {
260 return CHAR_BREAK_PROP_OTHER;
261 }
262 }
263
264 static inline void
265 state_serialize(const struct character_break_state *in, uint_least16_t *…
266 {
267 *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 b…
268 (uint_least16_t)(((uint_least16_t)(in->prop_set))
269 << 8) | /* 9th bit */
270 (uint_least16_t)(((uint_least16_t)(in->gb11_flag))
271 << 9) | /* 10th bit */
272 (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag))
273 << 10) | /* 11th bit */
274 (uint_least16_t)(((uint_least16_t)(in->gb9c_level & 0x3))
275 << 11); /* 12th and 13th bit */
276 }
277
278 static inline void
279 state_deserialize(uint_least16_t in, struct character_break_state *out)
280 {
281 out->prop = in & UINT8_C(0xFF);
282 out->prop_set = in & (UINT16_C(1) << 8);
283 out->gb11_flag = in & (UINT16_C(1) << 9);
284 out->gb12_13_flag = in & (UINT16_C(1) << 10);
285 out->gb9c_level = (uint_least8_t)(in >> 11) & UINT8_C(0x3);
286 }
287
/*
 * Decide whether a grapheme cluster boundary lies between two adjacent
 * codepoints, cp0 on the left and cp1 on the right. Returns true when
 * there is a break and false otherwise.
 *
 * s optionally points to a packed 16-bit state that is carried from
 * one call to the next:
 *
 *  - With a non-NULL s, the state is unpacked, updated and packed
 *    again. If the state already carries a property (prop_set), that
 *    property is used for the left side and cp0 itself is not looked
 *    up. The property of cp1 is then stored for the next call, the
 *    GB11 and GB12/GB13 flags are advanced via their transition tables
 *    and the GB9c level is updated from the left property. Whenever
 *    the result is a break, both flags are cleared again.
 *
 *  - With a NULL s, both properties are looked up and only the
 *    flag-clear rows of the tables are consulted; GB9c is not
 *    evaluated on this path.
 *
 * NOTE(review): several statements in this listing are cut off with
 * a trailing ellipsis and the original line numbers are embedded in
 * the text; restore the function from the upstream repository before
 * compiling.
 */
288 bool
289 grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1,
290 uint_least16_t *s)
291 {
292 struct character_break_state state;
293 enum char_break_property cp0_prop, cp1_prop;
294 bool notbreak = false;
295
296 if (likely(s)) {
297 state_deserialize(*s, &state);
298
299 if (likely(state.prop_set)) {
300 cp0_prop = state.prop;
301 } else {
302 cp0_prop = get_break_prop(cp0);
303 }
304 cp1_prop = get_break_prop(cp1);
305
306 /* preserve prop of right codepoint for next iteration */
307 state.prop = (uint_least8_t)cp1_prop;
308 state.prop_set = true;
309
310 /* update flags */
311 state.gb11_flag =
312 flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS…
313 state.gb11_f…
314 UINT32_C(1) << cp1_prop;
315 state.gb12_13_flag =
316 flag_update_gb12_13[cp0_prop +
317 NUM_CHAR_BREAK_PROPS *
318 state.gb12_13_flag] &
319 UINT32_C(1) << cp1_prop;
320
321 /*
322 * update GB9c state, which deals with indic conjunct br…
323 * We want to detect the following prefix:
324 *
325 * ICB_CONSONANT
326 * [ICB_EXTEND ICB_LINKER]*
327 * ICB_LINKER
328 * [ICB_EXTEND ICB_LINKER]*
329 *
330 * This representation is not ideal: In reality, what is
331 * meant is that the prefix is a sequence of [ICB_EXTEND
332 * ICB_LINKER]*, following an ICB_CONSONANT, that contai…
333 * least one ICB_LINKER. We thus use the following equiv…
334 * representation that allows us to store the levels 0..…
335 * bits.
336 *
337 * ICB_CONSONANT -- Level 1
338 * ICB_EXTEND* -- Level 2
339 * ICB_LINKER -- Level 3
340 * [ICB_EXTEND ICB_LINKER]* -- Level 3
341 *
342 * The following chain of if-else-blocks is a bit redund…
343 * of course could be optimised, but this is kept as is …
344 * best readability.
345 */
346 if (state.gb9c_level == 0 &&
347 cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) {
348 /* the sequence has begun */
349 state.gb9c_level = 1;
350 } else if ((state.gb9c_level == 1 || state.gb9c_level ==…
351 (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND ||
352 cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXT…
353 cp0_prop ==
354 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTE…
355 /*
356 * either the level is 1 and thus the ICB conson…
357 * followed by an ICB extend, where we jump
358 * to level 2, or we are at level 2 and just wit…
359 * more ICB extends, staying at level 2.
360 */
361 state.gb9c_level = 2;
362 } else if ((state.gb9c_level == 1 || state.gb9c_level ==…
363 (cp0_prop == CHAR_BREAK_PROP_ICB_LINKER ||
364 cp0_prop ==
365 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINK…
366 /*
367 * witnessing an ICB linker directly lifts us up…
368 * level 3
369 */
370 state.gb9c_level = 3;
371 } else if (state.gb9c_level == 3 &&
372 (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND ||
373 cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXT…
374 cp0_prop ==
375 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTE…
376 cp0_prop == CHAR_BREAK_PROP_ICB_LINKER ||
377 cp0_prop ==
378 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINK…
379 /*
380 * we stay at level 3 when we observe either ICB
381 * extends or linkers
382 */
383 state.gb9c_level = 3;
384 } else {
385 /*
386 * the sequence has collapsed, but it could be
387 * that the left property is ICB consonant, which
388 * means that we jump right back to level 1 inst…
389 * of 0
390 */
391 if (cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) {
392 state.gb9c_level = 1;
393 } else {
394 state.gb9c_level = 0;
395 }
396 }
397
398 /*
399 * Apply grapheme cluster breaking algorithm (UAX #29), …
400 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Bou…
401 */
402 notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_p…
403 (state.gb9c_level == 3 &&
404 cp1_prop == CHAR_BREAK_PROP_ICB_CONSONANT) ||
405 (dont_break_gb11[cp0_prop +
406 state.gb11_flag *
407 NUM_CHAR_BREAK_PROPS…
408 (UINT32_C(1) << cp1_prop)) ||
409 (dont_break_gb12_13[cp0_prop +
410 state.gb12_13_flag *
411 NUM_CHAR_BREAK_PR…
412 (UINT32_C(1) << cp1_prop));
413
414 /* update or reset flags (when we have a break) */
415 if (likely(!notbreak)) {
416 state.gb11_flag = state.gb12_13_flag = false;
417 }
418
419 state_serialize(&state, s);
420 } else {
421 cp0_prop = get_break_prop(cp0);
422 cp1_prop = get_break_prop(cp1);
423
424 /*
425 * Apply grapheme cluster breaking algorithm (UAX #29), …
426 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Bou…
427 *
428 * Given we have no state, this behaves as if the state-…
429 * were all set to false
430 */
431 notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_p…
432 (dont_break_gb11[cp0_prop] &
433 (UINT32_C(1) << cp1_prop)) ||
434 (dont_break_gb12_13[cp0_prop] &
435 (UINT32_C(1) << cp1_prop));
436 }
437
438 return !notbreak;
439 }
440
441 static size_t
442 next_character_break(HERODOTUS_READER *r)
443 {
444 uint_least16_t state = 0;
445 uint_least32_t cp0 = 0, cp1 = 0;
446
447 for (herodotus_read_codepoint(r, true, &cp0);
448 herodotus_read_codepoint(r, false, &cp1) ==
449 HERODOTUS_STATUS_SUCCESS;
450 herodotus_read_codepoint(r, true, &cp0)) {
451 if (grapheme_is_character_break(cp0, cp1, &state)) {
452 break;
453 }
454 }
455
456 return herodotus_reader_number_read(r);
457 }
458
459 size_t
460 grapheme_next_character_break(const uint_least32_t *str, size_t len)
461 {
462 HERODOTUS_READER r;
463
464 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
465
466 return next_character_break(&r);
467 }
468
469 size_t
470 grapheme_next_character_break_utf8(const char *str, size_t len)
471 {
472 HERODOTUS_READER r;
473
474 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
475
476 return next_character_break(&r);
477 }
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.