line.c - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
line.c (14397B) | |
--- | |
1 /* See LICENSE file for copyright and license details. */ | |
2 #include <stdbool.h> | |
3 #include <stddef.h> | |
4 | |
5 #include "../gen/line.h" | |
6 #include "../grapheme.h" | |
7 #include "util.h" | |
8 | |
9 static inline enum line_break_property | |
10 get_break_prop(uint_least32_t cp) | |
11 { | |
12 if (likely(cp <= UINT32_C(0x10FFFF))) { | |
13 return (enum line_break_property) | |
14 line_break_minor[line_break_major[cp >> 8] + | |
15 (cp & 0xff)]; | |
16 } else { | |
17 return LINE_BREAK_PROP_AL; | |
18 } | |
19 } | |
20 | |
/*
 * Search the data behind the reader r for the next line break
 * opportunity by walking the UAX #14 rules LB4 to LB31 in order
 * (using the LB25 tailoring mentioned in the comment further down).
 *
 * The loop header first reads one codepoint and stores its property
 * as the left-hand side (cp0_prop). Every iteration then reads a
 * codepoint with the flag argument set to false to obtain the
 * right-hand side (cp1_prop), refreshes the retained state
 * (last_non_*_prop, lb21a_flag, ri_even, lb25_level) and evaluates
 * the rules: 'continue' means "no break between cp0 and cp1, move
 * on by one codepoint", 'break' means "break here" and ends the
 * search. The search also ends as soon as the read in the loop
 * condition does not report HERODOTUS_STATUS_SUCCESS.
 *
 * NOTE(review): the boolean handed to herodotus_read_codepoint()
 * presumably selects between consuming (true) and merely peeking
 * (false) -- confirm against util.h.
 *
 * Returns herodotus_reader_number_read(r), presumably the number of
 * input units consumed up to the break -- TODO confirm.
 */
21 static size_t | |
22 next_line_break(HERODOTUS_READER *r) | |
23 { | |
24 HERODOTUS_READER tmp; | |
25 enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_… | |
26 last_non_sp_prop, last_non_sp_cm_or_zwj_prop; | |
27 uint_least32_t cp; | |
28 uint_least8_t lb25_level = 0; | |
29 bool lb21a_flag = false, ri_even = true; | |
30 | |
31 /* | |
32 * Apply line breaking algorithm (UAX #14), see | |
33 * https://unicode.org/reports/tr14/#Algorithm and tailoring | |
34 * https://unicode.org/reports/tr14/#Examples (example 7), | |
35 * given the automatic test-cases implement this example for | |
36 * better number handling. | |
37 * | |
38 */ | |
39 | |
40 /* | |
41 * Initialize the different properties such that we have | |
42 * a good state after the state-update in the loop | |
43 */ | |
44 last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB… | |
45 last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_P… | |
46 | |
47 for (herodotus_read_codepoint(r, true, &cp), | |
48 cp0_prop = get_break_prop(cp); | |
49 herodotus_read_codepoint(r, false, &cp) == | |
50 HERODOTUS_STATUS_SUCCESS; | |
51 herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop… | |
52 /* get property of the right codepoint */ | |
53 cp1_prop = get_break_prop(cp); | |
54 | |
55 /* update retention-states */ | |
56 | |
57 /* | |
58 * store the last observed non-CM-or-ZWJ-property for | |
59 * LB9 and following. | |
60 */ | |
61 if (cp0_prop != LINE_BREAK_PROP_CM && | |
62 cp0_prop != LINE_BREAK_PROP_ZWJ) { | |
63 /* | |
64 * check if the property we are overwriting now … | |
65 * HL. If so, we set the LB21a-flag which depend… | |
66 * this knowledge. | |
67 */ | |
68 lb21a_flag = | |
69 (last_non_cm_or_zwj_prop == LINE_BREAK_P… | |
70 | |
71 /* check regional indicator state */ | |
72 if (cp0_prop == LINE_BREAK_PROP_RI) { | |
73 /* | |
74 * The property we just shifted in is | |
75 * a regional indicator, increasing the | |
76 * number of consecutive RIs on the left | |
77 * side of the breakpoint by one, changi… | |
78 * the oddness. | |
79 * | |
80 */ | |
81 ri_even = !ri_even; | |
82 } else { | |
83 /* | |
84 * We saw no regional indicator, so the | |
85 * number of consecutive RIs on the left | |
86 * side of the breakpoint is zero, which | |
87 * is an even number. | |
88 * | |
89 */ | |
90 ri_even = true; | |
91 } | |
92 | |
93 /* | |
94 * Here comes a bit of magic. The tailored rule | |
95 * LB25 (using example 7) has a very complicated | |
96 * left-hand-side-rule of the form | |
97 * | |
98 * NU (NU | SY | IS)* (CL | CP)? | |
99 * | |
100 * but instead of backtracking, we keep the state | |
101 * as some kind of "power level" in the variable | |
102 * | |
103 * lb25_level | |
104 * | |
105 * that goes from 0 to 3 | |
106 * | |
107 * 0: we are not in the sequence | |
108 * 1: we have one NU to the left of the middle | |
109 * spot | |
110 * 2: we have one NU and one or more (NU | SY |… | |
111 * to the left of the middle spot | |
112 * 3: we have one NU, zero or more (NU | SY | I… | |
113 * and one (CL | CP) to the left of the midd… | |
114 * spot | |
115 */ | |
116 if ((lb25_level == 0 || lb25_level == 1) && | |
117 cp0_prop == LINE_BREAK_PROP_NU) { | |
118 /* sequence has begun */ | |
119 lb25_level = 1; | |
120 } else if ((lb25_level == 1 || lb25_level == 2) … | |
121 (cp0_prop == LINE_BREAK_PROP_NU || | |
122 cp0_prop == LINE_BREAK_PROP_SY || | |
123 cp0_prop == LINE_BREAK_PROP_IS)) { | |
124 /* (NU | SY | IS) sequence begins or con… | |
125 */ | |
126 lb25_level = 2; | |
127 } else if ( | |
128 (lb25_level == 1 || lb25_level == 2) && | |
129 (cp0_prop == LINE_BREAK_PROP_CL || | |
130 cp0_prop == | |
131 LINE_BREAK_PROP_CP_WITHOUT_EAW_… | |
132 cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW… | |
133 /* CL or CP at the end of the sequence */ | |
134 lb25_level = 3; | |
135 } else { | |
136 /* sequence broke */ | |
137 lb25_level = 0; | |
138 } | |
139 | |
140 last_non_cm_or_zwj_prop = cp0_prop; | |
141 } | |
142 | |
143 /* | |
144 * store the last observed non-SP-property for LB8, LB14, | |
145 * LB15, LB16 and LB17. LB8 gets its own unskipped prope… | |
146 * whereas the others build on top of the CM-ZWJ-skipped | |
147 * properties as they come after LB9 | |
148 */ | |
149 if (cp0_prop != LINE_BREAK_PROP_SP) { | |
150 last_non_sp_prop = cp0_prop; | |
151 } | |
152 if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) { | |
153 last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_… | |
154 } | |
155 | |
156 /* apply the algorithm */ | |
157 | |
158 /* LB4 */ | |
159 if (cp0_prop == LINE_BREAK_PROP_BK) { | |
160 break; | |
161 } | |
162 | |
163 /* LB5 */ | |
164 if (cp0_prop == LINE_BREAK_PROP_CR && | |
165 cp1_prop == LINE_BREAK_PROP_LF) { | |
166 continue; | |
167 } | |
168 if (cp0_prop == LINE_BREAK_PROP_CR || | |
169 cp0_prop == LINE_BREAK_PROP_LF || | |
170 cp0_prop == LINE_BREAK_PROP_NL) { | |
171 break; | |
172 } | |
173 | |
174 /* LB6 */ | |
175 if (cp1_prop == LINE_BREAK_PROP_BK || | |
176 cp1_prop == LINE_BREAK_PROP_CR || | |
177 cp1_prop == LINE_BREAK_PROP_LF || | |
178 cp1_prop == LINE_BREAK_PROP_NL) { | |
179 continue; | |
180 } | |
181 | |
182 /* LB7 */ | |
183 if (cp1_prop == LINE_BREAK_PROP_SP || | |
184 cp1_prop == LINE_BREAK_PROP_ZW) { | |
185 continue; | |
186 } | |
187 | |
188 /* LB8 */ | |
189 if (last_non_sp_prop == LINE_BREAK_PROP_ZW) { | |
190 break; | |
191 } | |
192 | |
193 /* LB8a */ | |
194 if (cp0_prop == LINE_BREAK_PROP_ZWJ) { | |
195 continue; | |
196 } | |
197 | |
198 /* LB9 */ | |
199 if ((cp0_prop != LINE_BREAK_PROP_BK && | |
200 cp0_prop != LINE_BREAK_PROP_CR && | |
201 cp0_prop != LINE_BREAK_PROP_LF && | |
202 cp0_prop != LINE_BREAK_PROP_NL && | |
203 cp0_prop != LINE_BREAK_PROP_SP && | |
204 cp0_prop != LINE_BREAK_PROP_ZW) && | |
205 (cp1_prop == LINE_BREAK_PROP_CM || | |
206 cp1_prop == LINE_BREAK_PROP_ZWJ)) { | |
207 /* | |
208 * given we skip them, we don't break in such | |
209 * a sequence | |
210 */ | |
211 continue; | |
212 } | |
213 | |
214 /* LB10 is baked into the following rules */ | |
215 | |
216 /* LB11 */ | |
217 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ || | |
218 cp1_prop == LINE_BREAK_PROP_WJ) { | |
219 continue; | |
220 } | |
221 | |
222 /* LB12 */ | |
223 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) { | |
224 continue; | |
225 } | |
226 | |
227 /* LB12a */ | |
228 if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP && | |
229 last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA && | |
230 last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) && | |
231 cp1_prop == LINE_BREAK_PROP_GL) { | |
232 continue; | |
233 } | |
234 | |
235 /* LB13 (affected by tailoring for LB25, see example 7) … | |
236 if (cp1_prop == LINE_BREAK_PROP_EX || | |
237 (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU && | |
238 (cp1_prop == LINE_BREAK_PROP_CL || | |
239 cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || | |
240 cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF || | |
241 cp1_prop == LINE_BREAK_PROP_IS || | |
242 cp1_prop == LINE_BREAK_PROP_SY))) { | |
243 continue; | |
244 } | |
245 | |
246 /* LB14 */ | |
247 if (last_non_sp_cm_or_zwj_prop == | |
248 LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || | |
249 last_non_sp_cm_or_zwj_prop == | |
250 LINE_BREAK_PROP_OP_WITH_EAW_HWF) { | |
251 continue; | |
252 } | |
253 | |
254 /* LB15 */ | |
255 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU && | |
256 (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || | |
257 cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) { | |
258 continue; | |
259 } | |
260 | |
261 /* LB16 */ | |
262 if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL || | |
263 last_non_sp_cm_or_zwj_prop == | |
264 LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || | |
265 last_non_sp_cm_or_zwj_prop == | |
266 LINE_BREAK_PROP_CP_WITH_EAW_HWF) && | |
267 cp1_prop == LINE_BREAK_PROP_NS) { | |
268 continue; | |
269 } | |
270 | |
271 /* LB17 */ | |
272 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 && | |
273 cp1_prop == LINE_BREAK_PROP_B2) { | |
274 continue; | |
275 } | |
276 | |
277 /* LB18 */ | |
278 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) { | |
279 break; | |
280 } | |
281 | |
282 /* LB19 */ | |
283 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU || | |
284 cp1_prop == LINE_BREAK_PROP_QU) { | |
285 continue; | |
286 } | |
287 | |
288 /* LB20 */ | |
289 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB || | |
290 cp1_prop == LINE_BREAK_PROP_CB) { | |
291 break; | |
292 } | |
293 | |
294 /* LB21 */ | |
295 if (cp1_prop == LINE_BREAK_PROP_BA || | |
296 cp1_prop == LINE_BREAK_PROP_HY || | |
297 cp1_prop == LINE_BREAK_PROP_NS || | |
298 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) { | |
299 continue; | |
300 } | |
301 | |
302 /* LB21a */ | |
303 if (lb21a_flag && | |
304 (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY || | |
305 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) { | |
306 continue; | |
307 } | |
308 | |
309 /* LB21b */ | |
310 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY && | |
311 cp1_prop == LINE_BREAK_PROP_HL) { | |
312 continue; | |
313 } | |
314 | |
315 /* LB22 */ | |
316 if (cp1_prop == LINE_BREAK_PROP_IN) { | |
317 continue; | |
318 } | |
319 | |
320 /* LB23 */ | |
321 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || | |
322 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) && | |
323 cp1_prop == LINE_BREAK_PROP_NU) { | |
324 continue; | |
325 } | |
326 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU && | |
327 (cp1_prop == LINE_BREAK_PROP_AL || | |
328 cp1_prop == LINE_BREAK_PROP_HL)) { | |
329 continue; | |
330 } | |
331 | |
332 /* LB23a */ | |
333 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR && | |
334 (cp1_prop == LINE_BREAK_PROP_ID || | |
335 cp1_prop == LINE_BREAK_PROP_EB || | |
336 cp1_prop == LINE_BREAK_PROP_EM)) { | |
337 continue; | |
338 } | |
339 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID || | |
340 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB || | |
341 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) && | |
342 cp1_prop == LINE_BREAK_PROP_PO) { | |
343 continue; | |
344 } | |
345 | |
346 /* LB24 */ | |
347 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR || | |
348 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) && | |
349 (cp1_prop == LINE_BREAK_PROP_AL || | |
350 cp1_prop == LINE_BREAK_PROP_HL)) { | |
351 continue; | |
352 } | |
353 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || | |
354 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) && | |
355 (cp1_prop == LINE_BREAK_PROP_PR || | |
356 cp1_prop == LINE_BREAK_PROP_PO)) { | |
357 continue; | |
358 } | |
359 | |
360 /* LB25 (tailored with example 7) */ | |
361 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR || | |
362 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) { | |
363 if (cp1_prop == LINE_BREAK_PROP_NU) { | |
364 continue; | |
365 } | |
366 | |
367 /* this stupid rule is the reason why we cannot | |
368 * simply have a stateful break-detection between | |
369 * two adjacent codepoints as we have it with | |
370 * characters. | |
371 */ | |
/*
 * Look one codepoint past cp1 on a copy of the reader (tmp)
 * so that r itself is not moved. This overwrites cp with the
 * looked-ahead codepoint; cp is assigned again by the reads
 * in the loop header before its next use.
 */
372 herodotus_reader_copy(r, &tmp); | |
373 herodotus_read_codepoint(&tmp, true, &cp); | |
374 if (herodotus_read_codepoint(&tmp, true, &cp) == | |
375 HERODOTUS_STATUS_SUCCESS && | |
376 (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_… | |
377 cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF… | |
378 cp1_prop == LINE_BREAK_PROP_HY)) { | |
379 if (get_break_prop(cp) == LINE_BREAK_PRO… | |
380 continue; | |
381 } | |
382 } | |
383 } | |
384 if ((last_non_cm_or_zwj_prop == | |
385 LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF || | |
386 last_non_cm_or_zwj_prop == | |
387 LINE_BREAK_PROP_OP_WITH_EAW_HWF || | |
388 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) && | |
389 cp1_prop == LINE_BREAK_PROP_NU) { | |
390 continue; | |
391 } | |
/*
 * NOTE(review): the following check (level 1 with NU, SY or IS
 * on the right) looks like a subset of the check after it, which
 * also continues -- it seems redundant but is kept as is.
 */
392 if (lb25_level == 1 && (cp1_prop == LINE_BREAK_PROP_NU || | |
393 cp1_prop == LINE_BREAK_PROP_SY || | |
394 cp1_prop == LINE_BREAK_PROP_IS))… | |
395 continue; | |
396 } | |
397 if ((lb25_level == 1 || lb25_level == 2) && | |
398 (cp1_prop == LINE_BREAK_PROP_NU || | |
399 cp1_prop == LINE_BREAK_PROP_SY || | |
400 cp1_prop == LINE_BREAK_PROP_IS || | |
401 cp1_prop == LINE_BREAK_PROP_CL || | |
402 cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF || | |
403 cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) { | |
404 continue; | |
405 } | |
406 if ((lb25_level == 1 || lb25_level == 2 || lb25_level ==… | |
407 (cp1_prop == LINE_BREAK_PROP_PO || | |
408 cp1_prop == LINE_BREAK_PROP_PR)) { | |
409 continue; | |
410 } | |
411 | |
412 /* LB26 */ | |
413 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL && | |
414 (cp1_prop == LINE_BREAK_PROP_JL || | |
415 cp1_prop == LINE_BREAK_PROP_JV || | |
416 cp1_prop == LINE_BREAK_PROP_H2 || | |
417 cp1_prop == LINE_BREAK_PROP_H3)) { | |
418 continue; | |
419 } | |
420 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV || | |
421 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) && | |
422 (cp1_prop == LINE_BREAK_PROP_JV || | |
423 cp1_prop == LINE_BREAK_PROP_JT)) { | |
424 continue; | |
425 } | |
426 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT || | |
427 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) && | |
428 cp1_prop == LINE_BREAK_PROP_JT) { | |
429 continue; | |
430 } | |
431 | |
432 /* LB27 */ | |
433 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL || | |
434 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV || | |
435 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT || | |
436 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 || | |
437 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) && | |
438 cp1_prop == LINE_BREAK_PROP_PO) { | |
439 continue; | |
440 } | |
441 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR && | |
442 (cp1_prop == LINE_BREAK_PROP_JL || | |
443 cp1_prop == LINE_BREAK_PROP_JV || | |
444 cp1_prop == LINE_BREAK_PROP_JT || | |
445 cp1_prop == LINE_BREAK_PROP_H2 || | |
446 cp1_prop == LINE_BREAK_PROP_H3)) { | |
447 continue; | |
448 } | |
449 | |
450 /* LB28 */ | |
451 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || | |
452 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) && | |
453 (cp1_prop == LINE_BREAK_PROP_AL || | |
454 cp1_prop == LINE_BREAK_PROP_HL)) { | |
455 continue; | |
456 } | |
457 | |
458 /* LB29 */ | |
459 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS && | |
460 (cp1_prop == LINE_BREAK_PROP_AL || | |
461 cp1_prop == LINE_BREAK_PROP_HL)) { | |
462 continue; | |
463 } | |
464 | |
465 /* LB30 */ | |
466 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL || | |
467 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL || | |
468 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) && | |
469 cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) { | |
470 continue; | |
471 } | |
472 if (last_non_cm_or_zwj_prop == | |
473 LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF && | |
474 (cp1_prop == LINE_BREAK_PROP_AL || | |
475 cp1_prop == LINE_BREAK_PROP_HL || | |
476 cp1_prop == LINE_BREAK_PROP_NU)) { | |
477 continue; | |
478 } | |
479 | |
480 /* LB30a */ | |
481 if (!ri_even && last_non_cm_or_zwj_prop == LINE_BREAK_PR… | |
482 cp1_prop == LINE_BREAK_PROP_RI) { | |
483 continue; | |
484 } | |
485 | |
486 /* LB30b */ | |
487 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB && | |
488 cp1_prop == LINE_BREAK_PROP_EM) { | |
489 continue; | |
490 } | |
491 if (last_non_cm_or_zwj_prop == | |
492 LINE_BREAK_PROP_BOTH_CN_EXTPICT && | |
493 cp1_prop == LINE_BREAK_PROP_EM) { | |
494 continue; | |
495 } | |
496 | |
497 /* LB31 */ | |
498 break; | |
499 } | |
500 | |
/* report how far the reader got before the loop ended */
501 return herodotus_reader_number_read(r); | |
502 } | |
503 | |
504 size_t | |
505 grapheme_next_line_break(const uint_least32_t *str, size_t len) | |
506 { | |
507 HERODOTUS_READER r; | |
508 | |
509 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); | |
510 | |
511 return next_line_break(&r); | |
512 } | |
513 | |
514 size_t | |
515 grapheme_next_line_break_utf8(const char *str, size_t len) | |
516 { | |
517 HERODOTUS_READER r; | |
518 | |
519 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); | |
520 | |
521 return next_line_break(&r); | |
522 } |