Introduction
Introduction Statistics Contact Development Disclaimer Help
line.c - libgrapheme - unicode string library
git clone git://git.suckless.org/libgrapheme
Log
Files
Refs
README
LICENSE
---
line.c (14397B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stdbool.h>
3 #include <stddef.h>
4
5 #include "../gen/line.h"
6 #include "../grapheme.h"
7 #include "util.h"
8
9 static inline enum line_break_property
10 get_break_prop(uint_least32_t cp)
11 {
12 if (likely(cp <= UINT32_C(0x10FFFF))) {
13 return (enum line_break_property)
14 line_break_minor[line_break_major[cp >> 8] +
15 (cp & 0xff)];
16 } else {
17 return LINE_BREAK_PROP_AL;
18 }
19 }
20
/*
 * Find the next line break opportunity in the input behind reader r,
 * following the Unicode line breaking algorithm (UAX #14) with the
 * example-7 tailoring of LB25.
 *
 * Every loop iteration inspects one candidate break position:
 * cp0_prop is the property of the codepoint to the left of that
 * position and cp1_prop the property of the codepoint to its right.
 * First the retention states (the last_non_*_prop variables,
 * lb21a_flag, ri_even and lb25_level) are updated for cp0_prop, then
 * the rules LB4 to LB31 are tried in order. A rule that forbids a
 * break at this position uses 'continue' to move on to the next
 * position; a rule that demands or permits a break uses 'break' to
 * leave the loop. The loop also ends as soon as the read of the right
 * codepoint does not return HERODOTUS_STATUS_SUCCESS (that read passes
 * 'false' as second argument, presumably a non-advancing peek --
 * confirm in util.h).
 *
 * Returns herodotus_reader_number_read(r) as observed after the loop
 * (presumably the amount of input consumed up to the break position,
 * counted in the reader's own units -- confirm in util.h).
 */
21 static size_t
22 next_line_break(HERODOTUS_READER *r)
23 {
/*
 * tmp is a scratch copy of the reader that is only used for the
 * look-ahead in LB25. lb25_level, lb21a_flag and ri_even carry state
 * from one candidate position to the next; their meaning is explained
 * where they are updated further down.
 */
24 HERODOTUS_READER tmp;
25 enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_…
26 last_non_sp_prop, last_non_sp_cm_or_zwj_prop;
27 uint_least32_t cp;
28 uint_least8_t lb25_level = 0;
29 bool lb21a_flag = false, ri_even = true;
30
31 /*
32 * Apply line breaking algorithm (UAX #14), see
33 * https://unicode.org/reports/tr14/#Algorithm and tailoring
34 * https://unicode.org/reports/tr14/#Examples (example 7),
35 * given the automatic test-cases implement this example for
36 * better number handling.
37 *
38 */
39
40 /*
41 * Initialize the different properties such that we have
42 * a good state after the state-update in the loop
43 */
44 last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB…
45 last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_P…
46
47 for (herodotus_read_codepoint(r, true, &cp),
48 cp0_prop = get_break_prop(cp);
49 herodotus_read_codepoint(r, false, &cp) ==
50 HERODOTUS_STATUS_SUCCESS;
51 herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop…
52 /* get property of the right codepoint */
53 cp1_prop = get_break_prop(cp);
54
55 /* update retention-states */
56
57 /*
58 * store the last observed non-CM-or-ZWJ-property for
59 * LB9 and following.
60 */
61 if (cp0_prop != LINE_BREAK_PROP_CM &&
62 cp0_prop != LINE_BREAK_PROP_ZWJ) {
63 /*
64 * check if the property we are overwriting now …
65 * HL. If so, we set the LB21a-flag which depend…
66 * this knowledge.
67 */
68 lb21a_flag =
69 (last_non_cm_or_zwj_prop == LINE_BREAK_P…
70
71 /* check regional indicator state */
72 if (cp0_prop == LINE_BREAK_PROP_RI) {
73 /*
74 * The property we just shifted in is
75 * a regional indicator, increasing the
76 * number of consecutive RIs on the left
77 * side of the breakpoint by one, changi…
78 * the oddness.
79 *
80 */
81 ri_even = !ri_even;
82 } else {
83 /*
84 * We saw no regional indicator, so the
85 * number of consecutive RIs on the left
86 * side of the breakpoint is zero, which
87 * is an even number.
88 *
89 */
90 ri_even = true;
91 }
92
93 /*
94 * Here comes a bit of magic. The tailored rule
95 * LB25 (using example 7) has a very complicated
96 * left-hand-side-rule of the form
97 *
98 * NU (NU | SY | IS)* (CL | CP)?
99 *
100 * but instead of backtracking, we keep the state
101 * as some kind of "power level" in the variable
102 *
103 * lb25_level
104 *
105 * that goes from 0 to 3
106 *
107 * 0: we are not in the sequence
108 * 1: we have one NU to the left of the middle
109 * spot
110 * 2: we have one NU and one or more (NU | SY |…
111 * to the left of the middle spot
112 * 3: we have one NU, zero or more (NU | SY | I…
113 * and one (CL | CP) to the left of the midd…
114 * spot
115 */
116 if ((lb25_level == 0 || lb25_level == 1) &&
117 cp0_prop == LINE_BREAK_PROP_NU) {
118 /* sequence has begun */
119 lb25_level = 1;
120 } else if ((lb25_level == 1 || lb25_level == 2) …
121 (cp0_prop == LINE_BREAK_PROP_NU ||
122 cp0_prop == LINE_BREAK_PROP_SY ||
123 cp0_prop == LINE_BREAK_PROP_IS)) {
124 /* (NU | SY | IS) sequence begins or con…
125 */
126 lb25_level = 2;
127 } else if (
128 (lb25_level == 1 || lb25_level == 2) &&
129 (cp0_prop == LINE_BREAK_PROP_CL ||
130 cp0_prop ==
131 LINE_BREAK_PROP_CP_WITHOUT_EAW_…
132 cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW…
133 /* CL or CP at the end of the sequence */
134 lb25_level = 3;
135 } else {
136 /* sequence broke */
137 lb25_level = 0;
138 }
139
140 last_non_cm_or_zwj_prop = cp0_prop;
141 }
142
143 /*
144 * store the last observed non-SP-property for LB8, LB14,
145 * LB15, LB16 and LB17. LB8 gets its own unskipped prope…
146 * whereas the others build on top of the CM-ZWJ-skipped
147 * properties as they come after LB9
148 */
149 if (cp0_prop != LINE_BREAK_PROP_SP) {
150 last_non_sp_prop = cp0_prop;
151 }
152 if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) {
153 last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_…
154 }
155
156 /* apply the algorithm */
157
158 /* LB4 */
159 if (cp0_prop == LINE_BREAK_PROP_BK) {
160 break;
161 }
162
163 /* LB5 */
164 if (cp0_prop == LINE_BREAK_PROP_CR &&
165 cp1_prop == LINE_BREAK_PROP_LF) {
166 continue;
167 }
168 if (cp0_prop == LINE_BREAK_PROP_CR ||
169 cp0_prop == LINE_BREAK_PROP_LF ||
170 cp0_prop == LINE_BREAK_PROP_NL) {
171 break;
172 }
173
174 /* LB6 */
175 if (cp1_prop == LINE_BREAK_PROP_BK ||
176 cp1_prop == LINE_BREAK_PROP_CR ||
177 cp1_prop == LINE_BREAK_PROP_LF ||
178 cp1_prop == LINE_BREAK_PROP_NL) {
179 continue;
180 }
181
182 /* LB7 */
183 if (cp1_prop == LINE_BREAK_PROP_SP ||
184 cp1_prop == LINE_BREAK_PROP_ZW) {
185 continue;
186 }
187
188 /* LB8 */
189 if (last_non_sp_prop == LINE_BREAK_PROP_ZW) {
190 break;
191 }
192
193 /* LB8a */
194 if (cp0_prop == LINE_BREAK_PROP_ZWJ) {
195 continue;
196 }
197
198 /* LB9 */
199 if ((cp0_prop != LINE_BREAK_PROP_BK &&
200 cp0_prop != LINE_BREAK_PROP_CR &&
201 cp0_prop != LINE_BREAK_PROP_LF &&
202 cp0_prop != LINE_BREAK_PROP_NL &&
203 cp0_prop != LINE_BREAK_PROP_SP &&
204 cp0_prop != LINE_BREAK_PROP_ZW) &&
205 (cp1_prop == LINE_BREAK_PROP_CM ||
206 cp1_prop == LINE_BREAK_PROP_ZWJ)) {
207 /*
208 * given we skip them, we don't break in such
209 * a sequence
210 */
211 continue;
212 }
213
214 /* LB10 is baked into the following rules */
215
216 /* LB11 */
217 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ ||
218 cp1_prop == LINE_BREAK_PROP_WJ) {
219 continue;
220 }
221
222 /* LB12 */
223 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) {
224 continue;
225 }
226
227 /* LB12a */
228 if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP &&
229 last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA &&
230 last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) &&
231 cp1_prop == LINE_BREAK_PROP_GL) {
232 continue;
233 }
234
235 /* LB13 (affected by tailoring for LB25, see example 7) …
236 if (cp1_prop == LINE_BREAK_PROP_EX ||
237 (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU &&
238 (cp1_prop == LINE_BREAK_PROP_CL ||
239 cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
240 cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF ||
241 cp1_prop == LINE_BREAK_PROP_IS ||
242 cp1_prop == LINE_BREAK_PROP_SY))) {
243 continue;
244 }
245
246 /* LB14 */
247 if (last_non_sp_cm_or_zwj_prop ==
248 LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
249 last_non_sp_cm_or_zwj_prop ==
250 LINE_BREAK_PROP_OP_WITH_EAW_HWF) {
251 continue;
252 }
253
254 /* LB15 */
255 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU &&
256 (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
257 cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) {
258 continue;
259 }
260
261 /* LB16 */
262 if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL ||
263 last_non_sp_cm_or_zwj_prop ==
264 LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
265 last_non_sp_cm_or_zwj_prop ==
266 LINE_BREAK_PROP_CP_WITH_EAW_HWF) &&
267 cp1_prop == LINE_BREAK_PROP_NS) {
268 continue;
269 }
270
271 /* LB17 */
272 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 &&
273 cp1_prop == LINE_BREAK_PROP_B2) {
274 continue;
275 }
276
277 /* LB18 */
278 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) {
279 break;
280 }
281
282 /* LB19 */
283 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU ||
284 cp1_prop == LINE_BREAK_PROP_QU) {
285 continue;
286 }
287
288 /* LB20 */
289 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB ||
290 cp1_prop == LINE_BREAK_PROP_CB) {
291 break;
292 }
293
294 /* LB21 */
295 if (cp1_prop == LINE_BREAK_PROP_BA ||
296 cp1_prop == LINE_BREAK_PROP_HY ||
297 cp1_prop == LINE_BREAK_PROP_NS ||
298 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) {
299 continue;
300 }
301
302 /* LB21a */
303 if (lb21a_flag &&
304 (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY ||
305 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) {
306 continue;
307 }
308
309 /* LB21b */
310 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY &&
311 cp1_prop == LINE_BREAK_PROP_HL) {
312 continue;
313 }
314
315 /* LB22 */
316 if (cp1_prop == LINE_BREAK_PROP_IN) {
317 continue;
318 }
319
320 /* LB23 */
321 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
322 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
323 cp1_prop == LINE_BREAK_PROP_NU) {
324 continue;
325 }
326 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU &&
327 (cp1_prop == LINE_BREAK_PROP_AL ||
328 cp1_prop == LINE_BREAK_PROP_HL)) {
329 continue;
330 }
331
332 /* LB23a */
333 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
334 (cp1_prop == LINE_BREAK_PROP_ID ||
335 cp1_prop == LINE_BREAK_PROP_EB ||
336 cp1_prop == LINE_BREAK_PROP_EM)) {
337 continue;
338 }
339 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID ||
340 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB ||
341 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) &&
342 cp1_prop == LINE_BREAK_PROP_PO) {
343 continue;
344 }
345
346 /* LB24 */
347 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
348 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) &&
349 (cp1_prop == LINE_BREAK_PROP_AL ||
350 cp1_prop == LINE_BREAK_PROP_HL)) {
351 continue;
352 }
353 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
354 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
355 (cp1_prop == LINE_BREAK_PROP_PR ||
356 cp1_prop == LINE_BREAK_PROP_PO)) {
357 continue;
358 }
359
360 /* LB25 (tailored with example 7) */
361 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
362 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) {
363 if (cp1_prop == LINE_BREAK_PROP_NU) {
364 continue;
365 }
366
367 /* this stupid rule is the reason why we cannot
368 * simply have a stateful break-detection between
369 * two adjacent codepoints as we have it with
370 * characters.
371 */
372 herodotus_reader_copy(r, &tmp);
373 herodotus_read_codepoint(&tmp, true, &cp);
374 if (herodotus_read_codepoint(&tmp, true, &cp) ==
375 HERODOTUS_STATUS_SUCCESS &&
376 (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_…
377 cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF…
378 cp1_prop == LINE_BREAK_PROP_HY)) {
379 if (get_break_prop(cp) == LINE_BREAK_PRO…
380 continue;
381 }
382 }
383 }
384 if ((last_non_cm_or_zwj_prop ==
385 LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
386 last_non_cm_or_zwj_prop ==
387 LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
388 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) &&
389 cp1_prop == LINE_BREAK_PROP_NU) {
390 continue;
391 }
392 if (lb25_level == 1 && (cp1_prop == LINE_BREAK_PROP_NU ||
393 cp1_prop == LINE_BREAK_PROP_SY ||
394 cp1_prop == LINE_BREAK_PROP_IS))…
395 continue;
396 }
397 if ((lb25_level == 1 || lb25_level == 2) &&
398 (cp1_prop == LINE_BREAK_PROP_NU ||
399 cp1_prop == LINE_BREAK_PROP_SY ||
400 cp1_prop == LINE_BREAK_PROP_IS ||
401 cp1_prop == LINE_BREAK_PROP_CL ||
402 cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
403 cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
404 continue;
405 }
406 if ((lb25_level == 1 || lb25_level == 2 || lb25_level ==…
407 (cp1_prop == LINE_BREAK_PROP_PO ||
408 cp1_prop == LINE_BREAK_PROP_PR)) {
409 continue;
410 }
411
412 /* LB26 */
413 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL &&
414 (cp1_prop == LINE_BREAK_PROP_JL ||
415 cp1_prop == LINE_BREAK_PROP_JV ||
416 cp1_prop == LINE_BREAK_PROP_H2 ||
417 cp1_prop == LINE_BREAK_PROP_H3)) {
418 continue;
419 }
420 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
421 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) &&
422 (cp1_prop == LINE_BREAK_PROP_JV ||
423 cp1_prop == LINE_BREAK_PROP_JT)) {
424 continue;
425 }
426 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
427 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
428 cp1_prop == LINE_BREAK_PROP_JT) {
429 continue;
430 }
431
432 /* LB27 */
433 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL ||
434 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
435 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
436 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 ||
437 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
438 cp1_prop == LINE_BREAK_PROP_PO) {
439 continue;
440 }
441 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
442 (cp1_prop == LINE_BREAK_PROP_JL ||
443 cp1_prop == LINE_BREAK_PROP_JV ||
444 cp1_prop == LINE_BREAK_PROP_JT ||
445 cp1_prop == LINE_BREAK_PROP_H2 ||
446 cp1_prop == LINE_BREAK_PROP_H3)) {
447 continue;
448 }
449
450 /* LB28 */
451 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
452 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
453 (cp1_prop == LINE_BREAK_PROP_AL ||
454 cp1_prop == LINE_BREAK_PROP_HL)) {
455 continue;
456 }
457
458 /* LB29 */
459 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS &&
460 (cp1_prop == LINE_BREAK_PROP_AL ||
461 cp1_prop == LINE_BREAK_PROP_HL)) {
462 continue;
463 }
464
465 /* LB30 */
466 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
467 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL ||
468 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) &&
469 cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) {
470 continue;
471 }
472 if (last_non_cm_or_zwj_prop ==
473 LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF &&
474 (cp1_prop == LINE_BREAK_PROP_AL ||
475 cp1_prop == LINE_BREAK_PROP_HL ||
476 cp1_prop == LINE_BREAK_PROP_NU)) {
477 continue;
478 }
479
480 /* LB30a */
481 if (!ri_even && last_non_cm_or_zwj_prop == LINE_BREAK_PR…
482 cp1_prop == LINE_BREAK_PROP_RI) {
483 continue;
484 }
485
486 /* LB30b */
487 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB &&
488 cp1_prop == LINE_BREAK_PROP_EM) {
489 continue;
490 }
491 if (last_non_cm_or_zwj_prop ==
492 LINE_BREAK_PROP_BOTH_CN_EXTPICT &&
493 cp1_prop == LINE_BREAK_PROP_EM) {
494 continue;
495 }
496
497 /* LB31 */
498 break;
499 }
500
/*
 * Reached both when a rule left the loop with 'break' and when the
 * loop condition failed because no further codepoint could be read.
 */
501 return herodotus_reader_number_read(r);
502 }
503
504 size_t
505 grapheme_next_line_break(const uint_least32_t *str, size_t len)
506 {
507 HERODOTUS_READER r;
508
509 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
510
511 return next_line_break(&r);
512 }
513
514 size_t
515 grapheme_next_line_break_utf8(const char *str, size_t len)
516 {
517 HERODOTUS_READER r;
518
519 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
520
521 return next_line_break(&r);
522 }
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.