Introduction
Introduction Statistics Contact Development Disclaimer Help
bidirectional.c - libgrapheme - unicode string library
git clone git://git.suckless.org/libgrapheme
Log
Files
Refs
README
LICENSE
---
bidirectional.c (48331B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stdbool.h>
3 #include <stddef.h>
4
5 #include "../gen/bidirectional.h"
6 #include "../grapheme.h"
7 #include "util.h"
8
9 #define MAX_DEPTH 125
10
11 enum state_type {
12 STATE_PROP, /* in 0..23, bidi_property */
13 STATE_PRESERVED_PROP, /* in 0..23, preserved bidi_prop for L1-r…
14 STATE_BRACKET_OFF, /* in 0..255, offset in bidi_bracket */
15 STATE_LEVEL, /* in 0..MAX_DEPTH+1=126, embedding level…
16 STATE_PARAGRAPH_LEVEL, /* in 0..1, paragraph embedding level */
17 STATE_VISITED, /* in 0..1, visited within isolating run …
18 };
19
20 static struct {
21 uint_least32_t filter_mask;
22 size_t mask_shift;
23 int_least16_t value_offset;
24 } state_lut[] = {
25 [STATE_PROP] = {
26 .filter_mask = 0x000001F, /* 00000000 00000000 00000000…
27 .mask_shift = 0,
28 .value_offset = 0,
29 },
30 [STATE_PRESERVED_PROP] = {
31 .filter_mask = 0x00003E0, /* 00000000 00000000 00000011…
32 .mask_shift = 5,
33 .value_offset = 0,
34 },
35 [STATE_BRACKET_OFF] = {
36 .filter_mask = 0x003FC00, /* 00000000 00000011 11111100…
37 .mask_shift = 10,
38 .value_offset = 0,
39 },
40 [STATE_LEVEL] = {
41 .filter_mask = 0x1FC0000, /* 00000001 11111100 00000000…
42 .mask_shift = 18,
43 .value_offset = -1,
44 },
45 [STATE_PARAGRAPH_LEVEL] = {
46 .filter_mask = 0x2000000, /* 00000010 00000000 00000000…
47 .mask_shift = 25,
48 .value_offset = 0,
49 },
50 [STATE_VISITED] = {
51 .filter_mask = 0x4000000, /* 00000100 00000000 00000000…
52 .mask_shift = 26,
53 .value_offset = 0,
54 },
55 };
56
57 static inline int_least16_t
58 get_state(enum state_type t, uint_least32_t input)
59 {
60 return (int_least16_t)((input & state_lut[t].filter_mask) >>
61 state_lut[t].mask_shift) +
62 state_lut[t].value_offset;
63 }
64
65 static inline void
66 set_state(enum state_type t, int_least16_t value, uint_least32_t *output)
67 {
68 *output &= ~state_lut[t].filter_mask;
69 *output |= ((uint_least32_t)(value - state_lut[t].value_offset)
70 << state_lut[t].mask_shift) &
71 state_lut[t].filter_mask;
72 }
73
74 struct isolate_runner {
75 uint_least32_t *buf;
76 size_t buflen;
77 size_t start;
78
79 struct {
80 size_t off;
81 } prev, cur, next;
82
83 enum bidi_property sos, eos;
84
85 uint_least8_t paragraph_level;
86 int_least8_t isolating_run_level;
87 };
88
89 static inline enum bidi_property
90 ir_get_previous_prop(const struct isolate_runner *ir)
91 {
92 return (ir->prev.off == SIZE_MAX) ?
93 ir->sos :
94 (uint_least8_t)get_state(STATE_PROP,
95 ir->buf[ir->prev.off]);
96 }
97
98 static inline enum bidi_property
99 ir_get_current_prop(const struct isolate_runner *ir)
100 {
101 return (uint_least8_t)get_state(STATE_PROP, ir->buf[ir->cur.off]…
102 }
103
104 static inline enum bidi_property
105 ir_get_next_prop(const struct isolate_runner *ir)
106 {
107 return (ir->next.off == SIZE_MAX) ?
108 ir->eos :
109 (uint_least8_t)get_state(STATE_PROP,
110 ir->buf[ir->next.off]);
111 }
112
113 static inline enum bidi_property
114 ir_get_current_preserved_prop(const struct isolate_runner *ir)
115 {
116 return (uint_least8_t)get_state(STATE_PRESERVED_PROP,
117 ir->buf[ir->cur.off]);
118 }
119
120 static inline int_least8_t
121 ir_get_current_level(const struct isolate_runner *ir)
122 {
123 return (int_least8_t)get_state(STATE_LEVEL, ir->buf[ir->cur.off]…
124 }
125
126 static inline const struct bracket *
127 ir_get_current_bracket_prop(const struct isolate_runner *ir)
128 {
129 return bidi_bracket +
130 (int_least8_t)get_state(STATE_BRACKET_OFF, ir->buf[ir->cu…
131 }
132
133 static void
134 ir_set_current_prop(const struct isolate_runner *ir, enum bidi_property …
135 {
136 set_state(STATE_PROP, (int_least16_t)prop, &(ir->buf[ir->cur.off…
137 }
138
139 static void
140 ir_init(uint_least32_t *buf, size_t buflen, size_t off,
141 uint_least8_t paragraph_level, bool within, struct isolate_runne…
142 {
143 size_t i;
144 int_least8_t sos_level;
145
146 /* initialize invariants */
147 ir->buf = buf;
148 ir->buflen = buflen;
149 ir->paragraph_level = paragraph_level;
150 ir->start = off;
151
152 /* advance off until we are at a non-removed character */
153 for (; off < buflen; off++) {
154 if (get_state(STATE_LEVEL, buf[off]) != -1) {
155 break;
156 }
157 }
158 if (off == buflen) {
159 /* we encountered no more non-removed character, termina…
160 ir->next.off = SIZE_MAX;
161 return;
162 }
163
164 /* set the isolating run level to that of the current offset */
165 ir->isolating_run_level =
166 (int_least8_t)get_state(STATE_LEVEL, buf[off]);
167
168 /* initialize sos and eos to dummy values */
169 ir->sos = ir->eos = NUM_BIDI_PROPS;
170
171 /*
172 * we write the information of the "current" state into next,
173 * so that the shift-in at the first advancement moves it in
174 * cur, as desired.
175 */
176 ir->next.off = off;
177
178 /*
179 * determine the previous state but store its offset in cur.off,
180 * given it's shifted in on the first advancement
181 */
182 ir->cur.off = SIZE_MAX;
183 for (i = off, sos_level = -1; i >= 1; i--) {
184 if (get_state(STATE_LEVEL, buf[i - 1]) != -1) {
185 /*
186 * we found a character that has not been
187 * removed in X9
188 */
189 sos_level = (int_least8_t)get_state(STATE_LEVEL,
190 buf[i - 1]);
191
192 if (within) {
193 /* we just take it */
194 ir->cur.off = i;
195 }
196
197 break;
198 }
199 }
200 if (sos_level == -1) {
201 /*
202 * there were no preceding non-removed characters, set
203 * sos-level to paragraph embedding level
204 */
205 sos_level = (int_least8_t)paragraph_level;
206 }
207
208 if (!within || ir->cur.off == SIZE_MAX) {
209 /*
210 * we are at the beginning of the sequence; initialize
211 * it faithfully according to the algorithm by looking
212 * at the sos-level
213 */
214 if (MAX(sos_level, ir->isolating_run_level) % 2 == 0) {
215 /* the higher level is even, set sos to L */
216 ir->sos = BIDI_PROP_L;
217 } else {
218 /* the higher level is odd, set sos to R */
219 ir->sos = BIDI_PROP_R;
220 }
221 }
222 }
223
224 static int
225 ir_advance(struct isolate_runner *ir)
226 {
227 enum bidi_property prop;
228 int_least8_t level, isolate_level, last_isolate_level;
229 size_t i;
230
231 if (ir->next.off == SIZE_MAX) {
232 /* the sequence is over */
233 return 1;
234 }
235
236 /* shift in */
237 ir->prev.off = ir->cur.off;
238 ir->cur.off = ir->next.off;
239
240 /* mark as visited */
241 set_state(STATE_VISITED, 1, &(ir->buf[ir->cur.off]));
242
243 /* initialize next state by going to the next character in the s…
244 */
245 ir->next.off = SIZE_MAX;
246
247 last_isolate_level = -1;
248 for (i = ir->cur.off, isolate_level = 0; i < ir->buflen; i++) {
249 level = (int_least8_t)get_state(STATE_LEVEL, ir->buf[i]);
250 prop = (uint_least8_t)get_state(STATE_PROP, ir->buf[i]);
251
252 if (level == -1) {
253 /* this is one of the ignored characters, skip */
254 continue;
255 } else if (level == ir->isolating_run_level) {
256 last_isolate_level = level;
257 }
258
259 /* follow BD8/BD9 and P2 to traverse the current sequenc…
260 if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
261 prop == BIDI_PROP_FSI) {
262 /*
263 * we encountered an isolate initiator, increment
264 * counter, but go into processing when we
265 * were not isolated before
266 */
267 if (isolate_level < MAX_DEPTH) {
268 isolate_level++;
269 }
270 if (isolate_level != 1) {
271 continue;
272 }
273 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
274 isolate_level--;
275
276 /*
277 * if the current PDI dropped the isolate-level
278 * to zero, it is itself part of the isolating
279 * run sequence; otherwise we simply continue.
280 */
281 if (isolate_level > 0) {
282 continue;
283 }
284 } else if (isolate_level > 0) {
285 /* we are in an isolating sequence */
286 continue;
287 }
288
289 /*
290 * now we either still are in our sequence or we hit
291 * the eos-case as we left the sequence and hit the
292 * first non-isolating-sequence character.
293 */
294 if (i == ir->cur.off) {
295 /* we were in the first initializing round */
296 continue;
297 } else if (level == ir->isolating_run_level) {
298 /* isolate_level-skips have been handled before,…
299 * good */
300 /* still in the sequence */
301 ir->next.off = i;
302 } else {
303 /* out of sequence or isolated, compare levels v…
304 */
305 ir->next.off = SIZE_MAX;
306 if (MAX(last_isolate_level, level) % 2 == 0) {
307 ir->eos = BIDI_PROP_L;
308 } else {
309 ir->eos = BIDI_PROP_R;
310 }
311 }
312 break;
313 }
314 if (i == ir->buflen) {
315 /*
316 * the sequence ended before we could grab an offset.
317 * we need to determine the eos-prop by comparing the
318 * level of the last element in the isolating run sequen…
319 * with the paragraph level.
320 */
321 ir->next.off = SIZE_MAX;
322 if (MAX(last_isolate_level, ir->paragraph_level) % 2 == …
323 /* the higher level is even, set eos to L */
324 ir->eos = BIDI_PROP_L;
325 } else {
326 /* the higher level is odd, set eos to R */
327 ir->eos = BIDI_PROP_R;
328 }
329 }
330
331 return 0;
332 }
333
334 static enum bidi_property
335 ir_get_last_strong_prop(const struct isolate_runner *ir)
336 {
337 struct isolate_runner tmp;
338 enum bidi_property last_strong_prop = ir->sos, prop;
339
340 ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, fal…
341 &tmp);
342 for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) {
343 prop = ir_get_current_prop(&tmp);
344
345 if (prop == BIDI_PROP_R || prop == BIDI_PROP_L ||
346 prop == BIDI_PROP_AL) {
347 last_strong_prop = prop;
348 }
349 }
350
351 return last_strong_prop;
352 }
353
354 static enum bidi_property
355 ir_get_last_strong_or_number_prop(const struct isolate_runner *ir)
356 {
357 struct isolate_runner tmp;
358 enum bidi_property last_strong_or_number_prop = ir->sos, prop;
359
360 ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, fal…
361 &tmp);
362 for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) {
363 prop = ir_get_current_prop(&tmp);
364
365 if (prop == BIDI_PROP_R || prop == BIDI_PROP_L ||
366 prop == BIDI_PROP_AL || prop == BIDI_PROP_EN ||
367 prop == BIDI_PROP_AN) {
368 last_strong_or_number_prop = prop;
369 }
370 }
371
372 return last_strong_or_number_prop;
373 }
374
375 static void
376 preprocess_bracket_pair(const struct isolate_runner *start,
377 const struct isolate_runner *end)
378 {
379 enum bidi_property prop, bracket_prop, last_strong_or_number_pro…
380 struct isolate_runner ir;
381 size_t strong_type_off;
382
383 /*
384 * check if the bracket contains a strong type (L or R|EN|AN)
385 */
386 for (ir = *start, strong_type_off = SIZE_MAX,
387 bracket_prop = NUM_BIDI_PROPS;
388 !ir_advance(&ir) && ir.cur.off < end->cur.off;) {
389 prop = ir_get_current_prop(&ir);
390
391 if (prop == BIDI_PROP_L) {
392 strong_type_off = ir.cur.off;
393 if (ir.isolating_run_level % 2 == 0) {
394 /*
395 * set the type for both brackets to L (…
396 * match the strong type they contain)
397 */
398 bracket_prop = BIDI_PROP_L;
399 }
400 } else if (prop == BIDI_PROP_R || prop == BIDI_PROP_EN ||
401 prop == BIDI_PROP_AN) {
402 strong_type_off = ir.cur.off;
403 if (ir.isolating_run_level % 2 != 0) {
404 /*
405 * set the type for both brackets to R (…
406 * match the strong type they contain)
407 */
408 bracket_prop = BIDI_PROP_R;
409 }
410 }
411 }
412 if (strong_type_off == SIZE_MAX) {
413 /*
414 * there are no strong types within the brackets and we …
415 * leave the brackets as is
416 */
417 return;
418 }
419
420 if (bracket_prop == NUM_BIDI_PROPS) {
421 /*
422 * We encountered a strong type, but it was opposite
423 * to the embedding direction.
424 * Check the previous strong type before the opening
425 * bracket
426 */
427 last_strong_or_number_prop =
428 ir_get_last_strong_or_number_prop(start);
429 if (last_strong_or_number_prop == BIDI_PROP_L &&
430 ir.isolating_run_level % 2 != 0) {
431 /*
432 * the previous strong type is also opposite
433 * to the embedding direction, so the context
434 * was established and we set the brackets
435 * accordingly.
436 */
437 bracket_prop = BIDI_PROP_L;
438 } else if ((last_strong_or_number_prop == BIDI_PROP_R ||
439 last_strong_or_number_prop == BIDI_PROP_EN ||
440 last_strong_or_number_prop == BIDI_PROP_AN) …
441 ir.isolating_run_level % 2 == 0) {
442 /*
443 * the previous strong type is also opposite
444 * to the embedding direction, so the context
445 * was established and we set the brackets
446 * accordingly.
447 */
448 bracket_prop = BIDI_PROP_R;
449 } else {
450 /* set brackets to the embedding direction */
451 if (ir.isolating_run_level % 2 == 0) {
452 bracket_prop = BIDI_PROP_L;
453 } else {
454 bracket_prop = BIDI_PROP_R;
455 }
456 }
457 }
458
459 ir_set_current_prop(start, bracket_prop);
460 ir_set_current_prop(end, bracket_prop);
461
462 /*
463 * any sequence of NSMs after opening or closing brackets get
464 * the same property as the one we set on the brackets
465 */
466 for (ir = *start; !ir_advance(&ir) && ir_get_current_preserved_p…
467 &ir) == BIDI_PROP_…
468 ir_set_current_prop(&ir, bracket_prop);
469 }
470 for (ir = *end; !ir_advance(&ir) &&
471 ir_get_current_preserved_prop(&ir) == BIDI_PROP_…
472 ir_set_current_prop(&ir, bracket_prop);
473 }
474 }
475
476 static void
477 preprocess_bracket_pairs(uint_least32_t *buf, size_t buflen, size_t off,
478 uint_least8_t paragraph_level)
479 {
480 /*
481 * The N0-rule deals with bracket pairs that shall be determined
482 * with the rule BD16. This is specified as an algorithm with a
483 * stack of 63 bracket openings that are used to resolve into a
484 * separate list of pairs, which is then to be sorted by opening
485 * position. Thus, even though the bracketing-depth is limited
486 * by 63, the algorithm, as is, requires dynamic memory
487 * management.
488 *
489 * A naive approach (used by Fribidi) would be to screw the
490 * stack-approach and simply directly determine the
491 * corresponding closing bracket offset for a given opening
492 * bracket, leading to O(n²) time complexity in the worst case
493 * with a lot of brackets. While many brackets are not common,
494 * it is still possible to find a middle ground where you obtain
495 * strongly linear time complexity in most common cases:
496 *
497 * Instead of a stack, we use a FIFO data structure which is
498 * filled with bracket openings in the order of appearance (thus
499 * yielding an implicit sorting!) at the top. If the
500 * corresponding closing bracket is encountered, it is added to
501 * the respective entry, making it ready to "move out" at the
502 * bottom (i.e. passed to the bracket processing). Due to the
503 * nature of the specified pair detection algorithm, which only
504 * cares about the bracket type and nothing else (bidi class,
505 * level, etc.), we can mix processing and bracket detection.
506 *
507 * Obviously, if you, for instance, have one big bracket pair at
508 * the bottom that has not been closed yet, it will block the
509 * bracket processing and the FIFO might hit its capacity limit.
510 * At this point, the blockage is manually resolved using the
511 * naive quadratic approach.
512 *
513 * To remain within the specified standard behaviour, which
514 * mandates that processing of brackets should stop when the
515 * bracketing-depth is at 63, we simply check in an "overflow"
516 * scenario if all 63 elements in the LIFO are unfinished, which
517 * corresponds with such a bracketing depth.
518 */
519 enum bidi_property prop;
520
521 struct {
522 bool complete;
523 size_t bracket_class;
524 struct isolate_runner start;
525 struct isolate_runner end;
526 } fifo[63];
527 const struct bracket *bracket_prop, *tmp_bracket_prop;
528 struct isolate_runner ir, tmp_ir;
529 size_t fifo_len = 0, i, blevel, j, k;
530
531 ir_init(buf, buflen, off, paragraph_level, false, &ir);
532 while (!ir_advance(&ir)) {
533 prop = ir_get_current_prop(&ir);
534 bracket_prop = ir_get_current_bracket_prop(&ir);
535 if (prop == BIDI_PROP_ON &&
536 bracket_prop->type == BIDI_BRACKET_OPEN) {
537 if (fifo_len == LEN(fifo)) {
538 /*
539 * The FIFO is full, check first if it's
540 * completely blocked (i.e. no finished
541 * bracket pairs, triggering the standard
542 * that mandates to abort in such a case
543 */
544 for (i = 0; i < fifo_len; i++) {
545 if (fifo[i].complete) {
546 break;
547 }
548 }
549 if (i == fifo_len) {
550 /* abort processing */
551 return;
552 }
553
554 /*
555 * by construction, the bottom entry
556 * in the FIFO is guaranteed to be
557 * unfinished (given we "consume" all
558 * finished bottom entries after each
559 * iteration).
560 *
561 * iterate, starting after the opening
562 * bracket, and find the corresponding
563 * closing bracket.
564 *
565 * if we find none, just drop the FIFO
566 * entry silently
567 */
568 for (tmp_ir = fifo[0].start, blevel = 0;
569 !ir_advance(&tmp_ir);) {
570 tmp_bracket_prop =
571 ir_get_current_bracket_p…
572 &tmp_ir);
573
574 if (tmp_bracket_prop->type ==
575 BIDI_BRACKET_OPEN &&
576 tmp_bracket_prop->class ==
577 bracket_prop->class)…
578 /* we encountered another
579 * opening bracket of the
580 * same class */
581 blevel++;
582
583 } else if (tmp_bracket_prop->typ…
584 BIDI_BRACKET_…
585 tmp_bracket_prop->cla…
586 bracket_prop
587 ->cla…
588 /* we encountered a clos…
589 * bracket of the same c…
590 */
591 if (blevel == 0) {
592 /* this is the
593 * corresponding
594 * closing brack…
595 */
596 fifo[0].complete…
597 fifo[0].end = ir;
598 } else {
599 blevel--;
600 }
601 }
602 }
603 if (fifo[0].complete) {
604 /* we found the matching bracket…
605 preprocess_bracket_pair(
606 &(fifo[i].start),
607 &(fifo[i].end));
608 }
609
610 /* shift FIFO one to the left */
611 for (i = 1; i < fifo_len; i++) {
612 fifo[i - 1] = fifo[i];
613 }
614 fifo_len--;
615 }
616
617 /* add element to the FIFO */
618 fifo_len++;
619 fifo[fifo_len - 1].complete = false;
620 fifo[fifo_len - 1].bracket_class = bracket_prop-…
621 fifo[fifo_len - 1].start = ir;
622 } else if (prop == BIDI_PROP_ON &&
623 bracket_prop->type == BIDI_BRACKET_CLOSE) {
624 /*
625 * go backwards in the FIFO, skip finished entri…
626 * and simply ignore (do nothing) the closing
627 * bracket if we do not match anything
628 */
629 for (i = fifo_len; i > 0; i--) {
630 if (bracket_prop->class ==
631 fifo[i - 1].bracket_class &&
632 !fifo[i - 1].complete) {
633 /* we have found a pair */
634 fifo[i - 1].complete = true;
635 fifo[i - 1].end = ir;
636
637 /* remove all uncompleted FIFO e…
638 * above i - 1 */
639 for (j = i; j < fifo_len;) {
640 if (fifo[j].complete) {
641 j++;
642 continue;
643 }
644
645 /* shift FIFO one to the…
646 for (k = j + 1; k < fifo…
647 k++) {
648 fifo[k - 1] = fi…
649 }
650 fifo_len--;
651 }
652 break;
653 }
654 }
655 }
656
657 /* process all ready bracket pairs from the FIFO bottom …
658 while (fifo_len > 0 && fifo[0].complete) {
659 preprocess_bracket_pair(&(fifo[0].start),
660 &(fifo[0].end));
661
662 /* shift FIFO one to the left */
663 for (j = 0; j + 1 < fifo_len; j++) {
664 fifo[j] = fifo[j + 1];
665 }
666 fifo_len--;
667 }
668 }
669
670 /*
671 * afterwards, we still might have unfinished bracket pairs
672 * that will remain as such, but the subsequent finished pairs
673 * also need to be taken into account, so we go through the
674 * FIFO once more and process all finished pairs
675 */
676 for (i = 0; i < fifo_len; i++) {
677 if (fifo[i].complete) {
678 preprocess_bracket_pair(&(fifo[i].start),
679 &(fifo[i].end));
680 }
681 }
682 }
683
684 static size_t
685 preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen,
686 size_t off, uint_least8_t paragraph_le…
687 {
688 enum bidi_property sequence_prop, prop;
689 struct isolate_runner ir, tmp;
690 size_t runsince, sequence_end;
691
692 /* W1 */
693 ir_init(buf, buflen, off, paragraph_level, false, &ir);
694 while (!ir_advance(&ir)) {
695 if (ir_get_current_prop(&ir) == BIDI_PROP_NSM) {
696 prop = ir_get_previous_prop(&ir);
697
698 if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_R…
699 prop == BIDI_PROP_FSI || prop == BIDI_PROP_P…
700 ir_set_current_prop(&ir, BIDI_PROP_ON);
701 } else {
702 ir_set_current_prop(&ir, prop);
703 }
704 }
705 }
706
707 /* W2 */
708 ir_init(buf, buflen, off, paragraph_level, false, &ir);
709 while (!ir_advance(&ir)) {
710 if (ir_get_current_prop(&ir) == BIDI_PROP_EN &&
711 ir_get_last_strong_prop(&ir) == BIDI_PROP_AL) {
712 ir_set_current_prop(&ir, BIDI_PROP_AN);
713 }
714 }
715
716 /* W3 */
717 ir_init(buf, buflen, off, paragraph_level, false, &ir);
718 while (!ir_advance(&ir)) {
719 if (ir_get_current_prop(&ir) == BIDI_PROP_AL) {
720 ir_set_current_prop(&ir, BIDI_PROP_R);
721 }
722 }
723
724 /* W4 */
725 ir_init(buf, buflen, off, paragraph_level, false, &ir);
726 while (!ir_advance(&ir)) {
727 if (ir_get_previous_prop(&ir) == BIDI_PROP_EN &&
728 (ir_get_current_prop(&ir) == BIDI_PROP_ES ||
729 ir_get_current_prop(&ir) == BIDI_PROP_CS) &&
730 ir_get_next_prop(&ir) == BIDI_PROP_EN) {
731 ir_set_current_prop(&ir, BIDI_PROP_EN);
732 }
733
734 if (ir_get_previous_prop(&ir) == BIDI_PROP_AN &&
735 ir_get_current_prop(&ir) == BIDI_PROP_CS &&
736 ir_get_next_prop(&ir) == BIDI_PROP_AN) {
737 ir_set_current_prop(&ir, BIDI_PROP_AN);
738 }
739 }
740
741 /* W5 */
742 runsince = SIZE_MAX;
743 ir_init(buf, buflen, off, paragraph_level, false, &ir);
744 while (!ir_advance(&ir)) {
745 if (ir_get_current_prop(&ir) == BIDI_PROP_ET) {
746 if (runsince == SIZE_MAX) {
747 /* a new run has begun */
748 runsince = ir.cur.off;
749 }
750 } else if (ir_get_current_prop(&ir) == BIDI_PROP_EN) {
751 /* set the preceding sequence */
752 if (runsince != SIZE_MAX) {
753 ir_init(buf, buflen, runsince, paragraph…
754 (runsince > off), &tmp);
755 while (!ir_advance(&tmp) &&
756 tmp.cur.off < ir.cur.off) {
757 ir_set_current_prop(&tmp, BIDI_P…
758 }
759 runsince = SIZE_MAX;
760 } else {
761 ir_init(buf, buflen, ir.cur.off,
762 paragraph_level, (ir.cur.off > o…
763 &tmp);
764 ir_advance(&tmp);
765 }
766 /* follow the succeeding sequence */
767 while (!ir_advance(&tmp)) {
768 if (ir_get_current_prop(&tmp) != BIDI_PR…
769 break;
770 }
771 ir_set_current_prop(&tmp, BIDI_PROP_EN);
772 }
773 } else {
774 /* sequence ended */
775 runsince = SIZE_MAX;
776 }
777 }
778
779 /* W6 */
780 ir_init(buf, buflen, off, paragraph_level, false, &ir);
781 while (!ir_advance(&ir)) {
782 prop = ir_get_current_prop(&ir);
783
784 if (prop == BIDI_PROP_ES || prop == BIDI_PROP_ET ||
785 prop == BIDI_PROP_CS) {
786 ir_set_current_prop(&ir, BIDI_PROP_ON);
787 }
788 }
789
790 /* W7 */
791 ir_init(buf, buflen, off, paragraph_level, false, &ir);
792 while (!ir_advance(&ir)) {
793 if (ir_get_current_prop(&ir) == BIDI_PROP_EN &&
794 ir_get_last_strong_prop(&ir) == BIDI_PROP_L) {
795 ir_set_current_prop(&ir, BIDI_PROP_L);
796 }
797 }
798
799 /* N0 */
800 preprocess_bracket_pairs(buf, buflen, off, paragraph_level);
801
802 /* N1 */
803 sequence_end = SIZE_MAX;
804 sequence_prop = NUM_BIDI_PROPS;
805 ir_init(buf, buflen, off, paragraph_level, false, &ir);
806 while (!ir_advance(&ir)) {
807 if (sequence_end == SIZE_MAX) {
808 prop = ir_get_current_prop(&ir);
809
810 if (prop == BIDI_PROP_B || prop == BIDI_PROP_S ||
811 prop == BIDI_PROP_WS || prop == BIDI_PROP_ON…
812 prop == BIDI_PROP_FSI || prop == BIDI_PROP_L…
813 prop == BIDI_PROP_RLI || prop == BIDI_PROP_P…
814 /* the current character is an NI (neutr…
815 * or isolate) */
816
817 /* scan ahead to the end of the NI-seque…
818 */
819 ir_init(buf, buflen, ir.cur.off,
820 paragraph_level, (ir.cur.off > o…
821 &tmp);
822 while (!ir_advance(&tmp)) {
823 prop = ir_get_next_prop(&tmp);
824
825 if (prop != BIDI_PROP_B &&
826 prop != BIDI_PROP_S &&
827 prop != BIDI_PROP_WS &&
828 prop != BIDI_PROP_ON &&
829 prop != BIDI_PROP_FSI &&
830 prop != BIDI_PROP_LRI &&
831 prop != BIDI_PROP_RLI &&
832 prop != BIDI_PROP_PDI) {
833 break;
834 }
835 }
836
837 /*
838 * check what follows and see if the text
839 * has the same direction on both sides
840 */
841 if (ir_get_previous_prop(&ir) == BIDI_PR…
842 ir_get_next_prop(&tmp) == BIDI_PROP_…
843 sequence_end = tmp.cur.off;
844 sequence_prop = BIDI_PROP_L;
845 } else if ((ir_get_previous_prop(&ir) ==
846 BIDI_PROP_R ||
847 ir_get_previous_prop(&ir) ==
848 BIDI_PROP_EN ||
849 ir_get_previous_prop(&ir) ==
850 BIDI_PROP_AN) &&
851 (ir_get_next_prop(&tmp) ==
852 BIDI_PROP_R ||
853 ir_get_next_prop(&tmp) ==
854 BIDI_PROP_EN ||
855 ir_get_next_prop(&tmp) ==
856 BIDI_PROP_AN)) {
857 sequence_end = tmp.cur.off;
858 sequence_prop = BIDI_PROP_R;
859 }
860 }
861 }
862
863 if (sequence_end != SIZE_MAX) {
864 if (ir.cur.off <= sequence_end) {
865 ir_set_current_prop(&ir, sequence_prop);
866 } else {
867 /* end of sequence, reset */
868 sequence_end = SIZE_MAX;
869 sequence_prop = NUM_BIDI_PROPS;
870 }
871 }
872 }
873
874 /* N2 */
875 ir_init(buf, buflen, off, paragraph_level, false, &ir);
876 while (!ir_advance(&ir)) {
877 prop = ir_get_current_prop(&ir);
878
879 if (prop == BIDI_PROP_B || prop == BIDI_PROP_S ||
880 prop == BIDI_PROP_WS || prop == BIDI_PROP_ON ||
881 prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI ||
882 prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) {
883 /* N2 */
884 if (ir_get_current_level(&ir) % 2 == 0) {
885 /* even embedding level */
886 ir_set_current_prop(&ir, BIDI_PROP_L);
887 } else {
888 /* odd embedding level */
889 ir_set_current_prop(&ir, BIDI_PROP_R);
890 }
891 }
892 }
893
894 return 0;
895 }
896
897 static uint_least8_t
898 get_isolated_paragraph_level(const uint_least32_t *state, size_t statele…
899 {
900 enum bidi_property prop;
901 int_least8_t isolate_level;
902 size_t stateoff;
903
904 /* determine paragraph level (rules P1-P3) and terminate on PDI …
905 for (stateoff = 0, isolate_level = 0; stateoff < statelen; state…
906 prop = (uint_least8_t)get_state(STATE_PROP, state[stateo…
907
908 if (prop == BIDI_PROP_PDI && isolate_level == 0) {
909 /*
910 * we are in a FSI-subsection of a paragraph and
911 * matched with the terminating PDI
912 */
913 break;
914 }
915
916 /* BD8/BD9 */
917 if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
918 prop == BIDI_PROP_FSI) &&
919 isolate_level < MAX_DEPTH) {
920 /* we hit an isolate initiator, increment counte…
921 isolate_level++;
922 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
923 isolate_level--;
924 }
925
926 /* P2 */
927 if (isolate_level > 0) {
928 continue;
929 }
930
931 /* P3 */
932 if (prop == BIDI_PROP_L) {
933 return 0;
934 } else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) {
935 return 1;
936 }
937 }
938
939 return 0;
940 }
941
942 static inline uint_least8_t
943 get_bidi_property(uint_least32_t cp)
944 {
945 if (likely(cp <= 0x10FFFF)) {
946 return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) &
947 0x1F /* 00011111 */;
948 } else {
949 return BIDI_PROP_L;
950 }
951 }
952
953 static uint_least8_t
954 get_paragraph_level(enum grapheme_bidirectional_direction override,
955 const HERODOTUS_READER *r)
956 {
957 HERODOTUS_READER tmp;
958 enum bidi_property prop;
959 int_least8_t isolate_level;
960 uint_least32_t cp;
961
962 /* check overrides first according to rule HL1 */
963 if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
964 return 0;
965 } else if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
966 return 1;
967 }
968
969 /* copy reader into temporary reader */
970 herodotus_reader_copy(r, &tmp);
971
972 /* determine paragraph level (rules P1-P3) */
973 for (isolate_level = 0; herodotus_read_codepoint(&tmp, true, &cp…
974 HERODOTUS_STATUS_SUCCESS;) {
975 prop = get_bidi_property(cp);
976
977 /* BD8/BD9 */
978 if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
979 prop == BIDI_PROP_FSI) &&
980 isolate_level < MAX_DEPTH) {
981 /* we hit an isolate initiator, increment counte…
982 isolate_level++;
983 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
984 isolate_level--;
985 }
986
987 /* P2 */
988 if (isolate_level > 0) {
989 continue;
990 }
991
992 /* P3 */
993 if (prop == BIDI_PROP_L) {
994 return 0;
995 } else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) {
996 return 1;
997 }
998 }
999
1000 return 0;
1001 }
1002
1003 static void
1004 preprocess_paragraph(uint_least8_t paragraph_level, uint_least32_t *buf,
1005 size_t buflen)
1006 {
1007 enum bidi_property prop;
1008 int_least8_t level;
1009
1010 struct {
1011 int_least8_t level;
1012 enum grapheme_bidirectional_direction override;
1013 bool directional_isolate;
1014 } directional_status[MAX_DEPTH + 2], *dirstat = directional_stat…
1015
1016 size_t overflow_isolate_count, overflow_embedding_count,
1017 valid_isolate_count, bufoff, i, runsince;
1018
1019 /* X1 */
1020 dirstat->level = (int_least8_t)paragraph_level;
1021 dirstat->override = GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
1022 dirstat->directional_isolate = false;
1023 overflow_isolate_count = overflow_embedding_count =
1024 valid_isolate_count = 0;
1025
1026 for (bufoff = 0; bufoff < buflen; bufoff++) {
1027 prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
1028
1029 /* set paragraph level we need for line-level-processing…
1030 set_state(STATE_PARAGRAPH_LEVEL, paragraph_level,
1031 &(buf[bufoff]));
1032 again:
1033 if (prop == BIDI_PROP_RLE) {
1034 /* X2 */
1035 if (dirstat->level + (dirstat->level % 2 != 0) +…
1036 MAX_DEPTH &&
1037 overflow_isolate_count == 0 &&
1038 overflow_embedding_count == 0) {
1039 /* valid RLE */
1040 dirstat++;
1041 dirstat->level =
1042 (dirstat - 1)->level +
1043 ((dirstat - 1)->level % 2 != 0) …
1044 dirstat->override =
1045 GRAPHEME_BIDIRECTIONAL_DIRECTION…
1046 dirstat->directional_isolate = false;
1047 } else {
1048 /* overflow RLE */
1049 overflow_embedding_count +=
1050 (overflow_isolate_count == 0);
1051 }
1052 } else if (prop == BIDI_PROP_LRE) {
1053 /* X3 */
1054 if (dirstat->level + (dirstat->level % 2 == 0) +…
1055 MAX_DEPTH &&
1056 overflow_isolate_count == 0 &&
1057 overflow_embedding_count == 0) {
1058 /* valid LRE */
1059 dirstat++;
1060 dirstat->level =
1061 (dirstat - 1)->level +
1062 ((dirstat - 1)->level % 2 == 0) …
1063 dirstat->override =
1064 GRAPHEME_BIDIRECTIONAL_DIRECTION…
1065 dirstat->directional_isolate = false;
1066 } else {
1067 /* overflow LRE */
1068 overflow_embedding_count +=
1069 (overflow_isolate_count == 0);
1070 }
1071 } else if (prop == BIDI_PROP_RLO) {
1072 /* X4 */
1073 if (dirstat->level + (dirstat->level % 2 != 0) +…
1074 MAX_DEPTH &&
1075 overflow_isolate_count == 0 &&
1076 overflow_embedding_count == 0) {
1077 /* valid RLO */
1078 dirstat++;
1079 dirstat->level =
1080 (dirstat - 1)->level +
1081 ((dirstat - 1)->level % 2 != 0) …
1082 dirstat->override =
1083 GRAPHEME_BIDIRECTIONAL_DIRECTION…
1084 dirstat->directional_isolate = false;
1085 } else {
1086 /* overflow RLO */
1087 overflow_embedding_count +=
1088 (overflow_isolate_count == 0);
1089 }
1090 } else if (prop == BIDI_PROP_LRO) {
1091 /* X5 */
1092 if (dirstat->level + (dirstat->level % 2 == 0) +…
1093 MAX_DEPTH &&
1094 overflow_isolate_count == 0 &&
1095 overflow_embedding_count == 0) {
1096 /* valid LRE */
1097 dirstat++;
1098 dirstat->level =
1099 (dirstat - 1)->level +
1100 ((dirstat - 1)->level % 2 == 0) …
1101 dirstat->override =
1102 GRAPHEME_BIDIRECTIONAL_DIRECTION…
1103 dirstat->directional_isolate = false;
1104 } else {
1105 /* overflow LRO */
1106 overflow_embedding_count +=
1107 (overflow_isolate_count == 0);
1108 }
1109 } else if (prop == BIDI_PROP_RLI) {
1110 /* X5a */
1111 set_state(STATE_LEVEL, dirstat->level, &(buf[buf…
1112 if (dirstat->override ==
1113 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
1114 set_state(STATE_PROP, BIDI_PROP_L,
1115 &(buf[bufoff]));
1116 } else if (dirstat->override ==
1117 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL)…
1118 set_state(STATE_PROP, BIDI_PROP_R,
1119 &(buf[bufoff]));
1120 }
1121
1122 if (dirstat->level + (dirstat->level % 2 != 0) +…
1123 MAX_DEPTH &&
1124 overflow_isolate_count == 0 &&
1125 overflow_embedding_count == 0) {
1126 /* valid RLI */
1127 valid_isolate_count++;
1128
1129 dirstat++;
1130 dirstat->level =
1131 (dirstat - 1)->level +
1132 ((dirstat - 1)->level % 2 != 0) …
1133 dirstat->override =
1134 GRAPHEME_BIDIRECTIONAL_DIRECTION…
1135 dirstat->directional_isolate = true;
1136 } else {
1137 /* overflow RLI */
1138 overflow_isolate_count++;
1139 }
1140 } else if (prop == BIDI_PROP_LRI) {
1141 /* X5b */
1142 set_state(STATE_LEVEL, dirstat->level, &(buf[buf…
1143 if (dirstat->override ==
1144 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
1145 set_state(STATE_PROP, BIDI_PROP_L,
1146 &(buf[bufoff]));
1147 } else if (dirstat->override ==
1148 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL)…
1149 set_state(STATE_PROP, BIDI_PROP_R,
1150 &(buf[bufoff]));
1151 }
1152
1153 if (dirstat->level + (dirstat->level % 2 == 0) +…
1154 MAX_DEPTH &&
1155 overflow_isolate_count == 0 &&
1156 overflow_embedding_count == 0) {
1157 /* valid LRI */
1158 valid_isolate_count++;
1159
1160 dirstat++;
1161 dirstat->level =
1162 (dirstat - 1)->level +
1163 ((dirstat - 1)->level % 2 == 0) …
1164 dirstat->override =
1165 GRAPHEME_BIDIRECTIONAL_DIRECTION…
1166 dirstat->directional_isolate = true;
1167 } else {
1168 /* overflow LRI */
1169 overflow_isolate_count++;
1170 }
1171 } else if (prop == BIDI_PROP_FSI) {
1172 /* X5c */
1173 if (get_isolated_paragraph_level(
1174 buf + (bufoff + 1),
1175 buflen - (bufoff + 1)) == 1) {
1176 prop = BIDI_PROP_RLI;
1177 goto again;
1178 } else { /* ... == 0 */
1179 prop = BIDI_PROP_LRI;
1180 goto again;
1181 }
1182 } else if (prop != BIDI_PROP_B && prop != BIDI_PROP_BN &&
1183 prop != BIDI_PROP_PDF && prop != BIDI_PROP_PD…
1184 /* X6 */
1185 set_state(STATE_LEVEL, dirstat->level, &(buf[buf…
1186 if (dirstat->override ==
1187 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
1188 set_state(STATE_PROP, BIDI_PROP_L,
1189 &(buf[bufoff]));
1190 } else if (dirstat->override ==
1191 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL)…
1192 set_state(STATE_PROP, BIDI_PROP_R,
1193 &(buf[bufoff]));
1194 }
1195 } else if (prop == BIDI_PROP_PDI) {
1196 /* X6a */
1197 if (overflow_isolate_count > 0) {
1198 /* PDI matches an overflow isolate initi…
1199 */
1200 overflow_isolate_count--;
1201 } else if (valid_isolate_count > 0) {
1202 /* PDI matches a normal isolate initiato…
1203 overflow_embedding_count = 0;
1204 while (dirstat->directional_isolate == f…
1205 dirstat > directional_status) {
1206 /*
1207 * we are safe here as given the
1208 * valid isolate count is positi…
1209 * there must be a stack-entry
1210 * with positive directional
1211 * isolate status, but we take
1212 * no chances and include an
1213 * explicit check
1214 *
1215 * POSSIBLE OPTIMIZATION: Whenev…
1216 * we push on the stack, check i…
1217 * has the directional isolate
1218 * status true and store a point…
1219 * to it so we can jump to it ve…
1220 * quickly.
1221 */
1222 dirstat--;
1223 }
1224
1225 /*
1226 * as above, the following check is not
1227 * necessary, given we are guaranteed to
1228 * have at least one stack entry left,
1229 * but it's better to be safe
1230 */
1231 if (dirstat > directional_status) {
1232 dirstat--;
1233 }
1234 valid_isolate_count--;
1235 }
1236
1237 set_state(STATE_LEVEL, dirstat->level, &(buf[buf…
1238 if (dirstat->override ==
1239 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
1240 set_state(STATE_PROP, BIDI_PROP_L,
1241 &(buf[bufoff]));
1242 } else if (dirstat->override ==
1243 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL)…
1244 set_state(STATE_PROP, BIDI_PROP_R,
1245 &(buf[bufoff]));
1246 }
1247 } else if (prop == BIDI_PROP_PDF) {
1248 /* X7 */
1249 if (overflow_isolate_count > 0) {
1250 /* do nothing */
1251 } else if (overflow_embedding_count > 0) {
1252 overflow_embedding_count--;
1253 } else if (dirstat->directional_isolate == false…
1254 dirstat > directional_status) {
1255 dirstat--;
1256 }
1257 } else if (prop == BIDI_PROP_B) {
1258 /* X8 */
1259 set_state(STATE_LEVEL, paragraph_level, &(buf[bu…
1260 }
1261
1262 /* X9 */
1263 if (prop == BIDI_PROP_RLE || prop == BIDI_PROP_LRE ||
1264 prop == BIDI_PROP_RLO || prop == BIDI_PROP_LRO ||
1265 prop == BIDI_PROP_PDF || prop == BIDI_PROP_BN) {
1266 set_state(STATE_LEVEL, -1, &(buf[bufoff]));
1267 }
1268 }
1269
1270 /* X10 (W1-W7, N0-N2) */
1271 for (bufoff = 0; bufoff < buflen; bufoff++) {
1272 if (get_state(STATE_VISITED, buf[bufoff]) == 0 &&
1273 get_state(STATE_LEVEL, buf[bufoff]) != -1) {
1274 bufoff += preprocess_isolating_run_sequence(
1275 buf, buflen, bufoff, paragraph_level);
1276 }
1277 }
1278
1279 /*
1280 * I1-I2 (given our sequential approach to processing the
1281 * isolating run sequences, we apply this rule separately)
1282 */
1283 for (bufoff = 0; bufoff < buflen; bufoff++) {
1284 level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]…
1285 prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
1286
1287 if (level % 2 == 0) {
1288 /* even level */
1289 if (prop == BIDI_PROP_R) {
1290 set_state(STATE_LEVEL, level + 1,
1291 &(buf[bufoff]));
1292 } else if (prop == BIDI_PROP_AN ||
1293 prop == BIDI_PROP_EN) {
1294 set_state(STATE_LEVEL, level + 2,
1295 &(buf[bufoff]));
1296 }
1297 } else {
1298 /* odd level */
1299 if (prop == BIDI_PROP_L || prop == BIDI_PROP_EN …
1300 prop == BIDI_PROP_AN) {
1301 set_state(STATE_LEVEL, level + 1,
1302 &(buf[bufoff]));
1303 }
1304 }
1305 }
1306
1307 /* L1 (rules 1-3) */
1308 runsince = SIZE_MAX;
1309 for (bufoff = 0; bufoff < buflen; bufoff++) {
1310 level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]…
1311 prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP,
1312 buf[bufoff]);
1313
1314 if (level == -1) {
1315 /* ignored character */
1316 continue;
1317 }
1318
1319 /* rules 1 and 2 */
1320 if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) {
1321 set_state(STATE_LEVEL, paragraph_level, &(buf[bu…
1322 }
1323
1324 /* rule 3 */
1325 if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI ||
1326 prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
1327 prop == BIDI_PROP_PDI) {
1328 if (runsince == SIZE_MAX) {
1329 /* a new run has begun */
1330 runsince = bufoff;
1331 }
1332 } else if ((prop == BIDI_PROP_S || prop == BIDI_PROP_B) …
1333 runsince != SIZE_MAX) {
1334 /*
1335 * we hit a segment or paragraph separator in a
1336 * sequence, reset sequence-levels
1337 */
1338 for (i = runsince; i < bufoff; i++) {
1339 if (get_state(STATE_LEVEL, buf[i]) != -1…
1340 set_state(STATE_LEVEL, paragraph…
1341 &(buf[i]));
1342 }
1343 }
1344 runsince = SIZE_MAX;
1345 } else {
1346 /* sequence ended */
1347 runsince = SIZE_MAX;
1348 }
1349 }
1350 if (runsince != SIZE_MAX) {
1351 /*
1352 * this is the end of the paragraph and we
1353 * are in a run
1354 */
1355 for (i = runsince; i < buflen; i++) {
1356 if (get_state(STATE_LEVEL, buf[i]) != -1) {
1357 set_state(STATE_LEVEL, paragraph_level,
1358 &(buf[i]));
1359 }
1360 }
1361 runsince = SIZE_MAX;
1362 }
1363 }
1364
1365 static inline uint_least8_t
1366 get_bidi_bracket_off(uint_least32_t cp)
1367 {
1368 if (likely(cp <= 0x10FFFF)) {
1369 return (uint_least8_t)((bidi_minor[bidi_major[cp >> 8] +
1370 (cp & 0xff)]) >>
1371 5);
1372 } else {
1373 return 0;
1374 }
1375 }
1376
1377 static size_t
1378 preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_direction ov…
1379 uint_least32_t *buf, size_t buflen,
1380 enum grapheme_bidirectional_direction *resolved)
1381 {
1382 HERODOTUS_READER tmp;
1383 size_t bufoff, bufsize, paragraph_len;
1384 uint_least32_t cp;
1385 uint_least8_t paragraph_level;
1386
1387 /* determine length and level of the paragraph */
1388 herodotus_reader_copy(r, &tmp);
1389 for (; herodotus_read_codepoint(&tmp, true, &cp) ==
1390 HERODOTUS_STATUS_SUCCESS;) {
1391 /* break on paragraph separator */
1392 if (get_bidi_property(cp) == BIDI_PROP_B) {
1393 break;
1394 }
1395 }
1396 paragraph_len = herodotus_reader_number_read(&tmp);
1397 paragraph_level = get_paragraph_level(override, r);
1398
1399 if (resolved != NULL) {
1400 /* store resolved paragraph level in output variable */
1401 /* TODO use enum-type */
1402 *resolved = (paragraph_level == 0) ?
1403 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR…
1404 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
1405 }
1406
1407 if (buf == NULL) {
1408 /* see below for return value reasoning */
1409 return paragraph_len;
1410 }
1411
1412 /*
1413 * the first step is to determine the bidirectional properties
1414 * and store them in the buffer
1415 */
1416 for (bufoff = 0;
1417 bufoff < paragraph_len &&
1418 herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_…
1419 bufoff++) {
1420 if (bufoff < buflen) {
1421 /*
1422 * actually only do something when we have
1423 * space in the level-buffer. We continue
1424 * the iteration to be able to give a good
1425 * return value
1426 */
1427 set_state(STATE_PROP,
1428 (uint_least8_t)get_bidi_property(cp),
1429 &(buf[bufoff]));
1430 set_state(STATE_BRACKET_OFF, get_bidi_bracket_of…
1431 &(buf[bufoff]));
1432 set_state(STATE_LEVEL, 0, &(buf[bufoff]));
1433 set_state(STATE_PARAGRAPH_LEVEL, 0, &(buf[bufoff…
1434 set_state(STATE_VISITED, 0, &(buf[bufoff]));
1435 set_state(STATE_PRESERVED_PROP,
1436 (uint_least8_t)get_bidi_property(cp),
1437 &(buf[bufoff]));
1438 }
1439 }
1440 bufsize = herodotus_reader_number_read(r);
1441
1442 for (bufoff = 0; bufoff < bufsize; bufoff++) {
1443 if (get_state(STATE_PROP, buf[bufoff]) != BIDI_PROP_B &&
1444 bufoff != bufsize - 1) {
1445 continue;
1446 }
1447
1448 /*
1449 * we either encountered a paragraph terminator or this
1450 * is the last character in the string.
1451 * Call the paragraph handler on the paragraph, including
1452 * the terminating character or last character of the
1453 * string respectively
1454 */
1455 preprocess_paragraph(paragraph_level, buf, bufoff + 1);
1456 break;
1457 }
1458
1459 /*
1460 * we return the number of total bytes read, as the function
1461 * should indicate if the given level-buffer is too small
1462 */
1463 return herodotus_reader_number_read(r);
1464 }
1465
1466 size_t
1467 grapheme_bidirectional_preprocess_paragraph(
1468 const uint_least32_t *src, size_t srclen,
1469 enum grapheme_bidirectional_direction override, uint_least32_t *…
1470 size_t destlen, enum grapheme_bidirectional_direction *resolved)
1471 {
1472 HERODOTUS_READER r;
1473
1474 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
1475
1476 return preprocess(&r, override, dest, destlen, resolved);
1477 }
1478
1479 static inline size_t
1480 get_line_embedding_levels(const uint_least32_t *linedata, size_t linelen,
1481 int_least8_t (*get_level)(const void *, size_t…
1482 void (*set_level)(void *, size_t, int_least8_t…
1483 void *lev, size_t levsize, bool skipignored)
1484 {
1485 enum bidi_property prop;
1486 size_t i, levlen, runsince;
1487 int_least8_t level, runlevel;
1488
1489 /* rule L1.4 */
1490 runsince = SIZE_MAX;
1491 runlevel = -1;
1492 for (i = 0, levlen = 0; i < linelen; i++) {
1493 level = (int_least8_t)get_state(STATE_LEVEL, linedata[i]…
1494 prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP,
1495 linedata[i]);
1496
1497 /* write level into level array if we still have space */
1498 if (level != -1 || skipignored == false) {
1499 if (levlen <= levsize) {
1500 set_level(lev, levlen, level);
1501 }
1502 levlen++;
1503 }
1504
1505 if (level == -1) {
1506 /* ignored character */
1507 continue;
1508 }
1509
1510 if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI ||
1511 prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
1512 prop == BIDI_PROP_PDI) {
1513 if (runsince == SIZE_MAX) {
1514 /* a new run has begun */
1515 runsince = levlen - 1; /* levlen > 0 */
1516 runlevel = (int_least8_t)get_state(
1517 STATE_PARAGRAPH_LEVEL, linedata[…
1518 }
1519 } else {
1520 /* sequence ended */
1521 runsince = SIZE_MAX;
1522 runlevel = -1;
1523 }
1524 }
1525 if (runsince != SIZE_MAX) {
1526 /*
1527 * we hit the end of the line but were in a run;
1528 * reset the line levels to the paragraph level
1529 */
1530 for (i = runsince; i < MIN(linelen, levlen); i++) {
1531 if (get_level(lev, i) != -1) {
1532 set_level(lev, i, runlevel);
1533 }
1534 }
1535 }
1536
1537 return levlen;
1538 }
1539
1540 static inline int_least8_t
1541 get_level_int8(const void *lev, size_t off)
1542 {
1543 return ((const int_least8_t *)lev)[off];
1544 }
1545
1546 static inline void
1547 set_level_int8(void *lev, size_t off, int_least8_t value)
1548 {
1549 ((int_least8_t *)lev)[off] = value;
1550 }
1551
1552 size_t
1553 grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *l…
1554 size_t linelen,
1555 int_least8_t *lev,
1556 size_t levlen)
1557 {
1558 return get_line_embedding_levels(linedata, linelen, get_level_in…
1559 set_level_int8, lev, levlen, fa…
1560 }
1561
1562 static inline int_least8_t
1563 get_level_uint32(const void *lev, size_t off)
1564 {
1565 return (int_least8_t)((((const uint_least32_t *)lev)[off] &
1566 UINT32_C(0x1FE00000)) >>
1567 21) -
1568 1;
1569 }
1570
1571 static inline void
1572 set_level_uint32(void *lev, size_t off, int_least8_t value)
1573 {
1574 ((uint_least32_t *)lev)[off] ^=
1575 ((uint_least32_t *)lev)[off] & UINT32_C(0x1FE00000);
1576 ((uint_least32_t *)lev)[off] |= ((uint_least32_t)(value + 1)) <<…
1577 }
1578
1579 static inline int_least16_t
1580 get_mirror_offset(uint_least32_t cp)
1581 {
1582 if (cp <= UINT32_C(0x10FFFF)) {
1583 return mirror_minor[mirror_major[cp >> 8] + (cp & 0xFF)];
1584 } else {
1585 return 0;
1586 }
1587 }
1588
1589 size_t
1590 grapheme_bidirectional_reorder_line(const uint_least32_t *line,
1591 const uint_least32_t *linedata,
1592 size_t linelen, uint_least32_t *outp…
1593 size_t outputsize)
1594 {
1595 size_t i, outputlen, first, last, j, k, l /*, laststart*/;
1596 int_least8_t level, min_odd_level = MAX_DEPTH + 2, max_level = 0;
1597 uint_least32_t tmp;
1598
1599 /* write output characters (and apply possible mirroring) */
1600 for (i = 0, outputlen = 0; i < linelen; i++) {
1601 if (get_state(STATE_LEVEL, linedata[i]) != -1) {
1602 if (outputlen < outputsize) {
1603 output[outputlen] =
1604 (uint_least32_t)((int_least32_t)
1605 line[i]…
1606 get_mirror_offs…
1607 line[i]…
1608 }
1609 outputlen++;
1610 }
1611 }
1612 if (outputlen >= outputsize) {
1613 /* clear output buffer */
1614 for (i = 0; i < outputsize; i++) {
1615 output[i] = GRAPHEME_INVALID_CODEPOINT;
1616 }
1617
1618 /* return required size */
1619 return outputlen;
1620 }
1621
1622 /*
1623 * write line embedding levels as metadata and codepoints into t…
1624 * output
1625 */
1626 get_line_embedding_levels(linedata, linelen, get_level_uint32,
1627 set_level_uint32, output, outputsize, …
1628
1629 /* determine level range */
1630 for (i = 0; i < outputlen; i++) {
1631 level = get_level_uint32(output, i);
1632
1633 if (level == -1) {
1634 /* ignored character */
1635 continue;
1636 }
1637
1638 if (level % 2 == 1 && level < min_odd_level) {
1639 min_odd_level = level;
1640 }
1641 if (level > max_level) {
1642 max_level = level;
1643 }
1644 }
1645
1646 for (level = max_level; level >= min_odd_level /* > 0 */; level-…
1647 for (i = 0; i < outputlen; i++) {
1648 if (get_level_uint32(output, i) >= level) {
1649 /*
1650 * the current character has the desired…
1651 */
1652 first = last = i;
1653
1654 /* find the end of the level-sequence */
1655 for (i++; i < outputlen; i++) {
1656 if (get_level_uint32(output, i) …
1657 level) {
1658 /* the sequence continue…
1659 last = i;
1660 } else {
1661 break;
1662 }
1663 }
1664
1665 /* invert the sequence first..last respe…
1666 * grapheme clusters
1667 *
1668 * The standard only speaks of combining…
1669 * inversion, but we should in the perfe…
1670 * respect _all_ grapheme clusters, whic…
1671 * here!
1672 */
1673
1674 /* mark grapheme cluster breaks */
1675 for (j = first; j <= last;
1676 j += grapheme_next_character_break(
1677 line + j, outputlen - j)) {
1678 /*
1679 * we use a special trick here: …
1680 * first 21 bits of the state ar…
1681 * with the codepoint, the next …
1682 * are used for the level, so we…
1683 * the 30th bit to mark the grap…
1684 * cluster breaks. This allows u…
1685 * reinvert the grapheme cluster…
1686 * the proper direction later.
1687 */
1688 output[j] |= UINT32_C(1) << 29;
1689 }
1690
1691 /* global inversion */
1692 for (k = first, l = last; k < l; k++, l-…
1693 /* swap */
1694 tmp = output[k];
1695 output[k] = output[l];
1696 output[l] = tmp;
1697 }
1698
1699 /* grapheme cluster reinversion */
1700 #if 0
1701 for (j = first, laststart = first; j <= …
1702 j++) {
1703 if (output[j] & (UINT32_C(1) << …
1704 /* we hit a mark! given …
1705 * grapheme cluster is i…
1706 * this means that the c…
1707 * ended and we now rein…
1708 * again
1709 */
1710 for (k = laststart, l = …
1711 k < l; k++, l--) {
1712 /* swap */
1713 tmp = output[k];
1714 output[k] = outp…
1715 output[l] = tmp;
1716 }
1717 laststart = j + 1;
1718 }
1719 }
1720 #endif
1721
1722 /* unmark grapheme cluster breaks */
1723 for (j = first; j <= last; j++) {
1724 output[j] ^= output[j] &
1725 UINT32_C(0x20000000…
1726 }
1727 }
1728 }
1729 }
1730
1731 /* remove embedding level metadata */
1732 for (i = 0; i < outputlen; i++) {
1733 output[i] ^= output[i] & UINT32_C(0x1FE00000);
1734 }
1735
1736 return outputlen;
1737 }
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.