bidirectional.c - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
bidirectional.c (48331B) | |
--- | |
1 /* See LICENSE file for copyright and license details. */ | |
2 #include <stdbool.h> | |
3 #include <stddef.h> | |
4 | |
5 #include "../gen/bidirectional.h" | |
6 #include "../grapheme.h" | |
7 #include "util.h" | |
8 | |
9 #define MAX_DEPTH 125 | |
10 | |
11 enum state_type { | |
12 STATE_PROP, /* in 0..23, bidi_property */ | |
13 STATE_PRESERVED_PROP, /* in 0..23, preserved bidi_prop for L1-r… | |
14 STATE_BRACKET_OFF, /* in 0..255, offset in bidi_bracket */ | |
15 STATE_LEVEL, /* in 0..MAX_DEPTH+1=126, embedding level… | |
16 STATE_PARAGRAPH_LEVEL, /* in 0..1, paragraph embedding level */ | |
17 STATE_VISITED, /* in 0..1, visited within isolating run … | |
18 }; | |
19 | |
20 static struct { | |
21 uint_least32_t filter_mask; | |
22 size_t mask_shift; | |
23 int_least16_t value_offset; | |
24 } state_lut[] = { | |
25 [STATE_PROP] = { | |
26 .filter_mask = 0x000001F, /* 00000000 00000000 00000000… | |
27 .mask_shift = 0, | |
28 .value_offset = 0, | |
29 }, | |
30 [STATE_PRESERVED_PROP] = { | |
31 .filter_mask = 0x00003E0, /* 00000000 00000000 00000011… | |
32 .mask_shift = 5, | |
33 .value_offset = 0, | |
34 }, | |
35 [STATE_BRACKET_OFF] = { | |
36 .filter_mask = 0x003FC00, /* 00000000 00000011 11111100… | |
37 .mask_shift = 10, | |
38 .value_offset = 0, | |
39 }, | |
40 [STATE_LEVEL] = { | |
41 .filter_mask = 0x1FC0000, /* 00000001 11111100 00000000… | |
42 .mask_shift = 18, | |
43 .value_offset = -1, | |
44 }, | |
45 [STATE_PARAGRAPH_LEVEL] = { | |
46 .filter_mask = 0x2000000, /* 00000010 00000000 00000000… | |
47 .mask_shift = 25, | |
48 .value_offset = 0, | |
49 }, | |
50 [STATE_VISITED] = { | |
51 .filter_mask = 0x4000000, /* 00000100 00000000 00000000… | |
52 .mask_shift = 26, | |
53 .value_offset = 0, | |
54 }, | |
55 }; | |
56 | |
57 static inline int_least16_t | |
58 get_state(enum state_type t, uint_least32_t input) | |
59 { | |
60 return (int_least16_t)((input & state_lut[t].filter_mask) >> | |
61 state_lut[t].mask_shift) + | |
62 state_lut[t].value_offset; | |
63 } | |
64 | |
65 static inline void | |
66 set_state(enum state_type t, int_least16_t value, uint_least32_t *output) | |
67 { | |
68 *output &= ~state_lut[t].filter_mask; | |
69 *output |= ((uint_least32_t)(value - state_lut[t].value_offset) | |
70 << state_lut[t].mask_shift) & | |
71 state_lut[t].filter_mask; | |
72 } | |
73 | |
74 struct isolate_runner { | |
75 uint_least32_t *buf; | |
76 size_t buflen; | |
77 size_t start; | |
78 | |
79 struct { | |
80 size_t off; | |
81 } prev, cur, next; | |
82 | |
83 enum bidi_property sos, eos; | |
84 | |
85 uint_least8_t paragraph_level; | |
86 int_least8_t isolating_run_level; | |
87 }; | |
88 | |
89 static inline enum bidi_property | |
90 ir_get_previous_prop(const struct isolate_runner *ir) | |
91 { | |
92 return (ir->prev.off == SIZE_MAX) ? | |
93 ir->sos : | |
94 (uint_least8_t)get_state(STATE_PROP, | |
95 ir->buf[ir->prev.off]); | |
96 } | |
97 | |
98 static inline enum bidi_property | |
99 ir_get_current_prop(const struct isolate_runner *ir) | |
100 { | |
101 return (uint_least8_t)get_state(STATE_PROP, ir->buf[ir->cur.off]… | |
102 } | |
103 | |
104 static inline enum bidi_property | |
105 ir_get_next_prop(const struct isolate_runner *ir) | |
106 { | |
107 return (ir->next.off == SIZE_MAX) ? | |
108 ir->eos : | |
109 (uint_least8_t)get_state(STATE_PROP, | |
110 ir->buf[ir->next.off]); | |
111 } | |
112 | |
113 static inline enum bidi_property | |
114 ir_get_current_preserved_prop(const struct isolate_runner *ir) | |
115 { | |
116 return (uint_least8_t)get_state(STATE_PRESERVED_PROP, | |
117 ir->buf[ir->cur.off]); | |
118 } | |
119 | |
120 static inline int_least8_t | |
121 ir_get_current_level(const struct isolate_runner *ir) | |
122 { | |
123 return (int_least8_t)get_state(STATE_LEVEL, ir->buf[ir->cur.off]… | |
124 } | |
125 | |
126 static inline const struct bracket * | |
127 ir_get_current_bracket_prop(const struct isolate_runner *ir) | |
128 { | |
129 return bidi_bracket + | |
130 (int_least8_t)get_state(STATE_BRACKET_OFF, ir->buf[ir->cu… | |
131 } | |
132 | |
133 static void | |
134 ir_set_current_prop(const struct isolate_runner *ir, enum bidi_property … | |
135 { | |
136 set_state(STATE_PROP, (int_least16_t)prop, &(ir->buf[ir->cur.off… | |
137 } | |
138 | |
139 static void | |
140 ir_init(uint_least32_t *buf, size_t buflen, size_t off, | |
141 uint_least8_t paragraph_level, bool within, struct isolate_runne… | |
142 { | |
143 size_t i; | |
144 int_least8_t sos_level; | |
145 | |
146 /* initialize invariants */ | |
147 ir->buf = buf; | |
148 ir->buflen = buflen; | |
149 ir->paragraph_level = paragraph_level; | |
150 ir->start = off; | |
151 | |
152 /* advance off until we are at a non-removed character */ | |
153 for (; off < buflen; off++) { | |
154 if (get_state(STATE_LEVEL, buf[off]) != -1) { | |
155 break; | |
156 } | |
157 } | |
158 if (off == buflen) { | |
159 /* we encountered no more non-removed character, termina… | |
160 ir->next.off = SIZE_MAX; | |
161 return; | |
162 } | |
163 | |
164 /* set the isolating run level to that of the current offset */ | |
165 ir->isolating_run_level = | |
166 (int_least8_t)get_state(STATE_LEVEL, buf[off]); | |
167 | |
168 /* initialize sos and eos to dummy values */ | |
169 ir->sos = ir->eos = NUM_BIDI_PROPS; | |
170 | |
171 /* | |
172 * we write the information of the "current" state into next, | |
173 * so that the shift-in at the first advancement moves it in | |
174 * cur, as desired. | |
175 */ | |
176 ir->next.off = off; | |
177 | |
178 /* | |
179 * determine the previous state but store its offset in cur.off, | |
180 * given it's shifted in on the first advancement | |
181 */ | |
182 ir->cur.off = SIZE_MAX; | |
183 for (i = off, sos_level = -1; i >= 1; i--) { | |
184 if (get_state(STATE_LEVEL, buf[i - 1]) != -1) { | |
185 /* | |
186 * we found a character that has not been | |
187 * removed in X9 | |
188 */ | |
189 sos_level = (int_least8_t)get_state(STATE_LEVEL, | |
190 buf[i - 1]); | |
191 | |
192 if (within) { | |
193 /* we just take it */ | |
194 ir->cur.off = i; | |
195 } | |
196 | |
197 break; | |
198 } | |
199 } | |
200 if (sos_level == -1) { | |
201 /* | |
202 * there were no preceding non-removed characters, set | |
203 * sos-level to paragraph embedding level | |
204 */ | |
205 sos_level = (int_least8_t)paragraph_level; | |
206 } | |
207 | |
208 if (!within || ir->cur.off == SIZE_MAX) { | |
209 /* | |
210 * we are at the beginning of the sequence; initialize | |
211 * it faithfully according to the algorithm by looking | |
212 * at the sos-level | |
213 */ | |
214 if (MAX(sos_level, ir->isolating_run_level) % 2 == 0) { | |
215 /* the higher level is even, set sos to L */ | |
216 ir->sos = BIDI_PROP_L; | |
217 } else { | |
218 /* the higher level is odd, set sos to R */ | |
219 ir->sos = BIDI_PROP_R; | |
220 } | |
221 } | |
222 } | |
223 | |
224 static int | |
225 ir_advance(struct isolate_runner *ir) | |
226 { | |
227 enum bidi_property prop; | |
228 int_least8_t level, isolate_level, last_isolate_level; | |
229 size_t i; | |
230 | |
231 if (ir->next.off == SIZE_MAX) { | |
232 /* the sequence is over */ | |
233 return 1; | |
234 } | |
235 | |
236 /* shift in */ | |
237 ir->prev.off = ir->cur.off; | |
238 ir->cur.off = ir->next.off; | |
239 | |
240 /* mark as visited */ | |
241 set_state(STATE_VISITED, 1, &(ir->buf[ir->cur.off])); | |
242 | |
243 /* initialize next state by going to the next character in the s… | |
244 */ | |
245 ir->next.off = SIZE_MAX; | |
246 | |
247 last_isolate_level = -1; | |
248 for (i = ir->cur.off, isolate_level = 0; i < ir->buflen; i++) { | |
249 level = (int_least8_t)get_state(STATE_LEVEL, ir->buf[i]); | |
250 prop = (uint_least8_t)get_state(STATE_PROP, ir->buf[i]); | |
251 | |
252 if (level == -1) { | |
253 /* this is one of the ignored characters, skip */ | |
254 continue; | |
255 } else if (level == ir->isolating_run_level) { | |
256 last_isolate_level = level; | |
257 } | |
258 | |
259 /* follow BD8/BD9 and P2 to traverse the current sequenc… | |
260 if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || | |
261 prop == BIDI_PROP_FSI) { | |
262 /* | |
263 * we encountered an isolate initiator, increment | |
264 * counter, but go into processing when we | |
265 * were not isolated before | |
266 */ | |
267 if (isolate_level < MAX_DEPTH) { | |
268 isolate_level++; | |
269 } | |
270 if (isolate_level != 1) { | |
271 continue; | |
272 } | |
273 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) { | |
274 isolate_level--; | |
275 | |
276 /* | |
277 * if the current PDI dropped the isolate-level | |
278 * to zero, it is itself part of the isolating | |
279 * run sequence; otherwise we simply continue. | |
280 */ | |
281 if (isolate_level > 0) { | |
282 continue; | |
283 } | |
284 } else if (isolate_level > 0) { | |
285 /* we are in an isolating sequence */ | |
286 continue; | |
287 } | |
288 | |
289 /* | |
290 * now we either still are in our sequence or we hit | |
291 * the eos-case as we left the sequence and hit the | |
292 * first non-isolating-sequence character. | |
293 */ | |
294 if (i == ir->cur.off) { | |
295 /* we were in the first initializing round */ | |
296 continue; | |
297 } else if (level == ir->isolating_run_level) { | |
298 /* isolate_level-skips have been handled before,… | |
299 * good */ | |
300 /* still in the sequence */ | |
301 ir->next.off = i; | |
302 } else { | |
303 /* out of sequence or isolated, compare levels v… | |
304 */ | |
305 ir->next.off = SIZE_MAX; | |
306 if (MAX(last_isolate_level, level) % 2 == 0) { | |
307 ir->eos = BIDI_PROP_L; | |
308 } else { | |
309 ir->eos = BIDI_PROP_R; | |
310 } | |
311 } | |
312 break; | |
313 } | |
314 if (i == ir->buflen) { | |
315 /* | |
316 * the sequence ended before we could grab an offset. | |
317 * we need to determine the eos-prop by comparing the | |
318 * level of the last element in the isolating run sequen… | |
319 * with the paragraph level. | |
320 */ | |
321 ir->next.off = SIZE_MAX; | |
322 if (MAX(last_isolate_level, ir->paragraph_level) % 2 == … | |
323 /* the higher level is even, set eos to L */ | |
324 ir->eos = BIDI_PROP_L; | |
325 } else { | |
326 /* the higher level is odd, set eos to R */ | |
327 ir->eos = BIDI_PROP_R; | |
328 } | |
329 } | |
330 | |
331 return 0; | |
332 } | |
333 | |
334 static enum bidi_property | |
335 ir_get_last_strong_prop(const struct isolate_runner *ir) | |
336 { | |
337 struct isolate_runner tmp; | |
338 enum bidi_property last_strong_prop = ir->sos, prop; | |
339 | |
340 ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, fal… | |
341 &tmp); | |
342 for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) { | |
343 prop = ir_get_current_prop(&tmp); | |
344 | |
345 if (prop == BIDI_PROP_R || prop == BIDI_PROP_L || | |
346 prop == BIDI_PROP_AL) { | |
347 last_strong_prop = prop; | |
348 } | |
349 } | |
350 | |
351 return last_strong_prop; | |
352 } | |
353 | |
354 static enum bidi_property | |
355 ir_get_last_strong_or_number_prop(const struct isolate_runner *ir) | |
356 { | |
357 struct isolate_runner tmp; | |
358 enum bidi_property last_strong_or_number_prop = ir->sos, prop; | |
359 | |
360 ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, fal… | |
361 &tmp); | |
362 for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) { | |
363 prop = ir_get_current_prop(&tmp); | |
364 | |
365 if (prop == BIDI_PROP_R || prop == BIDI_PROP_L || | |
366 prop == BIDI_PROP_AL || prop == BIDI_PROP_EN || | |
367 prop == BIDI_PROP_AN) { | |
368 last_strong_or_number_prop = prop; | |
369 } | |
370 } | |
371 | |
372 return last_strong_or_number_prop; | |
373 } | |
374 | |
375 static void | |
376 preprocess_bracket_pair(const struct isolate_runner *start, | |
377 const struct isolate_runner *end) | |
378 { | |
379 enum bidi_property prop, bracket_prop, last_strong_or_number_pro… | |
380 struct isolate_runner ir; | |
381 size_t strong_type_off; | |
382 | |
383 /* | |
384 * check if the bracket contains a strong type (L or R|EN|AN) | |
385 */ | |
386 for (ir = *start, strong_type_off = SIZE_MAX, | |
387 bracket_prop = NUM_BIDI_PROPS; | |
388 !ir_advance(&ir) && ir.cur.off < end->cur.off;) { | |
389 prop = ir_get_current_prop(&ir); | |
390 | |
391 if (prop == BIDI_PROP_L) { | |
392 strong_type_off = ir.cur.off; | |
393 if (ir.isolating_run_level % 2 == 0) { | |
394 /* | |
395 * set the type for both brackets to L (… | |
396 * match the strong type they contain) | |
397 */ | |
398 bracket_prop = BIDI_PROP_L; | |
399 } | |
400 } else if (prop == BIDI_PROP_R || prop == BIDI_PROP_EN || | |
401 prop == BIDI_PROP_AN) { | |
402 strong_type_off = ir.cur.off; | |
403 if (ir.isolating_run_level % 2 != 0) { | |
404 /* | |
405 * set the type for both brackets to R (… | |
406 * match the strong type they contain) | |
407 */ | |
408 bracket_prop = BIDI_PROP_R; | |
409 } | |
410 } | |
411 } | |
412 if (strong_type_off == SIZE_MAX) { | |
413 /* | |
414 * there are no strong types within the brackets and we … | |
415 * leave the brackets as is | |
416 */ | |
417 return; | |
418 } | |
419 | |
420 if (bracket_prop == NUM_BIDI_PROPS) { | |
421 /* | |
422 * We encountered a strong type, but it was opposite | |
423 * to the embedding direction. | |
424 * Check the previous strong type before the opening | |
425 * bracket | |
426 */ | |
427 last_strong_or_number_prop = | |
428 ir_get_last_strong_or_number_prop(start); | |
429 if (last_strong_or_number_prop == BIDI_PROP_L && | |
430 ir.isolating_run_level % 2 != 0) { | |
431 /* | |
432 * the previous strong type is also opposite | |
433 * to the embedding direction, so the context | |
434 * was established and we set the brackets | |
435 * accordingly. | |
436 */ | |
437 bracket_prop = BIDI_PROP_L; | |
438 } else if ((last_strong_or_number_prop == BIDI_PROP_R || | |
439 last_strong_or_number_prop == BIDI_PROP_EN || | |
440 last_strong_or_number_prop == BIDI_PROP_AN) … | |
441 ir.isolating_run_level % 2 == 0) { | |
442 /* | |
443 * the previous strong type is also opposite | |
444 * to the embedding direction, so the context | |
445 * was established and we set the brackets | |
446 * accordingly. | |
447 */ | |
448 bracket_prop = BIDI_PROP_R; | |
449 } else { | |
450 /* set brackets to the embedding direction */ | |
451 if (ir.isolating_run_level % 2 == 0) { | |
452 bracket_prop = BIDI_PROP_L; | |
453 } else { | |
454 bracket_prop = BIDI_PROP_R; | |
455 } | |
456 } | |
457 } | |
458 | |
459 ir_set_current_prop(start, bracket_prop); | |
460 ir_set_current_prop(end, bracket_prop); | |
461 | |
462 /* | |
463 * any sequence of NSMs after opening or closing brackets get | |
464 * the same property as the one we set on the brackets | |
465 */ | |
466 for (ir = *start; !ir_advance(&ir) && ir_get_current_preserved_p… | |
467 &ir) == BIDI_PROP_… | |
468 ir_set_current_prop(&ir, bracket_prop); | |
469 } | |
470 for (ir = *end; !ir_advance(&ir) && | |
471 ir_get_current_preserved_prop(&ir) == BIDI_PROP_… | |
472 ir_set_current_prop(&ir, bracket_prop); | |
473 } | |
474 } | |
475 | |
476 static void | |
477 preprocess_bracket_pairs(uint_least32_t *buf, size_t buflen, size_t off, | |
478 uint_least8_t paragraph_level) | |
479 { | |
480 /* | |
481 * The N0-rule deals with bracket pairs that shall be determined | |
482 * with the rule BD16. This is specified as an algorithm with a | |
483 * stack of 63 bracket openings that are used to resolve into a | |
484 * separate list of pairs, which is then to be sorted by opening | |
485 * position. Thus, even though the bracketing-depth is limited | |
486 * by 63, the algorithm, as is, requires dynamic memory | |
487 * management. | |
488 * | |
489 * A naive approach (used by Fribidi) would be to screw the | |
490 * stack-approach and simply directly determine the | |
491 * corresponding closing bracket offset for a given opening | |
492 * bracket, leading to O(n²) time complexity in the worst case | |
493 * with a lot of brackets. While many brackets are not common, | |
494 * it is still possible to find a middle ground where you obtain | |
495 * strongly linear time complexity in most common cases: | |
496 * | |
497 * Instead of a stack, we use a FIFO data structure which is | |
498 * filled with bracket openings in the order of appearance (thus | |
499 * yielding an implicit sorting!) at the top. If the | |
500 * corresponding closing bracket is encountered, it is added to | |
501 * the respective entry, making it ready to "move out" at the | |
502 * bottom (i.e. passed to the bracket processing). Due to the | |
503 * nature of the specified pair detection algorithm, which only | |
504 * cares about the bracket type and nothing else (bidi class, | |
505 * level, etc.), we can mix processing and bracket detection. | |
506 * | |
507 * Obviously, if you, for instance, have one big bracket pair at | |
508 * the bottom that has not been closed yet, it will block the | |
509 * bracket processing and the FIFO might hit its capacity limit. | |
510 * At this point, the blockage is manually resolved using the | |
511 * naive quadratic approach. | |
512 * | |
513 * To remain within the specified standard behaviour, which | |
514 * mandates that processing of brackets should stop when the | |
515 * bracketing-depth is at 63, we simply check in an "overflow" | |
516 * scenario if all 63 elements in the LIFO are unfinished, which | |
517 * corresponds with such a bracketing depth. | |
518 */ | |
519 enum bidi_property prop; | |
520 | |
521 struct { | |
522 bool complete; | |
523 size_t bracket_class; | |
524 struct isolate_runner start; | |
525 struct isolate_runner end; | |
526 } fifo[63]; | |
527 const struct bracket *bracket_prop, *tmp_bracket_prop; | |
528 struct isolate_runner ir, tmp_ir; | |
529 size_t fifo_len = 0, i, blevel, j, k; | |
530 | |
531 ir_init(buf, buflen, off, paragraph_level, false, &ir); | |
532 while (!ir_advance(&ir)) { | |
533 prop = ir_get_current_prop(&ir); | |
534 bracket_prop = ir_get_current_bracket_prop(&ir); | |
535 if (prop == BIDI_PROP_ON && | |
536 bracket_prop->type == BIDI_BRACKET_OPEN) { | |
537 if (fifo_len == LEN(fifo)) { | |
538 /* | |
539 * The FIFO is full, check first if it's | |
540 * completely blocked (i.e. no finished | |
541 * bracket pairs, triggering the standard | |
542 * that mandates to abort in such a case | |
543 */ | |
544 for (i = 0; i < fifo_len; i++) { | |
545 if (fifo[i].complete) { | |
546 break; | |
547 } | |
548 } | |
549 if (i == fifo_len) { | |
550 /* abort processing */ | |
551 return; | |
552 } | |
553 | |
554 /* | |
555 * by construction, the bottom entry | |
556 * in the FIFO is guaranteed to be | |
557 * unfinished (given we "consume" all | |
558 * finished bottom entries after each | |
559 * iteration). | |
560 * | |
561 * iterate, starting after the opening | |
562 * bracket, and find the corresponding | |
563 * closing bracket. | |
564 * | |
565 * if we find none, just drop the FIFO | |
566 * entry silently | |
567 */ | |
568 for (tmp_ir = fifo[0].start, blevel = 0; | |
569 !ir_advance(&tmp_ir);) { | |
570 tmp_bracket_prop = | |
571 ir_get_current_bracket_p… | |
572 &tmp_ir); | |
573 | |
574 if (tmp_bracket_prop->type == | |
575 BIDI_BRACKET_OPEN && | |
576 tmp_bracket_prop->class == | |
577 bracket_prop->class)… | |
578 /* we encountered another | |
579 * opening bracket of the | |
580 * same class */ | |
581 blevel++; | |
582 | |
583 } else if (tmp_bracket_prop->typ… | |
584 BIDI_BRACKET_… | |
585 tmp_bracket_prop->cla… | |
586 bracket_prop | |
587 ->cla… | |
588 /* we encountered a clos… | |
589 * bracket of the same c… | |
590 */ | |
591 if (blevel == 0) { | |
592 /* this is the | |
593 * corresponding | |
594 * closing brack… | |
595 */ | |
596 fifo[0].complete… | |
597 fifo[0].end = ir; | |
598 } else { | |
599 blevel--; | |
600 } | |
601 } | |
602 } | |
603 if (fifo[0].complete) { | |
604 /* we found the matching bracket… | |
605 preprocess_bracket_pair( | |
606 &(fifo[i].start), | |
607 &(fifo[i].end)); | |
608 } | |
609 | |
610 /* shift FIFO one to the left */ | |
611 for (i = 1; i < fifo_len; i++) { | |
612 fifo[i - 1] = fifo[i]; | |
613 } | |
614 fifo_len--; | |
615 } | |
616 | |
617 /* add element to the FIFO */ | |
618 fifo_len++; | |
619 fifo[fifo_len - 1].complete = false; | |
620 fifo[fifo_len - 1].bracket_class = bracket_prop-… | |
621 fifo[fifo_len - 1].start = ir; | |
622 } else if (prop == BIDI_PROP_ON && | |
623 bracket_prop->type == BIDI_BRACKET_CLOSE) { | |
624 /* | |
625 * go backwards in the FIFO, skip finished entri… | |
626 * and simply ignore (do nothing) the closing | |
627 * bracket if we do not match anything | |
628 */ | |
629 for (i = fifo_len; i > 0; i--) { | |
630 if (bracket_prop->class == | |
631 fifo[i - 1].bracket_class && | |
632 !fifo[i - 1].complete) { | |
633 /* we have found a pair */ | |
634 fifo[i - 1].complete = true; | |
635 fifo[i - 1].end = ir; | |
636 | |
637 /* remove all uncompleted FIFO e… | |
638 * above i - 1 */ | |
639 for (j = i; j < fifo_len;) { | |
640 if (fifo[j].complete) { | |
641 j++; | |
642 continue; | |
643 } | |
644 | |
645 /* shift FIFO one to the… | |
646 for (k = j + 1; k < fifo… | |
647 k++) { | |
648 fifo[k - 1] = fi… | |
649 } | |
650 fifo_len--; | |
651 } | |
652 break; | |
653 } | |
654 } | |
655 } | |
656 | |
657 /* process all ready bracket pairs from the FIFO bottom … | |
658 while (fifo_len > 0 && fifo[0].complete) { | |
659 preprocess_bracket_pair(&(fifo[0].start), | |
660 &(fifo[0].end)); | |
661 | |
662 /* shift FIFO one to the left */ | |
663 for (j = 0; j + 1 < fifo_len; j++) { | |
664 fifo[j] = fifo[j + 1]; | |
665 } | |
666 fifo_len--; | |
667 } | |
668 } | |
669 | |
670 /* | |
671 * afterwards, we still might have unfinished bracket pairs | |
672 * that will remain as such, but the subsequent finished pairs | |
673 * also need to be taken into account, so we go through the | |
674 * FIFO once more and process all finished pairs | |
675 */ | |
676 for (i = 0; i < fifo_len; i++) { | |
677 if (fifo[i].complete) { | |
678 preprocess_bracket_pair(&(fifo[i].start), | |
679 &(fifo[i].end)); | |
680 } | |
681 } | |
682 } | |
683 | |
684 static size_t | |
685 preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen, | |
686 size_t off, uint_least8_t paragraph_le… | |
687 { | |
688 enum bidi_property sequence_prop, prop; | |
689 struct isolate_runner ir, tmp; | |
690 size_t runsince, sequence_end; | |
691 | |
692 /* W1 */ | |
693 ir_init(buf, buflen, off, paragraph_level, false, &ir); | |
694 while (!ir_advance(&ir)) { | |
695 if (ir_get_current_prop(&ir) == BIDI_PROP_NSM) { | |
696 prop = ir_get_previous_prop(&ir); | |
697 | |
698 if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_R… | |
699 prop == BIDI_PROP_FSI || prop == BIDI_PROP_P… | |
700 ir_set_current_prop(&ir, BIDI_PROP_ON); | |
701 } else { | |
702 ir_set_current_prop(&ir, prop); | |
703 } | |
704 } | |
705 } | |
706 | |
707 /* W2 */ | |
708 ir_init(buf, buflen, off, paragraph_level, false, &ir); | |
709 while (!ir_advance(&ir)) { | |
710 if (ir_get_current_prop(&ir) == BIDI_PROP_EN && | |
711 ir_get_last_strong_prop(&ir) == BIDI_PROP_AL) { | |
712 ir_set_current_prop(&ir, BIDI_PROP_AN); | |
713 } | |
714 } | |
715 | |
716 /* W3 */ | |
717 ir_init(buf, buflen, off, paragraph_level, false, &ir); | |
718 while (!ir_advance(&ir)) { | |
719 if (ir_get_current_prop(&ir) == BIDI_PROP_AL) { | |
720 ir_set_current_prop(&ir, BIDI_PROP_R); | |
721 } | |
722 } | |
723 | |
724 /* W4 */ | |
725 ir_init(buf, buflen, off, paragraph_level, false, &ir); | |
726 while (!ir_advance(&ir)) { | |
727 if (ir_get_previous_prop(&ir) == BIDI_PROP_EN && | |
728 (ir_get_current_prop(&ir) == BIDI_PROP_ES || | |
729 ir_get_current_prop(&ir) == BIDI_PROP_CS) && | |
730 ir_get_next_prop(&ir) == BIDI_PROP_EN) { | |
731 ir_set_current_prop(&ir, BIDI_PROP_EN); | |
732 } | |
733 | |
734 if (ir_get_previous_prop(&ir) == BIDI_PROP_AN && | |
735 ir_get_current_prop(&ir) == BIDI_PROP_CS && | |
736 ir_get_next_prop(&ir) == BIDI_PROP_AN) { | |
737 ir_set_current_prop(&ir, BIDI_PROP_AN); | |
738 } | |
739 } | |
740 | |
741 /* W5 */ | |
742 runsince = SIZE_MAX; | |
743 ir_init(buf, buflen, off, paragraph_level, false, &ir); | |
744 while (!ir_advance(&ir)) { | |
745 if (ir_get_current_prop(&ir) == BIDI_PROP_ET) { | |
746 if (runsince == SIZE_MAX) { | |
747 /* a new run has begun */ | |
748 runsince = ir.cur.off; | |
749 } | |
750 } else if (ir_get_current_prop(&ir) == BIDI_PROP_EN) { | |
751 /* set the preceding sequence */ | |
752 if (runsince != SIZE_MAX) { | |
753 ir_init(buf, buflen, runsince, paragraph… | |
754 (runsince > off), &tmp); | |
755 while (!ir_advance(&tmp) && | |
756 tmp.cur.off < ir.cur.off) { | |
757 ir_set_current_prop(&tmp, BIDI_P… | |
758 } | |
759 runsince = SIZE_MAX; | |
760 } else { | |
761 ir_init(buf, buflen, ir.cur.off, | |
762 paragraph_level, (ir.cur.off > o… | |
763 &tmp); | |
764 ir_advance(&tmp); | |
765 } | |
766 /* follow the succeeding sequence */ | |
767 while (!ir_advance(&tmp)) { | |
768 if (ir_get_current_prop(&tmp) != BIDI_PR… | |
769 break; | |
770 } | |
771 ir_set_current_prop(&tmp, BIDI_PROP_EN); | |
772 } | |
773 } else { | |
774 /* sequence ended */ | |
775 runsince = SIZE_MAX; | |
776 } | |
777 } | |
778 | |
779 /* W6 */ | |
780 ir_init(buf, buflen, off, paragraph_level, false, &ir); | |
781 while (!ir_advance(&ir)) { | |
782 prop = ir_get_current_prop(&ir); | |
783 | |
784 if (prop == BIDI_PROP_ES || prop == BIDI_PROP_ET || | |
785 prop == BIDI_PROP_CS) { | |
786 ir_set_current_prop(&ir, BIDI_PROP_ON); | |
787 } | |
788 } | |
789 | |
790 /* W7 */ | |
791 ir_init(buf, buflen, off, paragraph_level, false, &ir); | |
792 while (!ir_advance(&ir)) { | |
793 if (ir_get_current_prop(&ir) == BIDI_PROP_EN && | |
794 ir_get_last_strong_prop(&ir) == BIDI_PROP_L) { | |
795 ir_set_current_prop(&ir, BIDI_PROP_L); | |
796 } | |
797 } | |
798 | |
799 /* N0 */ | |
800 preprocess_bracket_pairs(buf, buflen, off, paragraph_level); | |
801 | |
802 /* N1 */ | |
803 sequence_end = SIZE_MAX; | |
804 sequence_prop = NUM_BIDI_PROPS; | |
805 ir_init(buf, buflen, off, paragraph_level, false, &ir); | |
806 while (!ir_advance(&ir)) { | |
807 if (sequence_end == SIZE_MAX) { | |
808 prop = ir_get_current_prop(&ir); | |
809 | |
810 if (prop == BIDI_PROP_B || prop == BIDI_PROP_S || | |
811 prop == BIDI_PROP_WS || prop == BIDI_PROP_ON… | |
812 prop == BIDI_PROP_FSI || prop == BIDI_PROP_L… | |
813 prop == BIDI_PROP_RLI || prop == BIDI_PROP_P… | |
814 /* the current character is an NI (neutr… | |
815 * or isolate) */ | |
816 | |
817 /* scan ahead to the end of the NI-seque… | |
818 */ | |
819 ir_init(buf, buflen, ir.cur.off, | |
820 paragraph_level, (ir.cur.off > o… | |
821 &tmp); | |
822 while (!ir_advance(&tmp)) { | |
823 prop = ir_get_next_prop(&tmp); | |
824 | |
825 if (prop != BIDI_PROP_B && | |
826 prop != BIDI_PROP_S && | |
827 prop != BIDI_PROP_WS && | |
828 prop != BIDI_PROP_ON && | |
829 prop != BIDI_PROP_FSI && | |
830 prop != BIDI_PROP_LRI && | |
831 prop != BIDI_PROP_RLI && | |
832 prop != BIDI_PROP_PDI) { | |
833 break; | |
834 } | |
835 } | |
836 | |
837 /* | |
838 * check what follows and see if the text | |
839 * has the same direction on both sides | |
840 */ | |
841 if (ir_get_previous_prop(&ir) == BIDI_PR… | |
842 ir_get_next_prop(&tmp) == BIDI_PROP_… | |
843 sequence_end = tmp.cur.off; | |
844 sequence_prop = BIDI_PROP_L; | |
845 } else if ((ir_get_previous_prop(&ir) == | |
846 BIDI_PROP_R || | |
847 ir_get_previous_prop(&ir) == | |
848 BIDI_PROP_EN || | |
849 ir_get_previous_prop(&ir) == | |
850 BIDI_PROP_AN) && | |
851 (ir_get_next_prop(&tmp) == | |
852 BIDI_PROP_R || | |
853 ir_get_next_prop(&tmp) == | |
854 BIDI_PROP_EN || | |
855 ir_get_next_prop(&tmp) == | |
856 BIDI_PROP_AN)) { | |
857 sequence_end = tmp.cur.off; | |
858 sequence_prop = BIDI_PROP_R; | |
859 } | |
860 } | |
861 } | |
862 | |
863 if (sequence_end != SIZE_MAX) { | |
864 if (ir.cur.off <= sequence_end) { | |
865 ir_set_current_prop(&ir, sequence_prop); | |
866 } else { | |
867 /* end of sequence, reset */ | |
868 sequence_end = SIZE_MAX; | |
869 sequence_prop = NUM_BIDI_PROPS; | |
870 } | |
871 } | |
872 } | |
873 | |
874 /* N2 */ | |
875 ir_init(buf, buflen, off, paragraph_level, false, &ir); | |
876 while (!ir_advance(&ir)) { | |
877 prop = ir_get_current_prop(&ir); | |
878 | |
879 if (prop == BIDI_PROP_B || prop == BIDI_PROP_S || | |
880 prop == BIDI_PROP_WS || prop == BIDI_PROP_ON || | |
881 prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI || | |
882 prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) { | |
883 /* N2 */ | |
884 if (ir_get_current_level(&ir) % 2 == 0) { | |
885 /* even embedding level */ | |
886 ir_set_current_prop(&ir, BIDI_PROP_L); | |
887 } else { | |
888 /* odd embedding level */ | |
889 ir_set_current_prop(&ir, BIDI_PROP_R); | |
890 } | |
891 } | |
892 } | |
893 | |
894 return 0; | |
895 } | |
896 | |
897 static uint_least8_t | |
898 get_isolated_paragraph_level(const uint_least32_t *state, size_t statele… | |
899 { | |
900 enum bidi_property prop; | |
901 int_least8_t isolate_level; | |
902 size_t stateoff; | |
903 | |
904 /* determine paragraph level (rules P1-P3) and terminate on PDI … | |
905 for (stateoff = 0, isolate_level = 0; stateoff < statelen; state… | |
906 prop = (uint_least8_t)get_state(STATE_PROP, state[stateo… | |
907 | |
908 if (prop == BIDI_PROP_PDI && isolate_level == 0) { | |
909 /* | |
910 * we are in a FSI-subsection of a paragraph and | |
911 * matched with the terminating PDI | |
912 */ | |
913 break; | |
914 } | |
915 | |
916 /* BD8/BD9 */ | |
917 if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || | |
918 prop == BIDI_PROP_FSI) && | |
919 isolate_level < MAX_DEPTH) { | |
920 /* we hit an isolate initiator, increment counte… | |
921 isolate_level++; | |
922 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) { | |
923 isolate_level--; | |
924 } | |
925 | |
926 /* P2 */ | |
927 if (isolate_level > 0) { | |
928 continue; | |
929 } | |
930 | |
931 /* P3 */ | |
932 if (prop == BIDI_PROP_L) { | |
933 return 0; | |
934 } else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) { | |
935 return 1; | |
936 } | |
937 } | |
938 | |
939 return 0; | |
940 } | |
941 | |
942 static inline uint_least8_t | |
943 get_bidi_property(uint_least32_t cp) | |
944 { | |
945 if (likely(cp <= 0x10FFFF)) { | |
946 return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) & | |
947 0x1F /* 00011111 */; | |
948 } else { | |
949 return BIDI_PROP_L; | |
950 } | |
951 } | |
952 | |
953 static uint_least8_t | |
954 get_paragraph_level(enum grapheme_bidirectional_direction override, | |
955 const HERODOTUS_READER *r) | |
956 { | |
957 HERODOTUS_READER tmp; | |
958 enum bidi_property prop; | |
959 int_least8_t isolate_level; | |
960 uint_least32_t cp; | |
961 | |
962 /* check overrides first according to rule HL1 */ | |
963 if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { | |
964 return 0; | |
965 } else if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { | |
966 return 1; | |
967 } | |
968 | |
969 /* copy reader into temporary reader */ | |
970 herodotus_reader_copy(r, &tmp); | |
971 | |
972 /* determine paragraph level (rules P1-P3) */ | |
973 for (isolate_level = 0; herodotus_read_codepoint(&tmp, true, &cp… | |
974 HERODOTUS_STATUS_SUCCESS;) { | |
975 prop = get_bidi_property(cp); | |
976 | |
977 /* BD8/BD9 */ | |
978 if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || | |
979 prop == BIDI_PROP_FSI) && | |
980 isolate_level < MAX_DEPTH) { | |
981 /* we hit an isolate initiator, increment counte… | |
982 isolate_level++; | |
983 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) { | |
984 isolate_level--; | |
985 } | |
986 | |
987 /* P2 */ | |
988 if (isolate_level > 0) { | |
989 continue; | |
990 } | |
991 | |
992 /* P3 */ | |
993 if (prop == BIDI_PROP_L) { | |
994 return 0; | |
995 } else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) { | |
996 return 1; | |
997 } | |
998 } | |
999 | |
1000 return 0; | |
1001 } | |
1002 | |
1003 static void | |
1004 preprocess_paragraph(uint_least8_t paragraph_level, uint_least32_t *buf, | |
1005 size_t buflen) | |
1006 { | |
1007 enum bidi_property prop; | |
1008 int_least8_t level; | |
1009 | |
1010 struct { | |
1011 int_least8_t level; | |
1012 enum grapheme_bidirectional_direction override; | |
1013 bool directional_isolate; | |
1014 } directional_status[MAX_DEPTH + 2], *dirstat = directional_stat… | |
1015 | |
1016 size_t overflow_isolate_count, overflow_embedding_count, | |
1017 valid_isolate_count, bufoff, i, runsince; | |
1018 | |
1019 /* X1 */ | |
1020 dirstat->level = (int_least8_t)paragraph_level; | |
1021 dirstat->override = GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; | |
1022 dirstat->directional_isolate = false; | |
1023 overflow_isolate_count = overflow_embedding_count = | |
1024 valid_isolate_count = 0; | |
1025 | |
1026 for (bufoff = 0; bufoff < buflen; bufoff++) { | |
1027 prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]); | |
1028 | |
1029 /* set paragraph level we need for line-level-processing… | |
1030 set_state(STATE_PARAGRAPH_LEVEL, paragraph_level, | |
1031 &(buf[bufoff])); | |
1032 again: | |
1033 if (prop == BIDI_PROP_RLE) { | |
1034 /* X2 */ | |
1035 if (dirstat->level + (dirstat->level % 2 != 0) +… | |
1036 MAX_DEPTH && | |
1037 overflow_isolate_count == 0 && | |
1038 overflow_embedding_count == 0) { | |
1039 /* valid RLE */ | |
1040 dirstat++; | |
1041 dirstat->level = | |
1042 (dirstat - 1)->level + | |
1043 ((dirstat - 1)->level % 2 != 0) … | |
1044 dirstat->override = | |
1045 GRAPHEME_BIDIRECTIONAL_DIRECTION… | |
1046 dirstat->directional_isolate = false; | |
1047 } else { | |
1048 /* overflow RLE */ | |
1049 overflow_embedding_count += | |
1050 (overflow_isolate_count == 0); | |
1051 } | |
1052 } else if (prop == BIDI_PROP_LRE) { | |
1053 /* X3 */ | |
1054 if (dirstat->level + (dirstat->level % 2 == 0) +… | |
1055 MAX_DEPTH && | |
1056 overflow_isolate_count == 0 && | |
1057 overflow_embedding_count == 0) { | |
1058 /* valid LRE */ | |
1059 dirstat++; | |
1060 dirstat->level = | |
1061 (dirstat - 1)->level + | |
1062 ((dirstat - 1)->level % 2 == 0) … | |
1063 dirstat->override = | |
1064 GRAPHEME_BIDIRECTIONAL_DIRECTION… | |
1065 dirstat->directional_isolate = false; | |
1066 } else { | |
1067 /* overflow LRE */ | |
1068 overflow_embedding_count += | |
1069 (overflow_isolate_count == 0); | |
1070 } | |
1071 } else if (prop == BIDI_PROP_RLO) { | |
1072 /* X4 */ | |
1073 if (dirstat->level + (dirstat->level % 2 != 0) +… | |
1074 MAX_DEPTH && | |
1075 overflow_isolate_count == 0 && | |
1076 overflow_embedding_count == 0) { | |
1077 /* valid RLO */ | |
1078 dirstat++; | |
1079 dirstat->level = | |
1080 (dirstat - 1)->level + | |
1081 ((dirstat - 1)->level % 2 != 0) … | |
1082 dirstat->override = | |
1083 GRAPHEME_BIDIRECTIONAL_DIRECTION… | |
1084 dirstat->directional_isolate = false; | |
1085 } else { | |
1086 /* overflow RLO */ | |
1087 overflow_embedding_count += | |
1088 (overflow_isolate_count == 0); | |
1089 } | |
1090 } else if (prop == BIDI_PROP_LRO) { | |
1091 /* X5 */ | |
1092 if (dirstat->level + (dirstat->level % 2 == 0) +… | |
1093 MAX_DEPTH && | |
1094 overflow_isolate_count == 0 && | |
1095 overflow_embedding_count == 0) { | |
1096 /* valid LRE */ | |
1097 dirstat++; | |
1098 dirstat->level = | |
1099 (dirstat - 1)->level + | |
1100 ((dirstat - 1)->level % 2 == 0) … | |
1101 dirstat->override = | |
1102 GRAPHEME_BIDIRECTIONAL_DIRECTION… | |
1103 dirstat->directional_isolate = false; | |
1104 } else { | |
1105 /* overflow LRO */ | |
1106 overflow_embedding_count += | |
1107 (overflow_isolate_count == 0); | |
1108 } | |
1109 } else if (prop == BIDI_PROP_RLI) { | |
1110 /* X5a */ | |
1111 set_state(STATE_LEVEL, dirstat->level, &(buf[buf… | |
1112 if (dirstat->override == | |
1113 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { | |
1114 set_state(STATE_PROP, BIDI_PROP_L, | |
1115 &(buf[bufoff])); | |
1116 } else if (dirstat->override == | |
1117 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL)… | |
1118 set_state(STATE_PROP, BIDI_PROP_R, | |
1119 &(buf[bufoff])); | |
1120 } | |
1121 | |
1122 if (dirstat->level + (dirstat->level % 2 != 0) +… | |
1123 MAX_DEPTH && | |
1124 overflow_isolate_count == 0 && | |
1125 overflow_embedding_count == 0) { | |
1126 /* valid RLI */ | |
1127 valid_isolate_count++; | |
1128 | |
1129 dirstat++; | |
1130 dirstat->level = | |
1131 (dirstat - 1)->level + | |
1132 ((dirstat - 1)->level % 2 != 0) … | |
1133 dirstat->override = | |
1134 GRAPHEME_BIDIRECTIONAL_DIRECTION… | |
1135 dirstat->directional_isolate = true; | |
1136 } else { | |
1137 /* overflow RLI */ | |
1138 overflow_isolate_count++; | |
1139 } | |
1140 } else if (prop == BIDI_PROP_LRI) { | |
1141 /* X5b */ | |
1142 set_state(STATE_LEVEL, dirstat->level, &(buf[buf… | |
1143 if (dirstat->override == | |
1144 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { | |
1145 set_state(STATE_PROP, BIDI_PROP_L, | |
1146 &(buf[bufoff])); | |
1147 } else if (dirstat->override == | |
1148 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL)… | |
1149 set_state(STATE_PROP, BIDI_PROP_R, | |
1150 &(buf[bufoff])); | |
1151 } | |
1152 | |
1153 if (dirstat->level + (dirstat->level % 2 == 0) +… | |
1154 MAX_DEPTH && | |
1155 overflow_isolate_count == 0 && | |
1156 overflow_embedding_count == 0) { | |
1157 /* valid LRI */ | |
1158 valid_isolate_count++; | |
1159 | |
1160 dirstat++; | |
1161 dirstat->level = | |
1162 (dirstat - 1)->level + | |
1163 ((dirstat - 1)->level % 2 == 0) … | |
1164 dirstat->override = | |
1165 GRAPHEME_BIDIRECTIONAL_DIRECTION… | |
1166 dirstat->directional_isolate = true; | |
1167 } else { | |
1168 /* overflow LRI */ | |
1169 overflow_isolate_count++; | |
1170 } | |
1171 } else if (prop == BIDI_PROP_FSI) { | |
1172 /* X5c */ | |
1173 if (get_isolated_paragraph_level( | |
1174 buf + (bufoff + 1), | |
1175 buflen - (bufoff + 1)) == 1) { | |
1176 prop = BIDI_PROP_RLI; | |
1177 goto again; | |
1178 } else { /* ... == 0 */ | |
1179 prop = BIDI_PROP_LRI; | |
1180 goto again; | |
1181 } | |
1182 } else if (prop != BIDI_PROP_B && prop != BIDI_PROP_BN && | |
1183 prop != BIDI_PROP_PDF && prop != BIDI_PROP_PD… | |
1184 /* X6 */ | |
1185 set_state(STATE_LEVEL, dirstat->level, &(buf[buf… | |
1186 if (dirstat->override == | |
1187 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { | |
1188 set_state(STATE_PROP, BIDI_PROP_L, | |
1189 &(buf[bufoff])); | |
1190 } else if (dirstat->override == | |
1191 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL)… | |
1192 set_state(STATE_PROP, BIDI_PROP_R, | |
1193 &(buf[bufoff])); | |
1194 } | |
1195 } else if (prop == BIDI_PROP_PDI) { | |
1196 /* X6a */ | |
1197 if (overflow_isolate_count > 0) { | |
1198 /* PDI matches an overflow isolate initi… | |
1199 */ | |
1200 overflow_isolate_count--; | |
1201 } else if (valid_isolate_count > 0) { | |
1202 /* PDI matches a normal isolate initiato… | |
1203 overflow_embedding_count = 0; | |
1204 while (dirstat->directional_isolate == f… | |
1205 dirstat > directional_status) { | |
1206 /* | |
1207 * we are safe here as given the | |
1208 * valid isolate count is positi… | |
1209 * there must be a stack-entry | |
1210 * with positive directional | |
1211 * isolate status, but we take | |
1212 * no chances and include an | |
1213 * explicit check | |
1214 * | |
1215 * POSSIBLE OPTIMIZATION: Whenev… | |
1216 * we push on the stack, check i… | |
1217 * has the directional isolate | |
1218 * status true and store a point… | |
1219 * to it so we can jump to it ve… | |
1220 * quickly. | |
1221 */ | |
1222 dirstat--; | |
1223 } | |
1224 | |
1225 /* | |
1226 * as above, the following check is not | |
1227 * necessary, given we are guaranteed to | |
1228 * have at least one stack entry left, | |
1229 * but it's better to be safe | |
1230 */ | |
1231 if (dirstat > directional_status) { | |
1232 dirstat--; | |
1233 } | |
1234 valid_isolate_count--; | |
1235 } | |
1236 | |
1237 set_state(STATE_LEVEL, dirstat->level, &(buf[buf… | |
1238 if (dirstat->override == | |
1239 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { | |
1240 set_state(STATE_PROP, BIDI_PROP_L, | |
1241 &(buf[bufoff])); | |
1242 } else if (dirstat->override == | |
1243 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL)… | |
1244 set_state(STATE_PROP, BIDI_PROP_R, | |
1245 &(buf[bufoff])); | |
1246 } | |
1247 } else if (prop == BIDI_PROP_PDF) { | |
1248 /* X7 */ | |
1249 if (overflow_isolate_count > 0) { | |
1250 /* do nothing */ | |
1251 } else if (overflow_embedding_count > 0) { | |
1252 overflow_embedding_count--; | |
1253 } else if (dirstat->directional_isolate == false… | |
1254 dirstat > directional_status) { | |
1255 dirstat--; | |
1256 } | |
1257 } else if (prop == BIDI_PROP_B) { | |
1258 /* X8 */ | |
1259 set_state(STATE_LEVEL, paragraph_level, &(buf[bu… | |
1260 } | |
1261 | |
1262 /* X9 */ | |
1263 if (prop == BIDI_PROP_RLE || prop == BIDI_PROP_LRE || | |
1264 prop == BIDI_PROP_RLO || prop == BIDI_PROP_LRO || | |
1265 prop == BIDI_PROP_PDF || prop == BIDI_PROP_BN) { | |
1266 set_state(STATE_LEVEL, -1, &(buf[bufoff])); | |
1267 } | |
1268 } | |
1269 | |
1270 /* X10 (W1-W7, N0-N2) */ | |
1271 for (bufoff = 0; bufoff < buflen; bufoff++) { | |
1272 if (get_state(STATE_VISITED, buf[bufoff]) == 0 && | |
1273 get_state(STATE_LEVEL, buf[bufoff]) != -1) { | |
1274 bufoff += preprocess_isolating_run_sequence( | |
1275 buf, buflen, bufoff, paragraph_level); | |
1276 } | |
1277 } | |
1278 | |
1279 /* | |
1280 * I1-I2 (given our sequential approach to processing the | |
1281 * isolating run sequences, we apply this rule separately) | |
1282 */ | |
1283 for (bufoff = 0; bufoff < buflen; bufoff++) { | |
1284 level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]… | |
1285 prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]); | |
1286 | |
1287 if (level % 2 == 0) { | |
1288 /* even level */ | |
1289 if (prop == BIDI_PROP_R) { | |
1290 set_state(STATE_LEVEL, level + 1, | |
1291 &(buf[bufoff])); | |
1292 } else if (prop == BIDI_PROP_AN || | |
1293 prop == BIDI_PROP_EN) { | |
1294 set_state(STATE_LEVEL, level + 2, | |
1295 &(buf[bufoff])); | |
1296 } | |
1297 } else { | |
1298 /* odd level */ | |
1299 if (prop == BIDI_PROP_L || prop == BIDI_PROP_EN … | |
1300 prop == BIDI_PROP_AN) { | |
1301 set_state(STATE_LEVEL, level + 1, | |
1302 &(buf[bufoff])); | |
1303 } | |
1304 } | |
1305 } | |
1306 | |
1307 /* L1 (rules 1-3) */ | |
1308 runsince = SIZE_MAX; | |
1309 for (bufoff = 0; bufoff < buflen; bufoff++) { | |
1310 level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]… | |
1311 prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP, | |
1312 buf[bufoff]); | |
1313 | |
1314 if (level == -1) { | |
1315 /* ignored character */ | |
1316 continue; | |
1317 } | |
1318 | |
1319 /* rules 1 and 2 */ | |
1320 if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) { | |
1321 set_state(STATE_LEVEL, paragraph_level, &(buf[bu… | |
1322 } | |
1323 | |
1324 /* rule 3 */ | |
1325 if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI || | |
1326 prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || | |
1327 prop == BIDI_PROP_PDI) { | |
1328 if (runsince == SIZE_MAX) { | |
1329 /* a new run has begun */ | |
1330 runsince = bufoff; | |
1331 } | |
1332 } else if ((prop == BIDI_PROP_S || prop == BIDI_PROP_B) … | |
1333 runsince != SIZE_MAX) { | |
1334 /* | |
1335 * we hit a segment or paragraph separator in a | |
1336 * sequence, reset sequence-levels | |
1337 */ | |
1338 for (i = runsince; i < bufoff; i++) { | |
1339 if (get_state(STATE_LEVEL, buf[i]) != -1… | |
1340 set_state(STATE_LEVEL, paragraph… | |
1341 &(buf[i])); | |
1342 } | |
1343 } | |
1344 runsince = SIZE_MAX; | |
1345 } else { | |
1346 /* sequence ended */ | |
1347 runsince = SIZE_MAX; | |
1348 } | |
1349 } | |
1350 if (runsince != SIZE_MAX) { | |
1351 /* | |
1352 * this is the end of the paragraph and we | |
1353 * are in a run | |
1354 */ | |
1355 for (i = runsince; i < buflen; i++) { | |
1356 if (get_state(STATE_LEVEL, buf[i]) != -1) { | |
1357 set_state(STATE_LEVEL, paragraph_level, | |
1358 &(buf[i])); | |
1359 } | |
1360 } | |
1361 runsince = SIZE_MAX; | |
1362 } | |
1363 } | |
1364 | |
1365 static inline uint_least8_t | |
1366 get_bidi_bracket_off(uint_least32_t cp) | |
1367 { | |
1368 if (likely(cp <= 0x10FFFF)) { | |
1369 return (uint_least8_t)((bidi_minor[bidi_major[cp >> 8] + | |
1370 (cp & 0xff)]) >> | |
1371 5); | |
1372 } else { | |
1373 return 0; | |
1374 } | |
1375 } | |
1376 | |
1377 static size_t | |
1378 preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_direction ov… | |
1379 uint_least32_t *buf, size_t buflen, | |
1380 enum grapheme_bidirectional_direction *resolved) | |
1381 { | |
1382 HERODOTUS_READER tmp; | |
1383 size_t bufoff, bufsize, paragraph_len; | |
1384 uint_least32_t cp; | |
1385 uint_least8_t paragraph_level; | |
1386 | |
1387 /* determine length and level of the paragraph */ | |
1388 herodotus_reader_copy(r, &tmp); | |
1389 for (; herodotus_read_codepoint(&tmp, true, &cp) == | |
1390 HERODOTUS_STATUS_SUCCESS;) { | |
1391 /* break on paragraph separator */ | |
1392 if (get_bidi_property(cp) == BIDI_PROP_B) { | |
1393 break; | |
1394 } | |
1395 } | |
1396 paragraph_len = herodotus_reader_number_read(&tmp); | |
1397 paragraph_level = get_paragraph_level(override, r); | |
1398 | |
1399 if (resolved != NULL) { | |
1400 /* store resolved paragraph level in output variable */ | |
1401 /* TODO use enum-type */ | |
1402 *resolved = (paragraph_level == 0) ? | |
1403 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR… | |
1404 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL; | |
1405 } | |
1406 | |
1407 if (buf == NULL) { | |
1408 /* see below for return value reasoning */ | |
1409 return paragraph_len; | |
1410 } | |
1411 | |
1412 /* | |
1413 * the first step is to determine the bidirectional properties | |
1414 * and store them in the buffer | |
1415 */ | |
1416 for (bufoff = 0; | |
1417 bufoff < paragraph_len && | |
1418 herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_… | |
1419 bufoff++) { | |
1420 if (bufoff < buflen) { | |
1421 /* | |
1422 * actually only do something when we have | |
1423 * space in the level-buffer. We continue | |
1424 * the iteration to be able to give a good | |
1425 * return value | |
1426 */ | |
1427 set_state(STATE_PROP, | |
1428 (uint_least8_t)get_bidi_property(cp), | |
1429 &(buf[bufoff])); | |
1430 set_state(STATE_BRACKET_OFF, get_bidi_bracket_of… | |
1431 &(buf[bufoff])); | |
1432 set_state(STATE_LEVEL, 0, &(buf[bufoff])); | |
1433 set_state(STATE_PARAGRAPH_LEVEL, 0, &(buf[bufoff… | |
1434 set_state(STATE_VISITED, 0, &(buf[bufoff])); | |
1435 set_state(STATE_PRESERVED_PROP, | |
1436 (uint_least8_t)get_bidi_property(cp), | |
1437 &(buf[bufoff])); | |
1438 } | |
1439 } | |
1440 bufsize = herodotus_reader_number_read(r); | |
1441 | |
1442 for (bufoff = 0; bufoff < bufsize; bufoff++) { | |
1443 if (get_state(STATE_PROP, buf[bufoff]) != BIDI_PROP_B && | |
1444 bufoff != bufsize - 1) { | |
1445 continue; | |
1446 } | |
1447 | |
1448 /* | |
1449 * we either encountered a paragraph terminator or this | |
1450 * is the last character in the string. | |
1451 * Call the paragraph handler on the paragraph, including | |
1452 * the terminating character or last character of the | |
1453 * string respectively | |
1454 */ | |
1455 preprocess_paragraph(paragraph_level, buf, bufoff + 1); | |
1456 break; | |
1457 } | |
1458 | |
1459 /* | |
1460 * we return the number of total bytes read, as the function | |
1461 * should indicate if the given level-buffer is too small | |
1462 */ | |
1463 return herodotus_reader_number_read(r); | |
1464 } | |
1465 | |
1466 size_t | |
1467 grapheme_bidirectional_preprocess_paragraph( | |
1468 const uint_least32_t *src, size_t srclen, | |
1469 enum grapheme_bidirectional_direction override, uint_least32_t *… | |
1470 size_t destlen, enum grapheme_bidirectional_direction *resolved) | |
1471 { | |
1472 HERODOTUS_READER r; | |
1473 | |
1474 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
1475 | |
1476 return preprocess(&r, override, dest, destlen, resolved); | |
1477 } | |
1478 | |
1479 static inline size_t | |
1480 get_line_embedding_levels(const uint_least32_t *linedata, size_t linelen, | |
1481 int_least8_t (*get_level)(const void *, size_t… | |
1482 void (*set_level)(void *, size_t, int_least8_t… | |
1483 void *lev, size_t levsize, bool skipignored) | |
1484 { | |
1485 enum bidi_property prop; | |
1486 size_t i, levlen, runsince; | |
1487 int_least8_t level, runlevel; | |
1488 | |
1489 /* rule L1.4 */ | |
1490 runsince = SIZE_MAX; | |
1491 runlevel = -1; | |
1492 for (i = 0, levlen = 0; i < linelen; i++) { | |
1493 level = (int_least8_t)get_state(STATE_LEVEL, linedata[i]… | |
1494 prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP, | |
1495 linedata[i]); | |
1496 | |
1497 /* write level into level array if we still have space */ | |
1498 if (level != -1 || skipignored == false) { | |
1499 if (levlen <= levsize) { | |
1500 set_level(lev, levlen, level); | |
1501 } | |
1502 levlen++; | |
1503 } | |
1504 | |
1505 if (level == -1) { | |
1506 /* ignored character */ | |
1507 continue; | |
1508 } | |
1509 | |
1510 if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI || | |
1511 prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || | |
1512 prop == BIDI_PROP_PDI) { | |
1513 if (runsince == SIZE_MAX) { | |
1514 /* a new run has begun */ | |
1515 runsince = levlen - 1; /* levlen > 0 */ | |
1516 runlevel = (int_least8_t)get_state( | |
1517 STATE_PARAGRAPH_LEVEL, linedata[… | |
1518 } | |
1519 } else { | |
1520 /* sequence ended */ | |
1521 runsince = SIZE_MAX; | |
1522 runlevel = -1; | |
1523 } | |
1524 } | |
1525 if (runsince != SIZE_MAX) { | |
1526 /* | |
1527 * we hit the end of the line but were in a run; | |
1528 * reset the line levels to the paragraph level | |
1529 */ | |
1530 for (i = runsince; i < MIN(linelen, levlen); i++) { | |
1531 if (get_level(lev, i) != -1) { | |
1532 set_level(lev, i, runlevel); | |
1533 } | |
1534 } | |
1535 } | |
1536 | |
1537 return levlen; | |
1538 } | |
1539 | |
1540 static inline int_least8_t | |
1541 get_level_int8(const void *lev, size_t off) | |
1542 { | |
1543 return ((const int_least8_t *)lev)[off]; | |
1544 } | |
1545 | |
1546 static inline void | |
1547 set_level_int8(void *lev, size_t off, int_least8_t value) | |
1548 { | |
1549 ((int_least8_t *)lev)[off] = value; | |
1550 } | |
1551 | |
1552 size_t | |
1553 grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *l… | |
1554 size_t linelen, | |
1555 int_least8_t *lev, | |
1556 size_t levlen) | |
1557 { | |
1558 return get_line_embedding_levels(linedata, linelen, get_level_in… | |
1559 set_level_int8, lev, levlen, fa… | |
1560 } | |
1561 | |
1562 static inline int_least8_t | |
1563 get_level_uint32(const void *lev, size_t off) | |
1564 { | |
1565 return (int_least8_t)((((const uint_least32_t *)lev)[off] & | |
1566 UINT32_C(0x1FE00000)) >> | |
1567 21) - | |
1568 1; | |
1569 } | |
1570 | |
1571 static inline void | |
1572 set_level_uint32(void *lev, size_t off, int_least8_t value) | |
1573 { | |
1574 ((uint_least32_t *)lev)[off] ^= | |
1575 ((uint_least32_t *)lev)[off] & UINT32_C(0x1FE00000); | |
1576 ((uint_least32_t *)lev)[off] |= ((uint_least32_t)(value + 1)) <<… | |
1577 } | |
1578 | |
1579 static inline int_least16_t | |
1580 get_mirror_offset(uint_least32_t cp) | |
1581 { | |
1582 if (cp <= UINT32_C(0x10FFFF)) { | |
1583 return mirror_minor[mirror_major[cp >> 8] + (cp & 0xFF)]; | |
1584 } else { | |
1585 return 0; | |
1586 } | |
1587 } | |
1588 | |
1589 size_t | |
1590 grapheme_bidirectional_reorder_line(const uint_least32_t *line, | |
1591 const uint_least32_t *linedata, | |
1592 size_t linelen, uint_least32_t *outp… | |
1593 size_t outputsize) | |
1594 { | |
1595 size_t i, outputlen, first, last, j, k, l /*, laststart*/; | |
1596 int_least8_t level, min_odd_level = MAX_DEPTH + 2, max_level = 0; | |
1597 uint_least32_t tmp; | |
1598 | |
1599 /* write output characters (and apply possible mirroring) */ | |
1600 for (i = 0, outputlen = 0; i < linelen; i++) { | |
1601 if (get_state(STATE_LEVEL, linedata[i]) != -1) { | |
1602 if (outputlen < outputsize) { | |
1603 output[outputlen] = | |
1604 (uint_least32_t)((int_least32_t) | |
1605 line[i]… | |
1606 get_mirror_offs… | |
1607 line[i]… | |
1608 } | |
1609 outputlen++; | |
1610 } | |
1611 } | |
1612 if (outputlen >= outputsize) { | |
1613 /* clear output buffer */ | |
1614 for (i = 0; i < outputsize; i++) { | |
1615 output[i] = GRAPHEME_INVALID_CODEPOINT; | |
1616 } | |
1617 | |
1618 /* return required size */ | |
1619 return outputlen; | |
1620 } | |
1621 | |
1622 /* | |
1623 * write line embedding levels as metadata and codepoints into t… | |
1624 * output | |
1625 */ | |
1626 get_line_embedding_levels(linedata, linelen, get_level_uint32, | |
1627 set_level_uint32, output, outputsize, … | |
1628 | |
1629 /* determine level range */ | |
1630 for (i = 0; i < outputlen; i++) { | |
1631 level = get_level_uint32(output, i); | |
1632 | |
1633 if (level == -1) { | |
1634 /* ignored character */ | |
1635 continue; | |
1636 } | |
1637 | |
1638 if (level % 2 == 1 && level < min_odd_level) { | |
1639 min_odd_level = level; | |
1640 } | |
1641 if (level > max_level) { | |
1642 max_level = level; | |
1643 } | |
1644 } | |
1645 | |
1646 for (level = max_level; level >= min_odd_level /* > 0 */; level-… | |
1647 for (i = 0; i < outputlen; i++) { | |
1648 if (get_level_uint32(output, i) >= level) { | |
1649 /* | |
1650 * the current character has the desired… | |
1651 */ | |
1652 first = last = i; | |
1653 | |
1654 /* find the end of the level-sequence */ | |
1655 for (i++; i < outputlen; i++) { | |
1656 if (get_level_uint32(output, i) … | |
1657 level) { | |
1658 /* the sequence continue… | |
1659 last = i; | |
1660 } else { | |
1661 break; | |
1662 } | |
1663 } | |
1664 | |
1665 /* invert the sequence first..last respe… | |
1666 * grapheme clusters | |
1667 * | |
1668 * The standard only speaks of combining… | |
1669 * inversion, but we should in the perfe… | |
1670 * respect _all_ grapheme clusters, whic… | |
1671 * here! | |
1672 */ | |
1673 | |
1674 /* mark grapheme cluster breaks */ | |
1675 for (j = first; j <= last; | |
1676 j += grapheme_next_character_break( | |
1677 line + j, outputlen - j)) { | |
1678 /* | |
1679 * we use a special trick here: … | |
1680 * first 21 bits of the state ar… | |
1681 * with the codepoint, the next … | |
1682 * are used for the level, so we… | |
1683 * the 30th bit to mark the grap… | |
1684 * cluster breaks. This allows u… | |
1685 * reinvert the grapheme cluster… | |
1686 * the proper direction later. | |
1687 */ | |
1688 output[j] |= UINT32_C(1) << 29; | |
1689 } | |
1690 | |
1691 /* global inversion */ | |
1692 for (k = first, l = last; k < l; k++, l-… | |
1693 /* swap */ | |
1694 tmp = output[k]; | |
1695 output[k] = output[l]; | |
1696 output[l] = tmp; | |
1697 } | |
1698 | |
1699 /* grapheme cluster reinversion */ | |
1700 #if 0 | |
1701 for (j = first, laststart = first; j <= … | |
1702 j++) { | |
1703 if (output[j] & (UINT32_C(1) << … | |
1704 /* we hit a mark! given … | |
1705 * grapheme cluster is i… | |
1706 * this means that the c… | |
1707 * ended and we now rein… | |
1708 * again | |
1709 */ | |
1710 for (k = laststart, l = … | |
1711 k < l; k++, l--) { | |
1712 /* swap */ | |
1713 tmp = output[k]; | |
1714 output[k] = outp… | |
1715 output[l] = tmp; | |
1716 } | |
1717 laststart = j + 1; | |
1718 } | |
1719 } | |
1720 #endif | |
1721 | |
1722 /* unmark grapheme cluster breaks */ | |
1723 for (j = first; j <= last; j++) { | |
1724 output[j] ^= output[j] & | |
1725 UINT32_C(0x20000000… | |
1726 } | |
1727 } | |
1728 } | |
1729 } | |
1730 | |
1731 /* remove embedding level metadata */ | |
1732 for (i = 0; i < outputlen; i++) { | |
1733 output[i] ^= output[i] & UINT32_C(0x1FE00000); | |
1734 } | |
1735 | |
1736 return outputlen; | |
1737 } |