Implement bidirectional bracket support - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit bbbc72cba69445535dd035dfe1ee10d473655629 | |
parent b9e1d4bbd4ce6a539999560c1cc863b645a080cd | |
Author: Laslo Hunhold <[email protected]> | |
Date: Tue, 29 Nov 2022 23:23:53 +0100 | |
Implement bidirectional bracket support | |
The single rule N0 in the Unicode Bidirectional Algorithm may not | |
sound like much, but it packs quite a punch and required some deep | |
work. | |
It wasn't exactly made simpler by the fact that the document is very | |
convoluted and not easy to follow. However, it helps to have experience | |
from the other algorithms and the automatic tests allow very broad | |
confirmation of proper function. | |
In particular, the following changes needed to be made: The generator | |
had to be modified to | |
- Implement a decompositon to match canonically equivalent | |
brackets. This requires us to have UnicodeData.txt present, | |
but what matters is that the end result is fast and small. | |
- The LUT-printing automatically detects type, because it's just | |
too fragile otherwise. | |
The implementation of the algorithm itself had the following changes: | |
- The last strong type property of an isolate runner has been | |
refactored to be stateless. Otherwise, you can end up with | |
subtle bugs where strong types are added beforehand, yielding | |
a TOCTOU-problem. | |
- The bracket parsing makes use of a novel FIFO structure that | |
combines the best of both worlds between a stack and naive | |
implementation. | |
As an end result, we now pass all ~900k bidi tests from the Unicode | |
standard. | |
Signed-off-by: Laslo Hunhold <[email protected]> | |
Diffstat: | |
M Makefile | 2 +- | |
M gen/bidirectional-test.c | 4 ++-- | |
M gen/bidirectional.c | 209 ++++++++++++++++++++++++++++-… | |
M gen/case.c | 18 +++++++++--------- | |
M gen/util.c | 44 ++++++++++++++++++++++++++---… | |
M gen/util.h | 4 ++-- | |
M src/bidirectional.c | 410 +++++++++++++++++++++++++++++… | |
M test/bidirectional.c | 3 --- | |
8 files changed, 628 insertions(+), 66 deletions(-) | |
--- | |
diff --git a/Makefile b/Makefile | |
@@ -233,7 +233,7 @@ test/utf8-encode: test/utf8-encode.o test/util.o libgraphem… | |
test/utf8-decode: test/utf8-decode.o test/util.o libgrapheme.a | |
test/word: test/word.o test/util.o libgrapheme.a | |
-gen/bidirectional.h: data/BidiBrackets.txt data/BidiMirroring.txt data/Derived… | |
+gen/bidirectional.h: data/BidiBrackets.txt data/BidiMirroring.txt data/Derived… | |
gen/bidirectional-test.h: data/BidiCharacterTest.txt data/BidiTest.txt gen/bid… | |
gen/case.h: data/DerivedCoreProperties.txt data/UnicodeData.txt data/SpecialCa… | |
gen/character.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/charac… | |
diff --git a/gen/bidirectional-test.c b/gen/bidirectional-test.c | |
@@ -471,8 +471,8 @@ main(int argc, char *argv[]) | |
(void)argc; | |
parse_file_with_callback("data/BidiTest.txt", test_callback, NULL); | |
- /*parse_file_with_callback("data/BidiCharacterTest.txt", | |
- character_test_callback, NULL);*/ | |
+ parse_file_with_callback("data/BidiCharacterTest.txt", | |
+ character_test_callback, NULL); | |
bidirectional_test_list_print(test, testlen, "bidirectional_test", | |
argv[0]); | |
diff --git a/gen/bidirectional.c b/gen/bidirectional.c | |
@@ -1,5 +1,6 @@ | |
/* See LICENSE file for copyright and license details. */ | |
#include <errno.h> | |
+#include <inttypes.h> | |
#include <stddef.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
@@ -10,6 +11,9 @@ | |
#define FILE_BIDI_BRACKETS "data/BidiBrackets.txt" | |
#define FILE_BIDI_CLASS "data/DerivedBidiClass.txt" | |
#define FILE_BIDI_MIRRORING "data/BidiMirroring.txt" | |
+#define FILE_UNICODE_DATA "data/UnicodeData.txt" | |
+ | |
+#define NUM_BRACKET_ALIASES 20 | |
static const struct property_spec bidi_property[] = { | |
{ | |
@@ -130,36 +134,189 @@ static const struct property_spec bidi_property[] = { | |
}, | |
}; | |
+struct decomposition_payload { | |
+ uint_least32_t cp; | |
+ uint_least32_t decomposition; | |
+}; | |
+ | |
+static int | |
+decomposition_callback(const char *file, char **field, size_t nfields, | |
+ char *comment, void *payload) | |
+{ | |
+ char *p; | |
+ struct decomposition_payload *decomp = | |
+ (struct decomposition_payload *)payload; | |
+ uint_least32_t cp; | |
+ | |
+ (void)file; | |
+ (void)comment; | |
+ | |
+ if (nfields < 6) { | |
+ /* we have fewer than 6 fields, discard the line */ | |
+ return 0; | |
+ } | |
+ | |
+ hextocp(field[0], strlen(field[0]), &cp); | |
+ | |
+ if (decomp->cp == cp) { | |
+ /* we hit the line that contains our decomposition target */ | |
+ if (strlen(field[5]) > 0) { | |
+ p = field[5]; | |
+ if (*p == '<') { | |
+ /* | |
+ * the decomposition contains some metadata | |
+ * <...> we skip | |
+ */ | |
+ for (; *p != '\0'; p++) { | |
+ if (*p == '>') { | |
+ p++; | |
+ while (*p == ' ') { | |
+ p++; | |
+ } | |
+ break; | |
+ } | |
+ } | |
+ } | |
+ hextocp(p, strlen(p), &(decomp->decomposition)); | |
+ } else { | |
+ decomp->decomposition = decomp->cp; | |
+ } | |
+ } | |
+ | |
+ return 0; | |
+} | |
+ | |
static struct { | |
- uint_least32_t cp_base; | |
- uint_least32_t cp_pair; | |
+ uint_least32_t base[NUM_BRACKET_ALIASES]; | |
+ size_t baselen; | |
+ uint_least32_t pair[NUM_BRACKET_ALIASES]; | |
+ size_t pairlen; | |
+ uint_least8_t class; | |
char type; | |
} *b = NULL; | |
static size_t blen; | |
+static uint_least8_t bracket_class_count = 1; | |
static int | |
bracket_callback(const char *file, char **field, size_t nfields, char *comment, | |
void *payload) | |
{ | |
+ size_t i, j; | |
+ struct decomposition_payload decomp_base, decomp_pair; | |
+ uint_least32_t cp_base, cp_pair; | |
+ | |
(void)file; | |
(void)comment; | |
(void)payload; | |
if (nfields < 3) { | |
- /* we have less than 3 fields, discard the line */ | |
+ /* we have fewer than 3 fields, discard the line */ | |
return 0; | |
} | |
- /* extend bracket pair array */ | |
+ /* parse field data */ | |
+ hextocp(field[0], strlen(field[0]), &cp_base); | |
+ hextocp(field[1], strlen(field[1]), &cp_pair); | |
+ | |
+ /* determine decomposition of the base and pair codepoints */ | |
+ decomp_base.cp = cp_base; | |
+ decomp_pair.cp = cp_pair; | |
+ parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback, | |
+ &decomp_base); | |
+ parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback, | |
+ &decomp_pair); | |
+ | |
+ /* | |
+ * check if we already have the canonical form in the bracket array, | |
+ * per convention the canonical form is the first element of the alias | |
+ * array | |
+ */ | |
+ for (i = 0; i < blen; i++) { | |
+ if (decomp_base.decomposition == b[i].base[0]) { | |
+ /* we have a match, check type */ | |
+ if (strlen(field[2]) != 1 || | |
+ (field[2][0] != 'o' && field[2][0] != 'c')) { | |
+ /* malformed line */ | |
+ return 1; | |
+ } else if (b[i].type != field[2][0]) { | |
+ /* mismatching types */ | |
+ return 1; | |
+ } | |
+ | |
+ /* | |
+ * add our base alias to the base array unless it isn't | |
+ * already in it | |
+ */ | |
+ for (j = 0; j < b[i].baselen; j++) { | |
+ if (cp_base == b[i].base[j]) { | |
+ /* already in array, do nothing */ | |
+ break; | |
+ } | |
+ } | |
+ if (j == b[i].baselen) { | |
+ /* | |
+ * the base alias is not already in the array, | |
+ * add it | |
+ */ | |
+ if (b[i].baselen == NUM_BRACKET_ALIASES) { | |
+ fprintf(stderr, "too many aliases\n"); | |
+ return 1; | |
+ } | |
+ b[i].baselen++; | |
+ b[i].base[b[i].baselen - 1] = cp_base; | |
+ } | |
+ | |
+ /* | |
+ * also add our pair alias to the pair array unless | |
+ * it isn't already in it | |
+ */ | |
+ for (j = 0; j < b[i].pairlen; j++) { | |
+ if (cp_pair == b[i].pair[j]) { | |
+ /* already in array, do nothing */ | |
+ break; | |
+ } | |
+ } | |
+ if (j == b[i].pairlen) { | |
+ /* | |
+ * the pair alias is not already in the array, | |
+ * add it | |
+ */ | |
+ if (b[i].pairlen == NUM_BRACKET_ALIASES) { | |
+ fprintf(stderr, "too many aliases\n"); | |
+ return 1; | |
+ } | |
+ b[i].pairlen++; | |
+ b[i].pair[b[i].pairlen - 1] = cp_pair; | |
+ } | |
+ | |
+ return 0; | |
+ } | |
+ } | |
+ | |
+ /* extend bracket pair array, as this is a new bracket type */ | |
if (!(b = realloc(b, (++blen) * sizeof(*b)))) { | |
fprintf(stderr, "realloc: %s\n", strerror(errno)); | |
exit(1); | |
} | |
- /* parse field data */ | |
- hextocp(field[0], strlen(field[0]), &(b[blen - 1].cp_base)); | |
- hextocp(field[1], strlen(field[1]), &(b[blen - 1].cp_pair)); | |
+ /* fill field data by adding the canonical form first */ | |
+ b[blen - 1].base[0] = decomp_base.decomposition; | |
+ b[blen - 1].baselen = 1; | |
+ b[blen - 1].pair[0] = decomp_pair.decomposition; | |
+ b[blen - 1].pairlen = 1; | |
+ | |
+ /* add alias if it differs from the canonical form */ | |
+ if (cp_base != decomp_base.decomposition) { | |
+ b[blen - 1].base[1] = cp_base; | |
+ b[blen - 1].baselen = 2; | |
+ } | |
+ if (cp_pair != decomp_pair.decomposition) { | |
+ b[blen - 1].pair[1] = cp_pair; | |
+ b[blen - 1].pairlen = 2; | |
+ } | |
+ | |
+ /* add bracket type */ | |
if (strlen(field[2]) != 1 || | |
(field[2][0] != 'o' && field[2][0] != 'c')) { | |
/* malformed line */ | |
@@ -168,13 +325,32 @@ bracket_callback(const char *file, char **field, size_t n… | |
b[blen - 1].type = field[2][0]; | |
} | |
+ /* | |
+ * determine bracket class by iterating over the bracket-array | |
+ * and seeing if our current canonical cp already has a matching pair. | |
+ * We only need to check the first entry in each bracket alias | |
+ * list, as this is, per convention, the canonical form. | |
+ * If not, add a new class. | |
+ */ | |
+ for (i = 0; i + 1 < blen; i++) { | |
+ if (b[i].pair[0] == b[blen - 1].base[0]) { | |
+ /* matched class */ | |
+ b[blen - 1].class = b[i].class; | |
+ break; | |
+ } | |
+ } | |
+ if (i + 1 == blen) { | |
+ /* no match, assign a new class */ | |
+ b[blen - 1].class = bracket_class_count++; | |
+ } | |
+ | |
return 0; | |
} | |
static void | |
post_process(struct properties *prop) | |
{ | |
- size_t i; | |
+ size_t i, j; | |
for (i = 0; i < blen; i++) { | |
/* | |
@@ -185,7 +361,9 @@ post_process(struct properties *prop) | |
* have offset 0, which we prepared to contain a stub | |
* for a character that is not a bracket. | |
*/ | |
- prop[b[i].cp_base].property |= (i << 5); | |
+ for (j = 0; j < b[i].baselen; j++) { | |
+ prop[b[i].base[j]].property |= (i << 5); | |
+ } | |
} | |
} | |
@@ -236,7 +414,7 @@ main(int argc, char *argv[]) | |
* all-zeros, as we use the implicit 0-offset for all those | |
* codepoints that are not a bracket | |
*/ | |
- if (!(b = calloc(1, sizeof(*b)))) { | |
+ if (!(b = calloc((blen = 1), sizeof(*b)))) { | |
fprintf(stderr, "calloc: %s\n", strerror(errno)); | |
exit(1); | |
} | |
@@ -248,16 +426,15 @@ main(int argc, char *argv[]) | |
printf("\nenum bracket_type {\n\tBIDI_BRACKET_NONE,\n\t" | |
"BIDI_BRACKET_OPEN,\n\tBIDI_BRACKET_CLOSE,\n};\n\n" | |
- "struct bracket {\n\tenum bracket_type type;\n" | |
- "\tuint_least32_t pair;\n};\n\n" | |
- "static const struct bracket bidi_bracket[] = {\n"); | |
+ "static const struct bracket {\n\tenum bracket_type type;\n" | |
+ "\tuint_least8_t class;\n} bidi_bracket[] = {\n"); | |
for (i = 0; i < blen; i++) { | |
- printf("\t{\n\t\t.type = %s,\n\t\t.pair = " | |
- "UINT32_C(0x%06X),\n\t},\n", | |
+ printf("\t{\n\t\t.type = %s,\n\t\t.class = " | |
+ "%" PRIuLEAST8 ",\n\t},\n", | |
(b[i].type == 'o') ? "BIDI_BRACKET_OPEN" : | |
(b[i].type == 'c') ? "BIDI_BRACKET_CLOSE" : | |
"BIDI_BRACKET_NONE", | |
- b[i].cp_pair); | |
+ b[i].class); | |
} | |
printf("};\n"); | |
diff --git a/gen/case.c b/gen/case.c | |
@@ -218,21 +218,21 @@ main(int argc, char *argv[]) | |
properties_print_lookup_table("upper_major", mm_upper.major, 0x1100); | |
printf("\n"); | |
- properties_print_derived_lookup_table("upper_minor", "int_least32_t", | |
- mm_upper.minor, mm_upper.minorle… | |
- get_value, comp_upper.data); | |
+ properties_print_derived_lookup_table("upper_minor", mm_upper.minor, | |
+ mm_upper.minorlen, get_value, | |
+ comp_upper.data); | |
printf("\n"); | |
properties_print_lookup_table("lower_major", mm_lower.major, 0x1100); | |
printf("\n"); | |
- properties_print_derived_lookup_table("lower_minor", "int_least32_t", | |
- mm_lower.minor, mm_lower.minorle… | |
- get_value, comp_lower.data); | |
+ properties_print_derived_lookup_table("lower_minor", mm_lower.minor, | |
+ mm_lower.minorlen, get_value, | |
+ comp_lower.data); | |
printf("\n"); | |
properties_print_lookup_table("title_major", mm_title.major, 0x1100); | |
printf("\n"); | |
- properties_print_derived_lookup_table("title_minor", "int_least32_t", | |
- mm_title.minor, mm_title.minorle… | |
- get_value, comp_title.data); | |
+ properties_print_derived_lookup_table("title_minor", mm_title.minor, | |
+ mm_title.minorlen, get_value, | |
+ comp_title.data); | |
printf("\n"); | |
printf("static const struct special_case upper_special[] = {\n"); | |
diff --git a/gen/util.c b/gen/util.c | |
@@ -405,9 +405,10 @@ properties_get_major_minor(const struct properties_compres… | |
} | |
void | |
-properties_print_lookup_table(char *name, size_t *data, size_t datalen) | |
+properties_print_lookup_table(const char *name, const size_t *data, | |
+ size_t datalen) | |
{ | |
- char *type; | |
+ const char *type; | |
size_t i, maxval; | |
for (i = 0, maxval = 0; i < datalen; i++) { | |
@@ -437,11 +438,41 @@ properties_print_lookup_table(char *name, size_t *data, s… | |
void | |
properties_print_derived_lookup_table( | |
- char *name, char *type, size_t *offset, size_t offsetlen, | |
+ char *name, size_t *offset, size_t offsetlen, | |
int_least64_t (*get_value)(const struct properties *, size_t), | |
const void *payload) | |
{ | |
+ const char *type; | |
size_t i; | |
+ int_least64_t minval, maxval; | |
+ | |
+ for (i = 0, minval = INT_LEAST64_MAX, maxval = INT_LEAST64_MIN; | |
+ i < offsetlen; i++) { | |
+ if (get_value(payload, offset[i]) > maxval) { | |
+ maxval = get_value(payload, offset[i]); | |
+ } else if (get_value(payload, offset[i]) < minval) { | |
+ minval = get_value(payload, offset[i]); | |
+ } | |
+ } | |
+ | |
+ if (minval < 0) { | |
+ /* we need a signed type */ | |
+ type = (minval >= INT_LEAST8_MIN && maxval <= INT_LEAST8_MAX) ? | |
+ "int_least8_t" : | |
+ (minval >= INT_LEAST16_MIN && | |
+ maxval <= INT_LEAST16_MAX) ? | |
+ "int_least16_t" : | |
+ (minval >= INT_LEAST32_MIN && | |
+ maxval <= INT_LEAST32_MAX) ? | |
+ "int_least32_t" : | |
+ "int_least64_t"; | |
+ } else { | |
+ /* we are fine with an unsigned type */ | |
+ type = (maxval <= UINT_LEAST8_MAX) ? "uint_least8_t" : | |
+ (maxval <= UINT_LEAST16_MAX) ? "uint_least16_t" : | |
+ (maxval <= UINT_LEAST32_MAX) ? "uint_least32_t" : | |
+ "uint_least64_t"; | |
+ } | |
printf("static const %s %s[] = {\n\t", type, name); | |
for (i = 0; i < offsetlen; i++) { | |
@@ -499,7 +530,7 @@ set_value_bp(struct properties_payload *payload, uint_least… | |
static int_least64_t | |
get_value_bp(const struct properties *prop, size_t offset) | |
{ | |
- return (uint_least8_t)prop[offset].property; | |
+ return prop[offset].property; | |
} | |
void | |
@@ -606,9 +637,8 @@ properties_generate_break_property( | |
properties_print_enum(spec, speclen, buf1, buf2); | |
properties_print_lookup_table(buf3, mm.major, 0x1100); | |
printf("\n"); | |
- properties_print_derived_lookup_table(buf4, "uint_least8_t", mm.minor, | |
- mm.minorlen, get_value_bp, | |
- comp.data); | |
+ properties_print_derived_lookup_table(buf4, mm.minor, mm.minorlen, | |
+ get_value_bp, comp.data); | |
/* free data */ | |
free(prop); | |
diff --git a/gen/util.h b/gen/util.h | |
@@ -43,9 +43,9 @@ void properties_compress(const struct properties *, | |
struct properties_compressed *comp); | |
double properties_get_major_minor(const struct properties_compressed *, | |
struct properties_major_minor *); | |
-void properties_print_lookup_table(char *, size_t *, size_t); | |
+void properties_print_lookup_table(const char *, const size_t *, size_t); | |
void properties_print_derived_lookup_table( | |
- char *, char *, size_t *, size_t, | |
+ char *, size_t *, size_t, | |
int_least64_t (*get_value)(const struct properties *, size_t), | |
const void *); | |
diff --git a/src/bidirectional.c b/src/bidirectional.c | |
@@ -74,6 +74,7 @@ set_state(enum state_type t, int_least16_t value, uint_least3… | |
struct isolate_runner { | |
uint_least32_t *buf; | |
size_t buflen; | |
+ size_t start; | |
struct { | |
size_t off; | |
@@ -83,7 +84,6 @@ struct isolate_runner { | |
uint_least8_t paragraph_level; | |
int_least8_t isolating_run_level; | |
- enum bidi_property last_strong_type; | |
}; | |
static inline enum bidi_property | |
@@ -110,14 +110,28 @@ ir_get_next_prop(const struct isolate_runner *ir) | |
ir->buf[ir->next.off]); | |
} | |
+static inline enum bidi_property | |
+ir_get_current_preserved_prop(const struct isolate_runner *ir) | |
+{ | |
+ return (uint_least8_t)get_state(STATE_PRESERVED_PROP, | |
+ ir->buf[ir->cur.off]); | |
+} | |
+ | |
static inline int_least8_t | |
ir_get_current_level(const struct isolate_runner *ir) | |
{ | |
return (int_least8_t)get_state(STATE_LEVEL, ir->buf[ir->cur.off]); | |
} | |
+static inline const struct bracket * | |
+ir_get_current_bracket_prop(const struct isolate_runner *ir) | |
+{ | |
+ return bidi_bracket + | |
+ (int_least8_t)get_state(STATE_BRACKET_OFF, ir->buf[ir->cur.off]… | |
+} | |
+ | |
static void | |
-ir_set_current_prop(struct isolate_runner *ir, enum bidi_property prop) | |
+ir_set_current_prop(const struct isolate_runner *ir, enum bidi_property prop) | |
{ | |
set_state(STATE_PROP, (int_least16_t)prop, &(ir->buf[ir->cur.off])); | |
} | |
@@ -133,6 +147,7 @@ ir_init(uint_least32_t *buf, size_t buflen, size_t off, | |
ir->buf = buf; | |
ir->buflen = buflen; | |
ir->paragraph_level = paragraph_level; | |
+ ir->start = off; | |
/* advance off until we are at a non-removed character */ | |
for (; off < buflen; off++) { | |
@@ -225,18 +240,6 @@ ir_advance(struct isolate_runner *ir) | |
/* mark as visited */ | |
set_state(STATE_VISITED, 1, &(ir->buf[ir->cur.off])); | |
- /* | |
- * update last strong type, which is guaranteed to work properly | |
- * on the first advancement as the prev.off is SIZE_T and the | |
- * implied sos type can only be either R or L, which are both | |
- * strong types | |
- */ | |
- if (ir_get_previous_prop(ir) == BIDI_PROP_R || | |
- ir_get_previous_prop(ir) == BIDI_PROP_L || | |
- ir_get_previous_prop(ir) == BIDI_PROP_AL) { | |
- ir->last_strong_type = ir_get_previous_prop(ir); | |
- } | |
- | |
/* initialize next state by going to the next character in the sequence | |
*/ | |
ir->next.off = SIZE_MAX; | |
@@ -328,6 +331,355 @@ ir_advance(struct isolate_runner *ir) | |
return 0; | |
} | |
+static enum bidi_property | |
+ir_get_last_strong_prop(const struct isolate_runner *ir) | |
+{ | |
+ struct isolate_runner tmp; | |
+ enum bidi_property last_strong_prop = ir->sos, prop; | |
+ | |
+ ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false, | |
+ &tmp); | |
+ for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) { | |
+ prop = ir_get_current_prop(&tmp); | |
+ | |
+ if (prop == BIDI_PROP_R || prop == BIDI_PROP_L || | |
+ prop == BIDI_PROP_AL) { | |
+ last_strong_prop = prop; | |
+ } | |
+ } | |
+ | |
+ return last_strong_prop; | |
+} | |
+ | |
+static enum bidi_property | |
+ir_get_last_strong_or_number_prop(const struct isolate_runner *ir) | |
+{ | |
+ struct isolate_runner tmp; | |
+ enum bidi_property last_strong_or_number_prop = ir->sos, prop; | |
+ | |
+ ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false, | |
+ &tmp); | |
+ for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) { | |
+ prop = ir_get_current_prop(&tmp); | |
+ | |
+ if (prop == BIDI_PROP_R || prop == BIDI_PROP_L || | |
+ prop == BIDI_PROP_AL || prop == BIDI_PROP_EN || | |
+ prop == BIDI_PROP_AN) { | |
+ last_strong_or_number_prop = prop; | |
+ } | |
+ } | |
+ | |
+ return last_strong_or_number_prop; | |
+} | |
+ | |
+static void | |
+preprocess_bracket_pair(const struct isolate_runner *start, | |
+ const struct isolate_runner *end) | |
+{ | |
+ enum bidi_property prop, bracket_prop, last_strong_or_number_prop; | |
+ struct isolate_runner ir; | |
+ size_t strong_type_off; | |
+ | |
+ /* | |
+ * check if the bracket contains a strong type (L or R|EN|AN) | |
+ */ | |
+ for (ir = *start, strong_type_off = SIZE_MAX, | |
+ bracket_prop = NUM_BIDI_PROPS; | |
+ !ir_advance(&ir) && ir.cur.off < end->cur.off;) { | |
+ prop = ir_get_current_prop(&ir); | |
+ | |
+ if (prop == BIDI_PROP_L) { | |
+ strong_type_off = ir.cur.off; | |
+ if (ir.isolating_run_level % 2 == 0) { | |
+ /* | |
+ * set the type for both brackets to L (so they | |
+ * match the strong type they contain) | |
+ */ | |
+ bracket_prop = BIDI_PROP_L; | |
+ } | |
+ } else if (prop == BIDI_PROP_R || prop == BIDI_PROP_EN || | |
+ prop == BIDI_PROP_AN) { | |
+ strong_type_off = ir.cur.off; | |
+ if (ir.isolating_run_level % 2 != 0) { | |
+ /* | |
+ * set the type for both brackets to R (so they | |
+ * match the strong type they contain) | |
+ */ | |
+ bracket_prop = BIDI_PROP_R; | |
+ } | |
+ } | |
+ } | |
+ if (strong_type_off == SIZE_MAX) { | |
+ /* | |
+ * there are no strong types within the brackets and we just | |
+ * leave the brackets as is | |
+ */ | |
+ return; | |
+ } | |
+ | |
+ if (bracket_prop == NUM_BIDI_PROPS) { | |
+ /* | |
+ * We encountered a strong type, but it was opposite | |
+ * to the embedding direction. | |
+ * Check the previous strong type before the opening | |
+ * bracket | |
+ */ | |
+ last_strong_or_number_prop = | |
+ ir_get_last_strong_or_number_prop(start); | |
+ if (last_strong_or_number_prop == BIDI_PROP_L && | |
+ ir.isolating_run_level % 2 != 0) { | |
+ /* | |
+ * the previous strong type is also opposite | |
+ * to the embedding direction, so the context | |
+ * was established and we set the brackets | |
+ * accordingly. | |
+ */ | |
+ bracket_prop = BIDI_PROP_L; | |
+ } else if ((last_strong_or_number_prop == BIDI_PROP_R || | |
+ last_strong_or_number_prop == BIDI_PROP_EN || | |
+ last_strong_or_number_prop == BIDI_PROP_AN) && | |
+ ir.isolating_run_level % 2 == 0) { | |
+ /* | |
+ * the previous strong type is also opposite | |
+ * to the embedding direction, so the context | |
+ * was established and we set the brackets | |
+ * accordingly. | |
+ */ | |
+ bracket_prop = BIDI_PROP_R; | |
+ } else { | |
+ /* set brackets to the embedding direction */ | |
+ if (ir.isolating_run_level % 2 == 0) { | |
+ bracket_prop = BIDI_PROP_L; | |
+ } else { | |
+ bracket_prop = BIDI_PROP_R; | |
+ } | |
+ } | |
+ } | |
+ | |
+ ir_set_current_prop(start, bracket_prop); | |
+ ir_set_current_prop(end, bracket_prop); | |
+ | |
+ /* | |
+ * any sequence of NSMs after opening or closing brackets get | |
+ * the same property as the one we set on the brackets | |
+ */ | |
+ for (ir = *start; !ir_advance(&ir) && ir_get_current_preserved_prop( | |
+ &ir) == BIDI_PROP_NSM;) { | |
+ ir_set_current_prop(&ir, bracket_prop); | |
+ } | |
+ for (ir = *end; !ir_advance(&ir) && | |
+ ir_get_current_preserved_prop(&ir) == BIDI_PROP_NSM;) { | |
+ ir_set_current_prop(&ir, bracket_prop); | |
+ } | |
+} | |
+ | |
+static void | |
+preprocess_bracket_pairs(uint_least32_t *buf, size_t buflen, size_t off, | |
+ uint_least8_t paragraph_level) | |
+{ | |
+ /* | |
+ * The N0-rule deals with bracket pairs that shall be determined | |
+ * with the rule BD16. This is specified as an algorithm with a | |
+ * stack of 63 bracket openings that are used to resolve into a | |
+ * separate list of pairs, which is then to be sorted by opening | |
+ * position. Thus, even though the bracketing-depth is limited | |
+ * by 63, the algorithm, as is, requires dynamic memory | |
+ * management. | |
+ * | |
+ * A naive approach (used by Fribidi) would be to screw the | |
+ * stack-approach and simply directly determine the | |
+ * corresponding closing bracket offset for a given opening | |
+ * bracket, leading to O(n²) time complexity in the worst case | |
+ * with a lot of brackets. While many brackets are not common, | |
+ * it is still possible to find a middle ground where you obtain | |
+ * strongly linear time complexity in most common cases: | |
+ * | |
+ * Instead of a stack, we use a FIFO data structure which is | |
+ * filled with bracket openings in the order of appearance (thus | |
+ * yielding an implicit sorting!) at the top. If the | |
+ * corresponding closing bracket is encountered, it is added to | |
+ * the respective entry, making it ready to "move out" at the | |
+ * bottom (i.e. passed to the bracket processing). Due to the | |
+ * nature of the specified pair detection algorithm, which only | |
+ * cares about the bracket type and nothing else (bidi class, | |
+ * level, etc.), we can mix processing and bracket detection. | |
+ * | |
+ * Obviously, if you, for instance, have one big bracket pair at | |
+ * the bottom that has not been closed yet, it will block the | |
+ * bracket processing and the FIFO might hit its capacity limit. | |
+ * At this point, the blockage is manually resolved using the | |
+ * naive quadratic approach. | |
+ * | |
+ * To remain within the specified standard behaviour, which | |
+ * mandates that processing of brackets should stop when the | |
+ * bracketing-depth is at 63, we simply check in an "overflow" | |
+ * scenario if all 63 elements in the LIFO are unfinished, which | |
+ * corresponds with such a bracketing depth. | |
+ */ | |
+ enum bidi_property prop; | |
+ | |
+ struct { | |
+ bool complete; | |
+ size_t bracket_class; | |
+ struct isolate_runner start; | |
+ struct isolate_runner end; | |
+ } fifo[63]; | |
+ const struct bracket *bracket_prop, *tmp_bracket_prop; | |
+ struct isolate_runner ir, tmp_ir; | |
+ size_t fifo_len = 0, i, blevel, j, k; | |
+ | |
+ ir_init(buf, buflen, off, paragraph_level, false, &ir); | |
+ while (!ir_advance(&ir)) { | |
+ prop = ir_get_current_prop(&ir); | |
+ bracket_prop = ir_get_current_bracket_prop(&ir); | |
+ if (prop == BIDI_PROP_ON && | |
+ bracket_prop->type == BIDI_BRACKET_OPEN) { | |
+ if (fifo_len == LEN(fifo)) { | |
+ /* | |
+ * The FIFO is full, check first if it's | |
+ * completely blocked (i.e. no finished | |
+ * bracket pairs, triggering the standard | |
+ * that mandates to abort in such a case | |
+ */ | |
+ for (i = 0; i < fifo_len; i++) { | |
+ if (fifo[i].complete) { | |
+ break; | |
+ } | |
+ } | |
+ if (i == fifo_len) { | |
+ /* abort processing */ | |
+ return; | |
+ } | |
+ | |
+ /* | |
+ * by construction, the bottom entry | |
+ * in the FIFO is guaranteed to be | |
+ * unfinished (given we "consume" all | |
+ * finished bottom entries after each | |
+ * iteration). | |
+ * | |
+ * iterate, starting after the opening | |
+ * bracket, and find the corresponding | |
+ * closing bracket. | |
+ * | |
+ * if we find none, just drop the FIFO | |
+ * entry silently | |
+ */ | |
+ for (tmp_ir = fifo[0].start, blevel = 0; | |
+ !ir_advance(&tmp_ir);) { | |
+ tmp_bracket_prop = | |
+ ir_get_current_bracket_prop( | |
+ &tmp_ir); | |
+ | |
+ if (tmp_bracket_prop->type == | |
+ BIDI_BRACKET_OPEN && | |
+ tmp_bracket_prop->class == | |
+ bracket_prop->class) { | |
+ /* we encountered another | |
+ * opening bracket of the | |
+ * same class */ | |
+ blevel++; | |
+ | |
+ } else if (tmp_bracket_prop->type == | |
+ BIDI_BRACKET_CLOSE … | |
+ tmp_bracket_prop->class == | |
+ bracket_prop | |
+ ->class) { | |
+ /* we encountered a closing | |
+ * bracket of the same class | |
+ */ | |
+ if (blevel == 0) { | |
+ /* this is the | |
+ * corresponding | |
+ * closing bracket | |
+ */ | |
+ fifo[0].complete = tru… | |
+ fifo[0].end = ir; | |
+ } else { | |
+ blevel--; | |
+ } | |
+ } | |
+ } | |
+ if (fifo[0].complete) { | |
+ /* we found the matching bracket */ | |
+ preprocess_bracket_pair( | |
+ &(fifo[i].start), | |
+ &(fifo[i].end)); | |
+ } | |
+ | |
+ /* shift FIFO one to the left */ | |
+ for (i = 1; i < fifo_len; i++) { | |
+ fifo[i - 1] = fifo[i]; | |
+ } | |
+ fifo_len--; | |
+ } | |
+ | |
+ /* add element to the FIFO */ | |
+ fifo_len++; | |
+ fifo[fifo_len - 1].complete = false; | |
+ fifo[fifo_len - 1].bracket_class = bracket_prop->class; | |
+ fifo[fifo_len - 1].start = ir; | |
+ } else if (prop == BIDI_PROP_ON && | |
+ bracket_prop->type == BIDI_BRACKET_CLOSE) { | |
+ /* | |
+ * go backwards in the FIFO, skip finished entries | |
+ * and simply ignore (do nothing) the closing | |
+ * bracket if we do not match anything | |
+ */ | |
+ for (i = fifo_len; i > 0; i--) { | |
+ if (bracket_prop->class == | |
+ fifo[i - 1].bracket_class && | |
+ !fifo[i - 1].complete) { | |
+ /* we have found a pair */ | |
+ fifo[i - 1].complete = true; | |
+ fifo[i - 1].end = ir; | |
+ | |
+ /* remove all uncompleted FIFO elements | |
+ * above i - 1 */ | |
+ for (j = i; j < fifo_len; j++) { | |
+ if (fifo[j].complete) { | |
+ continue; | |
+ } | |
+ | |
+ /* shift FIFO one to the left … | |
+ for (k = j + 1; k < fifo_len; | |
+ k++) { | |
+ fifo[k - 1] = fifo[k]; | |
+ } | |
+ fifo_len--; | |
+ } | |
+ break; | |
+ } | |
+ } | |
+ } | |
+ | |
+ /* process all ready bracket pairs from the FIFO bottom */ | |
+ while (fifo_len > 0 && fifo[0].complete) { | |
+ preprocess_bracket_pair(&(fifo[0].start), | |
+ &(fifo[0].end)); | |
+ | |
+ /* shift FIFO one to the left */ | |
+ for (j = 0; j + 1 < fifo_len; j++) { | |
+ fifo[j] = fifo[j + 1]; | |
+ } | |
+ fifo_len--; | |
+ } | |
+ } | |
+ | |
+ /* | |
+ * afterwards, we still might have unfinished bracket pairs | |
+ * that will remain as such, but the subsequent finished pairs | |
+ * also need to be taken into account, so we go through the | |
+ * FIFO once more and process all finished pairs | |
+ */ | |
+ for (i = 0; i < fifo_len; i++) { | |
+ if (fifo[i].complete) { | |
+ preprocess_bracket_pair(&(fifo[i].start), | |
+ &(fifo[i].end)); | |
+ } | |
+ } | |
+} | |
+ | |
static size_t | |
preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen, | |
size_t off, uint_least8_t paragraph_level) | |
@@ -355,7 +707,7 @@ preprocess_isolating_run_sequence(uint_least32_t *buf, size… | |
ir_init(buf, buflen, off, paragraph_level, false, &ir); | |
while (!ir_advance(&ir)) { | |
if (ir_get_current_prop(&ir) == BIDI_PROP_EN && | |
- ir.last_strong_type == BIDI_PROP_AL) { | |
+ ir_get_last_strong_prop(&ir) == BIDI_PROP_AL) { | |
ir_set_current_prop(&ir, BIDI_PROP_AN); | |
} | |
} | |
@@ -438,12 +790,13 @@ preprocess_isolating_run_sequence(uint_least32_t *buf, si… | |
ir_init(buf, buflen, off, paragraph_level, false, &ir); | |
while (!ir_advance(&ir)) { | |
if (ir_get_current_prop(&ir) == BIDI_PROP_EN && | |
- ir.last_strong_type == BIDI_PROP_L) { | |
+ ir_get_last_strong_prop(&ir) == BIDI_PROP_L) { | |
ir_set_current_prop(&ir, BIDI_PROP_L); | |
} | |
} | |
/* N0 */ | |
+ preprocess_bracket_pairs(buf, buflen, off, paragraph_level); | |
/* N1 */ | |
sequence_end = SIZE_MAX; | |
@@ -457,10 +810,11 @@ preprocess_isolating_run_sequence(uint_least32_t *buf, si… | |
prop == BIDI_PROP_WS || prop == BIDI_PROP_ON || | |
prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI || | |
prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) { | |
- /* the current character is an NI (neutral or | |
- * isolate) */ | |
+ /* the current character is an NI (neutral | |
+ * or isolate) */ | |
- /* scan ahead to the end of the NI-sequence */ | |
+ /* scan ahead to the end of the NI-sequence | |
+ */ | |
ir_init(buf, buflen, ir.cur.off, | |
paragraph_level, (ir.cur.off > off), | |
&tmp); | |
@@ -480,8 +834,8 @@ preprocess_isolating_run_sequence(uint_least32_t *buf, size… | |
} | |
/* | |
- * check what follows and see if the text has | |
- * the same direction on both sides | |
+ * check what follows and see if the text | |
+ * has the same direction on both sides | |
*/ | |
if (ir_get_previous_prop(&ir) == BIDI_PROP_L && | |
ir_get_next_prop(&tmp) == BIDI_PROP_L) { | |
@@ -794,7 +1148,8 @@ again: | |
} else if (prop == BIDI_PROP_PDI) { | |
/* X6a */ | |
if (overflow_isolate_count > 0) { | |
- /* PDI matches an overflow isolate initiator */ | |
+ /* PDI matches an overflow isolate initiator | |
+ */ | |
overflow_isolate_count--; | |
} else if (valid_isolate_count > 0) { | |
/* PDI matches a normal isolate initiator */ | |
@@ -812,9 +1167,10 @@ again: | |
* | |
* POSSIBLE OPTIMIZATION: Whenever | |
* we push on the stack, check if it | |
- * has the directional isolate status | |
- * true and store a pointer to it | |
- * so we can jump to it very quickly. | |
+ * has the directional isolate | |
+ * status true and store a pointer | |
+ * to it so we can jump to it very | |
+ * quickly. | |
*/ | |
dirstat--; | |
} | |
@@ -974,7 +1330,9 @@ static inline uint_least8_t | |
get_bidi_bracket_off(uint_least32_t cp) | |
{ | |
if (likely(cp <= 0x10FFFF)) { | |
- return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) >> 5; | |
+ return (uint_least8_t)((bidi_minor[bidi_major[cp >> 8] + | |
+ (cp & 0xff)]) >> | |
+ 5); | |
} else { | |
return 0; | |
} | |
diff --git a/test/bidirectional.c b/test/bidirectional.c | |
@@ -25,9 +25,6 @@ main(int argc, char *argv[]) | |
} | |
for (i = 0, failed = 0; i < LEN(bidirectional_test); i++) { | |
- /*if (i != 490798) | |
- continue;*/ | |
- | |
for (m = 0; m < bidirectional_test[i].modelen; m++) { | |
ret = grapheme_bidirectional_preprocess( | |
bidirectional_test[i].cp, |