Split bidi-level-processing into preprocessing and line step - libgrapheme - un… | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 07ba2622e073850bbdd6acd8dff88b391cc5ad5c | |
parent aafe6c300e59ed1b4407c71917fb2034fdc7798a | |
Author: Laslo Hunhold <[email protected]> | |
Date: Mon, 21 Nov 2022 08:53:14 +0100 | |
Split bidi-level-processing into preprocessing and line step | |
The bidirectional algorithm is a bit convoluted in this regard, | |
but the canonical choice for the implementation is to do | |
preprocessing on all paragraphs first (applying all rules up to | |
L1.3) and then apply rule L1.4 separately. | |
The reason for this is that rule L1.4 requires knowledge of | |
the line break positions, which we don't have (yet). We could | |
take them as a parameter for the preprocessing function; however, | |
line breaks may change often (think of an ncurses context with | |
window resizes), making repeated complete reprocessing very | |
wasteful. | |
Thus, the line-specific processing is put into a separate | |
function. This way, the user passes each individual line together | |
with its preprocessing data. | |
Rule L1.4 will be implemented in a later commit. | |
Diffstat: | |
M grapheme.h | 18 ++++++++++++++---- | |
M src/bidirectional.c | 64 ++++++++++++++++++-----------… | |
M test/bidirectional.c | 15 +++++++++------ | |
3 files changed, 61 insertions(+), 36 deletions(-) | |
--- | |
diff --git a/grapheme.h b/grapheme.h | |
@@ -15,16 +15,26 @@ enum grapheme_bidirectional_override { | |
GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL, | |
}; | |
-size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *); | |
-size_t grapheme_encode_utf8(uint_least32_t, char *, size_t); | |
+void grapheme_bidirectional_get_line_embedding_levels( | |
+ const int_least32_t *, size_t, int_least8_t *); | |
-size_t grapheme_get_bidirectional_embedding_levels( | |
+size_t grapheme_bidirectional_preprocess( | |
const uint_least32_t *, size_t, enum grapheme_bidirectional_override, | |
int_least32_t *, size_t); | |
-size_t grapheme_get_bidirectional_embedding_levels_utf8( | |
+size_t grapheme_bidirectional_preprocess_utf8( | |
const char *, size_t, enum grapheme_bidirectional_override, | |
int_least32_t *, size_t); | |
+size_t grapheme_bidirectional_reorder_line( | |
+ const uint_least32_t *, const int_least8_t *, size_t, | |
+ uint_least32_t *, size_t); | |
+size_t grapheme_bidirectional_reorder_line_utf8( | |
+ const char *, const int_least8_t *, size_t, | |
+ char *, size_t); | |
+ | |
+size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *); | |
+size_t grapheme_encode_utf8(uint_least32_t, char *, size_t); | |
+ | |
bool grapheme_is_character_break(uint_least32_t, uint_least32_t, | |
uint_least16_t *); | |
diff --git a/src/bidirectional.c b/src/bidirectional.c | |
@@ -385,8 +385,8 @@ ir_advance(struct isolate_runner *ir) | |
} | |
static size_t | |
-process_isolating_run_sequence(int_least32_t *buf, size_t buflen, size_t off, | |
- uint_least8_t paragraph_level) | |
+preprocess_isolating_run_sequence(int_least32_t *buf, size_t buflen, size_t of… | |
+ uint_least8_t paragraph_level) | |
{ | |
enum bidi_property sequence_prop, prop; | |
struct isolate_runner ir, tmp; | |
@@ -652,8 +652,8 @@ get_paragraph_level(enum grapheme_bidirectional_override ov… | |
} | |
static void | |
-get_paragraph_embedding_levels(enum grapheme_bidirectional_override override, | |
- int_least32_t *buf, size_t buflen) | |
+preprocess_paragraph(enum grapheme_bidirectional_override override, | |
+ int_least32_t *buf, size_t buflen) | |
{ | |
enum bidi_property prop; | |
int_least8_t level; | |
@@ -920,7 +920,7 @@ again: | |
for (bufoff = 0; bufoff < buflen; bufoff++) { | |
if (get_state(STATE_VISITED, buf[bufoff]) == 0 && | |
get_state(STATE_LEVEL, buf[bufoff]) != -1) { | |
- bufoff += process_isolating_run_sequence( | |
+ bufoff += preprocess_isolating_run_sequence( | |
buf, buflen, bufoff, paragraph_level); | |
} | |
} | |
@@ -964,6 +964,12 @@ again: | |
continue; | |
} | |
+ /* rules 1 and 2 */ | |
+ if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) { | |
+ set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff])… | |
+ } | |
+ | |
+ /* rule 3 */ | |
if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI || | |
prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || | |
prop == BIDI_PROP_PDI) { | |
@@ -971,8 +977,12 @@ again: | |
/* a new run has begun */ | |
runsince = bufoff; | |
} | |
- } else if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) { | |
- /* L1.4 -- ignored for now, < beachten! */ | |
+ } else if ((prop == BIDI_PROP_S || prop == BIDI_PROP_B) && | |
+ runsince != SIZE_MAX) { | |
+ /* | |
+ * we hit a segment or paragraph separator in a | |
+ * sequence, reset sequence-levels | |
+ */ | |
for (i = runsince; i < bufoff; i++) { | |
if (get_state(STATE_LEVEL, buf[i]) != -1) { | |
set_state(STATE_LEVEL, paragraph_level, | |
@@ -984,11 +994,6 @@ again: | |
/* sequence ended */ | |
runsince = SIZE_MAX; | |
} | |
- | |
- if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) { | |
- set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff])… | |
- } | |
- continue; | |
} | |
if (runsince != SIZE_MAX) { | |
/* | |
@@ -1027,9 +1032,9 @@ get_bidi_bracket_off(uint_least32_t cp) | |
} | |
static size_t | |
-get_embedding_levels(HERODOTUS_READER *r, | |
- enum grapheme_bidirectional_override override, | |
- int_least32_t *buf, size_t buflen) | |
+preprocess(HERODOTUS_READER *r, | |
+ enum grapheme_bidirectional_override override, | |
+ int_least32_t *buf, size_t buflen) | |
{ | |
size_t bufoff, bufsize, lastparoff; | |
uint_least32_t cp; | |
@@ -1086,16 +1091,11 @@ get_embedding_levels(HERODOTUS_READER *r, | |
* the terminating character or last character of the | |
* string respectively | |
*/ | |
- get_paragraph_embedding_levels(override, buf + lastparoff, | |
- bufoff + 1 - lastparoff); | |
+ preprocess_paragraph(override, buf + lastparoff, | |
+ bufoff + 1 - lastparoff); | |
lastparoff = bufoff + 1; | |
} | |
- /* bake the levels into the buffer, discarding the metadata */ | |
- for (bufoff = 0; bufoff < bufsize; bufoff++) { | |
- buf[bufoff] = get_state(STATE_LEVEL, buf[bufoff]); | |
- } | |
- | |
/* | |
* we return the number of total bytes read, as the function | |
* should indicate if the given level-buffer is too small | |
@@ -1104,7 +1104,7 @@ get_embedding_levels(HERODOTUS_READER *r, | |
} | |
size_t | |
-grapheme_get_bidirectional_embedding_levels( | |
+grapheme_bidirectional_preprocess( | |
const uint_least32_t *src, size_t srclen, | |
enum grapheme_bidirectional_override override, int_least32_t *dest, | |
size_t destlen) | |
@@ -1113,11 +1113,11 @@ grapheme_get_bidirectional_embedding_levels( | |
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
- return get_embedding_levels(&r, override, dest, destlen); | |
+ return preprocess(&r, override, dest, destlen); | |
} | |
size_t | |
-grapheme_get_bidirectional_embedding_levels_utf8( | |
+grapheme_bidirectional_preprocess_utf8( | |
const char *src, size_t srclen, | |
enum grapheme_bidirectional_override override, int_least32_t *dest, | |
size_t destlen) | |
@@ -1126,5 +1126,17 @@ grapheme_get_bidirectional_embedding_levels_utf8( | |
herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
- return get_embedding_levels(&r, override, dest, destlen); | |
+ return preprocess(&r, override, dest, destlen); | |
+} | |
+ | |
+void | |
+grapheme_bidirectional_get_line_embedding_levels( | |
+ const int_least32_t *linedata, size_t linelen, int_least8_t *linelevel) | |
+{ | |
+ size_t i; | |
+ | |
+ /* write the levels into the level-array */ | |
+ for (i = 0; i < linelen; i++) { | |
+ linelevel[i] = get_state(STATE_LEVEL, linedata[i]); | |
+ } | |
} | |
diff --git a/test/bidirectional.c b/test/bidirectional.c | |
@@ -12,10 +12,11 @@ | |
int | |
main(int argc, char *argv[]) | |
{ | |
- int_least32_t lev[512]; /* TODO iterate and get max, allocate */ | |
- size_t i, num_tests, failed, levlen, ret, j, m; | |
+ int_least32_t data[512]; /* TODO iterate and get max, allocate */ | |
+ int_least8_t lev[512]; | |
+ size_t i, num_tests, failed, datalen, ret, j, m; | |
- levlen = LEN(lev); | |
+ datalen = LEN(data); | |
(void)argc; | |
@@ -28,13 +29,15 @@ main(int argc, char *argv[]) | |
continue;*/ | |
for (m = 0; m < bidirectional_test[i].modelen; m++) { | |
- ret = grapheme_get_bidirectional_embedding_levels( | |
+ ret = grapheme_bidirectional_preprocess( | |
bidirectional_test[i].cp, | |
bidirectional_test[i].cplen, | |
- bidirectional_test[i].mode[m], lev, levlen); | |
+ bidirectional_test[i].mode[m], data, datalen); | |
+ grapheme_bidirectional_get_line_embedding_levels( | |
+ data, datalen, lev); | |
if (ret != bidirectional_test[i].cplen || | |
- ret > levlen) { | |
+ ret > datalen) { | |
goto err; | |
} | |