Refactor bidi and add reordering function - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 52ee78ea80d51b163f7fc85e9387389266d2331b | |
parent 77e30a69ce0807fbee01d43eebedda34b54f41af | |
Author: Laslo Hunhold <[email protected]> | |
Date: Fri, 26 May 2023 09:40:10 +0200 | |
Refactor bidi and add reordering function | |
- Rename bidi-override enum to bidi-direction, including entries. This | |
better reflects the general nature of it. | |
- Remove UTF-8-related bidi-functions, given it would be too complicated | |
to reflect in an API and opens up some very difficult challenges. | |
- Rename *_preprocess to *_preprocess_paragraph and return the resolved | |
paragraph embedding level as an optional out-parameter. This is the | |
only way to meaningfully handle large chunks of text with paragraphs | |
of different embedding levels. | |
- Separate the get_paragraph_level() function into two for | |
isolated-paragraphs and whole paragraphs. This simplifies it a lot, as | |
we don't have the crazy bool-flag-mess any more. | |
- Add a grapheme_bidirectional_reorder_line function that directly | |
operates on preprocessed data and returns the reordered string without | |
any additionally necessary buffering. For this the | |
get_line_embedding_levels had to be made a bit more general to allow | |
different ways of writing the levels into the output. | |
This function makes use of the mirror-LUT and has a small section | |
still commented out regarding the proper inversion of grapheme | |
clusters that will need more investigation. | |
Signed-off-by: Laslo Hunhold <[email protected]> | |
Diffstat: | |
M gen/bidirectional-test.c | 44 ++++++++++++++++-------------… | |
M grapheme.h | 24 ++++++++---------------- | |
M src/bidirectional.c | 432 ++++++++++++++++++++++++-----… | |
3 files changed, 366 insertions(+), 134 deletions(-) | |
--- | |
diff --git a/gen/bidirectional-test.c b/gen/bidirectional-test.c | |
@@ -12,7 +12,7 @@ | |
struct bidirectional_test { | |
uint_least32_t *cp; | |
size_t cplen; | |
- enum grapheme_bidirectional_override mode[3]; | |
+ enum grapheme_bidirectional_direction mode[3]; | |
size_t modelen; | |
int_least8_t *level; | |
int_least8_t *reorder; | |
@@ -210,7 +210,7 @@ bidirectional_test_list_print(const struct bidirectional_te… | |
printf("static const struct {\n" | |
"\tuint_least32_t *cp;\n" | |
"\tsize_t cplen;\n" | |
- "\tenum grapheme_bidirectional_override *mode;\n" | |
+ "\tenum grapheme_bidirectional_direction *mode;\n" | |
"\tsize_t modelen;\n" | |
"\tint_least8_t *level;\n" | |
"\tint_least8_t *reorder;\n" | |
@@ -230,18 +230,18 @@ bidirectional_test_list_print(const struct bidirectional_… | |
printf("\t\t.cplen = %zu,\n", test[i].cplen); | |
printf("\t\t.mode = (enum " | |
- "grapheme_bidirectional_override[]){"); | |
+ "grapheme_bidirectional_direction[]){"); | |
for (j = 0; j < test[i].modelen; j++) { | |
if (test[i].mode[j] == | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL) { | |
- printf(" GRAPHEME_BIDIRECTIONAL_OVERRIDE_" | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL) { | |
+ printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_" | |
"NEUTRAL"); | |
} else if (test[i].mode[j] == | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) { | |
- printf(" GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR"); | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { | |
+ printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR"… | |
} else if (test[i].mode[j] == | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) { | |
- printf(" GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL"); | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { | |
+ printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL"… | |
} | |
if (j + 1 < test[i].modelen) { | |
putchar(','); | |
@@ -374,32 +374,32 @@ test_callback(const char *file, char **field, size_t nfie… | |
exit(1); | |
} else if (field[1][0] == '2') { | |
test[testlen - 1].mode[0] = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR; | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR; | |
test[testlen - 1].modelen = 1; | |
} else if (field[1][0] == '3') { | |
/* auto=0 and LTR=1 */ | |
test[testlen - 1].mode[0] = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL; | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; | |
test[testlen - 1].mode[1] = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR; | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR; | |
test[testlen - 1].modelen = 2; | |
} else if (field[1][0] == '4') { | |
test[testlen - 1].mode[0] = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL; | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL; | |
test[testlen - 1].modelen = 1; | |
} else if (field[1][0] == '5') { | |
test[testlen - 1].mode[0] = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL; | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; | |
test[testlen - 1].mode[1] = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL; | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL; | |
test[testlen - 1].modelen = 2; | |
} else if (field[1][0] == '7') { | |
test[testlen - 1].mode[0] = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL; | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; | |
test[testlen - 1].mode[1] = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR; | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR; | |
test[testlen - 1].mode[2] = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL; | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL; | |
test[testlen - 1].modelen = 3; | |
} else { | |
fprintf(stderr, | |
@@ -445,12 +445,14 @@ character_test_callback(const char *file, char **field, s… | |
fprintf(stderr, "malformed paragraph-level-setting.\n"); | |
exit(1); | |
} else if (field[1][0] == '0') { | |
- test[testlen - 1].mode[0] = GRAPHEME_BIDIRECTIONAL_OVERRIDE_LT… | |
+ test[testlen - 1].mode[0] = | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR; | |
} else if (field[1][0] == '1') { | |
- test[testlen - 1].mode[0] = GRAPHEME_BIDIRECTIONAL_OVERRIDE_RT… | |
+ test[testlen - 1].mode[0] = | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL; | |
} else if (field[1][0] == '2') { | |
test[testlen - 1].mode[0] = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL; | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; | |
} else { | |
fprintf(stderr, "unhandled paragraph-level-setting.\n"); | |
exit(1); | |
diff --git a/grapheme.h b/grapheme.h | |
@@ -8,31 +8,23 @@ | |
#define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD) | |
-/* TODO call it simply "direction" without override */ | |
-enum grapheme_bidirectional_override { | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL, | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR, | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL, | |
+enum grapheme_bidirectional_direction { | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL, | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR, | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL, | |
}; | |
size_t grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *, | |
size_t, int_least8_t *, | |
size_t); | |
-size_t grapheme_bidirectional_preprocess(const uint_least32_t *, size_t, | |
- enum grapheme_bidirectional_override, | |
- uint_least32_t *, size_t); | |
-size_t | |
-grapheme_bidirectional_preprocess_utf8(const char *, size_t, | |
- enum grapheme_bidirectional_override, | |
- uint_least32_t *, size_t); | |
+size_t grapheme_bidirectional_preprocess_paragraph( | |
+ const uint_least32_t *, size_t, enum grapheme_bidirectional_direction, | |
+ uint_least32_t *, size_t, enum grapheme_bidirectional_direction *); | |
size_t grapheme_bidirectional_reorder_line(const uint_least32_t *, | |
- const int_least8_t *, size_t, | |
+ const uint_least32_t *, size_t, | |
uint_least32_t *, size_t); | |
-size_t grapheme_bidirectional_reorder_line_utf8(const char *, | |
- const int_least8_t *, size_t, | |
- char *, size_t); | |
size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *); | |
size_t grapheme_encode_utf8(uint_least32_t, char *, size_t); | |
diff --git a/src/bidirectional.c b/src/bidirectional.c | |
@@ -895,28 +895,17 @@ preprocess_isolating_run_sequence(uint_least32_t *buf, si… | |
} | |
static uint_least8_t | |
-get_paragraph_level(enum grapheme_bidirectional_override override, | |
- bool terminate_on_pdi, const uint_least32_t *buf, | |
- size_t buflen) | |
+get_isolated_paragraph_level(const uint_least32_t *state, size_t statelen) | |
{ | |
enum bidi_property prop; | |
int_least8_t isolate_level; | |
- size_t bufoff; | |
+ size_t stateoff; | |
- /* check overrides first according to rule HL1 */ | |
- if (override == GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) { | |
- return 0; | |
- } else if (override == GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) { | |
- return 1; | |
- } | |
- | |
- /* determine paragraph level (rules P1-P3) */ | |
+ /* determine paragraph level (rules P1-P3) and terminate on PDI */ | |
+ for (stateoff = 0, isolate_level = 0; stateoff < statelen; stateoff++)… | |
+ prop = get_state(STATE_PROP, state[stateoff]); | |
- for (bufoff = 0, isolate_level = 0; bufoff < buflen; bufoff++) { | |
- prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]); | |
- | |
- if (prop == BIDI_PROP_PDI && isolate_level == 0 && | |
- terminate_on_pdi) { | |
+ if (prop == BIDI_PROP_PDI && isolate_level == 0) { | |
/* | |
* we are in a FSI-subsection of a paragraph and | |
* matched with the terminating PDI | |
@@ -950,28 +939,86 @@ get_paragraph_level(enum grapheme_bidirectional_override … | |
return 0; | |
} | |
+static inline uint_least8_t | |
+get_bidi_property(uint_least32_t cp) | |
+{ | |
+ if (likely(cp <= 0x10FFFF)) { | |
+ return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) & | |
+ 0x1F /* 00011111 */; | |
+ } else { | |
+ return BIDI_PROP_L; | |
+ } | |
+} | |
+ | |
+static uint_least8_t | |
+get_paragraph_level(enum grapheme_bidirectional_direction override, | |
+ const HERODOTUS_READER *r) | |
+{ | |
+ HERODOTUS_READER tmp; | |
+ enum bidi_property prop; | |
+ int_least8_t isolate_level; | |
+ uint_least32_t cp; | |
+ | |
+ /* check overrides first according to rule HL1 */ | |
+ if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { | |
+ return 0; | |
+ } else if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { | |
+ return 1; | |
+ } | |
+ | |
+ /* copy reader into temporary reader */ | |
+ herodotus_reader_copy(r, &tmp); | |
+ | |
+ /* determine paragraph level (rules P1-P3) */ | |
+ for (isolate_level = 0; herodotus_read_codepoint(&tmp, true, &cp) == | |
+ HERODOTUS_STATUS_SUCCESS;) { | |
+ prop = get_bidi_property(cp); | |
+ | |
+ /* BD8/BD9 */ | |
+ if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI || | |
+ prop == BIDI_PROP_FSI) && | |
+ isolate_level < MAX_DEPTH) { | |
+ /* we hit an isolate initiator, increment counter */ | |
+ isolate_level++; | |
+ } else if (prop == BIDI_PROP_PDI && isolate_level > 0) { | |
+ isolate_level--; | |
+ } | |
+ | |
+ /* P2 */ | |
+ if (isolate_level > 0) { | |
+ continue; | |
+ } | |
+ | |
+ /* P3 */ | |
+ if (prop == BIDI_PROP_L) { | |
+ return 0; | |
+ } else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) { | |
+ return 1; | |
+ } | |
+ } | |
+ | |
+ return 0; | |
+} | |
+ | |
static void | |
-preprocess_paragraph(enum grapheme_bidirectional_override override, | |
- uint_least32_t *buf, size_t buflen) | |
+preprocess_paragraph(uint_least8_t paragraph_level, uint_least32_t *buf, | |
+ size_t buflen) | |
{ | |
enum bidi_property prop; | |
int_least8_t level; | |
struct { | |
int_least8_t level; | |
- enum grapheme_bidirectional_override override; | |
+ enum grapheme_bidirectional_direction override; | |
bool directional_isolate; | |
} directional_status[MAX_DEPTH + 2], *dirstat = directional_status; | |
size_t overflow_isolate_count, overflow_embedding_count, | |
valid_isolate_count, bufoff, i, runsince; | |
- uint_least8_t paragraph_level; | |
- | |
- paragraph_level = get_paragraph_level(override, false, buf, buflen); | |
/* X1 */ | |
dirstat->level = (int_least8_t)paragraph_level; | |
- dirstat->override = GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL; | |
+ dirstat->override = GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL; | |
dirstat->directional_isolate = false; | |
overflow_isolate_count = overflow_embedding_count = | |
valid_isolate_count = 0; | |
@@ -995,7 +1042,7 @@ again: | |
(dirstat - 1)->level + | |
((dirstat - 1)->level % 2 != 0) + 1; | |
dirstat->override = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRA… | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTR… | |
dirstat->directional_isolate = false; | |
} else { | |
/* overflow RLE */ | |
@@ -1014,7 +1061,7 @@ again: | |
(dirstat - 1)->level + | |
((dirstat - 1)->level % 2 == 0) + 1; | |
dirstat->override = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRA… | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTR… | |
dirstat->directional_isolate = false; | |
} else { | |
/* overflow LRE */ | |
@@ -1033,7 +1080,7 @@ again: | |
(dirstat - 1)->level + | |
((dirstat - 1)->level % 2 != 0) + 1; | |
dirstat->override = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL; | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL; | |
dirstat->directional_isolate = false; | |
} else { | |
/* overflow RLO */ | |
@@ -1052,7 +1099,7 @@ again: | |
(dirstat - 1)->level + | |
((dirstat - 1)->level % 2 == 0) + 1; | |
dirstat->override = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR; | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR; | |
dirstat->directional_isolate = false; | |
} else { | |
/* overflow LRO */ | |
@@ -1063,11 +1110,11 @@ again: | |
/* X5a */ | |
set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff])); | |
if (dirstat->override == | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) { | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { | |
set_state(STATE_PROP, BIDI_PROP_L, | |
&(buf[bufoff])); | |
} else if (dirstat->override == | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) { | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { | |
set_state(STATE_PROP, BIDI_PROP_R, | |
&(buf[bufoff])); | |
} | |
@@ -1084,7 +1131,7 @@ again: | |
(dirstat - 1)->level + | |
((dirstat - 1)->level % 2 != 0) + 1; | |
dirstat->override = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRA… | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTR… | |
dirstat->directional_isolate = true; | |
} else { | |
/* overflow RLI */ | |
@@ -1094,11 +1141,11 @@ again: | |
/* X5b */ | |
set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff])); | |
if (dirstat->override == | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) { | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { | |
set_state(STATE_PROP, BIDI_PROP_L, | |
&(buf[bufoff])); | |
} else if (dirstat->override == | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) { | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { | |
set_state(STATE_PROP, BIDI_PROP_R, | |
&(buf[bufoff])); | |
} | |
@@ -1115,7 +1162,7 @@ again: | |
(dirstat - 1)->level + | |
((dirstat - 1)->level % 2 == 0) + 1; | |
dirstat->override = | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRA… | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTR… | |
dirstat->directional_isolate = true; | |
} else { | |
/* overflow LRI */ | |
@@ -1123,9 +1170,8 @@ again: | |
} | |
} else if (prop == BIDI_PROP_FSI) { | |
/* X5c */ | |
- if (get_paragraph_level( | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL, | |
- true, buf + (bufoff + 1), | |
+ if (get_isolated_paragraph_level( | |
+ buf + (bufoff + 1), | |
buflen - (bufoff + 1)) == 1) { | |
prop = BIDI_PROP_RLI; | |
goto again; | |
@@ -1138,11 +1184,11 @@ again: | |
/* X6 */ | |
set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff])); | |
if (dirstat->override == | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) { | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { | |
set_state(STATE_PROP, BIDI_PROP_L, | |
&(buf[bufoff])); | |
} else if (dirstat->override == | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) { | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { | |
set_state(STATE_PROP, BIDI_PROP_R, | |
&(buf[bufoff])); | |
} | |
@@ -1190,11 +1236,11 @@ again: | |
set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff])); | |
if (dirstat->override == | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) { | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) { | |
set_state(STATE_PROP, BIDI_PROP_L, | |
&(buf[bufoff])); | |
} else if (dirstat->override == | |
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) { | |
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) { | |
set_state(STATE_PROP, BIDI_PROP_R, | |
&(buf[bufoff])); | |
} | |
@@ -1317,17 +1363,6 @@ again: | |
} | |
static inline uint_least8_t | |
-get_bidi_property(uint_least32_t cp) | |
-{ | |
- if (likely(cp <= 0x10FFFF)) { | |
- return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) & | |
- 0x1F /* 00011111 */; | |
- } else { | |
- return BIDI_PROP_L; | |
- } | |
-} | |
- | |
-static inline uint_least8_t | |
get_bidi_bracket_off(uint_least32_t cp) | |
{ | |
if (likely(cp <= 0x10FFFF)) { | |
@@ -1340,20 +1375,35 @@ get_bidi_bracket_off(uint_least32_t cp) | |
} | |
static size_t | |
-preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_override override, | |
- uint_least32_t *buf, size_t buflen) | |
+preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_direction override, | |
+ uint_least32_t *buf, size_t buflen, | |
+ enum grapheme_bidirectional_direction *resolved) | |
{ | |
- size_t bufoff, bufsize, lastparoff; | |
+ HERODOTUS_READER tmp; | |
+ size_t bufoff, bufsize, paragraph_len; | |
uint_least32_t cp; | |
+ uint_least8_t paragraph_level; | |
- if (buf == NULL) { | |
- for (; herodotus_read_codepoint(r, true, &cp) == | |
- HERODOTUS_STATUS_SUCCESS;) { | |
- ; | |
+ /* determine length and level of the paragraph */ | |
+ herodotus_reader_copy(r, &tmp); | |
+ for (; herodotus_read_codepoint(&tmp, true, &cp) == | |
+ HERODOTUS_STATUS_SUCCESS;) { | |
+ /* break on paragraph separator */ | |
+ if (get_bidi_property(cp) == BIDI_PROP_B) { | |
+ break; | |
} | |
+ } | |
+ paragraph_len = herodotus_reader_number_read(&tmp); | |
+ paragraph_level = get_paragraph_level(override, r); | |
+ | |
+ if (resolved != NULL) { | |
+ /* store resolved paragraph level in output variable */ | |
+ *resolved = paragraph_level; | |
+ } | |
+ if (buf == NULL) { | |
/* see below for return value reasoning */ | |
- return herodotus_reader_number_read(r); | |
+ return paragraph_len; | |
} | |
/* | |
@@ -1361,6 +1411,7 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirection… | |
* and store them in the buffer | |
*/ | |
for (bufoff = 0; | |
+ bufoff < paragraph_len && | |
herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCES… | |
bufoff++) { | |
if (bufoff < buflen) { | |
@@ -1385,7 +1436,7 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirection… | |
} | |
bufsize = herodotus_reader_number_read(r); | |
- for (bufoff = 0, lastparoff = 0; bufoff < bufsize; bufoff++) { | |
+ for (bufoff = 0; bufoff < bufsize; bufoff++) { | |
if (get_state(STATE_PROP, buf[bufoff]) != BIDI_PROP_B && | |
bufoff != bufsize - 1) { | |
continue; | |
@@ -1398,9 +1449,8 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirection… | |
* the terminating character or last character of the | |
* string respectively | |
*/ | |
- preprocess_paragraph(override, buf + lastparoff, | |
- bufoff + 1 - lastparoff); | |
- lastparoff = bufoff + 1; | |
+ preprocess_paragraph(paragraph_level, buf, bufoff + 1); | |
+ break; | |
} | |
/* | |
@@ -1411,50 +1461,41 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirecti… | |
} | |
size_t | |
-grapheme_bidirectional_preprocess(const uint_least32_t *src, size_t srclen, | |
- enum grapheme_bidirectional_override overrid… | |
- uint_least32_t *dest, size_t destlen) | |
+grapheme_bidirectional_preprocess_paragraph( | |
+ const uint_least32_t *src, size_t srclen, | |
+ enum grapheme_bidirectional_direction override, uint_least32_t *dest, | |
+ size_t destlen, enum grapheme_bidirectional_direction *resolved) | |
{ | |
HERODOTUS_READER r; | |
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
- return preprocess(&r, override, dest, destlen); | |
+ return preprocess(&r, override, dest, destlen, resolved); | |
} | |
-size_t | |
-grapheme_bidirectional_preprocess_utf8( | |
- const char *src, size_t srclen, | |
- enum grapheme_bidirectional_override override, uint_least32_t *dest, | |
- size_t destlen) | |
-{ | |
- HERODOTUS_READER r; | |
- | |
- herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
- | |
- return preprocess(&r, override, dest, destlen); | |
-} | |
- | |
-size_t | |
-grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedat… | |
- size_t linelen, | |
- int_least8_t *lev, | |
- size_t levlen) | |
+static inline size_t | |
+get_line_embedding_levels(const uint_least32_t *linedata, size_t linelen, | |
+ int_least8_t (*get_level)(const void *, size_t), | |
+ void (*set_level)(void *, size_t, int_least8_t), | |
+ void *lev, size_t levsize, bool skipignored) | |
{ | |
enum bidi_property prop; | |
- size_t i, runsince; | |
- int_least8_t level; | |
+ size_t i, levlen, runsince; | |
+ int_least8_t level, runlevel; | |
/* rule L1.4 */ | |
runsince = SIZE_MAX; | |
- for (i = 0; i < linelen; i++) { | |
+ for (i = 0, levlen = 0; i < linelen; i++) { | |
level = (int_least8_t)get_state(STATE_LEVEL, linedata[i]); | |
prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP, | |
linedata[i]); | |
/* write level into level array if we still have space */ | |
- if (i < levlen) { | |
- lev[i] = level; | |
+ if (level != -1 || skipignored == false) { | |
+ if (levlen <= levsize) { | |
+ set_level(lev, levlen, level); | |
+ } | |
+ levlen++; | |
} | |
if (level == -1) { | |
@@ -1467,11 +1508,14 @@ grapheme_bidirectional_get_line_embedding_levels(const … | |
prop == BIDI_PROP_PDI) { | |
if (runsince == SIZE_MAX) { | |
/* a new run has begun */ | |
- runsince = i; | |
+ runsince = levlen - 1; /* levlen > 0 */ | |
+ runlevel = get_state(STATE_PARAGRAPH_LEVEL, | |
+ linedata[i]); | |
} | |
} else { | |
/* sequence ended */ | |
runsince = SIZE_MAX; | |
+ runlevel = -1; | |
} | |
} | |
if (runsince != SIZE_MAX) { | |
@@ -1479,13 +1523,207 @@ grapheme_bidirectional_get_line_embedding_levels(const… | |
* we hit the end of the line but were in a run; | |
* reset the line levels to the paragraph level | |
*/ | |
- for (i = runsince; i < MIN(linelen, levlen); i++) { | |
- if (lev[i] != -1) { | |
- lev[i] = (int_least8_t)get_state( | |
- STATE_PARAGRAPH_LEVEL, linedata[i]); | |
+ for (i = runsince; i < MIN(linelen, levsize); i++) { | |
+ if (get_level(lev, i) != -1) { | |
+ set_level(lev, i, runlevel); | |
+ } | |
+ } | |
+ } | |
+ | |
+ return levlen; | |
+} | |
+ | |
+static inline int_least8_t | |
+get_level_int8(const void *lev, size_t off) | |
+{ | |
+ return ((int_least8_t *)lev)[off]; | |
+} | |
+ | |
+static inline void | |
+set_level_int8(void *lev, size_t off, int_least8_t value) | |
+{ | |
+ ((int_least8_t *)lev)[off] = value; | |
+} | |
+ | |
+size_t | |
+grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedat… | |
+ size_t linelen, | |
+ int_least8_t *lev, | |
+ size_t levlen) | |
+{ | |
+ return get_line_embedding_levels(linedata, linelen, get_level_int8, | |
+ set_level_int8, lev, levlen, false); | |
+} | |
+ | |
+static inline int_least8_t | |
+get_level_uint32(const void *lev, size_t off) | |
+{ | |
+ return (int_least8_t)((((uint_least32_t *)lev)[off] & | |
+ UINT32_C(0x1FE00000)) >> | |
+ 21) - | |
+ 1; | |
+} | |
+ | |
+static inline void | |
+set_level_uint32(void *lev, size_t off, int_least8_t value) | |
+{ | |
+ ((uint_least32_t *)lev)[off] ^= | |
+ ((uint_least32_t *)lev)[off] & UINT32_C(0x1FE00000); | |
+ ((uint_least32_t *)lev)[off] |= ((uint_least32_t)(value + 1)) << 21; | |
+} | |
+ | |
+static inline int_least16_t | |
+get_mirror_offset(uint_least32_t cp) | |
+{ | |
+ if (cp <= UINT32_C(0x10FFFF)) { | |
+ return mirror_minor[mirror_major[cp >> 8] + (cp & 0xFF)]; | |
+ } else { | |
+ return 0; | |
+ } | |
+} | |
+ | |
+size_t | |
+grapheme_bidirectional_reorder_line(const uint_least32_t *line, | |
+ const uint_least32_t *linedata, | |
+ size_t linelen, uint_least32_t *output, | |
+ size_t outputsize) | |
+{ | |
+ size_t i, outputlen, first, last, j, k, l, laststart; | |
+ int_least8_t level, min_odd_level = MAX_DEPTH + 2, max_level = 0; | |
+ uint_least32_t tmp; | |
+ | |
+ /* write output characters */ | |
+ for (i = 0, outputlen = 0; i < linelen; i++) { | |
+ if (get_state(STATE_LEVEL, linedata[i]) != -1) { | |
+ if (outputlen < outputsize) { | |
+ output[outputlen] = line[i]; | |
+ } | |
+ outputlen++; | |
+ } | |
+ } | |
+ if (outputlen >= outputsize) { | |
+ /* clear output buffer */ | |
+ for (i = 0; i < outputsize; i++) { | |
+ output[i] = GRAPHEME_INVALID_CODEPOINT; | |
+ } | |
+ | |
+ /* return required size */ | |
+ return outputlen; | |
+ } | |
+ | |
+ /* | |
+ * write line embedding levels as metadata and codepoints into the | |
+ * output | |
+ */ | |
+ get_line_embedding_levels(linedata, linelen, get_level_uint32, | |
+ set_level_uint32, output, linelen, true); | |
+ | |
+ /* determine level range */ | |
+ for (i = 0; i < outputlen; i++) { | |
+ level = get_level_uint32(output, i); | |
+ | |
+ if (level == -1) { | |
+ /* ignored character */ | |
+ continue; | |
+ } | |
+ | |
+ if (level % 2 == 1 && level < min_odd_level) { | |
+ min_odd_level = level; | |
+ } | |
+ if (level > max_level) { | |
+ max_level = level; | |
+ } | |
+ } | |
+ | |
+ for (level = max_level; level >= min_odd_level /* > 0 */; level--) { | |
+ for (i = 0; i < outputlen; i++) { | |
+ if (get_level_uint32(output, i) >= level) { | |
+ /* | |
+ * the current character has the desired level | |
+ */ | |
+ first = last = i; | |
+ | |
+ /* find the end of the level-sequence */ | |
+ for (i++; i < outputlen; i++) { | |
+ if (get_level_uint32(output, i) >= | |
+ level) { | |
+ /* the sequence continues */ | |
+ last = i; | |
+ } else { | |
+ break; | |
+ } | |
+ } | |
+ | |
+ /* invert the sequence first..last respecting | |
+ * grapheme clusters | |
+ * | |
+ * The standard only speaks of combining marks | |
+ * inversion, but we should in the perfect case | |
+ * respect _all_ grapheme clusters, which we do | |
+ * here! | |
+ */ | |
+ | |
+ /* mark grapheme cluster breaks */ | |
+ for (j = first; j <= last; | |
+ j += grapheme_next_character_break( | |
+ line + j, outputlen - j)) { | |
+ /* | |
+ * we use a special trick here: The | |
+ * first 21 bits of the state are fill… | |
+ * with the codepoint, the next 8 bits | |
+ * are used for the level, so we can u… | |
+ * the 30th bit to mark the grapheme | |
+ * cluster breaks. This allows us to | |
+ * reinvert the grapheme clusters into | |
+ * the proper direction later. | |
+ */ | |
+ output[j] |= UINT32_C(1) << 29; | |
+ } | |
+ | |
+ /* global inversion */ | |
+ for (k = first, l = last; k < l; k++, l--) { | |
+ /* swap */ | |
+ tmp = output[k]; | |
+ output[k] = output[l]; | |
+ output[l] = tmp; | |
+ } | |
+ | |
+ /* grapheme cluster reinversion */ | |
+#if 0 | |
+ for (j = first, laststart = first; j <= last; | |
+ j++) { | |
+ if (output[j] & (UINT32_C(1) << 29)) { | |
+ /* we hit a mark! given the | |
+ * grapheme cluster is inverte… | |
+ * this means that the cluster | |
+ * ended and we now reinvert it | |
+ * again | |
+ */ | |
+ for (k = laststart, l = j; | |
+ k < l; k++, l--) { | |
+ /* swap */ | |
+ tmp = output[k]; | |
+ output[k] = output[l]; | |
+ output[l] = tmp; | |
+ } | |
+ laststart = j + 1; | |
+ } | |
+ } | |
+#endif | |
+ | |
+ /* unmark grapheme cluster breaks */ | |
+ for (j = first; j <= last; j++) { | |
+ output[j] ^= output[j] & | |
+ UINT32_C(0x20000000); | |
+ } | |
} | |
} | |
} | |
- return linelen; | |
+ /* remove embedding level metadata */ | |
+ for (i = 0; i < outputlen; i++) { | |
+ output[i] ^= output[i] & UINT32_C(0x1FE00000); | |
+ } | |
+ | |
+ return outputlen; | |
} |