Introduction
Introduction Statistics Contact Development Disclaimer Help
Refactor bidi and add reordering function - libgrapheme - unicode string library
git clone git://git.suckless.org/libgrapheme
Log
Files
Refs
README
LICENSE
---
commit 52ee78ea80d51b163f7fc85e9387389266d2331b
parent 77e30a69ce0807fbee01d43eebedda34b54f41af
Author: Laslo Hunhold <[email protected]>
Date: Fri, 26 May 2023 09:40:10 +0200
Refactor bidi and add reordering function
- Rename bidi-override enum to bidi-direction, including entries. This
better reflects the general nature of it.
- Remove UTF-8-related bidi-functions, given it would be too complicated
to reflect in an API and would open up some very difficult challenges.
- Rename *_preprocess to *_preprocess_paragraph and return the resolved
paragraph embedding level as an optional out-parameter. This is the
only way to meaningfully handle large chunks of text with paragraphs
of different embedding levels.
- Separate the get_paragraph_level() function into two for
isolated-paragraphs and whole paragraphs. This simplifies it a lot, as
we don't have the crazy bool-flag-mess any more.
- Add a grapheme_bidirectional_reorder_line function that directly
operates on preprocessed data and returns the reordered string without
requiring any additional buffering. For this, the
get_line_embedding_levels function had to be made a bit more general to
allow different ways of writing the levels into the output.
This function makes use of the mirror-LUT and has a small section
still commented out regarding the proper inversion of grapheme
clusters that will need more investigation.
Signed-off-by: Laslo Hunhold <[email protected]>
Diffstat:
M gen/bidirectional-test.c | 44 ++++++++++++++++-------------…
M grapheme.h | 24 ++++++++----------------
M src/bidirectional.c | 432 ++++++++++++++++++++++++-----…
3 files changed, 366 insertions(+), 134 deletions(-)
---
diff --git a/gen/bidirectional-test.c b/gen/bidirectional-test.c
@@ -12,7 +12,7 @@
struct bidirectional_test {
uint_least32_t *cp;
size_t cplen;
- enum grapheme_bidirectional_override mode[3];
+ enum grapheme_bidirectional_direction mode[3];
size_t modelen;
int_least8_t *level;
int_least8_t *reorder;
@@ -210,7 +210,7 @@ bidirectional_test_list_print(const struct bidirectional_te…
printf("static const struct {\n"
"\tuint_least32_t *cp;\n"
"\tsize_t cplen;\n"
- "\tenum grapheme_bidirectional_override *mode;\n"
+ "\tenum grapheme_bidirectional_direction *mode;\n"
"\tsize_t modelen;\n"
"\tint_least8_t *level;\n"
"\tint_least8_t *reorder;\n"
@@ -230,18 +230,18 @@ bidirectional_test_list_print(const struct bidirectional_…
printf("\t\t.cplen = %zu,\n", test[i].cplen);
printf("\t\t.mode = (enum "
- "grapheme_bidirectional_override[]){");
+ "grapheme_bidirectional_direction[]){");
for (j = 0; j < test[i].modelen; j++) {
if (test[i].mode[j] ==
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL) {
- printf(" GRAPHEME_BIDIRECTIONAL_OVERRIDE_"
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL) {
+ printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_"
"NEUTRAL");
} else if (test[i].mode[j] ==
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
- printf(" GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR");
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
+ printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR"…
} else if (test[i].mode[j] ==
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
- printf(" GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL");
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
+ printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL"…
}
if (j + 1 < test[i].modelen) {
putchar(',');
@@ -374,32 +374,32 @@ test_callback(const char *file, char **field, size_t nfie…
exit(1);
} else if (field[1][0] == '2') {
test[testlen - 1].mode[0] =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR;
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
test[testlen - 1].modelen = 1;
} else if (field[1][0] == '3') {
/* auto=0 and LTR=1 */
test[testlen - 1].mode[0] =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
test[testlen - 1].mode[1] =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR;
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
test[testlen - 1].modelen = 2;
} else if (field[1][0] == '4') {
test[testlen - 1].mode[0] =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL;
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
test[testlen - 1].modelen = 1;
} else if (field[1][0] == '5') {
test[testlen - 1].mode[0] =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
test[testlen - 1].mode[1] =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL;
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
test[testlen - 1].modelen = 2;
} else if (field[1][0] == '7') {
test[testlen - 1].mode[0] =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
test[testlen - 1].mode[1] =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR;
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
test[testlen - 1].mode[2] =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL;
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
test[testlen - 1].modelen = 3;
} else {
fprintf(stderr,
@@ -445,12 +445,14 @@ character_test_callback(const char *file, char **field, s…
fprintf(stderr, "malformed paragraph-level-setting.\n");
exit(1);
} else if (field[1][0] == '0') {
- test[testlen - 1].mode[0] = GRAPHEME_BIDIRECTIONAL_OVERRIDE_LT…
+ test[testlen - 1].mode[0] =
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
} else if (field[1][0] == '1') {
- test[testlen - 1].mode[0] = GRAPHEME_BIDIRECTIONAL_OVERRIDE_RT…
+ test[testlen - 1].mode[0] =
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
} else if (field[1][0] == '2') {
test[testlen - 1].mode[0] =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
} else {
fprintf(stderr, "unhandled paragraph-level-setting.\n");
exit(1);
diff --git a/grapheme.h b/grapheme.h
@@ -8,31 +8,23 @@
#define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)
-/* TODO call it simply "direction" without override */
-enum grapheme_bidirectional_override {
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL,
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR,
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL,
+enum grapheme_bidirectional_direction {
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL,
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR,
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL,
};
size_t grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *,
size_t, int_least8_t *,
size_t);
-size_t grapheme_bidirectional_preprocess(const uint_least32_t *, size_t,
- enum grapheme_bidirectional_override,
- uint_least32_t *, size_t);
-size_t
-grapheme_bidirectional_preprocess_utf8(const char *, size_t,
- enum grapheme_bidirectional_override,
- uint_least32_t *, size_t);
+size_t grapheme_bidirectional_preprocess_paragraph(
+ const uint_least32_t *, size_t, enum grapheme_bidirectional_direction,
+ uint_least32_t *, size_t, enum grapheme_bidirectional_direction *);
size_t grapheme_bidirectional_reorder_line(const uint_least32_t *,
- const int_least8_t *, size_t,
+ const uint_least32_t *, size_t,
uint_least32_t *, size_t);
-size_t grapheme_bidirectional_reorder_line_utf8(const char *,
- const int_least8_t *, size_t,
- char *, size_t);
size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *);
size_t grapheme_encode_utf8(uint_least32_t, char *, size_t);
diff --git a/src/bidirectional.c b/src/bidirectional.c
@@ -895,28 +895,17 @@ preprocess_isolating_run_sequence(uint_least32_t *buf, si…
}
static uint_least8_t
-get_paragraph_level(enum grapheme_bidirectional_override override,
- bool terminate_on_pdi, const uint_least32_t *buf,
- size_t buflen)
+get_isolated_paragraph_level(const uint_least32_t *state, size_t statelen)
{
enum bidi_property prop;
int_least8_t isolate_level;
- size_t bufoff;
+ size_t stateoff;
- /* check overrides first according to rule HL1 */
- if (override == GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
- return 0;
- } else if (override == GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
- return 1;
- }
-
- /* determine paragraph level (rules P1-P3) */
+ /* determine paragraph level (rules P1-P3) and terminate on PDI */
+ for (stateoff = 0, isolate_level = 0; stateoff < statelen; stateoff++)…
+ prop = get_state(STATE_PROP, state[stateoff]);
- for (bufoff = 0, isolate_level = 0; bufoff < buflen; bufoff++) {
- prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
-
- if (prop == BIDI_PROP_PDI && isolate_level == 0 &&
- terminate_on_pdi) {
+ if (prop == BIDI_PROP_PDI && isolate_level == 0) {
/*
* we are in a FSI-subsection of a paragraph and
* matched with the terminating PDI
@@ -950,28 +939,86 @@ get_paragraph_level(enum grapheme_bidirectional_override …
return 0;
}
+static inline uint_least8_t
+get_bidi_property(uint_least32_t cp)
+{
+ if (likely(cp <= 0x10FFFF)) {
+ return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) &
+ 0x1F /* 00011111 */;
+ } else {
+ return BIDI_PROP_L;
+ }
+}
+
+static uint_least8_t
+get_paragraph_level(enum grapheme_bidirectional_direction override,
+ const HERODOTUS_READER *r)
+{
+ HERODOTUS_READER tmp;
+ enum bidi_property prop;
+ int_least8_t isolate_level;
+ uint_least32_t cp;
+
+ /* check overrides first according to rule HL1 */
+ if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
+ return 0;
+ } else if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
+ return 1;
+ }
+
+ /* copy reader into temporary reader */
+ herodotus_reader_copy(r, &tmp);
+
+ /* determine paragraph level (rules P1-P3) */
+ for (isolate_level = 0; herodotus_read_codepoint(&tmp, true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS;) {
+ prop = get_bidi_property(cp);
+
+ /* BD8/BD9 */
+ if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
+ prop == BIDI_PROP_FSI) &&
+ isolate_level < MAX_DEPTH) {
+ /* we hit an isolate initiator, increment counter */
+ isolate_level++;
+ } else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
+ isolate_level--;
+ }
+
+ /* P2 */
+ if (isolate_level > 0) {
+ continue;
+ }
+
+ /* P3 */
+ if (prop == BIDI_PROP_L) {
+ return 0;
+ } else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) {
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
static void
-preprocess_paragraph(enum grapheme_bidirectional_override override,
- uint_least32_t *buf, size_t buflen)
+preprocess_paragraph(uint_least8_t paragraph_level, uint_least32_t *buf,
+ size_t buflen)
{
enum bidi_property prop;
int_least8_t level;
struct {
int_least8_t level;
- enum grapheme_bidirectional_override override;
+ enum grapheme_bidirectional_direction override;
bool directional_isolate;
} directional_status[MAX_DEPTH + 2], *dirstat = directional_status;
size_t overflow_isolate_count, overflow_embedding_count,
valid_isolate_count, bufoff, i, runsince;
- uint_least8_t paragraph_level;
-
- paragraph_level = get_paragraph_level(override, false, buf, buflen);
/* X1 */
dirstat->level = (int_least8_t)paragraph_level;
- dirstat->override = GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
+ dirstat->override = GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
dirstat->directional_isolate = false;
overflow_isolate_count = overflow_embedding_count =
valid_isolate_count = 0;
@@ -995,7 +1042,7 @@ again:
(dirstat - 1)->level +
((dirstat - 1)->level % 2 != 0) + 1;
dirstat->override =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRA…
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTR…
dirstat->directional_isolate = false;
} else {
/* overflow RLE */
@@ -1014,7 +1061,7 @@ again:
(dirstat - 1)->level +
((dirstat - 1)->level % 2 == 0) + 1;
dirstat->override =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRA…
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTR…
dirstat->directional_isolate = false;
} else {
/* overflow LRE */
@@ -1033,7 +1080,7 @@ again:
(dirstat - 1)->level +
((dirstat - 1)->level % 2 != 0) + 1;
dirstat->override =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL;
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
dirstat->directional_isolate = false;
} else {
/* overflow RLO */
@@ -1052,7 +1099,7 @@ again:
(dirstat - 1)->level +
((dirstat - 1)->level % 2 == 0) + 1;
dirstat->override =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR;
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
dirstat->directional_isolate = false;
} else {
/* overflow LRO */
@@ -1063,11 +1110,11 @@ again:
/* X5a */
set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
if (dirstat->override ==
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
set_state(STATE_PROP, BIDI_PROP_L,
&(buf[bufoff]));
} else if (dirstat->override ==
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
set_state(STATE_PROP, BIDI_PROP_R,
&(buf[bufoff]));
}
@@ -1084,7 +1131,7 @@ again:
(dirstat - 1)->level +
((dirstat - 1)->level % 2 != 0) + 1;
dirstat->override =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRA…
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTR…
dirstat->directional_isolate = true;
} else {
/* overflow RLI */
@@ -1094,11 +1141,11 @@ again:
/* X5b */
set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
if (dirstat->override ==
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
set_state(STATE_PROP, BIDI_PROP_L,
&(buf[bufoff]));
} else if (dirstat->override ==
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
set_state(STATE_PROP, BIDI_PROP_R,
&(buf[bufoff]));
}
@@ -1115,7 +1162,7 @@ again:
(dirstat - 1)->level +
((dirstat - 1)->level % 2 == 0) + 1;
dirstat->override =
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRA…
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTR…
dirstat->directional_isolate = true;
} else {
/* overflow LRI */
@@ -1123,9 +1170,8 @@ again:
}
} else if (prop == BIDI_PROP_FSI) {
/* X5c */
- if (get_paragraph_level(
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL,
- true, buf + (bufoff + 1),
+ if (get_isolated_paragraph_level(
+ buf + (bufoff + 1),
buflen - (bufoff + 1)) == 1) {
prop = BIDI_PROP_RLI;
goto again;
@@ -1138,11 +1184,11 @@ again:
/* X6 */
set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
if (dirstat->override ==
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
set_state(STATE_PROP, BIDI_PROP_L,
&(buf[bufoff]));
} else if (dirstat->override ==
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
set_state(STATE_PROP, BIDI_PROP_R,
&(buf[bufoff]));
}
@@ -1190,11 +1236,11 @@ again:
set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
if (dirstat->override ==
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
set_state(STATE_PROP, BIDI_PROP_L,
&(buf[bufoff]));
} else if (dirstat->override ==
- GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
+ GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
set_state(STATE_PROP, BIDI_PROP_R,
&(buf[bufoff]));
}
@@ -1317,17 +1363,6 @@ again:
}
static inline uint_least8_t
-get_bidi_property(uint_least32_t cp)
-{
- if (likely(cp <= 0x10FFFF)) {
- return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) &
- 0x1F /* 00011111 */;
- } else {
- return BIDI_PROP_L;
- }
-}
-
-static inline uint_least8_t
get_bidi_bracket_off(uint_least32_t cp)
{
if (likely(cp <= 0x10FFFF)) {
@@ -1340,20 +1375,35 @@ get_bidi_bracket_off(uint_least32_t cp)
}
static size_t
-preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_override override,
- uint_least32_t *buf, size_t buflen)
+preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_direction override,
+ uint_least32_t *buf, size_t buflen,
+ enum grapheme_bidirectional_direction *resolved)
{
- size_t bufoff, bufsize, lastparoff;
+ HERODOTUS_READER tmp;
+ size_t bufoff, bufsize, paragraph_len;
uint_least32_t cp;
+ uint_least8_t paragraph_level;
- if (buf == NULL) {
- for (; herodotus_read_codepoint(r, true, &cp) ==
- HERODOTUS_STATUS_SUCCESS;) {
- ;
+ /* determine length and level of the paragraph */
+ herodotus_reader_copy(r, &tmp);
+ for (; herodotus_read_codepoint(&tmp, true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS;) {
+ /* break on paragraph separator */
+ if (get_bidi_property(cp) == BIDI_PROP_B) {
+ break;
}
+ }
+ paragraph_len = herodotus_reader_number_read(&tmp);
+ paragraph_level = get_paragraph_level(override, r);
+
+ if (resolved != NULL) {
+ /* store resolved paragraph level in output variable */
+ *resolved = paragraph_level;
+ }
+ if (buf == NULL) {
/* see below for return value reasoning */
- return herodotus_reader_number_read(r);
+ return paragraph_len;
}
/*
@@ -1361,6 +1411,7 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirection…
* and store them in the buffer
*/
for (bufoff = 0;
+ bufoff < paragraph_len &&
herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCES…
bufoff++) {
if (bufoff < buflen) {
@@ -1385,7 +1436,7 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirection…
}
bufsize = herodotus_reader_number_read(r);
- for (bufoff = 0, lastparoff = 0; bufoff < bufsize; bufoff++) {
+ for (bufoff = 0; bufoff < bufsize; bufoff++) {
if (get_state(STATE_PROP, buf[bufoff]) != BIDI_PROP_B &&
bufoff != bufsize - 1) {
continue;
@@ -1398,9 +1449,8 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirection…
* the terminating character or last character of the
* string respectively
*/
- preprocess_paragraph(override, buf + lastparoff,
- bufoff + 1 - lastparoff);
- lastparoff = bufoff + 1;
+ preprocess_paragraph(paragraph_level, buf, bufoff + 1);
+ break;
}
/*
@@ -1411,50 +1461,41 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirecti…
}
size_t
-grapheme_bidirectional_preprocess(const uint_least32_t *src, size_t srclen,
- enum grapheme_bidirectional_override overrid…
- uint_least32_t *dest, size_t destlen)
+grapheme_bidirectional_preprocess_paragraph(
+ const uint_least32_t *src, size_t srclen,
+ enum grapheme_bidirectional_direction override, uint_least32_t *dest,
+ size_t destlen, enum grapheme_bidirectional_direction *resolved)
{
HERODOTUS_READER r;
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
- return preprocess(&r, override, dest, destlen);
+ return preprocess(&r, override, dest, destlen, resolved);
}
-size_t
-grapheme_bidirectional_preprocess_utf8(
- const char *src, size_t srclen,
- enum grapheme_bidirectional_override override, uint_least32_t *dest,
- size_t destlen)
-{
- HERODOTUS_READER r;
-
- herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
-
- return preprocess(&r, override, dest, destlen);
-}
-
-size_t
-grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedat…
- size_t linelen,
- int_least8_t *lev,
- size_t levlen)
+static inline size_t
+get_line_embedding_levels(const uint_least32_t *linedata, size_t linelen,
+ int_least8_t (*get_level)(const void *, size_t),
+ void (*set_level)(void *, size_t, int_least8_t),
+ void *lev, size_t levsize, bool skipignored)
{
enum bidi_property prop;
- size_t i, runsince;
- int_least8_t level;
+ size_t i, levlen, runsince;
+ int_least8_t level, runlevel;
/* rule L1.4 */
runsince = SIZE_MAX;
- for (i = 0; i < linelen; i++) {
+ for (i = 0, levlen = 0; i < linelen; i++) {
level = (int_least8_t)get_state(STATE_LEVEL, linedata[i]);
prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP,
linedata[i]);
/* write level into level array if we still have space */
- if (i < levlen) {
- lev[i] = level;
+ if (level != -1 || skipignored == false) {
+ if (levlen <= levsize) {
+ set_level(lev, levlen, level);
+ }
+ levlen++;
}
if (level == -1) {
@@ -1467,11 +1508,14 @@ grapheme_bidirectional_get_line_embedding_levels(const …
prop == BIDI_PROP_PDI) {
if (runsince == SIZE_MAX) {
/* a new run has begun */
- runsince = i;
+ runsince = levlen - 1; /* levlen > 0 */
+ runlevel = get_state(STATE_PARAGRAPH_LEVEL,
+ linedata[i]);
}
} else {
/* sequence ended */
runsince = SIZE_MAX;
+ runlevel = -1;
}
}
if (runsince != SIZE_MAX) {
@@ -1479,13 +1523,207 @@ grapheme_bidirectional_get_line_embedding_levels(const…
* we hit the end of the line but were in a run;
* reset the line levels to the paragraph level
*/
- for (i = runsince; i < MIN(linelen, levlen); i++) {
- if (lev[i] != -1) {
- lev[i] = (int_least8_t)get_state(
- STATE_PARAGRAPH_LEVEL, linedata[i]);
+ for (i = runsince; i < MIN(linelen, levsize); i++) {
+ if (get_level(lev, i) != -1) {
+ set_level(lev, i, runlevel);
+ }
+ }
+ }
+
+ return levlen;
+}
+
+static inline int_least8_t
+get_level_int8(const void *lev, size_t off)
+{
+ return ((int_least8_t *)lev)[off];
+}
+
+static inline void
+set_level_int8(void *lev, size_t off, int_least8_t value)
+{
+ ((int_least8_t *)lev)[off] = value;
+}
+
+size_t
+grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedat…
+ size_t linelen,
+ int_least8_t *lev,
+ size_t levlen)
+{
+ return get_line_embedding_levels(linedata, linelen, get_level_int8,
+ set_level_int8, lev, levlen, false);
+}
+
+static inline int_least8_t
+get_level_uint32(const void *lev, size_t off)
+{
+ return (int_least8_t)((((uint_least32_t *)lev)[off] &
+ UINT32_C(0x1FE00000)) >>
+ 21) -
+ 1;
+}
+
+static inline void
+set_level_uint32(void *lev, size_t off, int_least8_t value)
+{
+ ((uint_least32_t *)lev)[off] ^=
+ ((uint_least32_t *)lev)[off] & UINT32_C(0x1FE00000);
+ ((uint_least32_t *)lev)[off] |= ((uint_least32_t)(value + 1)) << 21;
+}
+
+static inline int_least16_t
+get_mirror_offset(uint_least32_t cp)
+{
+ if (cp <= UINT32_C(0x10FFFF)) {
+ return mirror_minor[mirror_major[cp >> 8] + (cp & 0xFF)];
+ } else {
+ return 0;
+ }
+}
+
+size_t
+grapheme_bidirectional_reorder_line(const uint_least32_t *line,
+ const uint_least32_t *linedata,
+ size_t linelen, uint_least32_t *output,
+ size_t outputsize)
+{
+ size_t i, outputlen, first, last, j, k, l, laststart;
+ int_least8_t level, min_odd_level = MAX_DEPTH + 2, max_level = 0;
+ uint_least32_t tmp;
+
+ /* write output characters */
+ for (i = 0, outputlen = 0; i < linelen; i++) {
+ if (get_state(STATE_LEVEL, linedata[i]) != -1) {
+ if (outputlen < outputsize) {
+ output[outputlen] = line[i];
+ }
+ outputlen++;
+ }
+ }
+ if (outputlen >= outputsize) {
+ /* clear output buffer */
+ for (i = 0; i < outputsize; i++) {
+ output[i] = GRAPHEME_INVALID_CODEPOINT;
+ }
+
+ /* return required size */
+ return outputlen;
+ }
+
+ /*
+ * write line embedding levels as metadata and codepoints into the
+ * output
+ */
+ get_line_embedding_levels(linedata, linelen, get_level_uint32,
+ set_level_uint32, output, linelen, true);
+
+ /* determine level range */
+ for (i = 0; i < outputlen; i++) {
+ level = get_level_uint32(output, i);
+
+ if (level == -1) {
+ /* ignored character */
+ continue;
+ }
+
+ if (level % 2 == 1 && level < min_odd_level) {
+ min_odd_level = level;
+ }
+ if (level > max_level) {
+ max_level = level;
+ }
+ }
+
+ for (level = max_level; level >= min_odd_level /* > 0 */; level--) {
+ for (i = 0; i < outputlen; i++) {
+ if (get_level_uint32(output, i) >= level) {
+ /*
+ * the current character has the desired level
+ */
+ first = last = i;
+
+ /* find the end of the level-sequence */
+ for (i++; i < outputlen; i++) {
+ if (get_level_uint32(output, i) >=
+ level) {
+ /* the sequence continues */
+ last = i;
+ } else {
+ break;
+ }
+ }
+
+ /* invert the sequence first..last respecting
+ * grapheme clusters
+ *
+ * The standard only speaks of combining marks
+ * inversion, but we should in the perfect case
+ * respect _all_ grapheme clusters, which we do
+ * here!
+ */
+
+ /* mark grapheme cluster breaks */
+ for (j = first; j <= last;
+ j += grapheme_next_character_break(
+ line + j, outputlen - j)) {
+ /*
+ * we use a special trick here: The
+ * first 21 bits of the state are fill…
+ * with the codepoint, the next 8 bits
+ * are used for the level, so we can u…
+ * the 30th bit to mark the grapheme
+ * cluster breaks. This allows us to
+ * reinvert the grapheme clusters into
+ * the proper direction later.
+ */
+ output[j] |= UINT32_C(1) << 29;
+ }
+
+ /* global inversion */
+ for (k = first, l = last; k < l; k++, l--) {
+ /* swap */
+ tmp = output[k];
+ output[k] = output[l];
+ output[l] = tmp;
+ }
+
+ /* grapheme cluster reinversion */
+#if 0
+ for (j = first, laststart = first; j <= last;
+ j++) {
+ if (output[j] & (UINT32_C(1) << 29)) {
+ /* we hit a mark! given the
+ * grapheme cluster is inverte…
+ * this means that the cluster
+ * ended and we now reinvert it
+ * again
+ */
+ for (k = laststart, l = j;
+ k < l; k++, l--) {
+ /* swap */
+ tmp = output[k];
+ output[k] = output[l];
+ output[l] = tmp;
+ }
+ laststart = j + 1;
+ }
+ }
+#endif
+
+ /* unmark grapheme cluster breaks */
+ for (j = first; j <= last; j++) {
+ output[j] ^= output[j] &
+ UINT32_C(0x20000000);
+ }
}
}
}
- return linelen;
+ /* remove embedding level metadata */
+ for (i = 0; i < outputlen; i++) {
+ output[i] ^= output[i] & UINT32_C(0x1FE00000);
+ }
+
+ return outputlen;
}
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.