Add "proper"-property-reader - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit b899fd685c50cbc61999296ce1e0a03a45e74f52 | |
parent a4d42053f13e8471ee3903522f964fc0a1d3161a | |
Author: Laslo Hunhold <[email protected]> | |
Date: Sun, 2 Oct 2022 21:09:08 +0200 | |
Add "proper"-property-reader | |
The word- and sentence-segmentation algorithms make use of a complicated | |
logic to accommodate "raw" and "skip" properties. The code is barely | |
readable and doesn't separate abstractions away nicely. Moreover, there | |
is a high probability that certain edge-cases are not handled properly. | |
To fix this, this commit adds a "proper"-property-reader, which | |
basically handles all the dirty details in the background using | |
well-commented and transparent code that builds on top of the | |
herodotus-reader instead of doing this by hand. This ensures that we | |
will (provably) never have buffer overflows unless there is a mistake | |
in the implementation itself, which can be verified relatively easily | |
given each function has a limited scope. | |
Signed-off-by: Laslo Hunhold <[email protected]> | |
Diffstat: | |
M src/case.c | 25 ++++++++++++------------- | |
M src/util.c | 159 +++++++++++++++++++++++++++++… | |
M src/util.h | 23 +++++++++++++++++++++++ | |
3 files changed, 190 insertions(+), 17 deletions(-) | |
--- | |
diff --git a/src/case.c b/src/case.c | |
@@ -147,18 +147,14 @@ to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w, | |
static size_t | |
herodotus_next_word_break(const HERODOTUS_READER *r) | |
{ | |
- if (r->src == NULL || r->off > r->srclen) { | |
- return 0; | |
- } | |
+ HERODOTUS_READER tmp; | |
+ | |
+ herodotus_reader_copy(r, &tmp); | |
if (r->type == HERODOTUS_TYPE_CODEPOINT) { | |
- return grapheme_next_word_break( | |
- ((const uint_least32_t *)(r->src)) + r->off, | |
- r->srclen - r->off); | |
+ return grapheme_next_word_break(tmp.src, tmp.srclen); | |
} else { /* r->type == HERODOTUS_TYPE_UTF8 */ | |
- return grapheme_next_word_break_utf8( | |
- ((const char *)(r->src)) + r->off, | |
- r->srclen - r->off); | |
+ return grapheme_next_word_break_utf8(tmp.src, tmp.srclen); | |
} | |
} | |
@@ -168,9 +164,10 @@ to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w) | |
enum case_property prop; | |
enum herodotus_status s; | |
uint_least32_t cp; | |
+ size_t nwb; | |
- for (;;) { | |
- herodotus_reader_push_advance_limit(r, herodotus_next_word_bre… | |
+ for (; (nwb = herodotus_next_word_break(r)) > 0;) { | |
+ herodotus_reader_push_advance_limit(r, nwb); | |
for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODO… | |
/* check if we have a cased character */ | |
prop = get_case_property(cp); | |
@@ -354,9 +351,10 @@ is_titlecase(HERODOTUS_READER *r, size_t *output) | |
enum herodotus_status s; | |
bool ret = true; | |
uint_least32_t cp; | |
+ size_t nwb; | |
- for (;;) { | |
- herodotus_reader_push_advance_limit(r, herodotus_next_word_bre… | |
+ for (; (nwb = herodotus_next_word_break(r)) > 0;) { | |
+ herodotus_reader_push_advance_limit(r, nwb); | |
for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODO… | |
/* check if we have a cased character */ | |
prop = get_case_property(cp); | |
@@ -377,6 +375,7 @@ is_titlecase(HERODOTUS_READER *r, size_t *output) | |
* we did not encounter any cased character | |
* up to the word break | |
*/ | |
+ herodotus_reader_pop_limit(r); | |
continue; | |
} else { | |
/* | |
diff --git a/src/util.c b/src/util.c | |
@@ -30,14 +30,31 @@ herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTU… | |
{ | |
size_t i; | |
+ /* | |
+ * we copy such that we have a "fresh" start and build | |
+ * on the fact that src->soft_limit[i] for any i and src->srclen | |
+ * are always larger or equal to src->off | |
+ */ | |
dest->type = src->type; | |
- dest->src = src->src; | |
- dest->srclen = src->srclen; | |
- dest->off = src->off; | |
+ if (src->type == HERODOTUS_TYPE_CODEPOINT) { | |
+ dest->src = ((const uint_least32_t *)(src->src)) + src->off; | |
+ } else { /* src->type == HERODOTUS_TYPE_UTF8 */ | |
+ dest->src = ((const char *)(src->src)) + src->off; | |
+ } | |
+ if (src->srclen == SIZE_MAX) { | |
+ dest->srclen = SIZE_MAX; | |
+ } else { | |
+ dest->srclen = src->srclen - src->off; | |
+ } | |
+ dest->off = 0; | |
dest->terminated_by_null = src->terminated_by_null; | |
for (i = 0; i < LEN(src->soft_limit); i++) { | |
- dest->soft_limit[i] = src->soft_limit[i]; | |
+ if (src->soft_limit[i] == SIZE_MAX) { | |
+ dest->soft_limit[i] = src->soft_limit[i]; | |
+ } else { | |
+ dest->soft_limit[i] = src->soft_limit[i] - src->off; | |
+ } | |
} | |
} | |
@@ -258,6 +275,140 @@ herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least… | |
} | |
} | |
+void | |
+proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop, | |
+ uint_least8_t (*get_break_prop)(uint_least32_t), | |
+ bool (*is_skippable_prop)(uint_least8_t), | |
+ void (*skip_shift_callback)(uint_least8_t, void *), | |
+ struct proper *p) | |
+{ | |
+ uint_least8_t prop; | |
+ uint_least32_t cp; | |
+ size_t i; | |
+ | |
+ /* set internal variables */ | |
+ p->state = state; | |
+ p->no_prop = no_prop; | |
+ p->get_break_prop = get_break_prop; | |
+ p->is_skippable_prop = is_skippable_prop; | |
+ p->skip_shift_callback = skip_shift_callback; | |
+ | |
+ /* | |
+ * Initialize mid-reader, which is basically just there | |
+ * to reflect the current position of the viewing-line | |
+ */ | |
+ herodotus_reader_copy(r, &(p->mid_reader)); | |
+ | |
+ /* | |
+ * In the initialization, we simply (try to) fill in next_prop. | |
+ * If we cannot read in more (due to the buffer ending), we | |
+ * fill in the prop as invalid | |
+ */ | |
+ | |
+ /* | |
+ * initialize the previous properties to have no property | |
+ * (given we are at the start of the buffer) | |
+ */ | |
+ p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop; | |
+ p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop; | |
+ | |
+ /* | |
+ * initialize the next properties | |
+ */ | |
+ | |
+ /* initialize the raw reader */ | |
+ herodotus_reader_copy(r, &(p->raw_reader)); | |
+ | |
+ /* fill in the two next raw properties (after no-initialization) */ | |
+ p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop; | |
+ for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, … | |
+ HERODOTUS_STATUS_SUCCESS; ) { | |
+ p->raw.next_prop[i++] = p->get_break_prop(cp); | |
+ } | |
+ | |
+ /* initialize the skip reader */ | |
+ herodotus_reader_copy(r, &(p->skip_reader)); | |
+ | |
+ /* fill in the two next skip properties (after no-initialization) */ | |
+ p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop; | |
+ for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true,… | |
+ HERODOTUS_STATUS_SUCCESS; ) { | |
+ prop = p->get_break_prop(cp); | |
+ if (!p->is_skippable_prop(prop)) { | |
+ p->skip.next_prop[i++] = prop; | |
+ } | |
+ } | |
+} | |
+ | |
+int | |
+proper_advance(struct proper *p) | |
+{ | |
+ uint_least8_t prop; | |
+ uint_least32_t cp; | |
+ | |
+ /* read in next "raw" property */ | |
+ if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) == | |
+ HERODOTUS_STATUS_SUCCESS) { | |
+ prop = p->get_break_prop(cp); | |
+ } else { | |
+ prop = p->no_prop; | |
+ } | |
+ | |
+ /* | |
+ * do a shift-in, unless we find that the property that is to | |
+ * be moved past the "raw-viewing-line" (this property is stored | |
+ * in p->raw.next_prop[0]) is a no_prop, indicating that | |
+ * we are at the end of the buffer. | |
+ */ | |
+ if (p->raw.next_prop[0] == p->no_prop) { | |
+ return 1; | |
+ } | |
+ | |
+ /* shift in the properties */ | |
+ p->raw.prev_prop[1] = p->raw.prev_prop[0]; | |
+ p->raw.prev_prop[0] = p->raw.next_prop[0]; | |
+ p->raw.next_prop[0] = p->raw.next_prop[1]; | |
+ p->raw.next_prop[1] = prop; | |
+ | |
+ /* advance the middle reader viewing-line */ | |
+ (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp); | |
+ | |
+ /* check skippability-property */ | |
+ if (!p->is_skippable_prop(p->raw.prev_prop[0])) { | |
+ /* | |
+ * the property that has moved past the "raw-viewing-line" | |
+ * (this property is now (after the raw-shift) stored in | |
+ * p->raw.prev_prop[0] and guaranteed not to be a no-prop, | |
+ * guaranteeing that we won't shift a no-prop past the | |
+ * "viewing-line" in the skip-properties) is not a skippable | |
+ * property, thus we need to shift the skip property as well. | |
+ */ | |
+ p->skip.prev_prop[1] = p->skip.prev_prop[0]; | |
+ p->skip.prev_prop[0] = p->skip.next_prop[0]; | |
+ p->skip.next_prop[0] = p->skip.next_prop[1]; | |
+ | |
+ /* | |
+ * call the skip-shift-callback on the property that | |
+ * passed the skip-viewing-line (this property is now | |
+ * stored in p->skip.prev_prop[0]). | |
+ */ | |
+ p->skip_shift_callback(p->skip.prev_prop[0], p->state); | |
+ | |
+ /* determine the next shift property */ | |
+ p->skip.next_prop[1] = p->no_prop; | |
+ while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) … | |
+ HERODOTUS_STATUS_SUCCESS) { | |
+ prop = p->get_break_prop(cp); | |
+ if (!p->is_skippable_prop(prop)) { | |
+ p->skip.next_prop[1] = prop; | |
+ break; | |
+ } | |
+ } | |
+ } | |
+ | |
+ return 0; | |
+} | |
+ | |
inline size_t | |
get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp) | |
{ | |
diff --git a/src/util.h b/src/util.h | |
@@ -74,6 +74,22 @@ typedef struct herodotus_writer { | |
size_t first_unwritable_offset; | |
} HERODOTUS_WRITER; | |
+struct proper { | |
+ /* | |
+ * prev_prop[1] prev_prop[0] | next_prop[0] next_prop[1] | |
+ */ | |
+ struct { | |
+ uint_least8_t prev_prop[2]; | |
+ uint_least8_t next_prop[2]; | |
+ } raw, skip; | |
+ HERODOTUS_READER mid_reader, raw_reader, skip_reader; | |
+ void *state; | |
+ uint_least8_t no_prop; | |
+ uint_least8_t (*get_break_prop)(uint_least32_t); | |
+ bool (*is_skippable_prop)(uint_least8_t); | |
+ void (*skip_shift_callback)(uint_least8_t, void *); | |
+}; | |
+ | |
void herodotus_reader_init(HERODOTUS_READER *, enum herodotus_type, | |
const void *, size_t); | |
void herodotus_reader_copy(const HERODOTUS_READER *, HERODOTUS_READER *); | |
@@ -90,6 +106,13 @@ void herodotus_writer_nul_terminate(HERODOTUS_WRITER *); | |
size_t herodotus_writer_number_written(const HERODOTUS_WRITER *); | |
void herodotus_write_codepoint(HERODOTUS_WRITER *, uint_least32_t); | |
+void proper_init(const HERODOTUS_READER *, void *, uint_least8_t, | |
+ uint_least8_t (*get_break_prop)(uint_least32_t), | |
+ bool (*is_skippable_prop)(uint_least8_t), | |
+ void (*skip_shift_callback)(uint_least8_t, void *), | |
+ struct proper *); | |
+int proper_advance(struct proper *); | |
+ | |
size_t get_codepoint(const void *, size_t, size_t, uint_least32_t *); | |
size_t get_codepoint_utf8(const void *, size_t, size_t, uint_least32_t *); | |