Introduction
Introduction Statistics Contact Development Disclaimer Help
Add "proper"-property-reader - libgrapheme - unicode string library
git clone git://git.suckless.org/libgrapheme
Log
Files
Refs
README
LICENSE
---
commit b899fd685c50cbc61999296ce1e0a03a45e74f52
parent a4d42053f13e8471ee3903522f964fc0a1d3161a
Author: Laslo Hunhold <[email protected]>
Date: Sun, 2 Oct 2022 21:09:08 +0200
Add "proper"-property-reader
The word- and sentence-segmentation algorithms make use of a complicated
logic to accomodate "raw" and "skip" properties. The code is barely
readable and doesn't separate abstractions away nicely. Moreover, there
is a high probability that certain edge-cases are not handled properly.
To fix this, this commit adds a "proper"-property-reader, which
basically does the whole dirty details in the background using
well-commented and transparent code that builds on top of the
herodotus-reader instead of doing this by hand. This ensures that we
will (provably) never have buffer overflows unless there is a mistake
in the implementation itself, which can be verified relatively easily
given each function has a limited scope.
Signed-off-by: Laslo Hunhold <[email protected]>
Diffstat:
M src/case.c | 25 ++++++++++++-------------
M src/util.c | 159 +++++++++++++++++++++++++++++…
M src/util.h | 23 +++++++++++++++++++++++
3 files changed, 190 insertions(+), 17 deletions(-)
---
diff --git a/src/case.c b/src/case.c
@@ -147,18 +147,14 @@ to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
static size_t
herodotus_next_word_break(const HERODOTUS_READER *r)
{
- if (r->src == NULL || r->off > r->srclen) {
- return 0;
- }
+ HERODOTUS_READER tmp;
+
+ herodotus_reader_copy(r, &tmp);
if (r->type == HERODOTUS_TYPE_CODEPOINT) {
- return grapheme_next_word_break(
- ((const uint_least32_t *)(r->src)) + r->off,
- r->srclen - r->off);
+ return grapheme_next_word_break(tmp.src, tmp.srclen);
} else { /* r->type == HERODOTUS_TYPE_UTF8 */
- return grapheme_next_word_break_utf8(
- ((const char *)(r->src)) + r->off,
- r->srclen - r->off);
+ return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
}
}
@@ -168,9 +164,10 @@ to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
enum case_property prop;
enum herodotus_status s;
uint_least32_t cp;
+ size_t nwb;
- for (;;) {
- herodotus_reader_push_advance_limit(r, herodotus_next_word_bre…
+ for (; (nwb = herodotus_next_word_break(r)) > 0;) {
+ herodotus_reader_push_advance_limit(r, nwb);
for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODO…
/* check if we have a cased character */
prop = get_case_property(cp);
@@ -354,9 +351,10 @@ is_titlecase(HERODOTUS_READER *r, size_t *output)
enum herodotus_status s;
bool ret = true;
uint_least32_t cp;
+ size_t nwb;
- for (;;) {
- herodotus_reader_push_advance_limit(r, herodotus_next_word_bre…
+ for (; (nwb = herodotus_next_word_break(r)) > 0;) {
+ herodotus_reader_push_advance_limit(r, nwb);
for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODO…
/* check if we have a cased character */
prop = get_case_property(cp);
@@ -377,6 +375,7 @@ is_titlecase(HERODOTUS_READER *r, size_t *output)
* we did not encounter any cased character
* up to the word break
*/
+ herodotus_reader_pop_limit(r);
continue;
} else {
/*
diff --git a/src/util.c b/src/util.c
@@ -30,14 +30,31 @@ herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTU…
{
size_t i;
+ /*
+ * we copy such that we have a "fresh" start and build
+ * on the fact that src->soft_limit[i] for any i and src->srclen
+ * are always larger or equal to src->off
+ */
dest->type = src->type;
- dest->src = src->src;
- dest->srclen = src->srclen;
- dest->off = src->off;
+ if (src->type == HERODOTUS_TYPE_CODEPOINT) {
+ dest->src = ((const uint_least32_t *)(src->src)) + src->off;
+ } else { /* src->type == HERODOTUS_TYPE_UTF8 */
+ dest->src = ((const char *)(src->src)) + src->off;
+ }
+ if (src->srclen == SIZE_MAX) {
+ dest->srclen = SIZE_MAX;
+ } else {
+ dest->srclen = src->srclen - src->off;
+ }
+ dest->off = 0;
dest->terminated_by_null = src->terminated_by_null;
for (i = 0; i < LEN(src->soft_limit); i++) {
- dest->soft_limit[i] = src->soft_limit[i];
+ if (src->soft_limit[i] == SIZE_MAX) {
+ dest->soft_limit[i] = src->soft_limit[i];
+ } else {
+ dest->soft_limit[i] = src->soft_limit[i] - src->off;
+ }
}
}
@@ -258,6 +275,140 @@ herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least…
}
}
+void
+proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
+ uint_least8_t (*get_break_prop)(uint_least32_t),
+ bool (*is_skippable_prop)(uint_least8_t),
+ void (*skip_shift_callback)(uint_least8_t, void *),
+ struct proper *p)
+{
+ uint_least8_t prop;
+ uint_least32_t cp;
+ size_t i;
+
+ /* set internal variables */
+ p->state = state;
+ p->no_prop = no_prop;
+ p->get_break_prop = get_break_prop;
+ p->is_skippable_prop = is_skippable_prop;
+ p->skip_shift_callback = skip_shift_callback;
+
+ /*
+ * Initialize mid-reader, which is basically just there
+ * to reflect the current position of the viewing-line
+ */
+ herodotus_reader_copy(r, &(p->mid_reader));
+
+ /*
+ * In the initialization, we simply (try to) fill in next_prop.
+ * If we cannot read in more (due to the buffer ending), we
+ * fill in the prop as invalid
+ */
+
+ /*
+ * initialize the previous properties to have no property
+ * (given we are at the start of the buffer)
+ */
+ p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
+ p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
+
+ /*
+ * initialize the next properties
+ */
+
+ /* initialize the raw reader */
+ herodotus_reader_copy(r, &(p->raw_reader));
+
+ /* fill in the two next raw properties (after no-initialization) */
+ p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
+ for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, …
+ HERODOTUS_STATUS_SUCCESS; ) {
+ p->raw.next_prop[i++] = p->get_break_prop(cp);
+ }
+
+ /* initialize the skip reader */
+ herodotus_reader_copy(r, &(p->skip_reader));
+
+ /* fill in the two next skip properties (after no-initialization) */
+ p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
+ for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true,…
+ HERODOTUS_STATUS_SUCCESS; ) {
+ prop = p->get_break_prop(cp);
+ if (!p->is_skippable_prop(prop)) {
+ p->skip.next_prop[i++] = prop;
+ }
+ }
+}
+
+int
+proper_advance(struct proper *p)
+{
+ uint_least8_t prop;
+ uint_least32_t cp;
+
+ /* read in next "raw" property */
+ if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS) {
+ prop = p->get_break_prop(cp);
+ } else {
+ prop = p->no_prop;
+ }
+
+ /*
+ * do a shift-in, unless we find that the property that is to
+ * be moved past the "raw-viewing-line" (this property is stored
+ * in p->raw.next_prop[0]) is a no_prop, indicating that
+ * we are at the end of the buffer.
+ */
+ if (p->raw.next_prop[0] == p->no_prop) {
+ return 1;
+ }
+
+ /* shift in the properties */
+ p->raw.prev_prop[1] = p->raw.prev_prop[0];
+ p->raw.prev_prop[0] = p->raw.next_prop[0];
+ p->raw.next_prop[0] = p->raw.next_prop[1];
+ p->raw.next_prop[1] = prop;
+
+ /* advance the middle reader viewing-line */
+ (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
+
+ /* check skippability-property */
+ if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
+ /*
+ * the property that has moved past the "raw-viewing-line"
+ * (this property is now (after the raw-shift) stored in
+ * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
+ * guaranteeing that we won't shift a no-prop past the
+ * "viewing-line" in the skip-properties) is not a skippable
+ * property, thus we need to shift the skip property as well.
+ */
+ p->skip.prev_prop[1] = p->skip.prev_prop[0];
+ p->skip.prev_prop[0] = p->skip.next_prop[0];
+ p->skip.next_prop[0] = p->skip.next_prop[1];
+
+ /*
+ * call the skip-shift-callback on the property that
+ * passed the skip-viewing-line (this property is now
+ * stored in p->skip.prev_prop[0]).
+ */
+ p->skip_shift_callback(p->skip.prev_prop[0], p->state);
+
+ /* determine the next shift property */
+ p->skip.next_prop[1] = p->no_prop;
+ while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) …
+ HERODOTUS_STATUS_SUCCESS) {
+ prop = p->get_break_prop(cp);
+ if (!p->is_skippable_prop(prop)) {
+ p->skip.next_prop[1] = prop;
+ break;
+ }
+ }
+ }
+
+ return 0;
+}
+
inline size_t
get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp)
{
diff --git a/src/util.h b/src/util.h
@@ -74,6 +74,22 @@ typedef struct herodotus_writer {
size_t first_unwritable_offset;
} HERODOTUS_WRITER;
+struct proper {
+ /*
+ * prev_prop[1] prev_prop[0] | next_prop[0] next_prop[1]
+ */
+ struct {
+ uint_least8_t prev_prop[2];
+ uint_least8_t next_prop[2];
+ } raw, skip;
+ HERODOTUS_READER mid_reader, raw_reader, skip_reader;
+ void *state;
+ uint_least8_t no_prop;
+ uint_least8_t (*get_break_prop)(uint_least32_t);
+ bool (*is_skippable_prop)(uint_least8_t);
+ void (*skip_shift_callback)(uint_least8_t, void *);
+};
+
void herodotus_reader_init(HERODOTUS_READER *, enum herodotus_type,
const void *, size_t);
void herodotus_reader_copy(const HERODOTUS_READER *, HERODOTUS_READER *);
@@ -90,6 +106,13 @@ void herodotus_writer_nul_terminate(HERODOTUS_WRITER *);
size_t herodotus_writer_number_written(const HERODOTUS_WRITER *);
void herodotus_write_codepoint(HERODOTUS_WRITER *, uint_least32_t);
+void proper_init(const HERODOTUS_READER *, void *, uint_least8_t,
+ uint_least8_t (*get_break_prop)(uint_least32_t),
+ bool (*is_skippable_prop)(uint_least8_t),
+ void (*skip_shift_callback)(uint_least8_t, void *),
+ struct proper *);
+int proper_advance(struct proper *);
+
size_t get_codepoint(const void *, size_t, size_t, uint_least32_t *);
size_t get_codepoint_utf8(const void *, size_t, size_t, uint_least32_t *);
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.