GopherProxy

	Add "proper"-property-reader - libgrapheme - unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log
	Files
	Refs
	README
	LICENSE
	---
	commit b899fd685c50cbc61999296ce1e0a03a45e74f52
	parent a4d42053f13e8471ee3903522f964fc0a1d3161a
	Author: Laslo Hunhold <[email protected]>
	Date: Sun, 2 Oct 2022 21:09:08 +0200

	Add "proper"-property-reader

	The word- and sentence-segmentation algorithms make use of a complicated
	logic to accomodate "raw" and "skip" properties. The code is barely
	readable and doesn't separate abstractions away nicely. Moreover, there
	is a high probability that certain edge-cases are not handled properly.

	To fix this, this commit adds a "proper"-property-reader, which
	basically does the whole dirty details in the background using
	well-commented and transparent code that builds on top of the
	herodotus-reader instead of doing this by hand. This ensures that we
	will (provably) never have buffer overflows unless there is a mistake
	in the implementation itself, which can be verified relatively easily
	given each function has a limited scope.

	Signed-off-by: Laslo Hunhold <[email protected]>

	Diffstat:
	M src/case.c \| 25 ++++++++++++-------------
	M src/util.c \| 159 +++++++++++++++++++++++++++++…
	M src/util.h \| 23 +++++++++++++++++++++++

	3 files changed, 190 insertions(+), 17 deletions(-)
	---
	diff --git a/src/case.c b/src/case.c
	@@ -147,18 +147,14 @@ to_case(HERODOTUS_READER r, HERODOTUS_WRITER w,
	static size_t
	herodotus_next_word_break(const HERODOTUS_READER *r)
	{
	- if (r->src == NULL \|\| r->off > r->srclen) {
	- return 0;
	- }
	+ HERODOTUS_READER tmp;
	+
	+ herodotus_reader_copy(r, &tmp);

	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
	- return grapheme_next_word_break(
	- ((const uint_least32_t *)(r->src)) + r->off,
	- r->srclen - r->off);
	+ return grapheme_next_word_break(tmp.src, tmp.srclen);
	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
	- return grapheme_next_word_break_utf8(
	- ((const char *)(r->src)) + r->off,
	- r->srclen - r->off);
	+ return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
	}
	}

	@@ -168,9 +164,10 @@ to_titlecase(HERODOTUS_READER r, HERODOTUS_WRITER w)
	enum case_property prop;
	enum herodotus_status s;
	uint_least32_t cp;
	+ size_t nwb;

	- for (;;) {
	- herodotus_reader_push_advance_limit(r, herodotus_next_word_bre…
	+ for (; (nwb = herodotus_next_word_break(r)) > 0;) {
	+ herodotus_reader_push_advance_limit(r, nwb);
	for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODO…
	/* check if we have a cased character */
	prop = get_case_property(cp);
	@@ -354,9 +351,10 @@ is_titlecase(HERODOTUS_READER r, size_t output)
	enum herodotus_status s;
	bool ret = true;
	uint_least32_t cp;
	+ size_t nwb;

	- for (;;) {
	- herodotus_reader_push_advance_limit(r, herodotus_next_word_bre…
	+ for (; (nwb = herodotus_next_word_break(r)) > 0;) {
	+ herodotus_reader_push_advance_limit(r, nwb);
	for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODO…
	/* check if we have a cased character */
	prop = get_case_property(cp);
	@@ -377,6 +375,7 @@ is_titlecase(HERODOTUS_READER r, size_t output)
	* we did not encounter any cased character
	* up to the word break
	*/
	+ herodotus_reader_pop_limit(r);
	continue;
	} else {
	/*
	diff --git a/src/util.c b/src/util.c
	@@ -30,14 +30,31 @@ herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTU…
	{
	size_t i;

	+ /*
	+ * we copy such that we have a "fresh" start and build
	+ * on the fact that src->soft_limit[i] for any i and src->srclen
	+ * are always larger or equal to src->off
	+ */
	dest->type = src->type;
	- dest->src = src->src;
	- dest->srclen = src->srclen;
	- dest->off = src->off;
	+ if (src->type == HERODOTUS_TYPE_CODEPOINT) {
	+ dest->src = ((const uint_least32_t *)(src->src)) + src->off;
	+ } else { /* src->type == HERODOTUS_TYPE_UTF8 */
	+ dest->src = ((const char *)(src->src)) + src->off;
	+ }
	+ if (src->srclen == SIZE_MAX) {
	+ dest->srclen = SIZE_MAX;
	+ } else {
	+ dest->srclen = src->srclen - src->off;
	+ }
	+ dest->off = 0;
	dest->terminated_by_null = src->terminated_by_null;

	for (i = 0; i < LEN(src->soft_limit); i++) {
	- dest->soft_limit[i] = src->soft_limit[i];
	+ if (src->soft_limit[i] == SIZE_MAX) {
	+ dest->soft_limit[i] = src->soft_limit[i];
	+ } else {
	+ dest->soft_limit[i] = src->soft_limit[i] - src->off;
	+ }
	}
	}

	@@ -258,6 +275,140 @@ herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least…
	}
	}

	+void
	+proper_init(const HERODOTUS_READER r, void state, uint_least8_t no_prop,
	+ uint_least8_t (*get_break_prop)(uint_least32_t),
	+ bool (*is_skippable_prop)(uint_least8_t),
	+ void (skip_shift_callback)(uint_least8_t, void ),
	+ struct proper *p)
	+{
	+ uint_least8_t prop;
	+ uint_least32_t cp;
	+ size_t i;
	+
	+ /* set internal variables */
	+ p->state = state;
	+ p->no_prop = no_prop;
	+ p->get_break_prop = get_break_prop;
	+ p->is_skippable_prop = is_skippable_prop;
	+ p->skip_shift_callback = skip_shift_callback;
	+
	+ /*
	+ * Initialize mid-reader, which is basically just there
	+ * to reflect the current position of the viewing-line
	+ */
	+ herodotus_reader_copy(r, &(p->mid_reader));
	+
	+ /*
	+ * In the initialization, we simply (try to) fill in next_prop.
	+ * If we cannot read in more (due to the buffer ending), we
	+ * fill in the prop as invalid
	+ */
	+
	+ /*
	+ * initialize the previous properties to have no property
	+ * (given we are at the start of the buffer)
	+ */
	+ p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
	+ p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
	+
	+ /*
	+ * initialize the next properties
	+ */
	+
	+ /* initialize the raw reader */
	+ herodotus_reader_copy(r, &(p->raw_reader));
	+
	+ /* fill in the two next raw properties (after no-initialization) */
	+ p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
	+ for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, …
	+ HERODOTUS_STATUS_SUCCESS; ) {
	+ p->raw.next_prop[i++] = p->get_break_prop(cp);
	+ }
	+
	+ /* initialize the skip reader */
	+ herodotus_reader_copy(r, &(p->skip_reader));
	+
	+ /* fill in the two next skip properties (after no-initialization) */
	+ p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
	+ for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true,…
	+ HERODOTUS_STATUS_SUCCESS; ) {
	+ prop = p->get_break_prop(cp);
	+ if (!p->is_skippable_prop(prop)) {
	+ p->skip.next_prop[i++] = prop;
	+ }
	+ }
	+}
	+
	+int
	+proper_advance(struct proper *p)
	+{
	+ uint_least8_t prop;
	+ uint_least32_t cp;
	+
	+ /* read in next "raw" property */
	+ if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
	+ HERODOTUS_STATUS_SUCCESS) {
	+ prop = p->get_break_prop(cp);
	+ } else {
	+ prop = p->no_prop;
	+ }
	+
	+ /*
	+ * do a shift-in, unless we find that the property that is to
	+ * be moved past the "raw-viewing-line" (this property is stored
	+ * in p->raw.next_prop[0]) is a no_prop, indicating that
	+ * we are at the end of the buffer.
	+ */
	+ if (p->raw.next_prop[0] == p->no_prop) {
	+ return 1;
	+ }
	+
	+ /* shift in the properties */
	+ p->raw.prev_prop[1] = p->raw.prev_prop[0];
	+ p->raw.prev_prop[0] = p->raw.next_prop[0];
	+ p->raw.next_prop[0] = p->raw.next_prop[1];
	+ p->raw.next_prop[1] = prop;
	+
	+ /* advance the middle reader viewing-line */
	+ (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
	+
	+ /* check skippability-property */
	+ if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
	+ /*
	+ * the property that has moved past the "raw-viewing-line"
	+ * (this property is now (after the raw-shift) stored in
	+ * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
	+ * guaranteeing that we won't shift a no-prop past the
	+ * "viewing-line" in the skip-properties) is not a skippable
	+ * property, thus we need to shift the skip property as well.
	+ */
	+ p->skip.prev_prop[1] = p->skip.prev_prop[0];
	+ p->skip.prev_prop[0] = p->skip.next_prop[0];
	+ p->skip.next_prop[0] = p->skip.next_prop[1];
	+
	+ /*
	+ * call the skip-shift-callback on the property that
	+ * passed the skip-viewing-line (this property is now
	+ * stored in p->skip.prev_prop[0]).
	+ */
	+ p->skip_shift_callback(p->skip.prev_prop[0], p->state);
	+
	+ /* determine the next shift property */
	+ p->skip.next_prop[1] = p->no_prop;
	+ while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) …
	+ HERODOTUS_STATUS_SUCCESS) {
	+ prop = p->get_break_prop(cp);
	+ if (!p->is_skippable_prop(prop)) {
	+ p->skip.next_prop[1] = prop;
	+ break;
	+ }
	+ }
	+ }
	+
	+ return 0;
	+}
	+
	inline size_t
	get_codepoint(const void str, size_t len, size_t offset, uint_least32_t cp)
	{
	diff --git a/src/util.h b/src/util.h
	@@ -74,6 +74,22 @@ typedef struct herodotus_writer {
	size_t first_unwritable_offset;
	} HERODOTUS_WRITER;

	+struct proper {
	+ /*
	+ * prev_prop[1] prev_prop[0] \| next_prop[0] next_prop[1]
	+ */
	+ struct {
	+ uint_least8_t prev_prop[2];
	+ uint_least8_t next_prop[2];
	+ } raw, skip;
	+ HERODOTUS_READER mid_reader, raw_reader, skip_reader;
	+ void *state;
	+ uint_least8_t no_prop;
	+ uint_least8_t (*get_break_prop)(uint_least32_t);
	+ bool (*is_skippable_prop)(uint_least8_t);
	+ void (skip_shift_callback)(uint_least8_t, void );
	+};
	+
	void herodotus_reader_init(HERODOTUS_READER *, enum herodotus_type,
	const void *, size_t);
	void herodotus_reader_copy(const HERODOTUS_READER , HERODOTUS_READER );
	@@ -90,6 +106,13 @@ void herodotus_writer_nul_terminate(HERODOTUS_WRITER *);
	size_t herodotus_writer_number_written(const HERODOTUS_WRITER *);
	void herodotus_write_codepoint(HERODOTUS_WRITER *, uint_least32_t);

	+void proper_init(const HERODOTUS_READER , void , uint_least8_t,
	+ uint_least8_t (*get_break_prop)(uint_least32_t),
	+ bool (*is_skippable_prop)(uint_least8_t),
	+ void (skip_shift_callback)(uint_least8_t, void ),
	+ struct proper *);
	+int proper_advance(struct proper *);
	+
	size_t get_codepoint(const void , size_t, size_t, uint_least32_t );
	size_t get_codepoint_utf8(const void , size_t, size_t, uint_least32_t );