GopherProxy

	Refactor sentence-functions with Proper (using Herodotus in the background) - l…
	git clone git://git.suckless.org/libgrapheme
	Log
	Files
	Refs
	README
	LICENSE
	---
	commit a5b1b0c0c7bc1576b5893175b27585fa963f4433
	parent 52b0e29e02068d6a8123042ef901f73e37b2f38f
	Author: Laslo Hunhold <[email protected]>
	Date: Sun, 2 Oct 2022 22:05:11 +0200

	Refactor sentence-functions with Proper (using Herodotus in the background)

	This refactor was a breeze and it passed all conformance tests on the
	first try. This, just like with the word-functions, leads to a massive
	simplification and separation of concerns in the code. And as with the
	word functions, this fixes some known quirks.

	Signed-off-by: Laslo Hunhold <[email protected]>

	Diffstat:
	M src/sentence.c \| 426 +++++++++++++----------------…

	1 file changed, 181 insertions(+), 245 deletions(-)
	---
	diff --git a/src/sentence.c b/src/sentence.c
	@@ -6,11 +6,17 @@
	#include "../grapheme.h"
	#include "util.h"

	-static inline enum sentence_break_property
	-get_break_prop(uint_least32_t cp)
	+struct sentence_break_state
	+{
	+ uint_least8_t aterm_close_sp_level;
	+ uint_least8_t saterm_close_sp_parasep_level;
	+};
	+
	+static inline uint_least8_t
	+get_sentence_break_prop(uint_least32_t cp)
	{
	if (likely(cp <= 0x10FFFF)) {
	- return (enum sentence_break_property)
	+ return (uint_least8_t)
	sentence_break_minor[sentence_break_major[cp >> 8] +
	(cp & 0xff)];
	} else {
	@@ -18,243 +24,157 @@ get_break_prop(uint_least32_t cp)
	}
	}

	-static size_t
	-next_sentence_break(const void str, size_t len, size_t (get_codepoint)
	- (const void , size_t, size_t, uint_least32_t ))
	+static bool
	+is_skippable_sentence_prop(uint_least8_t prop)
	{
	- struct {
	- enum sentence_break_property a, b, c, d;
	- } raw, skip;
	- enum sentence_break_property res;
	- uint_least32_t cp;
	- uint_least8_t aterm_close_sp_level = 0,
	- saterm_close_sp_parasep_level = 0;
	- size_t off, tmp, new_off;
	+ return prop == SENTENCE_BREAK_PROP_EXTEND \|\|
	+ prop == SENTENCE_BREAK_PROP_FORMAT;
	+}

	- /* check degenerate cases */
	- if (str == NULL \|\| len == 0) {
	- return 0;
	- }
	+static void
	+sentence_skip_shift_callback(uint_least8_t prop, void *s)
	+{
	+ struct sentence_break_state state = (struct sentence_break_state )s;

	/*
	- * Apply sentence breaking algorithm (UAX #29), see
	- * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
	+ * Here comes a bit of magic. The rules
	+ * SB8, SB8a, SB9 and SB10 have very complicated
	+ * left-hand-side-rules of the form
	*
	- * There are 4 slots (a, b, c, d) of "break" properties and
	- * we check if there is a break in the middle between b and c.
	+ * ATerm Close* Sp*
	+ * SATerm Close*
	+ * SATerm Close* Sp*
	+ * SATerm Close* Sp* ParaSep?
	*
	- * The position of this middle spot is determined by off,
	- * which gives the offset of the first element on the right
	- * hand side of said spot, or, in other words, gives the number
	- * of elements on the left hand side.
	+ * but instead of backtracking, we keep the
	+ * state as some kind of "power level" in
	+ * two state-variables
	*
	- * It is further complicated by the fact that the algorithm
	- * expects you to skip certain characters for the second
	- * half of the rules (after SB5). Thus, we do not only have
	- * the "raw" properties as described above, but also the "skip"
	- * properties, where the skip.a and skip.b, for instance,
	- * give the two preceding character properties behind the
	- * currently investigated breakpoint.
	+ * aterm_close_sp_level
	+ * saterm_close_sp_parasep_level
	+ *
	+ * that go from 0 to 3/4:
	+ *
	+ * 0: we are not in the sequence
	+ * 1: we have one ATerm/SATerm to the left of
	+ * the middle spot
	+ * 2: we have one ATerm/SATerm and one or more
	+ * Close to the left of the middle spot
	+ * 3: we have one ATerm/SATerm, zero or more
	+ * Close and one or more Sp to the left of
	+ * the middle spot.
	+ * 4: we have one SATerm, zero or more Close,
	+ * zero or more Sp and one ParaSep to the
	+ * left of the middle spot.
	*
	*/
	-
	- /*
	- * Initialize the different properties such that we have
	- * a good state after the state-update in the loop
	- */
	- raw.b = NUM_SENTENCE_BREAK_PROPS;
	- if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
	- /*
	- * A line is at least one codepoint long, so we can
	- * safely return here
	- */
	- return len;
	+ if ((state->aterm_close_sp_level == 0 \|\|
	+ state->aterm_close_sp_level == 1) &&
	+ prop == SENTENCE_BREAK_PROP_ATERM) {
	+ /* sequence has begun */
	+ state->aterm_close_sp_level = 1;
	+ } else if ((state->aterm_close_sp_level == 1 \|\|
	+ state->aterm_close_sp_level == 2) &&
	+ prop == SENTENCE_BREAK_PROP_CLOSE) {
	+ /* close-sequence begins or continued */
	+ state->aterm_close_sp_level = 2;
	+ } else if ((state->aterm_close_sp_level == 1 \|\|
	+ state->aterm_close_sp_level == 2 \|\|
	+ state->aterm_close_sp_level == 3) &&
	+ prop == SENTENCE_BREAK_PROP_SP) {
	+ /* sp-sequence begins or continued */
	+ state->aterm_close_sp_level = 3;
	+ } else {
	+ /* sequence broke */
	+ state->aterm_close_sp_level = 0;
	}
	- raw.c = get_break_prop(cp);
	- (void)get_codepoint(str, len, off, &cp);
	- raw.d = get_break_prop(cp);
	- skip.a = skip.b = NUM_SENTENCE_BREAK_PROPS;
	-
	- for (; off < len; off = new_off) {
	- /*
	- * Update left side (a and b) of the skip state by
	- * "shifting in" the raw.c property as long as it is
	- * not one of the "ignored" character properties.
	- * While at it, update the RI-counter.
	- *
	- */
	- if (raw.c != SENTENCE_BREAK_PROP_EXTEND &&
	- raw.c != SENTENCE_BREAK_PROP_FORMAT) {
	- skip.a = skip.b;
	- skip.b = raw.c;
	-
	- /*
	- * Here comes a bit of magic. The rules
	- * SB8, SB8a, SB9 and SB10 have very complicated
	- * left-hand-side-rules of the form
	- *
	- * ATerm Close* Sp*
	- * SATerm Close*
	- * SATerm Close* Sp*
	- * SATerm Close* Sp* ParaSep?
	- *
	- * but instead of backtracking, we keep the
	- * state as some kind of "power level" in
	- * two variables
	- *
	- * aterm_close_sp_level
	- * saterm_close_sp_parasep_level
	- *
	- * that go from 0 to 3/4:
	- *
	- * 0: we are not in the sequence
	- * 1: we have one ATerm/SATerm to the left of
	- * the middle spot
	- * 2: we have one ATerm/SATerm and one or more
	- * Close to the left of the middle spot
	- * 3: we have one ATerm/SATerm, zero or more
	- * Close and one or more Sp to the left of
	- * the middle spot.
	- * 4: we have one SATerm, zero or more Close,
	- * zero or more Sp and one ParaSep to the
	- * left of the middle spot.
	- *
	- */
	- if ((aterm_close_sp_level == 0 \|\|
	- aterm_close_sp_level == 1) &&
	- skip.b == SENTENCE_BREAK_PROP_ATERM) {
	- /* sequence has begun */
	- aterm_close_sp_level = 1;
	- } else if ((aterm_close_sp_level == 1 \|\|
	- aterm_close_sp_level == 2) &&
	- skip.b == SENTENCE_BREAK_PROP_CLOSE) {
	- /* close-sequence begins or continued */
	- aterm_close_sp_level = 2;
	- } else if ((aterm_close_sp_level == 1 \|\|
	- aterm_close_sp_level == 2 \|\|
	- aterm_close_sp_level == 3) &&
	- skip.b == SENTENCE_BREAK_PROP_SP) {
	- /* sp-sequence begins or continued */
	- aterm_close_sp_level = 3;
	- } else {
	- /* sequence broke */
	- aterm_close_sp_level = 0;
	- }

	- if ((saterm_close_sp_parasep_level == 0 \|\|
	- saterm_close_sp_parasep_level == 1) &&
	- (skip.b == SENTENCE_BREAK_PROP_STERM \|\|
	- skip.b == SENTENCE_BREAK_PROP_ATERM)) {
	- /* sequence has begun */
	- saterm_close_sp_parasep_level = 1;
	- } else if ((saterm_close_sp_parasep_level == 1 \|\|
	- saterm_close_sp_parasep_level == 2) &&
	- skip.b == SENTENCE_BREAK_PROP_CLOSE) {
	- /* close-sequence begins or continued */
	- saterm_close_sp_parasep_level = 2;
	- } else if ((saterm_close_sp_parasep_level == 1 \|\|
	- saterm_close_sp_parasep_level == 2 \|\|
	- saterm_close_sp_parasep_level == 3) &&
	- skip.b == SENTENCE_BREAK_PROP_SP) {
	- /* sp-sequence begins or continued */
	- saterm_close_sp_parasep_level = 3;
	- } else if ((saterm_close_sp_parasep_level == 1 \|\|
	- saterm_close_sp_parasep_level == 2 \|\|
	- saterm_close_sp_parasep_level == 3) &&
	- (skip.b == SENTENCE_BREAK_PROP_SEP \|\|
	- skip.b == SENTENCE_BREAK_PROP_CR \|\|
	- skip.b == SENTENCE_BREAK_PROP_LF)) {
	- /* ParaSep at the end of the sequence */
	- saterm_close_sp_parasep_level = 4;
	- } else {
	- /* sequence broke */
	- saterm_close_sp_parasep_level = 0;
	- }
	- }
	-
	- /*
	- * Update right side (b and c) of the skip state by
	- * starting at the breakpoint and detecting the two
	- * following non-ignored character classes
	- *
	- */
	- skip.c = NUM_SENTENCE_BREAK_PROPS;
	- for (tmp = off; tmp < len; ) {
	- tmp += get_codepoint(str, len, tmp, &cp);
	- res = get_break_prop(cp);
	-
	- if (res != SENTENCE_BREAK_PROP_EXTEND &&
	- res != SENTENCE_BREAK_PROP_FORMAT) {
	- skip.c = res;
	- break;
	- }
	- }
	- skip.d = NUM_SENTENCE_BREAK_PROPS;
	- for (; tmp < len; ) {
	- tmp += get_codepoint(str, len, tmp, &cp);
	- res = get_break_prop(cp);
	+ if ((state->saterm_close_sp_parasep_level == 0 \|\|
	+ state->saterm_close_sp_parasep_level == 1) &&
	+ (prop == SENTENCE_BREAK_PROP_STERM \|\|
	+ prop == SENTENCE_BREAK_PROP_ATERM)) {
	+ /* sequence has begun */
	+ state->saterm_close_sp_parasep_level = 1;
	+ } else if ((state->saterm_close_sp_parasep_level == 1 \|\|
	+ state->saterm_close_sp_parasep_level == 2) &&
	+ prop == SENTENCE_BREAK_PROP_CLOSE) {
	+ /* close-sequence begins or continued */
	+ state->saterm_close_sp_parasep_level = 2;
	+ } else if ((state->saterm_close_sp_parasep_level == 1 \|\|
	+ state->saterm_close_sp_parasep_level == 2 \|\|
	+ state->saterm_close_sp_parasep_level == 3) &&
	+ prop == SENTENCE_BREAK_PROP_SP) {
	+ /* sp-sequence begins or continued */
	+ state->saterm_close_sp_parasep_level = 3;
	+ } else if ((state->saterm_close_sp_parasep_level == 1 \|\|
	+ state->saterm_close_sp_parasep_level == 2 \|\|
	+ state->saterm_close_sp_parasep_level == 3) &&
	+ (prop == SENTENCE_BREAK_PROP_SEP \|\|
	+ prop == SENTENCE_BREAK_PROP_CR \|\|
	+ prop == SENTENCE_BREAK_PROP_LF)) {
	+ /* ParaSep at the end of the sequence */
	+ state->saterm_close_sp_parasep_level = 4;
	+ } else {
	+ /* sequence broke */
	+ state->saterm_close_sp_parasep_level = 0;
	+ }
	+}

	- if (res != SENTENCE_BREAK_PROP_EXTEND &&
	- res != SENTENCE_BREAK_PROP_FORMAT) {
	- skip.d = res;
	- break;
	- }
	- }
	+static size_t
	+next_sentence_break(HERODOTUS_READER *r)
	+{
	+ HERODOTUS_READER tmp;
	+ enum sentence_break_property prop;
	+ struct proper p;
	+ struct sentence_break_state state = { 0 };
	+ uint_least32_t cp;

	- /*
	- * Update the raw state by simply shifting everything
	- * in and, if we still have data left, determining
	- * the character class of the next codepoint.
	- *
	- */
	- raw.a = raw.b;
	- raw.b = raw.c;
	- raw.c = raw.d;
	- if ((new_off = off + get_codepoint(str, len, off, &cp)) < len)…
	- get_codepoint(str, len, new_off, &cp);
	- raw.d = get_break_prop(cp);
	- } else {
	- raw.d = NUM_SENTENCE_BREAK_PROPS;
	- }
	+ /*
	+ * Apply sentence breaking algorithm (UAX #29), see
	+ * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
	+ */
	+ proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
	+ get_sentence_break_prop, is_skippable_sentence_prop,
	+ sentence_skip_shift_callback, &p);

	+ while (!proper_advance(&p)) {
	/* SB3 */
	- if (raw.b == SENTENCE_BREAK_PROP_CR &&
	- raw.c == SENTENCE_BREAK_PROP_LF) {
	+ if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
	+ p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
	continue;
	}

	/* SB4 */
	- if (raw.b == SENTENCE_BREAK_PROP_SEP \|\|
	- raw.b == SENTENCE_BREAK_PROP_CR \|\|
	- raw.b == SENTENCE_BREAK_PROP_LF) {
	+ if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP \|\|
	+ p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR \|\|
	+ p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
	break;
	}

	/* SB5 */
	- if (raw.c == SENTENCE_BREAK_PROP_EXTEND \|\|
	- raw.c == SENTENCE_BREAK_PROP_FORMAT) {
	+ if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND \|\|
	+ p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
	continue;
	}

	/* SB6 */
	- if (skip.b == SENTENCE_BREAK_PROP_ATERM &&
	- skip.c == SENTENCE_BREAK_PROP_NUMERIC) {
	+ if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
	+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
	continue;
	}

	/* SB7 */
	- if (off > 1 &&
	- (skip.a == SENTENCE_BREAK_PROP_UPPER \|\|
	- skip.a == SENTENCE_BREAK_PROP_LOWER) &&
	- skip.b == SENTENCE_BREAK_PROP_ATERM &&
	- skip.c == SENTENCE_BREAK_PROP_UPPER) {
	+ if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER \|\|
	+ p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
	+ p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
	+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
	continue;
	}

	/* SB8 */
	- if (aterm_close_sp_level == 1 \|\|
	- aterm_close_sp_level == 2 \|\|
	- aterm_close_sp_level == 3) {
	+ if (state.aterm_close_sp_level == 1 \|\|
	+ state.aterm_close_sp_level == 2 \|\|
	+ state.aterm_close_sp_level == 3) {
	/*
	* This is the most complicated rule, requiring
	* the right-hand-side to satisfy the regular expressi…
	@@ -262,67 +182,75 @@ next_sentence_break(const void *str, size_t len, size_t (…
	* ( ¬(OLetter \| Upper \| Lower \| ParaSep \| SATerm) )…
	*
	* which we simply check "manually" given LUT-lookups
	- * are very cheap.
	+ * are very cheap by starting at the mid_reader.
	*
	*/
	- for (tmp = off, res = NUM_SENTENCE_BREAK_PROPS; tmp < …
	- tmp += get_codepoint(str, len, tmp, &cp);
	- res = get_break_prop(cp);
	+ herodotus_reader_copy(&(p.mid_reader), &tmp);
	+
	+ prop = NUM_SENTENCE_BREAK_PROPS;
	+ while (herodotus_read_codepoint(&tmp, true, &cp) ==
	+ HERODOTUS_STATUS_SUCCESS) {
	+ prop = get_sentence_break_prop(cp);

	- if (res == SENTENCE_BREAK_PROP_OLETTER \|\|
	- res == SENTENCE_BREAK_PROP_UPPER \|\|
	- res == SENTENCE_BREAK_PROP_LOWER \|\|
	- res == SENTENCE_BREAK_PROP_SEP \|\|
	- res == SENTENCE_BREAK_PROP_CR \|\|
	- res == SENTENCE_BREAK_PROP_LF \|\|
	- res == SENTENCE_BREAK_PROP_STERM \|\|
	- res == SENTENCE_BREAK_PROP_ATERM) {
	+ /*
	+ * the skippable properties are ignored
	+ * automatically here given they do not
	+ * match the following condition
	+ */
	+ if (prop == SENTENCE_BREAK_PROP_OLETTER \|\|
	+ prop == SENTENCE_BREAK_PROP_UPPER \|\|
	+ prop == SENTENCE_BREAK_PROP_LOWER \|\|
	+ prop == SENTENCE_BREAK_PROP_SEP \|\|
	+ prop == SENTENCE_BREAK_PROP_CR \|\|
	+ prop == SENTENCE_BREAK_PROP_LF \|\|
	+ prop == SENTENCE_BREAK_PROP_STERM \|\|
	+ prop == SENTENCE_BREAK_PROP_ATERM) {
	break;
	}
	}

	- if (res == SENTENCE_BREAK_PROP_LOWER) {
	+ if (prop == SENTENCE_BREAK_PROP_LOWER) {
	continue;
	}
	}

	/* SB8a */
	- if ((saterm_close_sp_parasep_level == 1 \|\|
	- saterm_close_sp_parasep_level == 2 \|\|
	- saterm_close_sp_parasep_level == 3) &&
	- (skip.c == SENTENCE_BREAK_PROP_SCONTINUE \|\|
	- skip.c == SENTENCE_BREAK_PROP_STERM \|\|
	- skip.c == SENTENCE_BREAK_PROP_ATERM)) {
	+ if ((state.saterm_close_sp_parasep_level == 1 \|\|
	+ state.saterm_close_sp_parasep_level == 2 \|\|
	+ state.saterm_close_sp_parasep_level == 3) &&
	+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE \|\|
	+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM \|\|
	+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
	continue;
	}

	/* SB9 */
	- if ((saterm_close_sp_parasep_level == 1 \|\|
	- saterm_close_sp_parasep_level == 2) &&
	- (skip.c == SENTENCE_BREAK_PROP_CLOSE \|\|
	- skip.c == SENTENCE_BREAK_PROP_SP \|\|
	- skip.c == SENTENCE_BREAK_PROP_SEP \|\|
	- skip.c == SENTENCE_BREAK_PROP_CR \|\|
	- skip.c == SENTENCE_BREAK_PROP_LF)) {
	+ if ((state.saterm_close_sp_parasep_level == 1 \|\|
	+ state.saterm_close_sp_parasep_level == 2) &&
	+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE \|\|
	+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP \|\|
	+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP \|\|
	+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR \|\|
	+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
	continue;
	}

	/* SB10 */
	- if ((saterm_close_sp_parasep_level == 1 \|\|
	- saterm_close_sp_parasep_level == 2 \|\|
	- saterm_close_sp_parasep_level == 3) &&
	- (skip.c == SENTENCE_BREAK_PROP_SP \|\|
	- skip.c == SENTENCE_BREAK_PROP_SEP \|\|
	- skip.c == SENTENCE_BREAK_PROP_CR \|\|
	- skip.c == SENTENCE_BREAK_PROP_LF)) {
	+ if ((state.saterm_close_sp_parasep_level == 1 \|\|
	+ state.saterm_close_sp_parasep_level == 2 \|\|
	+ state.saterm_close_sp_parasep_level == 3) &&
	+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP \|\|
	+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP \|\|
	+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR \|\|
	+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
	continue;
	}

	/* SB11 */
	- if (saterm_close_sp_parasep_level == 1 \|\|
	- saterm_close_sp_parasep_level == 2 \|\|
	- saterm_close_sp_parasep_level == 3 \|\|
	- saterm_close_sp_parasep_level == 4) {
	+ if (state.saterm_close_sp_parasep_level == 1 \|\|
	+ state.saterm_close_sp_parasep_level == 2 \|\|
	+ state.saterm_close_sp_parasep_level == 3 \|\|
	+ state.saterm_close_sp_parasep_level == 4) {
	break;
	}

	@@ -330,17 +258,25 @@ next_sentence_break(const void *str, size_t len, size_t (…
	continue;
	}

	- return off;
	+ return herodotus_reader_number_read(&(p.mid_reader));
	}

	size_t
	grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
	{
	- return next_sentence_break(str, len, get_codepoint);
	+ HERODOTUS_READER r;
	+
	+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
	+
	+ return next_sentence_break(&r);
	}

	size_t
	grapheme_next_sentence_break_utf8(const char *str, size_t len)
	{
	- return next_sentence_break(str, len, get_codepoint_utf8);
	+ HERODOTUS_READER r;
	+
	+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
	+
	+ return next_sentence_break(&r);
	}