Introduction
Introduction Statistics Contact Development Disclaimer Help
Refactor sentence-functions with Proper (using Herodotus in the background) - libgrapheme
git clone git://git.suckless.org/libgrapheme
Log
Files
Refs
README
LICENSE
---
commit a5b1b0c0c7bc1576b5893175b27585fa963f4433
parent 52b0e29e02068d6a8123042ef901f73e37b2f38f
Author: Laslo Hunhold <[email protected]>
Date: Sun, 2 Oct 2022 22:05:11 +0200
Refactor sentence-functions with Proper (using Herodotus in the background)
This refactor was a breeze and it passed all conformance tests on the
first try. This, just like with the word-functions, leads to a massive
simplification and separation of concerns in the code. And as with the
word-functions, this fixes some known quirks.
Signed-off-by: Laslo Hunhold <[email protected]>
Diffstat:
M src/sentence.c | 426 +++++++++++++----------------…
1 file changed, 181 insertions(+), 245 deletions(-)
---
diff --git a/src/sentence.c b/src/sentence.c
@@ -6,11 +6,17 @@
#include "../grapheme.h"
#include "util.h"
-static inline enum sentence_break_property
-get_break_prop(uint_least32_t cp)
+struct sentence_break_state
+{
+ uint_least8_t aterm_close_sp_level;
+ uint_least8_t saterm_close_sp_parasep_level;
+};
+
+static inline uint_least8_t
+get_sentence_break_prop(uint_least32_t cp)
{
if (likely(cp <= 0x10FFFF)) {
- return (enum sentence_break_property)
+ return (uint_least8_t)
sentence_break_minor[sentence_break_major[cp >> 8] +
(cp & 0xff)];
} else {
@@ -18,243 +24,157 @@ get_break_prop(uint_least32_t cp)
}
}
-static size_t
-next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
- (const void *, size_t, size_t, uint_least32_t *))
+static bool
+is_skippable_sentence_prop(uint_least8_t prop)
{
- struct {
- enum sentence_break_property a, b, c, d;
- } raw, skip;
- enum sentence_break_property res;
- uint_least32_t cp;
- uint_least8_t aterm_close_sp_level = 0,
- saterm_close_sp_parasep_level = 0;
- size_t off, tmp, new_off;
+ return prop == SENTENCE_BREAK_PROP_EXTEND ||
+ prop == SENTENCE_BREAK_PROP_FORMAT;
+}
- /* check degenerate cases */
- if (str == NULL || len == 0) {
- return 0;
- }
+static void
+sentence_skip_shift_callback(uint_least8_t prop, void *s)
+{
+ struct sentence_break_state *state = (struct sentence_break_state *)s;
/*
- * Apply sentence breaking algorithm (UAX #29), see
- * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
+ * Here comes a bit of magic. The rules
+ * SB8, SB8a, SB9 and SB10 have very complicated
+ * left-hand-side-rules of the form
*
- * There are 4 slots (a, b, c, d) of "break" properties and
- * we check if there is a break in the middle between b and c.
+ * ATerm Close* Sp*
+ * SATerm Close*
+ * SATerm Close* Sp*
+ * SATerm Close* Sp* ParaSep?
*
- * The position of this middle spot is determined by off,
- * which gives the offset of the first element on the right
- * hand side of said spot, or, in other words, gives the number
- * of elements on the left hand side.
+ * but instead of backtracking, we keep the
+ * state as some kind of "power level" in
+ * two state-variables
*
- * It is further complicated by the fact that the algorithm
- * expects you to skip certain characters for the second
- * half of the rules (after SB5). Thus, we do not only have
- * the "raw" properties as described above, but also the "skip"
- * properties, where the skip.a and skip.b, for instance,
- * give the two preceding character properties behind the
- * currently investigated breakpoint.
+ * aterm_close_sp_level
+ * saterm_close_sp_parasep_level
+ *
+ * that go from 0 to 3/4:
+ *
+ * 0: we are not in the sequence
+ * 1: we have one ATerm/SATerm to the left of
+ * the middle spot
+ * 2: we have one ATerm/SATerm and one or more
+ * Close to the left of the middle spot
+ * 3: we have one ATerm/SATerm, zero or more
+ * Close and one or more Sp to the left of
+ * the middle spot.
+ * 4: we have one SATerm, zero or more Close,
+ * zero or more Sp and one ParaSep to the
+ * left of the middle spot.
*
*/
-
- /*
- * Initialize the different properties such that we have
- * a good state after the state-update in the loop
- */
- raw.b = NUM_SENTENCE_BREAK_PROPS;
- if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
- /*
- * A line is at least one codepoint long, so we can
- * safely return here
- */
- return len;
+ if ((state->aterm_close_sp_level == 0 ||
+ state->aterm_close_sp_level == 1) &&
+ prop == SENTENCE_BREAK_PROP_ATERM) {
+ /* sequence has begun */
+ state->aterm_close_sp_level = 1;
+ } else if ((state->aterm_close_sp_level == 1 ||
+ state->aterm_close_sp_level == 2) &&
+ prop == SENTENCE_BREAK_PROP_CLOSE) {
+ /* close-sequence begins or continued */
+ state->aterm_close_sp_level = 2;
+ } else if ((state->aterm_close_sp_level == 1 ||
+ state->aterm_close_sp_level == 2 ||
+ state->aterm_close_sp_level == 3) &&
+ prop == SENTENCE_BREAK_PROP_SP) {
+ /* sp-sequence begins or continued */
+ state->aterm_close_sp_level = 3;
+ } else {
+ /* sequence broke */
+ state->aterm_close_sp_level = 0;
}
- raw.c = get_break_prop(cp);
- (void)get_codepoint(str, len, off, &cp);
- raw.d = get_break_prop(cp);
- skip.a = skip.b = NUM_SENTENCE_BREAK_PROPS;
-
- for (; off < len; off = new_off) {
- /*
- * Update left side (a and b) of the skip state by
- * "shifting in" the raw.c property as long as it is
- * not one of the "ignored" character properties.
- * While at it, update the RI-counter.
- *
- */
- if (raw.c != SENTENCE_BREAK_PROP_EXTEND &&
- raw.c != SENTENCE_BREAK_PROP_FORMAT) {
- skip.a = skip.b;
- skip.b = raw.c;
-
- /*
- * Here comes a bit of magic. The rules
- * SB8, SB8a, SB9 and SB10 have very complicated
- * left-hand-side-rules of the form
- *
- * ATerm Close* Sp*
- * SATerm Close*
- * SATerm Close* Sp*
- * SATerm Close* Sp* ParaSep?
- *
- * but instead of backtracking, we keep the
- * state as some kind of "power level" in
- * two variables
- *
- * aterm_close_sp_level
- * saterm_close_sp_parasep_level
- *
- * that go from 0 to 3/4:
- *
- * 0: we are not in the sequence
- * 1: we have one ATerm/SATerm to the left of
- * the middle spot
- * 2: we have one ATerm/SATerm and one or more
- * Close to the left of the middle spot
- * 3: we have one ATerm/SATerm, zero or more
- * Close and one or more Sp to the left of
- * the middle spot.
- * 4: we have one SATerm, zero or more Close,
- * zero or more Sp and one ParaSep to the
- * left of the middle spot.
- *
- */
- if ((aterm_close_sp_level == 0 ||
- aterm_close_sp_level == 1) &&
- skip.b == SENTENCE_BREAK_PROP_ATERM) {
- /* sequence has begun */
- aterm_close_sp_level = 1;
- } else if ((aterm_close_sp_level == 1 ||
- aterm_close_sp_level == 2) &&
- skip.b == SENTENCE_BREAK_PROP_CLOSE) {
- /* close-sequence begins or continued */
- aterm_close_sp_level = 2;
- } else if ((aterm_close_sp_level == 1 ||
- aterm_close_sp_level == 2 ||
- aterm_close_sp_level == 3) &&
- skip.b == SENTENCE_BREAK_PROP_SP) {
- /* sp-sequence begins or continued */
- aterm_close_sp_level = 3;
- } else {
- /* sequence broke */
- aterm_close_sp_level = 0;
- }
- if ((saterm_close_sp_parasep_level == 0 ||
- saterm_close_sp_parasep_level == 1) &&
- (skip.b == SENTENCE_BREAK_PROP_STERM ||
- skip.b == SENTENCE_BREAK_PROP_ATERM)) {
- /* sequence has begun */
- saterm_close_sp_parasep_level = 1;
- } else if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2) &&
- skip.b == SENTENCE_BREAK_PROP_CLOSE) {
- /* close-sequence begins or continued */
- saterm_close_sp_parasep_level = 2;
- } else if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3) &&
- skip.b == SENTENCE_BREAK_PROP_SP) {
- /* sp-sequence begins or continued */
- saterm_close_sp_parasep_level = 3;
- } else if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3) &&
- (skip.b == SENTENCE_BREAK_PROP_SEP ||
- skip.b == SENTENCE_BREAK_PROP_CR ||
- skip.b == SENTENCE_BREAK_PROP_LF)) {
- /* ParaSep at the end of the sequence */
- saterm_close_sp_parasep_level = 4;
- } else {
- /* sequence broke */
- saterm_close_sp_parasep_level = 0;
- }
- }
-
- /*
- * Update right side (b and c) of the skip state by
- * starting at the breakpoint and detecting the two
- * following non-ignored character classes
- *
- */
- skip.c = NUM_SENTENCE_BREAK_PROPS;
- for (tmp = off; tmp < len; ) {
- tmp += get_codepoint(str, len, tmp, &cp);
- res = get_break_prop(cp);
-
- if (res != SENTENCE_BREAK_PROP_EXTEND &&
- res != SENTENCE_BREAK_PROP_FORMAT) {
- skip.c = res;
- break;
- }
- }
- skip.d = NUM_SENTENCE_BREAK_PROPS;
- for (; tmp < len; ) {
- tmp += get_codepoint(str, len, tmp, &cp);
- res = get_break_prop(cp);
+ if ((state->saterm_close_sp_parasep_level == 0 ||
+ state->saterm_close_sp_parasep_level == 1) &&
+ (prop == SENTENCE_BREAK_PROP_STERM ||
+ prop == SENTENCE_BREAK_PROP_ATERM)) {
+ /* sequence has begun */
+ state->saterm_close_sp_parasep_level = 1;
+ } else if ((state->saterm_close_sp_parasep_level == 1 ||
+ state->saterm_close_sp_parasep_level == 2) &&
+ prop == SENTENCE_BREAK_PROP_CLOSE) {
+ /* close-sequence begins or continued */
+ state->saterm_close_sp_parasep_level = 2;
+ } else if ((state->saterm_close_sp_parasep_level == 1 ||
+ state->saterm_close_sp_parasep_level == 2 ||
+ state->saterm_close_sp_parasep_level == 3) &&
+ prop == SENTENCE_BREAK_PROP_SP) {
+ /* sp-sequence begins or continued */
+ state->saterm_close_sp_parasep_level = 3;
+ } else if ((state->saterm_close_sp_parasep_level == 1 ||
+ state->saterm_close_sp_parasep_level == 2 ||
+ state->saterm_close_sp_parasep_level == 3) &&
+ (prop == SENTENCE_BREAK_PROP_SEP ||
+ prop == SENTENCE_BREAK_PROP_CR ||
+ prop == SENTENCE_BREAK_PROP_LF)) {
+ /* ParaSep at the end of the sequence */
+ state->saterm_close_sp_parasep_level = 4;
+ } else {
+ /* sequence broke */
+ state->saterm_close_sp_parasep_level = 0;
+ }
+}
- if (res != SENTENCE_BREAK_PROP_EXTEND &&
- res != SENTENCE_BREAK_PROP_FORMAT) {
- skip.d = res;
- break;
- }
- }
+static size_t
+next_sentence_break(HERODOTUS_READER *r)
+{
+ HERODOTUS_READER tmp;
+ enum sentence_break_property prop;
+ struct proper p;
+ struct sentence_break_state state = { 0 };
+ uint_least32_t cp;
- /*
- * Update the raw state by simply shifting everything
- * in and, if we still have data left, determining
- * the character class of the next codepoint.
- *
- */
- raw.a = raw.b;
- raw.b = raw.c;
- raw.c = raw.d;
- if ((new_off = off + get_codepoint(str, len, off, &cp)) < len)…
- get_codepoint(str, len, new_off, &cp);
- raw.d = get_break_prop(cp);
- } else {
- raw.d = NUM_SENTENCE_BREAK_PROPS;
- }
+ /*
+ * Apply sentence breaking algorithm (UAX #29), see
+ * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
+ */
+ proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
+ get_sentence_break_prop, is_skippable_sentence_prop,
+ sentence_skip_shift_callback, &p);
+ while (!proper_advance(&p)) {
/* SB3 */
- if (raw.b == SENTENCE_BREAK_PROP_CR &&
- raw.c == SENTENCE_BREAK_PROP_LF) {
+ if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
+ p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
continue;
}
/* SB4 */
- if (raw.b == SENTENCE_BREAK_PROP_SEP ||
- raw.b == SENTENCE_BREAK_PROP_CR ||
- raw.b == SENTENCE_BREAK_PROP_LF) {
+ if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
+ p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR ||
+ p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
break;
}
/* SB5 */
- if (raw.c == SENTENCE_BREAK_PROP_EXTEND ||
- raw.c == SENTENCE_BREAK_PROP_FORMAT) {
+ if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
+ p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
continue;
}
/* SB6 */
- if (skip.b == SENTENCE_BREAK_PROP_ATERM &&
- skip.c == SENTENCE_BREAK_PROP_NUMERIC) {
+ if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
continue;
}
/* SB7 */
- if (off > 1 &&
- (skip.a == SENTENCE_BREAK_PROP_UPPER ||
- skip.a == SENTENCE_BREAK_PROP_LOWER) &&
- skip.b == SENTENCE_BREAK_PROP_ATERM &&
- skip.c == SENTENCE_BREAK_PROP_UPPER) {
+ if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
+ p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
+ p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
continue;
}
/* SB8 */
- if (aterm_close_sp_level == 1 ||
- aterm_close_sp_level == 2 ||
- aterm_close_sp_level == 3) {
+ if (state.aterm_close_sp_level == 1 ||
+ state.aterm_close_sp_level == 2 ||
+ state.aterm_close_sp_level == 3) {
/*
* This is the most complicated rule, requiring
* the right-hand-side to satisfy the regular expression
@@ -262,67 +182,75 @@ next_sentence_break(const void *str, size_t len, size_t (…
* ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
*
* which we simply check "manually" given LUT-lookups
- * are very cheap.
+ * are very cheap by starting at the mid_reader.
*
*/
- for (tmp = off, res = NUM_SENTENCE_BREAK_PROPS; tmp < …
- tmp += get_codepoint(str, len, tmp, &cp);
- res = get_break_prop(cp);
+ herodotus_reader_copy(&(p.mid_reader), &tmp);
+
+ prop = NUM_SENTENCE_BREAK_PROPS;
+ while (herodotus_read_codepoint(&tmp, true, &cp) ==
+ HERODOTUS_STATUS_SUCCESS) {
+ prop = get_sentence_break_prop(cp);
- if (res == SENTENCE_BREAK_PROP_OLETTER ||
- res == SENTENCE_BREAK_PROP_UPPER ||
- res == SENTENCE_BREAK_PROP_LOWER ||
- res == SENTENCE_BREAK_PROP_SEP ||
- res == SENTENCE_BREAK_PROP_CR ||
- res == SENTENCE_BREAK_PROP_LF ||
- res == SENTENCE_BREAK_PROP_STERM ||
- res == SENTENCE_BREAK_PROP_ATERM) {
+ /*
+ * the skippable properties are ignored
+ * automatically here given they do not
+ * match the following condition
+ */
+ if (prop == SENTENCE_BREAK_PROP_OLETTER ||
+ prop == SENTENCE_BREAK_PROP_UPPER ||
+ prop == SENTENCE_BREAK_PROP_LOWER ||
+ prop == SENTENCE_BREAK_PROP_SEP ||
+ prop == SENTENCE_BREAK_PROP_CR ||
+ prop == SENTENCE_BREAK_PROP_LF ||
+ prop == SENTENCE_BREAK_PROP_STERM ||
+ prop == SENTENCE_BREAK_PROP_ATERM) {
break;
}
}
- if (res == SENTENCE_BREAK_PROP_LOWER) {
+ if (prop == SENTENCE_BREAK_PROP_LOWER) {
continue;
}
}
/* SB8a */
- if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3) &&
- (skip.c == SENTENCE_BREAK_PROP_SCONTINUE ||
- skip.c == SENTENCE_BREAK_PROP_STERM ||
- skip.c == SENTENCE_BREAK_PROP_ATERM)) {
+ if ((state.saterm_close_sp_parasep_level == 1 ||
+ state.saterm_close_sp_parasep_level == 2 ||
+ state.saterm_close_sp_parasep_level == 3) &&
+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
continue;
}
/* SB9 */
- if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2) &&
- (skip.c == SENTENCE_BREAK_PROP_CLOSE ||
- skip.c == SENTENCE_BREAK_PROP_SP ||
- skip.c == SENTENCE_BREAK_PROP_SEP ||
- skip.c == SENTENCE_BREAK_PROP_CR ||
- skip.c == SENTENCE_BREAK_PROP_LF)) {
+ if ((state.saterm_close_sp_parasep_level == 1 ||
+ state.saterm_close_sp_parasep_level == 2) &&
+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
continue;
}
/* SB10 */
- if ((saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3) &&
- (skip.c == SENTENCE_BREAK_PROP_SP ||
- skip.c == SENTENCE_BREAK_PROP_SEP ||
- skip.c == SENTENCE_BREAK_PROP_CR ||
- skip.c == SENTENCE_BREAK_PROP_LF)) {
+ if ((state.saterm_close_sp_parasep_level == 1 ||
+ state.saterm_close_sp_parasep_level == 2 ||
+ state.saterm_close_sp_parasep_level == 3) &&
+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
continue;
}
/* SB11 */
- if (saterm_close_sp_parasep_level == 1 ||
- saterm_close_sp_parasep_level == 2 ||
- saterm_close_sp_parasep_level == 3 ||
- saterm_close_sp_parasep_level == 4) {
+ if (state.saterm_close_sp_parasep_level == 1 ||
+ state.saterm_close_sp_parasep_level == 2 ||
+ state.saterm_close_sp_parasep_level == 3 ||
+ state.saterm_close_sp_parasep_level == 4) {
break;
}
@@ -330,17 +258,25 @@ next_sentence_break(const void *str, size_t len, size_t (…
continue;
}
- return off;
+ return herodotus_reader_number_read(&(p.mid_reader));
}
size_t
grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
{
- return next_sentence_break(str, len, get_codepoint);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
+
+ return next_sentence_break(&r);
}
size_t
grapheme_next_sentence_break_utf8(const char *str, size_t len)
{
- return next_sentence_break(str, len, get_codepoint_utf8);
+ HERODOTUS_READER r;
+
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
+
+ return next_sentence_break(&r);
}
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.