Refactor sentence-functions with Proper (using Herodotus in the background) - l… | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit a5b1b0c0c7bc1576b5893175b27585fa963f4433 | |
parent 52b0e29e02068d6a8123042ef901f73e37b2f38f | |
Author: Laslo Hunhold <[email protected]> | |
Date: Sun, 2 Oct 2022 22:05:11 +0200 | |
Refactor sentence-functions with Proper (using Herodotus in the background) | |
This refactor was a breeze and it passed all conformance tests on the | |
first try. This, just like with the word-functions, leads to a massive | |
simplification and separation of concerns in the code. And as with the | |
word-functions, this fixes some known quirks. | |
Signed-off-by: Laslo Hunhold <[email protected]> | |
Diffstat: | |
M src/sentence.c | 426 +++++++++++++----------------… | |
1 file changed, 181 insertions(+), 245 deletions(-) | |
--- | |
diff --git a/src/sentence.c b/src/sentence.c | |
@@ -6,11 +6,17 @@ | |
#include "../grapheme.h" | |
#include "util.h" | |
-static inline enum sentence_break_property | |
-get_break_prop(uint_least32_t cp) | |
+struct sentence_break_state | |
+{ | |
+ uint_least8_t aterm_close_sp_level; | |
+ uint_least8_t saterm_close_sp_parasep_level; | |
+}; | |
+ | |
+static inline uint_least8_t | |
+get_sentence_break_prop(uint_least32_t cp) | |
{ | |
if (likely(cp <= 0x10FFFF)) { | |
- return (enum sentence_break_property) | |
+ return (uint_least8_t) | |
sentence_break_minor[sentence_break_major[cp >> 8] + | |
(cp & 0xff)]; | |
} else { | |
@@ -18,243 +24,157 @@ get_break_prop(uint_least32_t cp) | |
} | |
} | |
-static size_t | |
-next_sentence_break(const void *str, size_t len, size_t (*get_codepoint) | |
- (const void *, size_t, size_t, uint_least32_t *)) | |
+static bool | |
+is_skippable_sentence_prop(uint_least8_t prop) | |
{ | |
- struct { | |
- enum sentence_break_property a, b, c, d; | |
- } raw, skip; | |
- enum sentence_break_property res; | |
- uint_least32_t cp; | |
- uint_least8_t aterm_close_sp_level = 0, | |
- saterm_close_sp_parasep_level = 0; | |
- size_t off, tmp, new_off; | |
+ return prop == SENTENCE_BREAK_PROP_EXTEND || | |
+ prop == SENTENCE_BREAK_PROP_FORMAT; | |
+} | |
- /* check degenerate cases */ | |
- if (str == NULL || len == 0) { | |
- return 0; | |
- } | |
+static void | |
+sentence_skip_shift_callback(uint_least8_t prop, void *s) | |
+{ | |
+ struct sentence_break_state *state = (struct sentence_break_state *)s; | |
/* | |
- * Apply sentence breaking algorithm (UAX #29), see | |
- * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules | |
+ * Here comes a bit of magic. The rules | |
+ * SB8, SB8a, SB9 and SB10 have very complicated | |
+ * left-hand-side-rules of the form | |
* | |
- * There are 4 slots (a, b, c, d) of "break" properties and | |
- * we check if there is a break in the middle between b and c. | |
+ * ATerm Close* Sp* | |
+ * SATerm Close* | |
+ * SATerm Close* Sp* | |
+ * SATerm Close* Sp* ParaSep? | |
* | |
- * The position of this middle spot is determined by off, | |
- * which gives the offset of the first element on the right | |
- * hand side of said spot, or, in other words, gives the number | |
- * of elements on the left hand side. | |
+ * but instead of backtracking, we keep the | |
+ * state as some kind of "power level" in | |
+ * two state-variables | |
* | |
- * It is further complicated by the fact that the algorithm | |
- * expects you to skip certain characters for the second | |
- * half of the rules (after SB5). Thus, we do not only have | |
- * the "raw" properties as described above, but also the "skip" | |
- * properties, where the skip.a and skip.b, for instance, | |
- * give the two preceding character properties behind the | |
- * currently investigated breakpoint. | |
+ * aterm_close_sp_level | |
+ * saterm_close_sp_parasep_level | |
+ * | |
+ * that go from 0 to 3/4: | |
+ * | |
+ * 0: we are not in the sequence | |
+ * 1: we have one ATerm/SATerm to the left of | |
+ * the middle spot | |
+ * 2: we have one ATerm/SATerm and one or more | |
+ * Close to the left of the middle spot | |
+ * 3: we have one ATerm/SATerm, zero or more | |
+ * Close and one or more Sp to the left of | |
+ * the middle spot. | |
+ * 4: we have one SATerm, zero or more Close, | |
+ * zero or more Sp and one ParaSep to the | |
+ * left of the middle spot. | |
* | |
*/ | |
- | |
- /* | |
- * Initialize the different properties such that we have | |
- * a good state after the state-update in the loop | |
- */ | |
- raw.b = NUM_SENTENCE_BREAK_PROPS; | |
- if ((off = get_codepoint(str, len, 0, &cp)) >= len) { | |
- /* | |
- * A line is at least one codepoint long, so we can | |
- * safely return here | |
- */ | |
- return len; | |
+ if ((state->aterm_close_sp_level == 0 || | |
+ state->aterm_close_sp_level == 1) && | |
+ prop == SENTENCE_BREAK_PROP_ATERM) { | |
+ /* sequence has begun */ | |
+ state->aterm_close_sp_level = 1; | |
+ } else if ((state->aterm_close_sp_level == 1 || | |
+ state->aterm_close_sp_level == 2) && | |
+ prop == SENTENCE_BREAK_PROP_CLOSE) { | |
+ /* close-sequence begins or continued */ | |
+ state->aterm_close_sp_level = 2; | |
+ } else if ((state->aterm_close_sp_level == 1 || | |
+ state->aterm_close_sp_level == 2 || | |
+ state->aterm_close_sp_level == 3) && | |
+ prop == SENTENCE_BREAK_PROP_SP) { | |
+ /* sp-sequence begins or continued */ | |
+ state->aterm_close_sp_level = 3; | |
+ } else { | |
+ /* sequence broke */ | |
+ state->aterm_close_sp_level = 0; | |
} | |
- raw.c = get_break_prop(cp); | |
- (void)get_codepoint(str, len, off, &cp); | |
- raw.d = get_break_prop(cp); | |
- skip.a = skip.b = NUM_SENTENCE_BREAK_PROPS; | |
- | |
- for (; off < len; off = new_off) { | |
- /* | |
- * Update left side (a and b) of the skip state by | |
- * "shifting in" the raw.c property as long as it is | |
- * not one of the "ignored" character properties. | |
- * While at it, update the RI-counter. | |
- * | |
- */ | |
- if (raw.c != SENTENCE_BREAK_PROP_EXTEND && | |
- raw.c != SENTENCE_BREAK_PROP_FORMAT) { | |
- skip.a = skip.b; | |
- skip.b = raw.c; | |
- | |
- /* | |
- * Here comes a bit of magic. The rules | |
- * SB8, SB8a, SB9 and SB10 have very complicated | |
- * left-hand-side-rules of the form | |
- * | |
- * ATerm Close* Sp* | |
- * SATerm Close* | |
- * SATerm Close* Sp* | |
- * SATerm Close* Sp* ParaSep? | |
- * | |
- * but instead of backtracking, we keep the | |
- * state as some kind of "power level" in | |
- * two variables | |
- * | |
- * aterm_close_sp_level | |
- * saterm_close_sp_parasep_level | |
- * | |
- * that go from 0 to 3/4: | |
- * | |
- * 0: we are not in the sequence | |
- * 1: we have one ATerm/SATerm to the left of | |
- * the middle spot | |
- * 2: we have one ATerm/SATerm and one or more | |
- * Close to the left of the middle spot | |
- * 3: we have one ATerm/SATerm, zero or more | |
- * Close and one or more Sp to the left of | |
- * the middle spot. | |
- * 4: we have one SATerm, zero or more Close, | |
- * zero or more Sp and one ParaSep to the | |
- * left of the middle spot. | |
- * | |
- */ | |
- if ((aterm_close_sp_level == 0 || | |
- aterm_close_sp_level == 1) && | |
- skip.b == SENTENCE_BREAK_PROP_ATERM) { | |
- /* sequence has begun */ | |
- aterm_close_sp_level = 1; | |
- } else if ((aterm_close_sp_level == 1 || | |
- aterm_close_sp_level == 2) && | |
- skip.b == SENTENCE_BREAK_PROP_CLOSE) { | |
- /* close-sequence begins or continued */ | |
- aterm_close_sp_level = 2; | |
- } else if ((aterm_close_sp_level == 1 || | |
- aterm_close_sp_level == 2 || | |
- aterm_close_sp_level == 3) && | |
- skip.b == SENTENCE_BREAK_PROP_SP) { | |
- /* sp-sequence begins or continued */ | |
- aterm_close_sp_level = 3; | |
- } else { | |
- /* sequence broke */ | |
- aterm_close_sp_level = 0; | |
- } | |
- if ((saterm_close_sp_parasep_level == 0 || | |
- saterm_close_sp_parasep_level == 1) && | |
- (skip.b == SENTENCE_BREAK_PROP_STERM || | |
- skip.b == SENTENCE_BREAK_PROP_ATERM)) { | |
- /* sequence has begun */ | |
- saterm_close_sp_parasep_level = 1; | |
- } else if ((saterm_close_sp_parasep_level == 1 || | |
- saterm_close_sp_parasep_level == 2) && | |
- skip.b == SENTENCE_BREAK_PROP_CLOSE) { | |
- /* close-sequence begins or continued */ | |
- saterm_close_sp_parasep_level = 2; | |
- } else if ((saterm_close_sp_parasep_level == 1 || | |
- saterm_close_sp_parasep_level == 2 || | |
- saterm_close_sp_parasep_level == 3) && | |
- skip.b == SENTENCE_BREAK_PROP_SP) { | |
- /* sp-sequence begins or continued */ | |
- saterm_close_sp_parasep_level = 3; | |
- } else if ((saterm_close_sp_parasep_level == 1 || | |
- saterm_close_sp_parasep_level == 2 || | |
- saterm_close_sp_parasep_level == 3) && | |
- (skip.b == SENTENCE_BREAK_PROP_SEP || | |
- skip.b == SENTENCE_BREAK_PROP_CR || | |
- skip.b == SENTENCE_BREAK_PROP_LF)) { | |
- /* ParaSep at the end of the sequence */ | |
- saterm_close_sp_parasep_level = 4; | |
- } else { | |
- /* sequence broke */ | |
- saterm_close_sp_parasep_level = 0; | |
- } | |
- } | |
- | |
- /* | |
- * Update right side (b and c) of the skip state by | |
- * starting at the breakpoint and detecting the two | |
- * following non-ignored character classes | |
- * | |
- */ | |
- skip.c = NUM_SENTENCE_BREAK_PROPS; | |
- for (tmp = off; tmp < len; ) { | |
- tmp += get_codepoint(str, len, tmp, &cp); | |
- res = get_break_prop(cp); | |
- | |
- if (res != SENTENCE_BREAK_PROP_EXTEND && | |
- res != SENTENCE_BREAK_PROP_FORMAT) { | |
- skip.c = res; | |
- break; | |
- } | |
- } | |
- skip.d = NUM_SENTENCE_BREAK_PROPS; | |
- for (; tmp < len; ) { | |
- tmp += get_codepoint(str, len, tmp, &cp); | |
- res = get_break_prop(cp); | |
+ if ((state->saterm_close_sp_parasep_level == 0 || | |
+ state->saterm_close_sp_parasep_level == 1) && | |
+ (prop == SENTENCE_BREAK_PROP_STERM || | |
+ prop == SENTENCE_BREAK_PROP_ATERM)) { | |
+ /* sequence has begun */ | |
+ state->saterm_close_sp_parasep_level = 1; | |
+ } else if ((state->saterm_close_sp_parasep_level == 1 || | |
+ state->saterm_close_sp_parasep_level == 2) && | |
+ prop == SENTENCE_BREAK_PROP_CLOSE) { | |
+ /* close-sequence begins or continued */ | |
+ state->saterm_close_sp_parasep_level = 2; | |
+ } else if ((state->saterm_close_sp_parasep_level == 1 || | |
+ state->saterm_close_sp_parasep_level == 2 || | |
+ state->saterm_close_sp_parasep_level == 3) && | |
+ prop == SENTENCE_BREAK_PROP_SP) { | |
+ /* sp-sequence begins or continued */ | |
+ state->saterm_close_sp_parasep_level = 3; | |
+ } else if ((state->saterm_close_sp_parasep_level == 1 || | |
+ state->saterm_close_sp_parasep_level == 2 || | |
+ state->saterm_close_sp_parasep_level == 3) && | |
+ (prop == SENTENCE_BREAK_PROP_SEP || | |
+ prop == SENTENCE_BREAK_PROP_CR || | |
+ prop == SENTENCE_BREAK_PROP_LF)) { | |
+ /* ParaSep at the end of the sequence */ | |
+ state->saterm_close_sp_parasep_level = 4; | |
+ } else { | |
+ /* sequence broke */ | |
+ state->saterm_close_sp_parasep_level = 0; | |
+ } | |
+} | |
- if (res != SENTENCE_BREAK_PROP_EXTEND && | |
- res != SENTENCE_BREAK_PROP_FORMAT) { | |
- skip.d = res; | |
- break; | |
- } | |
- } | |
+static size_t | |
+next_sentence_break(HERODOTUS_READER *r) | |
+{ | |
+ HERODOTUS_READER tmp; | |
+ enum sentence_break_property prop; | |
+ struct proper p; | |
+ struct sentence_break_state state = { 0 }; | |
+ uint_least32_t cp; | |
- /* | |
- * Update the raw state by simply shifting everything | |
- * in and, if we still have data left, determining | |
- * the character class of the next codepoint. | |
- * | |
- */ | |
- raw.a = raw.b; | |
- raw.b = raw.c; | |
- raw.c = raw.d; | |
- if ((new_off = off + get_codepoint(str, len, off, &cp)) < len)… | |
- get_codepoint(str, len, new_off, &cp); | |
- raw.d = get_break_prop(cp); | |
- } else { | |
- raw.d = NUM_SENTENCE_BREAK_PROPS; | |
- } | |
+ /* | |
+ * Apply sentence breaking algorithm (UAX #29), see | |
+ * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules | |
+ */ | |
+ proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS, | |
+ get_sentence_break_prop, is_skippable_sentence_prop, | |
+ sentence_skip_shift_callback, &p); | |
+ while (!proper_advance(&p)) { | |
/* SB3 */ | |
- if (raw.b == SENTENCE_BREAK_PROP_CR && | |
- raw.c == SENTENCE_BREAK_PROP_LF) { | |
+ if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR && | |
+ p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) { | |
continue; | |
} | |
/* SB4 */ | |
- if (raw.b == SENTENCE_BREAK_PROP_SEP || | |
- raw.b == SENTENCE_BREAK_PROP_CR || | |
- raw.b == SENTENCE_BREAK_PROP_LF) { | |
+ if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP || | |
+ p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR || | |
+ p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) { | |
break; | |
} | |
/* SB5 */ | |
- if (raw.c == SENTENCE_BREAK_PROP_EXTEND || | |
- raw.c == SENTENCE_BREAK_PROP_FORMAT) { | |
+ if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND || | |
+ p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) { | |
continue; | |
} | |
/* SB6 */ | |
- if (skip.b == SENTENCE_BREAK_PROP_ATERM && | |
- skip.c == SENTENCE_BREAK_PROP_NUMERIC) { | |
+ if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM && | |
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) { | |
continue; | |
} | |
/* SB7 */ | |
- if (off > 1 && | |
- (skip.a == SENTENCE_BREAK_PROP_UPPER || | |
- skip.a == SENTENCE_BREAK_PROP_LOWER) && | |
- skip.b == SENTENCE_BREAK_PROP_ATERM && | |
- skip.c == SENTENCE_BREAK_PROP_UPPER) { | |
+ if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER || | |
+ p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) && | |
+ p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM && | |
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) { | |
continue; | |
} | |
/* SB8 */ | |
- if (aterm_close_sp_level == 1 || | |
- aterm_close_sp_level == 2 || | |
- aterm_close_sp_level == 3) { | |
+ if (state.aterm_close_sp_level == 1 || | |
+ state.aterm_close_sp_level == 2 || | |
+ state.aterm_close_sp_level == 3) { | |
/* | |
* This is the most complicated rule, requiring | |
* the right-hand-side to satisfy the regular expressi… | |
@@ -262,67 +182,75 @@ next_sentence_break(const void *str, size_t len, size_t (… | |
* ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )… | |
* | |
* which we simply check "manually" given LUT-lookups | |
- * are very cheap. | |
+ * are very cheap by starting at the mid_reader. | |
* | |
*/ | |
- for (tmp = off, res = NUM_SENTENCE_BREAK_PROPS; tmp < … | |
- tmp += get_codepoint(str, len, tmp, &cp); | |
- res = get_break_prop(cp); | |
+ herodotus_reader_copy(&(p.mid_reader), &tmp); | |
+ | |
+ prop = NUM_SENTENCE_BREAK_PROPS; | |
+ while (herodotus_read_codepoint(&tmp, true, &cp) == | |
+ HERODOTUS_STATUS_SUCCESS) { | |
+ prop = get_sentence_break_prop(cp); | |
- if (res == SENTENCE_BREAK_PROP_OLETTER || | |
- res == SENTENCE_BREAK_PROP_UPPER || | |
- res == SENTENCE_BREAK_PROP_LOWER || | |
- res == SENTENCE_BREAK_PROP_SEP || | |
- res == SENTENCE_BREAK_PROP_CR || | |
- res == SENTENCE_BREAK_PROP_LF || | |
- res == SENTENCE_BREAK_PROP_STERM || | |
- res == SENTENCE_BREAK_PROP_ATERM) { | |
+ /* | |
+ * the skippable properties are ignored | |
+ * automatically here given they do not | |
+ * match the following condition | |
+ */ | |
+ if (prop == SENTENCE_BREAK_PROP_OLETTER || | |
+ prop == SENTENCE_BREAK_PROP_UPPER || | |
+ prop == SENTENCE_BREAK_PROP_LOWER || | |
+ prop == SENTENCE_BREAK_PROP_SEP || | |
+ prop == SENTENCE_BREAK_PROP_CR || | |
+ prop == SENTENCE_BREAK_PROP_LF || | |
+ prop == SENTENCE_BREAK_PROP_STERM || | |
+ prop == SENTENCE_BREAK_PROP_ATERM) { | |
break; | |
} | |
} | |
- if (res == SENTENCE_BREAK_PROP_LOWER) { | |
+ if (prop == SENTENCE_BREAK_PROP_LOWER) { | |
continue; | |
} | |
} | |
/* SB8a */ | |
- if ((saterm_close_sp_parasep_level == 1 || | |
- saterm_close_sp_parasep_level == 2 || | |
- saterm_close_sp_parasep_level == 3) && | |
- (skip.c == SENTENCE_BREAK_PROP_SCONTINUE || | |
- skip.c == SENTENCE_BREAK_PROP_STERM || | |
- skip.c == SENTENCE_BREAK_PROP_ATERM)) { | |
+ if ((state.saterm_close_sp_parasep_level == 1 || | |
+ state.saterm_close_sp_parasep_level == 2 || | |
+ state.saterm_close_sp_parasep_level == 3) && | |
+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE || | |
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM || | |
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) { | |
continue; | |
} | |
/* SB9 */ | |
- if ((saterm_close_sp_parasep_level == 1 || | |
- saterm_close_sp_parasep_level == 2) && | |
- (skip.c == SENTENCE_BREAK_PROP_CLOSE || | |
- skip.c == SENTENCE_BREAK_PROP_SP || | |
- skip.c == SENTENCE_BREAK_PROP_SEP || | |
- skip.c == SENTENCE_BREAK_PROP_CR || | |
- skip.c == SENTENCE_BREAK_PROP_LF)) { | |
+ if ((state.saterm_close_sp_parasep_level == 1 || | |
+ state.saterm_close_sp_parasep_level == 2) && | |
+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE || | |
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP || | |
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP || | |
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR || | |
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) { | |
continue; | |
} | |
/* SB10 */ | |
- if ((saterm_close_sp_parasep_level == 1 || | |
- saterm_close_sp_parasep_level == 2 || | |
- saterm_close_sp_parasep_level == 3) && | |
- (skip.c == SENTENCE_BREAK_PROP_SP || | |
- skip.c == SENTENCE_BREAK_PROP_SEP || | |
- skip.c == SENTENCE_BREAK_PROP_CR || | |
- skip.c == SENTENCE_BREAK_PROP_LF)) { | |
+ if ((state.saterm_close_sp_parasep_level == 1 || | |
+ state.saterm_close_sp_parasep_level == 2 || | |
+ state.saterm_close_sp_parasep_level == 3) && | |
+ (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP || | |
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP || | |
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR || | |
+ p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) { | |
continue; | |
} | |
/* SB11 */ | |
- if (saterm_close_sp_parasep_level == 1 || | |
- saterm_close_sp_parasep_level == 2 || | |
- saterm_close_sp_parasep_level == 3 || | |
- saterm_close_sp_parasep_level == 4) { | |
+ if (state.saterm_close_sp_parasep_level == 1 || | |
+ state.saterm_close_sp_parasep_level == 2 || | |
+ state.saterm_close_sp_parasep_level == 3 || | |
+ state.saterm_close_sp_parasep_level == 4) { | |
break; | |
} | |
@@ -330,17 +258,25 @@ next_sentence_break(const void *str, size_t len, size_t (… | |
continue; | |
} | |
- return off; | |
+ return herodotus_reader_number_read(&(p.mid_reader)); | |
} | |
size_t | |
grapheme_next_sentence_break(const uint_least32_t *str, size_t len) | |
{ | |
- return next_sentence_break(str, len, get_codepoint); | |
+ HERODOTUS_READER r; | |
+ | |
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); | |
+ | |
+ return next_sentence_break(&r); | |
} | |
size_t | |
grapheme_next_sentence_break_utf8(const char *str, size_t len) | |
{ | |
- return next_sentence_break(str, len, get_codepoint_utf8); | |
+ HERODOTUS_READER r; | |
+ | |
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); | |
+ | |
+ return next_sentence_break(&r); | |
} |