Refactor word-functions with Proper (using Herodotus in the background) - libgr… | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 52b0e29e02068d6a8123042ef901f73e37b2f38f | |
parent b899fd685c50cbc61999296ce1e0a03a45e74f52 | |
Author: Laslo Hunhold <[email protected]> | |
Date: Sun, 2 Oct 2022 21:17:03 +0200 | |
Refactor word-functions with Proper (using Herodotus in the background) | |
As promised, this leads to a heavy simplification and separation of | |
concerns in the code. Additionally, this fixes some known quirks in | |
regard to handling NUL-terminated strings. | |
Signed-off-by: Laslo Hunhold <[email protected]> | |
Diffstat: | |
M src/word.c | 367 ++++++++++++-----------------… | |
1 file changed, 142 insertions(+), 225 deletions(-) | |
--- | |
diff --git a/src/word.c b/src/word.c | |
@@ -6,328 +6,237 @@ | |
#include "../grapheme.h" | |
#include "util.h" | |
-static inline enum word_break_property | |
-get_break_prop(uint_least32_t cp) | |
+struct word_break_state | |
+{ | |
+ bool ri_even; | |
+}; | |
+ | |
+static inline uint_least8_t | |
+get_word_break_prop(uint_least32_t cp) | |
{ | |
if (likely(cp <= 0x10FFFF)) { | |
- return (enum word_break_property) | |
+ return (uint_least8_t) | |
word_break_minor[word_break_major[cp >> 8] + (cp & 0xff… | |
} else { | |
return WORD_BREAK_PROP_OTHER; | |
} | |
} | |
-static size_t | |
-next_word_break(const void *str, size_t len, size_t (*get_codepoint) | |
- (const void *, size_t, size_t, uint_least32_t *)) | |
+static bool | |
+is_skippable_word_prop(uint_least8_t prop) | |
{ | |
- struct { | |
- enum word_break_property a, b, c, d; | |
- } raw, skip; | |
- enum word_break_property res; | |
- uint_least32_t cp; | |
- size_t off, tmp, new_off; | |
- bool ri_even = true; | |
- | |
- /* check degenerate cases */ | |
- if (str == NULL || len == 0) { | |
- return 0; | |
- } | |
- | |
- /* | |
- * Apply word breaking algorithm (UAX #29), see | |
- * https://unicode.org/reports/tr29/#Word_Boundary_Rules | |
- * | |
- * There are 4 slots (a, b, c, d) of "break" properties and | |
- * we check if there is a break in the middle between b and c. | |
- * | |
- * The position of this middle spot is determined by off, | |
- * which gives the offset of the first element on the right | |
- * hand side of said spot, or, in other words, gives the number | |
- * of elements on the left hand side. | |
- * | |
- * It is further complicated by the fact that the algorithm | |
- * expects you to skip certain characters for the second | |
- * half of the rules (after WB4). Thus, we do not only have | |
- * the "raw" properties as described above, but also the "skip" | |
- * properties, where the skip.a and skip.b, for instance, | |
- * give the two preceding character properties behind the | |
- * currently investigated breakpoint. | |
- * | |
- */ | |
+ return prop == WORD_BREAK_PROP_EXTEND || | |
+ prop == WORD_BREAK_PROP_FORMAT || | |
+ prop == WORD_BREAK_PROP_ZWJ; | |
+} | |
- /* | |
- * Initialize the different properties such that we have | |
- * a good state after the state-update in the loop | |
- */ | |
- raw.b = NUM_WORD_BREAK_PROPS; | |
- if ((off = get_codepoint(str, len, 0, &cp)) >= len) { | |
- /* | |
- * A line is at least one codepoint long, so we can | |
- * safely return here | |
- */ | |
- return len; | |
- } | |
- raw.c = get_break_prop(cp); | |
- (void)get_codepoint(str, len, off, &cp); | |
- raw.d = get_break_prop(cp); | |
- skip.a = skip.b = NUM_WORD_BREAK_PROPS; | |
+static void | |
+word_skip_shift_callback(uint_least8_t prop, void *s) | |
+{ | |
+ struct word_break_state *state = (struct word_break_state *)s; | |
- for (; off < len; off = new_off) { | |
+ if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) { | |
/* | |
- * Update left side (a and b) of the skip state by | |
- * "shifting in" the raw.c property as long as it is | |
- * not one of the "ignored" character properties. | |
- * While at it, update the RI-counter. | |
+ * The property we just shifted in is | |
+ * a regional indicator, increasing the | |
+ * number of consecutive RIs on the left | |
+ * side of the breakpoint by one, changing | |
+ * the oddness. | |
* | |
*/ | |
- if (raw.c != WORD_BREAK_PROP_EXTEND && | |
- raw.c != WORD_BREAK_PROP_FORMAT && | |
- raw.c != WORD_BREAK_PROP_ZWJ) { | |
- skip.a = skip.b; | |
- skip.b = raw.c; | |
- | |
- if (skip.b == WORD_BREAK_PROP_REGIONAL_INDICATOR) { | |
- /* | |
- * The property we just shifted in is | |
- * a regional indicator, increasing the | |
- * number of consecutive RIs on the left | |
- * side of the breakpoint by one, changing | |
- * the oddness. | |
- * | |
- */ | |
- ri_even = !ri_even; | |
- } else { | |
- /* | |
- * We saw no regional indicator, so the | |
- * number of consecutive RIs on the left | |
- * side of the breakpoint is zero, which | |
- * is an even number. | |
- * | |
- */ | |
- ri_even = true; | |
- } | |
- } | |
- | |
+ state->ri_even = !(state->ri_even); | |
+ } else { | |
/* | |
- * Update right side (b and c) of the skip state by | |
- * starting at the breakpoint and detecting the two | |
- * following non-ignored character classes | |
+ * We saw no regional indicator, so the | |
+ * number of consecutive RIs on the left | |
+ * side of the breakpoint is zero, which | |
+ * is an even number. | |
* | |
*/ | |
- skip.c = NUM_WORD_BREAK_PROPS; | |
- for (tmp = off; tmp < len; ) { | |
- tmp += get_codepoint(str, len, tmp, &cp); | |
- res = get_break_prop(cp); | |
- | |
- if (res != WORD_BREAK_PROP_EXTEND && | |
- res != WORD_BREAK_PROP_FORMAT && | |
- res != WORD_BREAK_PROP_ZWJ) { | |
- skip.c = res; | |
- break; | |
- } | |
- } | |
- skip.d = NUM_WORD_BREAK_PROPS; | |
- for (; tmp < len; ) { | |
- tmp += get_codepoint(str, len, tmp, &cp); | |
- res = get_break_prop(cp); | |
- | |
- if (res != WORD_BREAK_PROP_EXTEND && | |
- res != WORD_BREAK_PROP_FORMAT && | |
- res != WORD_BREAK_PROP_ZWJ) { | |
- skip.d = res; | |
- break; | |
- } | |
- } | |
+ state->ri_even = true; | |
+ } | |
+} | |
- /* | |
- * Update the raw state by simply shifting everything | |
- * in and, if we still have data left, determining | |
- * the character class of the next codepoint. | |
- * | |
- */ | |
- raw.a = raw.b; | |
- raw.b = raw.c; | |
- raw.c = raw.d; | |
- if ((new_off = off + get_codepoint(str, len, off, &cp)) < len)… | |
- get_codepoint(str, len, new_off, &cp); | |
- raw.d = get_break_prop(cp); | |
- } else { | |
- raw.d = NUM_WORD_BREAK_PROPS; | |
- } | |
+static size_t | |
+next_word_break(HERODOTUS_READER *r) | |
+{ | |
+ struct proper p; | |
+ struct word_break_state state = { .ri_even = true }; | |
+ /* | |
+ * Apply word breaking algorithm (UAX #29), see | |
+ * https://unicode.org/reports/tr29/#Word_Boundary_Rules | |
+ */ | |
+ proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop, | |
+ is_skippable_word_prop, word_skip_shift_callback, &p); | |
+ | |
+ while (!proper_advance(&p)) { | |
/* WB3 */ | |
- if (raw.b == WORD_BREAK_PROP_CR && | |
- raw.c == WORD_BREAK_PROP_LF) { | |
+ if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR && | |
+ p.raw.next_prop[0] == WORD_BREAK_PROP_LF) { | |
continue; | |
} | |
/* WB3a */ | |
- if (raw.b == WORD_BREAK_PROP_NEWLINE || | |
- raw.b == WORD_BREAK_PROP_CR || | |
- raw.b == WORD_BREAK_PROP_LF) { | |
+ if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE || | |
+ p.raw.prev_prop[0] == WORD_BREAK_PROP_CR || | |
+ p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) { | |
break; | |
} | |
/* WB3b */ | |
- if (raw.c == WORD_BREAK_PROP_NEWLINE || | |
- raw.c == WORD_BREAK_PROP_CR || | |
- raw.c == WORD_BREAK_PROP_LF) { | |
+ if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE || | |
+ p.raw.next_prop[0] == WORD_BREAK_PROP_CR || | |
+ p.raw.next_prop[0] == WORD_BREAK_PROP_LF) { | |
break; | |
} | |
/* WB3c */ | |
- if (raw.b == WORD_BREAK_PROP_ZWJ && | |
- (raw.c == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC || | |
- raw.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) { | |
+ if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ && | |
+ (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTENDED_PICTOGRAPH… | |
+ p.raw.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPIC… | |
continue; | |
} | |
/* WB3d */ | |
- if (raw.b == WORD_BREAK_PROP_WSEGSPACE && | |
- raw.c == WORD_BREAK_PROP_WSEGSPACE) { | |
+ if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE && | |
+ p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) { | |
continue; | |
} | |
/* WB4 */ | |
- if (raw.c == WORD_BREAK_PROP_EXTEND || | |
- raw.c == WORD_BREAK_PROP_FORMAT || | |
- raw.c == WORD_BREAK_PROP_ZWJ) { | |
+ if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND || | |
+ p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT || | |
+ p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) { | |
continue; | |
} | |
/* WB5 */ | |
- if ((skip.b == WORD_BREAK_PROP_ALETTER || | |
- skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
- skip.b == WORD_BREAK_PROP_HEBREW_LETTER) && | |
- (skip.c == WORD_BREAK_PROP_ALETTER || | |
- skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) { | |
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER … | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPI… | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) && | |
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER … | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPI… | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) { | |
continue; | |
} | |
/* WB6 */ | |
- if ((skip.b == WORD_BREAK_PROP_ALETTER || | |
- skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
- skip.b == WORD_BREAK_PROP_HEBREW_LETTER) && | |
- (skip.c == WORD_BREAK_PROP_MIDLETTER || | |
- skip.c == WORD_BREAK_PROP_MIDNUMLET || | |
- skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) && | |
- len > 2 && | |
- (skip.d == WORD_BREAK_PROP_ALETTER || | |
- skip.d == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
- skip.d == WORD_BREAK_PROP_HEBREW_LETTER)) { | |
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER … | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPI… | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) && | |
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER || | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET || | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) && | |
+ (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER … | |
+ p.skip.next_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPI… | |
+ p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) { | |
continue; | |
} | |
/* WB7 */ | |
- if ((skip.b == WORD_BREAK_PROP_MIDLETTER || | |
- skip.b == WORD_BREAK_PROP_MIDNUMLET || | |
- skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) && | |
- (skip.c == WORD_BREAK_PROP_ALETTER || | |
- skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER) && | |
- len > 2 && | |
- (skip.a == WORD_BREAK_PROP_ALETTER || | |
- skip.a == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
- skip.a == WORD_BREAK_PROP_HEBREW_LETTER)) { | |
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER || | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET || | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) && | |
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER … | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPI… | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) && | |
+ (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER … | |
+ p.skip.prev_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPI… | |
+ p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) { | |
continue; | |
} | |
/* WB7a */ | |
- if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER && | |
- skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) { | |
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER && | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) { | |
continue; | |
} | |
/* WB7b */ | |
- if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER && | |
- skip.c == WORD_BREAK_PROP_DOUBLE_QUOTE && | |
- len > 2 && | |
- skip.d == WORD_BREAK_PROP_HEBREW_LETTER) { | |
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER && | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE && | |
+ p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) { | |
continue; | |
} | |
/* WB7c */ | |
- if (skip.b == WORD_BREAK_PROP_DOUBLE_QUOTE && | |
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER && | |
- off > 1 && | |
- skip.a == WORD_BREAK_PROP_HEBREW_LETTER) { | |
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE && | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER && | |
+ p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) { | |
continue; | |
} | |
/* WB8 */ | |
- if (skip.b == WORD_BREAK_PROP_NUMERIC && | |
- skip.c == WORD_BREAK_PROP_NUMERIC) { | |
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC && | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) { | |
continue; | |
} | |
/* WB9 */ | |
- if ((skip.b == WORD_BREAK_PROP_ALETTER || | |
- skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
- skip.b == WORD_BREAK_PROP_HEBREW_LETTER) && | |
- skip.c == WORD_BREAK_PROP_NUMERIC) { | |
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER … | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPI… | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) && | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) { | |
continue; | |
} | |
/* WB10 */ | |
- if (skip.b == WORD_BREAK_PROP_NUMERIC && | |
- (skip.c == WORD_BREAK_PROP_ALETTER || | |
- skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) { | |
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC && | |
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER … | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPI… | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) { | |
continue; | |
} | |
/* WB11 */ | |
- if ((skip.b == WORD_BREAK_PROP_MIDNUM || | |
- skip.b == WORD_BREAK_PROP_MIDNUMLET || | |
- skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) && | |
- skip.c == WORD_BREAK_PROP_NUMERIC && | |
- off > 1 && | |
- skip.a == WORD_BREAK_PROP_NUMERIC) { | |
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM || | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET || | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) && | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC && | |
+ p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) { | |
continue; | |
} | |
/* WB12 */ | |
- if (skip.b == WORD_BREAK_PROP_NUMERIC && | |
- (skip.c == WORD_BREAK_PROP_MIDNUM || | |
- skip.c == WORD_BREAK_PROP_MIDNUMLET || | |
- skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) && | |
- len > 2 && | |
- skip.d == WORD_BREAK_PROP_NUMERIC) { | |
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC && | |
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM || | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET || | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) && | |
+ p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) { | |
continue; | |
} | |
/* WB13 */ | |
- if (skip.b == WORD_BREAK_PROP_KATAKANA && | |
- skip.c == WORD_BREAK_PROP_KATAKANA) { | |
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA && | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) { | |
continue; | |
} | |
/* WB13a */ | |
- if ((skip.b == WORD_BREAK_PROP_ALETTER || | |
- skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
- skip.b == WORD_BREAK_PROP_HEBREW_LETTER || | |
- skip.b == WORD_BREAK_PROP_NUMERIC || | |
- skip.b == WORD_BREAK_PROP_KATAKANA || | |
- skip.b == WORD_BREAK_PROP_EXTENDNUMLET) && | |
- skip.c == WORD_BREAK_PROP_EXTENDNUMLET) { | |
+ if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER … | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPI… | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER … | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC … | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA … | |
+ p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) && | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) { | |
continue; | |
} | |
/* WB13b */ | |
- if (skip.b == WORD_BREAK_PROP_EXTENDNUMLET && | |
- (skip.c == WORD_BREAK_PROP_ALETTER || | |
- skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
- skip.c == WORD_BREAK_PROP_HEBREW_LETTER || | |
- skip.c == WORD_BREAK_PROP_NUMERIC || | |
- skip.c == WORD_BREAK_PROP_KATAKANA)) { | |
+ if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET && | |
+ (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER … | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPI… | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER … | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC … | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) { | |
continue; | |
} | |
/* WB15 and WB16 */ | |
- if (!ri_even && | |
- skip.c == WORD_BREAK_PROP_REGIONAL_INDICATOR) { | |
+ if (!state.ri_even && | |
+ p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR)… | |
continue; | |
} | |
@@ -335,17 +244,25 @@ next_word_break(const void *str, size_t len, size_t (*get… | |
break; | |
} | |
- return off; | |
+ return herodotus_reader_number_read(&(p.mid_reader)); | |
} | |
size_t | |
grapheme_next_word_break(const uint_least32_t *str, size_t len) | |
{ | |
- return next_word_break(str, len, get_codepoint); | |
+ HERODOTUS_READER r; | |
+ | |
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); | |
+ | |
+ return next_word_break(&r); | |
} | |
size_t | |
grapheme_next_word_break_utf8(const char *str, size_t len) | |
{ | |
- return next_word_break(str, len, get_codepoint_utf8); | |
+ HERODOTUS_READER r; | |
+ | |
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); | |
+ | |
+ return next_word_break(&r); | |
} |