Refactor case-conversion-functions with Herodotus - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 5332f7ee034081618617c2b0785733ccc9ec8753 | |
parent 563eb65bfbaa4f27c77d73ae81b51882c916993d | |
Author: Laslo Hunhold <[email protected]> | |
Date: Wed, 21 Sep 2022 20:16:00 +0200 | |
Refactor case-conversion-functions with Herodotus | |
The readability of the code is greatly improved, and the code is now | |
much more robust than before. | |
Signed-off-by: Laslo Hunhold <[email protected]> | |
Diffstat: | |
M src/case.c | 255 ++++++++++++++---------------… | |
1 file changed, 112 insertions(+), 143 deletions(-) | |
--- | |
diff --git a/src/case.c b/src/case.c | |
@@ -33,22 +33,18 @@ get_case_offset(uint_least32_t cp, const uint_least16_t *ma… | |
} | |
static inline size_t | |
-to_case(const void *src, size_t srclen, void *dest, size_t destlen, | |
- size_t srcnumprocess, uint_least8_t final_sigma_level, | |
- size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *… | |
- size_t (*set_codepoint)(uint_least32_t, void *, size_t, size_t), | |
- const uint_least16_t *major, const int_least32_t *minor, | |
- const struct special_case *sc) | |
+to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w, | |
+ uint_least8_t final_sigma_level, const uint_least16_t *major, | |
+ const int_least32_t *minor, const struct special_case *sc) | |
{ | |
+ HERODOTUS_READER tmp; | |
enum case_property prop; | |
- size_t srcoff, destoff, res, tmp, off, i; | |
+ enum herodotus_status s; | |
+ size_t off, i; | |
uint_least32_t cp, tmp_cp; | |
int_least32_t map; | |
- for (srcoff = 0, destoff = 0; srcoff < srcnumprocess; srcoff += res) { | |
- /* read in next source codepoint */ | |
- res = get_codepoint((const char *)src, srclen, srcoff, &cp); | |
- | |
+ for (; herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCC… | |
if (sc == lower_special) { | |
/* | |
* For the special Final_Sigma-rule (see SpecialCasing… | |
@@ -72,8 +68,10 @@ to_case(const void *src, size_t srclen, void *dest, size_t d… | |
* if the succeeding character is cased, inval… | |
* the after-condition | |
*/ | |
- for (tmp = srcoff + res, prop = NUM_CASE_PROPS… | |
- tmp += get_codepoint(src, srclen, tmp,… | |
+ herodotus_reader_copy(r, &tmp); | |
+ for (prop = NUM_CASE_PROPS; | |
+ (s = herodotus_read_codepoint(&tmp, true,… | |
+ HERODOTUS_STATUS_SUCCESS; ) { | |
prop = get_case_property(tmp_cp); | |
if (prop != CASE_PROP_CASE_IGNORABLE && | |
@@ -83,20 +81,19 @@ to_case(const void *src, size_t srclen, void *dest, size_t … | |
} | |
/* | |
- * Now prop is something other than case-ignor… | |
+ * Now prop is something other than case-ignor… | |
+ * the source-string ended. | |
* If it is something other than cased, we know | |
* that the after-condition holds | |
*/ | |
- if (prop != CASE_PROP_CASED && | |
- prop != CASE_PROP_BOTH_CASED_CASE_IGNORABL… | |
+ if (s != HERODOTUS_STATUS_SUCCESS || | |
+ (prop != CASE_PROP_CASED && | |
+ prop != CASE_PROP_BOTH_CASED_CASE_IGNORAB… | |
/* | |
* write GREEK SMALL LETTER FINAL SIGM… | |
* destination | |
*/ | |
- destoff += set_codepoint(UINT32_C(0x03… | |
- dest, | |
- destlen, | |
- destoff); | |
+ herodotus_write_codepoint(w, UINT32_C(… | |
/* reset Final_Sigma-state and continu… | |
final_sigma_level = 0; | |
@@ -132,191 +129,163 @@ to_case(const void *src, size_t srclen, void *dest, siz… | |
off = (uint_least32_t)map - UINT32_C(0x110000); | |
for (i = 0; i < sc[off].cplen; i++) { | |
- if (likely(destoff < destlen)) { | |
- /* | |
- * write special mapping to destination | |
- */ | |
- destoff += set_codepoint(sc[off].cp[i], | |
- dest, | |
- destlen, | |
- destoff); | |
- } else { | |
- /* | |
- * further increase destoff to indicate | |
- * how much buffer space we need | |
- */ | |
- destoff += set_codepoint(sc[off].cp[i], | |
- NULL, 0, 0); | |
- } | |
+ herodotus_write_codepoint(w, sc[off].cp[i]); | |
} | |
} else { | |
/* we have a simple mapping */ | |
- if (likely(destoff < destlen)) { | |
- destoff += set_codepoint((uint_least32_t)((int… | |
- dest, destlen, destof… | |
- } else { | |
- destoff += set_codepoint((uint_least32_t)((int… | |
- NULL, 0, 0); | |
- } | |
+ herodotus_write_codepoint(w, (uint_least32_t) | |
+ ((int_least32_t)cp + map)); | |
} | |
} | |
- if (set_codepoint == set_codepoint_utf8 && destlen > 0) { | |
- /* | |
- * NUL-terminate destination to always ensure NUL-termination, | |
- * unless in check mode. | |
- * Just like with snprintf() a return value >= destlen indicat… | |
- * truncation. | |
- */ | |
- ((char *)dest)[(destoff < destlen) ? destoff : (destlen - 1)] … | |
+ herodotus_writer_nul_terminate(w); | |
+ | |
+ return herodotus_writer_number_written(w); | |
+} | |
+ | |
+static size_t | |
+herodotus_next_word_break(const HERODOTUS_READER *r) | |
+{ | |
+ if (r->src == NULL || r->off > r->srclen) { | |
+ return 0; | |
} | |
- return destoff; | |
+ if (r->type == HERODOTUS_TYPE_CODEPOINT) { | |
+ return grapheme_next_word_break( | |
+ ((const uint_least32_t *)(r->src)) + r->off, | |
+ r->srclen - r->off); | |
+ } else { /* r->type == HERODOTUS_TYPE_UTF8 */ | |
+ return grapheme_next_word_break_utf8( | |
+ ((const char *)(r->src)) + r->off, | |
+ r->srclen - r->off); | |
+ } | |
} | |
static inline size_t | |
-to_titlecase(const void *src, size_t srclen, void *dest, size_t destlen, | |
- size_t (*get_codepoint)(const void *, size_t, size_t, uint_least3… | |
- size_t (*set_codepoint)(uint_least32_t, void *, size_t, size_t)) | |
+to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w) | |
{ | |
enum case_property prop; | |
- size_t next_wb, srcoff, destoff, res; | |
+ enum herodotus_status s; | |
uint_least32_t cp; | |
- for (srcoff = destoff = 0; ; ) { | |
- if (get_codepoint == get_codepoint_utf8) { | |
- if ((next_wb = grapheme_next_word_break_utf8((const ch… | |
- srclen - … | |
- /* we consumed all of the string */ | |
- break; | |
- } | |
- } else { | |
- if ((next_wb = grapheme_next_word_break((const uint_le… | |
- srclen - srcof… | |
- /* we consumed all of the string */ | |
- break; | |
- } | |
- } | |
- | |
- for (; next_wb > 0 && srcoff < srclen; next_wb -= res, srcoff … | |
+ for (;;) { | |
+ herodotus_reader_push_advance_limit(r, herodotus_next_word_bre… | |
+ for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODO… | |
/* check if we have a cased character */ | |
- res = get_codepoint(src, srclen, srcoff, &cp); | |
prop = get_case_property(cp); | |
if (prop == CASE_PROP_CASED || | |
prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) { | |
break; | |
} else { | |
/* write the data to the output verbatim, it i… | |
- destoff += set_codepoint_utf8(cp, dest, destle… | |
- } | |
- } | |
+ herodotus_write_codepoint(w, cp); | |
- if (next_wb > 0) { | |
- /* get character length */ | |
- res = get_codepoint(src, srclen, srcoff, &cp); | |
- | |
- /* we have a cased character at srcoff, map it to titl… | |
- if (get_codepoint == get_codepoint_utf8) { | |
- destoff += to_case((const char *)src + srcoff, | |
- srclen - srcoff, | |
- (char *)dest + destoff, | |
- (destoff < destlen) ? (dest… | |
- res, 0, | |
- get_codepoint_utf8, | |
- set_codepoint_utf8, title_m… | |
- title_minor, title_special); | |
- } else { | |
- destoff += to_case((const uint_least32_t *)src… | |
- srclen - srcoff, | |
- (uint_least32_t *)dest + de… | |
- (destoff < destlen) ? (dest… | |
- res, 0, | |
- get_codepoint, | |
- set_codepoint, title_major, | |
- title_minor, title_special); | |
+ /* increment reader */ | |
+ herodotus_read_codepoint(r, true, &cp); | |
} | |
- | |
- /* we consumed a character */ | |
- srcoff += res; | |
- next_wb -= res; | |
} | |
- /* cast the rest of the codepoints in the word to lowercase */ | |
- if (get_codepoint == get_codepoint_utf8) { | |
- destoff += to_case((const char *)src + srcoff, | |
- srclen - srcoff, | |
- (char *)dest + destoff, | |
- (destoff < destlen) ? (destlen - de… | |
- next_wb, 1, | |
- get_codepoint_utf8, | |
- set_codepoint_utf8, lower_major, | |
- lower_minor, lower_special); | |
+ if (s == HERODOTUS_STATUS_END_OF_BUFFER) { | |
+ /* we are done */ | |
+ break; | |
+ } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) { | |
+ /* | |
+ * we did not encounter any cased character | |
+ * up to the word break | |
+ */ | |
+ continue; | |
} else { | |
- destoff += to_case((const uint_least32_t *)src + srcof… | |
- srclen - srcoff, | |
- (uint_least32_t *)dest + destoff, | |
- (destoff < destlen) ? (destlen - de… | |
- next_wb, 1, | |
- get_codepoint, | |
- set_codepoint, lower_major, | |
- lower_minor, lower_special); | |
+ /* | |
+ * we encountered a cased character before the word | |
+ * break, convert it to titlecase | |
+ */ | |
+ herodotus_reader_push_advance_limit(r, | |
+ herodotus_reader_next_codepoint_break(r)); | |
+ to_case(r, w, 0, title_major, title_minor, title_speci… | |
+ herodotus_reader_pop_limit(r); | |
} | |
- srcoff += next_wb; | |
- } | |
- if (set_codepoint == set_codepoint_utf8) { | |
- /* | |
- * NUL-terminate destination to always ensure NUL-termination. | |
- * Just like with snprintf() a return value >= destlen indicat… | |
- * truncation. | |
- */ | |
- ((char *)dest)[(destoff < destlen) ? destoff : (destlen - 1)] … | |
+ /* cast the rest of the codepoints in the word to lowercase */ | |
+ to_case(r, w, 1, lower_major, lower_minor, lower_special); | |
+ | |
+ herodotus_reader_pop_limit(r); | |
} | |
- return destoff; | |
+ herodotus_writer_nul_terminate(w); | |
+ | |
+ return herodotus_writer_number_written(w); | |
} | |
size_t | |
grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, uint_least32_t… | |
{ | |
- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint, s… | |
- upper_major, upper_minor, upper_special); | |
+ HERODOTUS_READER r; | |
+ HERODOTUS_WRITER w; | |
+ | |
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
+ herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen); | |
+ | |
+ return to_case(&r, &w, 0, upper_major, upper_minor, upper_special); | |
} | |
size_t | |
grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, uint_least32_t… | |
{ | |
- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint, s… | |
- lower_major, lower_minor, lower_special); | |
+ HERODOTUS_READER r; | |
+ HERODOTUS_WRITER w; | |
+ | |
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
+ herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen); | |
+ | |
+ return to_case(&r, &w, 0, lower_major, lower_minor, lower_special); | |
} | |
size_t | |
grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, uint_least32_t… | |
{ | |
- return to_titlecase(src, srclen, dest, destlen, get_codepoint, | |
- set_codepoint); | |
+ HERODOTUS_READER r; | |
+ HERODOTUS_WRITER w; | |
+ | |
+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
+ herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen); | |
+ | |
+ return to_titlecase(&r, &w); | |
} | |
size_t | |
grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, size_t … | |
{ | |
- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint_ut… | |
- upper_major, upper_minor, upper_special); | |
+ HERODOTUS_READER r; | |
+ HERODOTUS_WRITER w; | |
+ | |
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
+ herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); | |
+ | |
+ return to_case(&r, &w, 0, upper_major, upper_minor, upper_special); | |
} | |
size_t | |
grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, size_t … | |
{ | |
- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint_ut… | |
- lower_major, lower_minor, lower_special); | |
+ HERODOTUS_READER r; | |
+ HERODOTUS_WRITER w; | |
+ | |
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
+ herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); | |
+ return to_case(&r, &w, 0, lower_major, lower_minor, lower_special); | |
} | |
size_t | |
grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, size_t … | |
{ | |
- return to_titlecase(src, srclen, dest, destlen, get_codepoint_utf8, | |
- set_codepoint_utf8); | |
+ HERODOTUS_READER r; | |
+ HERODOTUS_WRITER w; | |
+ | |
+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
+ herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); | |
+ | |
+ return to_titlecase(&r, &w); | |
} | |
static inline bool |