GopherProxy

	Refactor case-conversion-functions with Herodotus - libgrapheme - unicode strin…
	git clone git://git.suckless.org/libgrapheme
	Log
	Files
	Refs
	README
	LICENSE
	---
	commit 5332f7ee034081618617c2b0785733ccc9ec8753
	parent 563eb65bfbaa4f27c77d73ae81b51882c916993d
	Author: Laslo Hunhold <[email protected]>
	Date: Wed, 21 Sep 2022 20:16:00 +0200

	Refactor case-conversion-functions with Herodotus

	The readability of the code is greatly improved, and the code is now
	much more robust than before.

	Signed-off-by: Laslo Hunhold <[email protected]>

	Diffstat:
	M src/case.c \| 255 ++++++++++++++---------------…

	1 file changed, 112 insertions(+), 143 deletions(-)
	---
	diff --git a/src/case.c b/src/case.c
	@@ -33,22 +33,18 @@ get_case_offset(uint_least32_t cp, const uint_least16_t *ma…
	}

	static inline size_t
	-to_case(const void src, size_t srclen, void dest, size_t destlen,
	- size_t srcnumprocess, uint_least8_t final_sigma_level,
	- size_t (get_codepoint)(const void , size_t, size_t, uint_least32_t *…
	- size_t (set_codepoint)(uint_least32_t, void , size_t, size_t),
	- const uint_least16_t major, const int_least32_t minor,
	- const struct special_case *sc)
	+to_case(HERODOTUS_READER r, HERODOTUS_WRITER w,
	+ uint_least8_t final_sigma_level, const uint_least16_t *major,
	+ const int_least32_t minor, const struct special_case sc)
	{
	+ HERODOTUS_READER tmp;
	enum case_property prop;
	- size_t srcoff, destoff, res, tmp, off, i;
	+ enum herodotus_status s;
	+ size_t off, i;
	uint_least32_t cp, tmp_cp;
	int_least32_t map;

	- for (srcoff = 0, destoff = 0; srcoff < srcnumprocess; srcoff += res) {
	- /* read in next source codepoint */
	- res = get_codepoint((const char *)src, srclen, srcoff, &cp);
	-
	+ for (; herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCC…
	if (sc == lower_special) {
	/*
	* For the special Final_Sigma-rule (see SpecialCasing…
	@@ -72,8 +68,10 @@ to_case(const void src, size_t srclen, void dest, size_t d…
	* if the succeeding character is cased, inval…
	* the after-condition
	*/
	- for (tmp = srcoff + res, prop = NUM_CASE_PROPS…
	- tmp += get_codepoint(src, srclen, tmp,…
	+ herodotus_reader_copy(r, &tmp);
	+ for (prop = NUM_CASE_PROPS;
	+ (s = herodotus_read_codepoint(&tmp, true,…
	+ HERODOTUS_STATUS_SUCCESS; ) {
	prop = get_case_property(tmp_cp);

	if (prop != CASE_PROP_CASE_IGNORABLE &&
	@@ -83,20 +81,19 @@ to_case(const void src, size_t srclen, void dest, size_t …
	}

	/*
	- * Now prop is something other than case-ignor…
	+ * Now prop is something other than case-ignor…
	+ * the source-string ended.
	* If it is something other than cased, we know
	* that the after-condition holds
	*/
	- if (prop != CASE_PROP_CASED &&
	- prop != CASE_PROP_BOTH_CASED_CASE_IGNORABL…
	+ if (s != HERODOTUS_STATUS_SUCCESS \|\|
	+ (prop != CASE_PROP_CASED &&
	+ prop != CASE_PROP_BOTH_CASED_CASE_IGNORAB…
	/*
	* write GREEK SMALL LETTER FINAL SIGM…
	* destination
	*/
	- destoff += set_codepoint(UINT32_C(0x03…
	- dest,
	- destlen,
	- destoff);
	+ herodotus_write_codepoint(w, UINT32_C(…

	/* reset Final_Sigma-state and continu…
	final_sigma_level = 0;
	@@ -132,191 +129,163 @@ to_case(const void src, size_t srclen, void dest, siz…
	off = (uint_least32_t)map - UINT32_C(0x110000);

	for (i = 0; i < sc[off].cplen; i++) {
	- if (likely(destoff < destlen)) {
	- /*
	- * write special mapping to destination
	- */
	- destoff += set_codepoint(sc[off].cp[i],
	- dest,
	- destlen,
	- destoff);
	- } else {
	- /*
	- * further increase destoff to indicate
	- * how much buffer space we need
	- */
	- destoff += set_codepoint(sc[off].cp[i],
	- NULL, 0, 0);
	- }
	+ herodotus_write_codepoint(w, sc[off].cp[i]);
	}
	} else {
	/* we have a simple mapping */
	- if (likely(destoff < destlen)) {
	- destoff += set_codepoint((uint_least32_t)((int…
	- dest, destlen, destof…
	- } else {
	- destoff += set_codepoint((uint_least32_t)((int…
	- NULL, 0, 0);
	- }
	+ herodotus_write_codepoint(w, (uint_least32_t)
	+ ((int_least32_t)cp + map));
	}
	}

	- if (set_codepoint == set_codepoint_utf8 && destlen > 0) {
	- /*
	- * NUL-terminate destination to always ensure NUL-termination,
	- * unless in check mode.
	- * Just like with snprintf() a return value >= destlen indicat…
	- * truncation.
	- */
	- ((char *)dest)[(destoff < destlen) ? destoff : (destlen - 1)] …
	+ herodotus_writer_nul_terminate(w);
	+
	+ return herodotus_writer_number_written(w);
	+}
	+
	+static size_t
	+herodotus_next_word_break(const HERODOTUS_READER *r)
	+{
	+ if (r->src == NULL \|\| r->off > r->srclen) {
	+ return 0;
	}

	- return destoff;
	+ if (r->type == HERODOTUS_TYPE_CODEPOINT) {
	+ return grapheme_next_word_break(
	+ ((const uint_least32_t *)(r->src)) + r->off,
	+ r->srclen - r->off);
	+ } else { /* r->type == HERODOTUS_TYPE_UTF8 */
	+ return grapheme_next_word_break_utf8(
	+ ((const char *)(r->src)) + r->off,
	+ r->srclen - r->off);
	+ }
	}

	static inline size_t
	-to_titlecase(const void src, size_t srclen, void dest, size_t destlen,
	- size_t (get_codepoint)(const void , size_t, size_t, uint_least3…
	- size_t (set_codepoint)(uint_least32_t, void , size_t, size_t))
	+to_titlecase(HERODOTUS_READER r, HERODOTUS_WRITER w)
	{
	enum case_property prop;
	- size_t next_wb, srcoff, destoff, res;
	+ enum herodotus_status s;
	uint_least32_t cp;

	- for (srcoff = destoff = 0; ; ) {
	- if (get_codepoint == get_codepoint_utf8) {
	- if ((next_wb = grapheme_next_word_break_utf8((const ch…
	- srclen - …
	- /* we consumed all of the string */
	- break;
	- }
	- } else {
	- if ((next_wb = grapheme_next_word_break((const uint_le…
	- srclen - srcof…
	- /* we consumed all of the string */
	- break;
	- }
	- }
	-
	- for (; next_wb > 0 && srcoff < srclen; next_wb -= res, srcoff …
	+ for (;;) {
	+ herodotus_reader_push_advance_limit(r, herodotus_next_word_bre…
	+ for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODO…
	/* check if we have a cased character */
	- res = get_codepoint(src, srclen, srcoff, &cp);
	prop = get_case_property(cp);
	if (prop == CASE_PROP_CASED \|\|
	prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
	break;
	} else {
	/* write the data to the output verbatim, it i…
	- destoff += set_codepoint_utf8(cp, dest, destle…
	- }
	- }
	+ herodotus_write_codepoint(w, cp);

	- if (next_wb > 0) {
	- /* get character length */
	- res = get_codepoint(src, srclen, srcoff, &cp);
	-
	- /* we have a cased character at srcoff, map it to titl…
	- if (get_codepoint == get_codepoint_utf8) {
	- destoff += to_case((const char *)src + srcoff,
	- srclen - srcoff,
	- (char *)dest + destoff,
	- (destoff < destlen) ? (dest…
	- res, 0,
	- get_codepoint_utf8,
	- set_codepoint_utf8, title_m…
	- title_minor, title_special);
	- } else {
	- destoff += to_case((const uint_least32_t *)src…
	- srclen - srcoff,
	- (uint_least32_t *)dest + de…
	- (destoff < destlen) ? (dest…
	- res, 0,
	- get_codepoint,
	- set_codepoint, title_major,
	- title_minor, title_special);
	+ /* increment reader */
	+ herodotus_read_codepoint(r, true, &cp);
	}
	-
	- /* we consumed a character */
	- srcoff += res;
	- next_wb -= res;
	}

	- /* cast the rest of the codepoints in the word to lowercase */
	- if (get_codepoint == get_codepoint_utf8) {
	- destoff += to_case((const char *)src + srcoff,
	- srclen - srcoff,
	- (char *)dest + destoff,
	- (destoff < destlen) ? (destlen - de…
	- next_wb, 1,
	- get_codepoint_utf8,
	- set_codepoint_utf8, lower_major,
	- lower_minor, lower_special);
	+ if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
	+ /* we are done */
	+ break;
	+ } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
	+ /*
	+ * we did not encounter any cased character
	+ * up to the word break
	+ */
	+ continue;
	} else {
	- destoff += to_case((const uint_least32_t *)src + srcof…
	- srclen - srcoff,
	- (uint_least32_t *)dest + destoff,
	- (destoff < destlen) ? (destlen - de…
	- next_wb, 1,
	- get_codepoint,
	- set_codepoint, lower_major,
	- lower_minor, lower_special);
	+ /*
	+ * we encountered a cased character before the word
	+ * break, convert it to titlecase
	+ */
	+ herodotus_reader_push_advance_limit(r,
	+ herodotus_reader_next_codepoint_break(r));
	+ to_case(r, w, 0, title_major, title_minor, title_speci…
	+ herodotus_reader_pop_limit(r);
	}
	- srcoff += next_wb;
	- }

	- if (set_codepoint == set_codepoint_utf8) {
	- /*
	- * NUL-terminate destination to always ensure NUL-termination.
	- * Just like with snprintf() a return value >= destlen indicat…
	- * truncation.
	- */
	- ((char *)dest)[(destoff < destlen) ? destoff : (destlen - 1)] …
	+ /* cast the rest of the codepoints in the word to lowercase */
	+ to_case(r, w, 1, lower_major, lower_minor, lower_special);
	+
	+ herodotus_reader_pop_limit(r);
	}

	- return destoff;
	+ herodotus_writer_nul_terminate(w);
	+
	+ return herodotus_writer_number_written(w);
	}

	size_t
	grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, uint_least32_t…
	{
	- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint, s…
	- upper_major, upper_minor, upper_special);
	+ HERODOTUS_READER r;
	+ HERODOTUS_WRITER w;
	+
	+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	+ herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
	+
	+ return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
	}

	size_t
	grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, uint_least32_t…
	{
	- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint, s…
	- lower_major, lower_minor, lower_special);
	+ HERODOTUS_READER r;
	+ HERODOTUS_WRITER w;
	+
	+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	+ herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
	+
	+ return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
	}

	size_t
	grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, uint_least32_t…
	{
	- return to_titlecase(src, srclen, dest, destlen, get_codepoint,
	- set_codepoint);
	+ HERODOTUS_READER r;
	+ HERODOTUS_WRITER w;
	+
	+ herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	+ herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
	+
	+ return to_titlecase(&r, &w);
	}

	size_t
	grapheme_to_uppercase_utf8(const char src, size_t srclen, char dest, size_t …
	{
	- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint_ut…
	- upper_major, upper_minor, upper_special);
	+ HERODOTUS_READER r;
	+ HERODOTUS_WRITER w;
	+
	+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
	+ herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
	+
	+ return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
	}

	size_t
	grapheme_to_lowercase_utf8(const char src, size_t srclen, char dest, size_t …
	{
	- return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint_ut…
	- lower_major, lower_minor, lower_special);
	+ HERODOTUS_READER r;
	+ HERODOTUS_WRITER w;
	+
	+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
	+ herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);

	+ return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
	}

	size_t
	grapheme_to_titlecase_utf8(const char src, size_t srclen, char dest, size_t …
	{
	- return to_titlecase(src, srclen, dest, destlen, get_codepoint_utf8,
	- set_codepoint_utf8);
	+ HERODOTUS_READER r;
	+ HERODOTUS_WRITER w;
	+
	+ herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
	+ herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
	+
	+ return to_titlecase(&r, &w);
	}

	static inline bool