GopherProxy

	overhaul utf8decode() - dmenu - dynamic menu
	git clone git://git.suckless.org/dmenu
	Log
	Files
	Refs
	README
	LICENSE
	---
	commit 51e32d49b56c86cd288c64fccf6cd765547781b9
	parent 7be720cc88ed2294338f7182600df10f21c575ce
	Author: NRK <[email protected]>
	Date: Thu, 4 Jul 2024 21:25:37 +0000

	overhaul utf8decode()

	this changes the utf8decode function to:

	* report when an error occurs
	* report how many bytes to advance on error

	these will be useful in the next commit to render invalid utf8
	sequences.

	the new implementation is also shorter and more direct.

	Diffstat:
	M drw.c | 76 +++++++++++++----------------…

	1 file changed, 31 insertions(+), 45 deletions(-)
	---
	diff --git a/drw.c b/drw.c
	@@ -9,54 +9,40 @@
	#include "util.h"

	#define UTF_INVALID 0xFFFD
	-#define UTF_SIZ 4

	-static const unsigned char utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF…
	-static const unsigned char utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF…
	-static const long utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x100…
	-static const long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FF…
	-
	-static long
	-utf8decodebyte(const char c, size_t *i)
	-{
	- for (*i = 0; *i < (UTF_SIZ + 1); ++(*i))
	- if (((unsigned char)c & utfmask[*i]) == utfbyte[*i])
	- return (unsigned char)c & ~utfmask[*i];
	- return 0;
	-}
	-
	-static size_t
	-utf8validate(long *u, size_t i)
	+static int
	+utf8decode(const char *s_in, long *u, int *err)
	{
	- if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF))
	- *u = UTF_INVALID;
	- for (i = 1; *u > utfmax[i]; ++i)
	- ;
	- return i;
	-}
	-
	-static size_t
	-utf8decode(const char *c, long *u, size_t clen)
	-{
	- size_t i, j, len, type;
	- long udecoded;
	-
	+ static const unsigned char lens[] = {
	+ /* 0XXXX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	+ /* 10XXX */ 0, 0, 0, 0, 0, 0, 0, 0,  /* invalid */
	+ /* 110XX */ 2, 2, 2, 2,
	+ /* 1110X */ 3, 3,
	+ /* 11110 */ 4,
	+ /* 11111 */ 0,  /* invalid */
	+ };
	+ static const unsigned char leading_mask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
	+ static const unsigned int overlong[] = { 0x0, 0x80, 0x0800, 0x10000 };
	+
	+ const unsigned char *s = (const unsigned char *)s_in;
	+ int len = lens[*s >> 3];
	*u = UTF_INVALID;
	- if (!clen)
	- return 0;
	- udecoded = utf8decodebyte(c[0], &len);
	- if (!BETWEEN(len, 1, UTF_SIZ))
	+ *err = 1;
	+ if (len == 0)
	return 1;
	- for (i = 1, j = 1; i < clen && j < len; ++i, ++j) {
	- udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type);
	- if (type)
	- return j;
	+
	+ long cp = s[0] & leading_mask[len - 1];
	+ for (int i = 1; i < len; ++i) {
	+ if (s[i] == '\0' || (s[i] & 0xC0) != 0x80)
	+ return i;
	+ cp = (cp << 6) | (s[i] & 0x3F);
	}
	- if (j < len)
	- return 0;
	- *u = udecoded;
	- utf8validate(u, len);
	+ /* out of range, surrogate, overlong encoding */
	+ if (cp > 0x10FFFF || (cp >> 11) == 0x1B || cp < overlong[len - 1])
	+ return len;

	+ *err = 0;
	+ *u = cp;
	return len;
	}

	@@ -242,7 +228,7 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned i…
	unsigned int tmpw, ew, ellipsis_w = 0, ellipsis_len, hash, h0, h1;
	XftDraw *d = NULL;
	Fnt usedfont, curfont, *nextfont;
	- int utf8strlen, utf8charlen, render = x || y || w || h;
	+ int utf8strlen, utf8charlen, utf8err, render = x || y || w || h;
	long utf8codepoint = 0;
	const char *utf8str;
	FcCharSet *fccharset;
	@@ -272,11 +258,11 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned…
	if (!ellipsis_width && render)
	ellipsis_width = drw_fontset_getwidth(drw, "...");
	while (1) {
	- ew = ellipsis_len = utf8strlen = 0;
	+ ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0;
	utf8str = text;
	nextfont = NULL;
	while (*text) {
	- utf8charlen = utf8decode(text, &utf8codepoint, UTF_SIZ…
	+ utf8charlen = utf8decode(text, &utf8codepoint, &utf8er…
	for (curfont = drw->fonts; curfont; curfont = curfont-…
	charexists = charexists || XftCharExists(drw->…
	if (charexists) {