utf8pad: improve padded printing and printing invalid unicode characters - sfee… | |
git clone git://git.codemadness.org/sfeed_curses | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 7f13213a355aba904f12a595b322909ce630fbe1 | |
parent 1c4116d1fa7db2ddf540d05df381cbf58e932981 | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Sat, 9 Jan 2021 14:57:57 +0100 | |
utf8pad: improve padded printing and printing invalid unicode characters | |
- Use unicode replacement character (codepoint 0xfffd) when a codepoint is | |
invalid and proceed printing the rest of the characters. | |
- When a codepoint is invalid reset the internal state of mbtowc(3), from the | |
OpenBSD man page: | |
" If a call to mbtowc() resulted in an undefined internal state, mbtowc() | |
must be called with s set to NULL to reset the internal state before it | |
can safely be used again." | |
- Make the function return 0 when `len` is 0 (this should not be not an error). | |
Diffstat: | |
M sfeed_curses.c | 76 +++++++++++++++++++++++------… | |
1 file changed, 56 insertions(+), 20 deletions(-) | |
--- | |
diff --git a/sfeed_curses.c b/sfeed_curses.c | |
@@ -30,6 +30,7 @@ | |
#define PAD_TRUNCATE_SYMBOL "\xe2\x80\xa6" /* symbol: "ellipsis" */ | |
#define SCROLLBAR_SYMBOL_BAR "\xe2\x94\x82" /* symbol: "light vertical" */ | |
#define SCROLLBAR_SYMBOL_TICK " " | |
+#define UTF_INVALID_SYMBOL "\xef\xbf\xbd" /* symbol: "replacement" */ | |
/* color-theme */ | |
#ifndef SFEED_THEME | |
@@ -310,15 +311,28 @@ colw(const char *s) | |
{ | |
wchar_t wc; | |
size_t col = 0, i, slen; | |
- int rl, w; | |
+ int inc, rl, w; | |
slen = strlen(s); | |
- for (i = 0; i < slen; i += rl) { | |
- if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= … | |
- break; | |
- if ((w = wcwidth(wc)) == -1) | |
+ for (i = 0; i < slen; i += inc) { | |
+ inc = 1; | |
+ if ((unsigned char)s[i] < 32) { | |
continue; | |
- col += w; | |
+ } else if ((unsigned char)s[i] >= 127) { | |
+ rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4); | |
+ if (rl < 0) { | |
+ mbtowc(NULL, NULL, 0); /* reset state */ | |
+ inc = 1; /* next byte */ | |
+ w = 1; /* replacement char is one width */ | |
+ } else if ((w = wcwidth(wc)) == -1) { | |
+ continue; | |
+ } else { | |
+ inc = rl; | |
+ } | |
+ col += w; | |
+ } else { | |
+ col++; | |
+ } | |
} | |
return col; | |
} | |
@@ -330,33 +344,55 @@ utf8pad(char *buf, size_t bufsiz, const char *s, size_t l… | |
{ | |
wchar_t wc; | |
size_t col = 0, i, slen, siz = 0; | |
- int rl, w; | |
+ int inc, rl, w; | |
- if (!len) | |
+ if (!bufsiz) | |
return -1; | |
+ if (!len) { | |
+ buf[0] = '\0'; | |
+ return 0; | |
+ } | |
slen = strlen(s); | |
- for (i = 0; i < slen; i += rl) { | |
- if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= … | |
- break; | |
- if ((w = wcwidth(wc)) == -1) | |
+ for (i = 0; i < slen; i += inc) { | |
+ inc = 1; | |
+ if ((unsigned char)s[i] < 32) | |
+ continue; | |
+ | |
+ rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4); | |
+ if (rl < 0) { | |
+ mbtowc(NULL, NULL, 0); /* reset state */ | |
+ inc = 1; /* next byte */ | |
+ w = 1; /* replacement char is one width */ | |
+ } else if ((w = wcwidth(wc)) == -1) { | |
continue; | |
- if (col + w > len || (col + w == len && s[i + rl])) { | |
+ } else { | |
+ inc = rl; | |
+ } | |
+ | |
+ if (col + w > len || (col + w == len && s[i + inc])) { | |
if (siz + 4 >= bufsiz) | |
return -1; | |
memcpy(&buf[siz], PAD_TRUNCATE_SYMBOL, sizeof(PAD_TRUN… | |
siz += sizeof(PAD_TRUNCATE_SYMBOL) - 1; | |
- if (col + w == len && w > 1) | |
- buf[siz++] = pad; | |
buf[siz] = '\0'; | |
- return 0; | |
+ col++; | |
+ break; | |
+ } else if (rl < 0) { | |
+ if (siz + 4 >= bufsiz) | |
+ return -1; | |
+ memcpy(&buf[siz], UTF_INVALID_SYMBOL, sizeof(UTF_INVAL… | |
+ siz += sizeof(UTF_INVALID_SYMBOL) - 1; | |
+ buf[siz] = '\0'; | |
+ col++; | |
+ continue; | |
} | |
- if (siz + rl + 1 >= bufsiz) | |
+ if (siz + inc + 1 >= bufsiz) | |
return -1; | |
- memcpy(&buf[siz], &s[i], rl); | |
- col += w; | |
- siz += rl; | |
+ memcpy(&buf[siz], &s[i], inc); | |
+ siz += inc; | |
buf[siz] = '\0'; | |
+ col += w; | |
} | |
len -= col; |