utf8pad: improve padded printing and printing invalid unicode characters - stag… | |
git clone git://bitreich.org/stagit-gopher/ git://enlrupgkhuxnvlhsf6lc3fziv5h2h… | |
Log | |
Files | |
Refs | |
Tags | |
README | |
LICENSE | |
--- | |
commit 554a9fe2e9d12defd9d6253871d8261d3f3ef3c6 | |
parent 7b93d02cd8f26ab9a25d967c72c359a22c91eb74 | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Sat, 9 Jan 2021 14:56:51 +0100 | |
utf8pad: improve padded printing and printing invalid unicode characters | |
- Use the Unicode replacement character (codepoint 0xfffd) when a codepoint is | |
invalid and continue printing the rest of the characters. | |
- When a codepoint is invalid reset the internal state of mbtowc(3), from the | |
OpenBSD man page: | |
" If a call to mbtowc() resulted in an undefined internal state, mbtowc() | |
must be called with s set to NULL to reset the internal state before it | |
can safely be used again." | |
- Make the function return 0 when `len` is 0 (this should not be an error). | |
Diffstat: | |
M stagit-gopher-index.c | 59 ++++++++++++++++++++++-------… | |
M stagit-gopher.c | 58 ++++++++++++++++++++++-------… | |
2 files changed, 83 insertions(+), 34 deletions(-) | |
--- | |
diff --git a/stagit-gopher-index.c b/stagit-gopher-index.c | |
@@ -10,6 +10,9 @@ | |
#include <git2.h> | |
+#define PAD_TRUNCATE_SYMBOL "\xe2\x80\xa6" /* symbol: "ellipsis" */ | |
+#define UTF_INVALID_SYMBOL "\xef\xbf\xbd" /* symbol: "replacement" */ | |
+ | |
static git_repository *repo; | |
static const char *relpath = ""; | |
@@ -17,40 +20,62 @@ static const char *relpath = ""; | |
static char description[255] = "Repositories"; | |
static char *name = ""; | |
-/* format `len' columns of characters. If string is shorter pad the rest | |
+/* Format `len' columns of characters. If string is shorter pad the rest | |
* with characters `pad`. */ | |
int | |
utf8pad(char *buf, size_t bufsiz, const char *s, size_t len, int pad) | |
{ | |
wchar_t wc; | |
size_t col = 0, i, slen, siz = 0; | |
- int rl, w; | |
+ int inc, rl, w; | |
- if (!len) | |
+ if (!bufsiz) | |
return -1; | |
+ if (!len) { | |
+ buf[0] = '\0'; | |
+ return 0; | |
+ } | |
slen = strlen(s); | |
- for (i = 0; i < slen; i += rl) { | |
- if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= … | |
- break; | |
- if ((w = wcwidth(wc)) == -1) | |
+ for (i = 0; i < slen; i += inc) { | |
+ inc = 1; | |
+ if ((unsigned char)s[i] < 32) | |
continue; | |
- if (col + w > len || (col + w == len && s[i + rl])) { | |
+ | |
+ rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4); | |
+ if (rl < 0) { | |
+ mbtowc(NULL, NULL, 0); /* reset state */ | |
+ inc = 1; /* next byte */ | |
+ w = 1; /* replacement char is one width */ | |
+ } else if ((w = wcwidth(wc)) == -1) { | |
+ continue; | |
+ } else { | |
+ inc = rl; | |
+ } | |
+ | |
+ if (col + w > len || (col + w == len && s[i + inc])) { | |
if (siz + 4 >= bufsiz) | |
return -1; | |
- memcpy(&buf[siz], "\xe2\x80\xa6", 3); | |
- siz += 3; | |
- if (col + w == len && w > 1) | |
- buf[siz++] = pad; | |
+ memcpy(&buf[siz], PAD_TRUNCATE_SYMBOL, sizeof(PAD_TRUN… | |
+ siz += sizeof(PAD_TRUNCATE_SYMBOL) - 1; | |
buf[siz] = '\0'; | |
- return 0; | |
+ col++; | |
+ break; | |
+ } else if (rl < 0) { | |
+ if (siz + 4 >= bufsiz) | |
+ return -1; | |
+ memcpy(&buf[siz], UTF_INVALID_SYMBOL, sizeof(UTF_INVAL… | |
+ siz += sizeof(UTF_INVALID_SYMBOL) - 1; | |
+ buf[siz] = '\0'; | |
+ col++; | |
+ continue; | |
} | |
- if (siz + rl + 1 >= bufsiz) | |
+ if (siz + inc + 1 >= bufsiz) | |
return -1; | |
- memcpy(&buf[siz], &s[i], rl); | |
- col += w; | |
- siz += rl; | |
+ memcpy(&buf[siz], &s[i], inc); | |
+ siz += inc; | |
buf[siz] = '\0'; | |
+ col += w; | |
} | |
len -= col; | |
diff --git a/stagit-gopher.c b/stagit-gopher.c | |
@@ -19,6 +19,8 @@ | |
#include "compat.h" | |
#define LEN(s) (sizeof(s)/sizeof(*s)) | |
+#define PAD_TRUNCATE_SYMBOL "\xe2\x80\xa6" /* symbol: "ellipsis" */ | |
+#define UTF_INVALID_SYMBOL "\xef\xbf\xbd" /* symbol: "replacement" */ | |
struct deltainfo { | |
git_patch *patch; | |
@@ -80,40 +82,62 @@ static char lastoidstr[GIT_OID_HEXSZ + 2]; /* id + newline … | |
static FILE *rcachefp, *wcachefp; | |
static const char *cachefile; | |
-/* format `len' columns of characters. If string is shorter pad the rest | |
+/* Format `len' columns of characters. If string is shorter pad the rest | |
* with characters `pad`. */ | |
int | |
utf8pad(char *buf, size_t bufsiz, const char *s, size_t len, int pad) | |
{ | |
wchar_t wc; | |
size_t col = 0, i, slen, siz = 0; | |
- int rl, w; | |
+ int inc, rl, w; | |
- if (!len) | |
+ if (!bufsiz) | |
return -1; | |
+ if (!len) { | |
+ buf[0] = '\0'; | |
+ return 0; | |
+ } | |
slen = strlen(s); | |
- for (i = 0; i < slen; i += rl) { | |
- if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= … | |
- break; | |
- if ((w = wcwidth(wc)) == -1) | |
+ for (i = 0; i < slen; i += inc) { | |
+ inc = 1; | |
+ if ((unsigned char)s[i] < 32) | |
+ continue; | |
+ | |
+ rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4); | |
+ if (rl < 0) { | |
+ mbtowc(NULL, NULL, 0); /* reset state */ | |
+ inc = 1; /* next byte */ | |
+ w = 1; /* replacement char is one width */ | |
+ } else if ((w = wcwidth(wc)) == -1) { | |
continue; | |
- if (col + w > len || (col + w == len && s[i + rl])) { | |
+ } else { | |
+ inc = rl; | |
+ } | |
+ | |
+ if (col + w > len || (col + w == len && s[i + inc])) { | |
if (siz + 4 >= bufsiz) | |
return -1; | |
- memcpy(&buf[siz], "\xe2\x80\xa6", 3); | |
- siz += 3; | |
- if (col + w == len && w > 1) | |
- buf[siz++] = pad; | |
+ memcpy(&buf[siz], PAD_TRUNCATE_SYMBOL, sizeof(PAD_TRUN… | |
+ siz += sizeof(PAD_TRUNCATE_SYMBOL) - 1; | |
buf[siz] = '\0'; | |
- return 0; | |
+ col++; | |
+ break; | |
+ } else if (rl < 0) { | |
+ if (siz + 4 >= bufsiz) | |
+ return -1; | |
+ memcpy(&buf[siz], UTF_INVALID_SYMBOL, sizeof(UTF_INVAL… | |
+ siz += sizeof(UTF_INVALID_SYMBOL) - 1; | |
+ buf[siz] = '\0'; | |
+ col++; | |
+ continue; | |
} | |
- if (siz + rl + 1 >= bufsiz) | |
+ if (siz + inc + 1 >= bufsiz) | |
return -1; | |
- memcpy(&buf[siz], &s[i], rl); | |
- col += w; | |
- siz += rl; | |
+ memcpy(&buf[siz], &s[i], inc); | |
+ siz += inc; | |
buf[siz] = '\0'; | |
+ col += w; | |
} | |
len -= col; |