libc/wchar: Fix unicode handling - scc - simple c99 compiler | |
git clone git://git.simple-cc.org/scc | |
Log | |
Files | |
Refs | |
Submodules | |
README | |
LICENSE | |
--- | |
commit dee0c6f0c90f7d64fd45cb6e3c48321f4beaf81d | |
parent 2eaef0900f5ebd9f00bebd8ce899423a0b37b4bb | |
Author: Roberto E. Vargas Caballero <[email protected]> | |
Date: Wed, 26 Feb 2025 10:31:51 +0100 | |
libc/wchar: Fix unicode handling | |
* mbrtowc: validate input, handle 4-byte UTF-8 code points, set errno | |
* wcrtomb: the leading byte of an N-byte UTF-8 sequence has its N most | |
significant bits set (single-byte ASCII is the special case), not N-1 bits | |
* _validutf8: negate condition | |
Diffstat: | |
M src/libc/wchar/_validutf8.c | 2 +- | |
M src/libc/wchar/mbrtowc.c | 26 +++++++++++++++++--------- | |
M src/libc/wchar/wcrtomb.c | 9 +++++++-- | |
3 files changed, 25 insertions(+), 12 deletions(-) | |
--- | |
diff --git a/src/libc/wchar/_validutf8.c b/src/libc/wchar/_validutf8.c | |
@@ -23,7 +23,7 @@ _validutf8(wchar_t wc, int *nbytes) | |
}; | |
struct range *bp; | |
- for (bp = ranges; bp->begin <= wc && bp->end > wc; ++bp) | |
+ for (bp = ranges; bp->begin > wc || bp->end <= wc; ++bp) | |
; | |
*nbytes = bp->nbytes; | |
diff --git a/src/libc/wchar/mbrtowc.c b/src/libc/wchar/mbrtowc.c | |
@@ -1,3 +1,5 @@ | |
+#include <errno.h> | |
+#include <stdlib.h> | |
#include <wchar.h> | |
#include "../libc.h" | |
@@ -8,37 +10,43 @@ size_t | |
mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n, | |
mbstate_t *restrict ps) | |
{ | |
- unsigned char *t = (unsigned char *) s; | |
+ const unsigned char *t = (const unsigned char *) s; | |
unsigned long wc; | |
unsigned c; | |
int i, len, maxlen; | |
- if (s == NULL) | |
+ if (t == NULL) | |
return 0; | |
+ if ((wc = *t) == 0) | |
+ goto return_code; | |
- wc = c = *t++; | |
+ c = *t++; | |
for (len = 0; n > 0 && c & 0x80; --n, ++len) | |
c <<= 1; | |
- if (n == 0 || len == 1 || len == 8) | |
- return -1; | |
+ if (n == 0 && c & 0x80) | |
+ return -2; | |
+ if (len == 1 || len > MB_CUR_MAX) | |
+ goto return_error; | |
if (len == 0) | |
goto return_code; | |
wc = (c & 0xFF) >> len; | |
for (i = 0; i < len-1; i++) { | |
if (((c = *t++) & 0xC0) != 0x80) | |
- return -1; | |
+ goto return_error; | |
wc <<= 6; | |
wc |= c & 0x3F; | |
} | |
if (!_validutf8(wc, &maxlen) || len != maxlen) | |
- return -1; | |
+ goto return_error; | |
return_code: | |
if (pwc) | |
*pwc = wc; | |
- if (*s == '\0') | |
- return 0; | |
return t - (unsigned char *) s; | |
+ | |
+return_error: | |
+ errno = EILSEQ; | |
+ return -1; | |
} | |
diff --git a/src/libc/wchar/wcrtomb.c b/src/libc/wchar/wcrtomb.c | |
@@ -14,13 +14,18 @@ wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict p… | |
if (!s) | |
return 1; | |
+ if (c < 0x80) { | |
+ *s = wc; | |
+ return 1; | |
+ } | |
+ | |
if (!_validutf8(wc, &n)) { | |
errno = EILSEQ; | |
return -1; | |
} | |
- | |
n--; | |
- *s = 0; | |
+ | |
+ *s = 0x80; | |
for (i = 0; i < n; i++) { | |
*s >>= 1; | |
*s |= 0x80; |