Introduction
Introduction Statistics Contact Development Disclaimer Help
tnew utf decoder - st - [fork] customized build of st, the simple terminal
git clone git://src.adamsgaard.dk/st
Log
Files
Refs
README
LICENSE
---
commit 45b808b88ee63f21a188800ba3473a24a3c4b987
parent 71328cbcdc88f4fdfbb62d8c0324938e245c8971
Author: Damian Okrasa <[email protected]>
Date: Tue, 25 Mar 2014 20:20:26 +0100
new utf decoder
This patch replaces current utf decoder with a new one, which is ~50
lines shorter and should be easier to understand. Parsing 5 and 6
sequences, if necessary, requires trivial modification of UTF_SIZ
constant and utfbyte, utfmask, utfmin, utfmax arrays.
Diffstat:
M st.c | 214 ++++++++++++-----------------…
1 file changed, 81 insertions(+), 133 deletions(-)
---
diff --git a/st.c b/st.c
t@@ -55,6 +55,7 @@ char *argv0;
#define XEMBED_FOCUS_OUT 5
/* Arbitrary sizes */
+#define UTF_INVALID 0xFFFD
#define UTF_SIZ 4
#define ESC_BUF_SIZ (128*UTF_SIZ)
#define ESC_ARG_SIZ 16
t@@ -442,10 +443,12 @@ static void selcopy(void);
static void selscroll(int, int);
static void selsnap(int, int *, int *, int);
-static int utf8decode(char *, long *);
-static int utf8encode(long *, char *);
-static int utf8size(char *);
-static int isfullutf8(char *, int);
+static size_t utf8decode(char *, long *, size_t);
+static long utf8decodebyte(char, size_t *);
+static size_t utf8encode(long, char *, size_t);
+static char utf8encodebyte(long, size_t);
+static size_t utf8len(char *);
+static size_t utf8validate(long *, size_t);
static ssize_t xwrite(int, char *, size_t);
static void *xmalloc(size_t);
t@@ -490,6 +493,11 @@ static int oldbutton = 3; /* button event on startup: 3 =…
static char *usedfont = NULL;
static double usedfontsize = 0;
+static uchar utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0};
+static uchar utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
+static long utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000};
+static long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
+
/* Font Ring Cache */
enum {
FRC_NORMAL,
t@@ -549,128 +557,69 @@ xstrdup(char *s) {
return p;
}
-int
-utf8decode(char *s, long *u) {
- uchar c;
- int i, n, rtn;
-
- rtn = 1;
- c = *s;
- if(~c & 0x80) { /* 0xxxxxxx */
- *u = c;
- return rtn;
- } else if((c & 0xE0) == 0xC0) { /* 110xxxxx */
- *u = c & 0x1F;
- n = 1;
- } else if((c & 0xF0) == 0xE0) { /* 1110xxxx */
- *u = c & 0x0F;
- n = 2;
- } else if((c & 0xF8) == 0xF0) { /* 11110xxx */
- *u = c & 0x07;
- n = 3;
- } else {
- goto invalid;
- }
-
- for(i = n, ++s; i > 0; --i, ++rtn, ++s) {
- c = *s;
- if((c & 0xC0) != 0x80) /* 10xxxxxx */
- goto invalid;
- *u <<= 6;
- *u |= c & 0x3F;
- }
-
- if((n == 1 && *u < 0x80) ||
- (n == 2 && *u < 0x800) ||
- (n == 3 && *u < 0x10000) ||
- (*u >= 0xD800 && *u <= 0xDFFF)) {
- goto invalid;
- }
-
- return rtn;
-invalid:
- *u = 0xFFFD;
-
- return rtn;
-}
+size_t
+utf8decode(char *c, long *u, size_t clen) {
+ size_t i, j, len, type;
+ long udecoded;
-int
-utf8encode(long *u, char *s) {
- uchar *sp;
- ulong uc;
- int i, n;
-
- sp = (uchar *)s;
- uc = *u;
- if(uc < 0x80) {
- *sp = uc; /* 0xxxxxxx */
+ *u = UTF_INVALID;
+ if(!clen)
+ return 0;
+ udecoded = utf8decodebyte(c[0], &len);
+ if(!BETWEEN(len, 1, UTF_SIZ))
return 1;
- } else if(*u < 0x800) {
- *sp = (uc >> 6) | 0xC0; /* 110xxxxx */
- n = 1;
- } else if(uc < 0x10000) {
- *sp = (uc >> 12) | 0xE0; /* 1110xxxx */
- n = 2;
- } else if(uc <= 0x10FFFF) {
- *sp = (uc >> 18) | 0xF0; /* 11110xxx */
- n = 3;
- } else {
- goto invalid;
+ for(i = 1, j = 1; i < clen && j < len; ++i, ++j) {
+ udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type);
+ if(type != 0)
+ return j;
}
+ if(j < len)
+ return 0;
+ *u = udecoded;
+ utf8validate(u, len);
+ return len;
+}
- for(i=n,++sp; i>0; --i,++sp)
- *sp = ((uc >> 6*(i-1)) & 0x3F) | 0x80; /* 10xxxxxx */
-
- return n+1;
-invalid:
- /* U+FFFD */
- *s++ = '\xEF';
- *s++ = '\xBF';
- *s = '\xBD';
-
- return 3;
+long
+utf8decodebyte(char c, size_t *i) {
+ for(*i = 0; *i < LEN(utfmask); ++(*i))
+ if(((uchar)c & utfmask[*i]) == utfbyte[*i])
+ return (uchar)c & ~utfmask[*i];
+ return 0;
}
-/* use this if your buffer is less than UTF_SIZ, it returns 1 if you can decode
- UTF-8 otherwise return 0 */
-int
-isfullutf8(char *s, int b) {
- uchar *c1, *c2, *c3;
+size_t
+utf8encode(long u, char *c, size_t clen) {
+ size_t len, i;
- c1 = (uchar *)s;
- c2 = (uchar *)++s;
- c3 = (uchar *)++s;
- if(b < 1) {
+ len = utf8validate(&u, 0);
+ if(clen < len)
return 0;
- } else if((*c1 & 0xE0) == 0xC0 && b == 1) {
- return 0;
- } else if((*c1 & 0xF0) == 0xE0 &&
- ((b == 1) ||
- ((b == 2) && (*c2 & 0xC0) == 0x80))) {
- return 0;
- } else if((*c1 & 0xF8) == 0xF0 &&
- ((b == 1) ||
- ((b == 2) && (*c2 & 0xC0) == 0x80) ||
- ((b == 3) && (*c2 & 0xC0) == 0x80 && (*c3 & 0xC0) == 0x80))) {
- return 0;
- } else {
- return 1;
+ for(i = len - 1; i != 0; --i) {
+ c[i] = utf8encodebyte(u, 0);
+ u >>= 6;
}
+ c[0] = utf8encodebyte(u, len);
+ return len;
}
-int
-utf8size(char *s) {
- uchar c = *s;
+char
+utf8encodebyte(long u, size_t i) {
+ return utfbyte[i] | (u & ~utfmask[i]);
+}
- if(~c & 0x80) {
- return 1;
- } else if((c & 0xE0) == 0xC0) {
- return 2;
- } else if((c & 0xF0) == 0xE0) {
- return 3;
- } else {
- return 4;
- }
+size_t
+utf8len(char *c) {
+ return utf8decode(c, &(long){0}, UTF_SIZ);
+}
+
+size_t
+utf8validate(long *u, size_t i) {
+ if(!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF))
+ *u = UTF_INVALID;
+ for(i = 1; *u > utfmax[i]; ++i)
+ ;
+ return i;
}
static void
t@@ -984,7 +933,7 @@ getsel(void) {
if(!selected(x, y) || (gp->mode & ATTR_WDUMMY))
continue;
- size = utf8size(gp->c);
+ size = utf8len(gp->c);
memcpy(ptr, gp->c, size);
ptr += size;
}
t@@ -1298,7 +1247,7 @@ ttyread(void) {
char *ptr;
char s[UTF_SIZ];
int charsize; /* size of utf8 char in bytes */
- long utf8c;
+ long unicodep;
int ret;
/* append read bytes to unprocessed bytes */
t@@ -1308,9 +1257,8 @@ ttyread(void) {
/* process every complete utf8 char */
buflen += ret;
ptr = buf;
- while(buflen >= UTF_SIZ || isfullutf8(ptr,buflen)) {
- charsize = utf8decode(ptr, &utf8c);
- utf8encode(&utf8c, s);
+ while(charsize = utf8decode(ptr, &unicodep, buflen)) {
+ utf8encode(unicodep, s, UTF_SIZ);
tputc(s, charsize);
ptr += charsize;
buflen -= charsize;
t@@ -2414,14 +2362,14 @@ void
tputc(char *c, int len) {
uchar ascii = *c;
bool control = ascii < '\x20' || ascii == 0177;
- long u8char;
+ long unicodep;
int width;
if(len == 1) {
width = 1;
} else {
- utf8decode(c, &u8char);
- width = wcwidth(u8char);
+ utf8decode(c, &unicodep, UTF_SIZ);
+ width = wcwidth(unicodep);
}
if(IS_SET(MODE_PRINT))
t@@ -3150,7 +3098,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, i…
int frcflags;
int u8fl, u8fblen, u8cblen, doesexist;
char *u8c, *u8fs;
- long u8char;
+ long unicodep;
Font *font = &dc.font;
FcResult fcres;
FcPattern *fcpattern, *fontpattern;
t@@ -3293,11 +3241,11 @@ xdraws(char *s, Glyph base, int x, int y, int charlen,…
oneatatime = font->width != xw.cw;
for(;;) {
u8c = s;
- u8cblen = utf8decode(s, &u8char);
+ u8cblen = utf8decode(s, &unicodep, UTF_SIZ);
s += u8cblen;
bytelen -= u8cblen;
- doesexist = XftCharExists(xw.dpy, font->match, u8char);
+ doesexist = XftCharExists(xw.dpy, font->match, unicode…
if(oneatatime || !doesexist || bytelen <= 0) {
if(oneatatime || bytelen <= 0) {
if(doesexist) {
t@@ -3329,7 +3277,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, i…
/* Search the font cache. */
for(i = 0; i < frclen; i++) {
- if(XftCharExists(xw.dpy, frc[i].font, u8char)
+ if(XftCharExists(xw.dpy, frc[i].font, unicodep)
&& frc[i].flags == frcflags) {
break;
}
t@@ -3351,7 +3299,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, i…
fcpattern = FcPatternDuplicate(font->pattern);
fccharset = FcCharSetCreate();
- FcCharSetAddChar(fccharset, u8char);
+ FcCharSetAddChar(fccharset, unicodep);
FcPatternAddCharSet(fcpattern, FC_CHARSET,
fccharset);
FcPatternAddBool(fcpattern, FC_SCALABLE,
t@@ -3387,7 +3335,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, i…
xp, winy + frc[i].font->ascent,
(FcChar8 *)u8c, u8cblen);
- xp += xw.cw * wcwidth(u8char);
+ xp += xw.cw * wcwidth(unicodep);
}
/*
t@@ -3430,7 +3378,7 @@ xdrawcursor(void) {
memcpy(g.c, term.line[term.c.y][term.c.x].c, UTF_SIZ);
/* remove the old cursor */
- sl = utf8size(term.line[oldy][oldx].c);
+ sl = utf8len(term.line[oldy][oldx].c);
width = (term.line[oldy][oldx].mode & ATTR_WIDE)? 2 : 1;
xdraws(term.line[oldy][oldx].c, term.line[oldy][oldx], oldx,
oldy, width, sl);
t@@ -3444,7 +3392,7 @@ xdrawcursor(void) {
g.bg = defaultfg;
}
- sl = utf8size(g.c);
+ sl = utf8len(g.c);
width = (term.line[term.c.y][curx].mode & ATTR_WIDE)\
? 2 : 1;
xdraws(g.c, g, term.c.x, term.c.y, width, sl);
t@@ -3516,7 +3464,7 @@ drawregion(int x1, int y1, int x2, int y2) {
Glyph base, new;
char buf[DRAW_BUF_SIZ];
bool ena_sel = sel.ob.x != -1;
- long u8char;
+ long unicodep;
if(sel.alt ^ IS_SET(MODE_ALTSCREEN))
ena_sel = 0;
t@@ -3548,7 +3496,7 @@ drawregion(int x1, int y1, int x2, int y2) {
base = new;
}
- sl = utf8decode(new.c, &u8char);
+ sl = utf8decode(new.c, &unicodep, UTF_SIZ);
memcpy(buf+ib, new.c, sl);
ib += sl;
ic += (new.mode & ATTR_WIDE)? 2 : 1;
t@@ -3707,7 +3655,7 @@ kpress(XEvent *ev) {
if(IS_SET(MODE_8BIT)) {
if(*buf < 0177) {
c = *buf | 0x80;
- len = utf8encode(&c, buf);
+ len = utf8encode(c, buf, UTF_SIZ);
}
} else {
buf[1] = buf[0];
You are viewing proxied material from mx1.adamsgaard.dk. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.