| convert to 4-byte UTF-8 and 32-bit Rune - plan9port - [fork] Plan 9 from user … | |
| git clone git://src.adamsgaard.dk/plan9port | |
| Log | |
| Files | |
| Refs | |
| README | |
| LICENSE | |
| --- | |
| commit 0cadb4301d18724e7513d7489cb5bebd262c82f1 | |
| parent 4dbf255619efac4f0a00e4216d6c999128910df2 | |
| Author: Russ Cox <[email protected]> | |
| Date: Fri, 11 Sep 2009 17:03:06 -0400 | |
| convert to 4-byte UTF-8 and 32-bit Rune | |
| http://codereview.appspot.com/116075 | |
| Diffstat: | |
| M include/fmt.h | 2 +- | |
| M include/utf.h | 7 ++++--- | |
| M src/cmd/9term/wind.c | 2 +- | |
| M src/cmd/acme/regx.c | 4 ++-- | |
| M src/cmd/sam/cmd.c | 2 +- | |
| M src/cmd/sam/regexp.c | 4 ++-- | |
| M src/cmd/sed.c | 4 ++-- | |
| M src/cmd/tr.c | 2 +- | |
| M src/cmd/troff/mbwc.c | 4 ++-- | |
| M src/cmd/unicode.c | 6 +++--- | |
| M src/lib9/fmt/dofmt.c | 11 ++++++----- | |
| M src/lib9/utf/rune.c | 78 +++++++++++++++++++++++------… | |
| M src/libbio/bgetrune.c | 2 +- | |
| M src/libbio/bputrune.c | 2 +- | |
| 14 files changed, 86 insertions(+), 44 deletions(-) | |
| --- | |
| diff --git a/include/fmt.h b/include/fmt.h | |
| @@ -30,7 +30,7 @@ struct Fmt{ | |
| void *farg; /* to make flush a closure */ | |
| int nfmt; /* num chars formatted so far … | |
| va_list args; /* args passed to dofmt */ | |
| - int r; /* % format Rune */ | |
| + Rune r; /* % format Rune */ | |
| int width; | |
| int prec; | |
| unsigned long flags; | |
| diff --git a/include/utf.h b/include/utf.h | |
| @@ -4,14 +4,15 @@ | |
| extern "C" { | |
| #endif | |
| -typedef unsigned short Rune; /* 16 bits */ | |
| +typedef unsigned int Rune; /* 32 bits */ | |
| enum | |
| { | |
| - UTFmax = 3, /* maximum bytes per rune */ | |
| + UTFmax = 4, /* maximum bytes per rune */ | |
| Runesync = 0x80, /* cannot represent part of a U… | |
| Runeself = 0x80, /* rune and UTF sequences are t… | |
| - Runeerror = 0xFFFD /* decoding error in UTF */ | |
| + Runeerror = 0xFFFD, /* decoding error in UTF */ | |
| + Runemax = 0x10FFFF /* maximum rune value */ | |
| }; | |
| /* Edit .+1,/^$/ | cfn $PLAN9/src/lib9/utf/?*.c | grep -v static |grep -v __ */ | |
| diff --git a/src/cmd/9term/wind.c b/src/cmd/9term/wind.c | |
| @@ -193,7 +193,7 @@ winctl(void *arg) | |
| Rune *rp, *bp, *up, *kbdr; | |
| uint qh; | |
| int nr, nb, c, wid, i, npart, initial, lastb; | |
| - char *s, *t, part[3]; | |
| + char *s, *t, part[UTFmax]; | |
| Window *w; | |
| Mousestate *mp, m; | |
| enum { WKey, WMouse, WMouseread, WCtl, WCwrite, WCread, WWread, NWALT … | |
| diff --git a/src/cmd/acme/regx.c b/src/cmd/acme/regx.c | |
| @@ -488,7 +488,7 @@ bldcclass(void) | |
| exprp++; /* eat '-' */ | |
| if((c2 = nextrec()) == ']') | |
| goto Error; | |
| - classp[n+0] = 0xFFFF; | |
| + classp[n+0] = Runemax; | |
| classp[n+1] = c1; | |
| classp[n+2] = c2; | |
| n += 3; | |
| @@ -510,7 +510,7 @@ classmatch(int classno, int c, int negate) | |
| p = class[classno]; | |
| while(*p){ | |
| - if(*p == 0xFFFF){ | |
| + if(*p == Runemax){ | |
| if(p[1]<=c && c<=p[2]) | |
| return !negate; | |
| p += 3; | |
| diff --git a/src/cmd/sam/cmd.c b/src/cmd/sam/cmd.c | |
| @@ -71,7 +71,7 @@ int | |
| inputc(void) | |
| { | |
| int n, nbuf; | |
| - char buf[3]; | |
| + char buf[UTFmax]; | |
| Rune r; | |
| Again: | |
| diff --git a/src/cmd/sam/regexp.c b/src/cmd/sam/regexp.c | |
| @@ -494,7 +494,7 @@ bldcclass(void) | |
| exprp++; /* eat '-' */ | |
| if((c2 = nextrec()) == ']') | |
| goto Error; | |
| - classp[n+0] = 0xFFFF; | |
| + classp[n+0] = Runemax; | |
| classp[n+1] = c1; | |
| classp[n+2] = c2; | |
| n += 3; | |
| @@ -516,7 +516,7 @@ classmatch(int classno, int c, int negate) | |
| p = class[classno]; | |
| while(*p){ | |
| - if(*p == 0xFFFF){ | |
| + if(*p == Runemax){ | |
| if(p[1]<=c && c<=p[2]) | |
| return !negate; | |
| p += 3; | |
| diff --git a/src/cmd/sed.c b/src/cmd/sed.c | |
| @@ -615,7 +615,7 @@ compsub(Rune *rhs, Rune *end) | |
| while ((r = *cp++) != '\0') { | |
| if(r == '\\') { | |
| if (rhs < end) | |
| - *rhs++ = 0xFFFF; | |
| + *rhs++ = Runemax; | |
| else | |
| return 0; | |
| r = *cp++; | |
| @@ -1050,7 +1050,7 @@ dosub(Rune *rhsbuf) | |
| sp = place(sp, loc1, loc2); | |
| continue; | |
| } | |
| - if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB+'0') { | |
| + if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB+'0') { | |
| n = c-'0'; | |
| if (subexp[n].s.rsp && subexp[n].e.rep) { | |
| sp = place(sp, subexp[n].s.rsp, subexp[n].e.re… | |
| diff --git a/src/cmd/tr.c b/src/cmd/tr.c | |
| @@ -15,7 +15,7 @@ uchar bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 }; | |
| #define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07]) | |
| #define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07]) | |
| -#define MAXRUNE 0xFFFF | |
| +#define MAXRUNE Runemax | |
| uchar f[(MAXRUNE+1)/8]; | |
| uchar t[(MAXRUNE+1)/8]; | |
| diff --git a/src/cmd/troff/mbwc.c b/src/cmd/troff/mbwc.c | |
| @@ -152,9 +152,9 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n) | |
| if(p+d <= pe+3) { | |
| *p++ = buf[0]; | |
| if(d > 1) { | |
| - *p++ = buf[2]; | |
| + *p++ = buf[1]; | |
| if(d > 2) | |
| - *p++ = buf[3]; | |
| + *p++ = buf[2]; | |
| } | |
| } | |
| if(c == 0) | |
| diff --git a/src/cmd/unicode.c b/src/cmd/unicode.c | |
| @@ -51,13 +51,13 @@ range(char *argv[]) | |
| return "bad range"; | |
| } | |
| min = strtoul(q, &q, 16); | |
| - if(min<0 || min>0xFFFF || *q!='-') | |
| + if(min<0 || min>Runemax || *q!='-') | |
| goto err; | |
| q++; | |
| if(strchr(hex, *q) == 0) | |
| goto err; | |
| max = strtoul(q, &q, 16); | |
| - if(max<0 || max>0xFFFF || max<min || *q!=0) | |
| + if(max<0 || max>Runemax || max<min || *q!=0) | |
| goto err; | |
| i = 0; | |
| do{ | |
| @@ -111,7 +111,7 @@ chars(char *argv[]) | |
| return "bad char"; | |
| } | |
| m = strtoul(q, &q, 16); | |
| - if(m<0 || m>0xFFFF || *q!=0) | |
| + if(m<0 || m>Runemax || *q!=0) | |
| goto err; | |
| Bprint(&bout, "%C", m); | |
| if(!text) | |
| diff --git a/src/lib9/fmt/dofmt.c b/src/lib9/fmt/dofmt.c | |
| @@ -605,12 +605,13 @@ __flagfmt(Fmt *f) | |
| int | |
| __badfmt(Fmt *f) | |
| { | |
| - char x[3]; | |
| + char x[2+UTFmax]; | |
| + int n; | |
| x[0] = '%'; | |
| - x[1] = f->r; | |
| - x[2] = '%'; | |
| - f->prec = 3; | |
| - __fmtcpy(f, (const void*)x, 3, 3); | |
| + n = 1 + runetochar(x+1, &f->r); | |
| + x[n++] = '%'; | |
| + f->prec = n; | |
| + __fmtcpy(f, (const void*)x, n, n); | |
| return 0; | |
| } | |
| diff --git a/src/lib9/utf/rune.c b/src/lib9/utf/rune.c | |
| @@ -23,16 +23,19 @@ enum | |
| Bit2 = 5, | |
| Bit3 = 4, | |
| Bit4 = 3, | |
| + Bit5 = 2, | |
| T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ | |
| Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ | |
| T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ | |
| T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ | |
| T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ | |
| + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ | |
| - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 … | |
| - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 … | |
| - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 … | |
| + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 … | |
| + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 … | |
| + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 … | |
| + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 … | |
| Maskx = (1<<Bitx)-1, /* 0011 1111 */ | |
| Testx = Maskx ^ 0xFF, /* 1100 0000 */ | |
| @@ -43,7 +46,7 @@ enum | |
| int | |
| chartorune(Rune *rune, char *str) | |
| { | |
| - int c, c1, c2; | |
| + int c, c1, c2, c3; | |
| long l; | |
| /* | |
| @@ -89,6 +92,25 @@ chartorune(Rune *rune, char *str) | |
| } | |
| /* | |
| + * four character sequence | |
| + * 10000-10FFFF => T4 Tx Tx Tx | |
| + */ | |
| + if(UTFmax >= 4) { | |
| + c3 = *(uchar*)(str+3) ^ Tx; | |
| + if(c3 & Testx) | |
| + goto bad; | |
| + if(c < T5) { | |
| + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c… | |
| + if(l <= Rune3) | |
| + goto bad; | |
| + if(l > Runemax) | |
| + goto bad; | |
| + *rune = l; | |
| + return 4; | |
| + } | |
| + } | |
| + | |
| + /* | |
| * bad decoding | |
| */ | |
| bad: | |
| @@ -113,7 +135,7 @@ runetochar(char *str, Rune *rune) | |
| /* | |
| * two character sequence | |
| - * 0080-07FF => T2 Tx | |
| + * 00080-007FF => T2 Tx | |
| */ | |
| if(c <= Rune2) { | |
| str[0] = T2 | (c >> 1*Bitx); | |
| @@ -123,12 +145,26 @@ runetochar(char *str, Rune *rune) | |
| /* | |
| * three character sequence | |
| - * 0800-FFFF => T3 Tx Tx | |
| + * 00800-0FFFF => T3 Tx Tx | |
| */ | |
| - str[0] = T3 | (c >> 2*Bitx); | |
| - str[1] = Tx | ((c >> 1*Bitx) & Maskx); | |
| - str[2] = Tx | (c & Maskx); | |
| - return 3; | |
| + if(c > Runemax) | |
| + c = Runeerror; | |
| + if(c <= Rune3) { | |
| + str[0] = T3 | (c >> 2*Bitx); | |
| + str[1] = Tx | ((c >> 1*Bitx) & Maskx); | |
| + str[2] = Tx | (c & Maskx); | |
| + return 3; | |
| + } | |
| + | |
| + /* | |
| + * four character sequence | |
| + * 010000-1FFFFF => T4 Tx Tx Tx | |
| + */ | |
| + str[0] = T4 | (c >> 3*Bitx); | |
| + str[1] = Tx | ((c >> 2*Bitx) & Maskx); | |
| + str[2] = Tx | ((c >> 1*Bitx) & Maskx); | |
| + str[3] = Tx | (c & Maskx); | |
| + return 4; | |
| } | |
| int | |
| @@ -155,7 +191,10 @@ runenlen(Rune *r, int nrune) | |
| if(c <= Rune2) | |
| nb += 2; | |
| else | |
| + if(c <= Rune3 || c > Runemax) | |
| nb += 3; | |
| + else | |
| + nb += 4; | |
| } | |
| return nb; | |
| } | |
| @@ -165,13 +204,14 @@ fullrune(char *str, int n) | |
| { | |
| int c; | |
| - if(n > 0) { | |
| - c = *(uchar*)str; | |
| - if(c < Tx) | |
| - return 1; | |
| - if(n > 1) | |
| - if(c < T3 || n > 2) | |
| - return 1; | |
| - } | |
| - return 0; | |
| + if(n <= 0) | |
| + return 0; | |
| + c = *(uchar*)str; | |
| + if(c < Tx) | |
| + return 1; | |
| + if(c < T3) | |
| + return n >= 2; | |
| + if(UTFmax == 3 || c < T4) | |
| + return n >= 3; | |
| + return n >= 4; | |
| } | |
| diff --git a/src/libbio/bgetrune.c b/src/libbio/bgetrune.c | |
| @@ -7,7 +7,7 @@ Bgetrune(Biobuf *bp) | |
| { | |
| int c, i; | |
| Rune rune; | |
| - char str[4]; | |
| + char str[UTFmax]; | |
| c = Bgetc(bp); | |
| if(c < Runeself) { /* one char */ | |
| diff --git a/src/libbio/bputrune.c b/src/libbio/bputrune.c | |
| @@ -6,7 +6,7 @@ int | |
| Bputrune(Biobuf *bp, long c) | |
| { | |
| Rune rune; | |
| - char str[4]; | |
| + char str[UTFmax]; | |
| int n; | |
| rune = c; |