Introduction
Introduction Statistics Contact Development Disclaimer Help
convert to 4-byte UTF-8 and 32-bit Rune - plan9port - [fork] Plan 9 from user …
git clone git://src.adamsgaard.dk/plan9port
Log
Files
Refs
README
LICENSE
---
commit 0cadb4301d18724e7513d7489cb5bebd262c82f1
parent 4dbf255619efac4f0a00e4216d6c999128910df2
Author: Russ Cox <[email protected]>
Date: Fri, 11 Sep 2009 17:03:06 -0400
convert to 4-byte UTF-8 and 32-bit Rune
http://codereview.appspot.com/116075
Diffstat:
M include/fmt.h | 2 +-
M include/utf.h | 7 ++++---
M src/cmd/9term/wind.c | 2 +-
M src/cmd/acme/regx.c | 4 ++--
M src/cmd/sam/cmd.c | 2 +-
M src/cmd/sam/regexp.c | 4 ++--
M src/cmd/sed.c | 4 ++--
M src/cmd/tr.c | 2 +-
M src/cmd/troff/mbwc.c | 4 ++--
M src/cmd/unicode.c | 6 +++---
M src/lib9/fmt/dofmt.c | 11 ++++++-----
M src/lib9/utf/rune.c | 78 +++++++++++++++++++++++------…
M src/libbio/bgetrune.c | 2 +-
M src/libbio/bputrune.c | 2 +-
14 files changed, 86 insertions(+), 44 deletions(-)
---
diff --git a/include/fmt.h b/include/fmt.h
@@ -30,7 +30,7 @@ struct Fmt{
void *farg; /* to make flush a closure */
int nfmt; /* num chars formatted so far …
va_list args; /* args passed to dofmt */
- int r; /* % format Rune */
+ Rune r; /* % format Rune */
int width;
int prec;
unsigned long flags;
diff --git a/include/utf.h b/include/utf.h
@@ -4,14 +4,15 @@
extern "C" {
#endif
-typedef unsigned short Rune; /* 16 bits */
+typedef unsigned int Rune; /* 32 bits */
enum
{
- UTFmax = 3, /* maximum bytes per rune */
+ UTFmax = 4, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a U…
Runeself = 0x80, /* rune and UTF sequences are t…
- Runeerror = 0xFFFD /* decoding error in UTF */
+ Runeerror = 0xFFFD, /* decoding error in UTF */
+ Runemax = 0x10FFFF /* maximum rune value */
};
/* Edit .+1,/^$/ | cfn $PLAN9/src/lib9/utf/?*.c | grep -v static |grep -v __ */
diff --git a/src/cmd/9term/wind.c b/src/cmd/9term/wind.c
@@ -193,7 +193,7 @@ winctl(void *arg)
Rune *rp, *bp, *up, *kbdr;
uint qh;
int nr, nb, c, wid, i, npart, initial, lastb;
- char *s, *t, part[3];
+ char *s, *t, part[UTFmax];
Window *w;
Mousestate *mp, m;
enum { WKey, WMouse, WMouseread, WCtl, WCwrite, WCread, WWread, NWALT …
diff --git a/src/cmd/acme/regx.c b/src/cmd/acme/regx.c
@@ -488,7 +488,7 @@ bldcclass(void)
exprp++; /* eat '-' */
if((c2 = nextrec()) == ']')
goto Error;
- classp[n+0] = 0xFFFF;
+ classp[n+0] = Runemax;
classp[n+1] = c1;
classp[n+2] = c2;
n += 3;
@@ -510,7 +510,7 @@ classmatch(int classno, int c, int negate)
p = class[classno];
while(*p){
- if(*p == 0xFFFF){
+ if(*p == Runemax){
if(p[1]<=c && c<=p[2])
return !negate;
p += 3;
diff --git a/src/cmd/sam/cmd.c b/src/cmd/sam/cmd.c
@@ -71,7 +71,7 @@ int
inputc(void)
{
int n, nbuf;
- char buf[3];
+ char buf[UTFmax];
Rune r;
Again:
diff --git a/src/cmd/sam/regexp.c b/src/cmd/sam/regexp.c
@@ -494,7 +494,7 @@ bldcclass(void)
exprp++; /* eat '-' */
if((c2 = nextrec()) == ']')
goto Error;
- classp[n+0] = 0xFFFF;
+ classp[n+0] = Runemax;
classp[n+1] = c1;
classp[n+2] = c2;
n += 3;
@@ -516,7 +516,7 @@ classmatch(int classno, int c, int negate)
p = class[classno];
while(*p){
- if(*p == 0xFFFF){
+ if(*p == Runemax){
if(p[1]<=c && c<=p[2])
return !negate;
p += 3;
diff --git a/src/cmd/sed.c b/src/cmd/sed.c
@@ -615,7 +615,7 @@ compsub(Rune *rhs, Rune *end)
while ((r = *cp++) != '\0') {
if(r == '\\') {
if (rhs < end)
- *rhs++ = 0xFFFF;
+ *rhs++ = Runemax;
else
return 0;
r = *cp++;
@@ -1050,7 +1050,7 @@ dosub(Rune *rhsbuf)
sp = place(sp, loc1, loc2);
continue;
}
- if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB+'0') {
+ if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB+'0') {
n = c-'0';
if (subexp[n].s.rsp && subexp[n].e.rep) {
sp = place(sp, subexp[n].s.rsp, subexp[n].e.re…
diff --git a/src/cmd/tr.c b/src/cmd/tr.c
@@ -15,7 +15,7 @@ uchar bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 };
#define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07])
#define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07])
-#define MAXRUNE 0xFFFF
+#define MAXRUNE Runemax
uchar f[(MAXRUNE+1)/8];
uchar t[(MAXRUNE+1)/8];
diff --git a/src/cmd/troff/mbwc.c b/src/cmd/troff/mbwc.c
@@ -152,9 +152,9 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n)
if(p+d <= pe+3) {
*p++ = buf[0];
if(d > 1) {
- *p++ = buf[2];
+ *p++ = buf[1];
if(d > 2)
- *p++ = buf[3];
+ *p++ = buf[2];
}
}
if(c == 0)
diff --git a/src/cmd/unicode.c b/src/cmd/unicode.c
@@ -51,13 +51,13 @@ range(char *argv[])
return "bad range";
}
min = strtoul(q, &q, 16);
- if(min<0 || min>0xFFFF || *q!='-')
+ if(min<0 || min>Runemax || *q!='-')
goto err;
q++;
if(strchr(hex, *q) == 0)
goto err;
max = strtoul(q, &q, 16);
- if(max<0 || max>0xFFFF || max<min || *q!=0)
+ if(max<0 || max>Runemax || max<min || *q!=0)
goto err;
i = 0;
do{
@@ -111,7 +111,7 @@ chars(char *argv[])
return "bad char";
}
m = strtoul(q, &q, 16);
- if(m<0 || m>0xFFFF || *q!=0)
+ if(m<0 || m>Runemax || *q!=0)
goto err;
Bprint(&bout, "%C", m);
if(!text)
diff --git a/src/lib9/fmt/dofmt.c b/src/lib9/fmt/dofmt.c
@@ -605,12 +605,13 @@ __flagfmt(Fmt *f)
int
__badfmt(Fmt *f)
{
- char x[3];
+ char x[2+UTFmax];
+ int n;
x[0] = '%';
- x[1] = f->r;
- x[2] = '%';
- f->prec = 3;
- __fmtcpy(f, (const void*)x, 3, 3);
+ n = 1 + runetochar(x+1, &f->r);
+ x[n++] = '%';
+ f->prec = n;
+ __fmtcpy(f, (const void*)x, n, n);
return 0;
}
diff --git a/src/lib9/utf/rune.c b/src/lib9/utf/rune.c
@@ -23,16 +23,19 @@ enum
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
+ Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
+ T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
- Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 …
- Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 …
- Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 …
+ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 …
+ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 …
+ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 …
+ Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 …
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
@@ -43,7 +46,7 @@ enum
int
chartorune(Rune *rune, char *str)
{
- int c, c1, c2;
+ int c, c1, c2, c3;
long l;
/*
@@ -89,6 +92,25 @@ chartorune(Rune *rune, char *str)
}
/*
+ * four character sequence
+ * 10000-10FFFF => T4 Tx Tx Tx
+ */
+ if(UTFmax >= 4) {
+ c3 = *(uchar*)(str+3) ^ Tx;
+ if(c3 & Testx)
+ goto bad;
+ if(c < T5) {
+ l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c…
+ if(l <= Rune3)
+ goto bad;
+ if(l > Runemax)
+ goto bad;
+ *rune = l;
+ return 4;
+ }
+ }
+
+ /*
* bad decoding
*/
bad:
@@ -113,7 +135,7 @@ runetochar(char *str, Rune *rune)
/*
* two character sequence
- * 0080-07FF => T2 Tx
+ * 00080-007FF => T2 Tx
*/
if(c <= Rune2) {
str[0] = T2 | (c >> 1*Bitx);
@@ -123,12 +145,26 @@ runetochar(char *str, Rune *rune)
/*
* three character sequence
- * 0800-FFFF => T3 Tx Tx
+ * 00800-0FFFF => T3 Tx Tx
*/
- str[0] = T3 | (c >> 2*Bitx);
- str[1] = Tx | ((c >> 1*Bitx) & Maskx);
- str[2] = Tx | (c & Maskx);
- return 3;
+ if(c > Runemax)
+ c = Runeerror;
+ if(c <= Rune3) {
+ str[0] = T3 | (c >> 2*Bitx);
+ str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[2] = Tx | (c & Maskx);
+ return 3;
+ }
+
+ /*
+ * four character sequence
+ * 010000-1FFFFF => T4 Tx Tx Tx
+ */
+ str[0] = T4 | (c >> 3*Bitx);
+ str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+ str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[3] = Tx | (c & Maskx);
+ return 4;
}
int
@@ -155,7 +191,10 @@ runenlen(Rune *r, int nrune)
if(c <= Rune2)
nb += 2;
else
+ if(c <= Rune3 || c > Runemax)
nb += 3;
+ else
+ nb += 4;
}
return nb;
}
@@ -165,13 +204,14 @@ fullrune(char *str, int n)
{
int c;
- if(n > 0) {
- c = *(uchar*)str;
- if(c < Tx)
- return 1;
- if(n > 1)
- if(c < T3 || n > 2)
- return 1;
- }
- return 0;
+ if(n <= 0)
+ return 0;
+ c = *(uchar*)str;
+ if(c < Tx)
+ return 1;
+ if(c < T3)
+ return n >= 2;
+ if(UTFmax == 3 || c < T4)
+ return n >= 3;
+ return n >= 4;
}
diff --git a/src/libbio/bgetrune.c b/src/libbio/bgetrune.c
@@ -7,7 +7,7 @@ Bgetrune(Biobuf *bp)
{
int c, i;
Rune rune;
- char str[4];
+ char str[UTFmax];
c = Bgetc(bp);
if(c < Runeself) { /* one char */
diff --git a/src/libbio/bputrune.c b/src/libbio/bputrune.c
@@ -6,7 +6,7 @@ int
Bputrune(Biobuf *bp, long c)
{
Rune rune;
- char str[4];
+ char str[UTFmax];
int n;
rune = c;
You are viewing proxied material from mx1.adamsgaard.dk. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.