tutf.c - plan9port - [fork] Plan 9 from user space | |
git clone git://src.adamsgaard.dk/plan9port | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
tutf.c (5985B) | |
--- | |
1 #ifdef PLAN9 | |
2 #include <u.h> | |
3 #include <libc.h> | |
4 #include <bio.h> | |
5 #ifdef PLAN9PORT | |
6 #include <errno.h> | |
7 #else | |
8 extern int errno; | |
9 #endif | |
10 #else | |
11 #include <sys/types.h> | |
12 #include <stdio.h> | |
13 #include <stdlib.h> | |
14 #include <string.h> | |
15 #include <unistd.h> | |
16 #include <errno.h> | |
17 #include "plan9.h" | |
18 #endif | |
19 #include "hdr.h" | |
20 #ifndef EILSEQ | |
21 #define EILSEQ 9998 | |
22 #endif | |
23 | |
24 /* | |
25 the our_* routines are implementations for the corresponding lib… | |
26 routines. for a while, i tried to actually name them wctomb etc | |
27 but stopped that after i found a system which made wchar_t an | |
28 unsigned char. | |
29 */ | |
30 | |
/* Forward declarations for the helpers defined later in this file. */
/* our_wctomb: encode one code point into s; returns the byte count. */
31 int our_wctomb(char *s, unsigned long wc); | |
/* our_mbtowc: decode one sequence of at most n bytes from s into *p; -1 on a bad sequence. */
32 int our_mbtowc(unsigned long *p, char *s, unsigned n); | |
/* The three iso* routines below forward to runetochar, fullrune and chartorune. */
33 int runetoisoutf(char *str, Rune *rune); | |
34 int fullisorune(char *str, int n); | |
35 int isochartorune(Rune *rune, char *str); | |
36 | |
/*
 * Input converter: read bytes from fd, decode them with our_mbtowc
 * and hand the resulting runes to OUT(out, ...).
 *
 * Bytes that cannot be decoded yet stay at the front of buf and are
 * completed by the next read.  A bad sequence is reported when squawk
 * is set; it is then either skipped (clean set) or counted in nerrors
 * and replaced by Runeerror.  ninput is advanced by the number of bytes
 * consumed.  The second argument is unused.
 */
37 void | |
38 utf_in(int fd, long *notused, struct convert *out) | |
39 { | |
40 char buf[N]; | |
41 int i, j, c, n, tot; | |
42 ulong l; | |
43 | |
44 USED(notused); | |
45 tot = 0; | |
/* tot counts the bytes currently held in buf; a negative read result ends the loop */
46 while((n = read(fd, buf+tot, N-tot)) >= 0){ | |
47 tot += n; | |
/*
 * i indexes input bytes, j indexes output runes.  Decoding continues
 * while at least UTFmax bytes remain; the rest of the condition
 * involves n==0 and fullrune() but is cut off in this view.
 */
48 for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(… | |
49 c = our_mbtowc(&l, buf+i, tot-i); | |
50 if(c == -1){ | |
51 if(squawk) | |
52 EPR "%s: bad UTF sequence near b… | |
53 if(clean){ | |
/* clean mode: drop the offending byte and emit nothing for it */
54 i++; | |
55 continue; | |
56 } | |
/* otherwise count the error and emit Runeerror for a single input byte */
57 nerrors++; | |
58 l = Runeerror; | |
59 c = 1; | |
60 } | |
61 runes[j++] = l; | |
62 i += c; | |
63 } | |
64 OUT(out, runes, j); | |
/* keep the undecoded tail for the next round */
65 tot -= i; | |
66 ninput += i; | |
67 if(tot) | |
68 memmove(buf, buf+i, tot); | |
/* read returned 0: nothing more will arrive */
69 if(n == 0) | |
70 break; | |
71 } | |
/* final call with a zero count once input is finished */
72 OUT(out, runes, 0); | |
73 } | |
74 | |
75 void | |
76 utf_out(Rune *base, int n, long *notused) | |
77 { | |
78 char *p; | |
79 Rune *r; | |
80 | |
81 USED(notused); | |
82 nrunes += n; | |
83 for(r = base, p = obuf; n-- > 0; r++){ | |
84 p += our_wctomb(p, *r); | |
85 } | |
86 noutput += p-obuf; | |
87 write(1, obuf, p-obuf); | |
88 } | |
89 | |
/*
 * Input converter with the same structure as a read/decode/emit loop:
 * read bytes from fd, decode them with isochartorune directly into
 * runes[] and hand the result to OUT(out, ...).
 *
 * Decoding of a buffer stops at the first position where fullisorune
 * reports an incomplete sequence; those bytes are kept for the next
 * read.  A decode that yields Runeerror with length 1 is treated as a
 * bad sequence: reported when squawk is set, then either skipped
 * (clean set) or counted in nerrors and passed through as Runeerror.
 * ninput is advanced by the number of bytes consumed.  The second
 * argument is unused.
 */
90 void | |
91 isoutf_in(int fd, long *notused, struct convert *out) | |
92 { | |
93 char buf[N]; | |
94 int i, j, c, n, tot; | |
95 | |
96 USED(notused); | |
97 tot = 0; | |
/* tot counts the bytes currently held in buf; a negative read result ends the loop */
98 while((n = read(fd, buf+tot, N-tot)) >= 0){ | |
99 tot += n; | |
/* i indexes input bytes, j indexes output runes */
100 for(i=j=0; i<tot; ){ | |
101 if(!fullisorune(buf+i, tot-i)) | |
102 break; | |
103 c = isochartorune(&runes[j], buf+i); | |
104 if(runes[j] == Runeerror && c == 1){ | |
105 if(squawk) | |
106 EPR "%s: bad UTF sequence near b… | |
107 if(clean){ | |
/* skip the byte without advancing j, so runes[j] is overwritten by the next decode */
108 i++; | |
109 continue; | |
110 } | |
111 nerrors++; | |
112 } | |
113 j++; | |
114 i += c; | |
115 } | |
116 OUT(out, runes, j); | |
/* keep the undecoded tail for the next round */
117 tot -= i; | |
118 ninput += i; | |
119 if(tot) | |
120 memmove(buf, buf+i, tot); | |
/* read returned 0: nothing more will arrive */
121 if(n == 0) | |
122 break; | |
123 } | |
/* final call with a zero count once input is finished */
124 OUT(out, runes, 0); | |
125 } | |
126 | |
127 void | |
128 isoutf_out(Rune *base, int n, long *notused) | |
129 { | |
130 char *p; | |
131 Rune *r; | |
132 | |
133 USED(notused); | |
134 nrunes += n; | |
135 for(r = base, p = obuf; n-- > 0; r++) | |
136 p += runetoisoutf(p, r); | |
137 noutput += p-obuf; | |
138 write(1, obuf, p-obuf); | |
139 } | |
140 | |
141 | |
142 int | |
143 isochartorune(Rune *rune, char *str) | |
144 { | |
145 return chartorune(rune, str); | |
146 } | |
147 | |
148 int | |
149 runetoisoutf(char *str, Rune *rune) | |
150 { | |
151 return runetochar(str, rune); | |
152 } | |
153 | |
/*
 * Ask fullrune about the first n bytes of str and pass its answer
 * back unchanged.
 */
int
fullisorune(char *str, int n)
{
	int complete;

	complete = fullrune(str, n);
	return complete;
}
159 | |
/*
 * Byte-layout constants used by our_wctomb and our_mbtowc, grouped by
 * sequence length.  For each kind of byte:
 *	T*	tag bits set at the top of the byte
 *	Bit*	number of payload bits the byte carries
 *	Mask*	mask selecting those payload bits
 *	WcharN	largest value representable in an N-byte sequence
 */
enum
{
	/* continuation byte: 10xxxxxx */
	Tx	= 0x80,
	Bitx	= 6,
	Maskx	= (1<<Bitx)-1,

	/* 1-byte sequence: 0xxxxxxx */
	T1	= 0x00,
	Bit1	= 7,
	Mask1	= (1<<Bit1)-1,
	Wchar1	= (1UL<<Bit1)-1,

	/* 2-byte lead: 110xxxxx */
	T2	= 0xC0,
	Bit2	= 5,
	Mask2	= (1<<Bit2)-1,
	Wchar2	= (1UL<<(Bit2+Bitx))-1,

	/* 3-byte lead: 1110xxxx */
	T3	= 0xE0,
	Bit3	= 4,
	Mask3	= (1<<Bit3)-1,
	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,

	/* 4-byte lead: 11110xxx */
	T4	= 0xF0,
	Bit4	= 3,
	Mask4	= (1<<Bit4)-1,
	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,

	/* 5-byte lead: 111110xx */
	T5	= 0xF8,
	Bit5	= 2,
	Mask5	= (1<<Bit5)-1,
	Wchar5	= (1UL<<(Bit5+4*Bitx))-1,

	/* 6-byte lead: 111111xx; it carries two payload bits, like the 5-byte lead */
	T6	= 0xFC,
	Bit6	= 2,
	Mask6	= (1<<Bit6)-1
};
192 | |
193 int | |
194 our_wctomb(char *s, unsigned long wc) | |
195 { | |
196 if(s == 0) | |
197 return 0; /* no shift states */ | |
198 if(wc & ~Wchar2) { | |
199 if(wc & ~Wchar4) { | |
200 if(wc & ~Wchar5) { | |
201 /* 6 bytes */ | |
202 s[0] = T6 | ((wc >> 5*Bitx) & Mask6); | |
203 s[1] = Tx | ((wc >> 4*Bitx) & Maskx); | |
204 s[2] = Tx | ((wc >> 3*Bitx) & Maskx); | |
205 s[3] = Tx | ((wc >> 2*Bitx) & Maskx); | |
206 s[4] = Tx | ((wc >> 1*Bitx) & Maskx); | |
207 s[5] = Tx | (wc & Maskx); | |
208 return 6; | |
209 } | |
210 /* 5 bytes */ | |
211 s[0] = T5 | (wc >> 4*Bitx); | |
212 s[1] = Tx | ((wc >> 3*Bitx) & Maskx); | |
213 s[2] = Tx | ((wc >> 2*Bitx) & Maskx); | |
214 s[3] = Tx | ((wc >> 1*Bitx) & Maskx); | |
215 s[4] = Tx | (wc & Maskx); | |
216 return 5; | |
217 } | |
218 if(wc & ~Wchar3) { | |
219 /* 4 bytes */ | |
220 s[0] = T4 | (wc >> 3*Bitx); | |
221 s[1] = Tx | ((wc >> 2*Bitx) & Maskx); | |
222 s[2] = Tx | ((wc >> 1*Bitx) & Maskx); | |
223 s[3] = Tx | (wc & Maskx); | |
224 return 4; | |
225 } | |
226 /* 3 bytes */ | |
227 s[0] = T3 | (wc >> 2*Bitx); | |
228 s[1] = Tx | ((wc >> 1*Bitx) & Maskx); | |
229 s[2] = Tx | (wc & Maskx); | |
230 return 3; | |
231 } | |
232 if(wc & ~Wchar1) { | |
233 /* 2 bytes */ | |
234 s[0] = T2 | (wc >> 1*Bitx); | |
235 s[1] = Tx | (wc & Maskx); | |
236 return 2; | |
237 } | |
238 /* 1 byte */ | |
239 s[0] = T1 | wc; | |
240 return 1; | |
241 } | |
242 | |
243 int | |
244 our_mbtowc(unsigned long *p, char *s, unsigned n) | |
245 { | |
246 uchar *us; | |
247 int c0, c1, c2, c3, c4, c5; | |
248 unsigned long wc; | |
249 | |
250 if(s == 0) | |
251 return 0; /* no shift states */ | |
252 | |
253 if(n < 1) | |
254 goto bad; | |
255 us = (uchar*)s; | |
256 c0 = us[0]; | |
257 if(c0 >= T3) { | |
258 if(n < 3) | |
259 goto bad; | |
260 c1 = us[1] ^ Tx; | |
261 c2 = us[2] ^ Tx; | |
262 if((c1|c2) & T2) | |
263 goto bad; | |
264 if(c0 >= T5) { | |
265 if(n < 5) | |
266 goto bad; | |
267 c3 = us[3] ^ Tx; | |
268 c4 = us[4] ^ Tx; | |
269 if((c3|c4) & T2) | |
270 goto bad; | |
271 if(c0 >= T6) { | |
272 /* 6 bytes */ | |
273 if(n < 6) | |
274 goto bad; | |
275 c5 = us[5] ^ Tx; | |
276 if(c5 & T2) | |
277 goto bad; | |
278 wc = ((((((((((c0 & Mask6) << Bitx) | | |
279 c1) << Bitx) | c2) << Bitx) | | |
280 c3) << Bitx) | c4) << Bitx) | c5; | |
281 if(wc <= Wchar5) | |
282 goto bad; | |
283 *p = wc; | |
284 return 6; | |
285 } | |
286 /* 5 bytes */ | |
287 wc = ((((((((c0 & Mask5) << Bitx) | | |
288 c1) << Bitx) | c2) << Bitx) | | |
289 c3) << Bitx) | c4; | |
290 if(wc <= Wchar4) | |
291 goto bad; | |
292 *p = wc; | |
293 return 5; | |
294 } | |
295 if(c0 >= T4) { | |
296 /* 4 bytes */ | |
297 if(n < 4) | |
298 goto bad; | |
299 c3 = us[3] ^ Tx; | |
300 if(c3 & T2) | |
301 goto bad; | |
302 wc = ((((((c0 & Mask4) << Bitx) | | |
303 c1) << Bitx) | c2) << Bitx) | | |
304 c3; | |
305 if(wc <= Wchar3) | |
306 goto bad; | |
307 *p = wc; | |
308 return 4; | |
309 } | |
310 /* 3 bytes */ | |
311 wc = ((((c0 & Mask3) << Bitx) | | |
312 c1) << Bitx) | c2; | |
313 if(wc <= Wchar2) | |
314 goto bad; | |
315 *p = wc; | |
316 return 3; | |
317 } | |
318 if(c0 >= T2) { | |
319 /* 2 bytes */ | |
320 if(n < 2) | |
321 goto bad; | |
322 c1 = us[1] ^ Tx; | |
323 if(c1 & T2) | |
324 goto bad; | |
325 wc = ((c0 & Mask2) << Bitx) | | |
326 c1; | |
327 if(wc <= Wchar1) | |
328 goto bad; | |
329 *p = wc; | |
330 return 2; | |
331 } | |
332 /* 1 byte */ | |
333 if(c0 >= Tx) | |
334 goto bad; | |
335 *p = c0; | |
336 return 1; | |
337 | |
338 bad: | |
339 errno = EILSEQ; | |
340 return -1; | |
341 } |