xml.c - webdump - HTML to plain-text converter for webpages | |
git clone git://git.codemadness.org/webdump | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
xml.c (11720B) | |
--- | |
1 #include <errno.h> | |
2 #include <stdio.h> | |
3 #include <stdlib.h> | |
4 #include <string.h> | |
5 | |
6 #include "xml.h" | |
7 | |
8 /* ifdef for HTML mode. To differentiate xml.c and webdump HTML changes … | |
9 #define HTML_MODE | |
10 | |
/* ASCII-only character classification that does not depend on the locale.
 * The argument is fully parenthesized so that any expression can be passed;
 * without that the (unsigned) cast binds only to the first operand.
 *
 * ISALPHA(c): non-zero for 'A'-'Z' and 'a'-'z'. OR-ing with 32 folds upper
 *             case onto lower case; the unsigned subtraction wraps for
 *             everything below 'a', so one comparison covers both bounds.
 * ISSPACE(c): non-zero for ' ' and the range '\t'..'\r' (9..13).
 *             NOTE: evaluates its argument twice. */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
13 | |
/* Parse the attributes of the tag currently held in x->tag.
 *
 * Reads characters with GETNEXT() until '>' or EOF.
 * - The attribute name is collected in x->name, limited to
 *   sizeof(x->name) - 1 bytes; extra name bytes are dropped.
 * - '?' is ignored; '=' terminates the name and marks the start of a value.
 * - An attribute without a value is reported through the xmlattrstart,
 *   xmlattr and xmlattrend callbacks (each only when set).
 * - A value may be quoted with ' or ", or unquoted; an unquoted value uses
 *   ' ' as end separator, which per the in-line comment stands for ISSPACE().
 * - Value bytes are buffered in x->data and handed to xmlattr in chunks when
 *   the buffer is full, before an entity, and at the end of the value.
 * - '&' inside a value starts an entity that is collected up to ';' and then
 *   passed to a separate callback (its name is cut off in this listing;
 *   presumably xmlattrentity - TODO confirm against xml.h).
 * - '/' sets x->isshorttag and discards the name collected so far.
 *
 * NOTE(review): several lines of this listing are truncated ("…"), so the
 * exact conditions on those lines could not be checked. */
14 static void | |
15 xml_parseattrs(XMLParser *x) | |
16 { | |
17 size_t namelen = 0, valuelen; | |
18 int c, endsep, endname = 0, valuestart = 0; | |
19 | |
20 while ((c = GETNEXT()) != EOF) { | |
21 if (ISSPACE(c)) { | |
22 if (namelen) | |
23 endname = 1; | |
24 continue; | |
25 } else if (c == '?') | |
26 ; /* ignore */ | |
27 else if (c == '=') { | |
28 x->name[namelen] = '\0'; | |
29 valuestart = 1; | |
30 endname = 1; | |
31 } else if (namelen && ((endname && !valuestart && ISALPH… | |
32 /* attribute without value */ | |
33 x->name[namelen] = '\0'; | |
34 if (x->xmlattrstart) | |
35 x->xmlattrstart(x, x->tag, x->taglen, x-… | |
36 if (x->xmlattr) | |
37 x->xmlattr(x, x->tag, x->taglen, x->name… | |
38 if (x->xmlattrend) | |
39 x->xmlattrend(x, x->tag, x->taglen, x->n… | |
40 endname = 0; | |
41 x->name[0] = c; | |
42 namelen = 1; | |
43 } else if (namelen && valuestart) { | |
44 /* attribute with value */ | |
45 if (x->xmlattrstart) | |
46 x->xmlattrstart(x, x->tag, x->taglen, x-… | |
47 | |
48 valuelen = 0; | |
49 if (c == '\'' || c == '"') { | |
50 endsep = c; | |
51 } else { | |
52 endsep = ' '; /* ISSPACE() */ | |
53 goto startvalue; | |
54 } | |
55 | |
56 while ((c = GETNEXT()) != EOF) { | |
57 startvalue: | |
58 if (c == '&') { /* entities */ | |
59 x->data[valuelen] = '\0'; | |
60 /* call data function with data … | |
61 if (valuelen && x->xmlattr) | |
62 x->xmlattr(x, x->tag, x-… | |
63 x->data[0] = c; | |
64 valuelen = 1; | |
65 while ((c = GETNEXT()) != EOF) { | |
66 if (c == endsep || (ends… | |
67 break; | |
68 if (valuelen < sizeof(x-… | |
69 x->data[valuelen… | |
70 else { | |
71 /* entity too lo… | |
72 x->data[valuelen… | |
73 if (x->xmlattr) | |
74 x->xmlat… | |
75 x->data[0] = c; | |
76 valuelen = 1; | |
77 break; | |
78 } | |
79 if (c == ';') { | |
80 x->data[valuelen… | |
81 if (x->xmlattren… | |
82 x->xmlat… | |
83 valuelen = 0; | |
84 break; | |
85 } | |
86 } | |
87 } else if (c != endsep && !(endsep == ' … | |
88 if (valuelen < sizeof(x->data) -… | |
89 x->data[valuelen++] = c; | |
90 } else { | |
91 x->data[valuelen] = '\0'; | |
92 if (x->xmlattr) | |
93 x->xmlattr(x, x-… | |
94 x->data[0] = c; | |
95 valuelen = 1; | |
96 } | |
97 } | |
98 if (c == endsep || (endsep == ' ' && (c … | |
99 x->data[valuelen] = '\0'; | |
100 if (x->xmlattr) | |
101 x->xmlattr(x, x->tag, x-… | |
102 if (x->xmlattrend) | |
103 x->xmlattrend(x, x->tag,… | |
104 break; | |
105 } | |
106 } | |
107 namelen = endname = valuestart = 0; | |
108 } else if (namelen < sizeof(x->name) - 1) { | |
109 x->name[namelen++] = c; | |
110 } | |
111 if (c == '>') { | |
112 break; | |
113 } else if (c == '/') { | |
114 x->isshorttag = 1; | |
115 x->name[0] = '\0'; | |
116 namelen = 0; | |
117 } | |
118 } | |
119 } | |
120 | |
121 static void | |
122 xml_parsecomment(XMLParser *x) | |
123 { | |
124 size_t datalen = 0, i = 0; | |
125 int c; | |
126 | |
127 if (x->xmlcommentstart) | |
128 x->xmlcommentstart(x); | |
129 while ((c = GETNEXT()) != EOF) { | |
130 if (c == '-' || c == '>') { | |
131 if (x->xmlcomment && datalen) { | |
132 x->data[datalen] = '\0'; | |
133 x->xmlcomment(x, x->data, datalen); | |
134 datalen = 0; | |
135 } | |
136 } | |
137 | |
138 if (c == '-') { | |
139 if (++i > 2) { | |
140 if (x->xmlcomment) | |
141 for (; i > 2; i--) | |
142 x->xmlcomment(x, "-", 1); | |
143 i = 2; | |
144 } | |
145 continue; | |
146 } else if (c == '>' && i == 2) { | |
147 if (x->xmlcommentend) | |
148 x->xmlcommentend(x); | |
149 return; | |
150 } else if (i) { | |
151 if (x->xmlcomment) { | |
152 for (; i > 0; i--) | |
153 x->xmlcomment(x, "-", 1); | |
154 } | |
155 i = 0; | |
156 } | |
157 | |
158 if (datalen < sizeof(x->data) - 1) { | |
159 x->data[datalen++] = c; | |
160 } else { | |
161 x->data[datalen] = '\0'; | |
162 if (x->xmlcomment) | |
163 x->xmlcomment(x, x->data, datalen); | |
164 x->data[0] = c; | |
165 datalen = 1; | |
166 } | |
167 } | |
168 } | |
169 | |
170 static void | |
171 xml_parsecdata(XMLParser *x) | |
172 { | |
173 size_t datalen = 0, i = 0; | |
174 int c; | |
175 | |
176 if (x->xmlcdatastart) | |
177 x->xmlcdatastart(x); | |
178 while ((c = GETNEXT()) != EOF) { | |
179 if (c == ']' || c == '>') { | |
180 if (x->xmlcdata && datalen) { | |
181 x->data[datalen] = '\0'; | |
182 x->xmlcdata(x, x->data, datalen); | |
183 datalen = 0; | |
184 } | |
185 } | |
186 | |
187 if (c == ']') { | |
188 if (++i > 2) { | |
189 if (x->xmlcdata) | |
190 for (; i > 2; i--) | |
191 x->xmlcdata(x, "]", 1); | |
192 i = 2; | |
193 } | |
194 continue; | |
195 } else if (c == '>' && i == 2) { | |
196 if (x->xmlcdataend) | |
197 x->xmlcdataend(x); | |
198 return; | |
199 } else if (i) { | |
200 if (x->xmlcdata) | |
201 for (; i > 0; i--) | |
202 x->xmlcdata(x, "]", 1); | |
203 i = 0; | |
204 } | |
205 | |
206 if (datalen < sizeof(x->data) - 1) { | |
207 x->data[datalen++] = c; | |
208 } else { | |
209 x->data[datalen] = '\0'; | |
210 if (x->xmlcdata) | |
211 x->xmlcdata(x, x->data, datalen); | |
212 x->data[0] = c; | |
213 datalen = 1; | |
214 } | |
215 } | |
216 } | |
217 | |
/* Encode Unicode code point r as UTF-8 into s.
 *
 * r: code point to encode.
 * s: output buffer with room for at least 4 bytes; it is NOT NUL-terminated
 *    by this function.
 *
 * Returns the number of bytes written (1-4). Returns 0 and writes nothing
 * when r is 0 (NUL byte) or when r cannot be represented in UTF-8: negative
 * values, values above 0x10FFFF and the surrogate range 0xD800-0xDFFF. */
static int
codepointtoutf8(long r, char *s)
{
	if (r <= 0 || r > 0x10FFFF || (r >= 0xD800 && r <= 0xDFFF)) {
		/* NUL byte, or not a Unicode scalar value: emit nothing rather
		 * than a truncated or malformed byte sequence */
		return 0;
	} else if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		s[0] = (char)r;
		return 1;
	} else if (r <= 0x07FF) {
		/* 2 bytes: 00000aaa aabbbbbb */
		s[0] = (char)(0xC0 | ((r & 0x0007C0) >> 6)); /* 110aaaaa */
		s[1] = (char)(0x80 | (r & 0x00003F)); /* 10bbbbbb */
		return 2;
	} else if (r <= 0xFFFF) {
		/* 3 bytes: aaaabbbb bbcccccc */
		s[0] = (char)(0xE0 | ((r & 0x00F000) >> 12)); /* 1110aaaa */
		s[1] = (char)(0x80 | ((r & 0x000FC0) >> 6)); /* 10bbbbbb */
		s[2] = (char)(0x80 | (r & 0x00003F)); /* 10cccccc */
		return 3;
	} else {
		/* 4 bytes: 000aaabb bbbbcccc ccdddddd */
		s[0] = (char)(0xF0 | ((r & 0x1C0000) >> 18)); /* 11110aaa */
		s[1] = (char)(0x80 | ((r & 0x03F000) >> 12)); /* 10bbbbbb */
		s[2] = (char)(0x80 | ((r & 0x000FC0) >> 6)); /* 10cccccc */
		s[3] = (char)(0x80 | (r & 0x00003F)); /* 10dddddd */
		return 4;
	}
}
247 | |
/* One entry of the named entity table: an entity name and the code point it
 * stands for. */
struct namedentity {
	const char *entity; /* entity name, compared with strcmp() */
	long cp;            /* code point, encoded with codepointtoutf8() */
};

/* Comparison callback for bsearch(): orders two struct namedentity objects
 * by their entity name using strcmp(). Only the entity member is read, so a
 * search key needs no valid cp.
 *
 * The arguments stay const-qualified; the table that is searched is const
 * and the callback must never write through these pointers. */
static int
namedentitycmp(const void *v1, const void *v2)
{
	const struct namedentity *n1 = v1;
	const struct namedentity *n2 = v2;

	return strcmp(n1->entity, n2->entity);
}
261 | |
/* Table of named entities; the initializers come from namedentities.h, which
 * is not part of this listing. namedentitytostr() searches the table with
 * bsearch() and namedentitycmp(), so the entries must be sorted by entity
 * name in strcmp() order. */
262 static const struct namedentity entities[] = { | |
263 #include "namedentities.h" | |
264 }; | |
265 | |
/* Convert a named entity to its UTF-8 string.
 *
 * e:      entity name to look up; xml_entitytostr() passes the text that
 *         follows the leading '&'.
 *         NOTE(review): whether a trailing ';' is part of the name depends
 *         on how the names are spelled in namedentities.h - confirm.
 * buf:    receives the UTF-8 bytes followed by a NUL byte.
 * bufsiz: size of buf; must be at least 5 (up to 4 UTF-8 bytes plus NUL).
 *
 * The name is searched in the entities table with bsearch(); the match is
 * exact as decided by strcmp().
 * Returns the byte length of the string written to buf (without the NUL),
 * or -1 if buf is too small or the name is not in the table. */
266 static int | |
267 namedentitytostr(const char *e, char *buf, size_t bufsiz) | |
268 { | |
269 struct namedentity find, *found; | |
270 size_t i; | |
271 | |
272 /* buffer is too small */ | |
273 if (bufsiz < 5) | |
274 return -1; | |
275 | |
276 find.entity = e; | |
277 found = bsearch(&find, entities, sizeof(entities) / sizeof(*enti… | |
278 sizeof(*entities), namedentitycmp); | |
279 if (found) { | |
280 i = codepointtoutf8(found->cp, buf); | |
281 buf[i] = '\0'; | |
282 return i; | |
283 } | |
284 return -1; | |
285 } | |
286 | |
/* Convert the body of a numeric character reference to its UTF-8 string.
 *
 * e:      text following "&#": either decimal digits, or 'x' / 'X' followed
 *         by hexadecimal digits, in both cases terminated by ';'.
 * buf:    receives the UTF-8 bytes followed by a NUL byte.
 * bufsiz: size of buf; must be at least 5 (up to 4 UTF-8 bytes plus NUL).
 *
 * Returns the byte length of the string written to buf (without the NUL;
 * 0 for code point 0), or -1 if buf is too small, the reference is not
 * well-formed, or the value is not a valid code point (above 0x10ffff or a
 * surrogate). */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	const char *digits;
	char *end;
	size_t n;
	long l;
	int base, len;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* hex (16) or decimal (10); HTML allows both "&#x" and "&#X" */
	if (*e == 'x' || *e == 'X') {
		e++;
		base = 16;
		digits = "0123456789abcdefABCDEF";
	} else {
		base = 10;
		digits = "0123456789";
	}

	/* Require one or more digits directly followed by ';'. strtol() on its
	 * own would also accept leading whitespace, a sign and, for base 16,
	 * a "0x" prefix, none of which belong in a character reference. */
	n = strspn(e, digits);
	if (n == 0 || e[n] != ';')
		return -1;

	errno = 0;
	l = strtol(e, &end, base);
	/* value does not fit in a long, or is not a valid code point */
	if (errno || end != e + n || l < 0 || l > 0x10ffff ||
	    (l >= 0xd800 && l <= 0xdfff))
		return -1;

	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
313 | |
/* Convert an entity string, named ("&name...") or numeric ("&#..."), to a
 * UTF-8 string in buf.
 *
 * e must start with '&'. A '#' right after it selects the numeric
 * conversion, anything else the named entity lookup. Both conversions
 * require bufsiz to be at least 5.
 * Returns the byte-length of the string written to buf, or -1 on failure. */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* not an entity at all */
	if (*e != '&')
		return -1;

	e++; /* step over '&' */

	return (*e == '#')
	    ? numericentitytostr(e + 1, buf, bufsiz) /* numeric entity */
	    : namedentitytostr(e, buf, bufsiz);      /* named entity */
}
328 | |
/* Main parser loop: read the input with GETNEXT() until EOF and report what
 * is found through the callbacks set in x (each is called only when set).
 *
 * - With HTML_MODE defined, parsing starts directly at the read_data label,
 *   so text in front of the first tag is processed as data. Without it,
 *   everything up to the first '<' is skipped.
 * - "<!" introduces a comment or CDATA section; once the opener has been
 *   recognized, xml_parsecomment() or xml_parsecdata() takes over. Any other
 *   "<!...>" construct is read up to '>' and ignored.
 * - Otherwise a tag is parsed. Its name is collected in x->tag / x->taglen.
 *   A leading '?' (processing instruction) or a '/' inside the tag marks it
 *   as a short tag; a leading '/' marks an end tag. For a start tag the
 *   attributes are handled by xml_parseattrs() when the name is followed by
 *   whitespace. End tags and short tags trigger xmltagend.
 * - Text between tags is buffered in x->data and reported with xmldatastart,
 *   xmldata (in chunks, whenever the buffer is full) and xmldataend. '&'
 *   starts an entity that is collected up to ';' and passed to a separate
 *   callback (its name is cut off in this listing; presumably xmldataentity
 *   - TODO confirm against xml.h).
 * - With HTML_MODE defined, data still buffered when the input ends is
 *   flushed through xmldata and xmldataend.
 *
 * NOTE(review): several lines of this listing are truncated ("…"), so the
 * exact conditions and callback arguments on those lines could not be
 * checked. */
329 void | |
330 xml_parse(XMLParser *x) | |
331 { | |
332 size_t datalen, tagdatalen; | |
333 int c, isend; | |
334 | |
335 #ifdef HTML_MODE | |
336 goto read_data; | |
337 #else | |
338 /* HTML: process data before a tag occured aswell */ | |
339 while ((c = GETNEXT()) != EOF && c != '<') | |
340 ; /* skip until < */ | |
341 #endif | |
342 | |
343 while (c != EOF) { | |
344 if (c == '<') { /* parse tag */ | |
345 if ((c = GETNEXT()) == EOF) | |
346 return; | |
347 | |
348 if (c == '!') { /* CDATA and comments */ | |
349 for (tagdatalen = 0; (c = GETNEXT()) != … | |
350 /* NOTE: sizeof(x->data) must be… | |
351 if (tagdatalen <= sizeof("[CDATA… | |
352 x->data[tagdatalen++] = … | |
353 if (c == '>') | |
354 break; | |
355 else if (c == '-' && tagdatalen … | |
356 (x->data[0] == '… | |
357 xml_parsecomment(x); | |
358 break; | |
359 } else if (c == '[') { | |
360 if (tagdatalen == sizeof… | |
361 !strncmp(x->data, "[… | |
362 xml_parsecdata(x… | |
363 break; | |
364 } | |
365 } | |
366 } | |
367 } else { | |
368 /* normal tag (open, short open, close),… | |
369 x->tag[0] = c; | |
370 x->taglen = 1; | |
371 x->isshorttag = isend = 0; | |
372 | |
373 /* treat processing instruction as short… | |
374 if (c == '?') { | |
375 x->isshorttag = 1; | |
376 } else if (c == '/') { | |
377 if ((c = GETNEXT()) == EOF) | |
378 return; | |
379 x->tag[0] = c; | |
380 isend = 1; | |
381 } | |
382 | |
383 while ((c = GETNEXT()) != EOF) { | |
384 if (c == '/') | |
385 x->isshorttag = 1; /* sh… | |
386 else if (c == '>' || ISSPACE(c))… | |
387 x->tag[x->taglen] = '\0'; | |
388 if (isend) { /* end tag,… | |
389 while (c != '>' … | |
390 c = GETN… | |
391 if (x->xmltagend) | |
392 x->xmlta… | |
393 x->tag[0] = '\0'; | |
394 x->taglen = 0; | |
395 } else { | |
396 /* start tag */ | |
397 if (x->xmltagsta… | |
398 x->xmlta… | |
399 if (ISSPACE(c)) | |
400 xml_pars… | |
401 if (x->xmltagsta… | |
402 x->xmlta… | |
403 } | |
404 /* call tagend for short… | |
405 if (x->isshorttag) { | |
406 if (x->xmltagend) | |
407 x->xmlta… | |
408 x->tag[0] = '\0'; | |
409 x->taglen = 0; | |
410 } | |
411 break; | |
412 } else if (x->taglen < sizeof(x-… | |
413 x->tag[x->taglen++] = c;… | |
414 } | |
415 } | |
416 } else { | |
417 #ifdef HTML_MODE | |
418 read_data: | |
419 #endif | |
420 /* parse tag data */ | |
421 datalen = 0; | |
422 if (x->xmldatastart) | |
423 x->xmldatastart(x); | |
424 while ((c = GETNEXT()) != EOF) { | |
425 if (c == '&') { | |
426 if (datalen) { | |
427 x->data[datalen] = '\0'; | |
428 if (x->xmldata) | |
429 x->xmldata(x, x-… | |
430 } | |
431 x->data[0] = c; | |
432 datalen = 1; | |
433 while ((c = GETNEXT()) != EOF) { | |
434 if (c == '<') | |
435 break; | |
436 if (datalen < sizeof(x->… | |
437 x->data[datalen+… | |
438 else { | |
439 /* entity too lo… | |
440 x->data[datalen]… | |
441 if (x->xmldata) | |
442 x->xmlda… | |
443 x->data[0] = c; | |
444 datalen = 1; | |
445 break; | |
446 } | |
447 if (c == ';') { | |
448 x->data[datalen]… | |
449 if (x->xmldataen… | |
450 x->xmlda… | |
451 datalen = 0; | |
452 break; | |
453 } | |
454 } | |
455 } else if (c != '<') { | |
456 if (datalen < sizeof(x->data) - … | |
457 x->data[datalen++] = c; | |
458 } else { | |
459 x->data[datalen] = '\0'; | |
460 if (x->xmldata) | |
461 x->xmldata(x, x-… | |
462 x->data[0] = c; | |
463 datalen = 1; | |
464 } | |
465 } | |
466 if (c == '<') { | |
467 x->data[datalen] = '\0'; | |
468 if (x->xmldata && datalen) | |
469 x->xmldata(x, x->data, d… | |
470 if (x->xmldataend) | |
471 x->xmldataend(x); | |
472 #ifdef HTML_MODE | |
473 datalen = 0; | |
474 #endif | |
475 break; | |
476 } | |
477 } | |
478 | |
479 #ifdef HTML_MODE | |
480 /* pending data, even if a tag didn't close (EOF… | |
481 if (datalen) { | |
482 x->data[datalen] = '\0'; | |
483 if (x->xmldata && datalen) | |
484 x->xmldata(x, x->data, datalen); | |
485 if (x->xmldataend) | |
486 x->xmldataend(x); | |
487 datalen = 0; | |
488 } | |
489 #endif | |
490 } | |
491 } | |
492 } | |