Introduction
Introduction Statistics Contact Development Disclaimer Help
xml.c - webdump - HTML to plain-text converter for webpages
git clone git://git.codemadness.org/webdump
Log
Files
Refs
README
LICENSE
---
xml.c (11720B)
---
1 #include <errno.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5
6 #include "xml.h"
7
8 /* ifdef for HTML mode. To differentiate xml.c and webdump HTML changes */
9 #define HTML_MODE
10
/*
 * Locale-independent ASCII classification helpers.
 *
 * ISALPHA(c): non-zero for 'A'-'Z' and 'a'-'z'. OR-ing with 32 folds upper
 *             case onto lower case; the unsigned subtraction makes everything
 *             below 'a' wrap to a large value, so one compare is enough.
 * ISSPACE(c): non-zero for ' ' and the five characters '\t' .. '\r'.
 *
 * The argument is fully parenthesized so that the cast applies to the whole
 * expression (e.g. a conditional expression) and not only to its first
 * operand. ISSPACE() evaluates its argument twice: do not pass expressions
 * with side effects.
 */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
13
/*
 * Parse the attributes of the tag currently held in x->tag.
 *
 * Characters are read with GETNEXT() until EOF or a '>' is seen. Attribute
 * names are collected in x->name (characters beyond sizeof(x->name) - 1 are
 * dropped), attribute values are buffered in x->data and handed to the
 * x->xmlattr callback, also whenever that buffer fills up. A '/' marks the
 * tag as a short tag (x->isshorttag) and discards the pending name.
 * All callbacks are optional and are only called when non-NULL.
 *
 * NOTE(review): several lines of this listing are cut off at the "…" marker,
 * so some conditions and argument lists are not visible here.
 */
14 static void
15 xml_parseattrs(XMLParser *x)
16 {
17 size_t namelen = 0, valuelen;
18 int c, endsep, endname = 0, valuestart = 0;
19
20 while ((c = GETNEXT()) != EOF) {
/* whitespace ends a name that has already been started */
21 if (ISSPACE(c)) {
22 if (namelen)
23 endname = 1;
24 continue;
25 } else if (c == '?')
26 ; /* ignore */
/* '=' terminates the name, a value is expected next */
27 else if (c == '=') {
28 x->name[namelen] = '\0';
29 valuestart = 1;
30 endname = 1;
31 } else if (namelen && ((endname && !valuestart && ISALPH…
32 /* attribute without value */
33 x->name[namelen] = '\0';
34 if (x->xmlattrstart)
35 x->xmlattrstart(x, x->tag, x->taglen, x-…
36 if (x->xmlattr)
37 x->xmlattr(x, x->tag, x->taglen, x->name…
38 if (x->xmlattrend)
39 x->xmlattrend(x, x->tag, x->taglen, x->n…
40 endname = 0;
41 x->name[0] = c;
42 namelen = 1;
43 } else if (namelen && valuestart) {
44 /* attribute with value */
45 if (x->xmlattrstart)
46 x->xmlattrstart(x, x->tag, x->taglen, x-…
47
48 valuelen = 0;
/* a quoted value ends at the same quote character; for an unquoted
 * value the current character already belongs to the value, so the
 * loop below is entered without reading another character */
49 if (c == '\'' || c == '"') {
50 endsep = c;
51 } else {
52 endsep = ' '; /* ISSPACE() */
53 goto startvalue;
54 }
55
56 while ((c = GETNEXT()) != EOF) {
57 startvalue:
58 if (c == '&') { /* entities */
59 x->data[valuelen] = '\0';
60 /* call data function with data …
61 if (valuelen && x->xmlattr)
62 x->xmlattr(x, x->tag, x-…
63 x->data[0] = c;
64 valuelen = 1;
65 while ((c = GETNEXT()) != EOF) {
66 if (c == endsep || (ends…
67 break;
68 if (valuelen < sizeof(x-…
69 x->data[valuelen…
70 else {
71 /* entity too lo…
72 x->data[valuelen…
73 if (x->xmlattr)
74 x->xmlat…
75 x->data[0] = c;
76 valuelen = 1;
77 break;
78 }
79 if (c == ';') {
80 x->data[valuelen…
81 if (x->xmlattren…
82 x->xmlat…
83 valuelen = 0;
84 break;
85 }
86 }
87 } else if (c != endsep && !(endsep == ' …
88 if (valuelen < sizeof(x->data) -…
89 x->data[valuelen++] = c;
90 } else {
91 x->data[valuelen] = '\0';
92 if (x->xmlattr)
93 x->xmlattr(x, x-…
94 x->data[0] = c;
95 valuelen = 1;
96 }
97 }
98 if (c == endsep || (endsep == ' ' && (c …
99 x->data[valuelen] = '\0';
100 if (x->xmlattr)
101 x->xmlattr(x, x->tag, x-…
102 if (x->xmlattrend)
103 x->xmlattrend(x, x->tag,…
104 break;
105 }
106 }
107 namelen = endname = valuestart = 0;
108 } else if (namelen < sizeof(x->name) - 1) {
109 x->name[namelen++] = c;
110 }
111 if (c == '>') {
112 break;
113 } else if (c == '/') {
114 x->isshorttag = 1;
115 x->name[0] = '\0';
116 namelen = 0;
117 }
118 }
119 }
120
121 static void
122 xml_parsecomment(XMLParser *x)
123 {
124 size_t datalen = 0, i = 0;
125 int c;
126
127 if (x->xmlcommentstart)
128 x->xmlcommentstart(x);
129 while ((c = GETNEXT()) != EOF) {
130 if (c == '-' || c == '>') {
131 if (x->xmlcomment && datalen) {
132 x->data[datalen] = '\0';
133 x->xmlcomment(x, x->data, datalen);
134 datalen = 0;
135 }
136 }
137
138 if (c == '-') {
139 if (++i > 2) {
140 if (x->xmlcomment)
141 for (; i > 2; i--)
142 x->xmlcomment(x, "-", 1);
143 i = 2;
144 }
145 continue;
146 } else if (c == '>' && i == 2) {
147 if (x->xmlcommentend)
148 x->xmlcommentend(x);
149 return;
150 } else if (i) {
151 if (x->xmlcomment) {
152 for (; i > 0; i--)
153 x->xmlcomment(x, "-", 1);
154 }
155 i = 0;
156 }
157
158 if (datalen < sizeof(x->data) - 1) {
159 x->data[datalen++] = c;
160 } else {
161 x->data[datalen] = '\0';
162 if (x->xmlcomment)
163 x->xmlcomment(x, x->data, datalen);
164 x->data[0] = c;
165 datalen = 1;
166 }
167 }
168 }
169
170 static void
171 xml_parsecdata(XMLParser *x)
172 {
173 size_t datalen = 0, i = 0;
174 int c;
175
176 if (x->xmlcdatastart)
177 x->xmlcdatastart(x);
178 while ((c = GETNEXT()) != EOF) {
179 if (c == ']' || c == '>') {
180 if (x->xmlcdata && datalen) {
181 x->data[datalen] = '\0';
182 x->xmlcdata(x, x->data, datalen);
183 datalen = 0;
184 }
185 }
186
187 if (c == ']') {
188 if (++i > 2) {
189 if (x->xmlcdata)
190 for (; i > 2; i--)
191 x->xmlcdata(x, "]", 1);
192 i = 2;
193 }
194 continue;
195 } else if (c == '>' && i == 2) {
196 if (x->xmlcdataend)
197 x->xmlcdataend(x);
198 return;
199 } else if (i) {
200 if (x->xmlcdata)
201 for (; i > 0; i--)
202 x->xmlcdata(x, "]", 1);
203 i = 0;
204 }
205
206 if (datalen < sizeof(x->data) - 1) {
207 x->data[datalen++] = c;
208 } else {
209 x->data[datalen] = '\0';
210 if (x->xmlcdata)
211 x->xmlcdata(x, x->data, datalen);
212 x->data[0] = c;
213 datalen = 1;
214 }
215 }
216 }
217
/*
 * Encode the Unicode code point `r` as UTF-8 into `s`.
 *
 * r - code point to encode
 * s - output buffer with room for at least 4 bytes; it is NOT NUL-terminated
 *     by this function
 *
 * Returns the number of bytes written (1 to 4). Returns 0 and writes nothing
 * for NUL and for values that have no valid UTF-8 encoding: negative numbers,
 * the UTF-16 surrogate range U+D800..U+DFFF and anything above U+10FFFF.
 */
static int
codepointtoutf8(long r, char *s)
{
	/* NUL byte, or not a Unicode scalar value */
	if (r <= 0 || r > 0x10FFFF || (r >= 0xD800 && r <= 0xDFFF))
		return 0;

	if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		s[0] = (char)r;
		return 1;
	}
	if (r <= 0x07FF) {
		/* 2 bytes: 00000aaa aabbbbbb */
		s[0] = (char)(0xC0 | ((r >> 6) & 0x1F)); /* 110aaaaa */
		s[1] = (char)(0x80 | (r & 0x3F));        /* 10bbbbbb */
		return 2;
	}
	if (r <= 0xFFFF) {
		/* 3 bytes: aaaabbbb bbcccccc */
		s[0] = (char)(0xE0 | ((r >> 12) & 0x0F)); /* 1110aaaa */
		s[1] = (char)(0x80 | ((r >> 6) & 0x3F));  /* 10bbbbbb */
		s[2] = (char)(0x80 | (r & 0x3F));         /* 10cccccc */
		return 3;
	}
	/* 4 bytes: 000aaabb bbbbcccc ccdddddd */
	s[0] = (char)(0xF0 | ((r >> 18) & 0x07)); /* 11110aaa */
	s[1] = (char)(0x80 | ((r >> 12) & 0x3F)); /* 10bbbbbb */
	s[2] = (char)(0x80 | ((r >> 6) & 0x3F));  /* 10cccccc */
	s[3] = (char)(0x80 | (r & 0x3F));         /* 10dddddd */
	return 4;
}
247
/* One row of the named entity lookup table. */
248 struct namedentity {
249 const char *entity; /* lookup key, compared with strcmp() in namedentitycmp() */
250 long cp; /* code point that is passed to codepointtoutf8() on a match */
251 };
252
253 static int
254 namedentitycmp(const void *v1, const void *v2)
255 {
256 struct namedentity *n1 = (struct namedentity *)v1;
257 struct namedentity *n2 = (struct namedentity *)v2;
258
259 return strcmp(n1->entity, n2->entity);
260 }
261
/* Named entity table. It is searched with bsearch() and namedentitycmp(),
 * so the rows of the included file have to be sorted in strcmp() order of
 * the entity name. */
262 static const struct namedentity entities[] = {
263 #include "namedentities.h"
264 };
265
266 static int
267 namedentitytostr(const char *e, char *buf, size_t bufsiz)
268 {
269 struct namedentity find, *found;
270 size_t i;
271
272 /* buffer is too small */
273 if (bufsiz < 5)
274 return -1;
275
276 find.entity = e;
277 found = bsearch(&find, entities, sizeof(entities) / sizeof(*enti…
278 sizeof(*entities), namedentitycmp);
279 if (found) {
280 i = codepointtoutf8(found->cp, buf);
281 buf[i] = '\0';
282 return i;
283 }
284 return -1;
285 }
286
/*
 * Convert the body of a numeric character reference to UTF-8.
 *
 * e      - the text that follows "&#", for example "65;" or "x41;"; the
 *          number has to be terminated by ';'
 * buf    - receives the NUL-terminated UTF-8 sequence
 * bufsiz - size of buf, must be at least 5 (4 bytes of UTF-8 and the NUL)
 *
 * Returns the number of bytes written (excluding the NUL), or -1 when the
 * buffer is too small, the reference is not well-formed or the value is not
 * a valid code point (out of range or a UTF-16 surrogate).
 */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	long l;
	int len, base = 10;
	char *end;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* hex (16) or decimal (10): HTML accepts both "&#x" and "&#X" */
	if (*e == 'x' || *e == 'X') {
		base = 16;
		e++;
	}

	/* strtol() skips leading whitespace and accepts a sign and, for
	 * base 16, a "0x" prefix. None of these belong in a character
	 * reference: require a leading digit and refuse the prefix. */
	if (!((*e >= '0' && *e <= '9') ||
	      (base == 16 && ((*e >= 'a' && *e <= 'f') ||
	                      (*e >= 'A' && *e <= 'F')))))
		return -1;
	if (base == 16 && e[0] == '0' && (e[1] == 'x' || e[1] == 'X'))
		return -1;

	errno = 0;
	l = strtol(e, &end, base);
	/* invalid value or not a well-formed entity or invalid code point */
	if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
	    (l >= 0xd800 && l <= 0xdfff))
		return -1;
	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
313
/* Convert an entity string, named ("&name...") or numeric ("&#..."), to a
 * string in buf. The text after "&#" is handled by numericentitytostr(), the
 * text after "&" by namedentitytostr().
 * Returns the byte-length of the string or -1 on failure. */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	if (e[0] == '&') {
		if (e[1] == '#')
			return numericentitytostr(&e[2], buf, bufsiz);
		return namedentitytostr(&e[1], buf, bufsiz);
	}

	/* not an entity: it does not start with & */
	return -1;
}
328
/*
 * Main parser loop: read characters with GETNEXT() until EOF and report
 * what is found through the optional callbacks of x.
 *
 * After a '<':
 *  - "<!" is scanned further and handed to xml_parsecomment() or
 *    xml_parsecdata();
 *  - anything else is a tag: the name is collected in x->tag / x->taglen,
 *    a leading '/' marks an end tag, a leading '?' or a '/' inside the tag
 *    marks a short tag (x->isshorttag), for which the tag end callback is
 *    called as well.
 * Everything else is data: it is buffered in x->data and passed to
 * x->xmldata, also whenever the buffer fills up. Text starting with '&' is
 * buffered separately as an entity. A '<' flushes the pending data and calls
 * x->xmldataend.
 *
 * With HTML_MODE defined parsing starts directly at the data branch, so text
 * in front of the first tag is reported too, and data that is still pending
 * at EOF is flushed.
 *
 * NOTE(review): several lines of this listing are cut off at the "…" marker,
 * so some conditions and argument lists are not visible here.
 */
329 void
330 xml_parse(XMLParser *x)
331 {
332 size_t datalen, tagdatalen;
333 int c, isend;
334
335 #ifdef HTML_MODE
/* c is not set yet at this point: the data loop at read_data assigns it
 * with GETNEXT() before it is looked at */
336 goto read_data;
337 #else
338 /* HTML: process data before a tag occurred as well */
339 while ((c = GETNEXT()) != EOF && c != '<')
340 ; /* skip until < */
341 #endif
342
343 while (c != EOF) {
344 if (c == '<') { /* parse tag */
345 if ((c = GETNEXT()) == EOF)
346 return;
347
348 if (c == '!') { /* CDATA and comments */
349 for (tagdatalen = 0; (c = GETNEXT()) != …
350 /* NOTE: sizeof(x->data) must be…
351 if (tagdatalen <= sizeof("[CDATA…
352 x->data[tagdatalen++] = …
353 if (c == '>')
354 break;
355 else if (c == '-' && tagdatalen …
356 (x->data[0] == '…
357 xml_parsecomment(x);
358 break;
359 } else if (c == '[') {
360 if (tagdatalen == sizeof…
361 !strncmp(x->data, "[…
362 xml_parsecdata(x…
363 break;
364 }
365 }
366 }
367 } else {
368 /* normal tag (open, short open, close),…
369 x->tag[0] = c;
370 x->taglen = 1;
371 x->isshorttag = isend = 0;
372
373 /* treat processing instruction as short…
374 if (c == '?') {
375 x->isshorttag = 1;
376 } else if (c == '/') {
377 if ((c = GETNEXT()) == EOF)
378 return;
379 x->tag[0] = c;
380 isend = 1;
381 }
382
383 while ((c = GETNEXT()) != EOF) {
384 if (c == '/')
385 x->isshorttag = 1; /* sh…
386 else if (c == '>' || ISSPACE(c))…
387 x->tag[x->taglen] = '\0';
388 if (isend) { /* end tag,…
389 while (c != '>' …
390 c = GETN…
391 if (x->xmltagend)
392 x->xmlta…
393 x->tag[0] = '\0';
394 x->taglen = 0;
395 } else {
396 /* start tag */
397 if (x->xmltagsta…
398 x->xmlta…
399 if (ISSPACE(c))
400 xml_pars…
401 if (x->xmltagsta…
402 x->xmlta…
403 }
404 /* call tagend for short…
405 if (x->isshorttag) {
406 if (x->xmltagend)
407 x->xmlta…
408 x->tag[0] = '\0';
409 x->taglen = 0;
410 }
411 break;
412 } else if (x->taglen < sizeof(x-…
413 x->tag[x->taglen++] = c;…
414 }
415 }
416 } else {
417 #ifdef HTML_MODE
418 read_data:
419 #endif
420 /* parse tag data */
421 datalen = 0;
422 if (x->xmldatastart)
423 x->xmldatastart(x);
424 while ((c = GETNEXT()) != EOF) {
425 if (c == '&') {
426 if (datalen) {
427 x->data[datalen] = '\0';
428 if (x->xmldata)
429 x->xmldata(x, x-…
430 }
431 x->data[0] = c;
432 datalen = 1;
433 while ((c = GETNEXT()) != EOF) {
434 if (c == '<')
435 break;
436 if (datalen < sizeof(x->…
437 x->data[datalen+…
438 else {
439 /* entity too lo…
440 x->data[datalen]…
441 if (x->xmldata)
442 x->xmlda…
443 x->data[0] = c;
444 datalen = 1;
445 break;
446 }
447 if (c == ';') {
448 x->data[datalen]…
449 if (x->xmldataen…
450 x->xmlda…
451 datalen = 0;
452 break;
453 }
454 }
455 } else if (c != '<') {
456 if (datalen < sizeof(x->data) - …
457 x->data[datalen++] = c;
458 } else {
459 x->data[datalen] = '\0';
460 if (x->xmldata)
461 x->xmldata(x, x-…
462 x->data[0] = c;
463 datalen = 1;
464 }
465 }
466 if (c == '<') {
467 x->data[datalen] = '\0';
468 if (x->xmldata && datalen)
469 x->xmldata(x, x->data, d…
470 if (x->xmldataend)
471 x->xmldataend(x);
472 #ifdef HTML_MODE
473 datalen = 0;
474 #endif
475 break;
476 }
477 }
478
479 #ifdef HTML_MODE
480 /* pending data, even if a tag didn't close (EOF…
481 if (datalen) {
482 x->data[datalen] = '\0';
483 if (x->xmldata && datalen)
484 x->xmldata(x, x->data, datalen);
485 if (x->xmldataend)
486 x->xmldataend(x);
487 datalen = 0;
488 }
489 #endif
490 }
491 }
492 }
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.