xml.c - sfeed - RSS and Atom parser | |
git clone git://git.codemadness.org/sfeed | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
xml.c (10090B) | |
--- | |
1 #include <errno.h> | |
2 #include <stdio.h> | |
3 #include <stdlib.h> | |
4 #include <string.h> | |
5 | |
6 #include "xml.h" | |
7 | |
/* Locale-independent ASCII classification helpers.
 * The argument is fully parenthesized so that any expression (for example a
 * conditional expression) is converted to unsigned as a whole before the
 * range test. The argument may be evaluated more than once (ISSPACE). */

/* ASCII letter: OR-ing with 32 folds 'A'-'Z' onto 'a'-'z'; the unsigned
 * subtraction wraps for anything below 'a', so one compare covers the range. */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
/* ASCII whitespace: space, or one of the five codes '\t' (9) to '\r' (13). */
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
10 | |
/* Parse the attributes of the tag currently stored in x->tag.
 *
 * Characters are read with GETNEXT() until EOF or until a '>' is seen.
 *
 * - Attribute name characters are collected in x->name; once the buffer holds
 *   sizeof(x->name) - 1 characters further name characters are dropped.
 * - Whitespace after at least one name character marks the end of the name,
 *   '?' is ignored and '=' NUL-terminates the name and marks the start of a
 *   value.
 * - An attribute without a value is reported through the optional callbacks
 *   xmlattrstart, xmlattr and xmlattrend (each is only called when set).
 * - An attribute value may be quoted with ' or "; otherwise the terminator is
 *   set to ' ', which the code marks as standing for ISSPACE().
 *   Value bytes are buffered in x->data and handed to xmlattr; when the buffer
 *   is full it is flushed through xmlattr and refilled from the start.
 * - '&' inside a value starts an entity: buffered data is flushed first, then
 *   the entity text is collected up to ';' and passed to a separate callback.
 *   NOTE(review): the callback name and several conditions are cut off in this
 *   listing; presumably it is the attribute entity callback. Confirm against
 *   the full source.
 * - '/' sets x->isshorttag and resets the pending attribute name.
 */
11 static void | |
12 xml_parseattrs(XMLParser *x) | |
13 { | |
14 size_t namelen = 0, valuelen; | |
15 int c, endsep, endname = 0, valuestart = 0; | |
16 | |
17 while ((c = GETNEXT()) != EOF) { | |
18 if (ISSPACE(c)) { | |
19 if (namelen) | |
20 endname = 1; | |
21 continue; | |
22 } else if (c == '?') | |
23 ; /* ignore */ | |
24 else if (c == '=') { | |
25 x->name[namelen] = '\0'; | |
26 valuestart = 1; | |
27 endname = 1; | |
28 } else if (namelen && ((endname && !valuestart && ISALPH… | |
29 /* attribute without value */ | |
30 x->name[namelen] = '\0'; | |
31 if (x->xmlattrstart) | |
32 x->xmlattrstart(x, x->tag, x->taglen, x-… | |
33 if (x->xmlattr) | |
34 x->xmlattr(x, x->tag, x->taglen, x->name… | |
35 if (x->xmlattrend) | |
36 x->xmlattrend(x, x->tag, x->taglen, x->n… | |
37 endname = 0; | |
38 x->name[0] = c; | |
39 namelen = 1; | |
40 } else if (namelen && valuestart) { | |
41 /* attribute with value */ | |
42 if (x->xmlattrstart) | |
43 x->xmlattrstart(x, x->tag, x->taglen, x-… | |
44 | |
45 valuelen = 0; | |
46 if (c == '\'' || c == '"') { | |
47 endsep = c; | |
48 } else { | |
49 endsep = ' '; /* ISSPACE() */ | |
50 goto startvalue; | |
51 } | |
52 | |
53 while ((c = GETNEXT()) != EOF) { | |
54 startvalue: | |
55 if (c == '&') { /* entities */ | |
56 x->data[valuelen] = '\0'; | |
57 /* call data function with data … | |
58 if (valuelen && x->xmlattr) | |
59 x->xmlattr(x, x->tag, x-… | |
60 x->data[0] = c; | |
61 valuelen = 1; | |
62 while ((c = GETNEXT()) != EOF) { | |
63 if (c == endsep || (ends… | |
64 break; | |
65 if (valuelen < sizeof(x-… | |
66 x->data[valuelen… | |
67 else { | |
68 /* entity too lo… | |
69 x->data[valuelen… | |
70 if (x->xmlattr) | |
71 x->xmlat… | |
72 x->data[0] = c; | |
73 valuelen = 1; | |
74 break; | |
75 } | |
76 if (c == ';') { | |
77 x->data[valuelen… | |
78 if (x->xmlattren… | |
79 x->xmlat… | |
80 valuelen = 0; | |
81 break; | |
82 } | |
83 } | |
84 } else if (c != endsep && !(endsep == ' … | |
85 if (valuelen < sizeof(x->data) -… | |
86 x->data[valuelen++] = c; | |
87 } else { | |
88 x->data[valuelen] = '\0'; | |
89 if (x->xmlattr) | |
90 x->xmlattr(x, x-… | |
91 x->data[0] = c; | |
92 valuelen = 1; | |
93 } | |
94 } | |
95 if (c == endsep || (endsep == ' ' && (c … | |
96 x->data[valuelen] = '\0'; | |
97 if (x->xmlattr) | |
98 x->xmlattr(x, x->tag, x-… | |
99 if (x->xmlattrend) | |
100 x->xmlattrend(x, x->tag,… | |
101 break; | |
102 } | |
103 } | |
104 namelen = endname = valuestart = 0; | |
105 } else if (namelen < sizeof(x->name) - 1) { | |
106 x->name[namelen++] = c; | |
107 } | |
108 if (c == '>') { | |
109 break; | |
110 } else if (c == '/') { | |
111 x->isshorttag = 1; | |
112 x->name[0] = '\0'; | |
113 namelen = 0; | |
114 } | |
115 } | |
116 } | |
117 | |
118 static void | |
119 xml_parsecomment(XMLParser *x) | |
120 { | |
121 int c, i = 0; | |
122 | |
123 while ((c = GETNEXT()) != EOF) { | |
124 if (c == '-') { | |
125 if (++i > 2) | |
126 i = 2; | |
127 continue; | |
128 } else if (c == '>' && i == 2) { | |
129 return; | |
130 } else if (i) { | |
131 i = 0; | |
132 } | |
133 } | |
134 } | |
135 | |
136 static void | |
137 xml_parsecdata(XMLParser *x) | |
138 { | |
139 size_t datalen = 0, i = 0; | |
140 int c; | |
141 | |
142 while ((c = GETNEXT()) != EOF) { | |
143 if (c == ']' || c == '>') { | |
144 if (x->xmlcdata && datalen) { | |
145 x->data[datalen] = '\0'; | |
146 x->xmlcdata(x, x->data, datalen); | |
147 datalen = 0; | |
148 } | |
149 } | |
150 | |
151 if (c == ']') { | |
152 if (++i > 2) { | |
153 if (x->xmlcdata) | |
154 for (; i > 2; i--) | |
155 x->xmlcdata(x, "]", 1); | |
156 i = 2; | |
157 } | |
158 continue; | |
159 } else if (c == '>' && i == 2) { | |
160 return; | |
161 } else if (i) { | |
162 if (x->xmlcdata) | |
163 for (; i > 0; i--) | |
164 x->xmlcdata(x, "]", 1); | |
165 i = 0; | |
166 } | |
167 | |
168 if (datalen < sizeof(x->data) - 1) { | |
169 x->data[datalen++] = c; | |
170 } else { | |
171 x->data[datalen] = '\0'; | |
172 if (x->xmlcdata) | |
173 x->xmlcdata(x, x->data, datalen); | |
174 x->data[0] = c; | |
175 datalen = 1; | |
176 } | |
177 } | |
178 } | |
179 | |
/* Encode code point `r` as UTF-8 into `s` (no NUL terminator is written).
 * Returns the number of bytes stored (1 to 4), or 0 for the NUL code point,
 * in which case nothing is written. `s` must have room for 4 bytes. */
static int
codepointtoutf8(long r, char *s)
{
	int nbytes, pos;
	long lead, leadmask;

	if (r == 0)
		return 0; /* NUL byte */

	if (r <= 0x7F) {
		/* plain ASCII: 0aaaaaaa */
		s[0] = r;
		return 1;
	}

	/* pick the sequence length and the matching lead byte layout */
	if (r <= 0x07FF) {
		nbytes = 2;
		lead = 0xC0;     /* 110aaaaa */
		leadmask = 0x1F;
	} else if (r <= 0xFFFF) {
		nbytes = 3;
		lead = 0xE0;     /* 1110aaaa */
		leadmask = 0x0F;
	} else {
		nbytes = 4;
		lead = 0xF0;     /* 11110aaa */
		leadmask = 0x07;
	}

	/* continuation bytes (10xxxxxx) are filled from the tail, 6 bits each */
	for (pos = nbytes - 1; pos > 0; pos--) {
		s[pos] = 0x80 | (r & 0x3F);
		r >>= 6;
	}
	s[0] = lead | (r & leadmask);

	return nbytes;
}
209 | |
/* Translate one of the five predefined named entities to its character.
 * `e` is the entity text without the leading '&' but including the trailing
 * ';' (for example "amp;") and must match exactly.
 * On success the character and a NUL terminator are stored in `buf` and 1 is
 * returned; -1 is returned for an unknown name or when bufsiz < 2. */
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* parallel tables: names[k] maps to chars[k] */
	static const char *const names[] = {
		"amp;", "lt;", "gt;", "apos;", "quot;"
	};
	static const char chars[] = {
		'&', '<', '>', '\'', '"'
	};
	size_t k;

	/* need room for one character plus the terminator */
	if (bufsiz < 2)
		return -1;

	for (k = 0; k < sizeof(names) / sizeof(names[0]); k++) {
		if (strcmp(e, names[k]) != 0)
			continue;
		buf[0] = chars[k];
		buf[1] = '\0';
		return 1;
	}

	return -1;
}
238 | |
/* Convert a numeric character reference to a NUL-terminated UTF-8 string.
 * `e` points just past "&#": either decimal digits or 'x' followed by
 * hexadecimal digits, terminated by ';' (text after the ';' is not examined).
 * `buf` must hold at least 5 bytes (up to 4 UTF-8 bytes plus terminator).
 * Returns the byte length written (0 for code point 0), or -1 when the buffer
 * is too small, the reference is malformed, or the value is not a valid code
 * point (above 0x10ffff or a UTF-16 surrogate). */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	const char *p;
	long l = 0;
	int base = 10, digit, len;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* hex (16) or decimal (10) */
	if (*e == 'x') {
		base = 16;
		e++;
	}

	/* Parse the digits by hand rather than with strtol(): strtol() skips
	 * leading whitespace, accepts a sign and (for base 16) a "0x" prefix,
	 * which would let malformed references such as "&# 65;", "&#+65;" or
	 * "&#x0x41;" through. */
	for (p = e; *p != ';'; p++) {
		if (*p >= '0' && *p <= '9')
			digit = *p - '0';
		else if (base == 16 && *p >= 'a' && *p <= 'f')
			digit = *p - 'a' + 10;
		else if (base == 16 && *p >= 'A' && *p <= 'F')
			digit = *p - 'A' + 10;
		else
			return -1; /* not a digit; also catches a missing ';' */

		l = l * base + digit;
		/* reject early: keeps `l` far away from overflowing */
		if (l > 0x10ffff)
			return -1;
	}

	/* no digits at all, or a surrogate code point */
	if (p == e || (l >= 0xd800 && l <= 0xdfff))
		return -1;

	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
265 | |
/* Convert an entity string such as "&amp;" or "&#38;" into `buf`.
 * Input starting with "&#" is handled as a numeric entity, anything else
 * starting with '&' as a named entity.
 * Returns the byte length of the resulting string, or -1 on failure
 * (including input that does not start with '&'). */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* every entity begins with '&' */
	if (e[0] != '&')
		return -1;

	return (e[1] == '#')
	    ? numericentitytostr(e + 2, buf, bufsiz) /* skip "&#" */
	    : namedentitytostr(e + 1, buf, bufsiz);  /* skip "&" */
}
280 | |
/* Parser main loop: read input with GETNEXT() and report what is found
 * through the optional callbacks stored in the XMLParser `x`.
 *
 * Input before the first '<' is skipped. After that, until EOF:
 *
 * - "<!" introduces CDATA or a comment: the following bytes are collected in
 *   x->data and, depending on what they spell, the rest is handed to
 *   xml_parsecomment() or xml_parsecdata(); a '>' ends the construct.
 * - Any other '<' starts a tag. The name is collected in x->tag / x->taglen.
 *   A leading '?' (processing instruction) or a '/' inside the tag sets
 *   x->isshorttag; a leading '/' marks an end tag. When the name ends, an end
 *   tag is skipped up to '>' and the tag-end callback is called; a start tag
 *   triggers the tag-start callbacks and, when the name was followed by
 *   whitespace, attribute parsing. For short tags the tag-end callback is
 *   called as well and x->tag is reset.
 * - Everything else is character data between tags: it is buffered in x->data
 *   and passed to x->xmldata when a '<' is reached or when the buffer is full.
 *   '&' starts an entity, which is collected up to ';' and passed to a
 *   separate callback.
 *
 * The function returns when GETNEXT() yields EOF.
 *
 * NOTE(review): many lines of this listing are cut off, so exact callback
 * names and arguments, buffer limits and some conditions are not visible
 * here. The summary above covers only what can be read; confirm the details
 * against the full source.
 */
281 void | |
282 xml_parse(XMLParser *x) | |
283 { | |
284 size_t datalen, tagdatalen; | |
285 int c, isend; | |
286 | |
287 while ((c = GETNEXT()) != EOF && c != '<') | |
288 ; /* skip until < */ | |
289 | |
290 while (c != EOF) { | |
291 if (c == '<') { /* parse tag */ | |
292 if ((c = GETNEXT()) == EOF) | |
293 return; | |
294 | |
295 if (c == '!') { /* CDATA and comments */ | |
296 for (tagdatalen = 0; (c = GETNEXT()) != … | |
297 /* NOTE: sizeof(x->data) must be… | |
298 if (tagdatalen <= sizeof("[CDATA… | |
299 x->data[tagdatalen++] = … | |
300 if (c == '>') | |
301 break; | |
302 else if (c == '-' && tagdatalen … | |
303 (x->data[0] == '… | |
304 xml_parsecomment(x); | |
305 break; | |
306 } else if (c == '[') { | |
307 if (tagdatalen == sizeof… | |
308 !strncmp(x->data, "[… | |
309 xml_parsecdata(x… | |
310 break; | |
311 } | |
312 } | |
313 } | |
314 } else { | |
315 /* normal tag (open, short open, close),… | |
316 x->tag[0] = c; | |
317 x->taglen = 1; | |
318 x->isshorttag = isend = 0; | |
319 | |
320 /* treat processing instruction as short… | |
321 if (c == '?') { | |
322 x->isshorttag = 1; | |
323 } else if (c == '/') { | |
324 if ((c = GETNEXT()) == EOF) | |
325 return; | |
326 x->tag[0] = c; | |
327 isend = 1; | |
328 } | |
329 | |
330 while ((c = GETNEXT()) != EOF) { | |
331 if (c == '/') | |
332 x->isshorttag = 1; /* sh… | |
333 else if (c == '>' || ISSPACE(c))… | |
334 x->tag[x->taglen] = '\0'; | |
335 if (isend) { /* end tag,… | |
336 while (c != '>' … | |
337 c = GETN… | |
338 if (x->xmltagend) | |
339 x->xmlta… | |
340 x->tag[0] = '\0'; | |
341 x->taglen = 0; | |
342 } else { | |
343 /* start tag */ | |
344 if (x->xmltagsta… | |
345 x->xmlta… | |
346 if (ISSPACE(c)) | |
347 xml_pars… | |
348 if (x->xmltagsta… | |
349 x->xmlta… | |
350 } | |
351 /* call tagend for short… | |
352 if (x->isshorttag) { | |
353 if (x->xmltagend) | |
354 x->xmlta… | |
355 x->tag[0] = '\0'; | |
356 x->taglen = 0; | |
357 } | |
358 break; | |
359 } else if (x->taglen < sizeof(x-… | |
360 x->tag[x->taglen++] = c;… | |
361 } | |
362 } | |
363 } else { | |
364 /* parse tag data */ | |
365 datalen = 0; | |
366 while ((c = GETNEXT()) != EOF) { | |
367 if (c == '&') { | |
368 if (datalen) { | |
369 x->data[datalen] = '\0'; | |
370 if (x->xmldata) | |
371 x->xmldata(x, x-… | |
372 } | |
373 x->data[0] = c; | |
374 datalen = 1; | |
375 while ((c = GETNEXT()) != EOF) { | |
376 if (c == '<') | |
377 break; | |
378 if (datalen < sizeof(x->… | |
379 x->data[datalen+… | |
380 else { | |
381 /* entity too lo… | |
382 x->data[datalen]… | |
383 if (x->xmldata) | |
384 x->xmlda… | |
385 x->data[0] = c; | |
386 datalen = 1; | |
387 break; | |
388 } | |
389 if (c == ';') { | |
390 x->data[datalen]… | |
391 if (x->xmldataen… | |
392 x->xmlda… | |
393 datalen = 0; | |
394 break; | |
395 } | |
396 } | |
397 } else if (c != '<') { | |
398 if (datalen < sizeof(x->data) - … | |
399 x->data[datalen++] = c; | |
400 } else { | |
401 x->data[datalen] = '\0'; | |
402 if (x->xmldata) | |
403 x->xmldata(x, x-… | |
404 x->data[0] = c; | |
405 datalen = 1; | |
406 } | |
407 } | |
408 if (c == '<') { | |
409 x->data[datalen] = '\0'; | |
410 if (x->xmldata && datalen) | |
411 x->xmldata(x, x->data, d… | |
412 break; | |
413 } | |
414 } | |
415 } | |
416 } | |
417 } | |