xml.c - sub - subscene.com subtitle search | |
git clone git://git.codemadness.org/sub | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
xml.c (11059B) | |
--- | |
1 #include <sys/types.h> | |
2 | |
3 #include <ctype.h> | |
4 #include <errno.h> | |
5 #include <limits.h> | |
6 #include <stdio.h> | |
7 #include <stdlib.h> | |
8 #include <string.h> | |
9 | |
10 #include "xml.h" | |
11 | |
12 static void | |
13 xml_parseattrs(XMLParser *x) | |
14 { | |
15 size_t namelen = 0, valuelen; | |
16 int c, endsep, endname = 0, valuestart = 0; | |
17 | |
18 while ((c = x->getnext()) != EOF) { | |
19 if (isspace(c)) { | |
20 if (namelen) | |
21 endname = 1; | |
22 continue; | |
23 } else if (c == '?') | |
24 ; /* ignore */ | |
25 else if (c == '=') { | |
26 x->name[namelen] = '\0'; | |
27 valuestart = 1; | |
28 endname = 1; | |
29 } else if (namelen && ((endname && !valuestart && isalph… | |
30 /* attribute without value */ | |
31 x->name[namelen] = '\0'; | |
32 if (x->xmlattrstart) | |
33 x->xmlattrstart(x, x->tag, x->taglen, x-… | |
34 if (x->xmlattr) | |
35 x->xmlattr(x, x->tag, x->taglen, x->name… | |
36 if (x->xmlattrend) | |
37 x->xmlattrend(x, x->tag, x->taglen, x->n… | |
38 endname = 0; | |
39 x->name[0] = c; | |
40 namelen = 1; | |
41 } else if (namelen && valuestart) { | |
42 /* attribute with value */ | |
43 if (x->xmlattrstart) | |
44 x->xmlattrstart(x, x->tag, x->taglen, x-… | |
45 | |
46 valuelen = 0; | |
47 if (c == '\'' || c == '"') { | |
48 endsep = c; | |
49 } else { | |
50 endsep = ' '; /* isspace() */ | |
51 goto startvalue; | |
52 } | |
53 | |
54 while ((c = x->getnext()) != EOF) { | |
55 startvalue: | |
56 if (c == '&') { /* entities */ | |
57 x->data[valuelen] = '\0'; | |
58 /* call data function with data … | |
59 if (valuelen && x->xmlattr) | |
60 x->xmlattr(x, x->tag, x-… | |
61 x->data[0] = c; | |
62 valuelen = 1; | |
63 while ((c = x->getnext()) != EOF… | |
64 if (c == endsep || (ends… | |
65 break; | |
66 if (valuelen < sizeof(x-… | |
67 x->data[valuelen… | |
68 else { | |
69 /* entity too lo… | |
70 x->data[valuelen… | |
71 if (x->xmlattr) | |
72 x->xmlat… | |
73 x->data[0] = c; | |
74 valuelen = 1; | |
75 break; | |
76 } | |
77 if (c == ';') { | |
78 x->data[valuelen… | |
79 if (x->xmlattren… | |
80 x->xmlat… | |
81 valuelen = 0; | |
82 break; | |
83 } | |
84 } | |
85 } else if (c != endsep && !(endsep == ' … | |
86 if (valuelen < sizeof(x->data) -… | |
87 x->data[valuelen++] = c; | |
88 } else { | |
89 x->data[valuelen] = '\0'; | |
90 if (x->xmlattr) | |
91 x->xmlattr(x, x-… | |
92 x->data[0] = c; | |
93 valuelen = 1; | |
94 } | |
95 } | |
96 if (c == endsep || (endsep == ' ' && (c … | |
97 x->data[valuelen] = '\0'; | |
98 if (x->xmlattr) | |
99 x->xmlattr(x, x->tag, x-… | |
100 if (x->xmlattrend) | |
101 x->xmlattrend(x, x->tag,… | |
102 break; | |
103 } | |
104 } | |
105 namelen = endname = valuestart = 0; | |
106 } else if (namelen < sizeof(x->name) - 1) { | |
107 x->name[namelen++] = c; | |
108 } | |
109 if (c == '>') { | |
110 break; | |
111 } else if (c == '/') { | |
112 x->isshorttag = 1; | |
113 x->name[0] = '\0'; | |
114 namelen = 0; | |
115 } | |
116 } | |
117 } | |
118 | |
119 static void | |
120 xml_parsecomment(XMLParser *x) | |
121 { | |
122 size_t datalen = 0, i = 0; | |
123 int c; | |
124 | |
125 if (x->xmlcommentstart) | |
126 x->xmlcommentstart(x); | |
127 while ((c = x->getnext()) != EOF) { | |
128 if (c == '-' || c == '>') { | |
129 if (x->xmlcomment) { | |
130 x->data[datalen] = '\0'; | |
131 x->xmlcomment(x, x->data, datalen); | |
132 datalen = 0; | |
133 } | |
134 } | |
135 | |
136 if (c == '-') { | |
137 if (++i > 2) { | |
138 if (x->xmlcomment) | |
139 for (; i > 2; i--) | |
140 x->xmlcomment(x, "-", 1); | |
141 i = 2; | |
142 } | |
143 continue; | |
144 } else if (c == '>' && i == 2) { | |
145 if (x->xmlcommentend) | |
146 x->xmlcommentend(x); | |
147 return; | |
148 } else if (i) { | |
149 if (x->xmlcomment) { | |
150 for (; i > 0; i--) | |
151 x->xmlcomment(x, "-", 1); | |
152 } | |
153 i = 0; | |
154 } | |
155 | |
156 if (datalen < sizeof(x->data) - 1) { | |
157 x->data[datalen++] = c; | |
158 } else { | |
159 x->data[datalen] = '\0'; | |
160 if (x->xmlcomment) | |
161 x->xmlcomment(x, x->data, datalen); | |
162 x->data[0] = c; | |
163 datalen = 1; | |
164 } | |
165 } | |
166 } | |
167 | |
168 static void | |
169 xml_parsecdata(XMLParser *x) | |
170 { | |
171 size_t datalen = 0, i = 0; | |
172 int c; | |
173 | |
174 if (x->xmlcdatastart) | |
175 x->xmlcdatastart(x); | |
176 while ((c = x->getnext()) != EOF) { | |
177 if (c == ']' || c == '>') { | |
178 if (x->xmlcdata) { | |
179 x->data[datalen] = '\0'; | |
180 x->xmlcdata(x, x->data, datalen); | |
181 datalen = 0; | |
182 } | |
183 } | |
184 | |
185 if (c == ']') { | |
186 if (++i > 2) { | |
187 if (x->xmlcdata) | |
188 for (; i > 2; i--) | |
189 x->xmlcdata(x, "]", 1); | |
190 i = 2; | |
191 } | |
192 continue; | |
193 } else if (c == '>' && i == 2) { | |
194 if (x->xmlcdataend) | |
195 x->xmlcdataend(x); | |
196 return; | |
197 } else if (i) { | |
198 if (x->xmlcdata) | |
199 for (; i > 0; i--) | |
200 x->xmlcdata(x, "]", 1); | |
201 i = 0; | |
202 } | |
203 | |
204 if (datalen < sizeof(x->data) - 1) { | |
205 x->data[datalen++] = c; | |
206 } else { | |
207 x->data[datalen] = '\0'; | |
208 if (x->xmlcdata) | |
209 x->xmlcdata(x, x->data, datalen); | |
210 x->data[0] = c; | |
211 datalen = 1; | |
212 } | |
213 } | |
214 } | |
215 | |
/*
 * Encode the Unicode code point r as UTF-8 into s.
 *
 * s must have room for at least 4 bytes; no NUL terminator is written.
 *
 * Returns the number of bytes written (1 to 4), or 0 when nothing was
 * written: for r == 0 (NUL), and for values that have no valid UTF-8
 * encoding, i.e. negative values, UTF-16 surrogates (U+D800..U+DFFF) and
 * values above U+10FFFF.
 */
static int
codepointtoutf8(long r, char *s)
{
	/* NUL byte, or not a Unicode scalar value: emit nothing */
	if (r <= 0 || r > 0x10FFFF || (r >= 0xD800 && r <= 0xDFFF))
		return 0;

	if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		s[0] = (char)r;
		return 1;
	} else if (r <= 0x07FF) {
		/* 2 bytes: 00000aaa aabbbbbb */
		s[0] = (char)(0xC0 | ((r & 0x0007C0) >> 6)); /* 110aaaaa */
		s[1] = (char)(0x80 | (r & 0x00003F));        /* 10bbbbbb */
		return 2;
	} else if (r <= 0xFFFF) {
		/* 3 bytes: aaaabbbb bbcccccc */
		s[0] = (char)(0xE0 | ((r & 0x00F000) >> 12)); /* 1110aaaa */
		s[1] = (char)(0x80 | ((r & 0x000FC0) >> 6));  /* 10bbbbbb */
		s[2] = (char)(0x80 | (r & 0x00003F));         /* 10cccccc */
		return 3;
	} else {
		/* 4 bytes: 000aaabb bbbbcccc ccdddddd */
		s[0] = (char)(0xF0 | ((r & 0x1C0000) >> 18)); /* 11110aaa */
		s[1] = (char)(0x80 | ((r & 0x03F000) >> 12)); /* 10bbbbbb */
		s[2] = (char)(0x80 | ((r & 0x000FC0) >> 6));  /* 10cccccc */
		s[3] = (char)(0x80 | (r & 0x00003F));         /* 10dddddd */
		return 4;
	}
}
245 | |
/*
 * Convert a predefined named entity to the single character it stands for.
 *
 * e      NUL-terminated entity text including the leading '&' and the
 *        trailing ';', e.g. "&amp;". The comparison is exact; the table
 *        lists an all-lowercase and an all-uppercase spelling of each name.
 * buf    receives the character followed by a NUL terminator.
 * bufsiz size of buf in bytes; must be at least 2.
 *
 * Returns 1 (the byte length written) on a match, 0 when e is not a known
 * named entity, and -1 when buf is too small.
 */
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
	static const struct {
		const char *entity;
		int c;
	} entities[] = {
		{ "&amp;",  '&'  },
		{ "&lt;",   '<'  },
		{ "&gt;",   '>'  },
		{ "&apos;", '\'' },
		{ "&quot;", '"'  },
		{ "&AMP;",  '&'  },
		{ "&LT;",   '<'  },
		{ "&GT;",   '>'  },
		{ "&APOS;", '\'' },
		{ "&QUOT;", '"'  }
	};
	size_t i;

	/* buffer is too small */
	if (bufsiz < 2)
		return -1;

	/* doesn't start with &: can't match */
	if (*e != '&')
		return 0;

	for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
		if (!strcmp(e, entities[i].entity)) {
			buf[0] = entities[i].c;
			buf[1] = '\0';
			return 1;
		}
	}
	return 0;
}
283 | |
/*
 * Convert a numeric character reference ("&#NNN;" decimal or "&#xHHH;"
 * hexadecimal) to its UTF-8 encoding.
 *
 * e      NUL-terminated entity text including the leading "&#" and the
 *        trailing ';'.
 * buf    receives the UTF-8 bytes followed by a NUL terminator.
 * bufsiz size of buf in bytes; must be at least 5 (4 bytes + NUL).
 *
 * Returns the byte length written (0 to 4), 0 when e is not a well-formed
 * numeric entity or does not denote an encodable code point, and -1 when
 * buf is too small. buf is left untouched when the entity is rejected.
 */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	long l;
	int len;
	char *end;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* not a numeric entity */
	if (e[0] != '&' || e[1] != '#')
		return 0;

	/* e[1] == '#', numeric / hexadecimal entity */
	e += 2; /* skip "&#" */
	errno = 0;
	/* hex (16) or decimal (10); signed parse so that a leading '-' yields
	 * a negative value that can be rejected below */
	if (*e == 'x')
		l = strtol(++e, &end, 16);
	else
		l = strtol(e, &end, 10);
	/* reject: conversion error, no digits at all, missing ';', negative
	 * value, too high a code point, or a UTF-16 surrogate */
	if (errno || end == e || *end != ';' || l < 0 || l > 0x10FFFF ||
	    (l >= 0xD800 && l <= 0xDFFF))
		return 0;
	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
315 | |
/*
 * Convert an entity string, named ("&name;") or numeric ("&#...;"), into
 * buf. The second character of e selects the converter: '#' goes to
 * numericentitytostr(), anything else to namedentitytostr().
 *
 * Returns the result of the selected converter, 0 when e does not start
 * with '&', and -1 when bufsiz is smaller than 5.
 */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	if (bufsiz < 5)
		return -1; /* buffer is too small */
	if (e[0] != '&')
		return 0; /* not an entity */

	return (e[1] == '#')
	    ? numericentitytostr(e, buf, bufsiz)
	    : namedentitytostr(e, buf, bufsiz);
}
333 | |
334 void | |
335 xml_parse(XMLParser *x) | |
336 { | |
337 int c, ispi; | |
338 size_t datalen, tagdatalen, taglen; | |
339 | |
340 if (!x->getnext) | |
341 return; | |
342 while ((c = x->getnext()) != EOF && c != '<') | |
343 ; /* skip until < */ | |
344 | |
345 while (c != EOF) { | |
346 if (c == '<') { /* parse tag */ | |
347 if ((c = x->getnext()) == EOF) | |
348 return; | |
349 | |
350 if (c == '!') { /* cdata and comments */ | |
351 for (tagdatalen = 0; (c = x->getnext()) … | |
352 /* NOTE: sizeof(x->data) must be… | |
353 if (tagdatalen <= sizeof("[CDATA… | |
354 x->data[tagdatalen++] = … | |
355 if (c == '>') | |
356 break; | |
357 else if (c == '-' && tagdatalen … | |
358 (x->data[0] == '… | |
359 xml_parsecomment(x); | |
360 break; | |
361 } else if (c == '[') { | |
362 if (tagdatalen == sizeof… | |
363 !strncmp(x->data, "[… | |
364 xml_parsecdata(x… | |
365 break; | |
366 } | |
367 } | |
368 } | |
369 } else { | |
370 x->tag[0] = '\0'; | |
371 x->taglen = 0; | |
372 | |
373 /* normal tag (open, short open, close),… | |
374 if (isspace(c)) | |
375 while ((c = x->getnext()) != EOF… | |
376 ; | |
377 if (c == EOF) | |
378 return; | |
379 x->tag[0] = c; | |
380 ispi = (c == '?') ? 1 : 0; | |
381 x->isshorttag = ispi; | |
382 taglen = 1; | |
383 while ((c = x->getnext()) != EOF) { | |
384 if (c == '/') | |
385 x->isshorttag = 1; /* sh… | |
386 else if (c == '>' || isspace(c))… | |
387 x->tag[taglen] = '\0'; | |
388 if (x->tag[0] == '/') { … | |
389 x->taglen = --ta… | |
390 if (taglen && x-… | |
391 x->xmlta… | |
392 } else { | |
393 x->taglen = tagl… | |
394 /* start tag */ | |
395 if (x->xmltagsta… | |
396 x->xmlta… | |
397 if (isspace(c)) | |
398 xml_pars… | |
399 if (x->xmltagsta… | |
400 x->xmlta… | |
401 } | |
402 /* call tagend for short… | |
403 if ((x->isshorttag || is… | |
404 x->xmltagend(x, … | |
405 break; | |
406 } else if (taglen < sizeof(x->ta… | |
407 x->tag[taglen++] = c; /*… | |
408 } | |
409 } | |
410 } else { | |
411 /* parse tag data */ | |
412 datalen = 0; | |
413 if (x->xmldatastart) | |
414 x->xmldatastart(x); | |
415 while ((c = x->getnext()) != EOF) { | |
416 if (c == '&') { | |
417 if (datalen) { | |
418 x->data[datalen] = '\0'; | |
419 if (x->xmldata) | |
420 x->xmldata(x, x-… | |
421 } | |
422 x->data[0] = c; | |
423 datalen = 1; | |
424 while ((c = x->getnext()) != EOF… | |
425 if (c == '<') | |
426 break; | |
427 if (datalen < sizeof(x->… | |
428 x->data[datalen+… | |
429 else { | |
430 /* entity too lo… | |
431 x->data[datalen]… | |
432 if (x->xmldata) | |
433 x->xmlda… | |
434 x->data[0] = c; | |
435 datalen = 1; | |
436 break; | |
437 } | |
438 if (c == ';') { | |
439 x->data[datalen]… | |
440 if (x->xmldataen… | |
441 x->xmlda… | |
442 datalen = 0; | |
443 break; | |
444 } | |
445 } | |
446 } else if (c != '<') { | |
447 if (datalen < sizeof(x->data) - … | |
448 x->data[datalen++] = c; | |
449 } else { | |
450 x->data[datalen] = '\0'; | |
451 if (x->xmldata) | |
452 x->xmldata(x, x-… | |
453 x->data[0] = c; | |
454 datalen = 1; | |
455 } | |
456 } | |
457 if (c == '<') { | |
458 x->data[datalen] = '\0'; | |
459 if (x->xmldata && datalen) | |
460 x->xmldata(x, x->data, d… | |
461 if (x->xmldataend) | |
462 x->xmldataend(x); | |
463 break; | |
464 } | |
465 } | |
466 } | |
467 } | |
468 } |