xml.c - grabtitle - stupid HTML title grabber | |
git clone git://git.codemadness.org/grabtitle | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
xml.c (8078B) | |
--- | |
1 #include <errno.h> | |
2 #include <stdio.h> | |
3 #include <stdlib.h> | |
4 #include <string.h> | |
5 | |
6 #include "xml.h" | |
7 | |
/* Locale-independent ASCII classification macros.
 * The argument is converted to unsigned so that negative values (such as
 * EOF) wrap around to large values and fail the range check. The argument is
 * fully parenthesized inside the cast so that a compound expression is
 * converted as a whole instead of only its first operand.
 * ISALPHA: true for 'A'-'Z' and 'a'-'z' (bit 5 is set to fold case).
 * ISSPACE: true for ' ' and the five characters '\t' through '\r'.
 * NOTE: ISSPACE evaluates its argument twice; do not pass expressions with
 * side effects. */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
10 | |
11 static void | |
12 xml_parseattrs(XMLParser *x) | |
13 { | |
14 size_t namelen = 0; | |
15 int c, endsep, endname = 0, valuestart = 0; | |
16 | |
17 while ((c = GETNEXT()) != EOF) { | |
18 if (ISSPACE(c)) { | |
19 if (namelen) | |
20 endname = 1; | |
21 continue; | |
22 } else if (c == '?') | |
23 ; /* ignore */ | |
24 else if (c == '=') { | |
25 valuestart = 1; | |
26 endname = 1; | |
27 } else if (namelen && ((endname && !valuestart && ISALPH… | |
28 endname = 0; | |
29 namelen = 1; | |
30 } else if (namelen && valuestart) { | |
31 /* attribute with value */ | |
32 if (c == '\'' || c == '"') { | |
33 endsep = c; | |
34 while ((c = GETNEXT()) != EOF) { | |
35 if (c == endsep) | |
36 break; | |
37 } | |
38 } else { | |
39 while ((c = GETNEXT()) != EOF) { | |
40 if (c == '>' || ISSPACE(c)) | |
41 break; | |
42 } | |
43 } | |
44 namelen = endname = valuestart = 0; | |
45 } else { | |
46 namelen = 1; | |
47 } | |
48 if (c == '>') { | |
49 break; | |
50 } else if (c == '/') { | |
51 x->isshorttag = 1; | |
52 namelen = 0; | |
53 } | |
54 } | |
55 } | |
56 | |
57 static void | |
58 xml_parsecomment(XMLParser *x) | |
59 { | |
60 size_t i = 0; | |
61 int c; | |
62 | |
63 while ((c = GETNEXT()) != EOF) { | |
64 if (c == '-') { | |
65 if (i < 2) | |
66 i++; | |
67 } else if (c == '>' && i == 2) { | |
68 return; | |
69 } else { | |
70 i = 0; | |
71 } | |
72 } | |
73 } | |
74 | |
75 static void | |
76 xml_parsecdata(XMLParser *x) | |
77 { | |
78 size_t datalen = 0, i = 0; | |
79 int c; | |
80 | |
81 while ((c = GETNEXT()) != EOF) { | |
82 if (c == ']' || c == '>') { | |
83 if (x->xmlcdata) { | |
84 x->data[datalen] = '\0'; | |
85 x->xmlcdata(x, x->data, datalen); | |
86 datalen = 0; | |
87 } | |
88 } | |
89 | |
90 if (c == ']') { | |
91 if (++i > 2) { | |
92 if (x->xmlcdata) | |
93 for (; i > 2; i--) | |
94 x->xmlcdata(x, "]", 1); | |
95 i = 2; | |
96 } | |
97 continue; | |
98 } else if (c == '>' && i == 2) { | |
99 return; | |
100 } else { | |
101 if (x->xmlcdata) | |
102 for (; i > 0; i--) | |
103 x->xmlcdata(x, "]", 1); | |
104 i = 0; | |
105 } | |
106 | |
107 if (datalen < sizeof(x->data) - 1) { | |
108 x->data[datalen++] = c; | |
109 } else { | |
110 x->data[datalen] = '\0'; | |
111 if (x->xmlcdata) | |
112 x->xmlcdata(x, x->data, datalen); | |
113 x->data[0] = c; | |
114 datalen = 1; | |
115 } | |
116 } | |
117 } | |
118 | |
/* Encode the Unicode code point r as UTF-8 into s. The output is not
 * NUL-terminated; s must have room for at least 4 bytes.
 * Returns the number of bytes written (1 to 4), or 0 when nothing is
 * written: for r == 0 (NUL byte) and for values that cannot be encoded as
 * UTF-8 (negative, surrogates U+D800..U+DFFF, or above U+10FFFF). */
static int
codepointtoutf8(long r, char *s)
{
	/* NUL byte or not a Unicode scalar value: write nothing instead of
	 * emitting a raw or silently truncated byte sequence */
	if (r <= 0 || r > 0x10FFFF || (r >= 0xD800 && r <= 0xDFFF))
		return 0;

	if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		s[0] = (char)r;
		return 1;
	}
	if (r <= 0x07FF) {
		/* 2 bytes: 00000aaa aabbbbbb */
		s[0] = (char)(0xC0 | ((r & 0x0007C0) >> 6)); /* 110aaaaa */
		s[1] = (char)(0x80 | (r & 0x00003F));        /* 10bbbbbb */
		return 2;
	}
	if (r <= 0xFFFF) {
		/* 3 bytes: aaaabbbb bbcccccc */
		s[0] = (char)(0xE0 | ((r & 0x00F000) >> 12)); /* 1110aaaa */
		s[1] = (char)(0x80 | ((r & 0x000FC0) >> 6));  /* 10bbbbbb */
		s[2] = (char)(0x80 | (r & 0x00003F));         /* 10cccccc */
		return 3;
	}
	/* 4 bytes: 000aaabb bbbbcccc ccdddddd */
	s[0] = (char)(0xF0 | ((r & 0x1C0000) >> 18)); /* 11110aaa */
	s[1] = (char)(0x80 | ((r & 0x03F000) >> 12)); /* 10bbbbbb */
	s[2] = (char)(0x80 | ((r & 0x000FC0) >> 6));  /* 10cccccc */
	s[3] = (char)(0x80 | (r & 0x00003F));         /* 10dddddd */
	return 4;
}
148 | |
/* One entry of the named entity table. */
struct namedentity {
	const char *entity; /* entity name: the lookup key */
	long cp;            /* code point the entity stands for */
};

/* Comparison callback with the signature expected by bsearch()/qsort().
 * v1 and v2 point to struct namedentity objects; they are ordered by their
 * entity name using strcmp(). The cp member is not looked at.
 * Returns <0, 0 or >0 like strcmp(). */
int
namedentitycmp(const void *v1, const void *v2)
{
	/* keep the const qualifier: the objects are only read */
	const struct namedentity *n1 = v1;
	const struct namedentity *n2 = v2;

	return strcmp(n1->entity, n2->entity);
}
162 | |
163 static int | |
164 namedentitytostr(const char *e, char *buf, size_t bufsiz) | |
165 { | |
166 static const struct namedentity entities[] = { | |
167 #include "namedentities.h" | |
168 }; | |
169 struct namedentity find, *found; | |
170 size_t i; | |
171 | |
172 /* buffer is too small */ | |
173 if (bufsiz < 5) | |
174 return -1; | |
175 | |
176 find.entity = e; | |
177 found = bsearch(&find, entities, sizeof(entities) / sizeof(*enti… | |
178 sizeof(*entities), namedentitycmp); | |
179 if (found) { | |
180 i = codepointtoutf8(found->cp, buf); | |
181 buf[i] = '\0'; | |
182 return i; | |
183 } | |
184 return -1; | |
185 } | |
186 | |
/* Convert the body of a numeric character reference to a NUL-terminated
 * UTF-8 string in buf. e points just past "&#": either decimal digits or 'x'
 * followed by hexadecimal digits, terminated by ';'.
 * bufsiz must be at least 5 (up to 4 UTF-8 bytes plus the NUL byte).
 * Returns the byte length of the string (excluding the NUL byte), or -1 when
 * the buffer is too small, the reference is malformed or the value is not a
 * valid code point. */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	long l;
	int len, base = 10;
	char *end;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* hex (16) or decimal (10) */
	if (*e == 'x') {
		base = 16;
		e++;
		/* strtol() would skip an optional "0x" / "0X" prefix in base
		 * 16: such a prefix is not part of a reference */
		if (e[0] == '0' && (e[1] == 'x' || e[1] == 'X'))
			return -1;
	}

	/* strtol() also skips leading whitespace and accepts a sign: require
	 * the number to start with a digit of the chosen base */
	if (!((*e >= '0' && *e <= '9') ||
	      (base == 16 && ((*e >= 'a' && *e <= 'f') ||
	                      (*e >= 'A' && *e <= 'F')))))
		return -1;

	errno = 0;
	l = strtol(e, &end, base);
	/* invalid value or not a well-formed entity or invalid code point */
	if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
	    (l >= 0xd800 && l <= 0xdfff))
		return -1;
	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
213 | |
/* Convert a named or numeric entity string, starting at its '&', to a string
 * in buf. "&#..." is handled as a numeric entity, anything else after the
 * '&' as a named entity.
 * Returns the byte length of the string, or -1 on failure (including when e
 * does not start with '&'). */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	const char *body;

	if (*e != '&')
		return -1; /* not an entity */

	body = e + 1;
	if (*body != '#')
		return namedentitytostr(body, buf, bufsiz);

	/* skip the '#' as well */
	return numericentitytostr(body + 1, buf, bufsiz);
}
228 | |
229 void | |
230 xml_parse(XMLParser *x) | |
231 { | |
232 size_t datalen, tagdatalen; | |
233 int c, isend; | |
234 | |
235 while ((c = GETNEXT()) != EOF && c != '<') | |
236 ; /* skip until < */ | |
237 | |
238 while (c != EOF) { | |
239 if (c == '<') { /* parse tag */ | |
240 if ((c = GETNEXT()) == EOF) | |
241 return; | |
242 | |
243 if (c == '!') { /* CDATA and comments */ | |
244 for (tagdatalen = 0; (c = GETNEXT()) != … | |
245 /* NOTE: sizeof(x->data) must be… | |
246 if (tagdatalen <= sizeof("[CDATA… | |
247 x->data[tagdatalen++] = … | |
248 if (c == '>') | |
249 break; | |
250 else if (c == '-' && tagdatalen … | |
251 (x->data[0] == '… | |
252 xml_parsecomment(x); | |
253 break; | |
254 } else if (c == '[') { | |
255 if (tagdatalen == sizeof… | |
256 !strncmp(x->data, "[… | |
257 xml_parsecdata(x… | |
258 break; | |
259 } | |
260 } | |
261 } | |
262 } else { | |
263 /* normal tag (open, short open, close),… | |
264 x->tag[0] = c; | |
265 x->taglen = 1; | |
266 x->isshorttag = isend = 0; | |
267 | |
268 /* treat processing instruction as short… | |
269 if (c == '?') { | |
270 x->isshorttag = 1; | |
271 } else if (c == '/') { | |
272 if ((c = GETNEXT()) == EOF) | |
273 return; | |
274 x->tag[0] = c; | |
275 isend = 1; | |
276 } | |
277 | |
278 while ((c = GETNEXT()) != EOF) { | |
279 if (c == '/') | |
280 x->isshorttag = 1; /* sh… | |
281 else if (c == '>' || ISSPACE(c))… | |
282 x->tag[x->taglen] = '\0'; | |
283 if (isend) { /* end tag,… | |
284 while (c != '>' … | |
285 c = GETN… | |
286 if (x->xmltagend) | |
287 x->xmlta… | |
288 x->tag[0] = '\0'; | |
289 x->taglen = 0; | |
290 } else { | |
291 /* start tag */ | |
292 if (x->xmltagsta… | |
293 x->xmlta… | |
294 if (ISSPACE(c)) | |
295 xml_pars… | |
296 } | |
297 /* call tagend for short… | |
298 if (x->isshorttag) { | |
299 if (x->xmltagend) | |
300 x->xmlta… | |
301 x->tag[0] = '\0'; | |
302 x->taglen = 0; | |
303 } | |
304 break; | |
305 } else if (x->taglen < sizeof(x-… | |
306 x->tag[x->taglen++] = c;… | |
307 } | |
308 } | |
309 } else { | |
310 /* parse tag data */ | |
311 datalen = 0; | |
312 while ((c = GETNEXT()) != EOF) { | |
313 if (c == '&') { | |
314 if (datalen) { | |
315 x->data[datalen] = '\0'; | |
316 if (x->xmldata) | |
317 x->xmldata(x, x-… | |
318 } | |
319 x->data[0] = c; | |
320 datalen = 1; | |
321 while ((c = GETNEXT()) != EOF) { | |
322 if (c == '<') | |
323 break; | |
324 if (datalen < sizeof(x->… | |
325 x->data[datalen+… | |
326 else { | |
327 /* entity too lo… | |
328 x->data[datalen]… | |
329 if (x->xmldata) | |
330 x->xmlda… | |
331 x->data[0] = c; | |
332 datalen = 1; | |
333 break; | |
334 } | |
335 if (c == ';') { | |
336 x->data[datalen]… | |
337 if (x->xmldataen… | |
338 x->xmlda… | |
339 datalen = 0; | |
340 break; | |
341 } | |
342 } | |
343 } else if (c != '<') { | |
344 if (datalen < sizeof(x->data) - … | |
345 x->data[datalen++] = c; | |
346 } else { | |
347 x->data[datalen] = '\0'; | |
348 if (x->xmldata) | |
349 x->xmldata(x, x-… | |
350 x->data[0] = c; | |
351 datalen = 1; | |
352 } | |
353 } | |
354 if (c == '<') { | |
355 x->data[datalen] = '\0'; | |
356 if (x->xmldata && datalen) | |
357 x->xmldata(x, x->data, d… | |
358 break; | |
359 } | |
360 } | |
361 } | |
362 } | |
363 } |