Introduction
Introduction Statistics Contact Development Disclaimer Help
xml.c - grabtitle - stupid HTML title grabber
git clone git://git.codemadness.org/grabtitle
Log
Files
Refs
README
LICENSE
---
xml.c (8078B)
---
1 #include <errno.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5
6 #include "xml.h"
7
/* Locale-independent ASCII character class tests.
 * The argument is fully parenthesized so that expression arguments
 * (e.g. a ternary) are cast as a whole rather than only their first operand.
 * ISALPHA: folds case with |32, then a single unsigned range check a..z.
 * ISSPACE: ' ' or one of the five characters '\t' '\n' '\v' '\f' '\r'.
 * NOTE: ISSPACE evaluates its argument twice; do not pass side effects. */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
10
11 static void
12 xml_parseattrs(XMLParser *x)
13 {
14 size_t namelen = 0;
15 int c, endsep, endname = 0, valuestart = 0;
16
17 while ((c = GETNEXT()) != EOF) {
18 if (ISSPACE(c)) {
19 if (namelen)
20 endname = 1;
21 continue;
22 } else if (c == '?')
23 ; /* ignore */
24 else if (c == '=') {
25 valuestart = 1;
26 endname = 1;
27 } else if (namelen && ((endname && !valuestart && ISALPH…
28 endname = 0;
29 namelen = 1;
30 } else if (namelen && valuestart) {
31 /* attribute with value */
32 if (c == '\'' || c == '"') {
33 endsep = c;
34 while ((c = GETNEXT()) != EOF) {
35 if (c == endsep)
36 break;
37 }
38 } else {
39 while ((c = GETNEXT()) != EOF) {
40 if (c == '>' || ISSPACE(c))
41 break;
42 }
43 }
44 namelen = endname = valuestart = 0;
45 } else {
46 namelen = 1;
47 }
48 if (c == '>') {
49 break;
50 } else if (c == '/') {
51 x->isshorttag = 1;
52 namelen = 0;
53 }
54 }
55 }
56
57 static void
58 xml_parsecomment(XMLParser *x)
59 {
60 size_t i = 0;
61 int c;
62
63 while ((c = GETNEXT()) != EOF) {
64 if (c == '-') {
65 if (i < 2)
66 i++;
67 } else if (c == '>' && i == 2) {
68 return;
69 } else {
70 i = 0;
71 }
72 }
73 }
74
75 static void
76 xml_parsecdata(XMLParser *x)
77 {
78 size_t datalen = 0, i = 0;
79 int c;
80
81 while ((c = GETNEXT()) != EOF) {
82 if (c == ']' || c == '>') {
83 if (x->xmlcdata) {
84 x->data[datalen] = '\0';
85 x->xmlcdata(x, x->data, datalen);
86 datalen = 0;
87 }
88 }
89
90 if (c == ']') {
91 if (++i > 2) {
92 if (x->xmlcdata)
93 for (; i > 2; i--)
94 x->xmlcdata(x, "]", 1);
95 i = 2;
96 }
97 continue;
98 } else if (c == '>' && i == 2) {
99 return;
100 } else {
101 if (x->xmlcdata)
102 for (; i > 0; i--)
103 x->xmlcdata(x, "]", 1);
104 i = 0;
105 }
106
107 if (datalen < sizeof(x->data) - 1) {
108 x->data[datalen++] = c;
109 } else {
110 x->data[datalen] = '\0';
111 if (x->xmlcdata)
112 x->xmlcdata(x, x->data, datalen);
113 x->data[0] = c;
114 datalen = 1;
115 }
116 }
117 }
118
/* Encode code point r as UTF-8 into s; no NUL terminator is written.
 * Returns the number of bytes stored (1 to 4), or 0 for code point 0,
 * in which case nothing is stored. s must have room for 4 bytes.
 * The value is not validated here; callers check the range. */
static int
codepointtoutf8(long r, char *s)
{
	long lead, mask;
	int n, k;

	if (r == 0)
		return 0; /* NUL byte */

	if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		s[0] = r;
		return 1;
	}

	if (r <= 0x07FF) {
		n = 2; lead = 0xC0; mask = 0x1F; /* 110aaaaa */
	} else if (r <= 0xFFFF) {
		n = 3; lead = 0xE0; mask = 0x0F; /* 1110aaaa */
	} else {
		n = 4; lead = 0xF0; mask = 0x07; /* 11110aaa */
	}

	/* continuation bytes (10xxxxxx), filled from last to first,
	 * each taking the next lowest 6 bits */
	for (k = n - 1; k > 0; k--) {
		s[k] = 0x80 | (r & 0x3F);
		r >>= 6;
	}
	/* the remaining high bits go into the lead byte */
	s[0] = lead | (r & mask);

	return n;
}
148
/* One entry of the named entity lookup table used by namedentitytostr().
 * The field order matters if the table in namedentities.h uses positional
 * initializers (presumably it does; verify before reordering). */
struct namedentity {
	const char *entity; /* entity name; the key compared with strcmp() */
	long cp;            /* code point, converted with codepointtoutf8() */
};
153
154 int
155 namedentitycmp(const void *v1, const void *v2)
156 {
157 struct namedentity *n1 = (struct namedentity *)v1;
158 struct namedentity *n2 = (struct namedentity *)v2;
159
160 return strcmp(n1->entity, n2->entity);
161 }
162
163 static int
164 namedentitytostr(const char *e, char *buf, size_t bufsiz)
165 {
166 static const struct namedentity entities[] = {
167 #include "namedentities.h"
168 };
169 struct namedentity find, *found;
170 size_t i;
171
172 /* buffer is too small */
173 if (bufsiz < 5)
174 return -1;
175
176 find.entity = e;
177 found = bsearch(&find, entities, sizeof(entities) / sizeof(*enti…
178 sizeof(*entities), namedentitycmp);
179 if (found) {
180 i = codepointtoutf8(found->cp, buf);
181 buf[i] = '\0';
182 return i;
183 }
184 return -1;
185 }
186
/* Convert the numeric part of a character reference to UTF-8.
 * e points just past "&#": either decimal digits or 'x' followed by
 * hexadecimal digits, terminated by ';'. The NUL-terminated result is
 * written to buf.
 * Returns the byte length written (excluding the NUL; 0 for code point 0),
 * or -1 when bufsiz is less than 5 (4 UTF-8 bytes + NUL), the reference
 * is not well-formed, or the value is not a valid code point (above
 * 0x10ffff or a surrogate).
 * Digits are parsed strictly: unlike strtol(), no leading whitespace,
 * sign or "0x" prefix is accepted. */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	const char *p;
	long l = 0;
	int base = 10, digit, len;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* hex (16) or decimal (10) */
	if (*e == 'x') {
		base = 16;
		e++;
	}

	for (p = e; ; p++) {
		if (*p >= '0' && *p <= '9')
			digit = *p - '0';
		else if (base == 16 && (((unsigned)*p) | 32) - 'a' < 6)
			digit = (int)((((unsigned)*p) | 32) - 'a') + 10;
		else
			break;
		l = l * base + digit;
		/* stop early: out of range, and keeps l far from overflowing */
		if (l > 0x10ffff)
			return -1;
	}

	/* no digits, missing ';' terminator or surrogate code point */
	if (p == e || *p != ';' || (l >= 0xd800 && l <= 0xdfff))
		return -1;

	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
213
/* Convert a named or numeric entity string (starting with '&') to its
 * UTF-8 string in buf.
 * Returns the byte length of the string, or -1 on failure. */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* an entity has to start with '&' */
	if (e[0] != '&')
		return -1;

	/* "&#..." is numeric, anything else is looked up by name */
	return (e[1] == '#')
	    ? numericentitytostr(e + 2, buf, bufsiz)
	    : namedentitytostr(e + 1, buf, bufsiz);
}
228
229 void
230 xml_parse(XMLParser *x)
231 {
232 size_t datalen, tagdatalen;
233 int c, isend;
234
235 while ((c = GETNEXT()) != EOF && c != '<')
236 ; /* skip until < */
237
238 while (c != EOF) {
239 if (c == '<') { /* parse tag */
240 if ((c = GETNEXT()) == EOF)
241 return;
242
243 if (c == '!') { /* CDATA and comments */
244 for (tagdatalen = 0; (c = GETNEXT()) != …
245 /* NOTE: sizeof(x->data) must be…
246 if (tagdatalen <= sizeof("[CDATA…
247 x->data[tagdatalen++] = …
248 if (c == '>')
249 break;
250 else if (c == '-' && tagdatalen …
251 (x->data[0] == '…
252 xml_parsecomment(x);
253 break;
254 } else if (c == '[') {
255 if (tagdatalen == sizeof…
256 !strncmp(x->data, "[…
257 xml_parsecdata(x…
258 break;
259 }
260 }
261 }
262 } else {
263 /* normal tag (open, short open, close),…
264 x->tag[0] = c;
265 x->taglen = 1;
266 x->isshorttag = isend = 0;
267
268 /* treat processing instruction as short…
269 if (c == '?') {
270 x->isshorttag = 1;
271 } else if (c == '/') {
272 if ((c = GETNEXT()) == EOF)
273 return;
274 x->tag[0] = c;
275 isend = 1;
276 }
277
278 while ((c = GETNEXT()) != EOF) {
279 if (c == '/')
280 x->isshorttag = 1; /* sh…
281 else if (c == '>' || ISSPACE(c))…
282 x->tag[x->taglen] = '\0';
283 if (isend) { /* end tag,…
284 while (c != '>' …
285 c = GETN…
286 if (x->xmltagend)
287 x->xmlta…
288 x->tag[0] = '\0';
289 x->taglen = 0;
290 } else {
291 /* start tag */
292 if (x->xmltagsta…
293 x->xmlta…
294 if (ISSPACE(c))
295 xml_pars…
296 }
297 /* call tagend for short…
298 if (x->isshorttag) {
299 if (x->xmltagend)
300 x->xmlta…
301 x->tag[0] = '\0';
302 x->taglen = 0;
303 }
304 break;
305 } else if (x->taglen < sizeof(x-…
306 x->tag[x->taglen++] = c;…
307 }
308 }
309 } else {
310 /* parse tag data */
311 datalen = 0;
312 while ((c = GETNEXT()) != EOF) {
313 if (c == '&') {
314 if (datalen) {
315 x->data[datalen] = '\0';
316 if (x->xmldata)
317 x->xmldata(x, x-…
318 }
319 x->data[0] = c;
320 datalen = 1;
321 while ((c = GETNEXT()) != EOF) {
322 if (c == '<')
323 break;
324 if (datalen < sizeof(x->…
325 x->data[datalen+…
326 else {
327 /* entity too lo…
328 x->data[datalen]…
329 if (x->xmldata)
330 x->xmlda…
331 x->data[0] = c;
332 datalen = 1;
333 break;
334 }
335 if (c == ';') {
336 x->data[datalen]…
337 if (x->xmldataen…
338 x->xmlda…
339 datalen = 0;
340 break;
341 }
342 }
343 } else if (c != '<') {
344 if (datalen < sizeof(x->data) - …
345 x->data[datalen++] = c;
346 } else {
347 x->data[datalen] = '\0';
348 if (x->xmldata)
349 x->xmldata(x, x-…
350 x->data[0] = c;
351 datalen = 1;
352 }
353 }
354 if (c == '<') {
355 x->data[datalen] = '\0';
356 if (x->xmldata && datalen)
357 x->xmldata(x, x->data, d…
358 break;
359 }
360 }
361 }
362 }
363 }
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.