extractjson.c - extractjson - extract embedded JSON metadata from HTML pages | |
git clone git://git.codemadness.org/extractjson | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
extractjson.c (7744B) | |
--- | |
1 #include <ctype.h> | |
2 #include <errno.h> | |
3 #include <stdio.h> | |
4 #include <stdlib.h> | |
5 #include <string.h> | |
6 #include <strings.h> | |
7 | |
/* Reads the next input byte through the currently installed reader. */
#define GETNEXT getnext

/*
 * Locale-independent character class tests.
 * The argument is parenthesized before the cast so that expression
 * arguments (for example a ternary) are converted to unsigned as a whole;
 * the unsigned wrap-around is what makes the single range comparison work.
 * NOTE: ISSPACE evaluates its argument twice, avoid side effects.
 */
/* ASCII letter: fold to lowercase with |32, then test the range 'a'..'z' */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
/* space, or one of the five control characters '\t' '\n' '\v' '\f' '\r' */
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
12 | |
/* Parser state shared by the xml_* routines. */
typedef struct xmlparser {
	char tag[1024];    /* name of the tag currently being parsed */
	size_t taglen;     /* number of bytes stored in tag */
	int isshorttag;    /* non-zero when the tag is in short form: <tag /> */
	char name[1024];   /* name of the attribute currently being parsed */
	char data[BUFSIZ]; /* scratch buffer for tag data, cdata and attribute data */
} XMLParser;
24 | |
25 static XMLParser parser; | |
26 static int isjson; | |
27 static const char *ignorestate, *endtag; | |
28 static int (*getnext)(void) = getchar; | |
29 | |
30 /* ignore parsing all HTML data inside <script> tags, because they may c… | |
31 characters such as '<' and '>' */ | |
32 static int | |
33 getnext_json(void) | |
34 { | |
35 int c; | |
36 | |
37 if ((c = getchar()) == EOF) | |
38 return EOF; | |
39 | |
40 if (tolower(c) == tolower((unsigned char)*ignorestate)) { | |
41 ignorestate++; | |
42 if (*ignorestate == '\0') { | |
43 getnext = getchar; /* restore */ | |
44 putchar('\n'); | |
45 isjson = 0; | |
46 return c; | |
47 } | |
48 | |
49 } else { | |
50 ignorestate = endtag; | |
51 if (c != '\r' && c != '\n') | |
52 putchar(c); | |
53 } | |
54 | |
55 return ' '; | |
56 } | |
57 | |
58 static void | |
59 xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, | |
60 const char *v, size_t vl) | |
61 { | |
62 if (!strcasecmp(t, "script") && | |
63 !strcasecmp(a, "type") && | |
64 (strstr(v, "application/json") || | |
65 strstr(v, "application/ld+json") || | |
66 strstr(v, "text/json"))) | |
67 isjson = 1; | |
68 } | |
69 | |
70 static void | |
71 xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort) | |
72 { | |
73 if (!strcasecmp(t, "script") && isjson) { | |
74 ignorestate = endtag = "</script>"; | |
75 getnext = getnext_json; | |
76 return; | |
77 } | |
78 } | |
79 | |
/*
 * Parse the attributes of the current tag (x->tag), reading bytes with
 * GETNEXT until '>' or EOF.
 *
 * The attribute name is collected in x->name and the value in x->data;
 * xmlattr() is called for attributes without a value and for (pieces of)
 * attribute values. A value is delimited by a matching ' or ", or, when it
 * is unquoted, by whitespace (endsep == ' ' stands for "any ISSPACE()").
 * When x->data is full the collected piece is passed on and the buffer is
 * reused, so long values arrive in several pieces. Text starting at '&' is
 * buffered separately as an entity.
 * A '/' marks the tag as short form (x->isshorttag) and resets the name.
 *
 * NOTE(review): several lines of this listing are cut off in view; the
 * description covers only the visible part of each statement.
 */
80 static void | |
81 xml_parseattrs(XMLParser *x) | |
82 { | |
83 size_t namelen = 0, valuelen; | |
84 int c, endsep, endname = 0, valuestart = 0; | |
85 | |
86 while ((c = GETNEXT()) != EOF) { | |
87 if (ISSPACE(c)) { | |
/* whitespace after a started name marks the end of that name */
88 if (namelen) | |
89 endname = 1; | |
90 continue; | |
91 } else if (c == '?') | |
92 ; /* ignore */ | |
93 else if (c == '=') { | |
94 x->name[namelen] = '\0'; | |
95 valuestart = 1; | |
96 endname = 1; | |
97 } else if (namelen && ((endname && !valuestart && ISALPH… | |
98 /* attribute without value */ | |
99 xmlattr(x, x->tag, x->taglen, x->name, namelen, … | |
100 x->name[namelen] = '\0'; | |
101 endname = 0; | |
102 x->name[0] = c; | |
103 namelen = 1; | |
104 } else if (namelen && valuestart) { | |
105 /* attribute with value */ | |
106 valuelen = 0; | |
107 if (c == '\'' || c == '"') { | |
108 endsep = c; | |
109 } else { | |
110 endsep = ' '; /* ISSPACE() */ | |
111 goto startvalue; | |
112 } | |
113 | |
114 while ((c = GETNEXT()) != EOF) { | |
115 startvalue: | |
116 if (c == '&') { /* entities */ | |
117 x->data[valuelen] = '\0'; | |
118 /* call data function with data … | |
119 if (valuelen) | |
120 xmlattr(x, x->tag, x->ta… | |
121 x->data[0] = c; | |
122 valuelen = 1; | |
123 while ((c = GETNEXT()) != EOF) { | |
124 if (c == endsep || (ends… | |
125 break; | |
126 if (valuelen < sizeof(x-… | |
127 x->data[valuelen… | |
128 else { | |
129 /* entity too lo… | |
130 x->data[valuelen… | |
131 xmlattr(x, x->ta… | |
132 x->data[0] = c; | |
133 valuelen = 1; | |
134 break; | |
135 } | |
136 if (c == ';') { | |
137 x->data[valuelen… | |
138 valuelen = 0; | |
139 break; | |
140 } | |
141 } | |
142 } else if (c != endsep && !(endsep == ' … | |
143 if (valuelen < sizeof(x->data) -… | |
144 x->data[valuelen++] = c; | |
145 } else { | |
146 x->data[valuelen] = '\0'; | |
147 xmlattr(x, x->tag, x->ta… | |
148 x->data[0] = c; | |
149 valuelen = 1; | |
150 } | |
151 } | |
152 if (c == endsep || (endsep == ' ' && (c … | |
153 x->data[valuelen] = '\0'; | |
154 xmlattr(x, x->tag, x->taglen, x-… | |
155 break; | |
156 } | |
157 } | |
/* value finished: reset the state for the next attribute */
158 namelen = endname = valuestart = 0; | |
159 } else if (namelen < sizeof(x->name) - 1) { | |
160 x->name[namelen++] = c; | |
161 } | |
162 if (c == '>') { | |
163 break; | |
164 } else if (c == '/') { | |
165 x->isshorttag = 1; | |
166 x->name[0] = '\0'; | |
167 namelen = 0; | |
168 } | |
169 } | |
170 } | |
171 | |
172 static void | |
173 xml_parsecomment(XMLParser *x) | |
174 { | |
175 int c, i = 0; | |
176 | |
177 while ((c = GETNEXT()) != EOF) { | |
178 if (c == '-') { | |
179 if (++i > 2) | |
180 i = 2; | |
181 continue; | |
182 } else if (c == '>' && i == 2) { | |
183 return; | |
184 } else if (i) { | |
185 i = 0; | |
186 } | |
187 } | |
188 } | |
189 | |
190 static void | |
191 xml_parsecdata(XMLParser *x) | |
192 { | |
193 size_t datalen = 0, i = 0; | |
194 int c; | |
195 | |
196 while ((c = GETNEXT()) != EOF) { | |
197 if (c == ']') { | |
198 if (++i > 2) | |
199 i = 2; | |
200 continue; | |
201 } else if (c == '>' && i == 2) { | |
202 return; | |
203 } else if (i) { | |
204 i = 0; | |
205 } | |
206 | |
207 if (datalen < sizeof(x->data) - 1) { | |
208 x->data[datalen++] = c; | |
209 } else { | |
210 x->data[datalen] = '\0'; | |
211 x->data[0] = c; | |
212 datalen = 1; | |
213 } | |
214 } | |
215 } | |
216 | |
/*
 * Main parser loop: read the input with GETNEXT until EOF.
 *
 * Input before the first '<' is skipped. After that the loop alternates
 * between two states:
 *  - c == '<': parse a tag. "<!" introduces a comment or CDATA section,
 *    handled by xml_parsecomment() and xml_parsecdata(). Otherwise the tag
 *    name is collected in x->tag; '?' (processing instruction) and '/'
 *    inside the tag mark it as short form, a leading '/' marks an end tag.
 *    For a start tag followed by whitespace the attributes are parsed.
 *  - otherwise: collect tag data in x->data up to the next '<'; text
 *    starting at '&' is buffered separately as an entity.
 *
 * NOTE(review): several lines of this listing are cut off in view (among
 * them the calls made for a start tag); the description covers only the
 * visible part of each statement.
 */
217 static void | |
218 xml_parse(XMLParser *x) | |
219 { | |
220 size_t datalen, tagdatalen; | |
221 int c, isend; | |
222 | |
223 while ((c = GETNEXT()) != EOF && c != '<') | |
224 ; /* skip until < */ | |
225 | |
226 while (c != EOF) { | |
227 if (c == '<') { /* parse tag */ | |
228 if ((c = GETNEXT()) == EOF) | |
229 return; | |
230 | |
231 if (c == '!') { /* cdata and comments */ | |
232 for (tagdatalen = 0; (c = GETNEXT()) != … | |
233 /* NOTE: sizeof(x->data) must be… | |
234 if (tagdatalen <= sizeof("[CDATA… | |
235 x->data[tagdatalen++] = … | |
236 if (c == '>') | |
237 break; | |
238 else if (c == '-' && tagdatalen … | |
239 (x->data[0] == '… | |
240 xml_parsecomment(x); | |
241 break; | |
242 } else if (c == '[') { | |
243 if (tagdatalen == sizeof… | |
244 !strncmp(x->data, "[… | |
245 xml_parsecdata(x… | |
246 break; | |
247 } | |
248 } | |
249 } | |
250 } else { | |
251 /* normal tag (open, short open, close),… | |
252 x->tag[0] = c; | |
253 x->taglen = 1; | |
254 x->isshorttag = isend = 0; | |
255 | |
256 /* treat processing instruction as short… | |
257 if (c == '?') { | |
258 x->isshorttag = 1; | |
259 } else if (c == '/') { | |
/* end tag: the name starts at the byte after the '/' */
260 if ((c = GETNEXT()) == EOF) | |
261 return; | |
262 x->tag[0] = c; | |
263 isend = 1; | |
264 } | |
265 | |
266 while ((c = GETNEXT()) != EOF) { | |
267 if (c == '/') | |
268 x->isshorttag = 1; /* sh… | |
269 else if (c == '>' || ISSPACE(c))… | |
270 x->tag[x->taglen] = '\0'; | |
271 if (isend) { /* end tag,… | |
272 while (c != '>' … | |
273 c = GETN… | |
274 x->tag[0] = '\0'; | |
275 x->taglen = 0; | |
276 } else { | |
277 /* start tag */ | |
278 if (ISSPACE(c)) | |
279 xml_pars… | |
280 xmltagstartparse… | |
281 } | |
282 /* call tagend for short… | |
283 if (x->isshorttag) { | |
284 x->tag[0] = '\0'; | |
285 x->taglen = 0; | |
286 } | |
287 break; | |
288 } else if (x->taglen < sizeof(x-… | |
289 x->tag[x->taglen++] = c;… | |
290 } | |
291 } | |
292 } else { | |
293 /* parse tag data */ | |
294 datalen = 0; | |
295 while ((c = GETNEXT()) != EOF) { | |
296 if (c == '&') { | |
297 if (datalen) | |
298 x->data[datalen] = '\0'; | |
299 x->data[0] = c; | |
300 datalen = 1; | |
301 while ((c = GETNEXT()) != EOF) { | |
302 if (c == '<') | |
303 break; | |
304 if (datalen < sizeof(x->… | |
305 x->data[datalen+… | |
306 else { | |
307 /* entity too lo… | |
308 x->data[datalen]… | |
309 x->data[0] = c; | |
310 datalen = 1; | |
311 break; | |
312 } | |
313 if (c == ';') { | |
314 x->data[datalen]… | |
315 datalen = 0; | |
316 break; | |
317 } | |
318 } | |
319 } else if (c != '<') { | |
320 if (datalen < sizeof(x->data) - … | |
321 x->data[datalen++] = c; | |
322 } else { | |
/* buffer full: terminate it and start over with this byte */
323 x->data[datalen] = '\0'; | |
324 x->data[0] = c; | |
325 datalen = 1; | |
326 } | |
327 } | |
328 if (c == '<') { | |
329 x->data[datalen] = '\0'; | |
330 break; | |
331 } | |
332 } | |
333 } | |
334 } | |
335 } | |
336 | |
337 int | |
338 main(void) | |
339 { | |
340 xml_parse(&parser); | |
341 | |
342 return 0; | |
343 } |