Introduction
Introduction Statistics Contact Development Disclaimer Help
extractjson.c - extractjson - extract embedded JSON metadata from HTML pages
git clone git://git.codemadness.org/extractjson
Log
Files
Refs
README
LICENSE
---
extractjson.c (7744B)
---
1 #include <ctype.h>
2 #include <errno.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
6 #include <strings.h>
7
/* Input source used by the parser. Expands to the `getnext` function pointer
 * so the reader can be swapped at runtime. */
#define GETNEXT getnext

/*
 * ASCII-only character classes that do not depend on the locale.
 * ISALPHA: OR-ing with 32 folds upper case onto lower case; the unsigned
 *          subtraction wraps for anything below 'a', so one compare suffices.
 * ISSPACE: ' ' or one of the five characters '\t' .. '\r'.
 * The argument is fully parenthesized so the cast applies to the whole
 * expression. Note that ISSPACE evaluates its argument twice.
 */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
12
/* State of the parser, passed to every xml_* routine. */
typedef struct xmlparser {
	char   tag[1024];    /* name of the tag currently being parsed */
	size_t taglen;       /* number of bytes used in tag */
	int    isshorttag;   /* non-zero when the tag is in short form: <tag /> */
	char   name[1024];   /* name of the attribute currently being parsed */
	char   data[BUFSIZ]; /* buffer for tag data, cdata and attribute data */
} XMLParser;
24
25 static XMLParser parser;
26 static int isjson;
27 static const char *ignorestate, *endtag;
28 static int (*getnext)(void) = getchar;
29
30 /* ignore parsing all HTML data inside <script> tags, because they may c…
31 characters such as '<' and '>' */
32 static int
33 getnext_json(void)
34 {
35 int c;
36
37 if ((c = getchar()) == EOF)
38 return EOF;
39
40 if (tolower(c) == tolower((unsigned char)*ignorestate)) {
41 ignorestate++;
42 if (*ignorestate == '\0') {
43 getnext = getchar; /* restore */
44 putchar('\n');
45 isjson = 0;
46 return c;
47 }
48
49 } else {
50 ignorestate = endtag;
51 if (c != '\r' && c != '\n')
52 putchar(c);
53 }
54
55 return ' ';
56 }
57
58 static void
59 xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
60 const char *v, size_t vl)
61 {
62 if (!strcasecmp(t, "script") &&
63 !strcasecmp(a, "type") &&
64 (strstr(v, "application/json") ||
65 strstr(v, "application/ld+json") ||
66 strstr(v, "text/json")))
67 isjson = 1;
68 }
69
70 static void
71 xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
72 {
73 if (!strcasecmp(t, "script") && isjson) {
74 ignorestate = endtag = "</script>";
75 getnext = getnext_json;
76 return;
77 }
78 }
79
/*
 * Parse the attributes of the tag currently held in x->tag.
 *
 * Reads characters through GETNEXT() until '>' or EOF. The attribute name is
 * collected in x->name and the value in x->data; both are reported through
 * xmlattr(). A '/' sets x->isshorttag and discards the name collected so far.
 */
80 static void
81 xml_parseattrs(XMLParser *x)
82 {
83 size_t namelen = 0, valuelen;
84 int c, endsep, endname = 0, valuestart = 0;
85
86 while ((c = GETNEXT()) != EOF) {
/* whitespace ends the name once at least one name character was read */
87 if (ISSPACE(c)) {
88 if (namelen)
89 endname = 1;
90 continue;
91 } else if (c == '?')
92 ; /* ignore */
/* '=' terminates the name and marks that a value follows */
93 else if (c == '=') {
94 x->name[namelen] = '\0';
95 valuestart = 1;
96 endname = 1;
97 } else if (namelen && ((endname && !valuestart && ISALPH…
98 /* attribute without value */
/* NOTE(review): xmlattr() is called on the line before x->name is
   NUL-terminated; confirm this ordering is intended */
99 xmlattr(x, x->tag, x->taglen, x->name, namelen, …
100 x->name[namelen] = '\0';
101 endname = 0;
102 x->name[0] = c;
103 namelen = 1;
104 } else if (namelen && valuestart) {
105 /* attribute with value */
106 valuelen = 0;
/* a quote character becomes the terminator; otherwise the value is
   unquoted and c is already its first character, so jump into the loop */
107 if (c == '\'' || c == '"') {
108 endsep = c;
109 } else {
110 endsep = ' '; /* ISSPACE() */
111 goto startvalue;
112 }
113
114 while ((c = GETNEXT()) != EOF) {
115 startvalue:
116 if (c == '&') { /* entities */
117 x->data[valuelen] = '\0';
118 /* call data function with data …
119 if (valuelen)
120 xmlattr(x, x->tag, x->ta…
121 x->data[0] = c;
122 valuelen = 1;
123 while ((c = GETNEXT()) != EOF) {
124 if (c == endsep || (ends…
125 break;
126 if (valuelen < sizeof(x-…
127 x->data[valuelen…
128 else {
129 /* entity too lo…
130 x->data[valuelen…
131 xmlattr(x, x->ta…
132 x->data[0] = c;
133 valuelen = 1;
134 break;
135 }
136 if (c == ';') {
137 x->data[valuelen…
138 valuelen = 0;
139 break;
140 }
141 }
142 } else if (c != endsep && !(endsep == ' …
143 if (valuelen < sizeof(x->data) -…
144 x->data[valuelen++] = c;
145 } else {
146 x->data[valuelen] = '\0';
147 xmlattr(x, x->tag, x->ta…
148 x->data[0] = c;
149 valuelen = 1;
150 }
151 }
152 if (c == endsep || (endsep == ' ' && (c …
153 x->data[valuelen] = '\0';
154 xmlattr(x, x->tag, x->taglen, x-…
155 break;
156 }
157 }
158 namelen = endname = valuestart = 0;
159 } else if (namelen < sizeof(x->name) - 1) {
160 x->name[namelen++] = c;
161 }
162 if (c == '>') {
163 break;
164 } else if (c == '/') {
165 x->isshorttag = 1;
166 x->name[0] = '\0';
167 namelen = 0;
168 }
169 }
170 }
171
172 static void
173 xml_parsecomment(XMLParser *x)
174 {
175 int c, i = 0;
176
177 while ((c = GETNEXT()) != EOF) {
178 if (c == '-') {
179 if (++i > 2)
180 i = 2;
181 continue;
182 } else if (c == '>' && i == 2) {
183 return;
184 } else if (i) {
185 i = 0;
186 }
187 }
188 }
189
190 static void
191 xml_parsecdata(XMLParser *x)
192 {
193 size_t datalen = 0, i = 0;
194 int c;
195
196 while ((c = GETNEXT()) != EOF) {
197 if (c == ']') {
198 if (++i > 2)
199 i = 2;
200 continue;
201 } else if (c == '>' && i == 2) {
202 return;
203 } else if (i) {
204 i = 0;
205 }
206
207 if (datalen < sizeof(x->data) - 1) {
208 x->data[datalen++] = c;
209 } else {
210 x->data[datalen] = '\0';
211 x->data[0] = c;
212 datalen = 1;
213 }
214 }
215 }
216
/*
 * Main parse loop.
 *
 * Skips input up to the first '<', then alternates between parsing a tag
 * ("<!" comments and CDATA sections, or a normal start/end tag whose name is
 * collected in x->tag) and the character data that follows it, which is
 * collected in x->data. All input is read through GETNEXT(); the function
 * returns when that yields EOF.
 */
217 static void
218 xml_parse(XMLParser *x)
219 {
220 size_t datalen, tagdatalen;
221 int c, isend;
222
223 while ((c = GETNEXT()) != EOF && c != '<')
224 ; /* skip until < */
225
/* c is '<' when a tag starts here; the else-branch further down handles
   the character data between tags */
226 while (c != EOF) {
227 if (c == '<') { /* parse tag */
/* the character right after '<' decides what kind of markup this is */
228 if ((c = GETNEXT()) == EOF)
229 return;
230
231 if (c == '!') { /* cdata and comments */
232 for (tagdatalen = 0; (c = GETNEXT()) != …
233 /* NOTE: sizeof(x->data) must be…
234 if (tagdatalen <= sizeof("[CDATA…
235 x->data[tagdatalen++] = …
236 if (c == '>')
237 break;
238 else if (c == '-' && tagdatalen …
239 (x->data[0] == '…
240 xml_parsecomment(x);
241 break;
242 } else if (c == '[') {
243 if (tagdatalen == sizeof…
244 !strncmp(x->data, "[…
245 xml_parsecdata(x…
246 break;
247 }
248 }
249 }
250 } else {
251 /* normal tag (open, short open, close),…
252 x->tag[0] = c;
253 x->taglen = 1;
254 x->isshorttag = isend = 0;
255
256 /* treat processing instruction as short…
257 if (c == '?') {
258 x->isshorttag = 1;
259 } else if (c == '/') {
260 if ((c = GETNEXT()) == EOF)
261 return;
262 x->tag[0] = c;
263 isend = 1;
264 }
265
266 while ((c = GETNEXT()) != EOF) {
267 if (c == '/')
268 x->isshorttag = 1; /* sh…
269 else if (c == '>' || ISSPACE(c))…
270 x->tag[x->taglen] = '\0';
271 if (isend) { /* end tag,…
272 while (c != '>' …
273 c = GETN…
274 x->tag[0] = '\0';
275 x->taglen = 0;
276 } else {
277 /* start tag */
278 if (ISSPACE(c))
279 xml_pars…
280 xmltagstartparse…
281 }
282 /* call tagend for short…
283 if (x->isshorttag) {
284 x->tag[0] = '\0';
285 x->taglen = 0;
286 }
287 break;
288 } else if (x->taglen < sizeof(x-…
289 x->tag[x->taglen++] = c;…
290 }
291 }
292 } else {
293 /* parse tag data */
294 datalen = 0;
295 while ((c = GETNEXT()) != EOF) {
296 if (c == '&') {
297 if (datalen)
298 x->data[datalen] = '\0';
299 x->data[0] = c;
300 datalen = 1;
301 while ((c = GETNEXT()) != EOF) {
302 if (c == '<')
303 break;
304 if (datalen < sizeof(x->…
305 x->data[datalen+…
306 else {
307 /* entity too lo…
308 x->data[datalen]…
309 x->data[0] = c;
310 datalen = 1;
311 break;
312 }
313 if (c == ';') {
314 x->data[datalen]…
315 datalen = 0;
316 break;
317 }
318 }
319 } else if (c != '<') {
320 if (datalen < sizeof(x->data) - …
321 x->data[datalen++] = c;
322 } else {
323 x->data[datalen] = '\0';
324 x->data[0] = c;
325 datalen = 1;
326 }
327 }
328 if (c == '<') {
329 x->data[datalen] = '\0';
330 break;
331 }
332 }
333 }
334 }
335 }
336
337 int
338 main(void)
339 {
340 xml_parse(&parser);
341
342 return 0;
343 }
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.