webdump.c - webdump - HTML to plain-text converter for webpages | |
git clone git://git.codemadness.org/webdump | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
webdump.c (66818B) | |
--- | |
1 #include <errno.h> | |
2 #include <limits.h> | |
3 #include <stdio.h> | |
4 #include <stdarg.h> | |
5 #include <stdlib.h> | |
6 #include <string.h> | |
7 #include <strings.h> | |
8 #include <unistd.h> | |
9 | |
10 #include "arg.h" | |
11 char *argv0; | |
12 | |
13 #include "tree.h" | |
14 #include "xml.h" | |
15 | |
16 static XMLParser parser; | |
17 | |
18 #ifndef __OpenBSD__ | |
19 #define pledge(p1,p2) 0 | |
20 #endif | |
21 | |
22 #undef strlcat | |
23 size_t strlcat(char *, const char *, size_t); | |
24 #undef strlcpy | |
25 size_t strlcpy(char *, const char *, size_t); | |
26 | |
27 /* ctype-like macros, but always compatible with ASCII / UTF-8 */ | |
28 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26) | |
29 #define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f) | |
30 #define ISDIGIT(c) (((unsigned)c) - '0' < 10) | |
31 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5)) | |
32 #define TOLOWER(c) ((((unsigned)c) - 'A' < 26) ? ((c) | 32) : (c)) | |
33 | |
34 #define LEN(x) (sizeof(x) / sizeof(x[0])) | |
35 | |
/* URI split into its components. Every member is a NUL-terminated string
   stored in a fixed-size buffer; an empty string means "not present". */
struct uri {
	char proto[48];      /* scheme, stored together with its ":" or "://" */
	char userinfo[256];  /* username, optionally followed by ":password" */
	char host[256];      /* host name or bracketed IPv6 literal */
	char port[6];        /* numeric port: at most 5 digits plus NUL */
	char path[1024];
	char query[1024];    /* query without the leading "?" */
	char fragment[1024]; /* fragment without the leading "#" */
};
46 | |
47 /* options */ | |
48 static int allowansi = 0; /* (-a) allow ANSI escape codes */ | |
49 static int uniqrefs = 0; /* (-d) number unique references */ | |
50 static int showrefinline = 0; /* (-i) show link reference number inline… | |
51 static int showurlinline = 0; /* (-I) show full link reference inline */ | |
52 static int showrefbottom = 0; /* (-l) show link references at the botto… | |
53 static int allowlinewrap = 0; /* (-r) line-wrapping */ | |
54 static int termwidth = 77; /* (-w) terminal width */ | |
55 static int resources = 0; /* (-x) write resources line-by-line to f… | |
56 | |
/* Display behaviour of an element. Each value is a distinct bit so that
   several of them can be OR-ed together in one field. */
enum DisplayType {
	DisplayUnknown     = 0x00000,
	DisplayInline      = 0x00001,
	DisplayInlineBlock = 0x00002, /* unused for now */
	DisplayBlock       = 0x00004,
	DisplayNone        = 0x00008,
	DisplayPre         = 0x00010,
	DisplayList        = 0x00020,
	DisplayListOrdered = 0x00040,
	DisplayListItem    = 0x00080,
	DisplayTable       = 0x00100,
	DisplayTableRow    = 0x00200,
	DisplayTableCell   = 0x00400,
	DisplayHeader      = 0x00800,
	DisplayDl          = 0x01000,
	DisplayInput       = 0x02000,
	DisplayButton      = 0x04000,
	DisplaySelect      = 0x08000,
	DisplaySelectMulti = 0x10000,
	DisplayOption      = 0x20000
};
78 | |
/* ANSI text attributes; one bit per attribute so they can be combined */
enum MarkupType {
	MarkupNone      = 0x00,
	MarkupBold      = 0x01,
	MarkupItalic    = 0x02,
	MarkupUnderline = 0x04,
	MarkupBlink     = 0x08, /* lol */
	MarkupReverse   = 0x10,
	MarkupStrike    = 0x20
};
89 | |
/* Growable, NUL-terminated byte string */
typedef struct string {
	char   *data;   /* buffer holding the bytes, NULL until first allocation */
	size_t  len;    /* number of bytes in use, excluding the NUL terminator */
	size_t  bufsiz; /* number of bytes allocated for `data` */
} String;
96 | |
97 enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAud… | |
98 TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButto… | |
99 TagCite, TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDet… | |
100 TagDfn, TagDir, TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFields… | |
101 TagFigcaption, TagFigure, TagFooter, TagForm, TagFrame, TagH1, T… | |
102 TagH3, TagH4, TagH5, TagH6, TagHead, TagHeader, TagHr, TagHtml, … | |
103 TagIframe, TagImg, TagInput, TagIns, TagLabel, TagLegend, TagLi, | |
104 TagLink, TagMain, TagMark, TagMenu, TagMeta, TagNav, TagObject, … | |
105 TagOption, TagP, TagParam, TagPre, TagS, TagScript, TagSearch, | |
106 TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle, | |
107 TagSummary, TagSvg, TagTable, TagTbody, TagTd, TagTemplate, | |
108 TagTextarea, TagTfoot, TagTh, TagThead, TagTitle, TagTr, TagTrac… | |
109 TagU, TagUl, TagVar, TagVideo, TagWbr, TagXmp }; | |
110 | |
111 struct tag { | |
112 const char *name; | |
113 enum TagId id; | |
114 enum DisplayType displaytype; | |
115 enum MarkupType markuptype; /* ANSI markup */ | |
116 enum DisplayType parenttype; /* display type belonging to elemen… | |
117 int isvoid; /* "void" element */ | |
118 int isoptional; /* optional to close tag */ | |
119 int margintop; /* newlines when the tag starts */ | |
120 int marginbottom; /* newlines after the tag ends */ | |
121 int indent; /* indent in cells */ | |
122 }; | |
123 | |
124 struct node { | |
125 char tagname[256]; | |
126 struct tag tag; | |
127 size_t nchildren; /* child node count */ | |
128 size_t visnchildren; /* child node count which are visible */ | |
129 /* attributes */ | |
130 char id[256]; | |
131 char classnames[1024]; | |
132 int indent; /* indent per node, for formatting */ | |
133 int hasdata; /* tag contains some data, for formatting */ | |
134 }; | |
135 | |
136 struct selectornode { | |
137 char tagname[256]; | |
138 long index; /* index of node to match on: -1 if not matching on … | |
139 /* attributes */ | |
140 char id[256]; | |
141 char classnames[1024]; | |
142 }; | |
143 | |
144 struct selector { | |
145 const char *text; | |
146 struct selectornode nodes[32]; | |
147 int depth; | |
148 }; | |
149 | |
/* A list of selectors: array of selector pointers and its element count */
struct selectors {
	struct selector **selectors;
	size_t count;
};
155 | |
156 /* RB tree of link references */ | |
157 struct linkref { | |
158 char *type; | |
159 enum TagId tagid; | |
160 char *url; | |
161 int ishidden; | |
162 size_t linknr; | |
163 RB_ENTRY(linkref) entry; | |
164 }; | |
165 | |
166 /* link references and hidden link references */ | |
167 static struct linkref **visrefs; | |
168 static size_t nvisrefs, ncapvisrefs; /* visible link count / capacity */ | |
169 static struct linkref **hiddenrefs; | |
170 static size_t nhiddenrefs, ncaphiddenrefs; /* hidden link count / capaci… | |
171 | |
172 /* compare link by URL for link references RB-tree */ | |
173 static int | |
174 linkrefcmp(struct linkref *r1, struct linkref *r2) | |
175 { | |
176 return strcmp(r1->url, r2->url); | |
177 } | |
178 | |
179 RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead); | |
180 RB_GENERATE(linkreftree, linkref, entry, linkrefcmp) | |
181 | |
182 static const char *str_bullet_item = "* "; | |
183 static const char *str_checkbox_checked = "x"; | |
184 static const char *str_ruler = "-"; | |
185 static const char *str_radio_checked = "*"; | |
186 | |
187 /* base href, to make URLs absolute */ | |
188 static char basehrefdoc[4096]; /* buffer for base href in document, if a… | |
189 static int basehrefset; /* base href set and can be used? */ | |
190 static struct uri base; /* parsed current base href */ | |
191 | |
192 /* buffers for some attributes of the current tag */ | |
193 static String attr_alt; /* alt attribute */ | |
194 static String attr_checked; /* checked attribute */ | |
195 static String attr_class; /* class attribute */ | |
196 static int attr_class_set; /* class attribute is set already */ | |
197 static String attr_data; /* data attribute */ | |
198 static String attr_href; /* href attribute */ | |
199 static String attr_id; /* id attribute */ | |
200 static int attr_id_set; /* id attribute is set already */ | |
201 static String attr_src; /* src attribute */ | |
202 static String attr_type; /* type attribute */ | |
203 static String attr_value; /* value attribute */ | |
204 | |
205 static String htmldata; /* buffered HTML data near the current tag */ | |
206 | |
207 /* for white-space output handling: | |
208 1 = whitespace emitted (suppress repeated), 2 = other characters on t… | |
209 Behaviour: | |
210 * White-space data before non-whitespace data in tags are ignored on … | |
211 * Repeated white-space are ignored: a single space (' ') is emitted. | |
212 */ | |
213 static int whitespace_mode; | |
214 static int nbytesline; /* bytes on this line */ | |
215 static int ncells; /* current cell/column count */ | |
216 static int hadnewline; /* count for repeated newlines */ | |
217 /* flag for skipping initial white-space in tag: for HTML white-space ha… | |
218 static int skipinitialws = 1; | |
219 #define DEFAULT_INDENT 2 | |
220 static const int defaultindent = DEFAULT_INDENT; /* default indent / mar… | |
221 static int indent; /* indent for the current line, in columns */ | |
222 /* previous output sequential newlines, used for calculating margins bet… | |
223 elements and reducing excessive newlines */ | |
224 static int currentnewlines; | |
225 | |
226 /* buffers for line-wrapping (buffer per word boundary) */ | |
227 static char rbuf[1024]; | |
228 static int rbuflen; | |
229 static int rnbufcells; /* pending cell count to add */ | |
230 | |
231 #define MAX_NODE_DEPTH 4096 /* absolute maximum node depth */ | |
232 static struct node *nodes; /* node tree (one per level is remembered) */ | |
233 static String *nodes_links; /* keep track of links per node */ | |
234 static size_t ncapnodes; /* current allocated node capacity */ | |
235 static int curnode; /* current node depth */ | |
236 | |
237 /* reader / selector mode (-s) */ | |
238 static int reader_mode; | |
239 /* flag if the tags and their children should be ignored in the current … | |
240 static int reader_ignore; | |
241 | |
242 static enum MarkupType curmarkup; /* current markup state (bold, underli… | |
243 static int linewrap; /* allow linewrap in this context */ | |
244 | |
245 /* selector to match (for -s and -u) */ | |
246 static struct selectors *sel_hide, *sel_show; | |
247 | |
248 /* tags table: needs to be sorted like tagcmp(), alphabetically */ | |
249 | |
250 /* tag id displaytype markup … | |
251 static struct tag tags[] = { | |
252 { "a", TagA, DisplayInline, MarkupU… | |
253 { "address", TagAddress, DisplayBlock, 0, … | |
254 { "area", TagArea, DisplayInline, 0, … | |
255 { "article", TagArticle, DisplayBlock, 0, … | |
256 { "aside", TagAside, DisplayBlock, 0, … | |
257 { "audio", TagAudio, DisplayInline, MarkupU… | |
258 { "b", TagB, DisplayInline, MarkupB… | |
259 { "base", TagBase, DisplayInline, 0, … | |
260 { "blink", TagBlink, DisplayInline, MarkupB… | |
261 { "blockquote", TagBlockquote, DisplayBlock, 0, … | |
262 { "body", TagBody, DisplayBlock, 0, … | |
263 { "br", TagBr, 0, 0, … | |
264 { "button", TagButton, DisplayInline | DisplayButton, 0, … | |
265 { "cite", TagCite, DisplayInline, MarkupI… | |
266 { "col", TagCol, DisplayInline, 0, … | |
267 { "colgroup", TagColgroup, DisplayInline, 0, … | |
268 { "datalist", TagDatalist, DisplayNone, 0, … | |
269 { "dd", TagDd, DisplayBlock, 0, … | |
270 { "del", TagDel, DisplayInline, MarkupS… | |
271 { "details", TagDetails, DisplayBlock, 0, … | |
272 { "dfn", TagDfn, DisplayInline, MarkupI… | |
273 { "dir", TagDir, DisplayList, 0, … | |
274 { "div", TagDiv, DisplayBlock, 0, … | |
275 { "dl", TagDl, DisplayBlock | DisplayDl, 0, … | |
276 { "dt", TagDt, DisplayBlock, MarkupB… | |
277 { "em", TagEm, DisplayInline, MarkupI… | |
278 { "embed", TagEmbed, DisplayInline, 0, … | |
279 { "fieldset", TagFieldset, DisplayBlock, 0, … | |
280 { "figcaption", TagFigcaption, DisplayBlock, 0, … | |
281 { "figure", TagFigure, DisplayBlock, 0, … | |
282 { "footer", TagFooter, DisplayBlock, 0, … | |
283 { "form", TagForm, DisplayBlock, 0, … | |
284 { "frame", TagFrame, DisplayInline, 0, … | |
285 { "h1", TagH1, DisplayHeader, MarkupB… | |
286 { "h2", TagH2, DisplayHeader, MarkupB… | |
287 { "h3", TagH3, DisplayHeader, MarkupB… | |
288 { "h4", TagH4, DisplayHeader, MarkupB… | |
289 { "h5", TagH5, DisplayHeader, MarkupB… | |
290 { "h6", TagH6, DisplayHeader, MarkupB… | |
291 { "head", TagHead, DisplayBlock, 0, … | |
292 { "header", TagHeader, DisplayBlock, 0, … | |
293 { "hr", TagHr, DisplayBlock, 0, … | |
294 { "html", TagHtml, DisplayBlock, 0, … | |
295 { "i", TagI, DisplayInline, MarkupI… | |
296 { "iframe", TagIframe, DisplayInline, 0, … | |
297 { "img", TagImg, DisplayInline, MarkupU… | |
298 { "input", TagInput, DisplayInput, 0, … | |
299 { "ins", TagIns, DisplayInline, MarkupU… | |
300 { "label", TagLabel, DisplayInline, 0, … | |
301 { "legend", TagLegend, DisplayBlock, 0, … | |
302 { "li", TagLi, DisplayListItem, 0, … | |
303 { "link", TagLink, DisplayInline, 0, … | |
304 { "main", TagMain, DisplayBlock, 0, … | |
305 { "mark", TagMark, DisplayInline, MarkupR… | |
306 { "menu", TagMenu, DisplayList, 0, … | |
307 { "meta", TagMeta, DisplayInline, 0, … | |
308 { "nav", TagNav, DisplayBlock, 0, … | |
309 { "object", TagObject, DisplayInline, 0, … | |
310 { "ol", TagOl, DisplayList | DisplayListOrdered, 0, … | |
311 { "option", TagOption, DisplayInline | DisplayOption, 0, … | |
312 { "p", TagP, DisplayBlock, 0, … | |
313 { "param", TagParam, DisplayInline, 0, … | |
314 { "pre", TagPre, DisplayPre, 0, … | |
315 { "s", TagS, DisplayInline, MarkupS… | |
316 { "script", TagScript, DisplayNone, 0, … | |
317 { "search", TagSearch, DisplayBlock, 0, … | |
318 { "section", TagSection, DisplayBlock, 0, … | |
319 { "select", TagSelect, DisplayInline | DisplaySelect, 0, … | |
320 { "source", TagSource, DisplayInline, 0, … | |
321 { "strike", TagStrike, DisplayInline, MarkupS… | |
322 { "strong", TagStrong, DisplayInline, MarkupB… | |
323 { "style", TagStyle, DisplayNone, 0, … | |
324 { "summary", TagSummary, DisplayBlock, 0, … | |
325 { "svg", TagSvg, DisplayNone, 0, … | |
326 { "table", TagTable, DisplayTable, 0, … | |
327 { "tbody", TagTbody, DisplayInline, 0, … | |
328 { "td", TagTd, DisplayTableCell, 0, … | |
329 { "template", TagTemplate, DisplayNone, 0, … | |
330 { "textarea", TagTextarea, DisplayInline, 0, … | |
331 { "tfoot", TagTfoot, DisplayInline, 0, … | |
332 { "th", TagTh, DisplayTableCell, MarkupB… | |
333 { "thead", TagThead, DisplayInline, 0, … | |
334 { "title", TagTitle, DisplayBlock, 0, … | |
335 { "tr", TagTr, DisplayTableRow, 0, … | |
336 { "track", TagTrack, DisplayInline, 0, … | |
337 { "u", TagU, DisplayInline, MarkupU… | |
338 { "ul", TagUl, DisplayList, 0, … | |
339 { "var", TagVar, DisplayInline, MarkupI… | |
340 { "video", TagVideo, DisplayInline, MarkupU… | |
341 { "wbr", TagWbr, DisplayInline, 0, … | |
342 { "xmp", TagXmp, DisplayPre, 0, … | |
343 }; | |
344 | |
345 /* hint for compilers and static analyzers that a function exits */ | |
346 #ifndef __dead | |
347 #define __dead | |
348 #endif | |
349 | |
350 /* print to stderr, print error message of errno and exit(). */ | |
351 __dead static void | |
352 err(int exitstatus, const char *fmt, ...) | |
353 { | |
354 va_list ap; | |
355 int saved_errno; | |
356 | |
357 saved_errno = errno; | |
358 | |
359 fputs("webdump: ", stderr); | |
360 if (fmt) { | |
361 va_start(ap, fmt); | |
362 vfprintf(stderr, fmt, ap); | |
363 va_end(ap); | |
364 fputs(": ", stderr); | |
365 } | |
366 fprintf(stderr, "%s\n", strerror(saved_errno)); | |
367 | |
368 exit(exitstatus); | |
369 } | |
370 | |
371 /* print to stderr and exit(). */ | |
372 __dead static void | |
373 errx(int exitstatus, const char *fmt, ...) | |
374 { | |
375 va_list ap; | |
376 | |
377 fputs("webdump: ", stderr); | |
378 if (fmt) { | |
379 va_start(ap, fmt); | |
380 vfprintf(stderr, fmt, ap); | |
381 va_end(ap); | |
382 } | |
383 fputs("\n", stderr); | |
384 | |
385 exit(exitstatus); | |
386 } | |
387 | |
388 static const char *ignorestate, *endtag; | |
389 static int (*getnext)(void); | |
390 | |
391 /* return a space for all data until some case-insensitive string occurs… | |
392 is used to parse incorrect HTML/XML that contains unescaped HTML in s… | |
393 or style tags. If you see some </script> tag in a CDATA or comment | |
394 section then e-mail W3C and tell them the web is too complex. */ | |
395 static inline int | |
396 getnext_ignore(void) | |
397 { | |
398 int c; | |
399 | |
400 if ((c = getnext()) == EOF) | |
401 return EOF; | |
402 | |
403 if (TOLOWER((unsigned char)c) == TOLOWER((unsigned char)*ignores… | |
404 ignorestate++; | |
405 if (*ignorestate == '\0') { | |
406 parser.getnext = getnext; /* restore */ | |
407 return ' '; | |
408 } | |
409 } else { | |
410 ignorestate = endtag; /* no full match: reset to beginni… | |
411 } | |
412 | |
413 return ' '; /* pretend there is just SPACEs */ | |
414 } | |
415 | |
416 /* Clear string only; don't free, prevents unnecessary reallocation. */ | |
417 static void | |
418 string_clear(String *s) | |
419 { | |
420 if (s->data) | |
421 s->data[0] = '\0'; | |
422 s->len = 0; | |
423 } | |
424 | |
425 static void | |
426 string_buffer_realloc(String *s, size_t newlen) | |
427 { | |
428 size_t alloclen; | |
429 | |
430 for (alloclen = 64; alloclen <= newlen; alloclen *= 2) | |
431 ; | |
432 if (!(s->data = realloc(s->data, alloclen))) | |
433 err(1, "realloc"); | |
434 s->bufsiz = alloclen; | |
435 } | |
436 | |
437 static void | |
438 string_append(String *s, const char *data, size_t len) | |
439 { | |
440 if (!len) | |
441 return; | |
442 /* check if allocation is necesary, don't shrink buffer, | |
443 * should be more than bufsiz ofcourse. */ | |
444 if (s->len + len >= s->bufsiz) | |
445 string_buffer_realloc(s, s->len + len + 1); | |
446 memcpy(s->data + s->len, data, len); | |
447 s->len += len; | |
448 s->data[s->len] = '\0'; | |
449 } | |
450 | |
/* strdup() that exits through err() instead of returning NULL */
static char *
estrdup(const char *s)
{
	char *copy = strdup(s);

	if (copy == NULL)
		err(1, "strdup");
	return copy;
}
460 | |
/* strndup() that exits through err() instead of returning NULL */
static char *
estrndup(const char *s, size_t n)
{
	char *copy = strndup(s, n);

	if (copy == NULL)
		err(1, "strndup");
	return copy;
}
470 | |
/* realloc() that exits through err() when it returns NULL */
static void *
erealloc(void *p, size_t siz)
{
	void *np = realloc(p, siz);

	if (np == NULL)
		err(1, "realloc");
	return np;
}
479 | |
/* calloc() that exits through err() when it returns NULL */
static void *
ecalloc(size_t nmemb, size_t size)
{
	void *mem = calloc(nmemb, size);

	if (mem == NULL)
		err(1, "calloc");
	return mem;
}
489 | |
/* Return 1 when `s` starts with a non-empty scheme / protocol part followed
   by ":", otherwise 0. */
static int
uri_hasscheme(const char *s)
{
	const char *end = s;

	/* skip the characters that may be part of a scheme */
	while (ISALPHA((unsigned char)*end) || ISDIGIT((unsigned char)*end) ||
	       *end == '+' || *end == '-' || *end == '.')
		end++;

	/* a ":" right at the start means a path, not an (empty) scheme */
	return end != s && *end == ':';
}
502 | |
503 static int | |
504 uri_parse(const char *s, struct uri *u) | |
505 { | |
506 const char *p = s; | |
507 char *endptr; | |
508 size_t i; | |
509 long l; | |
510 | |
511 u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0'; | |
512 u->path[0] = u->query[0] = u->fragment[0] = '\0'; | |
513 | |
514 /* protocol-relative */ | |
515 if (*p == '/' && *(p + 1) == '/') { | |
516 p += 2; /* skip "//" */ | |
517 goto parseauth; | |
518 } | |
519 | |
520 /* scheme / protocol part */ | |
521 for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) … | |
522 *p == '+' || *p == '-' || *p == '.'; p++) | |
523 ; | |
524 /* scheme, except if empty and starts with ":" then it is a path… | |
525 if (*p == ':' && p != s) { | |
526 if (*(p + 1) == '/' && *(p + 2) == '/') | |
527 p += 3; /* skip "://" */ | |
528 else | |
529 p++; /* skip ":" */ | |
530 | |
531 if ((size_t)(p - s) >= sizeof(u->proto)) | |
532 return -1; /* protocol too long */ | |
533 memcpy(u->proto, s, p - s); | |
534 u->proto[p - s] = '\0'; | |
535 | |
536 if (*(p - 1) != '/') | |
537 goto parsepath; | |
538 } else { | |
539 p = s; /* no scheme format, reset to start */ | |
540 goto parsepath; | |
541 } | |
542 | |
543 parseauth: | |
544 /* userinfo (username:password) */ | |
545 i = strcspn(p, "@/?#"); | |
546 if (p[i] == '@') { | |
547 if (i >= sizeof(u->userinfo)) | |
548 return -1; /* userinfo too long */ | |
549 memcpy(u->userinfo, p, i); | |
550 u->userinfo[i] = '\0'; | |
551 p += i + 1; | |
552 } | |
553 | |
554 /* IPv6 address */ | |
555 if (*p == '[') { | |
556 /* bracket not found, host too short or too long */ | |
557 i = strcspn(p, "]"); | |
558 if (p[i] != ']' || i < 3) | |
559 return -1; | |
560 i++; /* including "]" */ | |
561 } else { | |
562 /* domain / host part, skip until port, path or end. */ | |
563 i = strcspn(p, ":/?#"); | |
564 } | |
565 if (i >= sizeof(u->host)) | |
566 return -1; /* host too long */ | |
567 memcpy(u->host, p, i); | |
568 u->host[i] = '\0'; | |
569 p += i; | |
570 | |
571 /* port */ | |
572 if (*p == ':') { | |
573 p++; | |
574 if ((i = strcspn(p, "/?#")) >= sizeof(u->port)) | |
575 return -1; /* port too long */ | |
576 memcpy(u->port, p, i); | |
577 u->port[i] = '\0'; | |
578 /* check for valid port: range 1 - 65535, may be empty */ | |
579 errno = 0; | |
580 l = strtol(u->port, &endptr, 10); | |
581 if (i && (errno || *endptr || l <= 0 || l > 65535)) | |
582 return -1; | |
583 p += i; | |
584 } | |
585 | |
586 parsepath: | |
587 /* path */ | |
588 if ((i = strcspn(p, "?#")) >= sizeof(u->path)) | |
589 return -1; /* path too long */ | |
590 memcpy(u->path, p, i); | |
591 u->path[i] = '\0'; | |
592 p += i; | |
593 | |
594 /* query */ | |
595 if (*p == '?') { | |
596 p++; | |
597 if ((i = strcspn(p, "#")) >= sizeof(u->query)) | |
598 return -1; /* query too long */ | |
599 memcpy(u->query, p, i); | |
600 u->query[i] = '\0'; | |
601 p += i; | |
602 } | |
603 | |
604 /* fragment */ | |
605 if (*p == '#') { | |
606 p++; | |
607 if ((i = strlen(p)) >= sizeof(u->fragment)) | |
608 return -1; /* fragment too long */ | |
609 memcpy(u->fragment, p, i); | |
610 u->fragment[i] = '\0'; | |
611 } | |
612 | |
613 return 0; | |
614 } | |
615 | |
616 /* Transform and try to make the URI `u` absolute using base URI `b` int… | |
617 Follows some of the logic from "RFC 3986 - 5.2.2. Transform Reference… | |
618 Returns 0 on success, -1 on error or truncation. */ | |
619 static int | |
620 uri_makeabs(struct uri *a, struct uri *u, struct uri *b) | |
621 { | |
622 char *p; | |
623 int c; | |
624 | |
625 strlcpy(a->fragment, u->fragment, sizeof(a->fragment)); | |
626 | |
627 if (u->proto[0] || u->host[0]) { | |
628 strlcpy(a->proto, u->proto[0] ? u->proto : b->proto, siz… | |
629 strlcpy(a->host, u->host, sizeof(a->host)); | |
630 strlcpy(a->userinfo, u->userinfo, sizeof(a->userinfo)); | |
631 strlcpy(a->host, u->host, sizeof(a->host)); | |
632 strlcpy(a->port, u->port, sizeof(a->port)); | |
633 strlcpy(a->path, u->path, sizeof(a->path)); | |
634 strlcpy(a->query, u->query, sizeof(a->query)); | |
635 return 0; | |
636 } | |
637 | |
638 strlcpy(a->proto, b->proto, sizeof(a->proto)); | |
639 strlcpy(a->host, b->host, sizeof(a->host)); | |
640 strlcpy(a->userinfo, b->userinfo, sizeof(a->userinfo)); | |
641 strlcpy(a->host, b->host, sizeof(a->host)); | |
642 strlcpy(a->port, b->port, sizeof(a->port)); | |
643 | |
644 if (!u->path[0]) { | |
645 strlcpy(a->path, b->path, sizeof(a->path)); | |
646 } else if (u->path[0] == '/') { | |
647 strlcpy(a->path, u->path, sizeof(a->path)); | |
648 } else { | |
649 a->path[0] = (b->host[0] && b->path[0] != '/') ? '/' : '… | |
650 a->path[1] = '\0'; | |
651 | |
652 if ((p = strrchr(b->path, '/'))) { | |
653 c = *(++p); | |
654 *p = '\0'; /* temporary NUL-terminate */ | |
655 if (strlcat(a->path, b->path, sizeof(a->path)) >… | |
656 return -1; | |
657 *p = c; /* restore */ | |
658 } | |
659 if (strlcat(a->path, u->path, sizeof(a->path)) >= sizeof… | |
660 return -1; | |
661 } | |
662 | |
663 if (u->path[0] || u->query[0]) | |
664 strlcpy(a->query, u->query, sizeof(a->query)); | |
665 else | |
666 strlcpy(a->query, b->query, sizeof(a->query)); | |
667 | |
668 return 0; | |
669 } | |
670 | |
671 static int | |
672 uri_format(char *buf, size_t bufsiz, struct uri *u) | |
673 { | |
674 return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s", | |
675 u->proto, | |
676 u->userinfo[0] ? u->userinfo : "", | |
677 u->userinfo[0] ? "@" : "", | |
678 u->host, | |
679 u->port[0] ? ":" : "", | |
680 u->port, | |
681 u->host[0] && u->path[0] && u->path[0] != '/' ? "/" : "", | |
682 u->path, | |
683 u->query[0] ? "?" : "", | |
684 u->query, | |
685 u->fragment[0] ? "#" : "", | |
686 u->fragment); | |
687 } | |
688 | |
/* Order two tag names without regard to case; the result has the sign
   convention of strcasecmp(). */
static int
tagcmp(const char *s1, const char *s2)
{
	int order = strcasecmp(s1, s2);

	return order;
}
695 | |
/* Order two attribute names without regard to case; the result has the sign
   convention of strcasecmp(). */
static int
attrcmp(const char *s1, const char *s2)
{
	int order = strcasecmp(s1, s2);

	return order;
}
702 | |
703 static void | |
704 rindent(void) | |
705 { | |
706 int i, total; | |
707 | |
708 total = indent + defaultindent; | |
709 if (total < 0) | |
710 total = 0; | |
711 for (i = 0; i < total; i++) | |
712 putchar(' '); | |
713 | |
714 nbytesline += total; | |
715 ncells += total; | |
716 } | |
717 | |
718 static void | |
719 emitmarkup(int markuptype) | |
720 { | |
721 if (!allowansi) | |
722 return; | |
723 | |
724 if (!markuptype) | |
725 fputs("\033[0m", stdout); /* reset all attributes */ | |
726 | |
727 /* set */ | |
728 if (markuptype & MarkupBold) | |
729 fputs("\033[1m", stdout); | |
730 if (markuptype & MarkupItalic) | |
731 fputs("\033[3m", stdout); | |
732 if (markuptype & MarkupUnderline) | |
733 fputs("\033[4m", stdout); | |
734 if (markuptype & MarkupBlink) | |
735 fputs("\033[5m", stdout); | |
736 if (markuptype & MarkupReverse) | |
737 fputs("\033[7m", stdout); | |
738 if (markuptype & MarkupStrike) | |
739 fputs("\033[9m", stdout); | |
740 } | |
741 | |
742 /* flush remaining buffer (containing a word): used for word-wrap handli… | |
743 static void | |
744 hflush(void) | |
745 { | |
746 int i; | |
747 | |
748 if (!rbuflen) | |
749 return; | |
750 | |
751 if (!nbytesline) { | |
752 if (curmarkup) | |
753 emitmarkup(0); | |
754 rindent(); | |
755 /* emit code again per line, needed for GNU/less -R */ | |
756 if (curmarkup) | |
757 emitmarkup(curmarkup); | |
758 } | |
759 | |
760 for (i = 0; i < rbuflen; i++) | |
761 putchar(rbuf[i]); | |
762 | |
763 nbytesline += rbuflen; | |
764 ncells += rnbufcells; | |
765 rbuflen = 0; | |
766 rnbufcells = 0; | |
767 } | |
768 | |
769 static void | |
770 printansi(const char *s) | |
771 { | |
772 size_t len; | |
773 | |
774 if (!allowansi) | |
775 return; | |
776 | |
777 if (linewrap) { | |
778 len = strlen(s); | |
779 if (rbuflen + len + 1 >= sizeof(rbuf)) | |
780 hflush(); | |
781 if (rbuflen + len + 1 < sizeof(rbuf)) { | |
782 memcpy(rbuf + rbuflen, s, len); | |
783 rbuflen += len; | |
784 /* NOTE: nbytesline and ncells are not counted f… | |
785 } | |
786 } else { | |
787 fputs(s, stdout); | |
788 } | |
789 } | |
790 | |
791 static void | |
792 setmarkup(int markuptype) | |
793 { | |
794 if (!allowansi) | |
795 return; | |
796 | |
797 /* need change? */ | |
798 if (curmarkup == markuptype) | |
799 return; | |
800 | |
801 if (!markuptype) { | |
802 printansi("\033[0m"); /* reset all attributes */ | |
803 curmarkup = markuptype; | |
804 return; | |
805 } | |
806 | |
807 /* set */ | |
808 if (!(curmarkup & MarkupBold) && (markuptype & MarkupBold)) | |
809 printansi("\033[1m"); | |
810 if (!(curmarkup & MarkupItalic) && (markuptype & MarkupItalic)) | |
811 printansi("\033[3m"); | |
812 if (!(curmarkup & MarkupUnderline) && (markuptype & MarkupUnderl… | |
813 printansi("\033[4m"); | |
814 if (!(curmarkup & MarkupBlink) && (markuptype & MarkupBlink)) | |
815 printansi("\033[5m"); | |
816 if (!(curmarkup & MarkupReverse) && (markuptype & MarkupReverse)) | |
817 printansi("\033[7m"); | |
818 if (!(curmarkup & MarkupStrike) && (markuptype & MarkupStrike)) | |
819 printansi("\033[9m"); | |
820 | |
821 /* unset */ | |
822 if ((curmarkup & MarkupBold) && !(markuptype & MarkupBold)) | |
823 printansi("\033[22m"); /* reset bold or faint */ | |
824 if ((curmarkup & MarkupItalic) && !(markuptype & MarkupItalic)) | |
825 printansi("\033[23m"); /* reset italic */ | |
826 if ((curmarkup & MarkupUnderline) && !(markuptype & MarkupUnderl… | |
827 printansi("\033[24m"); /* reset underline */ | |
828 if ((curmarkup & MarkupBlink) && !(markuptype & MarkupBlink)) | |
829 printansi("\033[25m"); /* reset blink */ | |
830 if ((curmarkup & MarkupReverse) && !(markuptype & MarkupReverse)) | |
831 printansi("\033[27m"); /* reset reverse */ | |
832 if ((curmarkup & MarkupStrike) && !(markuptype & MarkupStrike)) | |
833 printansi("\033[29m"); /* reset strike */ | |
834 | |
835 curmarkup = markuptype; | |
836 } | |
837 | |
838 static void | |
839 startmarkup(int markuptype) | |
840 { | |
841 setmarkup(curmarkup | markuptype); | |
842 } | |
843 | |
844 static void | |
845 endmarkup(int markuptype) | |
846 { | |
847 setmarkup(curmarkup & ~markuptype); | |
848 } | |
849 | |
850 /* rough cell width of a unicode codepoint by counting a unicode codepoi… | |
851 cell in general. | |
852 NOTE: this is of course incorrect since characters can be 2 width asw… | |
853 in the future maybe replace this with wcwidth() or similar */ | |
/* Cell width counted for one byte `c` of UTF-8 text: 8 for a TAB, 0 for a
   byte that is not the start of a codepoint (bit pattern 10xxxxxx) and 1 for
   every other byte. */
static int
utfwidth(int c)
{
	if (c == '\t')
		return 8;

	return ((c & 0xc0) == 0x80) ? 0 : 1;
}
865 | |
866 /* write a character, handling state of repeated newlines, some HTML | |
867 white-space rules, indentation and word-wrapping */ | |
868 static void | |
869 hputchar(int c) | |
870 { | |
871 struct node *cur = &nodes[curnode]; | |
872 cur->hasdata = 1; | |
873 | |
874 if (c == '\n') { | |
875 /* previous line had characters, so not a repeated newli… | |
876 if (nbytesline > 0) | |
877 hadnewline = 0; | |
878 | |
879 /* start a new line, no chars on this line yet */ | |
880 whitespace_mode &= ~2; /* no chars on this line yet */ | |
881 nbytesline = 0; | |
882 ncells = 0; | |
883 | |
884 if (hadnewline) | |
885 currentnewlines++; /* repeating newlines */ | |
886 hadnewline = 1; | |
887 } else { | |
888 hadnewline = 0; | |
889 currentnewlines = 0; | |
890 } | |
891 | |
892 /* skip initial/leading white-space */ | |
893 if (ISSPACE((unsigned char)c)) { | |
894 if (skipinitialws) | |
895 return; | |
896 } else { | |
897 skipinitialws = 0; | |
898 } | |
899 | |
900 if (!(c == '\n' || c == '\t' || !ISCNTRL((unsigned char)c))) | |
901 return; | |
902 | |
903 if (!linewrap) { | |
904 if (c == '\n') { | |
905 putchar('\n'); | |
906 nbytesline = 0; | |
907 ncells = 0; | |
908 } else { | |
909 if (!nbytesline) { | |
910 if (curmarkup) | |
911 emitmarkup(0); | |
912 rindent(); | |
913 /* emit code again per line, needed for … | |
914 if (curmarkup) | |
915 emitmarkup(curmarkup); | |
916 } | |
917 putchar(c); | |
918 nbytesline++; | |
919 ncells += utfwidth(c); | |
920 } | |
921 return; | |
922 } | |
923 | |
924 /* really too long: the whole word doesn't even fit, flush it */ | |
925 if (ncells + rnbufcells >= termwidth || rbuflen >= sizeof(rbuf) … | |
926 putchar('\n'); | |
927 nbytesline = 0; | |
928 ncells = 0; | |
929 hflush(); | |
930 } | |
931 | |
932 if (c == '\n') { | |
933 putchar('\n'); | |
934 hflush(); | |
935 return; | |
936 } else if (ISSPACE((unsigned char)c) || c == '-') { | |
937 if (ncells + rnbufcells >= termwidth) { | |
938 putchar('\n'); | |
939 nbytesline = 0; | |
940 ncells = 0; | |
941 } | |
942 rbuf[rbuflen++] = c; | |
943 rnbufcells += utfwidth(c); | |
944 hflush(); | |
945 return; | |
946 } | |
947 | |
948 rbuf[rbuflen++] = c; | |
949 rnbufcells += utfwidth(c); | |
950 } | |
951 | |
952 /* calculate indentation of current node depth, using the sum of each | |
953 indentation per node */ | |
954 static int | |
955 calcindent(void) | |
956 { | |
957 int i, n = 0; | |
958 | |
959 for (i = curnode; i >= 0; i--) | |
960 n += nodes[i].indent; | |
961 | |
962 return n; | |
963 } | |
964 | |
/* Pass a NUL-terminated string to hputchar() one byte at a time. */
static void
hprint(const char *s)
{
	const char *p = s;

	while (*p != '\0') {
		hputchar(*p);
		p++;
	}
}
971 | |
/* printf()-like wrapper around hprint().
   Short results are formatted into a stack buffer. Results that do not fit
   are formatted into a temporary heap buffer of the exact size, so long
   arguments (for example URLs) are no longer cut off. If that allocation
   fails the truncated stack buffer is printed instead. Nothing is printed
   when formatting itself fails. */
static void
hprintf(const char *fmt, ...)
{
	va_list ap, ap2;
	char stackbuf[256], *heapbuf;
	int n;

	va_start(ap, fmt);
	va_copy(ap2, ap); /* a va_list cannot be reused after vsnprintf() */
	n = vsnprintf(stackbuf, sizeof(stackbuf), fmt, ap);
	va_end(ap);

	if (n < 0) {
		/* encoding error: buffer contents are not usable */
		va_end(ap2);
		return;
	}

	if ((size_t)n < sizeof(stackbuf)) {
		va_end(ap2);
		/* use hprint() formatting logic. */
		hprint(stackbuf);
		return;
	}

	/* n is the length that would have been written, excluding the NUL */
	heapbuf = malloc((size_t)n + 1);
	if (!heapbuf) {
		va_end(ap2);
		hprint(stackbuf); /* best effort: truncated output */
		return;
	}
	vsnprintf(heapbuf, (size_t)n + 1, fmt, ap2);
	va_end(ap2);

	hprint(heapbuf);
	free(heapbuf);
}
986 | |
987 static void | |
988 newline(void) | |
989 { | |
990 if (skipinitialws) | |
991 return; | |
992 hputchar('\n'); | |
993 } | |
994 | |
995 static int | |
996 parentcontainerhasdata(int curtype, int n) | |
997 { | |
998 int i; | |
999 | |
1000 for (i = n; i >= 0; i--) { | |
1001 if (nodes[i].tag.displaytype & (DisplayList|DisplayTable… | |
1002 break; | |
1003 if (nodes[i].hasdata) | |
1004 return 1; | |
1005 } | |
1006 | |
1007 return 0; | |
1008 } | |
1009 | |
1010 /* start on a newline for the start of a block element or not */ | |
1011 static void | |
1012 startblock(void) | |
1013 { | |
1014 hflush(); | |
1015 whitespace_mode &= ~2; /* no characters on this line yet */ | |
1016 if (nbytesline <= 0) | |
1017 return; | |
1018 if (!hadnewline && curnode >= 0 && nodes[curnode - 1].hasdata) | |
1019 hputchar('\n'); | |
1020 } | |
1021 | |
1022 /* start on a newline for the end of a block element or not */ | |
1023 static void | |
1024 endblock(void) | |
1025 { | |
1026 hflush(); | |
1027 whitespace_mode &= ~2; /* no characters on this line yet */ | |
1028 if (nbytesline <= 0) | |
1029 return; | |
1030 if (!hadnewline) | |
1031 hputchar('\n'); | |
1032 } | |
1033 | |
1034 /* print one character safely: no control characters, | |
1035 handle HTML white-space rules */ | |
1036 static void | |
1037 printc(int c) | |
1038 { | |
1039 if (ISSPACE((unsigned char)c)) { | |
1040 if (whitespace_mode == 2) | |
1041 hputchar(' '); | |
1042 whitespace_mode |= 1; | |
1043 } else { | |
1044 whitespace_mode = 2; | |
1045 if (!ISCNTRL((unsigned char)c)) | |
1046 hputchar(c); | |
1047 } | |
1048 } | |
1049 | |
1050 static void | |
1051 printpre(const char *s, size_t len) | |
1052 { | |
1053 struct node *cur; | |
1054 size_t i; | |
1055 | |
1056 /* reset state of newlines because this data is printed literall… | |
1057 hadnewline = 0; | |
1058 currentnewlines = 0; | |
1059 | |
1060 /* skip leading newline */ | |
1061 i = 0; | |
1062 if (skipinitialws) { | |
1063 if (*s == '\n' && i < len) { | |
1064 s++; | |
1065 i++; | |
1066 } | |
1067 } | |
1068 | |
1069 hflush(); | |
1070 | |
1071 skipinitialws = 0; | |
1072 | |
1073 if (*s) { | |
1074 cur = &nodes[curnode]; | |
1075 cur->hasdata = 1; | |
1076 } | |
1077 | |
1078 for (; *s && i < len; s++, i++) { | |
1079 switch (*s) { | |
1080 case '\n': | |
1081 putchar('\n'); | |
1082 nbytesline = 0; | |
1083 ncells = 0; | |
1084 break; | |
1085 case '\t': | |
1086 hadnewline = 0; | |
1087 if (!nbytesline) { | |
1088 if (curmarkup) | |
1089 emitmarkup(0); | |
1090 rindent(); | |
1091 /* emit code again per line, needed for … | |
1092 if (curmarkup) | |
1093 emitmarkup(curmarkup); | |
1094 } | |
1095 | |
1096 /* TAB to 8 spaces */ | |
1097 fputs(" ", stdout); | |
1098 nbytesline += 8; | |
1099 ncells += 8; | |
1100 break; | |
1101 default: | |
1102 if (ISCNTRL((unsigned char)*s)) | |
1103 continue; | |
1104 | |
1105 if (!nbytesline) { | |
1106 if (curmarkup) | |
1107 emitmarkup(0); | |
1108 rindent(); | |
1109 /* emit code again per line, needed for … | |
1110 if (curmarkup) | |
1111 emitmarkup(curmarkup); | |
1112 } | |
1113 | |
1114 putchar(*s); | |
1115 nbytesline++; | |
1116 /* start of rune: incorrectly assume 1 rune is 1… | |
1117 ncells += utfwidth((unsigned char)*s); | |
1118 } | |
1119 } | |
1120 } | |
1121 | |
1122 static struct node * | |
1123 findparenttype(int cur, int findtype) | |
1124 { | |
1125 int i; | |
1126 | |
1127 for (i = cur; i >= 0; i--) { | |
1128 if ((nodes[i].tag.displaytype & findtype)) | |
1129 return &nodes[i]; | |
1130 } | |
1131 return NULL; | |
1132 } | |
1133 | |
/* Check whether the white-space separated class list in haystack contains
   the class name needle as a complete token (case-sensitive).
   Returns 1 on a match and 0 otherwise. An empty needle never matches.
   The separator set is the same set of bytes the ISSPACE() macro accepts:
   space, TAB, LF, VT, FF and CR. */
static int
isclassmatch(const char *haystack, const char *needle)
{
	static const char ws[] = " \t\n\v\f\r";
	const char *p;
	size_t needlelen, toklen;

	needlelen = strlen(needle);
	if (!needlelen)
		return 0;

	for (p = haystack; ; p += toklen) {
		p += strspn(p, ws); /* skip separators before the token */
		if (*p == '\0')
			break;
		toklen = strcspn(p, ws);
		/* compare whole tokens, so "foobar" does not match "bar" */
		if (toklen == needlelen && !memcmp(p, needle, needlelen))
			return 1;
	}

	return 0;
}
1159 | |
/* very limited CSS-like selector, supports: main, main#id, main.class,
   ".class", "#id", "ul li a" and an "@number" index suffix.

   sel:      selector text, components separated by white-space.
   nodes:    output array, one entry per component.
   maxnodes: number of entries available in nodes.

   Returns the number of components compiled, or 0 on failure: a name, id,
   class or index longer than 255 bytes, or depth reaching maxnodes. The
   depth check runs after every component, including the last one, so at
   most maxnodes - 1 components are accepted. */
static int
compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
{
	int depth = 0, len;
	long l;
	const char *s, *start;
	char tmp[256];
	int nameset = 0; /* tag name of the current component already stored */

	memset(&nodes[0], 0, sizeof(nodes[0]));
	nodes[0].index = -1; /* -1: no index requested */

	/* skip leading white-space */
	s = sel;
	for (; *s && ISSPACE((unsigned char)*s); s++)
		;

	start = s;
	for (; ; s++) {
		/* end of tag: the first delimiter after the start of a
		   component terminates its (possibly empty) tag name */
		if (!nameset &&
		    (*s == '#' || *s == '.' || *s == '@' ||
		     *s == '\0' || ISSPACE((unsigned char)*s))) {
			nameset = 1;
			len = s - start; /* tag name */
			if (len >= sizeof(tmp))
				return 0;
			if (len)
				memcpy(tmp, start, len);
			tmp[len] = '\0';

			/* NOTE(review): copies up to sizeof(tmp) bytes; assumes
			   the tagname field is at least that large - confirm */
			memcpy(nodes[depth].tagname, tmp, len + 1);
		}

		/* end of a component: skip separators and prepare the next
		   node entry */
		if (*s == '\0' || ISSPACE((unsigned char)*s)) {
			for (; ISSPACE((unsigned char)*s); s++)
				;
			start = s; /* start of a new tag */
			depth++;
			if (depth >= maxnodes)
				return 0;

			nameset = 0;
			memset(&nodes[depth], 0, sizeof(nodes[depth]));
			nodes[depth].index = -1;

			/* end of selector */
			if (*s == '\0')
				break;
		}

		/* index: "@" followed by a number parsed with strtol();
		   negative values are ignored */
		if (*s == '@') {
			len = strcspn(s + 1, ".#@ \t\n");
			if (len >= sizeof(tmp))
				return 0;
			memcpy(tmp, s + 1, len);
			tmp[len] = '\0';

			l = strtol(tmp, NULL, 10);
			if (l >= 0)
				nodes[depth].index = l;
			s += len; /* the loop increment steps past the last byte */
			start = s + 1;
			continue;
		}

		/* id: "#" followed by the id text */
		if (*s == '#') {
			len = strcspn(s + 1, ".#@ \t\n");
			if (len >= sizeof(tmp))
				return 0;
			memcpy(tmp, s + 1, len);
			tmp[len] = '\0';
			memcpy(nodes[depth].id, tmp, len + 1);
			s += len;
			start = s + 1;
			continue;
		}

		/* class: "." followed by the class name */
		if (*s == '.') {
			len = strcspn(s + 1, ".#@ \t\n");
			if (len >= sizeof(tmp))
				return 0;
			memcpy(tmp, s + 1, len);
			tmp[len] = '\0';
			/* allow only one classname for now: a later class
			   overwrites an earlier one */
			memcpy(nodes[depth].classnames, tmp, len + 1);
			s += len;
			start = s + 1;
			continue;
		}
	}

	return depth;
}
1259 | |
1260 static struct selector * | |
1261 newselector(const char *q) | |
1262 { | |
1263 struct selector *sel; | |
1264 int r; | |
1265 | |
1266 sel = ecalloc(1, sizeof(*sel)); | |
1267 sel->text = estrdup(q); | |
1268 | |
1269 r = compileselector(sel->text, sel->nodes, LEN(sel->nodes)); | |
1270 if (r <= 0) { | |
1271 free(sel); | |
1272 return NULL; | |
1273 } | |
1274 sel->depth = r; | |
1275 | |
1276 return sel; | |
1277 } | |
1278 | |
1279 static struct selectors * | |
1280 compileselectors(const char *q) | |
1281 { | |
1282 struct selectors *sels = NULL; | |
1283 struct selector *sel; | |
1284 const char *start; | |
1285 char *qe; | |
1286 int count = 0; | |
1287 size_t siz; | |
1288 | |
1289 sels = ecalloc(1, sizeof(*sels)); | |
1290 | |
1291 start = q; | |
1292 for (; ; q++) { | |
1293 if (*q == ',' || *q == '\0') { | |
1294 qe = estrndup(start, q - start); | |
1295 sel = newselector(qe); | |
1296 free(qe); | |
1297 | |
1298 /* add new selector */ | |
1299 siz = (count + 1) * sizeof(struct selector *); | |
1300 sels->selectors = erealloc(sels->selectors, siz); | |
1301 sels->selectors[count] = sel; | |
1302 count++; | |
1303 | |
1304 if (*q == '\0') | |
1305 break; | |
1306 start = q + 1; | |
1307 } | |
1308 } | |
1309 sels->count = count; | |
1310 | |
1311 return sels; | |
1312 } | |
1313 | |
1314 /* very limited CSS-like matcher, supports: main, main#id, main.class, | |
1315 ".class", "#id", "ul li a" */ | |
1316 static int | |
1317 iscssmatch(struct selector *sel, struct node *root, int maxdepth) | |
1318 { | |
1319 int d, md = 0; | |
1320 | |
1321 for (d = 0; d <= maxdepth; d++) { | |
1322 /* tag matched? */ | |
1323 if (sel->nodes[md].tagname[0] && | |
1324 strcasecmp(sel->nodes[md].tagname, root[d].tagname)) | |
1325 continue; /* no */ | |
1326 | |
1327 /* id matched? */ | |
1328 if (sel->nodes[md].id[0] && strcmp(sel->nodes[md].id, ro… | |
1329 continue; /* no */ | |
1330 | |
1331 /* class matched, for now allow only one classname in th… | |
1332 matching multiple classnames */ | |
1333 if (sel->nodes[md].classnames[0] && | |
1334 !isclassmatch(root[d].classnames, sel->nodes[md].cla… | |
1335 continue; /* no */ | |
1336 | |
1337 /* index matched */ | |
1338 if (sel->nodes[md].index != -1 && | |
1339 (d == 0 || | |
1340 root[d - 1].nchildren == 0 || | |
1341 sel->nodes[md].index != root[d - 1].nchildren - 1)) | |
1342 continue; | |
1343 | |
1344 md++; | |
1345 /* all matched of one selector */ | |
1346 if (md == sel->depth) | |
1347 return 1; | |
1348 } | |
1349 | |
1350 return 0; | |
1351 } | |
1352 | |
1353 static int | |
1354 iscssmatchany(struct selectors *sels, struct node *root, int maxdepth) | |
1355 { | |
1356 struct selector *sel; | |
1357 int i; | |
1358 | |
1359 for (i = 0; i < sels->count; i++) { | |
1360 sel = sels->selectors[i]; | |
1361 if (iscssmatch(sel, root, maxdepth)) | |
1362 return 1; | |
1363 } | |
1364 return 0; | |
1365 } | |
1366 | |
1367 static void | |
1368 handleinlinealt(void) | |
1369 { | |
1370 struct node *cur; | |
1371 char *start, *s, *e; | |
1372 | |
1373 /* do not show the alt text if the element is hidden */ | |
1374 cur = &nodes[curnode]; | |
1375 if (cur->tag.displaytype & DisplayNone) | |
1376 return; | |
1377 | |
1378 /* show img alt attribute as text. */ | |
1379 if (attr_alt.len) { | |
1380 start = attr_alt.data; | |
1381 e = attr_alt.data + attr_alt.len; | |
1382 | |
1383 for (s = start; s < e; s++) | |
1384 printc((unsigned char)*s); | |
1385 hflush(); | |
1386 } else if (cur->tag.id == TagImg && !showurlinline) { | |
1387 /* if there is no alt text and no URL is shown inline, t… | |
1388 show "[IMG]" to indicate there was an image there */ | |
1389 hprint("[IMG]"); | |
1390 } | |
1391 } | |
1392 | |
1393 /* lookup a link reference by url in the red-black tree */ | |
1394 static struct linkref * | |
1395 findlinkref(const char *url) | |
1396 { | |
1397 struct linkref find; | |
1398 | |
1399 find.url = (char *)url; | |
1400 | |
1401 return RB_FIND(linkreftree, &linkrefhead, &find); | |
1402 } | |
1403 | |
1404 /* add a link reference. Returns the added link reference, or the existi… | |
1405 reference if links are deduplicated */ | |
1406 static struct linkref * | |
1407 addlinkref(const char *url, const char *_type, enum TagId tagid, int ish… | |
1408 { | |
1409 struct linkref *link; | |
1410 size_t linknr; | |
1411 | |
1412 /* if links are deduplicates return the existing link */ | |
1413 if (uniqrefs && (link = findlinkref(url))) | |
1414 return link; | |
1415 | |
1416 if (tagid == TagA) | |
1417 _type = "link"; | |
1418 | |
1419 link = ecalloc(1, sizeof(*link)); | |
1420 | |
1421 if (!ishidden) { | |
1422 linknr = ++nvisrefs; | |
1423 if (nvisrefs >= ncapvisrefs) { | |
1424 ncapvisrefs += 256; /* greedy alloc */ | |
1425 visrefs = erealloc(visrefs, sizeof(*visrefs) * n… | |
1426 } | |
1427 visrefs[linknr - 1] = link; /* add pointer to list */ | |
1428 } else { | |
1429 linknr = ++nhiddenrefs; | |
1430 if (nhiddenrefs >= ncaphiddenrefs) { | |
1431 ncaphiddenrefs += 256; /* greedy alloc */ | |
1432 hiddenrefs = erealloc(hiddenrefs, sizeof(*hidden… | |
1433 } | |
1434 hiddenrefs[linknr - 1] = link; /* add pointer to list */ | |
1435 } | |
1436 | |
1437 link->url = estrdup(url); | |
1438 link->type = estrdup(_type); | |
1439 link->tagid = tagid; | |
1440 link->ishidden = ishidden; | |
1441 link->linknr = linknr; | |
1442 | |
1443 /* add to tree: the tree is only used for checking unique link r… | |
1444 if (uniqrefs) | |
1445 RB_INSERT(linkreftree, &linkrefhead, link); | |
1446 | |
1447 return link; | |
1448 } | |
1449 | |
1450 static void | |
1451 handleinlinelink(void) | |
1452 { | |
1453 struct uri newuri, olduri; | |
1454 struct node *cur; | |
1455 char buf[4096], *url; | |
1456 int r; | |
1457 | |
1458 if (!showrefbottom && !showrefinline && !showurlinline && !resou… | |
1459 return; /* there is no need to collect the reference */ | |
1460 | |
1461 if (!attr_href.len && !attr_src.len && !attr_data.len) | |
1462 return; /* there is no reference */ | |
1463 | |
1464 /* by default use the original URL */ | |
1465 if (attr_src.len) | |
1466 url = attr_src.data; | |
1467 else if (attr_href.len) | |
1468 url = attr_href.data; | |
1469 else | |
1470 url = attr_data.data; | |
1471 | |
1472 if (!url) | |
1473 return; | |
1474 | |
1475 /* Not an absolute URL yet: try to make it absolute. | |
1476 If it is not possible use the relative URL */ | |
1477 if (!uri_hasscheme(url) && basehrefset && | |
1478 uri_parse(url, &olduri) != -1 && | |
1479 uri_makeabs(&newuri, &olduri, &base) != -1 && | |
1480 newuri.proto[0]) { | |
1481 r = uri_format(buf, sizeof(buf), &newuri); | |
1482 if (r >= 0 && (size_t)r < sizeof(buf)) | |
1483 url = buf; | |
1484 } | |
1485 | |
1486 if (!url[0]) | |
1487 return; | |
1488 | |
1489 cur = &nodes[curnode]; | |
1490 | |
1491 if (!(cur->tag.displaytype & DisplayNone)) { | |
1492 string_clear(&nodes_links[curnode]); | |
1493 string_append(&nodes_links[curnode], url, strlen(url)); | |
1494 } | |
1495 | |
1496 /* add hidden links directly to the reference, | |
1497 the order doesn't matter */ | |
1498 if (cur->tag.displaytype & DisplayNone) | |
1499 addlinkref(url, cur->tag.name, cur->tag.id, 1); | |
1500 } | |
1501 | |
1502 static void | |
1503 printlinkrefs(void) | |
1504 { | |
1505 struct linkref *ref; | |
1506 size_t i; | |
1507 | |
1508 if (!nvisrefs && !nhiddenrefs) | |
1509 return; | |
1510 | |
1511 if (resources) { | |
1512 for (i = 0; i < nvisrefs; i++) { | |
1513 ref = visrefs[i]; | |
1514 dprintf(3, "%s\t%s\n", ref->type, ref->url); | |
1515 } | |
1516 for (i = 0; i < nhiddenrefs; i++) { | |
1517 ref = hiddenrefs[i]; | |
1518 dprintf(3, "%s\t%s\n", ref->type, ref->url); | |
1519 } | |
1520 } | |
1521 | |
1522 printf("\nReferences\n\n"); | |
1523 | |
1524 for (i = 0; i < nvisrefs; i++) { | |
1525 ref = visrefs[i]; | |
1526 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->ty… | |
1527 } | |
1528 | |
1529 if (nhiddenrefs > 0) | |
1530 printf("\n\nHidden references\n\n"); | |
1531 /* hidden links don't have a link number, just count them */ | |
1532 for (i = 0; i < nhiddenrefs; i++) { | |
1533 ref = hiddenrefs[i]; | |
1534 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->ty… | |
1535 } | |
1536 } | |
1537 | |
1538 /* size to grow node capacity (greedy) */ | |
1539 #define NODE_CAP_INC 16 | |
1540 | |
1541 /* increase node depth, allocate space for nodes if needed */ | |
1542 static void | |
1543 incnode(void) | |
1544 { | |
1545 size_t i; | |
1546 | |
1547 curnode++; | |
1548 | |
1549 if (curnode >= MAX_NODE_DEPTH) | |
1550 errx(1, "max node depth reached: %d", curnode); | |
1551 | |
1552 if (curnode >= ncapnodes) { | |
1553 nodes = erealloc(nodes, sizeof(*nodes) * (ncapnodes + NO… | |
1554 nodes_links = erealloc(nodes_links, sizeof(*nodes_links)… | |
1555 | |
1556 /* clear new region */ | |
1557 memset(&nodes[ncapnodes], 0, sizeof(*nodes) * NODE_CAP_I… | |
1558 memset(&nodes_links[ncapnodes], 0, sizeof(*nodes_links) … | |
1559 | |
1560 for (i = ncapnodes; i < ncapnodes + NODE_CAP_INC; i++) { | |
1561 nodes[i].tag.displaytype = DisplayInline; | |
1562 nodes[i].tag.name = nodes[i].tagname; /* assign … | |
1563 } | |
1564 | |
1565 ncapnodes += NODE_CAP_INC; /* greedy alloc */ | |
1566 } | |
1567 } | |
1568 | |
/* Parser callback for the start of a data section: intentionally empty. */
static void
xmldatastart(XMLParser *p)
{
}
1573 | |
1574 static void | |
1575 xmldataend(XMLParser *p) | |
1576 { | |
1577 struct node *cur; | |
1578 char *start, *s, *e; | |
1579 | |
1580 if (!htmldata.data || !htmldata.len) | |
1581 return; | |
1582 | |
1583 cur = &nodes[curnode]; | |
1584 | |
1585 if (reader_ignore || (cur->tag.displaytype & DisplayNone)) { | |
1586 /* print nothing */ | |
1587 } else if ((cur->tag.displaytype & DisplayPre) || | |
1588 findparenttype(curnode - 1, DisplayPre)) { | |
1589 printpre(htmldata.data, htmldata.len); | |
1590 } else { | |
1591 start = htmldata.data; | |
1592 e = htmldata.data + htmldata.len; | |
1593 | |
1594 for (s = start; s < e; s++) | |
1595 printc((unsigned char)*s); | |
1596 } | |
1597 | |
1598 string_clear(&htmldata); | |
1599 } | |
1600 | |
1601 static void | |
1602 xmldata(XMLParser *p, const char *data, size_t datalen) | |
1603 { | |
1604 struct node *cur; | |
1605 | |
1606 if (reader_ignore) | |
1607 return; | |
1608 | |
1609 cur = &nodes[curnode]; | |
1610 if (cur->tag.displaytype & DisplayNone) | |
1611 return; | |
1612 | |
1613 string_append(&htmldata, data, datalen); | |
1614 } | |
1615 | |
1616 static void | |
1617 xmldataentity(XMLParser *p, const char *data, size_t datalen) | |
1618 { | |
1619 struct node *cur; | |
1620 char buf[16]; | |
1621 int n; | |
1622 | |
1623 if (reader_ignore) | |
1624 return; | |
1625 | |
1626 cur = &nodes[curnode]; | |
1627 if (cur->tag.displaytype & DisplayNone) | |
1628 return; | |
1629 | |
1630 n = xml_entitytostr(data, buf, sizeof(buf)); | |
1631 if (n > 0) | |
1632 xmldata(p, buf, (size_t)n); | |
1633 else | |
1634 xmldata(p, data, datalen); | |
1635 } | |
1636 | |
/* Parser callback for the start of a CDATA section: CDATA is treated as
   data, so this forwards to xmldatastart(). */
static void
xmlcdatastart(XMLParser *p)
{
	xmldatastart(p);
}
1642 | |
/* Parser callback for the end of a CDATA section: forwards to
   xmldataend(). */
static void
xmlcdataend(XMLParser *p)
{
	xmldataend(p); /* treat CDATA as data */
}
1648 | |
/* Parser callback for a chunk of CDATA: forwards the bytes to xmldata(). */
static void
xmlcdata(XMLParser *p, const char *data, size_t datalen)
{
	xmldata(p, data, datalen); /* treat CDATA as data */
}
1654 | |
1655 /* lookup function to compare tag name (case-insensitive) for sort funct… | |
1656 static int | |
1657 findtagcmp(const void *v1, const void *v2) | |
1658 { | |
1659 struct tag *t1 = (struct tag *)v1; | |
1660 struct tag *t2 = (struct tag *)v2; | |
1661 | |
1662 return strcasecmp(t1->name, t2->name); | |
1663 } | |
1664 | |
1665 /* binary search tag by tag name */ | |
1666 static struct tag * | |
1667 findtag(const char *t) | |
1668 { | |
1669 struct tag find = { 0 }; | |
1670 | |
1671 find.name = t; | |
1672 | |
1673 return bsearch(&find, tags, LEN(tags), sizeof(*tags), findtagcmp… | |
1674 } | |
1675 | |
1676 static void | |
1677 handleendtag(struct tag *tag) | |
1678 { | |
1679 int i, marginbottom; | |
1680 | |
1681 if (tag->displaytype & DisplayNone) | |
1682 return; | |
1683 if (reader_ignore) | |
1684 return; | |
1685 | |
1686 if (tag->displaytype & (DisplayButton | DisplayOption)) { | |
1687 hputchar(']'); | |
1688 hflush(); | |
1689 } | |
1690 | |
1691 if (tag->displaytype & (DisplayBlock | DisplayHeader | DisplayTa… | |
1692 DisplayList | DisplayListItem | DisplayPre)) { | |
1693 endblock(); /* break line if needed */ | |
1694 } | |
1695 | |
1696 /* when a list ends and its not inside a list add an extra botto… | |
1697 marginbottom = tag->marginbottom; | |
1698 | |
1699 if (marginbottom > 0) { | |
1700 if (tag->displaytype & DisplayList) { | |
1701 if (findparenttype(curnode - 1, DisplayList)) | |
1702 marginbottom--; | |
1703 } | |
1704 } | |
1705 | |
1706 if (marginbottom > 0) { | |
1707 hflush(); | |
1708 for (i = currentnewlines; i < marginbottom; i++) { | |
1709 putchar('\n'); | |
1710 nbytesline = 0; | |
1711 ncells = 0; | |
1712 currentnewlines++; | |
1713 } | |
1714 hadnewline = 1; | |
1715 } | |
1716 } | |
1717 | |
1718 static void | |
1719 endnode(struct node *cur) | |
1720 { | |
1721 struct linkref *ref; | |
1722 int i, ishidden; | |
1723 | |
1724 /* set a flag indicating the element and its parent containers h… | |
1725 This is used for some formatting */ | |
1726 if (cur->hasdata) { | |
1727 for (i = curnode; i >= 0; i--) | |
1728 nodes[i].hasdata = 1; | |
1729 } | |
1730 | |
1731 endmarkup(cur->tag.markuptype); | |
1732 | |
1733 ishidden = reader_ignore || (cur->tag.displaytype & DisplayNone); | |
1734 | |
1735 /* add link and show the link number in the visible order */ | |
1736 if (!ishidden && nodes_links[curnode].len > 0) { | |
1737 ref = addlinkref(nodes_links[curnode].data, | |
1738 cur->tag.name, cur->tag.id, ishidden); | |
1739 | |
1740 if (showrefinline || showurlinline) { | |
1741 hflush(); | |
1742 startmarkup(MarkupReverse); | |
1743 } | |
1744 | |
1745 if (showrefinline) | |
1746 hprintf("[%zu]", ref->linknr); | |
1747 if (showurlinline) { | |
1748 if (ref->tagid == TagA) | |
1749 hprintf("[%s]", ref->url); | |
1750 else | |
1751 hprintf("[%s: %s]", ref->type, ref->url); | |
1752 } | |
1753 if (showrefinline || showurlinline) { | |
1754 endmarkup(MarkupReverse); | |
1755 hflush(); | |
1756 } | |
1757 } | |
1758 | |
1759 handleendtag(&(cur->tag)); | |
1760 } | |
1761 | |
/* Parser callback for an end tag (also called directly to fake closing a
   tag).
   t:       tag name.
   tl:      length of t (not used in this function).
   isshort: non-zero for a self-closing tag.
   Closes still-open optional child elements first, then closes the
   matching open node (or the nearest matching one above it), recalculates
   the indentation, restores the markup of the node that is now current and
   re-checks the reader-mode show selector. */
static void
xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
{
	struct tag *found, *tag;
	enum TagId child, childs[16];
	size_t nchilds;
	int i, j, k, nchildfound, parenttype;

	/* match tag and lookup metadata */
	/* ignore an explicit closing tag of a void element, like </br> */
	if ((found = findtag(t))) {
		if (!isshort && found->isvoid)
			return;
	}

	/* TODO: implement more complete optional tag handling.
	   in reality the optional tag rules are more complex, see:
	   https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */

	child = 0;
	nchilds = 0;
	nchildfound = 0;
	parenttype = 0; /* by default, seek until the root */

	/* pick which open child tags are implicitly closed by this end tag
	   (childs) and at which container type the search stops (parenttype) */
	if (found && found->displaytype & DisplayPre) {
		skipinitialws = 0; /* do not skip white-space after this tag */
	} else if (found && found->displaytype & DisplayList) {
		childs[0] = TagLi;
		nchilds = 1;
		parenttype = DisplayList;
	} else if (found && found->displaytype & DisplayTableRow) {
		childs[0] = TagTd;
		nchilds = 1;
		parenttype = DisplayTableRow;
	} else if (found && found->displaytype & DisplayTable) {
		childs[0] = TagTd;
		nchilds = 1;
		parenttype = DisplayTable;
	} else if (found && found->displaytype & DisplaySelect) {
		childs[0] = TagOption;
		nchilds = 1;
		parenttype = DisplaySelect;
	} else if (found && found->displaytype & DisplayDl) {
		childs[0] = TagP;
		childs[1] = TagDd;
		childs[2] = TagDt;
		nchilds = 3;
		parenttype = DisplayDl;
	} else if (found && found->displaytype & DisplayBlock) {
		childs[0] = TagP;
		nchilds = 1;
		parenttype = 0; /* seek until the root */
	}

	if (nchilds > 0) {
		/* search upwards from the current node and stop at the first
		   closed child or at a node of the parent type */
		for (i = curnode; i >= 0; i--) {
			if (nchildfound)
				break;
			if ((nodes[i].tag.displaytype & parenttype))
				break;
			for (j = 0; j < nchilds; j++) {
				child = childs[j];
				if (nodes[i].tag.id == child) {
					/* fake closing the previous tags, from
					   the current node up to the child.
					   NOTE(review): curnode is not updated
					   inside this loop while endnode()
					   reads nodes_links[curnode] - confirm
					   this is intended */
					for (k = curnode; k >= i; k--)
						endnode(&nodes[k]);
					/* NOTE(review): k is i - 1 here, which
					   is -1 if i can be 0 - confirm */
					curnode = k;
					nchildfound = 1;
					break;
				}
			}
		}
	}

	/* if the current closing tag matches the current open tag */
	if (nodes[curnode].tag.name &&
	    !tagcmp(nodes[curnode].tag.name, t)) {
		endnode(&nodes[curnode]);
		if (curnode)
			curnode--;
	} else {
		/* ... else lookup the first matching start tag. This is also
		   for handling optional closing tags */
		tag = NULL;
		for (i = curnode; i >= 0; i--) {
			if (nodes[i].tag.name &&
			    !tagcmp(nodes[i].tag.name, t)) {
				endnode(&nodes[i]);
				curnode = i > 0 ? i - 1 : 0; /* never below index 0 */
				tag = &nodes[i].tag;
				break;
			}
		}
		/* unmatched closing tag found: still apply the end tag
		   formatting of the known tag */
		if (!tag && found)
			handleendtag(found);
	}
	indent = calcindent();

#if 0
	/* check if linewrap is enabled, but currently is disabled and needs
	   to be restored */
	if (allowlinewrap && !linewrap) {
		tag = NULL;
		for (i = curnode; i >= 0; i--) {
			if (nodes[i].tag.id == TagTable) {
				tag = &nodes[i].tag;
				break;
			}
		}
		if (!tag)
			linewrap = allowlinewrap;
	}
#endif

	/* restore markup of the tag we are in now */
	startmarkup(nodes[curnode].tag.markuptype);

	/* check if the current node still matches the visible selector; if
	   not, start ignoring output again */
	if (reader_mode && sel_show && !reader_ignore) {
		if (!iscssmatchany(sel_show, nodes, curnode)) {
			reader_ignore = 1;
			newline();
		}
	}
}
1888 | |
1889 static void | |
1890 xmltagstart(XMLParser *p, const char *t, size_t tl) | |
1891 { | |
1892 struct tag *found; | |
1893 struct node *cur; | |
1894 enum TagId tagid; | |
1895 enum TagId child, childs[16]; | |
1896 size_t nchilds; | |
1897 char *s; | |
1898 int i, j, k, nchildfound, parenttype; | |
1899 | |
1900 cur = &nodes[curnode]; | |
1901 | |
1902 string_clear(&attr_alt); | |
1903 string_clear(&attr_checked); | |
1904 string_clear(&attr_class); | |
1905 attr_class_set = 0; | |
1906 string_clear(&attr_data); | |
1907 string_clear(&attr_href); | |
1908 string_clear(&attr_id); | |
1909 attr_id_set = 0; | |
1910 string_clear(&attr_src); | |
1911 string_clear(&attr_type); | |
1912 string_clear(&attr_value); | |
1913 | |
1914 /* match tag and lookup metadata */ | |
1915 found = findtag(t); | |
1916 | |
1917 /* TODO: implement more complete optional tag handling. | |
1918 in reality the optional tag rules are more complex, see: | |
1919 https://html.spec.whatwg.org/multipage/syntax.html#optional-t… | |
1920 | |
1921 child = 0; | |
1922 nchilds = 0; | |
1923 nchildfound = 0; | |
1924 parenttype = 0; /* by default, seek until the root */ | |
1925 | |
1926 /* if optional tag <p> is open and a list element is found, clos… | |
1927 if (found && found->displaytype & DisplayList) { | |
1928 /* not inside a list */ | |
1929 childs[0] = TagP; | |
1930 nchilds = 1; | |
1931 parenttype = DisplayList; | |
1932 } else if (found && found->isoptional) { | |
1933 tagid = found->id; | |
1934 if (tagid == TagLi) { | |
1935 childs[0] = TagLi; | |
1936 nchilds = 1; | |
1937 parenttype = DisplayList; | |
1938 } else if (tagid == TagTd) { | |
1939 childs[0] = TagTd; | |
1940 nchilds = 1; | |
1941 parenttype = DisplayTableRow; | |
1942 } else if (tagid == TagTr) { | |
1943 childs[0] = TagTr; | |
1944 nchilds = 1; | |
1945 parenttype = DisplayTable; | |
1946 } else if (tagid == TagP) { | |
1947 childs[0] = TagP; | |
1948 nchilds = 1; | |
1949 parenttype = 0; /* seek until the root */ | |
1950 } else if (tagid == TagOption) { | |
1951 childs[0] = TagOption; | |
1952 nchilds = 1; | |
1953 parenttype = DisplaySelect; | |
1954 } else if (tagid == TagDt) { | |
1955 childs[0] = TagDd; | |
1956 nchilds = 1; | |
1957 parenttype = DisplayDl; | |
1958 } else if (tagid == TagDd) { | |
1959 childs[0] = TagDd; | |
1960 childs[1] = TagDt; | |
1961 nchilds = 2; | |
1962 parenttype = DisplayDl; | |
1963 } else if (tagid == cur->tag.id) { | |
1964 /* fake closing the previous tag if it is the sa… | |
1965 xmltagend(p, t, tl, 0); | |
1966 } | |
1967 } else if (found && found->displaytype & DisplayBlock) { | |
1968 /* check if we have an open "<p>" tag */ | |
1969 childs[0] = TagP; | |
1970 childs[1] = TagDl; | |
1971 nchilds = 2; | |
1972 parenttype = DisplayDl; | |
1973 } | |
1974 | |
1975 if (nchilds > 0) { | |
1976 for (i = curnode; i >= 0; i--) { | |
1977 if (nchildfound) | |
1978 break; | |
1979 if ((nodes[i].tag.displaytype & parenttype)) | |
1980 break; | |
1981 for (j = 0; j < nchilds; j++) { | |
1982 child = childs[j]; | |
1983 if (nodes[i].tag.id == child) { | |
1984 /* fake closing the previous tag… | |
1985 for (k = curnode; k >= i; k--) | |
1986 xmltagend(p, nodes[k].ta… | |
1987 nchildfound = 1; | |
1988 break; | |
1989 } | |
1990 } | |
1991 } | |
1992 } | |
1993 | |
1994 incnode(); | |
1995 string_clear(&nodes_links[curnode]); /* clear possible link refe… | |
1996 cur = &nodes[curnode]; | |
1997 memset(cur, 0, sizeof(*cur)); /* clear / reset node */ | |
1998 /* tag defaults */ | |
1999 cur->tag.displaytype = DisplayInline; | |
2000 cur->tag.name = cur->tagname; /* assign fixed-size buffer */ | |
2001 strlcpy(cur->tagname, t, sizeof(cur->tagname)); | |
2002 | |
2003 /* force to lowercase */ | |
2004 for (s = cur->tagname; *s; s++) | |
2005 *s = TOLOWER((unsigned char)*s); | |
2006 | |
2007 /* matched tag: copy tag information to current node */ | |
2008 if (found) | |
2009 memcpy(&(cur->tag), found, sizeof(*found)); | |
2010 | |
2011 /* if parent tag is hidden then hide itself too */ | |
2012 if (curnode > 0 && (nodes[curnode - 1].tag.displaytype & Display… | |
2013 cur->tag.displaytype |= DisplayNone; | |
2014 } | |
2015 | |
/* Start-tag handler registered as parser.xmltagstartparsed in main().
   It uses the attribute values collected by xmlattr() (attr_id, attr_class,
   attr_type, attr_value, attr_checked), so presumably the parser calls it
   after the attributes of the tag were processed.

   Steps, in order:
   - look up the tag with findtag();
   - for script and style tags: install getnext_ignore as the reader (the
     previous reader is saved in getnext), fake the end tag and return;
   - copy id and class names into the current node and update the child
     counters of the parent node;
   - apply the reader-mode selectors (sel_show / sel_hide);
   - handle inline links / alt text and the select/option visibility;
   - return early when the node is hidden (DisplayNone) or reader_ignore is set;
   - otherwise write the start of the element: block break, top margin,
     button/option bracket, table cell tab, list bullet or number, input
     placeholder, markup, ruler for hr and newline for br;
   - void tags that were not written in short form are closed immediately.

   p:       the parser; only p->getnext is accessed here, p is also passed on
            to xmltagend().
   t, tl:   tag name and (presumably) its length.
   isshort: non-zero when the tag was written in short form such as <br/>. */
2016 static void | |
2017 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) | |
2018 { | |
2019 struct tag *found; | |
2020 enum TagId tagid; | |
2021 struct node *cur, *parent; | |
2022 int i, margintop; | |
2023 | |
2024 /* match tag and lookup metadata */ | |
2025 tagid = 0; | |
2026 if ((found = findtag(t))) | |
2027 tagid = found->id; | |
2028 | |
2029 /* temporary replace the callback except the reader and end of t… | |
2030 restore the context once we receive the same ignored tag in t… | |
2031 end tag handler */ | |
2032 if (tagid == TagScript) { | |
2033 ignorestate = endtag = "</script>"; | |
2034 getnext = p->getnext; /* for restore */ | |
2035 p->getnext = getnext_ignore; | |
2036 xmltagend(p, t, tl, 0); /* fake the call the tag was end… | |
2037 return; | |
2038 } else if (tagid == TagStyle) { | |
2039 ignorestate = endtag = "</style>"; | |
2040 getnext = p->getnext; /* for restore */ | |
2041 p->getnext = getnext_ignore; | |
2042 xmltagend(p, t, tl, 0); /* fake the call the tag was end… | |
2043 return; | |
2044 } | |
2045 | |
2046 #if 0 | |
2047 /* disable line-wrapping inside tables */ | |
2048 if (tagid == TagTable) | |
2049 linewrap = 0; | |
2050 #endif | |
2051 | |
2052 cur = &nodes[curnode]; | |
2053 | |
2054 /* copy attributes if set */ | |
2055 if (attr_id.len) | |
2056 strlcpy(cur->id, attr_id.data, sizeof(cur->id)); | |
2057 else | |
2058 cur->id[0] = '\0'; | |
2059 if (attr_class.len) | |
2060 strlcpy(cur->classnames, attr_class.data, sizeof(cur->cl… | |
2061 else | |
2062 cur->classnames[0] = '\0'; | |
2063 | |
2064 /* parent node */ | |
2065 if (curnode > 0) { | |
2066 parent = &nodes[curnode - 1]; | |
2067 parent->nchildren++; /* increase child node count */ | |
2068 /* count visible childnodes */ | |
2069 if (!(cur->tag.displaytype & DisplayNone)) | |
2070 parent->visnchildren++; | |
2071 } else { | |
2072 parent = NULL; | |
2073 } | |
2074 | |
/* reader mode with show-selectors: stop ignoring output as soon as the
   current node matches one of them */
2075 if (reader_mode && sel_show && reader_ignore && | |
2076 iscssmatchany(sel_show, nodes, curnode)) | |
2077 reader_ignore = 0; | |
2078 | |
2079 /* hide element */ | |
2080 if (reader_mode && sel_hide && | |
2081 iscssmatchany(sel_hide, nodes, curnode)) | |
2082 cur->tag.displaytype |= DisplayNone; | |
2083 | |
2084 /* indent for this tag */ | |
2085 cur->indent = cur->tag.indent; | |
2086 | |
2087 if (!reader_ignore) { | |
2088 /* add link reference, print links and alt text */ | |
2089 handleinlinelink(); | |
2090 handleinlinealt(); | |
2091 } | |
2092 | |
2093 /* <select><option> */ | |
2094 if (cur->tag.displaytype & DisplayOption) { | |
2095 /* <select multiple>: show all options */ | |
/* NOTE(review): parent is set to NULL when curnode is 0 (see the parent
   node block earlier in this function), yet it is dereferenced here without
   the "if (parent)" guard that the table cell and list item branches use;
   confirm that a node can never be at index 0 when this runs */
2096 if (parent->tag.displaytype & DisplaySelectMulti) | |
2097 cur->tag.displaytype |= DisplayBlock; | |
2098 else if (parent->nchildren > 1) /* show the first item a… | |
2099 cur->tag.displaytype |= DisplayNone; /* else hid… | |
2100 } | |
2101 | |
2102 if (cur->tag.displaytype & DisplayNone) | |
2103 return; | |
2104 | |
2105 if (reader_ignore) | |
2106 return; | |
2107 | |
2108 indent = calcindent(); | |
2109 | |
2110 if ((cur->tag.displaytype & (DisplayBlock | DisplayHeader | Disp… | |
2111 DisplayTable | DisplayTableRow | | |
2112 DisplayList | DisplayListItem))) { | |
2113 startblock(); /* break line if needed */ | |
2114 } | |
2115 | |
2116 if (cur->tag.displaytype & (DisplayButton | DisplayOption)) { | |
2117 hflush(); | |
2118 hputchar('['); | |
2119 } | |
2120 | |
/* top margin: start from the margin of the tag and reduce it by one line
   in two cases. For a list: walk up the ancestors until the enclosing list
   is reached; the first list item on the way that has data reduces the
   margin. For a block or table: reduce it when parentcontainerhasdata()
   reports no data. */
2121 margintop = cur->tag.margintop; | |
2122 if (cur->tag.displaytype & (DisplayList)) { | |
2123 for (i = curnode - 1; i >= 0; i--) { | |
2124 if (nodes[i].tag.displaytype & DisplayList) | |
2125 break; | |
2126 if (!(nodes[i].tag.displaytype & DisplayListItem… | |
2127 continue; | |
2128 if (nodes[i].hasdata && margintop > 0) { | |
2129 margintop--; | |
2130 break; | |
2131 } | |
2132 } | |
2133 } else if (cur->tag.displaytype & (DisplayBlock|DisplayTable)) { | |
2134 if (!parentcontainerhasdata(cur->tag.displaytype, curnod… | |
2135 if (margintop > 0) | |
2136 margintop--; | |
2137 } | |
2138 } | |
2139 | |
/* call hflush(), then print newlines until currentnewlines reaches
   margintop; the per-line byte and cell counters are reset for each one */
2140 if (margintop > 0) { | |
2141 hflush(); | |
2142 for (i = currentnewlines; i < margintop; i++) { | |
2143 putchar('\n'); | |
2144 nbytesline = 0; | |
2145 ncells = 0; | |
2146 currentnewlines++; | |
2147 } | |
2148 hadnewline = 1; | |
2149 } | |
2150 | |
2151 if (cur->tag.displaytype & DisplayPre) { | |
2152 skipinitialws = 1; | |
2153 } else if (cur->tag.displaytype & DisplayTableCell) { | |
2154 if (parent && parent->visnchildren > 1) | |
2155 hputchar('\t'); | |
2156 } else if (cur->tag.displaytype & DisplayListItem) { | |
2157 /* find first parent node and ordered numbers or unorder… | |
2158 if (parent) { | |
2159 skipinitialws = 0; | |
2160 | |
2161 /* print bullet, add columns to indentation leve… | |
2162 if (parent->tag.displaytype & DisplayListOrdered… | |
2163 hprintf("%4zu. ", parent->nchildren); | |
2164 cur->indent = 6; | |
2165 indent += cur->indent; /* align to numbe… | |
2166 } else if (parent->tag.displaytype & DisplayList… | |
2167 hprint(str_bullet_item); | |
2168 cur->indent = 2; | |
2169 indent += 2; /* align to bullet */ | |
2170 } | |
2171 } | |
2172 skipinitialws = 0; | |
2173 } else if (cur->tag.displaytype & DisplayInput) { | |
2174 if (!attr_type.len) { | |
2175 hprintf("[%-15s]", attr_value.len ? attr_value.d… | |
2176 } else if (!strcasecmp(attr_type.data, "button")) { | |
2177 hprintf("[%s]", attr_value.len ? attr_value.data… | |
2178 } else if (!strcasecmp(attr_type.data, "submit")) { | |
2179 hprintf("[%s]", attr_value.len ? attr_value.data… | |
2180 } else if (!strcasecmp(attr_type.data, "reset")) { | |
2181 hprintf("[%s]", attr_value.len ? attr_value.data… | |
2182 } else if (!strcasecmp(attr_type.data, "checkbox")) { | |
2183 hprintf("[%s]", | |
2184 attr_checked.len && | |
2185 !strcasecmp(attr_checked.data, "checked"… | |
2186 } else if (!strcasecmp(attr_type.data, "radio")) { | |
2187 hprintf("[%s]", | |
2188 attr_checked.len && | |
2189 !strcasecmp(attr_checked.data, "checked"… | |
2190 } else if (!strcasecmp(attr_type.data, "hidden")) { | |
2191 cur->tag.displaytype |= DisplayNone; | |
2192 } else { | |
2193 /* unrecognized / default case is text */ | |
2194 hprintf("[%-15s]", attr_value.len ? attr_value.d… | |
2195 } | |
2196 } | |
2197 | |
2198 startmarkup(cur->tag.markuptype); | |
2199 | |
2200 /* do not count data such as an item bullet as part of the data … | |
2201 the node */ | |
2202 cur->hasdata = 0; | |
2203 | |
2204 if (tagid == TagHr) { /* ruler */ | |
2205 i = termwidth - indent - defaultindent; | |
2206 for (; i > 0; i--) | |
2207 hprint(str_ruler); | |
2208 cur->hasdata = 1; /* treat <hr/> as data */ | |
2209 } else if (tagid == TagBr) { | |
2210 hflush(); | |
2211 hadnewline = 0; /* forced newline */ | |
2212 hputchar('\n'); | |
2213 cur->hasdata = 1; /* treat <br/> as data */ | |
2214 } | |
2215 | |
2216 /* autoclose tags, such as <br>, pretend we are <br/> */ | |
2217 if (!isshort && cur->tag.isvoid) | |
2218 xmltagend(p, t, tl, 1); /* pretend close of short tag */ | |
2219 } | |
2220 | |
2221 static void | |
2222 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, | |
2223 size_t nl, const char *v, size_t vl) | |
2224 { | |
2225 struct node *cur; | |
2226 enum TagId tagid; | |
2227 | |
2228 cur = &nodes[curnode]; | |
2229 tagid = cur->tag.id; | |
2230 | |
2231 /* hide tags with attribute aria-hidden or hidden */ | |
2232 if (!attrcmp(n, "aria-hidden") || !attrcmp(n, "hidden")) | |
2233 cur->tag.displaytype |= DisplayNone; | |
2234 | |
2235 if (!attr_class_set && !attrcmp(n, "class")) /* use the first se… | |
2236 string_append(&attr_class, v, vl); | |
2237 else if (!attr_id_set && !attrcmp(n, "id")) /* use the first set… | |
2238 string_append(&attr_id, v, vl); | |
2239 else if (!attrcmp(n, "type")) | |
2240 string_append(&attr_type, v, vl); | |
2241 else if (!attrcmp(n, "value")) | |
2242 string_append(&attr_value, v, vl); | |
2243 | |
2244 /* <base href="..." /> */ | |
2245 if (!basehrefset && tagid == TagBase && !attrcmp(n, "href")) | |
2246 strlcat(basehrefdoc, v, sizeof(basehrefdoc)); | |
2247 | |
2248 if (tagid == TagA && !attrcmp(n, "href")) | |
2249 string_append(&attr_href, v, vl); | |
2250 | |
2251 if (tagid == TagSelect && !attrcmp(n, "multiple")) | |
2252 cur->tag.displaytype |= DisplaySelectMulti; | |
2253 | |
2254 if (tagid == TagObject && !attrcmp(n, "data")) | |
2255 string_append(&attr_data, v, vl); | |
2256 | |
2257 /* show img alt attribute as text. */ | |
2258 if (tagid == TagImg && !attrcmp(n, "alt")) | |
2259 string_append(&attr_alt, v, vl); | |
2260 | |
2261 if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked"… | |
2262 string_append(&attr_checked, v, vl); | |
2263 | |
2264 /* src attribute */ | |
2265 switch (tagid) { | |
2266 case TagAudio: | |
2267 case TagEmbed: | |
2268 case TagFrame: | |
2269 case TagIframe: | |
2270 case TagImg: | |
2271 case TagSource: | |
2272 case TagTrack: | |
2273 case TagVideo: | |
2274 if (!attrcmp(n, "src")) | |
2275 string_append(&attr_src, v, vl); | |
2276 break; | |
2277 default: | |
2278 break; | |
2279 } | |
2280 } | |
2281 | |
2282 static void | |
2283 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, | |
2284 size_t nl, const char *v, size_t vl) | |
2285 { | |
2286 char buf[16]; | |
2287 int len; | |
2288 | |
2289 len = xml_entitytostr(v, buf, sizeof(buf)); | |
2290 if (len > 0) | |
2291 xmlattr(p, t, tl, n, nl, buf, (size_t)len); | |
2292 else | |
2293 xmlattr(p, t, tl, n, nl, v, vl); | |
2294 } | |
2295 | |
/* End-of-attribute handler registered as parser.xmlattrend in main().
   n is the name of the attribute that ended; p, t, tl and nl are not used.
   It records that a class or id attribute was seen, parses the base href
   collected from a <base> tag and handles the "checked" attribute of input
   elements. */
2296 static void | |
2297 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, | |
2298 size_t nl) | |
2299 { | |
2300 struct node *cur; | |
2301 enum TagId tagid; | |
2302 | |
2303 cur = &nodes[curnode]; | |
2304 tagid = cur->tag.id; | |
2305 | |
/* mark class / id as seen: xmlattr() and xmlattrstart() skip these
   attributes while the corresponding flag is set */
2306 if (!attr_class_set && !attrcmp(n, "class")) | |
2307 attr_class_set = 1; | |
2308 else if (!attr_id_set && !attrcmp(n, "id")) | |
2309 attr_id_set = 1; | |
2310 | |
2311 /* set base URL, if it is set it cannot be overwritten again */ | |
2312 if (!basehrefset && basehrefdoc[0] && | |
2313 tagid == TagBase && !attrcmp(n, "href")) | |
2314 basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : … | |
2315 | |
2316 /* if attribute checked is set but it has no value then set it t… | |
2317 if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked"… | |
2318 string_append(&attr_checked, "checked", sizeof("checked"… | |
2319 } | |
2320 | |
2321 static void | |
2322 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, | |
2323 size_t nl) | |
2324 { | |
2325 struct node *cur; | |
2326 enum TagId tagid; | |
2327 | |
2328 cur = &nodes[curnode]; | |
2329 tagid = cur->tag.id; | |
2330 | |
2331 if (!attrcmp(n, "alt")) | |
2332 string_clear(&attr_alt); | |
2333 else if (!attrcmp(n, "checked")) | |
2334 string_clear(&attr_checked); | |
2335 else if (!attr_class_set && !attrcmp(n, "class")) | |
2336 string_clear(&attr_class); | |
2337 else if (!attrcmp(n, "data")) | |
2338 string_clear(&attr_data); | |
2339 else if (!attrcmp(n, "href")) | |
2340 string_clear(&attr_href); | |
2341 else if (!attr_id_set && !attrcmp(n, "id")) | |
2342 string_clear(&attr_id); | |
2343 else if (!attrcmp(n, "src")) | |
2344 string_clear(&attr_src); | |
2345 else if (!attrcmp(n, "type")) | |
2346 string_clear(&attr_type); | |
2347 else if (!attrcmp(n, "value")) | |
2348 string_clear(&attr_value); | |
2349 | |
2350 if (basehrefdoc[0] && tagid == TagBase && !attrcmp(n, "href")) | |
2351 basehrefdoc[0] = '\0'; | |
2352 } | |
2353 | |
/* Print the command-line synopsis to stderr and exit with status 1.
   Does not return. */
2354 static void | |
2355 usage(void) | |
2356 { | |
2357 fprintf(stderr, "%s [-8adiIlrx] [-b basehref] [-s selector] [-u … | |
2358 exit(1); | |
2359 } | |
2360 | |
2361 int | |
2362 main(int argc, char **argv) | |
2363 { | |
2364 char *basehref; | |
2365 | |
2366 if (pledge("stdio", NULL) < 0) | |
2367 err(1, "pledge"); | |
2368 | |
2369 ARGBEGIN { | |
2370 case '8': | |
2371 str_bullet_item = "\xe2\x80\xa2 "; | |
2372 str_ruler = "\xe2\x94\x80"; /* symbol: "light horizontal… | |
2373 break; | |
2374 case 'a': | |
2375 allowansi = !allowansi; | |
2376 break; | |
2377 case 'b': | |
2378 basehref = EARGF(usage()); | |
2379 if (uri_parse(basehref, &base) == -1 || | |
2380 !base.proto[0]) | |
2381 usage(); | |
2382 basehrefset = 1; | |
2383 break; | |
2384 case 'd': | |
2385 uniqrefs = !uniqrefs; | |
2386 break; | |
2387 case 'i': | |
2388 showrefinline = !showrefinline; | |
2389 break; | |
2390 case 'I': | |
2391 showurlinline = !showurlinline; | |
2392 break; | |
2393 case 'l': | |
2394 showrefbottom = !showrefbottom; | |
2395 break; | |
2396 case 'r': | |
2397 allowlinewrap = !allowlinewrap; | |
2398 break; | |
2399 case 's': | |
2400 sel_show = compileselectors(EARGF(usage())); | |
2401 /* switch to reader/selector mode, ignore all data excep… | |
2402 reader_mode = 1; | |
2403 reader_ignore = 1; | |
2404 break; | |
2405 case 'u': | |
2406 sel_hide = compileselectors(EARGF(usage())); | |
2407 /* switch to reader/selector mode */ | |
2408 reader_mode = 1; | |
2409 break; | |
2410 case 'w': | |
2411 if ((termwidth = strtol(EARGF(usage()), NULL, 10)) < 1) | |
2412 usage(); | |
2413 break; | |
2414 case 'x': | |
2415 resources = !resources; | |
2416 break; | |
2417 default: | |
2418 usage(); | |
2419 } ARGEND | |
2420 | |
2421 linewrap = allowlinewrap; | |
2422 | |
2423 /* initial nodes */ | |
2424 ncapnodes = NODE_CAP_INC; | |
2425 nodes = ecalloc(ncapnodes, sizeof(*nodes)); | |
2426 nodes_links = ecalloc(ncapnodes, sizeof(*nodes_links)); | |
2427 | |
2428 parser.xmlattrstart = xmlattrstart; | |
2429 parser.xmlattr = xmlattr; | |
2430 parser.xmlattrentity = xmlattrentity; | |
2431 parser.xmlattrend = xmlattrend; | |
2432 parser.xmlcdatastart = xmlcdatastart; | |
2433 parser.xmlcdata = xmlcdata; | |
2434 parser.xmlcdataend = xmlcdataend; | |
2435 parser.xmldatastart = xmldatastart; | |
2436 parser.xmldata = xmldata; | |
2437 parser.xmldataentity = xmldataentity; | |
2438 parser.xmldataend = xmldataend; | |
2439 parser.xmltagstart = xmltagstart; | |
2440 parser.xmltagstartparsed = xmltagstartparsed; | |
2441 parser.xmltagend = xmltagend; | |
2442 | |
2443 parser.getnext = getchar; | |
2444 xml_parse(&parser); | |
2445 | |
2446 hflush(); | |
2447 if (ncells > 0) | |
2448 newline(); | |
2449 | |
2450 if (showrefbottom || resources) | |
2451 printlinkrefs(); | |
2452 | |
2453 hflush(); | |
2454 setmarkup(0); | |
2455 | |
2456 return 0; | |
2457 } |