Introduction
Introduction Statistics Contact Development Disclaimer Help
webdump.c - webdump - HTML to plain-text converter for webpages
git clone git://git.codemadness.org/webdump
Log
Files
Refs
README
LICENSE
---
webdump.c (66818B)
---
1 #include <errno.h>
2 #include <limits.h>
3 #include <stdio.h>
4 #include <stdarg.h>
5 #include <stdlib.h>
6 #include <string.h>
7 #include <strings.h>
8 #include <unistd.h>
9
10 #include "arg.h"
11 char *argv0;
12
13 #include "tree.h"
14 #include "xml.h"
15
16 static XMLParser parser;
17
18 #ifndef __OpenBSD__
19 #define pledge(p1,p2) 0
20 #endif
21
22 #undef strlcat
23 size_t strlcat(char *, const char *, size_t);
24 #undef strlcpy
25 size_t strlcpy(char *, const char *, size_t);
26
27 /* ctype-like macros, but always compatible with ASCII / UTF-8 */
/* NOTE: ISCNTRL, ISSPACE and TOLOWER expand their argument more than once:
   pass expressions without side effects. ISCNTRL compares (c) < ' ' directly,
   so a negative value counts as a control character; the call sites in this
   file pass the byte cast to unsigned char. */
28 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
29 #define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f)
30 #define ISDIGIT(c) (((unsigned)c) - '0' < 10)
/* SPACE or one of the five bytes '\t' .. '\r' (0x09 - 0x0d) */
31 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
32 #define TOLOWER(c) ((((unsigned)c) - 'A' < 26) ? ((c) | 32) : (c))
33
34 #define LEN(x) (sizeof(x) / sizeof(x[0]))
35
36 /* URI */
37 struct uri {
38 char proto[48]; /* scheme including ":" or "://" */
39 char userinfo[256]; /* username [:password] */
40 char host[256];
41 char port[6]; /* numeric port */
42 char path[1024];
43 char query[1024];
44 char fragment[1024];
45 };
46
47 /* options */
48 static int allowansi = 0; /* (-a) allow ANSI escape codes */
49 static int uniqrefs = 0; /* (-d) number unique references */
50 static int showrefinline = 0; /* (-i) show link reference number inline…
51 static int showurlinline = 0; /* (-I) show full link reference inline */
52 static int showrefbottom = 0; /* (-l) show link references at the botto…
53 static int allowlinewrap = 0; /* (-r) line-wrapping */
54 static int termwidth = 77; /* (-w) terminal width */
55 static int resources = 0; /* (-x) write resources line-by-line to f…
56
57 enum DisplayType {
58 DisplayUnknown = 0,
59 DisplayInline = 1 << 0,
60 DisplayInlineBlock = 1 << 1, /* unused for now */
61 DisplayBlock = 1 << 2,
62 DisplayNone = 1 << 3,
63 DisplayPre = 1 << 4,
64 DisplayList = 1 << 5,
65 DisplayListOrdered = 1 << 6,
66 DisplayListItem = 1 << 7,
67 DisplayTable = 1 << 8,
68 DisplayTableRow = 1 << 9,
69 DisplayTableCell = 1 << 10,
70 DisplayHeader = 1 << 11,
71 DisplayDl = 1 << 12,
72 DisplayInput = 1 << 13,
73 DisplayButton = 1 << 14,
74 DisplaySelect = 1 << 15,
75 DisplaySelectMulti = 1 << 16,
76 DisplayOption = 1 << 17
77 };
78
79 /* ANSI markup */
80 enum MarkupType {
81 MarkupNone = 0,
82 MarkupBold = 1 << 0,
83 MarkupItalic = 1 << 1,
84 MarkupUnderline = 1 << 2,
85 MarkupBlink = 1 << 3, /* lol */
86 MarkupReverse = 1 << 4,
87 MarkupStrike = 1 << 5
88 };
89
90 /* String data / memory pool */
91 typedef struct string {
92 char *data; /* data */
93 size_t len; /* string length */
94 size_t bufsiz; /* allocated size */
95 } String;
96
97 enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAud…
98 TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButto…
99 TagCite, TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDet…
100 TagDfn, TagDir, TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFields…
101 TagFigcaption, TagFigure, TagFooter, TagForm, TagFrame, TagH1, T…
102 TagH3, TagH4, TagH5, TagH6, TagHead, TagHeader, TagHr, TagHtml, …
103 TagIframe, TagImg, TagInput, TagIns, TagLabel, TagLegend, TagLi,
104 TagLink, TagMain, TagMark, TagMenu, TagMeta, TagNav, TagObject, …
105 TagOption, TagP, TagParam, TagPre, TagS, TagScript, TagSearch,
106 TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle,
107 TagSummary, TagSvg, TagTable, TagTbody, TagTd, TagTemplate,
108 TagTextarea, TagTfoot, TagTh, TagThead, TagTitle, TagTr, TagTrac…
109 TagU, TagUl, TagVar, TagVideo, TagWbr, TagXmp };
110
111 struct tag {
112 const char *name;
113 enum TagId id;
114 enum DisplayType displaytype;
115 enum MarkupType markuptype; /* ANSI markup */
116 enum DisplayType parenttype; /* display type belonging to elemen…
117 int isvoid; /* "void" element */
118 int isoptional; /* optional to close tag */
119 int margintop; /* newlines when the tag starts */
120 int marginbottom; /* newlines after the tag ends */
121 int indent; /* indent in cells */
122 };
123
124 struct node {
125 char tagname[256];
126 struct tag tag;
127 size_t nchildren; /* child node count */
128 size_t visnchildren; /* child node count which are visible */
129 /* attributes */
130 char id[256];
131 char classnames[1024];
132 int indent; /* indent per node, for formatting */
133 int hasdata; /* tag contains some data, for formatting */
134 };
135
136 struct selectornode {
137 char tagname[256];
138 long index; /* index of node to match on: -1 if not matching on …
139 /* attributes */
140 char id[256];
141 char classnames[1024];
142 };
143
144 struct selector {
145 const char *text;
146 struct selectornode nodes[32];
147 int depth;
148 };
149
150 /* list of selectors */
151 struct selectors {
152 struct selector **selectors;
153 size_t count;
154 };
155
156 /* RB tree of link references */
157 struct linkref {
158 char *type;
159 enum TagId tagid;
160 char *url;
161 int ishidden;
162 size_t linknr;
163 RB_ENTRY(linkref) entry;
164 };
165
166 /* link references and hidden link references */
167 static struct linkref **visrefs;
168 static size_t nvisrefs, ncapvisrefs; /* visible link count / capacity */
169 static struct linkref **hiddenrefs;
170 static size_t nhiddenrefs, ncaphiddenrefs; /* hidden link count / capaci…
171
172 /* compare link by URL for link references RB-tree */
173 static int
174 linkrefcmp(struct linkref *r1, struct linkref *r2)
175 {
176 return strcmp(r1->url, r2->url);
177 }
178
179 RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead);
180 RB_GENERATE(linkreftree, linkref, entry, linkrefcmp)
181
182 static const char *str_bullet_item = "* ";
183 static const char *str_checkbox_checked = "x";
184 static const char *str_ruler = "-";
185 static const char *str_radio_checked = "*";
186
187 /* base href, to make URLs absolute */
188 static char basehrefdoc[4096]; /* buffer for base href in document, if a…
189 static int basehrefset; /* base href set and can be used? */
190 static struct uri base; /* parsed current base href */
191
192 /* buffers for some attributes of the current tag */
193 static String attr_alt; /* alt attribute */
194 static String attr_checked; /* checked attribute */
195 static String attr_class; /* class attribute */
196 static int attr_class_set; /* class attribute is set already */
197 static String attr_data; /* data attribute */
198 static String attr_href; /* href attribute */
199 static String attr_id; /* id attribute */
200 static int attr_id_set; /* id attribute is set already */
201 static String attr_src; /* src attribute */
202 static String attr_type; /* type attribute */
203 static String attr_value; /* value attribute */
204
205 static String htmldata; /* buffered HTML data near the current tag */
206
207 /* for white-space output handling:
208 1 = whitespace emitted (suppress repeated), 2 = other characters on t…
209 Behaviour:
210 * White-space data before non-whitespace data in tags are ignored on …
211 * Repeated white-space are ignored: a single space (' ') is emitted.
212 */
213 static int whitespace_mode;
214 static int nbytesline; /* bytes on this line */
215 static int ncells; /* current cell/column count */
216 static int hadnewline; /* count for repeated newlines */
217 /* flag for skipping initial white-space in tag: for HTML white-space ha…
218 static int skipinitialws = 1;
219 #define DEFAULT_INDENT 2
220 static const int defaultindent = DEFAULT_INDENT; /* default indent / mar…
221 static int indent; /* indent for the current line, in columns */
222 /* previous output sequential newlines, used for calculating margins bet…
223 elements and reducing excessive newlines */
224 static int currentnewlines;
225
226 /* buffers for line-wrapping (buffer per word boundary) */
227 static char rbuf[1024];
228 static int rbuflen;
229 static int rnbufcells; /* pending cell count to add */
230
231 #define MAX_NODE_DEPTH 4096 /* absolute maximum node depth */
232 static struct node *nodes; /* node tree (one per level is remembered) */
233 static String *nodes_links; /* keep track of links per node */
234 static size_t ncapnodes; /* current allocated node capacity */
235 static int curnode; /* current node depth */
236
237 /* reader / selector mode (-s) */
238 static int reader_mode;
239 /* flag if the tags and their children should be ignored in the current …
240 static int reader_ignore;
241
242 static enum MarkupType curmarkup; /* current markup state (bold, underli…
243 static int linewrap; /* allow linewrap in this context */
244
245 /* selector to match (for -s and -u) */
246 static struct selectors *sel_hide, *sel_show;
247
248 /* tags table: needs to be sorted like tagcmp(), alphabetically */
249
250 /* tag id displaytype markup …
251 static struct tag tags[] = {
252 { "a", TagA, DisplayInline, MarkupU…
253 { "address", TagAddress, DisplayBlock, 0, …
254 { "area", TagArea, DisplayInline, 0, …
255 { "article", TagArticle, DisplayBlock, 0, …
256 { "aside", TagAside, DisplayBlock, 0, …
257 { "audio", TagAudio, DisplayInline, MarkupU…
258 { "b", TagB, DisplayInline, MarkupB…
259 { "base", TagBase, DisplayInline, 0, …
260 { "blink", TagBlink, DisplayInline, MarkupB…
261 { "blockquote", TagBlockquote, DisplayBlock, 0, …
262 { "body", TagBody, DisplayBlock, 0, …
263 { "br", TagBr, 0, 0, …
264 { "button", TagButton, DisplayInline | DisplayButton, 0, …
265 { "cite", TagCite, DisplayInline, MarkupI…
266 { "col", TagCol, DisplayInline, 0, …
267 { "colgroup", TagColgroup, DisplayInline, 0, …
268 { "datalist", TagDatalist, DisplayNone, 0, …
269 { "dd", TagDd, DisplayBlock, 0, …
270 { "del", TagDel, DisplayInline, MarkupS…
271 { "details", TagDetails, DisplayBlock, 0, …
272 { "dfn", TagDfn, DisplayInline, MarkupI…
273 { "dir", TagDir, DisplayList, 0, …
274 { "div", TagDiv, DisplayBlock, 0, …
275 { "dl", TagDl, DisplayBlock | DisplayDl, 0, …
276 { "dt", TagDt, DisplayBlock, MarkupB…
277 { "em", TagEm, DisplayInline, MarkupI…
278 { "embed", TagEmbed, DisplayInline, 0, …
279 { "fieldset", TagFieldset, DisplayBlock, 0, …
280 { "figcaption", TagFigcaption, DisplayBlock, 0, …
281 { "figure", TagFigure, DisplayBlock, 0, …
282 { "footer", TagFooter, DisplayBlock, 0, …
283 { "form", TagForm, DisplayBlock, 0, …
284 { "frame", TagFrame, DisplayInline, 0, …
285 { "h1", TagH1, DisplayHeader, MarkupB…
286 { "h2", TagH2, DisplayHeader, MarkupB…
287 { "h3", TagH3, DisplayHeader, MarkupB…
288 { "h4", TagH4, DisplayHeader, MarkupB…
289 { "h5", TagH5, DisplayHeader, MarkupB…
290 { "h6", TagH6, DisplayHeader, MarkupB…
291 { "head", TagHead, DisplayBlock, 0, …
292 { "header", TagHeader, DisplayBlock, 0, …
293 { "hr", TagHr, DisplayBlock, 0, …
294 { "html", TagHtml, DisplayBlock, 0, …
295 { "i", TagI, DisplayInline, MarkupI…
296 { "iframe", TagIframe, DisplayInline, 0, …
297 { "img", TagImg, DisplayInline, MarkupU…
298 { "input", TagInput, DisplayInput, 0, …
299 { "ins", TagIns, DisplayInline, MarkupU…
300 { "label", TagLabel, DisplayInline, 0, …
301 { "legend", TagLegend, DisplayBlock, 0, …
302 { "li", TagLi, DisplayListItem, 0, …
303 { "link", TagLink, DisplayInline, 0, …
304 { "main", TagMain, DisplayBlock, 0, …
305 { "mark", TagMark, DisplayInline, MarkupR…
306 { "menu", TagMenu, DisplayList, 0, …
307 { "meta", TagMeta, DisplayInline, 0, …
308 { "nav", TagNav, DisplayBlock, 0, …
309 { "object", TagObject, DisplayInline, 0, …
310 { "ol", TagOl, DisplayList | DisplayListOrdered, 0, …
311 { "option", TagOption, DisplayInline | DisplayOption, 0, …
312 { "p", TagP, DisplayBlock, 0, …
313 { "param", TagParam, DisplayInline, 0, …
314 { "pre", TagPre, DisplayPre, 0, …
315 { "s", TagS, DisplayInline, MarkupS…
316 { "script", TagScript, DisplayNone, 0, …
317 { "search", TagSearch, DisplayBlock, 0, …
318 { "section", TagSection, DisplayBlock, 0, …
319 { "select", TagSelect, DisplayInline | DisplaySelect, 0, …
320 { "source", TagSource, DisplayInline, 0, …
321 { "strike", TagStrike, DisplayInline, MarkupS…
322 { "strong", TagStrong, DisplayInline, MarkupB…
323 { "style", TagStyle, DisplayNone, 0, …
324 { "summary", TagSummary, DisplayBlock, 0, …
325 { "svg", TagSvg, DisplayNone, 0, …
326 { "table", TagTable, DisplayTable, 0, …
327 { "tbody", TagTbody, DisplayInline, 0, …
328 { "td", TagTd, DisplayTableCell, 0, …
329 { "template", TagTemplate, DisplayNone, 0, …
330 { "textarea", TagTextarea, DisplayInline, 0, …
331 { "tfoot", TagTfoot, DisplayInline, 0, …
332 { "th", TagTh, DisplayTableCell, MarkupB…
333 { "thead", TagThead, DisplayInline, 0, …
334 { "title", TagTitle, DisplayBlock, 0, …
335 { "tr", TagTr, DisplayTableRow, 0, …
336 { "track", TagTrack, DisplayInline, 0, …
337 { "u", TagU, DisplayInline, MarkupU…
338 { "ul", TagUl, DisplayList, 0, …
339 { "var", TagVar, DisplayInline, MarkupI…
340 { "video", TagVideo, DisplayInline, MarkupU…
341 { "wbr", TagWbr, DisplayInline, 0, …
342 { "xmp", TagXmp, DisplayPre, 0, …
343 };
344
345 /* hint for compilers and static analyzers that a function exits */
346 #ifndef __dead
347 #define __dead
348 #endif
349
350 /* print to stderr, print error message of errno and exit(). */
351 __dead static void
352 err(int exitstatus, const char *fmt, ...)
353 {
354 va_list ap;
355 int saved_errno;
356
357 saved_errno = errno;
358
359 fputs("webdump: ", stderr);
360 if (fmt) {
361 va_start(ap, fmt);
362 vfprintf(stderr, fmt, ap);
363 va_end(ap);
364 fputs(": ", stderr);
365 }
366 fprintf(stderr, "%s\n", strerror(saved_errno));
367
368 exit(exitstatus);
369 }
370
371 /* print to stderr and exit(). */
372 __dead static void
373 errx(int exitstatus, const char *fmt, ...)
374 {
375 va_list ap;
376
377 fputs("webdump: ", stderr);
378 if (fmt) {
379 va_start(ap, fmt);
380 vfprintf(stderr, fmt, ap);
381 va_end(ap);
382 }
383 fputs("\n", stderr);
384
385 exit(exitstatus);
386 }
387
388 static const char *ignorestate, *endtag;
389 static int (*getnext)(void);
390
391 /* return a space for all data until some case-insensitive string occurs…
392 is used to parse incorrect HTML/XML that contains unescaped HTML in s…
393 or style tags. If you see some </script> tag in a CDATA or comment
394 section then e-mail W3C and tell them the web is too complex. */
395 static inline int
396 getnext_ignore(void)
397 {
398 int c;
399
400 if ((c = getnext()) == EOF)
401 return EOF;
402
403 if (TOLOWER((unsigned char)c) == TOLOWER((unsigned char)*ignores…
404 ignorestate++;
405 if (*ignorestate == '\0') {
406 parser.getnext = getnext; /* restore */
407 return ' ';
408 }
409 } else {
410 ignorestate = endtag; /* no full match: reset to beginni…
411 }
412
413 return ' '; /* pretend there is just SPACEs */
414 }
415
416 /* Clear string only; don't free, prevents unnecessary reallocation. */
417 static void
418 string_clear(String *s)
419 {
420 if (s->data)
421 s->data[0] = '\0';
422 s->len = 0;
423 }
424
425 static void
426 string_buffer_realloc(String *s, size_t newlen)
427 {
428 size_t alloclen;
429
430 for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
431 ;
432 if (!(s->data = realloc(s->data, alloclen)))
433 err(1, "realloc");
434 s->bufsiz = alloclen;
435 }
436
437 static void
438 string_append(String *s, const char *data, size_t len)
439 {
440 if (!len)
441 return;
442 /* check if allocation is necesary, don't shrink buffer,
443 * should be more than bufsiz ofcourse. */
444 if (s->len + len >= s->bufsiz)
445 string_buffer_realloc(s, s->len + len + 1);
446 memcpy(s->data + s->len, data, len);
447 s->len += len;
448 s->data[s->len] = '\0';
449 }
450
/* strdup() which exits through err() on failure; never returns NULL. */
static char *
estrdup(const char *s)
{
	char *copy = strdup(s);

	if (copy == NULL)
		err(1, "strdup");
	return copy;
}
460
/* strndup() which exits through err() on failure; never returns NULL. */
static char *
estrndup(const char *s, size_t n)
{
	char *copy = strndup(s, n);

	if (copy == NULL)
		err(1, "strndup");
	return copy;
}
470
/* realloc() which exits through err() when realloc() returns NULL. */
static void *
erealloc(void *p, size_t siz)
{
	void *np = realloc(p, siz);

	if (np == NULL)
		err(1, "realloc");
	return np;
}
479
/* calloc() which exits through err() when calloc() returns NULL. */
static void *
ecalloc(size_t nmemb, size_t size)
{
	void *mem = calloc(nmemb, size);

	if (mem == NULL)
		err(1, "calloc");
	return mem;
}
489
/* Return 1 when `s` starts with a non-empty scheme / protocol part: one or
   more of the characters [A-Za-z0-9+.-] directly followed by ":".
   Returns 0 otherwise; a string starting with ":" is a path, not a scheme. */
static int
uri_hasscheme(const char *s)
{
	size_t n = 0;
	unsigned char c;

	for (;;) {
		c = (unsigned char)s[n];
		if (!(ISALPHA(c) || ISDIGIT(c) ||
		    c == '+' || c == '-' || c == '.'))
			break;
		n++;
	}

	return n > 0 && s[n] == ':';
}
502
/* Parse the URI string `s` into its components in `u`.
   All fields of `u` are cleared first. A leading "//" is protocol-relative:
   proto stays empty and the authority part follows. Components are stored
   without their delimiter ("@", ":" before the port, "?", "#"), except proto
   which keeps its trailing ":" or "://". An IPv6 host keeps its brackets.
   Without a scheme (and without a leading "//") the whole string up to "?"
   or "#" is taken as the path.
   Returns 0 on success, -1 when a component does not fit its buffer, the
   IPv6 literal is malformed or a non-empty port is not in the range
   1 - 65535. On -1 `u` may be partially filled. */
503 static int
504 uri_parse(const char *s, struct uri *u)
505 {
506 const char *p = s;
507 char *endptr;
508 size_t i;
509 long l;
510
511 u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
512 u->path[0] = u->query[0] = u->fragment[0] = '\0';
513
514 /* protocol-relative */
515 if (*p == '/' && *(p + 1) == '/') {
516 p += 2; /* skip "//" */
517 goto parseauth;
518 }
519
520 /* scheme / protocol part */
521 for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) …
522 *p == '+' || *p == '-' || *p == '.'; p++)
523 ;
524 /* scheme, except if empty and starts with ":" then it is a path…
525 if (*p == ':' && p != s) {
526 if (*(p + 1) == '/' && *(p + 2) == '/')
527 p += 3; /* skip "://" */
528 else
529 p++; /* skip ":" */
530
531 if ((size_t)(p - s) >= sizeof(u->proto))
532 return -1; /* protocol too long */
533 memcpy(u->proto, s, p - s);
534 u->proto[p - s] = '\0';
535
/* scheme without "//" (only ":" was skipped): no authority part follows */
536 if (*(p - 1) != '/')
537 goto parsepath;
538 } else {
539 p = s; /* no scheme format, reset to start */
540 goto parsepath;
541 }
542
543 parseauth:
544 /* userinfo (username:password) */
545 i = strcspn(p, "@/?#");
546 if (p[i] == '@') {
547 if (i >= sizeof(u->userinfo))
548 return -1; /* userinfo too long */
549 memcpy(u->userinfo, p, i);
550 u->userinfo[i] = '\0';
551 p += i + 1;
552 }
553
554 /* IPv6 address */
555 if (*p == '[') {
556 /* bracket not found, host too short or too long */
557 i = strcspn(p, "]");
558 if (p[i] != ']' || i < 3)
559 return -1;
560 i++; /* including "]" */
561 } else {
562 /* domain / host part, skip until port, path or end. */
563 i = strcspn(p, ":/?#");
564 }
565 if (i >= sizeof(u->host))
566 return -1; /* host too long */
567 memcpy(u->host, p, i);
568 u->host[i] = '\0';
569 p += i;
570
571 /* port */
572 if (*p == ':') {
573 p++;
574 if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
575 return -1; /* port too long */
576 memcpy(u->port, p, i);
577 u->port[i] = '\0';
578 /* check for valid port: range 1 - 65535, may be empty */
579 errno = 0;
580 l = strtol(u->port, &endptr, 10);
/* an empty port (i == 0) skips the range check */
581 if (i && (errno || *endptr || l <= 0 || l > 65535))
582 return -1;
583 p += i;
584 }
585
586 parsepath:
587 /* path */
588 if ((i = strcspn(p, "?#")) >= sizeof(u->path))
589 return -1; /* path too long */
590 memcpy(u->path, p, i);
591 u->path[i] = '\0';
592 p += i;
593
594 /* query */
595 if (*p == '?') {
596 p++;
597 if ((i = strcspn(p, "#")) >= sizeof(u->query))
598 return -1; /* query too long */
599 memcpy(u->query, p, i);
600 u->query[i] = '\0';
601 p += i;
602 }
603
604 /* fragment */
605 if (*p == '#') {
606 p++;
607 if ((i = strlen(p)) >= sizeof(u->fragment))
608 return -1; /* fragment too long */
609 memcpy(u->fragment, p, i);
610 u->fragment[i] = '\0';
611 }
612
613 return 0;
614 }
615
616 /* Transform and try to make the URI `u` absolute using base URI `b` int…
617 Follows some of the logic from "RFC 3986 - 5.2.2. Transform Reference…
618 Returns 0 on success, -1 on error or truncation. */
619 static int
620 uri_makeabs(struct uri *a, struct uri *u, struct uri *b)
621 {
622 char *p;
623 int c;
624
625 strlcpy(a->fragment, u->fragment, sizeof(a->fragment));
626
627 if (u->proto[0] || u->host[0]) {
628 strlcpy(a->proto, u->proto[0] ? u->proto : b->proto, siz…
629 strlcpy(a->host, u->host, sizeof(a->host));
630 strlcpy(a->userinfo, u->userinfo, sizeof(a->userinfo));
631 strlcpy(a->host, u->host, sizeof(a->host));
632 strlcpy(a->port, u->port, sizeof(a->port));
633 strlcpy(a->path, u->path, sizeof(a->path));
634 strlcpy(a->query, u->query, sizeof(a->query));
635 return 0;
636 }
637
638 strlcpy(a->proto, b->proto, sizeof(a->proto));
639 strlcpy(a->host, b->host, sizeof(a->host));
640 strlcpy(a->userinfo, b->userinfo, sizeof(a->userinfo));
641 strlcpy(a->host, b->host, sizeof(a->host));
642 strlcpy(a->port, b->port, sizeof(a->port));
643
644 if (!u->path[0]) {
645 strlcpy(a->path, b->path, sizeof(a->path));
646 } else if (u->path[0] == '/') {
647 strlcpy(a->path, u->path, sizeof(a->path));
648 } else {
649 a->path[0] = (b->host[0] && b->path[0] != '/') ? '/' : '…
650 a->path[1] = '\0';
651
652 if ((p = strrchr(b->path, '/'))) {
653 c = *(++p);
654 *p = '\0'; /* temporary NUL-terminate */
655 if (strlcat(a->path, b->path, sizeof(a->path)) >…
656 return -1;
657 *p = c; /* restore */
658 }
659 if (strlcat(a->path, u->path, sizeof(a->path)) >= sizeof…
660 return -1;
661 }
662
663 if (u->path[0] || u->query[0])
664 strlcpy(a->query, u->query, sizeof(a->query));
665 else
666 strlcpy(a->query, b->query, sizeof(a->query));
667
668 return 0;
669 }
670
671 static int
672 uri_format(char *buf, size_t bufsiz, struct uri *u)
673 {
674 return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s",
675 u->proto,
676 u->userinfo[0] ? u->userinfo : "",
677 u->userinfo[0] ? "@" : "",
678 u->host,
679 u->port[0] ? ":" : "",
680 u->port,
681 u->host[0] && u->path[0] && u->path[0] != '/' ? "/" : "",
682 u->path,
683 u->query[0] ? "?" : "",
684 u->query,
685 u->fragment[0] ? "#" : "",
686 u->fragment);
687 }
688
/* Compare two tag names case-insensitively.
   Returns <0, 0 or >0 like strcasecmp(). */
static int
tagcmp(const char *s1, const char *s2)
{
	const int order = strcasecmp(s1, s2);

	return order;
}
695
/* Compare two attribute names case-insensitively.
   Returns <0, 0 or >0 like strcasecmp(). */
static int
attrcmp(const char *s1, const char *s2)
{
	const int order = strcasecmp(s1, s2);

	return order;
}
702
703 static void
704 rindent(void)
705 {
706 int i, total;
707
708 total = indent + defaultindent;
709 if (total < 0)
710 total = 0;
711 for (i = 0; i < total; i++)
712 putchar(' ');
713
714 nbytesline += total;
715 ncells += total;
716 }
717
718 static void
719 emitmarkup(int markuptype)
720 {
721 if (!allowansi)
722 return;
723
724 if (!markuptype)
725 fputs("\033[0m", stdout); /* reset all attributes */
726
727 /* set */
728 if (markuptype & MarkupBold)
729 fputs("\033[1m", stdout);
730 if (markuptype & MarkupItalic)
731 fputs("\033[3m", stdout);
732 if (markuptype & MarkupUnderline)
733 fputs("\033[4m", stdout);
734 if (markuptype & MarkupBlink)
735 fputs("\033[5m", stdout);
736 if (markuptype & MarkupReverse)
737 fputs("\033[7m", stdout);
738 if (markuptype & MarkupStrike)
739 fputs("\033[9m", stdout);
740 }
741
742 /* flush remaining buffer (containing a word): used for word-wrap handli…
743 static void
744 hflush(void)
745 {
746 int i;
747
748 if (!rbuflen)
749 return;
750
751 if (!nbytesline) {
752 if (curmarkup)
753 emitmarkup(0);
754 rindent();
755 /* emit code again per line, needed for GNU/less -R */
756 if (curmarkup)
757 emitmarkup(curmarkup);
758 }
759
760 for (i = 0; i < rbuflen; i++)
761 putchar(rbuf[i]);
762
763 nbytesline += rbuflen;
764 ncells += rnbufcells;
765 rbuflen = 0;
766 rnbufcells = 0;
767 }
768
769 static void
770 printansi(const char *s)
771 {
772 size_t len;
773
774 if (!allowansi)
775 return;
776
777 if (linewrap) {
778 len = strlen(s);
779 if (rbuflen + len + 1 >= sizeof(rbuf))
780 hflush();
781 if (rbuflen + len + 1 < sizeof(rbuf)) {
782 memcpy(rbuf + rbuflen, s, len);
783 rbuflen += len;
784 /* NOTE: nbytesline and ncells are not counted f…
785 }
786 } else {
787 fputs(s, stdout);
788 }
789 }
790
791 static void
792 setmarkup(int markuptype)
793 {
794 if (!allowansi)
795 return;
796
797 /* need change? */
798 if (curmarkup == markuptype)
799 return;
800
801 if (!markuptype) {
802 printansi("\033[0m"); /* reset all attributes */
803 curmarkup = markuptype;
804 return;
805 }
806
807 /* set */
808 if (!(curmarkup & MarkupBold) && (markuptype & MarkupBold))
809 printansi("\033[1m");
810 if (!(curmarkup & MarkupItalic) && (markuptype & MarkupItalic))
811 printansi("\033[3m");
812 if (!(curmarkup & MarkupUnderline) && (markuptype & MarkupUnderl…
813 printansi("\033[4m");
814 if (!(curmarkup & MarkupBlink) && (markuptype & MarkupBlink))
815 printansi("\033[5m");
816 if (!(curmarkup & MarkupReverse) && (markuptype & MarkupReverse))
817 printansi("\033[7m");
818 if (!(curmarkup & MarkupStrike) && (markuptype & MarkupStrike))
819 printansi("\033[9m");
820
821 /* unset */
822 if ((curmarkup & MarkupBold) && !(markuptype & MarkupBold))
823 printansi("\033[22m"); /* reset bold or faint */
824 if ((curmarkup & MarkupItalic) && !(markuptype & MarkupItalic))
825 printansi("\033[23m"); /* reset italic */
826 if ((curmarkup & MarkupUnderline) && !(markuptype & MarkupUnderl…
827 printansi("\033[24m"); /* reset underline */
828 if ((curmarkup & MarkupBlink) && !(markuptype & MarkupBlink))
829 printansi("\033[25m"); /* reset blink */
830 if ((curmarkup & MarkupReverse) && !(markuptype & MarkupReverse))
831 printansi("\033[27m"); /* reset reverse */
832 if ((curmarkup & MarkupStrike) && !(markuptype & MarkupStrike))
833 printansi("\033[29m"); /* reset strike */
834
835 curmarkup = markuptype;
836 }
837
838 static void
839 startmarkup(int markuptype)
840 {
841 setmarkup(curmarkup | markuptype);
842 }
843
844 static void
845 endmarkup(int markuptype)
846 {
847 setmarkup(curmarkup & ~markuptype);
848 }
849
/* Rough cell width of the byte `c` of an UTF-8 sequence: a continuation byte
   (10xxxxxx) counts as 0, TAB counts as 8 and every other byte as 1, so in
   general a codepoint is counted as one cell.
   NOTE: this is of course incorrect since characters can be 2 width as
   well, in the future maybe replace this with wcwidth() or similar */
static int
utfwidth(int c)
{
	/* count TAB as 8 */
	if (c == '\t')
		return 8;
	/* not the start of a codepoint: 0, otherwise 1 */
	return ((c & 0xc0) == 0x80) ? 0 : 1;
}
865
866 /* write a character, handling state of repeated newlines, some HTML
867 white-space rules, indentation and word-wrapping */
/* The current node is always marked as having data, even when the character
   ends up being skipped. Without line-wrapping (`linewrap` is 0) the
   character is written to stdout directly; with line-wrapping it is
   collected in `rbuf` and flushed at a newline, white-space or '-'. */
868 static void
869 hputchar(int c)
870 {
871 struct node *cur = &nodes[curnode];
872 cur->hasdata = 1;
873
874 if (c == '\n') {
875 /* previous line had characters, so not a repeated newli…
876 if (nbytesline > 0)
877 hadnewline = 0;
878
879 /* start a new line, no chars on this line yet */
880 whitespace_mode &= ~2; /* no chars on this line yet */
881 nbytesline = 0;
882 ncells = 0;
883
884 if (hadnewline)
885 currentnewlines++; /* repeating newlines */
886 hadnewline = 1;
887 } else {
888 hadnewline = 0;
889 currentnewlines = 0;
890 }
891
892 /* skip initial/leading white-space */
893 if (ISSPACE((unsigned char)c)) {
894 if (skipinitialws)
895 return;
896 } else {
897 skipinitialws = 0;
898 }
899
/* drop control characters, except newline and TAB */
900 if (!(c == '\n' || c == '\t' || !ISCNTRL((unsigned char)c)))
901 return;
902
/* no line-wrapping: write directly, indenting at the start of a line */
903 if (!linewrap) {
904 if (c == '\n') {
905 putchar('\n');
906 nbytesline = 0;
907 ncells = 0;
908 } else {
909 if (!nbytesline) {
910 if (curmarkup)
911 emitmarkup(0);
912 rindent();
913 /* emit code again per line, needed for …
914 if (curmarkup)
915 emitmarkup(curmarkup);
916 }
917 putchar(c);
918 nbytesline++;
919 ncells += utfwidth(c);
920 }
921 return;
922 }
923
924 /* really too long: the whole word doesn't even fit, flush it */
925 if (ncells + rnbufcells >= termwidth || rbuflen >= sizeof(rbuf) …
926 putchar('\n');
927 nbytesline = 0;
928 ncells = 0;
929 hflush();
930 }
931
932 if (c == '\n') {
933 putchar('\n');
934 hflush();
935 return;
936 } else if (ISSPACE((unsigned char)c) || c == '-') {
/* word boundary: wrap to the next line first when the pending word
   does not fit, then flush the word including this character */
937 if (ncells + rnbufcells >= termwidth) {
938 putchar('\n');
939 nbytesline = 0;
940 ncells = 0;
941 }
942 rbuf[rbuflen++] = c;
943 rnbufcells += utfwidth(c);
944 hflush();
945 return;
946 }
947
/* inside a word: only buffer the character */
948 rbuf[rbuflen++] = c;
949 rnbufcells += utfwidth(c);
950 }
951
952 /* calculate indentation of current node depth, using the sum of each
953 indentation per node */
954 static int
955 calcindent(void)
956 {
957 int i, n = 0;
958
959 for (i = curnode; i >= 0; i--)
960 n += nodes[i].indent;
961
962 return n;
963 }
964
/* Write the NUL-terminated string `s` one byte at a time through
   hputchar(), so it gets the same formatting logic. */
static void
hprint(const char *s)
{
	while (*s != '\0')
		hputchar(*s++);
}
971
/* printf()-like output through hprint(). The formatted text is limited to
   the size of a 256 byte buffer (including the NUL byte) for now: longer
   output is truncated. */
static void
hprintf(const char *fmt, ...)
{
	char text[256];
	va_list args;

	va_start(args, fmt);
	vsnprintf(text, sizeof(text), fmt, args);
	va_end(args);

	/* use hprint() formatting logic. */
	hprint(text);
}
986
987 static void
988 newline(void)
989 {
990 if (skipinitialws)
991 return;
992 hputchar('\n');
993 }
994
/* Walk from node index `n` down to the root (index 0) and return 1 when a
   node with its `hasdata` flag set is found. The walk stops, returning 0, at
   the first node whose display type matches the container mask tested in the
   loop. Returns 0 also when no node has data.
   NOTE(review): the parameter `curtype` is not used in the visible code. */
995 static int
996 parentcontainerhasdata(int curtype, int n)
997 {
998 int i;
999
1000 for (i = n; i >= 0; i--) {
/* the container check comes first: data of the container itself does not count */
1001 if (nodes[i].tag.displaytype & (DisplayList|DisplayTable…
1002 break;
1003 if (nodes[i].hasdata)
1004 return 1;
1005 }
1006
1007 return 0;
1008 }
1009
1010 /* start on a newline for the start of a block element or not */
1011 static void
1012 startblock(void)
1013 {
1014 hflush();
1015 whitespace_mode &= ~2; /* no characters on this line yet */
1016 if (nbytesline <= 0)
1017 return;
1018 if (!hadnewline && curnode >= 0 && nodes[curnode - 1].hasdata)
1019 hputchar('\n');
1020 }
1021
1022 /* start on a newline for the end of a block element or not */
1023 static void
1024 endblock(void)
1025 {
1026 hflush();
1027 whitespace_mode &= ~2; /* no characters on this line yet */
1028 if (nbytesline <= 0)
1029 return;
1030 if (!hadnewline)
1031 hputchar('\n');
1032 }
1033
1034 /* print one character safely: no control characters,
1035 handle HTML white-space rules */
1036 static void
1037 printc(int c)
1038 {
1039 if (ISSPACE((unsigned char)c)) {
1040 if (whitespace_mode == 2)
1041 hputchar(' ');
1042 whitespace_mode |= 1;
1043 } else {
1044 whitespace_mode = 2;
1045 if (!ISCNTRL((unsigned char)c))
1046 hputchar(c);
1047 }
1048 }
1049
1050 static void
1051 printpre(const char *s, size_t len)
1052 {
1053 struct node *cur;
1054 size_t i;
1055
1056 /* reset state of newlines because this data is printed literall…
1057 hadnewline = 0;
1058 currentnewlines = 0;
1059
1060 /* skip leading newline */
1061 i = 0;
1062 if (skipinitialws) {
1063 if (*s == '\n' && i < len) {
1064 s++;
1065 i++;
1066 }
1067 }
1068
1069 hflush();
1070
1071 skipinitialws = 0;
1072
1073 if (*s) {
1074 cur = &nodes[curnode];
1075 cur->hasdata = 1;
1076 }
1077
1078 for (; *s && i < len; s++, i++) {
1079 switch (*s) {
1080 case '\n':
1081 putchar('\n');
1082 nbytesline = 0;
1083 ncells = 0;
1084 break;
1085 case '\t':
1086 hadnewline = 0;
1087 if (!nbytesline) {
1088 if (curmarkup)
1089 emitmarkup(0);
1090 rindent();
1091 /* emit code again per line, needed for …
1092 if (curmarkup)
1093 emitmarkup(curmarkup);
1094 }
1095
1096 /* TAB to 8 spaces */
1097 fputs(" ", stdout);
1098 nbytesline += 8;
1099 ncells += 8;
1100 break;
1101 default:
1102 if (ISCNTRL((unsigned char)*s))
1103 continue;
1104
1105 if (!nbytesline) {
1106 if (curmarkup)
1107 emitmarkup(0);
1108 rindent();
1109 /* emit code again per line, needed for …
1110 if (curmarkup)
1111 emitmarkup(curmarkup);
1112 }
1113
1114 putchar(*s);
1115 nbytesline++;
1116 /* start of rune: incorrectly assume 1 rune is 1…
1117 ncells += utfwidth((unsigned char)*s);
1118 }
1119 }
1120 }
1121
1122 static struct node *
1123 findparenttype(int cur, int findtype)
1124 {
1125 int i;
1126
1127 for (i = cur; i >= 0; i--) {
1128 if ((nodes[i].tag.displaytype & findtype))
1129 return &nodes[i];
1130 }
1131 return NULL;
1132 }
1133
/* Check if the class name `needle` is one of the white-space separated class
   names in `haystack`. The comparison is case-sensitive and must cover a
   whole name: "foo" matches "a foo b", but not "foobar" or "barfoo".
   Returns 1 on a match, 0 otherwise. An empty `needle` never matches. */
static int
isclassmatch(const char *haystack, const char *needle)
{
	/* the same characters as ISSPACE(): SPACE and '\t' .. '\r' */
	static const char ws[] = " \t\n\v\f\r";
	const size_t needlelen = strlen(needle);
	const char *p = haystack;
	size_t toklen;

	if (needlelen == 0)
		return 0;

	for (;;) {
		p += strspn(p, ws); /* skip separators */
		if (*p == '\0')
			return 0;
		toklen = strcspn(p, ws); /* length of this class name */
		if (toklen == needlelen && memcmp(p, needle, needlelen) == 0)
			return 1;
		p += toklen;
	}
}
1159
/* very limited CSS-like selector, supports: main, main#id, main.class,
   ".class", "#id", "ul li a".
   A component may also carry "@N", which stores N as the node index when it
   parses as a non-negative number.

   sel:      selector text; leading white-space is skipped.
   nodes:    output array, one entry per white-space separated component.
   maxnodes: capacity of `nodes`.

   Returns the number of compiled components, or 0 on failure. It fails when
   a tag name, id, class or index is too long for the 256-byte scratch buffer,
   or when the component counter reaches `maxnodes`. */
static int
compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
{
	int depth = 0, len;
	long l;
	const char *s, *start;
	char tmp[256];
	int nameset = 0; /* set once the tag name of the current component is stored */

	memset(&nodes[0], 0, sizeof(nodes[0]));
	nodes[0].index = -1; /* -1: do not match on index */

	/* skip leading white-space */
	s = sel;
	for (; *s && ISSPACE((unsigned char)*s); s++)
		;

	start = s;
	for (; ; s++) {
		/* end of tag: the name runs from `start` up to the first
		   delimiter and may be empty (for example ".class") */
		if (!nameset &&
		    (*s == '#' || *s == '.' || *s == '@' ||
		     *s == '\0' || ISSPACE((unsigned char)*s))) {
			nameset = 1;
			len = s - start; /* tag name */
			if (len >= sizeof(tmp))
				return 0;
			if (len)
				memcpy(tmp, start, len);
			tmp[len] = '\0';

			/* NOTE(review): assumes tagname holds at least sizeof(tmp)
			   bytes; confirm against struct selectornode */
			memcpy(nodes[depth].tagname, tmp, len + 1);
		}

		/* end of the current component: move on to the next node */
		if (*s == '\0' || ISSPACE((unsigned char)*s)) {
			for (; ISSPACE((unsigned char)*s); s++)
				;
			start = s; /* start of a new tag */
			depth++;
			/* this check also runs for the last component, so a
			   selector that fills all `maxnodes` entries is rejected */
			if (depth >= maxnodes)
				return 0;

			nameset = 0;
			memset(&nodes[depth], 0, sizeof(nodes[depth]));
			nodes[depth].index = -1;

			/* end of selector */
			if (*s == '\0')
				break;
		}

		/* index: "@N" */
		if (*s == '@') {
			len = strcspn(s + 1, ".#@ \t\n");
			if (len >= sizeof(tmp))
				return 0;
			memcpy(tmp, s + 1, len);
			tmp[len] = '\0';

			l = strtol(tmp, NULL, 10);
			if (l >= 0)
				nodes[depth].index = l;
			/* the loop increment then steps onto the delimiter */
			s += len;
			start = s + 1;
			continue;
		}

		/* id: "#name" */
		if (*s == '#') {
			len = strcspn(s + 1, ".#@ \t\n");
			if (len >= sizeof(tmp))
				return 0;
			memcpy(tmp, s + 1, len);
			tmp[len] = '\0';
			memcpy(nodes[depth].id, tmp, len + 1);
			s += len;
			start = s + 1;
			continue;
		}

		/* class: ".name" */
		if (*s == '.') {
			len = strcspn(s + 1, ".#@ \t\n");
			if (len >= sizeof(tmp))
				return 0;
			memcpy(tmp, s + 1, len);
			tmp[len] = '\0';
			/* allow only one classname for now: a later ".name" in
			   the same component overwrites the earlier one */
			memcpy(nodes[depth].classnames, tmp, len + 1);
			s += len;
			start = s + 1;
			continue;
		}
	}

	return depth;
}
1259
1260 static struct selector *
1261 newselector(const char *q)
1262 {
1263 struct selector *sel;
1264 int r;
1265
1266 sel = ecalloc(1, sizeof(*sel));
1267 sel->text = estrdup(q);
1268
1269 r = compileselector(sel->text, sel->nodes, LEN(sel->nodes));
1270 if (r <= 0) {
1271 free(sel);
1272 return NULL;
1273 }
1274 sel->depth = r;
1275
1276 return sel;
1277 }
1278
1279 static struct selectors *
1280 compileselectors(const char *q)
1281 {
1282 struct selectors *sels = NULL;
1283 struct selector *sel;
1284 const char *start;
1285 char *qe;
1286 int count = 0;
1287 size_t siz;
1288
1289 sels = ecalloc(1, sizeof(*sels));
1290
1291 start = q;
1292 for (; ; q++) {
1293 if (*q == ',' || *q == '\0') {
1294 qe = estrndup(start, q - start);
1295 sel = newselector(qe);
1296 free(qe);
1297
1298 /* add new selector */
1299 siz = (count + 1) * sizeof(struct selector *);
1300 sels->selectors = erealloc(sels->selectors, siz);
1301 sels->selectors[count] = sel;
1302 count++;
1303
1304 if (*q == '\0')
1305 break;
1306 start = q + 1;
1307 }
1308 }
1309 sels->count = count;
1310
1311 return sels;
1312 }
1313
1314 /* very limited CSS-like matcher, supports: main, main#id, main.class,
1315 ".class", "#id", "ul li a" */
1316 static int
1317 iscssmatch(struct selector *sel, struct node *root, int maxdepth)
1318 {
1319 int d, md = 0;
1320
1321 for (d = 0; d <= maxdepth; d++) {
1322 /* tag matched? */
1323 if (sel->nodes[md].tagname[0] &&
1324 strcasecmp(sel->nodes[md].tagname, root[d].tagname))
1325 continue; /* no */
1326
1327 /* id matched? */
1328 if (sel->nodes[md].id[0] && strcmp(sel->nodes[md].id, ro…
1329 continue; /* no */
1330
1331 /* class matched, for now allow only one classname in th…
1332 matching multiple classnames */
1333 if (sel->nodes[md].classnames[0] &&
1334 !isclassmatch(root[d].classnames, sel->nodes[md].cla…
1335 continue; /* no */
1336
1337 /* index matched */
1338 if (sel->nodes[md].index != -1 &&
1339 (d == 0 ||
1340 root[d - 1].nchildren == 0 ||
1341 sel->nodes[md].index != root[d - 1].nchildren - 1))
1342 continue;
1343
1344 md++;
1345 /* all matched of one selector */
1346 if (md == sel->depth)
1347 return 1;
1348 }
1349
1350 return 0;
1351 }
1352
1353 static int
1354 iscssmatchany(struct selectors *sels, struct node *root, int maxdepth)
1355 {
1356 struct selector *sel;
1357 int i;
1358
1359 for (i = 0; i < sels->count; i++) {
1360 sel = sels->selectors[i];
1361 if (iscssmatch(sel, root, maxdepth))
1362 return 1;
1363 }
1364 return 0;
1365 }
1366
1367 static void
1368 handleinlinealt(void)
1369 {
1370 struct node *cur;
1371 char *start, *s, *e;
1372
1373 /* do not show the alt text if the element is hidden */
1374 cur = &nodes[curnode];
1375 if (cur->tag.displaytype & DisplayNone)
1376 return;
1377
1378 /* show img alt attribute as text. */
1379 if (attr_alt.len) {
1380 start = attr_alt.data;
1381 e = attr_alt.data + attr_alt.len;
1382
1383 for (s = start; s < e; s++)
1384 printc((unsigned char)*s);
1385 hflush();
1386 } else if (cur->tag.id == TagImg && !showurlinline) {
1387 /* if there is no alt text and no URL is shown inline, t…
1388 show "[IMG]" to indicate there was an image there */
1389 hprint("[IMG]");
1390 }
1391 }
1392
1393 /* lookup a link reference by url in the red-black tree */
1394 static struct linkref *
1395 findlinkref(const char *url)
1396 {
1397 struct linkref find;
1398
1399 find.url = (char *)url;
1400
1401 return RB_FIND(linkreftree, &linkrefhead, &find);
1402 }
1403
1404 /* add a link reference. Returns the added link reference, or the existi…
1405 reference if links are deduplicated */
1406 static struct linkref *
1407 addlinkref(const char *url, const char *_type, enum TagId tagid, int ish…
1408 {
1409 struct linkref *link;
1410 size_t linknr;
1411
1412 /* if links are deduplicates return the existing link */
1413 if (uniqrefs && (link = findlinkref(url)))
1414 return link;
1415
1416 if (tagid == TagA)
1417 _type = "link";
1418
1419 link = ecalloc(1, sizeof(*link));
1420
1421 if (!ishidden) {
1422 linknr = ++nvisrefs;
1423 if (nvisrefs >= ncapvisrefs) {
1424 ncapvisrefs += 256; /* greedy alloc */
1425 visrefs = erealloc(visrefs, sizeof(*visrefs) * n…
1426 }
1427 visrefs[linknr - 1] = link; /* add pointer to list */
1428 } else {
1429 linknr = ++nhiddenrefs;
1430 if (nhiddenrefs >= ncaphiddenrefs) {
1431 ncaphiddenrefs += 256; /* greedy alloc */
1432 hiddenrefs = erealloc(hiddenrefs, sizeof(*hidden…
1433 }
1434 hiddenrefs[linknr - 1] = link; /* add pointer to list */
1435 }
1436
1437 link->url = estrdup(url);
1438 link->type = estrdup(_type);
1439 link->tagid = tagid;
1440 link->ishidden = ishidden;
1441 link->linknr = linknr;
1442
1443 /* add to tree: the tree is only used for checking unique link r…
1444 if (uniqrefs)
1445 RB_INSERT(linkreftree, &linkrefhead, link);
1446
1447 return link;
1448 }
1449
1450 static void
1451 handleinlinelink(void)
1452 {
1453 struct uri newuri, olduri;
1454 struct node *cur;
1455 char buf[4096], *url;
1456 int r;
1457
1458 if (!showrefbottom && !showrefinline && !showurlinline && !resou…
1459 return; /* there is no need to collect the reference */
1460
1461 if (!attr_href.len && !attr_src.len && !attr_data.len)
1462 return; /* there is no reference */
1463
1464 /* by default use the original URL */
1465 if (attr_src.len)
1466 url = attr_src.data;
1467 else if (attr_href.len)
1468 url = attr_href.data;
1469 else
1470 url = attr_data.data;
1471
1472 if (!url)
1473 return;
1474
1475 /* Not an absolute URL yet: try to make it absolute.
1476 If it is not possible use the relative URL */
1477 if (!uri_hasscheme(url) && basehrefset &&
1478 uri_parse(url, &olduri) != -1 &&
1479 uri_makeabs(&newuri, &olduri, &base) != -1 &&
1480 newuri.proto[0]) {
1481 r = uri_format(buf, sizeof(buf), &newuri);
1482 if (r >= 0 && (size_t)r < sizeof(buf))
1483 url = buf;
1484 }
1485
1486 if (!url[0])
1487 return;
1488
1489 cur = &nodes[curnode];
1490
1491 if (!(cur->tag.displaytype & DisplayNone)) {
1492 string_clear(&nodes_links[curnode]);
1493 string_append(&nodes_links[curnode], url, strlen(url));
1494 }
1495
1496 /* add hidden links directly to the reference,
1497 the order doesn't matter */
1498 if (cur->tag.displaytype & DisplayNone)
1499 addlinkref(url, cur->tag.name, cur->tag.id, 1);
1500 }
1501
1502 static void
1503 printlinkrefs(void)
1504 {
1505 struct linkref *ref;
1506 size_t i;
1507
1508 if (!nvisrefs && !nhiddenrefs)
1509 return;
1510
1511 if (resources) {
1512 for (i = 0; i < nvisrefs; i++) {
1513 ref = visrefs[i];
1514 dprintf(3, "%s\t%s\n", ref->type, ref->url);
1515 }
1516 for (i = 0; i < nhiddenrefs; i++) {
1517 ref = hiddenrefs[i];
1518 dprintf(3, "%s\t%s\n", ref->type, ref->url);
1519 }
1520 }
1521
1522 printf("\nReferences\n\n");
1523
1524 for (i = 0; i < nvisrefs; i++) {
1525 ref = visrefs[i];
1526 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->ty…
1527 }
1528
1529 if (nhiddenrefs > 0)
1530 printf("\n\nHidden references\n\n");
1531 /* hidden links don't have a link number, just count them */
1532 for (i = 0; i < nhiddenrefs; i++) {
1533 ref = hiddenrefs[i];
1534 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->ty…
1535 }
1536 }
1537
1538 /* size to grow node capacity (greedy) */
1539 #define NODE_CAP_INC 16
1540
1541 /* increase node depth, allocate space for nodes if needed */
1542 static void
1543 incnode(void)
1544 {
1545 size_t i;
1546
1547 curnode++;
1548
1549 if (curnode >= MAX_NODE_DEPTH)
1550 errx(1, "max node depth reached: %d", curnode);
1551
1552 if (curnode >= ncapnodes) {
1553 nodes = erealloc(nodes, sizeof(*nodes) * (ncapnodes + NO…
1554 nodes_links = erealloc(nodes_links, sizeof(*nodes_links)…
1555
1556 /* clear new region */
1557 memset(&nodes[ncapnodes], 0, sizeof(*nodes) * NODE_CAP_I…
1558 memset(&nodes_links[ncapnodes], 0, sizeof(*nodes_links) …
1559
1560 for (i = ncapnodes; i < ncapnodes + NODE_CAP_INC; i++) {
1561 nodes[i].tag.displaytype = DisplayInline;
1562 nodes[i].tag.name = nodes[i].tagname; /* assign …
1563 }
1564
1565 ncapnodes += NODE_CAP_INC; /* greedy alloc */
1566 }
1567 }
1568
/* Parser callback for the start of a data section: intentionally does
   nothing, the data is collected in xmldata() and printed in xmldataend(). */
static void
xmldatastart(XMLParser *p)
{
}
1573
1574 static void
1575 xmldataend(XMLParser *p)
1576 {
1577 struct node *cur;
1578 char *start, *s, *e;
1579
1580 if (!htmldata.data || !htmldata.len)
1581 return;
1582
1583 cur = &nodes[curnode];
1584
1585 if (reader_ignore || (cur->tag.displaytype & DisplayNone)) {
1586 /* print nothing */
1587 } else if ((cur->tag.displaytype & DisplayPre) ||
1588 findparenttype(curnode - 1, DisplayPre)) {
1589 printpre(htmldata.data, htmldata.len);
1590 } else {
1591 start = htmldata.data;
1592 e = htmldata.data + htmldata.len;
1593
1594 for (s = start; s < e; s++)
1595 printc((unsigned char)*s);
1596 }
1597
1598 string_clear(&htmldata);
1599 }
1600
1601 static void
1602 xmldata(XMLParser *p, const char *data, size_t datalen)
1603 {
1604 struct node *cur;
1605
1606 if (reader_ignore)
1607 return;
1608
1609 cur = &nodes[curnode];
1610 if (cur->tag.displaytype & DisplayNone)
1611 return;
1612
1613 string_append(&htmldata, data, datalen);
1614 }
1615
1616 static void
1617 xmldataentity(XMLParser *p, const char *data, size_t datalen)
1618 {
1619 struct node *cur;
1620 char buf[16];
1621 int n;
1622
1623 if (reader_ignore)
1624 return;
1625
1626 cur = &nodes[curnode];
1627 if (cur->tag.displaytype & DisplayNone)
1628 return;
1629
1630 n = xml_entitytostr(data, buf, sizeof(buf));
1631 if (n > 0)
1632 xmldata(p, buf, (size_t)n);
1633 else
1634 xmldata(p, data, datalen);
1635 }
1636
/* Parser callback for the start of a CDATA section: CDATA is treated as
   data, so this forwards to xmldatastart(). */
static void
xmlcdatastart(XMLParser *p)
{
	xmldatastart(p);
}
1642
/* Parser callback for the end of a CDATA section: forwards to xmldataend(),
   which handles the collected text. */
static void
xmlcdataend(XMLParser *p)
{
	xmldataend(p); /* treat CDATA as data */
}
1648
/* Parser callback for CDATA content: forwards to xmldata(), which collects
   the text. */
static void
xmlcdata(XMLParser *p, const char *data, size_t datalen)
{
	xmldata(p, data, datalen); /* treat CDATA as data */
}
1654
1655 /* lookup function to compare tag name (case-insensitive) for sort funct…
1656 static int
1657 findtagcmp(const void *v1, const void *v2)
1658 {
1659 struct tag *t1 = (struct tag *)v1;
1660 struct tag *t2 = (struct tag *)v2;
1661
1662 return strcasecmp(t1->name, t2->name);
1663 }
1664
1665 /* binary search tag by tag name */
1666 static struct tag *
1667 findtag(const char *t)
1668 {
1669 struct tag find = { 0 };
1670
1671 find.name = t;
1672
1673 return bsearch(&find, tags, LEN(tags), sizeof(*tags), findtagcmp…
1674 }
1675
1676 static void
1677 handleendtag(struct tag *tag)
1678 {
1679 int i, marginbottom;
1680
1681 if (tag->displaytype & DisplayNone)
1682 return;
1683 if (reader_ignore)
1684 return;
1685
1686 if (tag->displaytype & (DisplayButton | DisplayOption)) {
1687 hputchar(']');
1688 hflush();
1689 }
1690
1691 if (tag->displaytype & (DisplayBlock | DisplayHeader | DisplayTa…
1692 DisplayList | DisplayListItem | DisplayPre)) {
1693 endblock(); /* break line if needed */
1694 }
1695
1696 /* when a list ends and its not inside a list add an extra botto…
1697 marginbottom = tag->marginbottom;
1698
1699 if (marginbottom > 0) {
1700 if (tag->displaytype & DisplayList) {
1701 if (findparenttype(curnode - 1, DisplayList))
1702 marginbottom--;
1703 }
1704 }
1705
1706 if (marginbottom > 0) {
1707 hflush();
1708 for (i = currentnewlines; i < marginbottom; i++) {
1709 putchar('\n');
1710 nbytesline = 0;
1711 ncells = 0;
1712 currentnewlines++;
1713 }
1714 hadnewline = 1;
1715 }
1716 }
1717
1718 static void
1719 endnode(struct node *cur)
1720 {
1721 struct linkref *ref;
1722 int i, ishidden;
1723
1724 /* set a flag indicating the element and its parent containers h…
1725 This is used for some formatting */
1726 if (cur->hasdata) {
1727 for (i = curnode; i >= 0; i--)
1728 nodes[i].hasdata = 1;
1729 }
1730
1731 endmarkup(cur->tag.markuptype);
1732
1733 ishidden = reader_ignore || (cur->tag.displaytype & DisplayNone);
1734
1735 /* add link and show the link number in the visible order */
1736 if (!ishidden && nodes_links[curnode].len > 0) {
1737 ref = addlinkref(nodes_links[curnode].data,
1738 cur->tag.name, cur->tag.id, ishidden);
1739
1740 if (showrefinline || showurlinline) {
1741 hflush();
1742 startmarkup(MarkupReverse);
1743 }
1744
1745 if (showrefinline)
1746 hprintf("[%zu]", ref->linknr);
1747 if (showurlinline) {
1748 if (ref->tagid == TagA)
1749 hprintf("[%s]", ref->url);
1750 else
1751 hprintf("[%s: %s]", ref->type, ref->url);
1752 }
1753 if (showrefinline || showurlinline) {
1754 endmarkup(MarkupReverse);
1755 hflush();
1756 }
1757 }
1758
1759 handleendtag(&(cur->tag));
1760 }
1761
/* Parser callback for an end tag `t` (`isshort` is set for a short tag).
   In order it:
   - ignores an explicit end tag of a void element;
   - closes still-open optional child tags that the end tag implies;
   - closes the matching open node, searching towards the root if the
     current node does not match;
   - recalculates the indentation and restores the markup of the node that
     is now current;
   - in reader mode, starts ignoring output again once the current node path
     no longer matches the "show" selectors. */
static void
xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
{
	struct tag *found, *tag;
	enum TagId child, childs[16];
	size_t nchilds;
	int i, j, k, nchildfound, parenttype;

	/* match tag and lookup metadata */
	/* ignore an explicit closing tag of a void element, like </br> */
	if ((found = findtag(t))) {
		if (!isshort && found->isvoid)
			return;
	}

	/* TODO: implement more complete optional tag handling.
	   in reality the optional tag rules are more complex, see the
	   "optional tags" section of the HTML syntax specification */

	child = 0;
	nchilds = 0;
	nchildfound = 0;
	parenttype = 0; /* by default, seek until the root */

	/* pick the child tags that this end tag closes implicitly and the
	   display type of the container at which the search stops */
	if (found && found->displaytype & DisplayPre) {
		skipinitialws = 0; /* do not skip white-space */
	} else if (found && found->displaytype & DisplayList) {
		childs[0] = TagLi;
		nchilds = 1;
		parenttype = DisplayList;
	} else if (found && found->displaytype & DisplayTableRow) {
		childs[0] = TagTd;
		nchilds = 1;
		parenttype = DisplayTableRow;
	} else if (found && found->displaytype & DisplayTable) {
		childs[0] = TagTd;
		nchilds = 1;
		parenttype = DisplayTable;
	} else if (found && found->displaytype & DisplaySelect) {
		childs[0] = TagOption;
		nchilds = 1;
		parenttype = DisplaySelect;
	} else if (found && found->displaytype & DisplayDl) {
		childs[0] = TagP;
		childs[1] = TagDd;
		childs[2] = TagDt;
		nchilds = 3;
		parenttype = DisplayDl;
	} else if (found && found->displaytype & DisplayBlock) {
		childs[0] = TagP;
		nchilds = 1;
		parenttype = 0; /* seek until the root */
	}

	if (nchilds > 0) {
		/* search from the current node towards the root and stop at the
		   first node that is one of the child tags or the container */
		for (i = curnode; i >= 0; i--) {
			if (nchildfound)
				break;
			if ((nodes[i].tag.displaytype & parenttype))
				break;
			for (j = 0; j < nchilds; j++) {
				child = childs[j];
				if (nodes[i].tag.id == child) {
					/* fake closing the previous tags, from the
					   current node down to the found child */
					for (k = curnode; k >= i; k--)
						endnode(&nodes[k]);
					/* NOTE(review): k is i - 1 here, so curnode
					   would become -1 if the match were at index
					   0; confirm that node 0 can never match */
					curnode = k;
					nchildfound = 1;
					break;
				}
			}
		}
	}

	/* if the current closing tag matches the current open tag */
	if (nodes[curnode].tag.name &&
	    !tagcmp(nodes[curnode].tag.name, t)) {
		endnode(&nodes[curnode]);
		if (curnode)
			curnode--;
	} else {
		/* ... else lookup the first matching start tag. This is
		   for handling optional closing tags */
		tag = NULL;
		for (i = curnode; i >= 0; i--) {
			if (nodes[i].tag.name &&
			    !tagcmp(nodes[i].tag.name, t)) {
				endnode(&nodes[i]);
				/* continue in the parent, never above index 0 */
				curnode = i > 0 ? i - 1 : 0;
				tag = &nodes[i].tag;
				break;
			}
		}
		/* unmatched closing tag found: still run the end-tag handling
		   of a known tag */
		if (!tag && found)
			handleendtag(found);
	}
	indent = calcindent();

#if 0
	/* check if linewrap is enabled, but currently is disabled and needs to
	   be restored */
	if (allowlinewrap && !linewrap) {
		tag = NULL;
		for (i = curnode; i >= 0; i--) {
			if (nodes[i].tag.id == TagTable) {
				tag = &nodes[i].tag;
				break;
			}
		}
		if (!tag)
			linewrap = allowlinewrap;
	}
#endif

	/* restore markup of the tag we are in now */
	startmarkup(nodes[curnode].tag.markuptype);

	/* check if the current node still matches the visible selector */
	if (reader_mode && sel_show && !reader_ignore) {
		if (!iscssmatchany(sel_show, nodes, curnode)) {
			reader_ignore = 1;
			newline();
		}
	}
}
1888
1889 static void
1890 xmltagstart(XMLParser *p, const char *t, size_t tl)
1891 {
1892 struct tag *found;
1893 struct node *cur;
1894 enum TagId tagid;
1895 enum TagId child, childs[16];
1896 size_t nchilds;
1897 char *s;
1898 int i, j, k, nchildfound, parenttype;
1899
1900 cur = &nodes[curnode];
1901
1902 string_clear(&attr_alt);
1903 string_clear(&attr_checked);
1904 string_clear(&attr_class);
1905 attr_class_set = 0;
1906 string_clear(&attr_data);
1907 string_clear(&attr_href);
1908 string_clear(&attr_id);
1909 attr_id_set = 0;
1910 string_clear(&attr_src);
1911 string_clear(&attr_type);
1912 string_clear(&attr_value);
1913
1914 /* match tag and lookup metadata */
1915 found = findtag(t);
1916
1917 /* TODO: implement more complete optional tag handling.
1918 in reality the optional tag rules are more complex, see:
1919 https://html.spec.whatwg.org/multipage/syntax.html#optional-t…
1920
1921 child = 0;
1922 nchilds = 0;
1923 nchildfound = 0;
1924 parenttype = 0; /* by default, seek until the root */
1925
1926 /* if optional tag <p> is open and a list element is found, clos…
1927 if (found && found->displaytype & DisplayList) {
1928 /* not inside a list */
1929 childs[0] = TagP;
1930 nchilds = 1;
1931 parenttype = DisplayList;
1932 } else if (found && found->isoptional) {
1933 tagid = found->id;
1934 if (tagid == TagLi) {
1935 childs[0] = TagLi;
1936 nchilds = 1;
1937 parenttype = DisplayList;
1938 } else if (tagid == TagTd) {
1939 childs[0] = TagTd;
1940 nchilds = 1;
1941 parenttype = DisplayTableRow;
1942 } else if (tagid == TagTr) {
1943 childs[0] = TagTr;
1944 nchilds = 1;
1945 parenttype = DisplayTable;
1946 } else if (tagid == TagP) {
1947 childs[0] = TagP;
1948 nchilds = 1;
1949 parenttype = 0; /* seek until the root */
1950 } else if (tagid == TagOption) {
1951 childs[0] = TagOption;
1952 nchilds = 1;
1953 parenttype = DisplaySelect;
1954 } else if (tagid == TagDt) {
1955 childs[0] = TagDd;
1956 nchilds = 1;
1957 parenttype = DisplayDl;
1958 } else if (tagid == TagDd) {
1959 childs[0] = TagDd;
1960 childs[1] = TagDt;
1961 nchilds = 2;
1962 parenttype = DisplayDl;
1963 } else if (tagid == cur->tag.id) {
1964 /* fake closing the previous tag if it is the sa…
1965 xmltagend(p, t, tl, 0);
1966 }
1967 } else if (found && found->displaytype & DisplayBlock) {
1968 /* check if we have an open "<p>" tag */
1969 childs[0] = TagP;
1970 childs[1] = TagDl;
1971 nchilds = 2;
1972 parenttype = DisplayDl;
1973 }
1974
1975 if (nchilds > 0) {
1976 for (i = curnode; i >= 0; i--) {
1977 if (nchildfound)
1978 break;
1979 if ((nodes[i].tag.displaytype & parenttype))
1980 break;
1981 for (j = 0; j < nchilds; j++) {
1982 child = childs[j];
1983 if (nodes[i].tag.id == child) {
1984 /* fake closing the previous tag…
1985 for (k = curnode; k >= i; k--)
1986 xmltagend(p, nodes[k].ta…
1987 nchildfound = 1;
1988 break;
1989 }
1990 }
1991 }
1992 }
1993
1994 incnode();
1995 string_clear(&nodes_links[curnode]); /* clear possible link refe…
1996 cur = &nodes[curnode];
1997 memset(cur, 0, sizeof(*cur)); /* clear / reset node */
1998 /* tag defaults */
1999 cur->tag.displaytype = DisplayInline;
2000 cur->tag.name = cur->tagname; /* assign fixed-size buffer */
2001 strlcpy(cur->tagname, t, sizeof(cur->tagname));
2002
2003 /* force to lowercase */
2004 for (s = cur->tagname; *s; s++)
2005 *s = TOLOWER((unsigned char)*s);
2006
2007 /* matched tag: copy tag information to current node */
2008 if (found)
2009 memcpy(&(cur->tag), found, sizeof(*found));
2010
2011 /* if parent tag is hidden then hide itself too */
2012 if (curnode > 0 && (nodes[curnode - 1].tag.displaytype & Display…
2013 cur->tag.displaytype |= DisplayNone;
2014 }
2015
2016 static void
2017 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
2018 {
2019 struct tag *found;
2020 enum TagId tagid;
2021 struct node *cur, *parent;
2022 int i, margintop;
2023
2024 /* match tag and lookup metadata */
2025 tagid = 0;
2026 if ((found = findtag(t)))
2027 tagid = found->id;
2028
2029 /* temporary replace the callback except the reader and end of t…
2030 restore the context once we receive the same ignored tag in t…
2031 end tag handler */
2032 if (tagid == TagScript) {
2033 ignorestate = endtag = "</script>";
2034 getnext = p->getnext; /* for restore */
2035 p->getnext = getnext_ignore;
2036 xmltagend(p, t, tl, 0); /* fake the call the tag was end…
2037 return;
2038 } else if (tagid == TagStyle) {
2039 ignorestate = endtag = "</style>";
2040 getnext = p->getnext; /* for restore */
2041 p->getnext = getnext_ignore;
2042 xmltagend(p, t, tl, 0); /* fake the call the tag was end…
2043 return;
2044 }
2045
2046 #if 0
2047 /* disable line-wrapping inside tables */
2048 if (tagid == TagTable)
2049 linewrap = 0;
2050 #endif
2051
2052 cur = &nodes[curnode];
2053
2054 /* copy attributes if set */
2055 if (attr_id.len)
2056 strlcpy(cur->id, attr_id.data, sizeof(cur->id));
2057 else
2058 cur->id[0] = '\0';
2059 if (attr_class.len)
2060 strlcpy(cur->classnames, attr_class.data, sizeof(cur->cl…
2061 else
2062 cur->classnames[0] = '\0';
2063
2064 /* parent node */
2065 if (curnode > 0) {
2066 parent = &nodes[curnode - 1];
2067 parent->nchildren++; /* increase child node count */
2068 /* count visible childnodes */
2069 if (!(cur->tag.displaytype & DisplayNone))
2070 parent->visnchildren++;
2071 } else {
2072 parent = NULL;
2073 }
2074
2075 if (reader_mode && sel_show && reader_ignore &&
2076 iscssmatchany(sel_show, nodes, curnode))
2077 reader_ignore = 0;
2078
2079 /* hide element */
2080 if (reader_mode && sel_hide &&
2081 iscssmatchany(sel_hide, nodes, curnode))
2082 cur->tag.displaytype |= DisplayNone;
2083
2084 /* indent for this tag */
2085 cur->indent = cur->tag.indent;
2086
2087 if (!reader_ignore) {
2088 /* add link reference, print links and alt text */
2089 handleinlinelink();
2090 handleinlinealt();
2091 }
2092
2093 /* <select><option> */
2094 if (cur->tag.displaytype & DisplayOption) {
2095 /* <select multiple>: show all options */
2096 if (parent->tag.displaytype & DisplaySelectMulti)
2097 cur->tag.displaytype |= DisplayBlock;
2098 else if (parent->nchildren > 1) /* show the first item a…
2099 cur->tag.displaytype |= DisplayNone; /* else hid…
2100 }
2101
2102 if (cur->tag.displaytype & DisplayNone)
2103 return;
2104
2105 if (reader_ignore)
2106 return;
2107
2108 indent = calcindent();
2109
2110 if ((cur->tag.displaytype & (DisplayBlock | DisplayHeader | Disp…
2111 DisplayTable | DisplayTableRow |
2112 DisplayList | DisplayListItem))) {
2113 startblock(); /* break line if needed */
2114 }
2115
2116 if (cur->tag.displaytype & (DisplayButton | DisplayOption)) {
2117 hflush();
2118 hputchar('[');
2119 }
2120
2121 margintop = cur->tag.margintop;
2122 if (cur->tag.displaytype & (DisplayList)) {
2123 for (i = curnode - 1; i >= 0; i--) {
2124 if (nodes[i].tag.displaytype & DisplayList)
2125 break;
2126 if (!(nodes[i].tag.displaytype & DisplayListItem…
2127 continue;
2128 if (nodes[i].hasdata && margintop > 0) {
2129 margintop--;
2130 break;
2131 }
2132 }
2133 } else if (cur->tag.displaytype & (DisplayBlock|DisplayTable)) {
2134 if (!parentcontainerhasdata(cur->tag.displaytype, curnod…
2135 if (margintop > 0)
2136 margintop--;
2137 }
2138 }
2139
2140 if (margintop > 0) {
2141 hflush();
2142 for (i = currentnewlines; i < margintop; i++) {
2143 putchar('\n');
2144 nbytesline = 0;
2145 ncells = 0;
2146 currentnewlines++;
2147 }
2148 hadnewline = 1;
2149 }
2150
2151 if (cur->tag.displaytype & DisplayPre) {
2152 skipinitialws = 1;
2153 } else if (cur->tag.displaytype & DisplayTableCell) {
2154 if (parent && parent->visnchildren > 1)
2155 hputchar('\t');
2156 } else if (cur->tag.displaytype & DisplayListItem) {
2157 /* find first parent node and ordered numbers or unorder…
2158 if (parent) {
2159 skipinitialws = 0;
2160
2161 /* print bullet, add columns to indentation leve…
2162 if (parent->tag.displaytype & DisplayListOrdered…
2163 hprintf("%4zu. ", parent->nchildren);
2164 cur->indent = 6;
2165 indent += cur->indent; /* align to numbe…
2166 } else if (parent->tag.displaytype & DisplayList…
2167 hprint(str_bullet_item);
2168 cur->indent = 2;
2169 indent += 2; /* align to bullet */
2170 }
2171 }
2172 skipinitialws = 0;
2173 } else if (cur->tag.displaytype & DisplayInput) {
2174 if (!attr_type.len) {
2175 hprintf("[%-15s]", attr_value.len ? attr_value.d…
2176 } else if (!strcasecmp(attr_type.data, "button")) {
2177 hprintf("[%s]", attr_value.len ? attr_value.data…
2178 } else if (!strcasecmp(attr_type.data, "submit")) {
2179 hprintf("[%s]", attr_value.len ? attr_value.data…
2180 } else if (!strcasecmp(attr_type.data, "reset")) {
2181 hprintf("[%s]", attr_value.len ? attr_value.data…
2182 } else if (!strcasecmp(attr_type.data, "checkbox")) {
2183 hprintf("[%s]",
2184 attr_checked.len &&
2185 !strcasecmp(attr_checked.data, "checked"…
2186 } else if (!strcasecmp(attr_type.data, "radio")) {
2187 hprintf("[%s]",
2188 attr_checked.len &&
2189 !strcasecmp(attr_checked.data, "checked"…
2190 } else if (!strcasecmp(attr_type.data, "hidden")) {
2191 cur->tag.displaytype |= DisplayNone;
2192 } else {
2193 /* unrecognized / default case is text */
2194 hprintf("[%-15s]", attr_value.len ? attr_value.d…
2195 }
2196 }
2197
2198 startmarkup(cur->tag.markuptype);
2199
2200 /* do not count data such as an item bullet as part of the data …
2201 the node */
2202 cur->hasdata = 0;
2203
2204 if (tagid == TagHr) { /* ruler */
2205 i = termwidth - indent - defaultindent;
2206 for (; i > 0; i--)
2207 hprint(str_ruler);
2208 cur->hasdata = 1; /* treat <hr/> as data */
2209 } else if (tagid == TagBr) {
2210 hflush();
2211 hadnewline = 0; /* forced newline */
2212 hputchar('\n');
2213 cur->hasdata = 1; /* treat <br/> as data */
2214 }
2215
2216 /* autoclose tags, such as <br>, pretend we are <br/> */
2217 if (!isshort && cur->tag.isvoid)
2218 xmltagend(p, t, tl, 1); /* pretend close of short tag */
2219 }
2220
2221 static void
2222 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n,
2223 size_t nl, const char *v, size_t vl)
2224 {
2225 struct node *cur;
2226 enum TagId tagid;
2227
2228 cur = &nodes[curnode];
2229 tagid = cur->tag.id;
2230
2231 /* hide tags with attribute aria-hidden or hidden */
2232 if (!attrcmp(n, "aria-hidden") || !attrcmp(n, "hidden"))
2233 cur->tag.displaytype |= DisplayNone;
2234
2235 if (!attr_class_set && !attrcmp(n, "class")) /* use the first se…
2236 string_append(&attr_class, v, vl);
2237 else if (!attr_id_set && !attrcmp(n, "id")) /* use the first set…
2238 string_append(&attr_id, v, vl);
2239 else if (!attrcmp(n, "type"))
2240 string_append(&attr_type, v, vl);
2241 else if (!attrcmp(n, "value"))
2242 string_append(&attr_value, v, vl);
2243
2244 /* <base href="..." /> */
2245 if (!basehrefset && tagid == TagBase && !attrcmp(n, "href"))
2246 strlcat(basehrefdoc, v, sizeof(basehrefdoc));
2247
2248 if (tagid == TagA && !attrcmp(n, "href"))
2249 string_append(&attr_href, v, vl);
2250
2251 if (tagid == TagSelect && !attrcmp(n, "multiple"))
2252 cur->tag.displaytype |= DisplaySelectMulti;
2253
2254 if (tagid == TagObject && !attrcmp(n, "data"))
2255 string_append(&attr_data, v, vl);
2256
2257 /* show img alt attribute as text. */
2258 if (tagid == TagImg && !attrcmp(n, "alt"))
2259 string_append(&attr_alt, v, vl);
2260
2261 if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked"…
2262 string_append(&attr_checked, v, vl);
2263
2264 /* src attribute */
2265 switch (tagid) {
2266 case TagAudio:
2267 case TagEmbed:
2268 case TagFrame:
2269 case TagIframe:
2270 case TagImg:
2271 case TagSource:
2272 case TagTrack:
2273 case TagVideo:
2274 if (!attrcmp(n, "src"))
2275 string_append(&attr_src, v, vl);
2276 break;
2277 default:
2278 break;
2279 }
2280 }
2281
2282 static void
2283 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n,
2284 size_t nl, const char *v, size_t vl)
2285 {
2286 char buf[16];
2287 int len;
2288
2289 len = xml_entitytostr(v, buf, sizeof(buf));
2290 if (len > 0)
2291 xmlattr(p, t, tl, n, nl, buf, (size_t)len);
2292 else
2293 xmlattr(p, t, tl, n, nl, v, vl);
2294 }
2295
2296 static void
2297 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n,
2298 size_t nl)
2299 {
2300 struct node *cur;
2301 enum TagId tagid;
2302
2303 cur = &nodes[curnode];
2304 tagid = cur->tag.id;
2305
2306 if (!attr_class_set && !attrcmp(n, "class"))
2307 attr_class_set = 1;
2308 else if (!attr_id_set && !attrcmp(n, "id"))
2309 attr_id_set = 1;
2310
2311 /* set base URL, if it is set it cannot be overwritten again */
2312 if (!basehrefset && basehrefdoc[0] &&
2313 tagid == TagBase && !attrcmp(n, "href"))
2314 basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : …
2315
2316 /* if attribute checked is set but it has no value then set it t…
2317 if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked"…
2318 string_append(&attr_checked, "checked", sizeof("checked"…
2319 }
2320
2321 static void
2322 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
2323 size_t nl)
2324 {
2325 struct node *cur;
2326 enum TagId tagid;
2327
2328 cur = &nodes[curnode];
2329 tagid = cur->tag.id;
2330
2331 if (!attrcmp(n, "alt"))
2332 string_clear(&attr_alt);
2333 else if (!attrcmp(n, "checked"))
2334 string_clear(&attr_checked);
2335 else if (!attr_class_set && !attrcmp(n, "class"))
2336 string_clear(&attr_class);
2337 else if (!attrcmp(n, "data"))
2338 string_clear(&attr_data);
2339 else if (!attrcmp(n, "href"))
2340 string_clear(&attr_href);
2341 else if (!attr_id_set && !attrcmp(n, "id"))
2342 string_clear(&attr_id);
2343 else if (!attrcmp(n, "src"))
2344 string_clear(&attr_src);
2345 else if (!attrcmp(n, "type"))
2346 string_clear(&attr_type);
2347 else if (!attrcmp(n, "value"))
2348 string_clear(&attr_value);
2349
2350 if (basehrefdoc[0] && tagid == TagBase && !attrcmp(n, "href"))
2351 basehrefdoc[0] = '\0';
2352 }
2353
/*
 * Print the command-line synopsis to stderr and terminate the program with
 * exit status 1. It is used by main() for unknown options, missing option
 * arguments and invalid option values.
 * NOTE(review): the format string is cut off ("…") in this listing, so the
 * full synopsis text and the fprintf() arguments are not visible here.
 */
2354 static void
2355 usage(void)
2356 {
2357 fprintf(stderr, "%s [-8adiIlrx] [-b basehref] [-s selector] [-u …
2358 exit(1);
2359 }
2360
2361 int
2362 main(int argc, char **argv)
2363 {
2364 char *basehref;
2365
2366 if (pledge("stdio", NULL) < 0)
2367 err(1, "pledge");
2368
2369 ARGBEGIN {
2370 case '8':
2371 str_bullet_item = "\xe2\x80\xa2 ";
2372 str_ruler = "\xe2\x94\x80"; /* symbol: "light horizontal…
2373 break;
2374 case 'a':
2375 allowansi = !allowansi;
2376 break;
2377 case 'b':
2378 basehref = EARGF(usage());
2379 if (uri_parse(basehref, &base) == -1 ||
2380 !base.proto[0])
2381 usage();
2382 basehrefset = 1;
2383 break;
2384 case 'd':
2385 uniqrefs = !uniqrefs;
2386 break;
2387 case 'i':
2388 showrefinline = !showrefinline;
2389 break;
2390 case 'I':
2391 showurlinline = !showurlinline;
2392 break;
2393 case 'l':
2394 showrefbottom = !showrefbottom;
2395 break;
2396 case 'r':
2397 allowlinewrap = !allowlinewrap;
2398 break;
2399 case 's':
2400 sel_show = compileselectors(EARGF(usage()));
2401 /* switch to reader/selector mode, ignore all data excep…
2402 reader_mode = 1;
2403 reader_ignore = 1;
2404 break;
2405 case 'u':
2406 sel_hide = compileselectors(EARGF(usage()));
2407 /* switch to reader/selector mode */
2408 reader_mode = 1;
2409 break;
2410 case 'w':
2411 if ((termwidth = strtol(EARGF(usage()), NULL, 10)) < 1)
2412 usage();
2413 break;
2414 case 'x':
2415 resources = !resources;
2416 break;
2417 default:
2418 usage();
2419 } ARGEND
2420
2421 linewrap = allowlinewrap;
2422
2423 /* initial nodes */
2424 ncapnodes = NODE_CAP_INC;
2425 nodes = ecalloc(ncapnodes, sizeof(*nodes));
2426 nodes_links = ecalloc(ncapnodes, sizeof(*nodes_links));
2427
2428 parser.xmlattrstart = xmlattrstart;
2429 parser.xmlattr = xmlattr;
2430 parser.xmlattrentity = xmlattrentity;
2431 parser.xmlattrend = xmlattrend;
2432 parser.xmlcdatastart = xmlcdatastart;
2433 parser.xmlcdata = xmlcdata;
2434 parser.xmlcdataend = xmlcdataend;
2435 parser.xmldatastart = xmldatastart;
2436 parser.xmldata = xmldata;
2437 parser.xmldataentity = xmldataentity;
2438 parser.xmldataend = xmldataend;
2439 parser.xmltagstart = xmltagstart;
2440 parser.xmltagstartparsed = xmltagstartparsed;
2441 parser.xmltagend = xmltagend;
2442
2443 parser.getnext = getchar;
2444 xml_parse(&parser);
2445
2446 hflush();
2447 if (ncells > 0)
2448 newline();
2449
2450 if (showrefbottom || resources)
2451 printlinkrefs();
2452
2453 hflush();
2454 setmarkup(0);
2455
2456 return 0;
2457 }
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.