GopherProxy

	webdump.c - webdump - HTML to plain-text converter for webpages
	git clone git://git.codemadness.org/webdump
	Log
	Files
	Refs
	README
	LICENSE
	---
	webdump.c (66818B)
	---
	1 #include <errno.h>
	2 #include <limits.h>
	3 #include <stdio.h>
	4 #include <stdarg.h>
	5 #include <stdlib.h>
	6 #include <string.h>
	7 #include <strings.h>
	8 #include <unistd.h>
	9
	10 #include "arg.h"
	11 char *argv0;
	12
	13 #include "tree.h"
	14 #include "xml.h"
	15
	16 static XMLParser parser;
	17
	18 #ifndef __OpenBSD__
	19 #define pledge(p1,p2) 0
	20 #endif
	21
	/* Prototypes for the BSD bounded string functions. Any macro of the same
	   name is removed first so the lines below are ordinary declarations. */
	#undef strlcat
	size_t strlcat(char *, const char *, size_t);
	#undef strlcpy
	size_t strlcpy(char *, const char *, size_t);
	26
	/* ctype-like macros, but always compatible with ASCII / UTF-8.
	   Callers pass a value cast to unsigned char. The unsigned subtraction
	   folds a two-sided range check into a single comparison.
	   Every argument is parenthesised so expressions can be passed safely;
	   TOLOWER and ISSPACE/ISCNTRL still evaluate their argument more than
	   once, so do not pass expressions with side effects. */
	#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
	#define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f)
	#define ISDIGIT(c) (((unsigned)(c)) - '0' < 10)
	#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
	#define TOLOWER(c) ((((unsigned)(c)) - 'A' < 26) ? ((c) | 32) : (c))

	/* element count of a true array (not of a pointer) */
	#define LEN(x) (sizeof(x) / sizeof((x)[0]))
	35
	/* URI split into its components. uri_parse() resets every field to the
	   empty string before filling in the parts present in the input. */
	struct uri {
		char proto[48]; /* scheme including ":" or "://" */
		char userinfo[256]; /* username [:password] */
		char host[256]; /* host name, or IPv6 literal including the brackets */
		char port[6]; /* numeric port */
		char path[1024]; /* path, stored without the "?" / "#" delimiters */
		char query[1024]; /* query, stored without the leading "?" */
		char fragment[1024]; /* fragment, stored without the leading "#" */
	};
	46
	47 /* options */
	48 static int allowansi = 0; /* (-a) allow ANSI escape codes */
	49 static int uniqrefs = 0; /* (-d) number unique references */
	50 static int showrefinline = 0; /* (-i) show link reference number inline…
	51 static int showurlinline = 0; /* (-I) show full link reference inline */
	52 static int showrefbottom = 0; /* (-l) show link references at the botto…
	53 static int allowlinewrap = 0; /* (-r) line-wrapping */
	54 static int termwidth = 77; /* (-w) terminal width */
	55 static int resources = 0; /* (-x) write resources line-by-line to f…
	56
	/* How an element is laid out. The values are bit flags: the tags table
	   combines them (for example DisplayInline | DisplayButton) and lookups
	   test them with a bit mask. */
	enum DisplayType {
		DisplayUnknown = 0,
		DisplayInline = 1 << 0,
		DisplayInlineBlock = 1 << 1, /* unused for now */
		DisplayBlock = 1 << 2,
		DisplayNone = 1 << 3,
		DisplayPre = 1 << 4,
		DisplayList = 1 << 5,
		DisplayListOrdered = 1 << 6,
		DisplayListItem = 1 << 7,
		DisplayTable = 1 << 8,
		DisplayTableRow = 1 << 9,
		DisplayTableCell = 1 << 10,
		DisplayHeader = 1 << 11,
		DisplayDl = 1 << 12,
		DisplayInput = 1 << 13,
		DisplayButton = 1 << 14,
		DisplaySelect = 1 << 15,
		DisplaySelectMulti = 1 << 16,
		DisplayOption = 1 << 17
	};
	78
	/* ANSI markup. Bit flags, so several attributes can be active at once;
	   setmarkup() compares the old and new mask bit by bit. */
	enum MarkupType {
		MarkupNone = 0,
		MarkupBold = 1 << 0,
		MarkupItalic = 1 << 1,
		MarkupUnderline = 1 << 2,
		MarkupBlink = 1 << 3, /* lol */
		MarkupReverse = 1 << 4,
		MarkupStrike = 1 << 5
	};
	89
	/* String data / memory pool: a growable, NUL-terminated byte buffer.
	   A zero-initialized String is valid and empty (data == NULL). */
	typedef struct string {
		char *data; /* data */
		size_t len; /* string length */
		size_t bufsiz; /* allocated size */
	} String;
	96
	97 enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAud…
	98 TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButto…
	99 TagCite, TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDet…
	100 TagDfn, TagDir, TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFields…
	101 TagFigcaption, TagFigure, TagFooter, TagForm, TagFrame, TagH1, T…
	102 TagH3, TagH4, TagH5, TagH6, TagHead, TagHeader, TagHr, TagHtml, …
	103 TagIframe, TagImg, TagInput, TagIns, TagLabel, TagLegend, TagLi,
	104 TagLink, TagMain, TagMark, TagMenu, TagMeta, TagNav, TagObject, …
	105 TagOption, TagP, TagParam, TagPre, TagS, TagScript, TagSearch,
	106 TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle,
	107 TagSummary, TagSvg, TagTable, TagTbody, TagTd, TagTemplate,
	108 TagTextarea, TagTfoot, TagTh, TagThead, TagTitle, TagTr, TagTrac…
	109 TagU, TagUl, TagVar, TagVideo, TagWbr, TagXmp };
	110
	111 struct tag {
	112 const char *name;
	113 enum TagId id;
	114 enum DisplayType displaytype;
	115 enum MarkupType markuptype; /* ANSI markup */
	116 enum DisplayType parenttype; /* display type belonging to elemen…
	117 int isvoid; /* "void" element */
	118 int isoptional; /* optional to close tag */
	119 int margintop; /* newlines when the tag starts */
	120 int marginbottom; /* newlines after the tag ends */
	121 int indent; /* indent in cells */
	122 };
	123
	/* State of one element on the current path of the document tree; the
	   global `nodes` array is indexed by nesting depth. */
	struct node {
		char tagname[256];
		struct tag tag; /* copy of the tag properties for this element */
		size_t nchildren; /* child node count */
		size_t visnchildren; /* child node count which are visible */
		/* attributes */
		char id[256];
		char classnames[1024];
		int indent; /* indent per node, for formatting */
		int hasdata; /* tag contains some data, for formatting */
	};
	135
	136 struct selectornode {
	137 char tagname[256];
	138 long index; /* index of node to match on: -1 if not matching on …
	139 /* attributes */
	140 char id[256];
	141 char classnames[1024];
	142 };
	143
	/* One selector: the original text plus a fixed-size array of node
	   patterns (at most 32). */
	struct selector {
		const char *text; /* selector source text */
		struct selectornode nodes[32];
		int depth; /* NOTE(review): presumably the number of used entries in nodes — confirm */
	};
	149
	/* list of selectors: an array of pointers with its element count */
	struct selectors {
		struct selector **selectors;
		size_t count;
	};
	155
	/* RB tree of link references. Entries are ordered by `url`
	   (see linkrefcmp()). */
	struct linkref {
		char *type;
		enum TagId tagid;
		char *url; /* key used for ordering in the tree */
		int ishidden;
		size_t linknr;
		RB_ENTRY(linkref) entry; /* tree linkage */
	};
	165
	166 /* link references and hidden link references */
	167 static struct linkref **visrefs;
	168 static size_t nvisrefs, ncapvisrefs; /* visible link count / capacity */
	169 static struct linkref **hiddenrefs;
	170 static size_t nhiddenrefs, ncaphiddenrefs; /* hidden link count / capaci…
	171
	172 /* compare link by URL for link references RB-tree */
	173 static int
	174 linkrefcmp(struct linkref r1, struct linkref r2)
	175 {
	176 return strcmp(r1->url, r2->url);
	177 }
	178
	179 RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead);
	180 RB_GENERATE(linkreftree, linkref, entry, linkrefcmp)
	181
	/* fixed text fragments used when rendering lists, form inputs and rulers */
	static const char *str_bullet_item = "* ";
	static const char *str_checkbox_checked = "x";
	static const char *str_ruler = "-";
	static const char *str_radio_checked = "*";
	186
	187 /* base href, to make URLs absolute */
	188 static char basehrefdoc[4096]; /* buffer for base href in document, if a…
	189 static int basehrefset; /* base href set and can be used? */
	190 static struct uri base; /* parsed current base href */
	191
	192 /* buffers for some attributes of the current tag */
	193 static String attr_alt; /* alt attribute */
	194 static String attr_checked; /* checked attribute */
	195 static String attr_class; /* class attribute */
	196 static int attr_class_set; /* class attribute is set already */
	197 static String attr_data; /* data attribute */
	198 static String attr_href; /* href attribute */
	199 static String attr_id; /* id attribute */
	200 static int attr_id_set; /* class attribute is set already */
	201 static String attr_src; /* src attribute */
	202 static String attr_type; /* type attribute */
	203 static String attr_value; /* value attribute */
	204
	205 static String htmldata; /* buffered HTML data near the current tag */
	206
	207 /* for white-space output handling:
	208 1 = whitespace emitted (suppress repeated), 2 = other characters on t…
	209 Behaviour:
	210 * White-space data before non-whitespace data in tags are ignored on …
	211 * Repeated white-space are ignored: a single space (' ') is emitted.
	212 */
	213 static int whitespace_mode;
	214 static int nbytesline; /* bytes on this line */
	215 static int ncells; /* current cell/column count */
	216 static int hadnewline; /* count for repeated newlines */
	217 /* flag for skipping initial white-space in tag: for HTML white-space ha…
	218 static int skipinitialws = 1;
	219 #define DEFAULT_INDENT 2
	220 static const int defaultindent = DEFAULT_INDENT; /* default indent / mar…
	221 static int indent; /* indent for the current line, in columns */
	222 /* previous output sequential newlines, used for calculating margins bet…
	223 elements and reducing excessive newlines */
	224 static int currentnewlines;
	225
	226 /* buffers for line-wrapping (buffer per word boundary) */
	227 static char rbuf[1024];
	228 static int rbuflen;
	229 static int rnbufcells; /* pending cell count to add */
	230
	231 #define MAX_NODE_DEPTH 4096 /* absolute maximum node depth */
	232 static struct node nodes; / node tree (one per level is remembered) */
	233 static String nodes_links; / keep track of links per node */
	234 static size_t ncapnodes; /* current allocated node capacity */
	235 static int curnode; /* current node depth */
	236
	237 /* reader / selector mode (-s) */
	238 static int reader_mode;
	239 /* flag if the tags and their children should be ignored in the current …
	240 static int reader_ignore;
	241
	242 static enum MarkupType curmarkup; /* current markup state (bold, underli…
	243 static int linewrap; /* allow linewrap in this context */
	244
	245 /* selector to match (for -s and -u) */
	246 static struct selectors sel_hide, sel_show;
	247
	248 /* tags table: needs to be sorted like tagcmp(), alphabetically */
	249
	250 /* tag id displaytype markup …
	251 static struct tag tags[] = {
	252 { "a", TagA, DisplayInline, MarkupU…
	253 { "address", TagAddress, DisplayBlock, 0, …
	254 { "area", TagArea, DisplayInline, 0, …
	255 { "article", TagArticle, DisplayBlock, 0, …
	256 { "aside", TagAside, DisplayBlock, 0, …
	257 { "audio", TagAudio, DisplayInline, MarkupU…
	258 { "b", TagB, DisplayInline, MarkupB…
	259 { "base", TagBase, DisplayInline, 0, …
	260 { "blink", TagBlink, DisplayInline, MarkupB…
	261 { "blockquote", TagBlockquote, DisplayBlock, 0, …
	262 { "body", TagBody, DisplayBlock, 0, …
	263 { "br", TagBr, 0, 0, …
	264 { "button", TagButton, DisplayInline \| DisplayButton, 0, …
	265 { "cite", TagCite, DisplayInline, MarkupI…
	266 { "col", TagCol, DisplayInline, 0, …
	267 { "colgroup", TagColgroup, DisplayInline, 0, …
	268 { "datalist", TagDatalist, DisplayNone, 0, …
	269 { "dd", TagDd, DisplayBlock, 0, …
	270 { "del", TagDel, DisplayInline, MarkupS…
	271 { "details", TagDetails, DisplayBlock, 0, …
	272 { "dfn", TagDfn, DisplayInline, MarkupI…
	273 { "dir", TagDir, DisplayList, 0, …
	274 { "div", TagDiv, DisplayBlock, 0, …
	275 { "dl", TagDl, DisplayBlock \| DisplayDl, 0, …
	276 { "dt", TagDt, DisplayBlock, MarkupB…
	277 { "em", TagEm, DisplayInline, MarkupI…
	278 { "embed", TagEmbed, DisplayInline, 0, …
	279 { "fieldset", TagFieldset, DisplayBlock, 0, …
	280 { "figcaption", TagFigcaption, DisplayBlock, 0, …
	281 { "figure", TagFigure, DisplayBlock, 0, …
	282 { "footer", TagFooter, DisplayBlock, 0, …
	283 { "form", TagForm, DisplayBlock, 0, …
	284 { "frame", TagFrame, DisplayInline, 0, …
	285 { "h1", TagH1, DisplayHeader, MarkupB…
	286 { "h2", TagH2, DisplayHeader, MarkupB…
	287 { "h3", TagH3, DisplayHeader, MarkupB…
	288 { "h4", TagH4, DisplayHeader, MarkupB…
	289 { "h5", TagH5, DisplayHeader, MarkupB…
	290 { "h6", TagH6, DisplayHeader, MarkupB…
	291 { "head", TagHead, DisplayBlock, 0, …
	292 { "header", TagHeader, DisplayBlock, 0, …
	293 { "hr", TagHr, DisplayBlock, 0, …
	294 { "html", TagHtml, DisplayBlock, 0, …
	295 { "i", TagI, DisplayInline, MarkupI…
	296 { "iframe", TagIframe, DisplayInline, 0, …
	297 { "img", TagImg, DisplayInline, MarkupU…
	298 { "input", TagInput, DisplayInput, 0, …
	299 { "ins", TagIns, DisplayInline, MarkupU…
	300 { "label", TagLabel, DisplayInline, 0, …
	301 { "legend", TagLegend, DisplayBlock, 0, …
	302 { "li", TagLi, DisplayListItem, 0, …
	303 { "link", TagLink, DisplayInline, 0, …
	304 { "main", TagMain, DisplayBlock, 0, …
	305 { "mark", TagMark, DisplayInline, MarkupR…
	306 { "menu", TagMenu, DisplayList, 0, …
	307 { "meta", TagMeta, DisplayInline, 0, …
	308 { "nav", TagNav, DisplayBlock, 0, …
	309 { "object", TagObject, DisplayInline, 0, …
	310 { "ol", TagOl, DisplayList \| DisplayListOrdered, 0, …
	311 { "option", TagOption, DisplayInline \| DisplayOption, 0, …
	312 { "p", TagP, DisplayBlock, 0, …
	313 { "param", TagParam, DisplayInline, 0, …
	314 { "pre", TagPre, DisplayPre, 0, …
	315 { "s", TagS, DisplayInline, MarkupS…
	316 { "script", TagScript, DisplayNone, 0, …
	317 { "search", TagSearch, DisplayBlock, 0, …
	318 { "section", TagSection, DisplayBlock, 0, …
	319 { "select", TagSelect, DisplayInline \| DisplaySelect, 0, …
	320 { "source", TagSource, DisplayInline, 0, …
	321 { "strike", TagStrike, DisplayInline, MarkupS…
	322 { "strong", TagStrong, DisplayInline, MarkupB…
	323 { "style", TagStyle, DisplayNone, 0, …
	324 { "summary", TagSummary, DisplayBlock, 0, …
	325 { "svg", TagSvg, DisplayNone, 0, …
	326 { "table", TagTable, DisplayTable, 0, …
	327 { "tbody", TagTbody, DisplayInline, 0, …
	328 { "td", TagTd, DisplayTableCell, 0, …
	329 { "template", TagTemplate, DisplayNone, 0, …
	330 { "textarea", TagTextarea, DisplayInline, 0, …
	331 { "tfoot", TagTfoot, DisplayInline, 0, …
	332 { "th", TagTh, DisplayTableCell, MarkupB…
	333 { "thead", TagThead, DisplayInline, 0, …
	334 { "title", TagTitle, DisplayBlock, 0, …
	335 { "tr", TagTr, DisplayTableRow, 0, …
	336 { "track", TagTrack, DisplayInline, 0, …
	337 { "u", TagU, DisplayInline, MarkupU…
	338 { "ul", TagUl, DisplayList, 0, …
	339 { "var", TagVar, DisplayInline, MarkupI…
	340 { "video", TagVideo, DisplayInline, MarkupU…
	341 { "wbr", TagWbr, DisplayInline, 0, …
	342 { "xmp", TagXmp, DisplayPre, 0, …
	343 };
	344
	345 /* hint for compilers and static analyzers that a function exits */
	346 #ifndef __dead
	347 #define __dead
	348 #endif
	349
	350 /* print to stderr, print error message of errno and exit(). */
	351 __dead static void
	352 err(int exitstatus, const char *fmt, ...)
	353 {
	354 va_list ap;
	355 int saved_errno;
	356
	357 saved_errno = errno;
	358
	359 fputs("webdump: ", stderr);
	360 if (fmt) {
	361 va_start(ap, fmt);
	362 vfprintf(stderr, fmt, ap);
	363 va_end(ap);
	364 fputs(": ", stderr);
	365 }
	366 fprintf(stderr, "%s\n", strerror(saved_errno));
	367
	368 exit(exitstatus);
	369 }
	370
	371 /* print to stderr and exit(). */
	372 __dead static void
	373 errx(int exitstatus, const char *fmt, ...)
	374 {
	375 va_list ap;
	376
	377 fputs("webdump: ", stderr);
	378 if (fmt) {
	379 va_start(ap, fmt);
	380 vfprintf(stderr, fmt, ap);
	381 va_end(ap);
	382 }
	383 fputs("\n", stderr);
	384
	385 exit(exitstatus);
	386 }
	387
	/* state for getnext_ignore(): `endtag` is the string to search for,
	   `ignorestate` points at the next character of it still to be matched */
	static const char *ignorestate, *endtag;
	/* the real input reader that getnext_ignore() wraps */
	static int (*getnext)(void);
	390
	391 /* return a space for all data until some case-insensitive string occurs…
	392 is used to parse incorrect HTML/XML that contains unescaped HTML in s…
	393 or style tags. If you see some </script> tag in a CDATA or comment
	394 section then e-mail W3C and tell them the web is too complex. */
	395 static inline int
	396 getnext_ignore(void)
	397 {
	398 int c;
	399
	400 if ((c = getnext()) == EOF)
	401 return EOF;
	402
	403 if (TOLOWER((unsigned char)c) == TOLOWER((unsigned char)*ignores…
	404 ignorestate++;
	405 if (*ignorestate == '\0') {
	406 parser.getnext = getnext; /* restore */
	407 return ' ';
	408 }
	409 } else {
	410 ignorestate = endtag; /* no full match: reset to beginni…
	411 }
	412
	413 return ' '; /* pretend there is just SPACEs */
	414 }
	415
	416 /* Clear string only; don't free, prevents unnecessary reallocation. */
	417 static void
	418 string_clear(String *s)
	419 {
	420 if (s->data)
	421 s->data[0] = '\0';
	422 s->len = 0;
	423 }
	424
	425 static void
	426 string_buffer_realloc(String *s, size_t newlen)
	427 {
	428 size_t alloclen;
	429
	430 for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
	431 ;
	432 if (!(s->data = realloc(s->data, alloclen)))
	433 err(1, "realloc");
	434 s->bufsiz = alloclen;
	435 }
	436
	437 static void
	438 string_append(String s, const char data, size_t len)
	439 {
	440 if (!len)
	441 return;
	442 /* check if allocation is necesary, don't shrink buffer,
	443 * should be more than bufsiz ofcourse. */
	444 if (s->len + len >= s->bufsiz)
	445 string_buffer_realloc(s, s->len + len + 1);
	446 memcpy(s->data + s->len, data, len);
	447 s->len += len;
	448 s->data[s->len] = '\0';
	449 }
	450
	/* strdup() that never returns NULL: exits through err() on failure. */
	static char *
	estrdup(const char *s)
	{
		char *copy = strdup(s);

		if (copy == NULL)
			err(1, "strdup");
		return copy;
	}
	460
	/* strndup() that never returns NULL: exits through err() on failure. */
	static char *
	estrndup(const char *s, size_t n)
	{
		char *copy = strndup(s, n);

		if (copy == NULL)
			err(1, "strndup");
		return copy;
	}
	470
	/* realloc() that never returns NULL: exits through err() on failure. */
	static void *
	erealloc(void *p, size_t siz)
	{
		void *np = realloc(p, siz);

		if (np == NULL)
			err(1, "realloc");
		return np;
	}
	479
	/* calloc() that never returns NULL: exits through err() on failure. */
	static void *
	ecalloc(size_t nmemb, size_t size)
	{
		void *mem = calloc(nmemb, size);

		if (mem == NULL)
			err(1, "calloc");
		return mem;
	}
	489
	/* Check if the string has a non-empty scheme / protocol part: one or more
	   of [A-Za-z0-9+.-] directly followed by ':'. Returns 1 if so, else 0. */
	static int
	uri_hasscheme(const char *s)
	{
		const char *p = s;

		for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) ||
		       *p == '+' || *p == '-' || *p == '.'; p++)
			;
		/* scheme, except if empty and starts with ":" then it is a path */
		return (*p == ':' && p != s);
	}
	502
	503 static int
	504 uri_parse(const char s, struct uri u)
	505 {
	506 const char *p = s;
	507 char *endptr;
	508 size_t i;
	509 long l;
	510
	511 u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
	512 u->path[0] = u->query[0] = u->fragment[0] = '\0';
	513
	514 /* protocol-relative */
	515 if (p == '/' && (p + 1) == '/') {
	516 p += 2; /* skip "//" */
	517 goto parseauth;
	518 }
	519
	520 /* scheme / protocol part */
	521 for (; ISALPHA((unsigned char)p) \|\| ISDIGIT((unsigned char)p) …
	522 p == '+' \|\| p == '-' \|\| *p == '.'; p++)
	523 ;
	524 /* scheme, except if empty and starts with ":" then it is a path…
	525 if (*p == ':' && p != s) {
	526 if ((p + 1) == '/' && (p + 2) == '/')
	527 p += 3; /* skip "://" */
	528 else
	529 p++; /* skip ":" */
	530
	531 if ((size_t)(p - s) >= sizeof(u->proto))
	532 return -1; /* protocol too long */
	533 memcpy(u->proto, s, p - s);
	534 u->proto[p - s] = '\0';
	535
	536 if (*(p - 1) != '/')
	537 goto parsepath;
	538 } else {
	539 p = s; /* no scheme format, reset to start */
	540 goto parsepath;
	541 }
	542
	543 parseauth:
	544 /* userinfo (username:password) */
	545 i = strcspn(p, "@/?#");
	546 if (p[i] == '@') {
	547 if (i >= sizeof(u->userinfo))
	548 return -1; /* userinfo too long */
	549 memcpy(u->userinfo, p, i);
	550 u->userinfo[i] = '\0';
	551 p += i + 1;
	552 }
	553
	554 /* IPv6 address */
	555 if (*p == '[') {
	556 /* bracket not found, host too short or too long */
	557 i = strcspn(p, "]");
	558 if (p[i] != ']' \|\| i < 3)
	559 return -1;
	560 i++; /* including "]" */
	561 } else {
	562 /* domain / host part, skip until port, path or end. */
	563 i = strcspn(p, ":/?#");
	564 }
	565 if (i >= sizeof(u->host))
	566 return -1; /* host too long */
	567 memcpy(u->host, p, i);
	568 u->host[i] = '\0';
	569 p += i;
	570
	571 /* port */
	572 if (*p == ':') {
	573 p++;
	574 if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
	575 return -1; /* port too long */
	576 memcpy(u->port, p, i);
	577 u->port[i] = '\0';
	578 /* check for valid port: range 1 - 65535, may be empty */
	579 errno = 0;
	580 l = strtol(u->port, &endptr, 10);
	581 if (i && (errno \|\| *endptr \|\| l <= 0 \|\| l > 65535))
	582 return -1;
	583 p += i;
	584 }
	585
	586 parsepath:
	587 /* path */
	588 if ((i = strcspn(p, "?#")) >= sizeof(u->path))
	589 return -1; /* path too long */
	590 memcpy(u->path, p, i);
	591 u->path[i] = '\0';
	592 p += i;
	593
	594 /* query */
	595 if (*p == '?') {
	596 p++;
	597 if ((i = strcspn(p, "#")) >= sizeof(u->query))
	598 return -1; /* query too long */
	599 memcpy(u->query, p, i);
	600 u->query[i] = '\0';
	601 p += i;
	602 }
	603
	604 /* fragment */
	605 if (*p == '#') {
	606 p++;
	607 if ((i = strlen(p)) >= sizeof(u->fragment))
	608 return -1; /* fragment too long */
	609 memcpy(u->fragment, p, i);
	610 u->fragment[i] = '\0';
	611 }
	612
	613 return 0;
	614 }
	615
	616 /* Transform and try to make the URI `u` absolute using base URI `b` int…
	617 Follows some of the logic from "RFC 3986 - 5.2.2. Transform Reference…
	618 Returns 0 on success, -1 on error or truncation. */
	619 static int
	620 uri_makeabs(struct uri a, struct uri u, struct uri *b)
	621 {
	622 char *p;
	623 int c;
	624
	625 strlcpy(a->fragment, u->fragment, sizeof(a->fragment));
	626
	627 if (u->proto[0] \|\| u->host[0]) {
	628 strlcpy(a->proto, u->proto[0] ? u->proto : b->proto, siz…
	629 strlcpy(a->host, u->host, sizeof(a->host));
	630 strlcpy(a->userinfo, u->userinfo, sizeof(a->userinfo));
	631 strlcpy(a->host, u->host, sizeof(a->host));
	632 strlcpy(a->port, u->port, sizeof(a->port));
	633 strlcpy(a->path, u->path, sizeof(a->path));
	634 strlcpy(a->query, u->query, sizeof(a->query));
	635 return 0;
	636 }
	637
	638 strlcpy(a->proto, b->proto, sizeof(a->proto));
	639 strlcpy(a->host, b->host, sizeof(a->host));
	640 strlcpy(a->userinfo, b->userinfo, sizeof(a->userinfo));
	641 strlcpy(a->host, b->host, sizeof(a->host));
	642 strlcpy(a->port, b->port, sizeof(a->port));
	643
	644 if (!u->path[0]) {
	645 strlcpy(a->path, b->path, sizeof(a->path));
	646 } else if (u->path[0] == '/') {
	647 strlcpy(a->path, u->path, sizeof(a->path));
	648 } else {
	649 a->path[0] = (b->host[0] && b->path[0] != '/') ? '/' : '…
	650 a->path[1] = '\0';
	651
	652 if ((p = strrchr(b->path, '/'))) {
	653 c = *(++p);
	654 p = '\0'; / temporary NUL-terminate */
	655 if (strlcat(a->path, b->path, sizeof(a->path)) >…
	656 return -1;
	657 p = c; / restore */
	658 }
	659 if (strlcat(a->path, u->path, sizeof(a->path)) >= sizeof…
	660 return -1;
	661 }
	662
	663 if (u->path[0] \|\| u->query[0])
	664 strlcpy(a->query, u->query, sizeof(a->query));
	665 else
	666 strlcpy(a->query, b->query, sizeof(a->query));
	667
	668 return 0;
	669 }
	670
	671 static int
	672 uri_format(char buf, size_t bufsiz, struct uri u)
	673 {
	674 return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s",
	675 u->proto,
	676 u->userinfo[0] ? u->userinfo : "",
	677 u->userinfo[0] ? "@" : "",
	678 u->host,
	679 u->port[0] ? ":" : "",
	680 u->port,
	681 u->host[0] && u->path[0] && u->path[0] != '/' ? "/" : "",
	682 u->path,
	683 u->query[0] ? "?" : "",
	684 u->query,
	685 u->fragment[0] ? "#" : "",
	686 u->fragment);
	687 }
	688
	/* compare tag name (case-insensitive); returns <0, 0 or >0 */
	static int
	tagcmp(const char *s1, const char *s2)
	{
		return strcasecmp(s1, s2);
	}
	695
	/* compare attribute name (case-insensitive); returns <0, 0 or >0 */
	static int
	attrcmp(const char *s1, const char *s2)
	{
		return strcasecmp(s1, s2);
	}
	702
	703 static void
	704 rindent(void)
	705 {
	706 int i, total;
	707
	708 total = indent + defaultindent;
	709 if (total < 0)
	710 total = 0;
	711 for (i = 0; i < total; i++)
	712 putchar(' ');
	713
	714 nbytesline += total;
	715 ncells += total;
	716 }
	717
	718 static void
	719 emitmarkup(int markuptype)
	720 {
	721 if (!allowansi)
	722 return;
	723
	724 if (!markuptype)
	725 fputs("\033[0m", stdout); /* reset all attributes */
	726
	727 /* set */
	728 if (markuptype & MarkupBold)
	729 fputs("\033[1m", stdout);
	730 if (markuptype & MarkupItalic)
	731 fputs("\033[3m", stdout);
	732 if (markuptype & MarkupUnderline)
	733 fputs("\033[4m", stdout);
	734 if (markuptype & MarkupBlink)
	735 fputs("\033[5m", stdout);
	736 if (markuptype & MarkupReverse)
	737 fputs("\033[7m", stdout);
	738 if (markuptype & MarkupStrike)
	739 fputs("\033[9m", stdout);
	740 }
	741
	742 /* flush remaining buffer (containing a word): used for word-wrap handli…
	743 static void
	744 hflush(void)
	745 {
	746 int i;
	747
	748 if (!rbuflen)
	749 return;
	750
	751 if (!nbytesline) {
	752 if (curmarkup)
	753 emitmarkup(0);
	754 rindent();
	755 /* emit code again per line, needed for GNU/less -R */
	756 if (curmarkup)
	757 emitmarkup(curmarkup);
	758 }
	759
	760 for (i = 0; i < rbuflen; i++)
	761 putchar(rbuf[i]);
	762
	763 nbytesline += rbuflen;
	764 ncells += rnbufcells;
	765 rbuflen = 0;
	766 rnbufcells = 0;
	767 }
	768
	769 static void
	770 printansi(const char *s)
	771 {
	772 size_t len;
	773
	774 if (!allowansi)
	775 return;
	776
	777 if (linewrap) {
	778 len = strlen(s);
	779 if (rbuflen + len + 1 >= sizeof(rbuf))
	780 hflush();
	781 if (rbuflen + len + 1 < sizeof(rbuf)) {
	782 memcpy(rbuf + rbuflen, s, len);
	783 rbuflen += len;
	784 /* NOTE: nbytesline and ncells are not counted f…
	785 }
	786 } else {
	787 fputs(s, stdout);
	788 }
	789 }
	790
	791 static void
	792 setmarkup(int markuptype)
	793 {
	794 if (!allowansi)
	795 return;
	796
	797 /* need change? */
	798 if (curmarkup == markuptype)
	799 return;
	800
	801 if (!markuptype) {
	802 printansi("\033[0m"); /* reset all attributes */
	803 curmarkup = markuptype;
	804 return;
	805 }
	806
	807 /* set */
	808 if (!(curmarkup & MarkupBold) && (markuptype & MarkupBold))
	809 printansi("\033[1m");
	810 if (!(curmarkup & MarkupItalic) && (markuptype & MarkupItalic))
	811 printansi("\033[3m");
	812 if (!(curmarkup & MarkupUnderline) && (markuptype & MarkupUnderl…
	813 printansi("\033[4m");
	814 if (!(curmarkup & MarkupBlink) && (markuptype & MarkupBlink))
	815 printansi("\033[5m");
	816 if (!(curmarkup & MarkupReverse) && (markuptype & MarkupReverse))
	817 printansi("\033[7m");
	818 if (!(curmarkup & MarkupStrike) && (markuptype & MarkupStrike))
	819 printansi("\033[9m");
	820
	821 /* unset */
	822 if ((curmarkup & MarkupBold) && !(markuptype & MarkupBold))
	823 printansi("\033[22m"); /* reset bold or faint */
	824 if ((curmarkup & MarkupItalic) && !(markuptype & MarkupItalic))
	825 printansi("\033[23m"); /* reset italic */
	826 if ((curmarkup & MarkupUnderline) && !(markuptype & MarkupUnderl…
	827 printansi("\033[24m"); /* reset underline */
	828 if ((curmarkup & MarkupBlink) && !(markuptype & MarkupBlink))
	829 printansi("\033[25m"); /* reset blink */
	830 if ((curmarkup & MarkupReverse) && !(markuptype & MarkupReverse))
	831 printansi("\033[27m"); /* reset reverse */
	832 if ((curmarkup & MarkupStrike) && !(markuptype & MarkupStrike))
	833 printansi("\033[29m"); /* reset strike */
	834
	835 curmarkup = markuptype;
	836 }
	837
	838 static void
	839 startmarkup(int markuptype)
	840 {
	841 setmarkup(curmarkup \| markuptype);
	842 }
	843
	844 static void
	845 endmarkup(int markuptype)
	846 {
	847 setmarkup(curmarkup & ~markuptype);
	848 }
	849
	/* Rough cell width of one byte of UTF-8 text: the first byte of a
	   codepoint counts as 1 cell, continuation bytes as 0 and a TAB as 8.
	   NOTE: this is of course incorrect since characters can be 2 cells wide
	   as well; in the future maybe replace this with wcwidth() or similar. */
	static int
	utfwidth(int c)
	{
		/* count TAB as 8 */
		if (c == '\t')
			return 8;
		/* bytes of the form 10xxxxxx are not the start of a codepoint */
		return ((c & 0xc0) == 0x80) ? 0 : 1;
	}
	865
	866 /* write a character, handling state of repeated newlines, some HTML
	867 white-space rules, indentation and word-wrapping */
	868 static void
	869 hputchar(int c)
	870 {
	871 struct node *cur = &nodes[curnode];
	872 cur->hasdata = 1;
	873
	874 if (c == '\n') {
	875 /* previous line had characters, so not a repeated newli…
	876 if (nbytesline > 0)
	877 hadnewline = 0;
	878
	879 /* start a new line, no chars on this line yet */
	880 whitespace_mode &= ~2; /* no chars on this line yet */
	881 nbytesline = 0;
	882 ncells = 0;
	883
	884 if (hadnewline)
	885 currentnewlines++; /* repeating newlines */
	886 hadnewline = 1;
	887 } else {
	888 hadnewline = 0;
	889 currentnewlines = 0;
	890 }
	891
	892 /* skip initial/leading white-space */
	893 if (ISSPACE((unsigned char)c)) {
	894 if (skipinitialws)
	895 return;
	896 } else {
	897 skipinitialws = 0;
	898 }
	899
	900 if (!(c == '\n' \|\| c == '\t' \|\| !ISCNTRL((unsigned char)c)))
	901 return;
	902
	903 if (!linewrap) {
	904 if (c == '\n') {
	905 putchar('\n');
	906 nbytesline = 0;
	907 ncells = 0;
	908 } else {
	909 if (!nbytesline) {
	910 if (curmarkup)
	911 emitmarkup(0);
	912 rindent();
	913 /* emit code again per line, needed for …
	914 if (curmarkup)
	915 emitmarkup(curmarkup);
	916 }
	917 putchar(c);
	918 nbytesline++;
	919 ncells += utfwidth(c);
	920 }
	921 return;
	922 }
	923
	924 /* really too long: the whole word doesn't even fit, flush it */
	925 if (ncells + rnbufcells >= termwidth \|\| rbuflen >= sizeof(rbuf) …
	926 putchar('\n');
	927 nbytesline = 0;
	928 ncells = 0;
	929 hflush();
	930 }
	931
	932 if (c == '\n') {
	933 putchar('\n');
	934 hflush();
	935 return;
	936 } else if (ISSPACE((unsigned char)c) \|\| c == '-') {
	937 if (ncells + rnbufcells >= termwidth) {
	938 putchar('\n');
	939 nbytesline = 0;
	940 ncells = 0;
	941 }
	942 rbuf[rbuflen++] = c;
	943 rnbufcells += utfwidth(c);
	944 hflush();
	945 return;
	946 }
	947
	948 rbuf[rbuflen++] = c;
	949 rnbufcells += utfwidth(c);
	950 }
	951
	952 /* calculate indentation of current node depth, using the sum of each
	953 indentation per node */
	954 static int
	955 calcindent(void)
	956 {
	957 int i, n = 0;
	958
	959 for (i = curnode; i >= 0; i--)
	960 n += nodes[i].indent;
	961
	962 return n;
	963 }
	964
	/* write a NUL-terminated string byte by byte through hputchar() */
	static void
	hprint(const char *s)
	{
		while (*s)
			hputchar(*s++);
	}
	971
	/* printf() on top of hprint(). The formatted text is limited to the
	   256-byte local buffer (including the NUL); longer output is cut off. */
	static void
	hprintf(const char *fmt, ...)
	{
		char text[256];
		va_list args;

		va_start(args, fmt);
		vsnprintf(text, sizeof(text), fmt, args);
		va_end(args);

		/* use hprint() formatting logic. */
		hprint(text);
	}
	986
	987 static void
	988 newline(void)
	989 {
	990 if (skipinitialws)
	991 return;
	992 hputchar('\n');
	993 }
	994
	995 static int
	996 parentcontainerhasdata(int curtype, int n)
	997 {
	998 int i;
	999
	1000 for (i = n; i >= 0; i--) {
	1001 if (nodes[i].tag.displaytype & (DisplayList\|DisplayTable…
	1002 break;
	1003 if (nodes[i].hasdata)
	1004 return 1;
	1005 }
	1006
	1007 return 0;
	1008 }
	1009
	1010 /* start on a newline for the start of a block element or not */
	1011 static void
	1012 startblock(void)
	1013 {
	1014 hflush();
	1015 whitespace_mode &= ~2; /* no characters on this line yet */
	1016 if (nbytesline <= 0)
	1017 return;
	1018 if (!hadnewline && curnode >= 0 && nodes[curnode - 1].hasdata)
	1019 hputchar('\n');
	1020 }
	1021
	1022 /* start on a newline for the end of a block element or not */
	1023 static void
	1024 endblock(void)
	1025 {
	1026 hflush();
	1027 whitespace_mode &= ~2; /* no characters on this line yet */
	1028 if (nbytesline <= 0)
	1029 return;
	1030 if (!hadnewline)
	1031 hputchar('\n');
	1032 }
	1033
	1034 /* print one character safely: no control characters,
	1035 handle HTML white-space rules */
	1036 static void
	1037 printc(int c)
	1038 {
	1039 if (ISSPACE((unsigned char)c)) {
	1040 if (whitespace_mode == 2)
	1041 hputchar(' ');
	1042 whitespace_mode \|= 1;
	1043 } else {
	1044 whitespace_mode = 2;
	1045 if (!ISCNTRL((unsigned char)c))
	1046 hputchar(c);
	1047 }
	1048 }
	1049
	1050 static void
	1051 printpre(const char *s, size_t len)
	1052 {
	1053 struct node *cur;
	1054 size_t i;
	1055
	1056 /* reset state of newlines because this data is printed literall…
	1057 hadnewline = 0;
	1058 currentnewlines = 0;
	1059
	1060 /* skip leading newline */
	1061 i = 0;
	1062 if (skipinitialws) {
	1063 if (*s == '\n' && i < len) {
	1064 s++;
	1065 i++;
	1066 }
	1067 }
	1068
	1069 hflush();
	1070
	1071 skipinitialws = 0;
	1072
	1073 if (*s) {
	1074 cur = &nodes[curnode];
	1075 cur->hasdata = 1;
	1076 }
	1077
	1078 for (; *s && i < len; s++, i++) {
	1079 switch (*s) {
	1080 case '\n':
	1081 putchar('\n');
	1082 nbytesline = 0;
	1083 ncells = 0;
	1084 break;
	1085 case '\t':
	1086 hadnewline = 0;
	1087 if (!nbytesline) {
	1088 if (curmarkup)
	1089 emitmarkup(0);
	1090 rindent();
	1091 /* emit code again per line, needed for …
	1092 if (curmarkup)
	1093 emitmarkup(curmarkup);
	1094 }
	1095
	1096 /* TAB to 8 spaces */
	1097 fputs(" ", stdout);
	1098 nbytesline += 8;
	1099 ncells += 8;
	1100 break;
	1101 default:
	1102 if (ISCNTRL((unsigned char)*s))
	1103 continue;
	1104
	1105 if (!nbytesline) {
	1106 if (curmarkup)
	1107 emitmarkup(0);
	1108 rindent();
	1109 /* emit code again per line, needed for …
	1110 if (curmarkup)
	1111 emitmarkup(curmarkup);
	1112 }
	1113
	1114 putchar(*s);
	1115 nbytesline++;
	1116 /* start of rune: incorrectly assume 1 rune is 1…
	1117 ncells += utfwidth((unsigned char)*s);
	1118 }
	1119 }
	1120 }
	1121
	1122 static struct node *
	1123 findparenttype(int cur, int findtype)
	1124 {
	1125 int i;
	1126
	1127 for (i = cur; i >= 0; i--) {
	1128 if ((nodes[i].tag.displaytype & findtype))
	1129 return &nodes[i];
	1130 }
	1131 return NULL;
	1132 }
	1133
	/* Check whether the class name needle occurs as a complete
	   white-space separated word in the class attribute value haystack.
	   Returns 1 on a match, else 0. An empty needle never matches. */
	static int
	isclassmatch(const char *haystack, const char *needle)
	{
		/* the same set of characters the ISSPACE() macro accepts */
		static const char ws[] = " \t\n\v\f\r";
		const char *p;
		size_t needlelen, wordlen;

		needlelen = strlen(needle);
		if (!needlelen)
			return 0;

		for (p = haystack; *p; p += wordlen) {
			p += strspn(p, ws); /* skip separators before the word */
			wordlen = strcspn(p, ws);
			/* compare the whole word: a class that merely ends with or
			   contains the needle (e.g. "foobar" for "bar") must not match */
			if (wordlen == needlelen && !memcmp(p, needle, needlelen))
				return 1;
		}

		return 0;
	}
	1159
	1160 /* very limited CSS-like selector, supports: main, main#id, main.class,
	1161 ".class", "#id", "ul li a" */
	1162 static int
	1163 compileselector(const char sel, struct selectornode nodes, size_t maxn…
	1164 {
	1165 int depth = 0, len;
	1166 long l;
	1167 const char s, start;
	1168 char tmp[256];
	1169 int nameset = 0;
	1170
	1171 memset(&nodes[0], 0, sizeof(nodes[0]));
	1172 nodes[0].index = -1;
	1173
	1174 s = sel;
	1175 for (; s && ISSPACE((unsigned char)s); s++)
	1176 ;
	1177
	1178 start = s;
	1179 for (; ; s++) {
	1180 /* end of tag */
	1181 if (!nameset &&
	1182 (s == '#' \|\| s == '.' \|\| *s == '@' \|\|
	1183 s == '\0' \|\| ISSPACE((unsigned char)s))) {
	1184 nameset = 1;
	1185 len = s - start; /* tag name */
	1186 if (len >= sizeof(tmp))
	1187 return 0;
	1188 if (len)
	1189 memcpy(tmp, start, len);
	1190 tmp[len] = '\0';
	1191
	1192 memcpy(nodes[depth].tagname, tmp, len + 1);
	1193 }
	1194
	1195 /* end */
	1196 if (s == '\0' \|\| ISSPACE((unsigned char)s)) {
	1197 for (; ISSPACE((unsigned char)*s); s++)
	1198 ;
	1199 start = s; /* start of a new tag */
	1200 depth++;
	1201 if (depth >= maxnodes)
	1202 return 0;
	1203
	1204 nameset = 0;
	1205 memset(&nodes[depth], 0, sizeof(nodes[depth]));
	1206 nodes[depth].index = -1;
	1207
	1208 /* end of selector */
	1209 if (*s == '\0')
	1210 break;
	1211 }
	1212
	1213 /* index */
	1214 if (*s == '@') {
	1215 len = strcspn(s + 1, ".#@ \t\n");
	1216 if (len >= sizeof(tmp))
	1217 return 0;
	1218 memcpy(tmp, s + 1, len);
	1219 tmp[len] = '\0';
	1220
	1221 l = strtol(tmp, NULL, 10);
	1222 if (l >= 0)
	1223 nodes[depth].index = l;
	1224 s += len;
	1225 start = s + 1;
	1226 continue;
	1227 }
	1228
	1229 /* id */
	1230 if (*s == '#') {
	1231 len = strcspn(s + 1, ".#@ \t\n");
	1232 if (len >= sizeof(tmp))
	1233 return 0;
	1234 memcpy(tmp, s + 1, len);
	1235 tmp[len] = '\0';
	1236 memcpy(nodes[depth].id, tmp, len + 1);
	1237 s += len;
	1238 start = s + 1;
	1239 continue;
	1240 }
	1241
	1242 /* class */
	1243 if (*s == '.') {
	1244 len = strcspn(s + 1, ".#@ \t\n");
	1245 if (len >= sizeof(tmp))
	1246 return 0;
	1247 memcpy(tmp, s + 1, len);
	1248 tmp[len] = '\0';
	1249 /* allow only one classname for now */
	1250 memcpy(nodes[depth].classnames, tmp, len + 1);
	1251 s += len;
	1252 start = s + 1;
	1253 continue;
	1254 }
	1255 }
	1256
	1257 return depth;
	1258 }
	1259
	1260 static struct selector *
	1261 newselector(const char *q)
	1262 {
	1263 struct selector *sel;
	1264 int r;
	1265
	1266 sel = ecalloc(1, sizeof(*sel));
	1267 sel->text = estrdup(q);
	1268
	1269 r = compileselector(sel->text, sel->nodes, LEN(sel->nodes));
	1270 if (r <= 0) {
	1271 free(sel);
	1272 return NULL;
	1273 }
	1274 sel->depth = r;
	1275
	1276 return sel;
	1277 }
	1278
	1279 static struct selectors *
	1280 compileselectors(const char *q)
	1281 {
	1282 struct selectors *sels = NULL;
	1283 struct selector *sel;
	1284 const char *start;
	1285 char *qe;
	1286 int count = 0;
	1287 size_t siz;
	1288
	1289 sels = ecalloc(1, sizeof(*sels));
	1290
	1291 start = q;
	1292 for (; ; q++) {
	1293 if (q == ',' \|\| q == '\0') {
	1294 qe = estrndup(start, q - start);
	1295 sel = newselector(qe);
	1296 free(qe);
	1297
	1298 /* add new selector */
	1299 siz = (count + 1) * sizeof(struct selector *);
	1300 sels->selectors = erealloc(sels->selectors, siz);
	1301 sels->selectors[count] = sel;
	1302 count++;
	1303
	1304 if (*q == '\0')
	1305 break;
	1306 start = q + 1;
	1307 }
	1308 }
	1309 sels->count = count;
	1310
	1311 return sels;
	1312 }
	1313
	/* Match one compiled selector against a path of nodes.
	   The nodes root[0] .. root[maxdepth] are visited in that order. Each node
	   is compared with the selector part that is currently expected (md): the
	   tag name (case-insensitive), the id and the class name are only compared
	   when the selector part sets them. An index only matches when the node
	   has a parent entry (d > 0) and equals that parent's nchildren - 1.
	   Returns 1 as soon as all sel->depth parts matched in order, else 0. */
	1314 /* very limited CSS-like matcher, supports: main, main#id, main.class,
	1315 ".class", "#id", "ul li a" */
	1316 static int
	1317 iscssmatch(struct selector sel, struct node root, int maxdepth)
	1318 {
	1319 int d, md = 0;
	1320
	1321 for (d = 0; d <= maxdepth; d++) {
	1322 /* tag matched? */
	1323 if (sel->nodes[md].tagname[0] &&
	1324 strcasecmp(sel->nodes[md].tagname, root[d].tagname))
	1325 continue; /* no */
	1326
	1327 /* id matched? */
	1328 if (sel->nodes[md].id[0] && strcmp(sel->nodes[md].id, ro…
	1329 continue; /* no */
	1330
	1331 /* class matched, for now allow only one classname in th…
	1332 matching multiple classnames */
	1333 if (sel->nodes[md].classnames[0] &&
	1334 !isclassmatch(root[d].classnames, sel->nodes[md].cla…
	1335 continue; /* no */
	1336
	1337 /* index matched */
	1338 if (sel->nodes[md].index != -1 &&
	1339 (d == 0 \|\|
	1340 root[d - 1].nchildren == 0 \|\|
	1341 sel->nodes[md].index != root[d - 1].nchildren - 1))
	1342 continue;
	1343
	/* this node satisfied selector part md: expect the next part from here */
	1344 md++;
	1345 /* all matched of one selector */
	1346 if (md == sel->depth)
	1347 return 1;
	1348 }
	1349
	1350 return 0;
	1351 }
	1352
	1353 static int
	1354 iscssmatchany(struct selectors sels, struct node root, int maxdepth)
	1355 {
	1356 struct selector *sel;
	1357 int i;
	1358
	1359 for (i = 0; i < sels->count; i++) {
	1360 sel = sels->selectors[i];
	1361 if (iscssmatch(sel, root, maxdepth))
	1362 return 1;
	1363 }
	1364 return 0;
	1365 }
	1366
	1367 static void
	1368 handleinlinealt(void)
	1369 {
	1370 struct node *cur;
	1371 char start, s, *e;
	1372
	1373 /* do not show the alt text if the element is hidden */
	1374 cur = &nodes[curnode];
	1375 if (cur->tag.displaytype & DisplayNone)
	1376 return;
	1377
	1378 /* show img alt attribute as text. */
	1379 if (attr_alt.len) {
	1380 start = attr_alt.data;
	1381 e = attr_alt.data + attr_alt.len;
	1382
	1383 for (s = start; s < e; s++)
	1384 printc((unsigned char)*s);
	1385 hflush();
	1386 } else if (cur->tag.id == TagImg && !showurlinline) {
	1387 /* if there is no alt text and no URL is shown inline, t…
	1388 show "[IMG]" to indicate there was an image there */
	1389 hprint("[IMG]");
	1390 }
	1391 }
	1392
	1393 /* lookup a link reference by url in the red-black tree */
	1394 static struct linkref *
	1395 findlinkref(const char *url)
	1396 {
	1397 struct linkref find;
	1398
	1399 find.url = (char *)url;
	1400
	1401 return RB_FIND(linkreftree, &linkrefhead, &find);
	1402 }
	1403
	/* Parameters:
	   url:      URL of the reference, a copy is stored.
	   _type:    type label, a copy is stored; replaced by "link" for TagA.
	   tagid:    tag the reference belongs to.
	   ishidden: non-zero adds the reference to the hidden list (hiddenrefs),
	             zero to the visible list (visrefs).
	   The link number is the 1-based position in the chosen list. Both lists
	   grow by 256 entries at a time. When uniqrefs is set an existing
	   reference with the same URL is returned, whatever its ishidden value. */
	1404 /* add a link reference. Returns the added link reference, or the existi…
	1405 reference if links are deduplicated */
	1406 static struct linkref *
	1407 addlinkref(const char url, const char _type, enum TagId tagid, int ish…
	1408 {
	1409 struct linkref *link;
	1410 size_t linknr;
	1411
	1412 /* if links are deduplicates return the existing link */
	1413 if (uniqrefs && (link = findlinkref(url)))
	1414 return link;
	1415
	/* anchors always get the type "link" */
	1416 if (tagid == TagA)
	1417 _type = "link";
	1418
	1419 link = ecalloc(1, sizeof(*link));
	1420
	1421 if (!ishidden) {
	1422 linknr = ++nvisrefs;
	1423 if (nvisrefs >= ncapvisrefs) {
	1424 ncapvisrefs += 256; /* greedy alloc */
	1425 visrefs = erealloc(visrefs, sizeof(visrefs) n…
	1426 }
	1427 visrefs[linknr - 1] = link; /* add pointer to list */
	1428 } else {
	1429 linknr = ++nhiddenrefs;
	1430 if (nhiddenrefs >= ncaphiddenrefs) {
	1431 ncaphiddenrefs += 256; /* greedy alloc */
	1432 hiddenrefs = erealloc(hiddenrefs, sizeof(*hidden…
	1433 }
	1434 hiddenrefs[linknr - 1] = link; /* add pointer to list */
	1435 }
	1436
	1437 link->url = estrdup(url);
	1438 link->type = estrdup(_type);
	1439 link->tagid = tagid;
	1440 link->ishidden = ishidden;
	1441 link->linknr = linknr;
	1442
	1443 /* add to tree: the tree is only used for checking unique link r…
	1444 if (uniqrefs)
	1445 RB_INSERT(linkreftree, &linkrefhead, link);
	1446
	1447 return link;
	1448 }
	1449
	/* Collect the link target of the current tag.
	   The URL is taken from the collected src attribute, else href, else data.
	   If it has no scheme and a base href is set, it is made absolute with
	   the base; when that fails or the result does not fit in buf, the
	   original URL is kept.
	   For a visible element the URL is stored in nodes_links[curnode]. For a
	   hidden element (DisplayNone) it is added as a hidden reference at once. */
	1450 static void
	1451 handleinlinelink(void)
	1452 {
	1453 struct uri newuri, olduri;
	1454 struct node *cur;
	1455 char buf[4096], *url;
	1456 int r;
	1457
	1458 if (!showrefbottom && !showrefinline && !showurlinline && !resou…
	1459 return; /* there is no need to collect the reference */
	1460
	1461 if (!attr_href.len && !attr_src.len && !attr_data.len)
	1462 return; /* there is no reference */
	1463
	1464 /* by default use the original URL */
	1465 if (attr_src.len)
	1466 url = attr_src.data;
	1467 else if (attr_href.len)
	1468 url = attr_href.data;
	1469 else
	1470 url = attr_data.data;
	1471
	1472 if (!url)
	1473 return;
	1474
	1475 /* Not an absolute URL yet: try to make it absolute.
	1476 If it is not possible use the relative URL */
	1477 if (!uri_hasscheme(url) && basehrefset &&
	1478 uri_parse(url, &olduri) != -1 &&
	1479 uri_makeabs(&newuri, &olduri, &base) != -1 &&
	1480 newuri.proto[0]) {
	1481 r = uri_format(buf, sizeof(buf), &newuri);
	/* only use the formatted URL when it was not truncated */
	1482 if (r >= 0 && (size_t)r < sizeof(buf))
	1483 url = buf;
	1484 }
	1485
	1486 if (!url[0])
	1487 return;
	1488
	1489 cur = &nodes[curnode];
	1490
	/* visible element: remember the URL for this node depth */
	1491 if (!(cur->tag.displaytype & DisplayNone)) {
	1492 string_clear(&nodes_links[curnode]);
	1493 string_append(&nodes_links[curnode], url, strlen(url));
	1494 }
	1495
	1496 /* add hidden links directly to the reference,
	1497 the order doesn't matter */
	1498 if (cur->tag.displaytype & DisplayNone)
	1499 addlinkref(url, cur->tag.name, cur->tag.id, 1);
	1500 }
	1501
	/* Print the collected link references.
	   Does nothing when there are neither visible nor hidden references.
	   When the resources option is set, every reference is first written as a
	   "type<TAB>url" line to file descriptor 3, visible ones before hidden ones.
	   Then a "References" list is printed to stdout, followed by a
	   "Hidden references" list if there are hidden references. */
	1502 static void
	1503 printlinkrefs(void)
	1504 {
	1505 struct linkref *ref;
	1506 size_t i;
	1507
	1508 if (!nvisrefs && !nhiddenrefs)
	1509 return;
	1510
	1511 if (resources) {
	1512 for (i = 0; i < nvisrefs; i++) {
	1513 ref = visrefs[i];
	1514 dprintf(3, "%s\t%s\n", ref->type, ref->url);
	1515 }
	1516 for (i = 0; i < nhiddenrefs; i++) {
	1517 ref = hiddenrefs[i];
	1518 dprintf(3, "%s\t%s\n", ref->type, ref->url);
	1519 }
	1520 }
	1521
	/* NOTE(review): this header is printed even when only hidden references
	   exist -- confirm that is intended */
	1522 printf("\nReferences\n\n");
	1523
	1524 for (i = 0; i < nvisrefs; i++) {
	1525 ref = visrefs[i];
	1526 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->ty…
	1527 }
	1528
	1529 if (nhiddenrefs > 0)
	1530 printf("\n\nHidden references\n\n");
	1531 /* hidden links don't have a link number, just count them */
	1532 for (i = 0; i < nhiddenrefs; i++) {
	1533 ref = hiddenrefs[i];
	1534 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->ty…
	1535 }
	1536 }
	1537
	1538 /* size to grow node capacity (greedy) */
	1539 #define NODE_CAP_INC 16
	1540
	/* Move to the next node depth: curnode is incremented first.
	   The program exits through errx() when curnode reaches MAX_NODE_DEPTH.
	   When curnode reaches the capacity ncapnodes, the arrays nodes and
	   nodes_links are reallocated and the new region is cleared; every new
	   node gets the display type DisplayInline and a tag name pointer to its
	   own tagname buffer. The capacity then grows by NODE_CAP_INC. */
	1541 /* increase node depth, allocate space for nodes if needed */
	1542 static void
	1543 incnode(void)
	1544 {
	1545 size_t i;
	1546
	1547 curnode++;
	1548
	1549 if (curnode >= MAX_NODE_DEPTH)
	1550 errx(1, "max node depth reached: %d", curnode);
	1551
	1552 if (curnode >= ncapnodes) {
	1553 nodes = erealloc(nodes, sizeof(nodes) (ncapnodes + NO…
	1554 nodes_links = erealloc(nodes_links, sizeof(*nodes_links)…
	1555
	1556 /* clear new region */
	1557 memset(&nodes[ncapnodes], 0, sizeof(nodes) NODE_CAP_I…
	1558 memset(&nodes_links[ncapnodes], 0, sizeof(*nodes_links) …
	1559
	/* set the defaults of the new nodes */
	1560 for (i = ncapnodes; i < ncapnodes + NODE_CAP_INC; i++) {
	1561 nodes[i].tag.displaytype = DisplayInline;
	1562 nodes[i].tag.name = nodes[i].tagname; /* assign …
	1563 }
	1564
	1565 ncapnodes += NODE_CAP_INC; /* greedy alloc */
	1566 }
	1567 }
	1568
	/* Callback for the start of a data section. The body is empty: nothing is
	   done here, the data is processed in xmldata() and xmldataend(). */
	1569 static void
	1570 xmldatastart(XMLParser *p)
	1571 {
	1572 }
	1573
	1574 static void
	1575 xmldataend(XMLParser *p)
	1576 {
	1577 struct node *cur;
	1578 char start, s, *e;
	1579
	1580 if (!htmldata.data \|\| !htmldata.len)
	1581 return;
	1582
	1583 cur = &nodes[curnode];
	1584
	1585 if (reader_ignore \|\| (cur->tag.displaytype & DisplayNone)) {
	1586 /* print nothing */
	1587 } else if ((cur->tag.displaytype & DisplayPre) \|\|
	1588 findparenttype(curnode - 1, DisplayPre)) {
	1589 printpre(htmldata.data, htmldata.len);
	1590 } else {
	1591 start = htmldata.data;
	1592 e = htmldata.data + htmldata.len;
	1593
	1594 for (s = start; s < e; s++)
	1595 printc((unsigned char)*s);
	1596 }
	1597
	1598 string_clear(&htmldata);
	1599 }
	1600
	1601 static void
	1602 xmldata(XMLParser p, const char data, size_t datalen)
	1603 {
	1604 struct node *cur;
	1605
	1606 if (reader_ignore)
	1607 return;
	1608
	1609 cur = &nodes[curnode];
	1610 if (cur->tag.displaytype & DisplayNone)
	1611 return;
	1612
	1613 string_append(&htmldata, data, datalen);
	1614 }
	1615
	1616 static void
	1617 xmldataentity(XMLParser p, const char data, size_t datalen)
	1618 {
	1619 struct node *cur;
	1620 char buf[16];
	1621 int n;
	1622
	1623 if (reader_ignore)
	1624 return;
	1625
	1626 cur = &nodes[curnode];
	1627 if (cur->tag.displaytype & DisplayNone)
	1628 return;
	1629
	1630 n = xml_entitytostr(data, buf, sizeof(buf));
	1631 if (n > 0)
	1632 xmldata(p, buf, (size_t)n);
	1633 else
	1634 xmldata(p, data, datalen);
	1635 }
	1636
	/* Callback for the start of a CDATA section: forwards to xmldatastart(),
	   so CDATA is started the same way as regular data. */
	1637 static void
	1638 xmlcdatastart(XMLParser *p)
	1639 {
	1640 xmldatastart(p);
	1641 }
	1642
	/* Callback for the end of a CDATA section: forwards to xmldataend(), which
	   prints and clears the collected data. */
	1643 static void
	1644 xmlcdataend(XMLParser *p)
	1645 {
	1646 xmldataend(p); /* treat CDATA as data */
	1647 }
	1648
	/* Callback for a piece of CDATA: forwards data and datalen unchanged to
	   xmldata(). */
	1649 static void
	1650 xmlcdata(XMLParser p, const char data, size_t datalen)
	1651 {
	1652 xmldata(p, data, datalen); /* treat CDATA as data */
	1653 }
	1654
	1655 /* lookup function to compare tag name (case-insensitive) for sort funct…
	1656 static int
	1657 findtagcmp(const void v1, const void v2)
	1658 {
	1659 struct tag t1 = (struct tag )v1;
	1660 struct tag t2 = (struct tag )v2;
	1661
	1662 return strcasecmp(t1->name, t2->name);
	1663 }
	1664
	/* Look up the metadata of the tag named t in the tags table.
	   The lookup uses bsearch() with a case-insensitive name comparison, so
	   the table is presumably sorted that way -- verify against its definition.
	   Returns the table entry, or NULL when the tag name is not in the table. */
	1665 /* binary search tag by tag name */
	1666 static struct tag *
	1667 findtag(const char *t)
	1668 {
	1669 struct tag find = { 0 };
	1670
	/* only the name member of the search key is used by the comparison */
	1671 find.name = t;
	1672
	1673 return bsearch(&find, tags, LEN(tags), sizeof(*tags), findtagcmp…
	1674 }
	1675
	/* Write the output that belongs to the end of a tag.
	   tag: metadata of the tag that ends.
	   Does nothing for a hidden tag (DisplayNone) or when reader_ignore is set.
	   A button or option is closed with "]". Block-like display types end the
	   current line through endblock(). Finally the bottom margin of the tag
	   is written as newlines, counted from currentnewlines; for a list that
	   has a list as parent the margin is one line less. */
	1676 static void
	1677 handleendtag(struct tag *tag)
	1678 {
	1679 int i, marginbottom;
	1680
	1681 if (tag->displaytype & DisplayNone)
	1682 return;
	1683 if (reader_ignore)
	1684 return;
	1685
	1686 if (tag->displaytype & (DisplayButton \| DisplayOption)) {
	1687 hputchar(']');
	1688 hflush();
	1689 }
	1690
	1691 if (tag->displaytype & (DisplayBlock \| DisplayHeader \| DisplayTa…
	1692 DisplayList \| DisplayListItem \| DisplayPre)) {
	1693 endblock(); /* break line if needed */
	1694 }
	1695
	1696 /* when a list ends and its not inside a list add an extra botto…
	1697 marginbottom = tag->marginbottom;
	1698
	1699 if (marginbottom > 0) {
	1700 if (tag->displaytype & DisplayList) {
	1701 if (findparenttype(curnode - 1, DisplayList))
	1702 marginbottom--;
	1703 }
	1704 }
	1705
	1706 if (marginbottom > 0) {
	1707 hflush();
	1708 for (i = currentnewlines; i < marginbottom; i++) {
	1709 putchar('\n');
	1710 nbytesline = 0;
	1711 ncells = 0;
	1712 currentnewlines++;
	1713 }
	1714 hadnewline = 1;
	1715 }
	1716 }
	1717
	1718 static void
	1719 endnode(struct node *cur)
	1720 {
	1721 struct linkref *ref;
	1722 int i, ishidden;
	1723
	1724 /* set a flag indicating the element and its parent containers h…
	1725 This is used for some formatting */
	1726 if (cur->hasdata) {
	1727 for (i = curnode; i >= 0; i--)
	1728 nodes[i].hasdata = 1;
	1729 }
	1730
	1731 endmarkup(cur->tag.markuptype);
	1732
	1733 ishidden = reader_ignore \|\| (cur->tag.displaytype & DisplayNone);
	1734
	1735 /* add link and show the link number in the visible order */
	1736 if (!ishidden && nodes_links[curnode].len > 0) {
	1737 ref = addlinkref(nodes_links[curnode].data,
	1738 cur->tag.name, cur->tag.id, ishidden);
	1739
	1740 if (showrefinline \|\| showurlinline) {
	1741 hflush();
	1742 startmarkup(MarkupReverse);
	1743 }
	1744
	1745 if (showrefinline)
	1746 hprintf("[%zu]", ref->linknr);
	1747 if (showurlinline) {
	1748 if (ref->tagid == TagA)
	1749 hprintf("[%s]", ref->url);
	1750 else
	1751 hprintf("[%s: %s]", ref->type, ref->url);
	1752 }
	1753 if (showrefinline \|\| showurlinline) {
	1754 endmarkup(MarkupReverse);
	1755 hflush();
	1756 }
	1757 }
	1758
	1759 handleendtag(&(cur->tag));
	1760 }
	1761
	/* Callback for the end of a tag.
	   t/tl:    tag name and its length.
	   isshort: set for a short (self-closing) tag.
	   An explicit end tag of a void element is ignored. Depending on the
	   display type of the tag, still-open child tags (for example li, td,
	   option, p, dd, dt) are closed first. Then the node of the tag itself is
	   ended: either the current node when its name matches, or the nearest
	   open node with that name. The node depth, indentation and markup are
	   restored afterwards, and in reader mode the visible selector is
	   checked again. */
	1762 static void
	1763 xmltagend(XMLParser p, const char t, size_t tl, int isshort)
	1764 {
	1765 struct tag found, tag;
	1766 enum TagId child, childs[16];
	1767 size_t nchilds;
	1768 int i, j, k, nchildfound, parenttype;
	1769
	1770 /* match tag and lookup metadata */
	1771 /* ignore closing of void elements, like </br>, which is not all…
	1772 if ((found = findtag(t))) {
	1773 if (!isshort && found->isvoid)
	1774 return;
	1775 }
	1776
	1777 /* TODO: implement more complete optional tag handling.
	1778 in reality the optional tag rules are more complex, see:
	1779 https://html.spec.whatwg.org/multipage/syntax.html#optional-t…
	1780
	1781 child = 0;
	1782 nchilds = 0;
	1783 nchildfound = 0;
	1784 parenttype = 0; /* by default, seek until the root */
	1785
	1786 if (found && found->displaytype & DisplayPre) {
	1787 skipinitialws = 0; /* do not skip white-space, for margi…
	1788 } else if (found && found->displaytype & DisplayList) {
	1789 childs[0] = TagLi;
	1790 nchilds = 1;
	1791 parenttype = DisplayList;
	1792 } else if (found && found->displaytype & DisplayTableRow) {
	1793 childs[0] = TagTd;
	1794 nchilds = 1;
	1795 parenttype = DisplayTableRow;
	1796 } else if (found && found->displaytype & DisplayTable) {
	1797 childs[0] = TagTd;
	1798 nchilds = 1;
	1799 parenttype = DisplayTable;
	1800 } else if (found && found->displaytype & DisplaySelect) {
	1801 childs[0] = TagOption;
	1802 nchilds = 1;
	1803 parenttype = DisplaySelect;
	1804 } else if (found && found->displaytype & DisplayDl) {
	1805 childs[0] = TagP;
	1806 childs[1] = TagDd;
	1807 childs[2] = TagDt;
	1808 nchilds = 3;
	1809 parenttype = DisplayDl;
	1810 } else if (found && found->displaytype & DisplayBlock) {
	1811 childs[0] = TagP;
	1812 nchilds = 1;
	1813 parenttype = 0; /* seek until the root */
	1814 }
	1815
	/* search upwards from the current node for one of the child tags; the
	   search stops at the first node that has the parent display type.
	   NOTE(review): after the inner loop k equals i - 1, so curnode becomes
	   -1 if the child is found at index 0 -- confirm index 0 never holds one
	   of these tags */
	1816 if (nchilds > 0) {
	1817 for (i = curnode; i >= 0; i--) {
	1818 if (nchildfound)
	1819 break;
	1820 if ((nodes[i].tag.displaytype & parenttype))
	1821 break;
	1822 for (j = 0; j < nchilds; j++) {
	1823 child = childs[j];
	1824 if (nodes[i].tag.id == child) {
	1825 /* fake closing the previous tag…
	1826 for (k = curnode; k >= i; k--)
	1827 endnode(&nodes[k]);
	1828 curnode = k;
	1829 nchildfound = 1;
	1830 break;
	1831 }
	1832 }
	1833 }
	1834 }
	1835
	1836 /* if the current closing tag matches the current open tag */
	1837 if (nodes[curnode].tag.name &&
	1838 !tagcmp(nodes[curnode].tag.name, t)) {
	1839 endnode(&nodes[curnode]);
	/* the node depth never drops below 0 */
	1840 if (curnode)
	1841 curnode--;
	1842 } else {
	1843 /* ... else lookup the first matching start tag. This is…
	1844 for handling optional closing tags */
	1845 tag = NULL;
	1846 for (i = curnode; i >= 0; i--) {
	1847 if (nodes[i].tag.name &&
	1848 !tagcmp(nodes[i].tag.name, t)) {
	1849 endnode(&nodes[i]);
	1850 curnode = i > 0 ? i - 1 : 0;
	1851 tag = &nodes[i].tag;
	1852 break;
	1853 }
	1854 }
	1855 /* unmatched closing tag found */
	1856 if (!tag && found)
	1857 handleendtag(found);
	1858 }
	/* recompute the indentation for the new node depth */
	1859 indent = calcindent();
	1860
	1861 #if 0
	1862 /* check if linewrap is enabled, but currently is disabled and n…
	1863 be restored */
	1864 if (allowlinewrap && !linewrap) {
	1865 tag = NULL;
	1866 for (i = curnode; i >= 0; i--) {
	1867 if (nodes[i].tag.id == TagTable) {
	1868 tag = &nodes[i].tag;
	1869 break;
	1870 }
	1871 }
	1872 if (!tag)
	1873 linewrap = allowlinewrap;
	1874 }
	1875 #endif
	1876
	1877 /* restore markup of the tag we are in now */
	1878 startmarkup(nodes[curnode].tag.markuptype);
	1879
	1880 /* check if the current node still matches the visible selector …
	1881 if (reader_mode && sel_show && !reader_ignore) {
	1882 if (!iscssmatchany(sel_show, nodes, curnode)) {
	1883 reader_ignore = 1;
	1884 newline();
	1885 }
	1886 }
	1887 }
	1888
	/* Callback for the start of a tag, called before its attributes.
	   t/tl: tag name and its length.
	   The collected attribute values of the previous tag are cleared. For
	   tags with an optional end tag, matching open tags are closed first by
	   faking their end tag. Then a new node is pushed and initialized: the
	   tag name is copied in lowercase, the tag metadata is copied when the
	   tag is known, and the node is hidden when its parent is hidden. */
	1889 static void
	1890 xmltagstart(XMLParser p, const char t, size_t tl)
	1891 {
	1892 struct tag *found;
	1893 struct node *cur;
	1894 enum TagId tagid;
	1895 enum TagId child, childs[16];
	1896 size_t nchilds;
	1897 char *s;
	1898 int i, j, k, nchildfound, parenttype;
	1899
	1900 cur = &nodes[curnode];
	1901
	1902 string_clear(&attr_alt);
	1903 string_clear(&attr_checked);
	1904 string_clear(&attr_class);
	1905 attr_class_set = 0;
	1906 string_clear(&attr_data);
	1907 string_clear(&attr_href);
	1908 string_clear(&attr_id);
	1909 attr_id_set = 0;
	1910 string_clear(&attr_src);
	1911 string_clear(&attr_type);
	1912 string_clear(&attr_value);
	1913
	1914 /* match tag and lookup metadata */
	1915 found = findtag(t);
	1916
	1917 /* TODO: implement more complete optional tag handling.
	1918 in reality the optional tag rules are more complex, see:
	1919 https://html.spec.whatwg.org/multipage/syntax.html#optional-t…
	1920
	1921 child = 0;
	1922 nchilds = 0;
	1923 nchildfound = 0;
	1924 parenttype = 0; /* by default, seek until the root */
	1925
	1926 /* if optional tag <p> is open and a list element is found, clos…
	1927 if (found && found->displaytype & DisplayList) {
	1928 /* not inside a list */
	1929 childs[0] = TagP;
	1930 nchilds = 1;
	1931 parenttype = DisplayList;
	1932 } else if (found && found->isoptional) {
	/* per tag: which open tags to close and at which parent to stop */
	1933 tagid = found->id;
	1934 if (tagid == TagLi) {
	1935 childs[0] = TagLi;
	1936 nchilds = 1;
	1937 parenttype = DisplayList;
	1938 } else if (tagid == TagTd) {
	1939 childs[0] = TagTd;
	1940 nchilds = 1;
	1941 parenttype = DisplayTableRow;
	1942 } else if (tagid == TagTr) {
	1943 childs[0] = TagTr;
	1944 nchilds = 1;
	1945 parenttype = DisplayTable;
	1946 } else if (tagid == TagP) {
	1947 childs[0] = TagP;
	1948 nchilds = 1;
	1949 parenttype = 0; /* seek until the root */
	1950 } else if (tagid == TagOption) {
	1951 childs[0] = TagOption;
	1952 nchilds = 1;
	1953 parenttype = DisplaySelect;
	1954 } else if (tagid == TagDt) {
	1955 childs[0] = TagDd;
	1956 nchilds = 1;
	1957 parenttype = DisplayDl;
	1958 } else if (tagid == TagDd) {
	1959 childs[0] = TagDd;
	1960 childs[1] = TagDt;
	1961 nchilds = 2;
	1962 parenttype = DisplayDl;
	1963 } else if (tagid == cur->tag.id) {
	1964 /* fake closing the previous tag if it is the sa…
	1965 xmltagend(p, t, tl, 0);
	1966 }
	1967 } else if (found && found->displaytype & DisplayBlock) {
	1968 /* check if we have an open "<p>" tag */
	1969 childs[0] = TagP;
	1970 childs[1] = TagDl;
	1971 nchilds = 2;
	1972 parenttype = DisplayDl;
	1973 }
	1974
	1975 if (nchilds > 0) {
	1976 for (i = curnode; i >= 0; i--) {
	1977 if (nchildfound)
	1978 break;
	1979 if ((nodes[i].tag.displaytype & parenttype))
	1980 break;
	1981 for (j = 0; j < nchilds; j++) {
	1982 child = childs[j];
	1983 if (nodes[i].tag.id == child) {
	1984 /* fake closing the previous tag…
	1985 for (k = curnode; k >= i; k--)
	1986 xmltagend(p, nodes[k].ta…
	1987 nchildfound = 1;
	1988 break;
	1989 }
	1990 }
	1991 }
	1992 }
	1993
	1994 incnode();
	1995 string_clear(&nodes_links[curnode]); /* clear possible link refe…
	1996 cur = &nodes[curnode];
	1997 memset(cur, 0, sizeof(cur)); / clear / reset node */
	1998 /* tag defaults */
	1999 cur->tag.displaytype = DisplayInline;
	2000 cur->tag.name = cur->tagname; /* assign fixed-size buffer */
	2001 strlcpy(cur->tagname, t, sizeof(cur->tagname));
	2002
	2003 /* force to lowercase */
	2004 for (s = cur->tagname; *s; s++)
	2005 s = TOLOWER((unsigned char)s);
	2006
	2007 /* matched tag: copy tag information to current node */
	2008 if (found)
	2009 memcpy(&(cur->tag), found, sizeof(*found));
	2010
	2011 /* if parent tag is hidden then hide itself too */
	2012 if (curnode > 0 && (nodes[curnode - 1].tag.displaytype & Display…
	2013 cur->tag.displaytype \|= DisplayNone;
	2014 }
	2015
	/* Callback for a start tag after all of its attributes were parsed.
	   t/tl:    tag name and its length.
	   isshort: set for a short (self-closing) tag.
	   The contents of script and style are skipped by temporarily replacing
	   the parser's getnext callback. For other tags the id and class are
	   copied to the node, the child counters of the parent are updated, the
	   reader-mode selectors are applied, and the link and alt text are
	   handled. For a visible tag the block start, top margin, list bullet or
	   number, form input placeholder, markup, ruler and line break are then
	   written. A void tag that was not written as a short tag is ended at once. */
	2016 static void
	2017 xmltagstartparsed(XMLParser p, const char t, size_t tl, int isshort)
	2018 {
	2019 struct tag *found;
	2020 enum TagId tagid;
	2021 struct node cur, parent;
	2022 int i, margintop;
	2023
	2024 /* match tag and lookup metadata */
	2025 tagid = 0;
	2026 if ((found = findtag(t)))
	2027 tagid = found->id;
	2028
	2029 /* temporary replace the callback except the reader and end of t…
	2030 restore the context once we receive the same ignored tag in t…
	2031 end tag handler */
	2032 if (tagid == TagScript) {
	2033 ignorestate = endtag = "</script>";
	2034 getnext = p->getnext; /* for restore */
	2035 p->getnext = getnext_ignore;
	2036 xmltagend(p, t, tl, 0); /* fake the call the tag was end…
	2037 return;
	2038 } else if (tagid == TagStyle) {
	2039 ignorestate = endtag = "</style>";
	2040 getnext = p->getnext; /* for restore */
	2041 p->getnext = getnext_ignore;
	2042 xmltagend(p, t, tl, 0); /* fake the call the tag was end…
	2043 return;
	2044 }
	2045
	2046 #if 0
	2047 /* disable line-wrapping inside tables */
	2048 if (tagid == TagTable)
	2049 linewrap = 0;
	2050 #endif
	2051
	2052 cur = &nodes[curnode];
	2053
	2054 /* copy attributes if set */
	2055 if (attr_id.len)
	2056 strlcpy(cur->id, attr_id.data, sizeof(cur->id));
	2057 else
	2058 cur->id[0] = '\0';
	2059 if (attr_class.len)
	2060 strlcpy(cur->classnames, attr_class.data, sizeof(cur->cl…
	2061 else
	2062 cur->classnames[0] = '\0';
	2063
	2064 /* parent node */
	2065 if (curnode > 0) {
	2066 parent = &nodes[curnode - 1];
	2067 parent->nchildren++; /* increase child node count */
	2068 /* count visible childnodes */
	2069 if (!(cur->tag.displaytype & DisplayNone))
	2070 parent->visnchildren++;
	2071 } else {
	2072 parent = NULL;
	2073 }
	2074
	/* reader mode: start showing output once the show selector matches */
	2075 if (reader_mode && sel_show && reader_ignore &&
	2076 iscssmatchany(sel_show, nodes, curnode))
	2077 reader_ignore = 0;
	2078
	2079 /* hide element */
	2080 if (reader_mode && sel_hide &&
	2081 iscssmatchany(sel_hide, nodes, curnode))
	2082 cur->tag.displaytype \|= DisplayNone;
	2083
	2084 /* indent for this tag */
	2085 cur->indent = cur->tag.indent;
	2086
	2087 if (!reader_ignore) {
	2088 /* add link reference, print links and alt text */
	2089 handleinlinelink();
	2090 handleinlinealt();
	2091 }
	2092
	/* NOTE(review): parent is NULL when curnode is 0 and the branch below
	   dereferences it without a check -- confirm that an option can never be
	   the node at index 0 */
	2093 /* <select><option> */
	2094 if (cur->tag.displaytype & DisplayOption) {
	2095 /* <select multiple>: show all options */
	2096 if (parent->tag.displaytype & DisplaySelectMulti)
	2097 cur->tag.displaytype \|= DisplayBlock;
	2098 else if (parent->nchildren > 1) /* show the first item a…
	2099 cur->tag.displaytype \|= DisplayNone; /* else hid…
	2100 }
	2101
	2102 if (cur->tag.displaytype & DisplayNone)
	2103 return;
	2104
	2105 if (reader_ignore)
	2106 return;
	2107
	2108 indent = calcindent();
	2109
	2110 if ((cur->tag.displaytype & (DisplayBlock \| DisplayHeader \| Disp…
	2111 DisplayTable \| DisplayTableRow \|
	2112 DisplayList \| DisplayListItem))) {
	2113 startblock(); /* break line if needed */
	2114 }
	2115
	2116 if (cur->tag.displaytype & (DisplayButton \| DisplayOption)) {
	2117 hflush();
	2118 hputchar('[');
	2119 }
	2120
	/* top margin: reduced by one line in the cases handled below */
	2121 margintop = cur->tag.margintop;
	2122 if (cur->tag.displaytype & (DisplayList)) {
	2123 for (i = curnode - 1; i >= 0; i--) {
	2124 if (nodes[i].tag.displaytype & DisplayList)
	2125 break;
	2126 if (!(nodes[i].tag.displaytype & DisplayListItem…
	2127 continue;
	2128 if (nodes[i].hasdata && margintop > 0) {
	2129 margintop--;
	2130 break;
	2131 }
	2132 }
	2133 } else if (cur->tag.displaytype & (DisplayBlock\|DisplayTable)) {
	2134 if (!parentcontainerhasdata(cur->tag.displaytype, curnod…
	2135 if (margintop > 0)
	2136 margintop--;
	2137 }
	2138 }
	2139
	/* write the margin as newlines, counted from currentnewlines */
	2140 if (margintop > 0) {
	2141 hflush();
	2142 for (i = currentnewlines; i < margintop; i++) {
	2143 putchar('\n');
	2144 nbytesline = 0;
	2145 ncells = 0;
	2146 currentnewlines++;
	2147 }
	2148 hadnewline = 1;
	2149 }
	2150
	2151 if (cur->tag.displaytype & DisplayPre) {
	2152 skipinitialws = 1;
	2153 } else if (cur->tag.displaytype & DisplayTableCell) {
	2154 if (parent && parent->visnchildren > 1)
	2155 hputchar('\t');
	2156 } else if (cur->tag.displaytype & DisplayListItem) {
	2157 /* find first parent node and ordered numbers or unorder…
	2158 if (parent) {
	2159 skipinitialws = 0;
	2160
	2161 /* print bullet, add columns to indentation leve…
	2162 if (parent->tag.displaytype & DisplayListOrdered…
	2163 hprintf("%4zu. ", parent->nchildren);
	2164 cur->indent = 6;
	2165 indent += cur->indent; /* align to numbe…
	2166 } else if (parent->tag.displaytype & DisplayList…
	2167 hprint(str_bullet_item);
	2168 cur->indent = 2;
	2169 indent += 2; /* align to bullet */
	2170 }
	2171 }
	2172 skipinitialws = 0;
	2173 } else if (cur->tag.displaytype & DisplayInput) {
	/* form input: print a placeholder that depends on the type attribute */
	2174 if (!attr_type.len) {
	2175 hprintf("[%-15s]", attr_value.len ? attr_value.d…
	2176 } else if (!strcasecmp(attr_type.data, "button")) {
	2177 hprintf("[%s]", attr_value.len ? attr_value.data…
	2178 } else if (!strcasecmp(attr_type.data, "submit")) {
	2179 hprintf("[%s]", attr_value.len ? attr_value.data…
	2180 } else if (!strcasecmp(attr_type.data, "reset")) {
	2181 hprintf("[%s]", attr_value.len ? attr_value.data…
	2182 } else if (!strcasecmp(attr_type.data, "checkbox")) {
	2183 hprintf("[%s]",
	2184 attr_checked.len &&
	2185 !strcasecmp(attr_checked.data, "checked"…
	2186 } else if (!strcasecmp(attr_type.data, "radio")) {
	2187 hprintf("[%s]",
	2188 attr_checked.len &&
	2189 !strcasecmp(attr_checked.data, "checked"…
	2190 } else if (!strcasecmp(attr_type.data, "hidden")) {
	2191 cur->tag.displaytype \|= DisplayNone;
	2192 } else {
	2193 /* unrecognized / default case is text */
	2194 hprintf("[%-15s]", attr_value.len ? attr_value.d…
	2195 }
	2196 }
	2197
	2198 startmarkup(cur->tag.markuptype);
	2199
	2200 /* do not count data such as an item bullet as part of the data …
	2201 the node */
	2202 cur->hasdata = 0;
	2203
	2204 if (tagid == TagHr) { /* ruler */
	/* one ruler string per remaining column of the terminal width */
	2205 i = termwidth - indent - defaultindent;
	2206 for (; i > 0; i--)
	2207 hprint(str_ruler);
	2208 cur->hasdata = 1; /* treat <hr/> as data */
	2209 } else if (tagid == TagBr) {
	2210 hflush();
	2211 hadnewline = 0; /* forced newline */
	2212 hputchar('\n');
	2213 cur->hasdata = 1; /* treat <br/> as data */
	2214 }
	2215
	2216 /* autoclose tags, such as <br>, pretend we are <br/> */
	2217 if (!isshort && cur->tag.isvoid)
	2218 xmltagend(p, t, tl, 1); /* pretend close of short tag */
	2219 }
	2220
	/* Callback for a piece of an attribute value.
	   t/tl: tag name and length, n/nl: attribute name and length,
	   v/vl: piece of the value and its length.
	   The value is appended to the buffer that belongs to the attribute. Some
	   attributes are only collected for certain tags: href for a, data for
	   object, alt for img, src for the media and frame tags listed in the
	   switch below. The attributes aria-hidden and hidden hide the node,
	   multiple on a select marks it as a multi-select. */
	2221 static void
	2222 xmlattr(XMLParser p, const char t, size_t tl, const char *n,
	2223 size_t nl, const char *v, size_t vl)
	2224 {
	2225 struct node *cur;
	2226 enum TagId tagid;
	2227
	2228 cur = &nodes[curnode];
	2229 tagid = cur->tag.id;
	2230
	2231 /* hide tags with attribute aria-hidden or hidden */
	2232 if (!attrcmp(n, "aria-hidden") \|\| !attrcmp(n, "hidden"))
	2233 cur->tag.displaytype \|= DisplayNone;
	2234
	2235 if (!attr_class_set && !attrcmp(n, "class")) /* use the first se…
	2236 string_append(&attr_class, v, vl);
	2237 else if (!attr_id_set && !attrcmp(n, "id")) /* use the first set…
	2238 string_append(&attr_id, v, vl);
	2239 else if (!attrcmp(n, "type"))
	2240 string_append(&attr_type, v, vl);
	2241 else if (!attrcmp(n, "value"))
	2242 string_append(&attr_value, v, vl);
	2243
	2244 /* <base href="..." /> */
	/* collected only until a base URL has been set */
	2245 if (!basehrefset && tagid == TagBase && !attrcmp(n, "href"))
	2246 strlcat(basehrefdoc, v, sizeof(basehrefdoc));
	2247
	2248 if (tagid == TagA && !attrcmp(n, "href"))
	2249 string_append(&attr_href, v, vl);
	2250
	2251 if (tagid == TagSelect && !attrcmp(n, "multiple"))
	2252 cur->tag.displaytype \|= DisplaySelectMulti;
	2253
	2254 if (tagid == TagObject && !attrcmp(n, "data"))
	2255 string_append(&attr_data, v, vl);
	2256
	2257 /* show img alt attribute as text. */
	2258 if (tagid == TagImg && !attrcmp(n, "alt"))
	2259 string_append(&attr_alt, v, vl);
	2260
	2261 if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked"…
	2262 string_append(&attr_checked, v, vl);
	2263
	2264 /* src attribute */
	2265 switch (tagid) {
	2266 case TagAudio:
	2267 case TagEmbed:
	2268 case TagFrame:
	2269 case TagIframe:
	2270 case TagImg:
	2271 case TagSource:
	2272 case TagTrack:
	2273 case TagVideo:
	2274 if (!attrcmp(n, "src"))
	2275 string_append(&attr_src, v, vl);
	2276 break;
	2277 default:
	2278 break;
	2279 }
	2280 }
	2281
	2282 static void
	2283 xmlattrentity(XMLParser p, const char t, size_t tl, const char *n,
	2284 size_t nl, const char *v, size_t vl)
	2285 {
	2286 char buf[16];
	2287 int len;
	2288
	2289 len = xml_entitytostr(v, buf, sizeof(buf));
	2290 if (len > 0)
	2291 xmlattr(p, t, tl, n, nl, buf, (size_t)len);
	2292 else
	2293 xmlattr(p, t, tl, n, nl, v, vl);
	2294 }
	2295
	/*
	 * XML parser callback for the end of an attribute (installed as
	 * parser.xmlattrend in main()).
	 *
	 * - Marks the class / id value as recorded (attr_class_set / attr_id_set)
	 *   the first time such an attribute ends.
	 * - For <base href>, parses the collected basehrefdoc into `base` with
	 *   uri_parse() and records the outcome in basehrefset.
	 * - Handles the "checked" attribute on input-like tags.
	 *
	 * NOTE(review): this listing cuts long lines at the ellipsis character,
	 * so the tails of three statements below are not visible; it also appears
	 * to have dropped the `*` declarators on `p` and `t`. Confirm against the
	 * upstream source before relying on the exact conditions.
	 */
	2296 static void
	2297 xmlattrend(XMLParser p, const char t, size_t tl, const char *n,
	2298 size_t nl)
	2299 {
	2300 struct node *cur;
	2301 enum TagId tagid;
	2302
	2303 cur = &nodes[curnode];
	2304 tagid = cur->tag.id;
	2305
	/* latch the flags: each is set once and guarded by its own !attr_*_set test */
	2306 if (!attr_class_set && !attrcmp(n, "class"))
	2307 attr_class_set = 1;
	2308 else if (!attr_id_set && !attrcmp(n, "id"))
	2309 attr_id_set = 1;
	2310
	2311 /* set base URL, if it is set it cannot be overwritten again */
	2312 if (!basehrefset && basehrefdoc[0] &&
	2313 tagid == TagBase && !attrcmp(n, "href"))
	2314 basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : …
	2315
	2316 /* if attribute checked is set but it has no value then set it t…
	2317 if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked"…
	2318 string_append(&attr_checked, "checked", sizeof("checked"…
	2319 }
	2320
	2321 static void
	2322 xmlattrstart(XMLParser p, const char t, size_t tl, const char *n,
	2323 size_t nl)
	2324 {
	2325 struct node *cur;
	2326 enum TagId tagid;
	2327
	2328 cur = &nodes[curnode];
	2329 tagid = cur->tag.id;
	2330
	2331 if (!attrcmp(n, "alt"))
	2332 string_clear(&attr_alt);
	2333 else if (!attrcmp(n, "checked"))
	2334 string_clear(&attr_checked);
	2335 else if (!attr_class_set && !attrcmp(n, "class"))
	2336 string_clear(&attr_class);
	2337 else if (!attrcmp(n, "data"))
	2338 string_clear(&attr_data);
	2339 else if (!attrcmp(n, "href"))
	2340 string_clear(&attr_href);
	2341 else if (!attr_id_set && !attrcmp(n, "id"))
	2342 string_clear(&attr_id);
	2343 else if (!attrcmp(n, "src"))
	2344 string_clear(&attr_src);
	2345 else if (!attrcmp(n, "type"))
	2346 string_clear(&attr_type);
	2347 else if (!attrcmp(n, "value"))
	2348 string_clear(&attr_value);
	2349
	2350 if (basehrefdoc[0] && tagid == TagBase && !attrcmp(n, "href"))
	2351 basehrefdoc[0] = '\0';
	2352 }
	2353
	/*
	 * Print the command-line synopsis to stderr and exit with status 1.
	 * Does not return.
	 *
	 * NOTE(review): the format string below is cut off at the ellipsis
	 * character in this listing; take the full text and its arguments from
	 * the upstream source.
	 */
	2354 static void
	2355 usage(void)
	2356 {
	2357 fprintf(stderr, "%s [-8adiIlrx] [-b basehref] [-s selector] [-u …
	2358 exit(1);
	2359 }
	2360
	2361 int
	2362 main(int argc, char **argv)
	2363 {
	2364 char *basehref;
	2365
	2366 if (pledge("stdio", NULL) < 0)
	2367 err(1, "pledge");
	2368
	2369 ARGBEGIN {
	2370 case '8':
	2371 str_bullet_item = "\xe2\x80\xa2 ";
	2372 str_ruler = "\xe2\x94\x80"; /* symbol: "light horizontal…
	2373 break;
	2374 case 'a':
	2375 allowansi = !allowansi;
	2376 break;
	2377 case 'b':
	2378 basehref = EARGF(usage());
	2379 if (uri_parse(basehref, &base) == -1 \|\|
	2380 !base.proto[0])
	2381 usage();
	2382 basehrefset = 1;
	2383 break;
	2384 case 'd':
	2385 uniqrefs = !uniqrefs;
	2386 break;
	2387 case 'i':
	2388 showrefinline = !showrefinline;
	2389 break;
	2390 case 'I':
	2391 showurlinline = !showurlinline;
	2392 break;
	2393 case 'l':
	2394 showrefbottom = !showrefbottom;
	2395 break;
	2396 case 'r':
	2397 allowlinewrap = !allowlinewrap;
	2398 break;
	2399 case 's':
	2400 sel_show = compileselectors(EARGF(usage()));
	2401 /* switch to reader/selector mode, ignore all data excep…
	2402 reader_mode = 1;
	2403 reader_ignore = 1;
	2404 break;
	2405 case 'u':
	2406 sel_hide = compileselectors(EARGF(usage()));
	2407 /* switch to reader/selector mode */
	2408 reader_mode = 1;
	2409 break;
	2410 case 'w':
	2411 if ((termwidth = strtol(EARGF(usage()), NULL, 10)) < 1)
	2412 usage();
	2413 break;
	2414 case 'x':
	2415 resources = !resources;
	2416 break;
	2417 default:
	2418 usage();
	2419 } ARGEND
	2420
	2421 linewrap = allowlinewrap;
	2422
	2423 /* initial nodes */
	2424 ncapnodes = NODE_CAP_INC;
	2425 nodes = ecalloc(ncapnodes, sizeof(*nodes));
	2426 nodes_links = ecalloc(ncapnodes, sizeof(*nodes_links));
	2427
	2428 parser.xmlattrstart = xmlattrstart;
	2429 parser.xmlattr = xmlattr;
	2430 parser.xmlattrentity = xmlattrentity;
	2431 parser.xmlattrend = xmlattrend;
	2432 parser.xmlcdatastart = xmlcdatastart;
	2433 parser.xmlcdata = xmlcdata;
	2434 parser.xmlcdataend = xmlcdataend;
	2435 parser.xmldatastart = xmldatastart;
	2436 parser.xmldata = xmldata;
	2437 parser.xmldataentity = xmldataentity;
	2438 parser.xmldataend = xmldataend;
	2439 parser.xmltagstart = xmltagstart;
	2440 parser.xmltagstartparsed = xmltagstartparsed;
	2441 parser.xmltagend = xmltagend;
	2442
	2443 parser.getnext = getchar;
	2444 xml_parse(&parser);
	2445
	2446 hflush();
	2447 if (ncells > 0)
	2448 newline();
	2449
	2450 if (showrefbottom \|\| resources)
	2451 printlinkrefs();
	2452
	2453 hflush();
	2454 setmarkup(0);
	2455
	2456 return 0;
	2457 }