Introduction
Introduction Statistics Contact Development Disclaimer Help
cleanup code a bit and add some comments - webdump - HTML to plain-text convert…
git clone git://git.codemadness.org/webdump
Log
Files
Refs
README
LICENSE
---
commit 4793272ce07153284318336426796cb7e3c93af4
parent 589d7d1ed851b5226a4782de8c9f00001f25c599
Author: Hiltjo Posthuma <[email protected]>
Date: Tue, 19 Sep 2023 20:05:02 +0200
cleanup code a bit and add some comments
Diffstat:
M webdump.c | 129 +++++++++++++++--------------…
1 file changed, 62 insertions(+), 67 deletions(-)
---
diff --git a/webdump.c b/webdump.c
@@ -45,14 +45,14 @@ struct uri {
};
/* options */
-static int allowansi = 0; /* allow ANSI escape codes */
-static int showrefbottom = 0; /* show link references at the bottom */
-static int showrefinline = 0; /* show link reference number inline */
-static int showurlinline = 0; /* show full link reference inline */
-static int linewrap = 0; /* line-wrapping */
-static int termwidth = 77; /* terminal width */
-static int resources = 0; /* write resources line-by-line to fd 3? */
-static int uniqrefs = 0; /* number unique references */
+static int allowansi = 0; /* (-a) allow ANSI escape codes */
+static int uniqrefs = 0; /* (-d) number unique references */
+static int showrefinline = 0; /* (-i) show link reference number inline */
+static int showurlinline = 0; /* (-I) show full link reference inline */
+static int showrefbottom = 0; /* (-l) show link references at the bottom */
+static int linewrap = 0; /* (-r) line-wrapping */
+static int termwidth = 77; /* (-w) terminal width */
+static int resources = 0; /* (-x) write resources line-by-line to fd 3? */
enum DisplayType {
DisplayUnknown = 0,
@@ -95,17 +95,19 @@ typedef struct string {
} String;
enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAudio,
-TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton, TagCite,
-TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails, TagDfn, TagDir,
-TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset, TagFigcaption, TagFigure,
-TagFooter, TagForm, TagFrame, TagH1, TagH2, TagH3, TagH4, TagH5, TagH6,
-TagHead, TagHeader, TagHr, TagHtml, TagI, TagIframe, TagImg, TagInput, TagIns,
-TagLabel, TagLegend, TagLi, TagLink, TagMain, TagMark, TagMenu, TagMeta,
-TagNav, TagObject, TagOl, TagOption, TagP, TagParam, TagPre, TagS, TagScript,
-TagSearch, TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle,
-TagSummary, TagTable, TagTbody, TagTd, TagTemplate, TagTextarea, TagTfoot,
-TagTh, TagThead, TagTitle, TagTr, TagTrack, TagU, TagUl, TagVar, TagVideo,
-TagWbr, TagXmp };
+
+ TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton,
+ TagCite, TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails,
+ TagDfn, TagDir, TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset,
+ TagFigcaption, TagFigure, TagFooter, TagForm, TagFrame, TagH1, TagH2,
+ TagH3, TagH4, TagH5, TagH6, TagHead, TagHeader, TagHr, TagHtml, TagI,
+ TagIframe, TagImg, TagInput, TagIns, TagLabel, TagLegend, TagLi,
+ TagLink, TagMain, TagMark, TagMenu, TagMeta, TagNav, TagObject, TagOl,
+ TagOption, TagP, TagParam, TagPre, TagS, TagScript, TagSearch,
+ TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle,
+ TagSummary, TagTable, TagTbody, TagTd, TagTemplate, TagTextarea,
+ TagTfoot, TagTh, TagThead, TagTitle, TagTr, TagTrack, TagU, TagUl,
+ TagVar, TagVideo, TagWbr, TagXmp };
struct tag {
const char *name;
@@ -168,6 +170,7 @@ static size_t nvisrefs, ncapvisrefs; /* visible link count …
struct linkref **hiddenrefs;
static size_t nhiddenrefs, ncaphiddenrefs; /* hidden link count / capacity */
+/* compare link by URL for link references RB-tree */
int
linkrefcmp(struct linkref *r1, struct linkref *r2)
{
@@ -175,7 +178,6 @@ linkrefcmp(struct linkref *r1, struct linkref *r2)
}
RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead);
-RB_PROTOTYPE(linkreftree, linkref, entry, linkrefcmp)
RB_GENERATE(linkreftree, linkref, entry, linkrefcmp)
static const char *str_bullet_item = "* ";
@@ -184,10 +186,9 @@ static const char *str_ruler = "-";
static const char *str_radio_checked = "*";
/* base href, to make URLs absolute */
-static char *basehref = "";
-static char basehrefdoc[4096]; /* base href in document, if any */
-static int basehrefset = 0; /* base href set and can be used? */
-static struct uri base;
+static char basehrefdoc[4096]; /* buffer for base href in document, if any */
+static int basehrefset; /* base href set and can be used? */
+static struct uri base; /* parsed current base href */
/* buffers for some attributes of the current tag */
String attr_alt; /* alt attribute */
@@ -200,7 +201,7 @@ String attr_src; /* src attribute */
String attr_type; /* type attribute */
String attr_value; /* value attribute */
-static String htmldata;
+static String htmldata; /* buffered HTML data near the current tag */
/* for white-space output handling:
1 = whitespace emitted (suppress repeated), 2 = other characters on this li…
@@ -208,15 +209,15 @@ static String htmldata;
 * White-space data before non-whitespace data in tags is ignored on a line.
 * Repeated white-space is collapsed: a single space (' ') is emitted.
*/
-static int whitespace_mode = 0;
-static int nbytesline = 0;
-static int ncells = 0; /* current cell count */
-static int hadnewline = 0; /* count for repeated newlines */
+static int whitespace_mode;
+static int nbytesline; /* bytes on this line */
+static int ncells; /* current cell/column count */
+static int hadnewline; /* count for repeated newlines */
/* flag for skipping initial white-space in tag: for HTML white-space handling…
static int skipinitialws = 1;
#define DEFAULT_INDENT 2
-static const int defaultindent = DEFAULT_INDENT;
-static int indent;
+static const int defaultindent = DEFAULT_INDENT; /* default indent / margin */
+static int indent; /* indent for the current line, in columns */
/* previous output sequential newlines, used for calculating margins between
elements and reducing excessive newlines */
static int currentnewlines;
@@ -224,21 +225,22 @@ static int currentnewlines;
/* buffers for line-wrapping (buffer per word boundary) */
static char rbuf[1024];
static int rbuflen;
-static int rnbufcells = 0; /* pending cell count to add */
+static int rnbufcells; /* pending cell count to add */
#define MAX_NODE_DEPTH 65535 /* absolute maximum node depth */
-static struct node *nodes;
+static struct node *nodes; /* node tree (one per level is remembered) */
static String *nodes_links; /* keep track of links per node */
-static size_t ncapnodes;
+static size_t ncapnodes; /* current allocated node capacity */
static int curnode; /* current node depth */
-/* reader / selector mode */
-static int reader_mode = 0;
-static int reader_ignore = 0;
+/* reader / selector mode (-s) */
+static int reader_mode;
+/* flag if the tags and their children should be ignored in the current contex…
+static int reader_ignore;
-static enum MarkupType curmarkup;
+static enum MarkupType curmarkup; /* current markup state (bold, underline, et…
-/* selector to match */
+/* selector to match (for -s and -u) */
static struct selectors *sel_hide, *sel_show;
/* tags table: needs to be sorted like tagcmp(), alphabetically */
@@ -483,7 +485,7 @@ ecalloc(size_t nmemb, size_t size)
}
/* check if string has a non-empty scheme / protocol part */
-int
+static int
uri_hasscheme(const char *s)
{
const char *p = s;
@@ -495,7 +497,7 @@ uri_hasscheme(const char *s)
return (*p == ':' && p != s);
}
-int
+static int
uri_parse(const char *s, struct uri *u)
{
const char *p = s;
@@ -611,7 +613,7 @@ parsepath:
/* Transform and try to make the URI `u` absolute using base URI `b` into `a`.
Follows some of the logic from "RFC 3986 - 5.2.2. Transform References".
Returns 0 on success, -1 on error or truncation. */
-int
+static int
uri_makeabs(struct uri *a, struct uri *u, struct uri *b)
{
char *p;
@@ -663,7 +665,7 @@ uri_makeabs(struct uri *a, struct uri *u, struct uri *b)
return 0;
}
-int
+static int
uri_format(char *buf, size_t bufsiz, struct uri *u)
{
return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s",
@@ -682,14 +684,14 @@ uri_format(char *buf, size_t bufsiz, struct uri *u)
}
/* compare tag name (case-insensitive) */
-int
+static int
tagcmp(const char *s1, const char *s2)
{
return strcasecmp(s1, s2);
}
/* compare attribute name (case-insensitive) */
-int
+static int
attrcmp(const char *s1, const char *s2)
{
return strcasecmp(s1, s2);
@@ -846,7 +848,7 @@ endmarkup(int markuptype)
cell in general.
NOTE: this is of course incorrect since characters can be 2 cells wide as well,
in the future maybe replace this with wcwidth() or similar */
-int
+static int
utfwidth(int c)
{
/* not the start of a codepoint */
@@ -1002,17 +1004,6 @@ parentcontainerhasdata(int curtype, int n)
return 0;
}
-static int
-parenthasdata(int n)
-{
- int i;
-
- for (i = n; i >= 0; i--)
- return nodes[i].hasdata;
-
- return 0;
-}
-
/* start on a newline for the start of a block element or not */
static void
startblock(void)
@@ -1021,7 +1012,7 @@ startblock(void)
whitespace_mode &= ~2; /* no characters on this line yet */
if (nbytesline <= 0)
return;
- if (!hadnewline && parenthasdata(curnode - 1))
+ if (!hadnewline && curnode >= 1 && nodes[curnode - 1].hasdata) /* parent must exist: avoid nodes[-1] */
hputchar('\n');
}
@@ -1137,7 +1128,7 @@ findparenttype(int cur, int findtype)
return NULL;
}
-int
+static int
isclassmatch(const char *haystack, const char *needle)
{
const char *p;
@@ -1165,7 +1156,7 @@ isclassmatch(const char *haystack, const char *needle)
/* very limited CSS-like selector, supports: main, main#id, main.class,
".class", "#id", "ul li a" */
-int
+static int
compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
{
int depth = 0, len;
@@ -1263,7 +1254,7 @@ compileselector(const char *sel, struct selectornode *nod…
return depth;
}
-struct selector *
+static struct selector *
newselector(const char *q)
{
struct selector *sel;
@@ -1282,7 +1273,7 @@ newselector(const char *q)
return sel;
}
-struct selectors *
+static struct selectors *
compileselectors(const char *q)
{
struct selectors *sels = NULL;
@@ -1319,7 +1310,7 @@ compileselectors(const char *q)
/* very limited CSS-like matcher, supports: main, main#id, main.class,
".class", "#id", "ul li a" */
-int
+static int
iscssmatch(struct selector *sel, struct node *root, int maxdepth)
{
int d, md = 0;
@@ -1356,7 +1347,7 @@ iscssmatch(struct selector *sel, struct node *root, int m…
return 0;
}
-int
+static int
iscssmatchany(struct selectors *sels, struct node *root, int maxdepth)
{
struct selector *sel;
@@ -1499,7 +1490,7 @@ handleinlinelink(void)
addlinkref(url, cur->tag.name, cur->tag.id, 1);
}
-void
+static void
printlinkrefs(void)
{
struct linkref *ref;
@@ -1535,6 +1526,7 @@ printlinkrefs(void)
}
}
+/* size to grow node capacity (greedy) */
#define NODE_CAP_INC 256
/* increase node depth, allocate space for nodes if needed */
@@ -1759,6 +1751,7 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int iss…
size_t nchilds;
int i, j, k, nchildfound, parenttype;
+ /* match tag and look up metadata */
/* ignore closing of void elements, like </br>, which is not allowed */
if ((found = findtag(t))) {
if (!isshort && found->isvoid)
@@ -1884,7 +1877,7 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
string_clear(&attr_type);
string_clear(&attr_value);
- /* match tag */
+ /* match tag and look up metadata */
found = findtag(t);
/* TODO: implement more complete optional tag handling.
@@ -1993,7 +1986,7 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl,…
struct node *cur, *parent;
int i, margintop;
- /* match tag */
+ /* match tag and look up metadata */
tagid = 0;
if ((found = findtag(t)))
tagid = found->id;
@@ -2322,6 +2315,8 @@ usage(void)
int
main(int argc, char **argv)
{
+ char *basehref;
+
if (pledge("stdio", NULL) < 0)
err(1, "pledge");
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.