various improvements - webdump - HTML to plain-text converter for webpages | |
git clone git://git.codemadness.org/webdump | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 011b4885a533382d98f1aee6cb9619e280c99947 | |
parent 89c9108dc27fe27e0f028f67508a1156ed242d2a | |
Author: Hiltjo Posthuma <[email protected]> | |
Date: Mon, 18 Sep 2023 19:06:03 +0200 | |
various improvements | |
Improve link references: | |
- Add RB tree to lookup link references: this uses a stripped-down version of | |
OpenBSD tree.h | |
- Add 2 separate linked-lists for the order of visible and hidden links. | |
- Hidden links and now also deduplicated. | |
Improve nested nodes and max depth: | |
- Rework and increase the allowed depth of nodes. Allocate them on the heap. | |
Diffstat: | |
M Makefile | 2 +- | |
A tree.h | 483 +++++++++++++++++++++++++++++… | |
M webdump.c | 177 ++++++++++++++++++++---------… | |
3 files changed, 597 insertions(+), 65 deletions(-) | |
--- | |
diff --git a/Makefile b/Makefile | |
@@ -19,7 +19,7 @@ BIN = ${NAME} | |
SCRIPTS = | |
SRC = ${BIN:=.c} | |
-HDR = arg.h namedentities.h xml.h | |
+HDR = arg.h namedentities.h tree.h xml.h | |
LIBXML = libxml.a | |
LIBXMLSRC = \ | |
diff --git a/tree.h b/tree.h | |
@@ -0,0 +1,483 @@ | |
+/* $OpenBSD: tree.h,v 1.31 2023/03/08 04:43:09 guenther Exp $ */ | |
+/* | |
+ * Copyright 2002 Niels Provos <[email protected]> | |
+ * All rights reserved. | |
+ * | |
+ * Redistribution and use in source and binary forms, with or without | |
+ * modification, are permitted provided that the following conditions | |
+ * are met: | |
+ * 1. Redistributions of source code must retain the above copyright | |
+ * notice, this list of conditions and the following disclaimer. | |
+ * 2. Redistributions in binary form must reproduce the above copyright | |
+ * notice, this list of conditions and the following disclaimer in the | |
+ * documentation and/or other materials provided with the distribution. | |
+ * | |
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR | |
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES | |
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. | |
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, | |
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF | |
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
+ */ | |
+ | |
+#ifndef TREE_H | |
+#define TREE_H | |
+ | |
+/* | |
+ * This file defines a red-black tree structure. | |
+ * | |
+ * A red-black tree is a binary search tree with the node color as an | |
+ * extra attribute. It fulfills a set of conditions: | |
+ * - every search path from the root to a leaf consists of the | |
+ * same number of black nodes, | |
+ * - each red node (except for the root) has a black parent, | |
+ * - each leaf node is black. | |
+ * | |
+ * Every operation on a red-black tree is bounded as O(lg n). | |
+ * The maximum height of a red-black tree is 2lg (n+1). | |
+ */ | |
+ | |
+/* Macros that define a red-black tree */ | |
+#define RB_HEAD(name, type) \ | |
+struct name { \ | |
+ struct type *rbh_root; /* root of the tree */ \ | |
+} | |
+ | |
+#define RB_INITIALIZER(root) \ | |
+ { NULL } | |
+ | |
+#define RB_INIT(root) do { \ | |
+ (root)->rbh_root = NULL; \ | |
+} while (0) | |
+ | |
+#define RB_BLACK 0 | |
+#define RB_RED 1 | |
+#define RB_ENTRY(type) \ | |
+struct { \ | |
+ struct type *rbe_left; /* left element */ … | |
+ struct type *rbe_right; /* right element */ … | |
+ struct type *rbe_parent; /* parent element */ \ | |
+ int rbe_color; /* node color */ \ | |
+} | |
+ | |
+#define RB_LEFT(elm, field) (elm)->field.rbe_left | |
+#define RB_RIGHT(elm, field) (elm)->field.rbe_right | |
+#define RB_PARENT(elm, field) (elm)->field.rbe_parent | |
+#define RB_COLOR(elm, field) (elm)->field.rbe_color | |
+#define RB_ROOT(head) (head)->rbh_root | |
+#define RB_EMPTY(head) (RB_ROOT(head) == NULL) | |
+ | |
+#define RB_SET(elm, parent, field) do { … | |
+ RB_PARENT(elm, field) = parent; … | |
+ RB_LEFT(elm, field) = RB_RIGHT(elm, field) = NULL; \ | |
+ RB_COLOR(elm, field) = RB_RED; \ | |
+} while (0) | |
+ | |
+#define RB_SET_BLACKRED(black, red, field) do { … | |
+ RB_COLOR(black, field) = RB_BLACK; \ | |
+ RB_COLOR(red, field) = RB_RED; \ | |
+} while (0) | |
+ | |
+#define RB_ROTATE_LEFT(head, elm, tmp, field) do { \ | |
+ (tmp) = RB_RIGHT(elm, field); \ | |
+ if ((RB_RIGHT(elm, field) = RB_LEFT(tmp, field))) { \ | |
+ RB_PARENT(RB_LEFT(tmp, field), field) = (elm); \ | |
+ } \ | |
+ if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field))) { \ | |
+ if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ | |
+ RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ | |
+ else \ | |
+ RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); … | |
+ } else \ | |
+ (head)->rbh_root = (tmp); \ | |
+ RB_LEFT(tmp, field) = (elm); \ | |
+ RB_PARENT(elm, field) = (tmp); \ | |
+} while (0) | |
+ | |
+#define RB_ROTATE_RIGHT(head, elm, tmp, field) do { \ | |
+ (tmp) = RB_LEFT(elm, field); \ | |
+ if ((RB_LEFT(elm, field) = RB_RIGHT(tmp, field))) { \ | |
+ RB_PARENT(RB_RIGHT(tmp, field), field) = (elm); … | |
+ } \ | |
+ if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field))) { \ | |
+ if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ | |
+ RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ | |
+ else \ | |
+ RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); … | |
+ } else \ | |
+ (head)->rbh_root = (tmp); \ | |
+ RB_RIGHT(tmp, field) = (elm); \ | |
+ RB_PARENT(elm, field) = (tmp); \ | |
+} while (0) | |
+ | |
+/* Generates prototypes and inline functions */ | |
+#define RB_PROTOTYPE(name, type, field, cmp) … | |
+ RB_PROTOTYPE_INTERNAL(name, type, field, cmp,) | |
+#define RB_PROTOTYPE_STATIC(name, type, field, cmp) … | |
+ RB_PROTOTYPE_INTERNAL(name, type, field, cmp, __attribute__((__unused_… | |
+#define RB_PROTOTYPE_INTERNAL(name, type, field, cmp, attr) \ | |
+attr void name##_RB_INSERT_COLOR(struct name *, struct type *); … | |
+attr void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *);\ | |
+attr struct type *name##_RB_REMOVE(struct name *, struct type *); \ | |
+attr struct type *name##_RB_INSERT(struct name *, struct type *); \ | |
+attr struct type *name##_RB_FIND(struct name *, struct type *); … | |
+attr struct type *name##_RB_NFIND(struct name *, struct type *); \ | |
+attr struct type *name##_RB_NEXT(struct type *); \ | |
+attr struct type *name##_RB_PREV(struct type *); \ | |
+attr struct type *name##_RB_MINMAX(struct name *, int); … | |
+ \ | |
+ | |
+/* Main rb operation. | |
+ * Moves node close to the key of elm to top | |
+ */ | |
+#define RB_GENERATE(name, type, field, cmp) … | |
+ RB_GENERATE_INTERNAL(name, type, field, cmp,) | |
+#define RB_GENERATE_STATIC(name, type, field, cmp) … | |
+ RB_GENERATE_INTERNAL(name, type, field, cmp, __attribute__((__unused__… | |
+#define RB_GENERATE_INTERNAL(name, type, field, cmp, attr) \ | |
+attr void \ | |
+name##_RB_INSERT_COLOR(struct name *head, struct type *elm) \ | |
+{ \ | |
+ struct type *parent, *gparent, *tmp; \ | |
+ while ((parent = RB_PARENT(elm, field)) && \ | |
+ RB_COLOR(parent, field) == RB_RED) { \ | |
+ gparent = RB_PARENT(parent, field); \ | |
+ if (parent == RB_LEFT(gparent, field)) { \ | |
+ tmp = RB_RIGHT(gparent, field); … | |
+ if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ | |
+ RB_COLOR(tmp, field) = RB_BLACK; \ | |
+ RB_SET_BLACKRED(parent, gparent, field);\ | |
+ elm = gparent; \ | |
+ continue; \ | |
+ } \ | |
+ if (RB_RIGHT(parent, field) == elm) { \ | |
+ RB_ROTATE_LEFT(head, parent, tmp, field);\ | |
+ tmp = parent; \ | |
+ parent = elm; \ | |
+ elm = tmp; \ | |
+ } \ | |
+ RB_SET_BLACKRED(parent, gparent, field); \ | |
+ RB_ROTATE_RIGHT(head, gparent, tmp, field); \ | |
+ } else { \ | |
+ tmp = RB_LEFT(gparent, field); \ | |
+ if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ | |
+ RB_COLOR(tmp, field) = RB_BLACK; \ | |
+ RB_SET_BLACKRED(parent, gparent, field);\ | |
+ elm = gparent; \ | |
+ continue; \ | |
+ } \ | |
+ if (RB_LEFT(parent, field) == elm) { \ | |
+ RB_ROTATE_RIGHT(head, parent, tmp, field);\ | |
+ tmp = parent; \ | |
+ parent = elm; \ | |
+ elm = tmp; \ | |
+ } \ | |
+ RB_SET_BLACKRED(parent, gparent, field); \ | |
+ RB_ROTATE_LEFT(head, gparent, tmp, field); \ | |
+ } \ | |
+ } \ | |
+ RB_COLOR(head->rbh_root, field) = RB_BLACK; \ | |
+} \ | |
+ \ | |
+attr void \ | |
+name##_RB_REMOVE_COLOR(struct name *head, struct type *parent, struct type *el… | |
+{ \ | |
+ struct type *tmp; \ | |
+ while ((elm == NULL || RB_COLOR(elm, field) == RB_BLACK) && \ | |
+ elm != RB_ROOT(head)) { \ | |
+ if (RB_LEFT(parent, field) == elm) { \ | |
+ tmp = RB_RIGHT(parent, field); \ | |
+ if (RB_COLOR(tmp, field) == RB_RED) { \ | |
+ RB_SET_BLACKRED(tmp, parent, field); \ | |
+ RB_ROTATE_LEFT(head, parent, tmp, field);\ | |
+ tmp = RB_RIGHT(parent, field); \ | |
+ } \ | |
+ if ((RB_LEFT(tmp, field) == NULL || \ | |
+ RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) … | |
+ (RB_RIGHT(tmp, field) == NULL || \ | |
+ RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)… | |
+ RB_COLOR(tmp, field) = RB_RED; \ | |
+ elm = parent; \ | |
+ parent = RB_PARENT(elm, field); … | |
+ } else { \ | |
+ if (RB_RIGHT(tmp, field) == NULL || \ | |
+ RB_COLOR(RB_RIGHT(tmp, field), field) == R… | |
+ struct type *oleft; \ | |
+ if ((oleft = RB_LEFT(tmp, field)))\ | |
+ RB_COLOR(oleft, field) = RB_BL… | |
+ RB_COLOR(tmp, field) = RB_RED; \ | |
+ RB_ROTATE_RIGHT(head, tmp, oleft, fiel… | |
+ tmp = RB_RIGHT(parent, field); \ | |
+ } \ | |
+ RB_COLOR(tmp, field) = RB_COLOR(parent, field)… | |
+ RB_COLOR(parent, field) = RB_BLACK; \ | |
+ if (RB_RIGHT(tmp, field)) \ | |
+ RB_COLOR(RB_RIGHT(tmp, field), field) … | |
+ RB_ROTATE_LEFT(head, parent, tmp, field);\ | |
+ elm = RB_ROOT(head); \ | |
+ break; \ | |
+ } \ | |
+ } else { \ | |
+ tmp = RB_LEFT(parent, field); \ | |
+ if (RB_COLOR(tmp, field) == RB_RED) { \ | |
+ RB_SET_BLACKRED(tmp, parent, field); \ | |
+ RB_ROTATE_RIGHT(head, parent, tmp, field);\ | |
+ tmp = RB_LEFT(parent, field); \ | |
+ } \ | |
+ if ((RB_LEFT(tmp, field) == NULL || \ | |
+ RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) … | |
+ (RB_RIGHT(tmp, field) == NULL || \ | |
+ RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)… | |
+ RB_COLOR(tmp, field) = RB_RED; \ | |
+ elm = parent; \ | |
+ parent = RB_PARENT(elm, field); … | |
+ } else { \ | |
+ if (RB_LEFT(tmp, field) == NULL || \ | |
+ RB_COLOR(RB_LEFT(tmp, field), field) == RB… | |
+ struct type *oright; \ | |
+ if ((oright = RB_RIGHT(tmp, field)))\ | |
+ RB_COLOR(oright, field) = RB_B… | |
+ RB_COLOR(tmp, field) = RB_RED; \ | |
+ RB_ROTATE_LEFT(head, tmp, oright, fiel… | |
+ tmp = RB_LEFT(parent, field); \ | |
+ } \ | |
+ RB_COLOR(tmp, field) = RB_COLOR(parent, field)… | |
+ RB_COLOR(parent, field) = RB_BLACK; \ | |
+ if (RB_LEFT(tmp, field)) \ | |
+ RB_COLOR(RB_LEFT(tmp, field), field) =… | |
+ RB_ROTATE_RIGHT(head, parent, tmp, field);\ | |
+ elm = RB_ROOT(head); \ | |
+ break; \ | |
+ } \ | |
+ } \ | |
+ } \ | |
+ if (elm) \ | |
+ RB_COLOR(elm, field) = RB_BLACK; \ | |
+} \ | |
+ \ | |
+attr struct type * \ | |
+name##_RB_REMOVE(struct name *head, struct type *elm) \ | |
+{ \ | |
+ struct type *child, *parent, *old = elm; \ | |
+ int color; \ | |
+ if (RB_LEFT(elm, field) == NULL) \ | |
+ child = RB_RIGHT(elm, field); \ | |
+ else if (RB_RIGHT(elm, field) == NULL) \ | |
+ child = RB_LEFT(elm, field); \ | |
+ else { \ | |
+ struct type *left; \ | |
+ elm = RB_RIGHT(elm, field); \ | |
+ while ((left = RB_LEFT(elm, field))) \ | |
+ elm = left; \ | |
+ child = RB_RIGHT(elm, field); \ | |
+ parent = RB_PARENT(elm, field); … | |
+ color = RB_COLOR(elm, field); \ | |
+ if (child) \ | |
+ RB_PARENT(child, field) = parent; \ | |
+ if (parent) { \ | |
+ if (RB_LEFT(parent, field) == elm) \ | |
+ RB_LEFT(parent, field) = child; … | |
+ else \ | |
+ RB_RIGHT(parent, field) = child; \ | |
+ } else \ | |
+ RB_ROOT(head) = child; \ | |
+ if (RB_PARENT(elm, field) == old) \ | |
+ parent = elm; \ | |
+ (elm)->field = (old)->field; \ | |
+ if (RB_PARENT(old, field)) { \ | |
+ if (RB_LEFT(RB_PARENT(old, field), field) == old)\ | |
+ RB_LEFT(RB_PARENT(old, field), field) = elm;\ | |
+ else \ | |
+ RB_RIGHT(RB_PARENT(old, field), field) = elm;\ | |
+ } else \ | |
+ RB_ROOT(head) = elm; \ | |
+ RB_PARENT(RB_LEFT(old, field), field) = elm; \ | |
+ if (RB_RIGHT(old, field)) \ | |
+ RB_PARENT(RB_RIGHT(old, field), field) = elm; \ | |
+ if (parent) { \ | |
+ left = parent; \ | |
+ do { \ | |
+ } while ((left = RB_PARENT(left, field))); \ | |
+ } \ | |
+ goto color; \ | |
+ } \ | |
+ parent = RB_PARENT(elm, field); … | |
+ color = RB_COLOR(elm, field); \ | |
+ if (child) \ | |
+ RB_PARENT(child, field) = parent; \ | |
+ if (parent) { \ | |
+ if (RB_LEFT(parent, field) == elm) \ | |
+ RB_LEFT(parent, field) = child; … | |
+ else \ | |
+ RB_RIGHT(parent, field) = child; \ | |
+ } else \ | |
+ RB_ROOT(head) = child; \ | |
+color: \ | |
+ if (color == RB_BLACK) \ | |
+ name##_RB_REMOVE_COLOR(head, parent, child); \ | |
+ return (old); \ | |
+} \ | |
+ \ | |
+/* Inserts a node into the RB tree */ \ | |
+attr struct type * \ | |
+name##_RB_INSERT(struct name *head, struct type *elm) \ | |
+{ \ | |
+ struct type *tmp; \ | |
+ struct type *parent = NULL; \ | |
+ int comp = 0; \ | |
+ tmp = RB_ROOT(head); \ | |
+ while (tmp) { \ | |
+ parent = tmp; \ | |
+ comp = (cmp)(elm, parent); \ | |
+ if (comp < 0) \ | |
+ tmp = RB_LEFT(tmp, field); \ | |
+ else if (comp > 0) \ | |
+ tmp = RB_RIGHT(tmp, field); \ | |
+ else \ | |
+ return (tmp); \ | |
+ } \ | |
+ RB_SET(elm, parent, field); \ | |
+ if (parent != NULL) { \ | |
+ if (comp < 0) \ | |
+ RB_LEFT(parent, field) = elm; \ | |
+ else \ | |
+ RB_RIGHT(parent, field) = elm; \ | |
+ } else \ | |
+ RB_ROOT(head) = elm; \ | |
+ name##_RB_INSERT_COLOR(head, elm); \ | |
+ return (NULL); \ | |
+} \ | |
+ \ | |
+/* Finds the node with the same key as elm */ \ | |
+attr struct type * \ | |
+name##_RB_FIND(struct name *head, struct type *elm) \ | |
+{ \ | |
+ struct type *tmp = RB_ROOT(head); \ | |
+ int comp; \ | |
+ while (tmp) { \ | |
+ comp = cmp(elm, tmp); \ | |
+ if (comp < 0) \ | |
+ tmp = RB_LEFT(tmp, field); \ | |
+ else if (comp > 0) \ | |
+ tmp = RB_RIGHT(tmp, field); \ | |
+ else \ | |
+ return (tmp); \ | |
+ } \ | |
+ return (NULL); \ | |
+} \ | |
+ \ | |
+/* Finds the first node greater than or equal to the search key */ \ | |
+attr struct type * \ | |
+name##_RB_NFIND(struct name *head, struct type *elm) \ | |
+{ \ | |
+ struct type *tmp = RB_ROOT(head); \ | |
+ struct type *res = NULL; \ | |
+ int comp; \ | |
+ while (tmp) { \ | |
+ comp = cmp(elm, tmp); \ | |
+ if (comp < 0) { … | |
+ res = tmp; \ | |
+ tmp = RB_LEFT(tmp, field); \ | |
+ } \ | |
+ else if (comp > 0) \ | |
+ tmp = RB_RIGHT(tmp, field); \ | |
+ else \ | |
+ return (tmp); \ | |
+ } \ | |
+ return (res); \ | |
+} \ | |
+ \ | |
+attr struct type * \ | |
+name##_RB_NEXT(struct type *elm) \ | |
+{ \ | |
+ if (RB_RIGHT(elm, field)) { \ | |
+ elm = RB_RIGHT(elm, field); \ | |
+ while (RB_LEFT(elm, field)) \ | |
+ elm = RB_LEFT(elm, field); \ | |
+ } else { \ | |
+ if (RB_PARENT(elm, field) && \ | |
+ (elm == RB_LEFT(RB_PARENT(elm, field), field))) \ | |
+ elm = RB_PARENT(elm, field); \ | |
+ else { \ | |
+ while (RB_PARENT(elm, field) && … | |
+ (elm == RB_RIGHT(RB_PARENT(elm, field), field)))\ | |
+ elm = RB_PARENT(elm, field); \ | |
+ elm = RB_PARENT(elm, field); \ | |
+ } \ | |
+ } \ | |
+ return (elm); \ | |
+} \ | |
+ \ | |
+attr struct type * \ | |
+name##_RB_PREV(struct type *elm) \ | |
+{ \ | |
+ if (RB_LEFT(elm, field)) { \ | |
+ elm = RB_LEFT(elm, field); \ | |
+ while (RB_RIGHT(elm, field)) \ | |
+ elm = RB_RIGHT(elm, field); \ | |
+ } else { \ | |
+ if (RB_PARENT(elm, field) && \ | |
+ (elm == RB_RIGHT(RB_PARENT(elm, field), field))) \ | |
+ elm = RB_PARENT(elm, field); \ | |
+ else { \ | |
+ while (RB_PARENT(elm, field) && … | |
+ (elm == RB_LEFT(RB_PARENT(elm, field), field)))\ | |
+ elm = RB_PARENT(elm, field); \ | |
+ elm = RB_PARENT(elm, field); \ | |
+ } \ | |
+ } \ | |
+ return (elm); \ | |
+} \ | |
+ \ | |
+attr struct type * \ | |
+name##_RB_MINMAX(struct name *head, int val) \ | |
+{ \ | |
+ struct type *tmp = RB_ROOT(head); \ | |
+ struct type *parent = NULL; \ | |
+ while (tmp) { \ | |
+ parent = tmp; \ | |
+ if (val < 0) \ | |
+ tmp = RB_LEFT(tmp, field); \ | |
+ else \ | |
+ tmp = RB_RIGHT(tmp, field); \ | |
+ } \ | |
+ return (parent); \ | |
+} | |
+ | |
+#define RB_NEGINF -1 | |
+#define RB_INF 1 | |
+ | |
+#define RB_INSERT(name, x, y) name##_RB_INSERT(x, y) | |
+#define RB_REMOVE(name, x, y) name##_RB_REMOVE(x, y) | |
+#define RB_FIND(name, x, y) name##_RB_FIND(x, y) | |
+#define RB_NFIND(name, x, y) name##_RB_NFIND(x, y) | |
+#define RB_NEXT(name, x, y) name##_RB_NEXT(y) | |
+#define RB_PREV(name, x, y) name##_RB_PREV(y) | |
+#define RB_MIN(name, x) name##_RB_MINMAX(x, RB_NEGINF) | |
+#define RB_MAX(name, x) name##_RB_MINMAX(x, RB_INF) | |
+ | |
+#define RB_FOREACH(x, name, head) \ | |
+ for ((x) = RB_MIN(name, head); \ | |
+ (x) != NULL; \ | |
+ (x) = name##_RB_NEXT(x)) | |
+ | |
+#define RB_FOREACH_SAFE(x, name, head, y) \ | |
+ for ((x) = RB_MIN(name, head); \ | |
+ ((x) != NULL) && ((y) = name##_RB_NEXT(x), 1); \ | |
+ (x) = (y)) | |
+ | |
+#define RB_FOREACH_REVERSE(x, name, head) \ | |
+ for ((x) = RB_MAX(name, head); \ | |
+ (x) != NULL; \ | |
+ (x) = name##_RB_PREV(x)) | |
+ | |
+#define RB_FOREACH_REVERSE_SAFE(x, name, head, y) \ | |
+ for ((x) = RB_MAX(name, head); \ | |
+ ((x) != NULL) && ((y) = name##_RB_PREV(x), 1); \ | |
+ (x) = (y)) | |
+ | |
+#endif /* TREE_H */ | |
diff --git a/webdump.c b/webdump.c | |
@@ -10,6 +10,7 @@ | |
#include "arg.h" | |
char *argv0; | |
+#include "tree.h" | |
#include "xml.h" | |
static XMLParser parser; | |
@@ -151,19 +152,31 @@ struct selectors { | |
size_t count; | |
}; | |
-/* linked-list of link references */ | |
+/* RB tree of link references */ | |
struct linkref { | |
char *type; | |
enum TagId tagid; | |
char *url; | |
int ishidden; | |
size_t linknr; | |
- struct linkref *next; | |
+ RB_ENTRY(linkref) entry; | |
}; | |
-static struct linkref *links_head; | |
-static struct linkref *links_cur; | |
-static int linkcount; /* visible link count */ | |
+/* link references and hidden link references */ | |
+struct linkref **visrefs; | |
+static size_t nvisrefs, ncapvisrefs; /* visible link count / capacity */ | |
+struct linkref **hiddenrefs; | |
+static size_t nhiddenrefs, ncaphiddenrefs; /* hidden link count / capacity */ | |
+ | |
+int | |
+linkrefcmp(struct linkref *r1, struct linkref *r2) | |
+{ | |
+ return strcmp(r1->url, r2->url); | |
+} | |
+ | |
+RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead); | |
+RB_PROTOTYPE(linkreftree, linkref, entry, linkrefcmp) | |
+RB_GENERATE(linkreftree, linkref, entry, linkrefcmp) | |
static const char *str_bullet_item = "* "; | |
static const char *str_checkbox_checked = "x"; | |
@@ -213,10 +226,11 @@ static char rbuf[1024]; | |
static int rbuflen; | |
static int rnbufcells = 0; /* pending cell count to add */ | |
-#define MAX_DEPTH 256 | |
-static struct node nodes[MAX_DEPTH]; | |
-static String nodes_links[MAX_DEPTH]; /* keep track of links per node */ | |
-static int curnode; | |
+#define MAX_NODE_DEPTH 65535 /* absolute maximum node depth */ | |
+static struct node *nodes; | |
+static String *nodes_links; /* keep track of links per node */ | |
+static size_t ncapnodes; | |
+static int curnode; /* current node depth */ | |
/* reader / selector mode */ | |
static int reader_mode = 0; | |
@@ -1378,39 +1392,59 @@ handleinlinealt(void) | |
} | |
} | |
-/* slow linear lookup of link references | |
- TODO: optimize it, maybe using tree.h RB_TREE? */ | |
+/* lookup a link reference by url in the red-black tree */ | |
static struct linkref * | |
findlinkref(const char *url) | |
{ | |
- struct linkref *cur; | |
+ struct linkref find; | |
- for (cur = links_head; cur; cur = cur->next) { | |
- if (!strcmp(url, cur->url)) | |
- return cur; | |
- } | |
- return NULL; | |
+ find.url = (char *)url; | |
+ | |
+ return RB_FIND(linkreftree, &linkrefhead, &find); | |
} | |
+/* add a link reference. Returns the added link reference, or the existing link | |
+ reference if links are deduplicated */ | |
static struct linkref * | |
-addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden, | |
- int linknr) | |
+addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden) | |
{ | |
+ struct linkref *link; | |
+ size_t linknr; | |
+ | |
+ /* if links are deduplicates return the existing link */ | |
+ if (uniqrefs && (link = findlinkref(url))) | |
+ return link; | |
+ | |
if (tagid == TagA) | |
_type = "link"; | |
- /* add to linked list */ | |
- if (!links_head) | |
- links_cur = links_head = ecalloc(1, sizeof(*links_head)); | |
- else | |
- links_cur = links_cur->next = ecalloc(1, sizeof(*links_head)); | |
- links_cur->url = estrdup(url); | |
- links_cur->type = estrdup(_type); | |
- links_cur->tagid = tagid; | |
- links_cur->ishidden = ishidden; | |
- links_cur->linknr = linknr; | |
+ link = ecalloc(1, sizeof(*link)); | |
+ | |
+ if (!ishidden) { | |
+ linknr = ++nvisrefs; | |
+ if (nvisrefs >= ncapvisrefs) | |
+ ncapvisrefs += 256; /* greedy alloc */ | |
+ visrefs = erealloc(visrefs, sizeof(*visrefs) * ncapvisrefs); | |
+ visrefs[linknr - 1] = link; /* add pointer to list */ | |
+ } else { | |
+ linknr = ++nhiddenrefs; | |
+ if (nhiddenrefs >= ncaphiddenrefs) | |
+ ncaphiddenrefs += 256; /* greedy alloc */ | |
+ hiddenrefs = erealloc(hiddenrefs, sizeof(*hiddenrefs) * ncaphi… | |
+ hiddenrefs[linknr - 1] = link; /* add pointer to list */ | |
+ } | |
- return links_cur; | |
+ link->url = estrdup(url); | |
+ link->type = estrdup(_type); | |
+ link->tagid = tagid; | |
+ link->ishidden = ishidden; | |
+ link->linknr = linknr; | |
+ | |
+ /* add to tree: the tree is only used for checking unique link referen… | |
+ if (uniqrefs) | |
+ RB_INSERT(linkreftree, &linkrefhead, link); | |
+ | |
+ return link; | |
} | |
static void | |
@@ -1462,43 +1496,65 @@ handleinlinelink(void) | |
/* add hidden links directly to the reference, | |
the order doesn't matter */ | |
if (cur->tag.displaytype & DisplayNone) | |
- addlinkref(url, cur->tag.name, cur->tag.id, 1, 0); | |
+ addlinkref(url, cur->tag.name, cur->tag.id, 1); | |
} | |
void | |
printlinkrefs(void) | |
{ | |
+ struct linkref *ref; | |
size_t i; | |
- int hashiddenrefs = 0; | |
- if (!links_head) | |
+ if (!nvisrefs && !nhiddenrefs) | |
return; | |
if (resources) { | |
- for (i = 1, links_cur = links_head; links_cur; links_cur = lin… | |
- dprintf(3, "%s\t%s\n", links_cur->type, links_cur->url… | |
+ for (i = 0; i < nvisrefs; i++) { | |
+ ref = visrefs[i]; | |
+ dprintf(3, "%s\t%s\n", ref->type, ref->url); | |
+ } | |
+ for (i = 0; i < nhiddenrefs; i++) { | |
+ ref = hiddenrefs[i]; | |
+ dprintf(3, "%s\t%s\n", ref->type, ref->url); | |
+ } | |
} | |
printf("\nReferences\n\n"); | |
- i = 1; | |
- for (links_cur = links_head; links_cur; links_cur = links_cur->next) { | |
- if (links_cur->ishidden) { | |
- hashiddenrefs = 1; | |
- continue; | |
- } | |
- printf(" %zu. %s (%s)\n", links_cur->linknr, links_cur->url, l… | |
- i++; | |
+ for (i = 0; i < nvisrefs; i++) { | |
+ ref = visrefs[i]; | |
+ printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type); | |
} | |
- if (hashiddenrefs) | |
+ if (nhiddenrefs > 0) | |
printf("\n\nHidden references\n\n"); | |
/* hidden links don't have a link number, just count them */ | |
- for (links_cur = links_head; links_cur; links_cur = links_cur->next) { | |
- if (!links_cur->ishidden) | |
- continue; | |
- printf(" %zu. %s (%s)\n", i, links_cur->url, links_cur->type); | |
- i++; | |
+ for (i = 0; i < nhiddenrefs; i++) { | |
+ ref = hiddenrefs[i]; | |
+ printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type); | |
+ } | |
+} | |
+ | |
+#define NODE_CAP_INC 256 | |
+ | |
+/* increase node depth, allocate space for nodes if needed */ | |
+static void | |
+incnode(void) | |
+{ | |
+ curnode++; | |
+ | |
+ if (curnode >= MAX_NODE_DEPTH) | |
+ errx(1, "max node depth reached: %d", curnode); | |
+ | |
+ if (curnode >= ncapnodes) { | |
+ nodes = erealloc(nodes, sizeof(*nodes) * (ncapnodes + NODE_CAP… | |
+ nodes_links = erealloc(nodes_links, sizeof(*nodes_links) * (nc… | |
+ | |
+ /* clear new region */ | |
+ memset(&nodes[ncapnodes], 0, sizeof(*nodes) * NODE_CAP_INC); | |
+ memset(&nodes_links[ncapnodes], 0, sizeof(*nodes_links) * NODE… | |
+ | |
+ ncapnodes += NODE_CAP_INC; /* greedy alloc */ | |
} | |
} | |
@@ -1670,17 +1726,8 @@ endnode(struct node *cur) | |
/* add link and show the link number in the visible order */ | |
if (!ishidden && nodes_links[curnode].len > 0) { | |
- if (uniqrefs) | |
- ref = findlinkref(nodes_links[curnode].data); | |
- else | |
- ref = NULL; | |
- | |
- /* new link: add it */ | |
- if (!ref) { | |
- linkcount++; | |
- ref = addlinkref(nodes_links[curnode].data, | |
- cur->tag.name, cur->tag.id, ishidden, linkcoun… | |
- } | |
+ ref = addlinkref(nodes_links[curnode].data, | |
+ cur->tag.name, cur->tag.id, ishidden); | |
if (showrefinline || showurlinline) { | |
hflush(); | |
@@ -1825,9 +1872,6 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) | |
char *s; | |
int i, j, k, nchildfound, parenttype; | |
- if (curnode >= MAX_DEPTH - 2) | |
- errx(1, "max tag depth reached: %d\n", curnode); | |
- | |
cur = &nodes[curnode]; | |
string_clear(&attr_alt); | |
@@ -1920,7 +1964,7 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) | |
} | |
} | |
- curnode++; | |
+ incnode(); | |
string_clear(&nodes_links[curnode]); /* clear possible link reference … | |
cur = &nodes[curnode]; | |
memset(cur, 0, sizeof(*cur)); /* clear / reset node */ | |
@@ -2333,6 +2377,11 @@ main(int argc, char **argv) | |
usage(); | |
} ARGEND | |
+ /* initial nodes */ | |
+ ncapnodes = NODE_CAP_INC; | |
+ nodes = ecalloc(ncapnodes, sizeof(*nodes)); | |
+ nodes_links = ecalloc(ncapnodes, sizeof(*nodes_links)); | |
+ | |
/* top-most document root needs initialization */ | |
nodes[0].tag.name = ""; | |