Check-in by ben on 2024-08-05 23:35:09
Change print_html() to convert archive.org links to pharos links in
the webdump References section.
INSERTED DELETED
67 20 src/web.awk
67 20 TOTAL over 1 changed file
Index: src/web.awk
==================================================================
--- src/web.awk
+++ src/web.awk
@@ -107,52 +107,99 @@
}
if (type == TYPE_TEXT) {
print $0
}
} else {
- if (match($0, /^ [0-9]+\. /)) {
- prefix = substr($0, 0, RLENGTH)
- link = substr($0, RLENGTH+1)
- if (link !~ /^[a-z]+:/) {
- # convert relative link to full URL
- relative = link
- if (relative ~ /^\/\//) {
- link = proto relative
- } else if (relative ~ /^\//) {
- link = root relative
- } else {
- link = base "/" relative
- }
- }
- print prefix link
- } else {
- print $0
- }
+ print_ref_full($0, base, proto, root)
}
}
close(cmd)
unlink(curlcfg)
return
}
+# Render an HTML fragment as text.  Literal "\n" sequences in html are
+# replaced by <br>, the result is written to a work file from gettemp()
+# and piped through cmd_strings and cmd_webdump.  Every output line up
+# to and including the "References" heading is printed as-is; each
+# later line is handed to print_ref_pharos().
+#
+#   html                 HTML text to render
+#   cmd, marker, work    locals
+
-function print_html(html, cmd, work) {
+function print_html(html, cmd, marker, work) {
     work = gettemp()
     gsub(/\\n/, "<br>", html)
     print html >work
     close(work)
     cmd = sprintf("%s -a -n 3 <%s | %s -ilr -w 60", cmd_strings, work, \
         cmd_webdump)
+    # marker is a flag that becomes 1 once the "References" heading has
+    # been printed.  A flag is used instead of comparing NR against a
+    # fixed sentinel because "cmd | getline" keeps advancing the
+    # program-wide NR, which could eventually pass any constant.
+    marker = 0
     while ((cmd | getline) > 0) {
         gsub(/\t/, " ")
-        print
+        if (!marker) {
+            if ($0 ~ /^References$/) {
+                marker = 1
+            }
+            print $0
+        } else {
+            print_ref_pharos($0)
+        }
     }
     close(cmd)
     unlink(work)
     return
 }
+
+# Print one line of the webdump references section, converting a
+# relative URL to a full URL.
+#
+#   str      line to print
+#   base     prefix for document-relative links (joined with "/")
+#   proto    prefix for links that start with "//"
+#   root     prefix for links that start with a single "/"
+#
+# Lines that do not start with " N. " are printed unchanged.
+# link, prefix and relative are locals.
+
+function print_ref_full(str, base, proto, root, link, prefix, relative) {
+    if (match(str, /^ [0-9]+\. /)) {
+        # awk strings are 1-based; a start index of 0 is handled
+        # differently across awk implementations and can drop the
+        # trailing space of the prefix, so start at 1
+        prefix = substr(str, 1, RLENGTH)
+        link = substr(str, RLENGTH+1)
+        if (link !~ /^[a-z]+:/) {
+            # no scheme present: treat the link as relative
+            relative = link
+            if (relative ~ /^\/\//) {
+                link = proto relative
+            } else if (relative ~ /^\//) {
+                link = root relative
+            } else {
+                link = base "/" relative
+            }
+        }
+        print prefix link
+    } else {
+        print str
+    }
+    return
+}
+
+
+# Print one line of the webdump references section, translating an
+# archive.org "details" URL into a pharos menu entry.
+#
+#   str    line to print
+#
+# A line of the form " N. <url>..." whose URL contains
+# http(s)://[www.]archive.org/details/<id> is emitted as
+#   [1|<prefix><id>|<cgipath>/details/<id>|<server>|<port>]
+# using the globals cgipath, server and port.  Every other line is
+# printed unchanged.  id, label, link, prefix and token are locals.
+
+function print_ref_pharos(str, id, label, link, prefix, token) {
+    if (match(str, /^ [0-9]+\. /)) {
+        # awk strings are 1-based; a start index of 0 is handled
+        # differently across awk implementations and can drop the
+        # trailing space of the prefix, so start at 1
+        prefix = substr(str, 1, RLENGTH)
+        link = substr(str, RLENGTH+1)
+
+        id = ""
+        if (match(link, /https?:\/\/(www\.)?archive\.org\/details\//)) {
+            token = substr(link, RSTART+RLENGTH)
+            # The identifier ends at the first "?", "/" or space.  This
+            # also removes any trailing annotation after the URL
+            # (presumably webdump's " (link)" suffix -- TODO confirm),
+            # so no fixed-width truncation is needed; a fixed width
+            # would mangle the id whenever that suffix is absent or has
+            # a different length.
+            id = token
+            if (match(id, /[?\/ ]/)) {
+                id = substr(id, 1, RSTART-1)
+            }
+        }
+        if (length(id) > 0) {
+            label = prefix id
+            printf "[1|%s|%s/details/%s|%s|%s]\n", label, cgipath,
+                id, server, port
+        } else {
+            print str
+        }
+    } else {
+        print str
+    }
+    return
+}
function web_init() {
TYPE_HEADERS = 2
TYPE_LINKS = 1
TYPE_RAW = 9