Introduction
Introduction Statistics Contact Development Disclaimer Help
tscrape_update improvements - tscrape - twitter scraper
git clone git://git.codemadness.org/tscrape
Log
Files
Refs
README
LICENSE
---
commit db47c97bea3370886d011a2c950ead2551cf3fbc
parent 5e6e62cf3522747a7c4573736d774503ff139a12
Author: Hiltjo Posthuma <[email protected]>
Date: Sun, 12 May 2019 19:20:49 +0200
tscrape_update improvements
- Better checking and verbose logging (on failure) of each stage:
  fetchfeed, filter, merge, order, convertencoding. This makes sure on out-of-memory,
  disk-space or other resource limits the output is not corrupted.
- This also has the added advantage it runs less processes (piped) at the same
time.
- Clear previous unneeded file to preserve space in /tmp
(/tmp is often mounted as mfs/tmpfs).
- Rename fetchfeed to fetch.
- Add logging function (able to override), use more logical logging format (pun
intended).
- Code-style: order overridable functions in execution order.
Diffstat:
M tscrape_update | 104 ++++++++++++++++++++---------…
1 file changed, 69 insertions(+), 35 deletions(-)
---
diff --git a/tscrape_update b/tscrape_update
@@ -31,10 +31,17 @@ loadconfig() {
fi
}
-# merge raw files: unique sort by id, retweetid.
-# merge(name, oldfile, newfile)
-merge() {
- sort -t ' ' -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
+# log(name,s)
+log() {
+ printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
+}
+
+# fetch a feed via HTTP/HTTPS etc.
+# fetch(name, url, feedfile)
+fetch() {
+ # fail on redirects, hide User-Agent, timeout is 15 seconds.
+ curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
+ "$2" 2>/dev/null
}
# filter fields.
@@ -49,15 +56,10 @@ order() {
sort -t ' ' -k1rn,1
}
-# fetch a feed via HTTP/HTTPS etc.
-# fetchfeed(name, url, feedfile)
-fetchfeed() {
- if curl --http1.0 -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
- "$2" 2>/dev/null; then
- printf " OK %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
- else
- printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
- fi
+# merge raw files: unique sort by id, retweetid.
+# merge(name, oldfile, newfile)
+merge() {
+ sort -t ' ' -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
}
# fetch and parse feed.
@@ -73,33 +75,65 @@ feed() {
(name="$1"
filename="$(printf '%s' "$1" | tr '/' '_')"
feedurl="$2"
-	tmpfeedfile="${tscrapetmpdir}/${filename}"
- tmpencfile=""
+
	tscrapefile="${tscrapepath}/${filename}"
+	tmpfeedfile="${tscrapetmpdir}/${filename}"
+
+	if ! fetch "${name}" "${feedurl}" "${tscrapefile}" > "${tmpfeedfile}.fetch"; then
+ log "${name}" "FAIL (FETCH)"
+ return
+ fi
+
+ if ! tscrape < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.tsv"; then
+ log "${name}" "FAIL (CONVERT)"
+ return
+ fi
+ rm -f "${tmpfeedfile}.fetch"
+
+	if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
+ log "${name}" "FAIL (FILTER)"
+ return
+ fi
+ rm -f "${tmpfeedfile}.tsv"
+
+ # new feed data is empty: no need for below stages.
+ if [ ! -s "${tmpfeedfile}.filter" ]; then
+ log "${name}" "OK"
+ return
+ fi
+
+ # if file does not exist yet "merge" with /dev/null.
+ if [ -e "${tscrapefile}" ]; then
+ oldfile="${tscrapefile}"
+ else
+ oldfile="/dev/null"
+ fi
+
+	if ! merge "${name}" "${oldfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
+ log "${name}" "FAIL (MERGE)"
+ return
+ fi
+ rm -f "${tmpfeedfile}.filter"
+
+	if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
+ log "${name}" "FAIL (ORDER)"
+ return
+ fi
+ rm -f "${tmpfeedfile}.merge"
+
+ # atomic move.
+ if ! mv "${tmpfeedfile}.order" "${tscrapefile}"; then
+ log "${name}" "FAIL (MOVE)"
+ return
+ fi
- fetchfeed "${name}" "${feedurl}" "${tscrapefile}" | \
- tscrape | filter "${name}" > "${tmpfeedfile}"
-
- # get new data and merge with old.
-	tscrapefilenew="${tscrapepath}/${filename}.new"
- # new feed data is non-empty.
- if [ -s "${tmpfeedfile}" ]; then
- # if file exists, merge
- if [ -e "${tscrapefile}" ]; then
- merge "${name}" "${tscrapefile}" "${tmpfeedfile}" | \
- order "${name}" > "${tscrapefilenew}"
-
- # overwrite old file with updated file
- mv "${tscrapefilenew}" "${tscrapefile}"
- else
- merge "${name}" "/dev/null" "${tmpfeedfile}" | \
- order "${name}" > "${tscrapefile}"
- fi
- fi) &
+ # OK
+ log "${name}" "OK"
+ ) &
}
cleanup() {
- # remove temporary files
+ # remove temporary directory with files.
rm -rf "${tscrapetmpdir}"
}
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.