Introduction
Introduction Statistics Contact Development Disclaimer Help
sfeed_update - sfeed - RSS and Atom parser
git clone git://git.codemadness.org/sfeed
Log
Files
Refs
README
LICENSE
---
sfeed_update (6730B)
---
1 #!/bin/sh
2 # update feeds, merge with old feeds.
3 # NOTE: assumes "sfeed_*" executables are in $PATH.
4
5 # defaults
6 sfeedpath="$HOME/.sfeed/feeds"
7
8 # used for processing feeds concurrently: wait until ${maxjobs} amount of
9 # feeds are finished at a time.
10 maxjobs=16
11
12 # load config (evaluate shellscript).
13 # loadconfig(configfile)
14 loadconfig() {
15 # allow to specify config via argv[1].
16 if [ "$1" != "" ]; then
17 # get absolute path of config file required for includin…
18 config="$1"
19 configpath=$(readlink -f "${config}" 2>/dev/null)
20 else
21 # default config location.
22 config="$HOME/.sfeed/sfeedrc"
23 configpath="${config}"
24 fi
25
26 # config is loaded here to be able to override $sfeedpath or fun…
27 if [ -r "${configpath}" ] && [ -f "${configpath}" ]; then
28 . "${configpath}"
29 else
30 printf "Configuration file \"%s\" cannot be read.\n" "${…
31 echo "See the sfeedrc.example file or the sfeedrc(5) man…
32 die
33 fi
34 }
35
36 # log(name, s)
37 log() {
38 printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2"
39 }
40
41 # log_error(name, s)
42 log_error() {
43 printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
44 # set error exit status indicator for parallel jobs.
45 rm -f "${sfeedtmpdir}/ok"
46 }
47
48 # fetch a feed via HTTP/HTTPS etc.
49 # fetch(name, url, feedfile)
50 fetch() {
51 # fail on redirects, hide User-Agent, timeout is 15 seconds.
52 curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
53 "$2" 2>/dev/null
54 }
55
56 # convert encoding from one encoding to another.
57 # convertencoding(name, from, to)
58 convertencoding() {
59 if [ "$2" != "" ] && [ "$3" != "" ] && [ "$2" != "$3" ]; then
60 iconv -cs -f "$2" -t "$3" 2> /dev/null
61 else
62 # else no convert, just output.
63 cat
64 fi
65 }
66
67 # parse and convert input, by default XML to the sfeed(5) TSV format.
68 # parse(name, feedurl, basesiteurl)
69 parse() {
70 sfeed "$3"
71 }
72
73 # filter fields.
74 # filter(name, url)
75 filter() {
76 cat
77 }
78
79 # merge raw files: unique sort by id, title, link.
80 # merge(name, oldfile, newfile)
81 merge() {
82 sort -t ' ' -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null
83 }
84
85 # order by timestamp (descending).
86 # order(name, url)
87 order() {
88 sort -t ' ' -k1rn,1 2>/dev/null
89 }
90
91 # internal handler to fetch and process a feed.
92 # _feed(name, feedurl, [basesiteurl], [encoding])
93 _feed() {
94 name="$1"
95 feedurl="$2"
96 basesiteurl="$3"
97 encoding="$4"
98
99 filename="$(printf '%s' "${name}" | tr '/' '_')"
100 sfeedfile="${sfeedpath}/${filename}"
101 tmpfeedfile="${sfeedtmpdir}/feeds/${filename}"
102
103 # if file does not exist yet create it.
104 [ -e "${sfeedfile}" ] || touch "${sfeedfile}" 2>/dev/null
105
106 if ! fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfil…
107 log_error "${name}" "FAIL (FETCH)"
108 return 1
109 fi
110
111 # try to detect encoding (if not specified). if detecting the en…
112 [ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfi…
113
114 if ! convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfee…
115 log_error "${name}" "FAIL (ENCODING)"
116 return 1
117 fi
118 rm -f "${tmpfeedfile}.fetch"
119
120 # if baseurl is empty then use feedurl.
121 if ! parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" <…
122 log_error "${name}" "FAIL (PARSE)"
123 return 1
124 fi
125 rm -f "${tmpfeedfile}.utf8"
126
127 if ! filter "${name}" "${feedurl}" < "${tmpfeedfile}.tsv" > "${t…
128 log_error "${name}" "FAIL (FILTER)"
129 return 1
130 fi
131 rm -f "${tmpfeedfile}.tsv"
132
133 # new feed data is empty: no need for below stages.
134 if [ ! -s "${tmpfeedfile}.filter" ]; then
135 log "${name}" "OK"
136 return 0
137 fi
138
139 if ! merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "$…
140 log_error "${name}" "FAIL (MERGE)"
141 return 1
142 fi
143 rm -f "${tmpfeedfile}.filter"
144
145 if ! order "${name}" "${feedurl}" < "${tmpfeedfile}.merge" > "${…
146 log_error "${name}" "FAIL (ORDER)"
147 return 1
148 fi
149 rm -f "${tmpfeedfile}.merge"
150
151 # copy
152 if ! cp "${tmpfeedfile}.order" "${sfeedfile}"; then
153 log_error "${name}" "FAIL (COPY)"
154 return 1
155 fi
156 rm -f "${tmpfeedfile}.order"
157
158 # OK
159 log "${name}" "OK"
160 return 0
161 }
162
163 # fetch and process a feed in parallel.
164 # feed(name, feedurl, [basesiteurl], [encoding])
165 feed() {
166 # Output job parameters for xargs.
167 # Specify fields as a single parameter separated by a NUL byte.
168 # The parameter is split into fields later by the child process,…
169 # allows using xargs with empty fields across many implementatio…
170 printf '%s\037%s\037%s\037%s\037%s\037%s\0' \
171 "${config}" "${sfeedtmpdir}" "$1" "$2" "$3" "$4"
172 }
173
174 # cleanup()
175 cleanup() {
176 # remove temporary directory with feed files.
177 rm -rf "${sfeedtmpdir}"
178 }
179
180 # die(statuscode)
181 die() {
182 statuscode="${1:-1}" # default: exit 1
183 # cleanup temporary files etc.
184 cleanup
185 exit "${statuscode}"
186 }
187
188 # sighandler(signo)
189 sighandler() {
190 signo="$1"
191 # ignore TERM signal for myself.
192 trap -- "" TERM
193 # kill all running children >:D
194 kill -TERM -$$
195 }
196
197 # feeds()
198 feeds() {
199 printf "Configuration file \"%s\" is invalid or does not contain…
200 echo "See sfeedrc.example for an example." >&2
201 die
202 }
203
204 # runfeeds()
205 runfeeds() {
206 # print feeds for parallel processing with xargs.
207 feeds > "${sfeedtmpdir}/jobs" || die
208 SFEED_UPDATE_CHILD="1" xargs -x -0 -P "${maxjobs}" -n 1 \
209 "$(readlink -f "${argv0}")" < "${sfeedtmpdir}/jobs"
210 }
211
212 # main(args...)
213 main() {
214 # signal number received for parent.
215 signo=0
216 # SIGINT: signal to interrupt parent.
217 trap -- "sighandler 2" "INT"
218 # SIGTERM: signal to terminate parent.
219 trap -- "sighandler 15" "TERM"
220 # load config file.
221 loadconfig "$1"
222 # fetch feeds and store in temporary directory.
223 sfeedtmpdir="$(mktemp -d "${TMPDIR:-/tmp}/sfeed_XXXXXX")" || die
224 mkdir -p "${sfeedtmpdir}/feeds"
225 touch "${sfeedtmpdir}/ok" || die
226 # make sure path exists.
227 mkdir -p "${sfeedpath}"
228 # run and process the feeds.
229 runfeeds
230 statuscode=$?
231
232 # check error exit status indicator for parallel jobs.
233 [ -f "${sfeedtmpdir}/ok" ] || statuscode=1
234 # on signal SIGINT and SIGTERM exit with signal number + 128.
235 [ ${signo} -ne 0 ] && die $((signo+128))
236 die ${statuscode}
237 }
238
239 # process a single feed.
240 # parameters are: config, tmpdir, name, feedurl, basesiteurl, encoding
241 if [ "${SFEED_UPDATE_CHILD}" = "1" ]; then
242 [ "$1" = "" ] && exit 0 # must have an argument set
243 # IFS is "\037"
244 printf '%s\n' "$1" | \
245 while IFS="" read -r _config _tmpdir _name _feedurl _basesiteur…
246 loadconfig "${_config}"
247 sfeedtmpdir="${_tmpdir}"
248 _feed "${_name}" "${_feedurl}" "${_basesiteurl}" "${_enc…
249 exit "$?"
250 done
251 exit 0
252 fi
253
254 # ...else parent mode:
255 argv0="$0" # store $0, in the zsh shell $0 is the name of the function.
256 [ "${SFEED_UPDATE_INCLUDE}" = "1" ] || main "$@"
You are viewing proxied material from codemadness.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.