sfeed_update - sfeed - RSS and Atom parser | |
git clone git://git.codemadness.org/sfeed | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
sfeed_update (6730B) | |
--- | |
1 #!/bin/sh | |
2 # update feeds, merge with old feeds. | |
3 # NOTE: assumes "sfeed_*" executables are in $PATH. | |
4 | |
# Default settings. The configuration file is sourced later and may
# override these values.

# directory in which the per-feed files are stored.
sfeedpath="${HOME}/.sfeed/feeds"

# upper bound on the number of feeds processed at the same time
# (handed to xargs as its -P value).
maxjobs="16"
11 | |
12 # load config (evaluate shellscript). | |
13 # loadconfig(configfile) | |
14 loadconfig() { | |
15 # allow to specify config via argv[1]. | |
16 if [ "$1" != "" ]; then | |
17 # get absolute path of config file required for includin… | |
18 config="$1" | |
19 configpath=$(readlink -f "${config}" 2>/dev/null) | |
20 else | |
21 # default config location. | |
22 config="$HOME/.sfeed/sfeedrc" | |
23 configpath="${config}" | |
24 fi | |
25 | |
26 # config is loaded here to be able to override $sfeedpath or fun… | |
27 if [ -r "${configpath}" ] && [ -f "${configpath}" ]; then | |
28 . "${configpath}" | |
29 else | |
30 printf "Configuration file \"%s\" cannot be read.\n" "${… | |
31 echo "See the sfeedrc.example file or the sfeedrc(5) man… | |
32 die | |
33 fi | |
34 } | |
35 | |
# log(name, s)
# Write one status line to stdout in the form "[HH:MM:SS] name s".
# The name column is padded or truncated to exactly 50 characters so that
# the status texts line up.
log() {
	# rebuild the argument list as: timestamp, name, status text.
	set -- "$(date +'%H:%M:%S')" "$1" "$2"
	printf '[%s] %-50.50s %s\n' "$@"
}
40 | |
# log_error(name, s)
# Write one status line to stderr in the form "[HH:MM:SS] name s" (name
# column padded or truncated to 50 characters) and record that a job failed.
#
# The failure is recorded by deleting the marker file "${sfeedtmpdir}/ok";
# the parent checks for that file after all parallel jobs have finished.
log_error() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
	# set error exit status indicator for parallel jobs.
	# Only touch the marker when a temporary directory is known: with an
	# empty ${sfeedtmpdir} (e.g. this file sourced as a library) the path
	# would collapse to "/ok".
	if [ -n "${sfeedtmpdir}" ]; then
		rm -f -- "${sfeedtmpdir}/ok"
	fi
}
47 | |
# fetch(name, url, feedfile)
# Download a feed (HTTP/HTTPS etc.) and write the response body to stdout.
# Only the url argument is used here; name and feedfile are part of the
# call signature.
# The exit status is the one returned by curl.
fetch() {
	# curl options: treat any redirect as a failure (-L --max-redirs 0),
	# send no User-Agent header, fail on HTTP errors (-f), be silent (-s),
	# give up after 15 seconds (-m 15).
	set -- -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 "$2"
	curl "$@" 2>/dev/null
}
55 | |
# convertencoding(name, from, to)
# Re-encode stdin from encoding "from" to encoding "to" and write the result
# to stdout. When either encoding is empty, or both are the same, the data
# is copied through unchanged.
convertencoding() {
	# nothing to convert: just output.
	if [ "$2" = "" ] || [ "$3" = "" ] || [ "$2" = "$3" ]; then
		cat
		return
	fi

	iconv -cs -f "$2" -t "$3" 2> /dev/null
}
66 | |
# parse(name, feedurl, basesiteurl)
# Parse and convert the input; by default this turns XML into the sfeed(5)
# TSV format by running sfeed with the base site URL as its only argument.
# stdin and stdout are inherited by sfeed; name and feedurl are not used by
# this default implementation.
parse() {
	sfeed "${3}"
}
72 | |
# filter(name, url)
# Filter fields of the feed data passing from stdin to stdout.
# This default implementation changes nothing: it copies stdin to stdout
# and ignores both arguments.
filter() {
	cat
}
78 | |
# merge(name, oldfile, newfile)
# Merge two raw feed files and write the result to stdout. Lines are sorted
# and made unique on the id (field 6), title (field 2) and link (field 3),
# so an item present in both files is emitted only once.
# Diagnostics from sort are discarded; the exit status is sort's.
merge() {
	# The data is TAB-separated. Generate the separator with printf so it
	# cannot be silently turned into a space by an editor or a copy/paste;
	# with a space separator, fields containing spaces (e.g. titles) would
	# break the key positions and duplicates would not be removed.
	sort -t "$(printf '\t')" -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null
}
84 | |
# order(name, url)
# Sort the feed data read from stdin by timestamp (field 1), newest first,
# and write it to stdout. Both arguments are unused by this default.
# Diagnostics from sort are discarded; the exit status is sort's.
order() {
	# The data is TAB-separated; generate the separator with printf so the
	# key is exactly the timestamp field and the literal cannot be mangled
	# into a space.
	sort -t "$(printf '\t')" -k1rn,1 2>/dev/null
}
90 | |
91 # internal handler to fetch and process a feed. | |
92 # _feed(name, feedurl, [basesiteurl], [encoding]) | |
93 _feed() { | |
94 name="$1" | |
95 feedurl="$2" | |
96 basesiteurl="$3" | |
97 encoding="$4" | |
98 | |
99 filename="$(printf '%s' "${name}" | tr '/' '_')" | |
100 sfeedfile="${sfeedpath}/${filename}" | |
101 tmpfeedfile="${sfeedtmpdir}/feeds/${filename}" | |
102 | |
103 # if file does not exist yet create it. | |
104 [ -e "${sfeedfile}" ] || touch "${sfeedfile}" 2>/dev/null | |
105 | |
106 if ! fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfil… | |
107 log_error "${name}" "FAIL (FETCH)" | |
108 return 1 | |
109 fi | |
110 | |
111 # try to detect encoding (if not specified). if detecting the en… | |
112 [ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfi… | |
113 | |
114 if ! convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfee… | |
115 log_error "${name}" "FAIL (ENCODING)" | |
116 return 1 | |
117 fi | |
118 rm -f "${tmpfeedfile}.fetch" | |
119 | |
120 # if baseurl is empty then use feedurl. | |
121 if ! parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" <… | |
122 log_error "${name}" "FAIL (PARSE)" | |
123 return 1 | |
124 fi | |
125 rm -f "${tmpfeedfile}.utf8" | |
126 | |
127 if ! filter "${name}" "${feedurl}" < "${tmpfeedfile}.tsv" > "${t… | |
128 log_error "${name}" "FAIL (FILTER)" | |
129 return 1 | |
130 fi | |
131 rm -f "${tmpfeedfile}.tsv" | |
132 | |
133 # new feed data is empty: no need for below stages. | |
134 if [ ! -s "${tmpfeedfile}.filter" ]; then | |
135 log "${name}" "OK" | |
136 return 0 | |
137 fi | |
138 | |
139 if ! merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "$… | |
140 log_error "${name}" "FAIL (MERGE)" | |
141 return 1 | |
142 fi | |
143 rm -f "${tmpfeedfile}.filter" | |
144 | |
145 if ! order "${name}" "${feedurl}" < "${tmpfeedfile}.merge" > "${… | |
146 log_error "${name}" "FAIL (ORDER)" | |
147 return 1 | |
148 fi | |
149 rm -f "${tmpfeedfile}.merge" | |
150 | |
151 # copy | |
152 if ! cp "${tmpfeedfile}.order" "${sfeedfile}"; then | |
153 log_error "${name}" "FAIL (COPY)" | |
154 return 1 | |
155 fi | |
156 rm -f "${tmpfeedfile}.order" | |
157 | |
158 # OK | |
159 log "${name}" "OK" | |
160 return 0 | |
161 } | |
162 | |
# feed(name, feedurl, [basesiteurl], [encoding])
# Queue a feed for parallel processing: print one job record for xargs.
#
# A record is a single xargs parameter terminated by a NUL byte. Inside the
# record the six fields (config, temporary directory, name, feedurl,
# basesiteurl, encoding) are separated by the byte \037. The child process
# splits the parameter into fields again; packing them this way keeps empty
# fields intact when passed through xargs.
feed() {
	# the first five fields, each followed by the field separator...
	printf '%s\037' "${config}" "${sfeedtmpdir}" "$1" "$2" "$3"
	# ...then the last field and the record terminator.
	printf '%s\0' "$4"
}
173 | |
# cleanup()
# Delete the temporary directory ${sfeedtmpdir} together with all feed
# files stored below it. Missing paths are not an error.
cleanup() {
	rm -r -f "${sfeedtmpdir}"
}
179 | |
# die(statuscode)
# Remove temporary files through cleanup() and terminate the script.
# The exit status is the given statuscode, or 1 when it is missing or empty.
die() {
	if [ -n "$1" ]; then
		statuscode="$1"
	else
		# default: exit 1
		statuscode=1
	fi
	# cleanup temporary files etc.
	cleanup
	exit "${statuscode}"
}
187 | |
# sighandler(signo)
# Signal handler installed for the parent process.
# Stores the received signal number in ${signo}, then sends TERM to the
# process group whose id equals this shell's pid so that running children
# are stopped as well.
sighandler() {
	signo="$1"
	# this process is part of that group too: ignore TERM for myself first.
	trap -- "" TERM
	# kill all running children >:D
	kill -TERM "-$$"
}
196 | |
197 # feeds() | |
198 feeds() { | |
199 printf "Configuration file \"%s\" is invalid or does not contain… | |
200 echo "See sfeedrc.example for an example." >&2 | |
201 die | |
202 } | |
203 | |
# runfeeds()
# Collect the job records printed by feeds() in "${sfeedtmpdir}/jobs" and
# process them in parallel: xargs re-runs this script (resolved path of
# ${argv0}) once per NUL-terminated record, with at most ${maxjobs} running
# at a time and SFEED_UPDATE_CHILD=1 set in the environment of each child.
# The exit status is the one returned by xargs.
runfeeds() {
	# print feeds for parallel processing with xargs.
	if ! feeds > "${sfeedtmpdir}/jobs"; then
		die
	fi

	SFEED_UPDATE_CHILD="1" xargs -x -0 -P "${maxjobs}" -n 1 "$(readlink -f "${argv0}")" \
		< "${sfeedtmpdir}/jobs"
}
211 | |
# main(args...)
# Entry point of the parent process. The first argument is handed to
# loadconfig() as the configuration file. Sets up signal handling and the
# temporary directory, runs all feeds and always ends the script via die():
#   - 128 + signal number when SIGINT or SIGTERM was received,
#   - 1 when a job removed the "ok" marker file,
#   - otherwise the exit status of runfeeds().
main() {
	# signal number received for parent (0: none).
	signo=0
	# SIGINT interrupts the parent, SIGTERM terminates it.
	trap -- "sighandler 2" "INT"
	trap -- "sighandler 15" "TERM"

	# load config file.
	loadconfig "$1"

	# temporary directory in which the feeds are fetched and processed.
	sfeedtmpdir="$(mktemp -d "${TMPDIR:-/tmp}/sfeed_XXXXXX")" || die
	mkdir -p "${sfeedtmpdir}/feeds"
	# marker file: jobs remove it to report an error to the parent.
	touch "${sfeedtmpdir}/ok" || die
	# make sure the destination path exists.
	mkdir -p "${sfeedpath}"

	# run and process the feeds.
	runfeeds
	statuscode=$?

	# check error exit status indicator for parallel jobs.
	if [ ! -f "${sfeedtmpdir}/ok" ]; then
		statuscode=1
	fi
	# on signal SIGINT and SIGTERM exit with signal number + 128.
	if [ "${signo}" -ne 0 ]; then
		die $((signo + 128))
	fi
	die "${statuscode}"
}
238 | |
239 # process a single feed. | |
240 # parameters are: config, tmpdir, name, feedurl, basesiteurl, encoding | |
241 if [ "${SFEED_UPDATE_CHILD}" = "1" ]; then | |
242 [ "$1" = "" ] && exit 0 # must have an argument set | |
243 # IFS is "\037" | |
244 printf '%s\n' "$1" | \ | |
245 while IFS="" read -r _config _tmpdir _name _feedurl _basesiteur… | |
246 loadconfig "${_config}" | |
247 sfeedtmpdir="${_tmpdir}" | |
248 _feed "${_name}" "${_feedurl}" "${_basesiteurl}" "${_enc… | |
249 exit "$?" | |
250 done | |
251 exit 0 | |
252 fi | |
253 | |
# ...else parent mode:
# store $0 now, in the zsh shell $0 is the name of the function.
argv0="$0"
# with SFEED_UPDATE_INCLUDE=1 the functions are only defined and main is
# not started.
if [ "${SFEED_UPDATE_INCLUDE}" != "1" ]; then
	main "$@"
fi