tscrape_update - tscrape - twitter scraper (not working anymore) | |
git clone git://git.codemadness.org/tscrape | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
tscrape_update (4960B) | |
--- | |
1 #!/bin/sh | |
2 # update feeds, merge with old feeds. | |
3 # NOTE: assumes "tscrape_*" executables are in $PATH. | |
4 | |
5 # defaults | |
6 tscrapepath="$HOME/.tscrape/feeds" | |
7 | |
8 # used for processing feeds concurrently: wait until ${maxjobs} amount of | |
9 # feeds are finished at a time. | |
10 maxjobs=8 | |
11 | |
12 # Twitter authentication bearer (seems to be static). | |
13 bearer="AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3D… | |
14 | |
15 # guest token. | |
16 token="" | |
17 | |
# load config (evaluate shellscript).
# loadconfig(configfile)
#   configfile: optional path of the config file. When empty the default
#               "$HOME/.tscrape/tscraperc" is used.
# Sets the global ${config} and sources that file into the current shell.
# Exits the shell with status 1 when the file is not readable.
loadconfig() {
	# allow to specify config via argv[1].
	if [ "$1" != "" ]; then
		# get absolute path of config file. When it cannot be resolved
		# (missing parent directory, readlink without -f) keep the path
		# as given, so the error message below still names it.
		config=$(readlink -f "$1" 2>/dev/null)
		if [ -z "${config}" ]; then
			case "$1" in
			# "." searches $PATH for names without a slash: force a
			# path relative to the current directory instead.
			*/*) config="$1" ;;
			*) config="./$1" ;;
			esac
		fi
	else
		# default config location.
		config="$HOME/.tscrape/tscraperc"
	fi

	# config is sourced here so it can override settings such as
	# ${tscrapepath} and redefine functions of this script.
	if [ -r "${config}" ]; then
		. "${config}"
	else
		printf 'Configuration file "%s" does not exist or is not readable.\n' "${config}" >&2
		echo "See tscraperc.example for an example." >&2
		exit 1
	fi
}
39 | |
# log(name, s)
# Write one status line to stderr: the current time in brackets, the name
# padded/truncated to exactly 50 characters, then the status text.
log() {
	printf '%s %-50.50s %s\n' "$(date +'[%H:%M:%S]')" "$1" "$2" 1>&2
}
44 | |
# acquire guest token.
# guesttoken()
# POSTs to the guest activation endpoint using the global ${bearer} and
# prints the value of "guest_token" from the response to stdout. Prints
# nothing when the request fails or the response does not match.
guesttoken() {
	# fail on redirects, hide User-Agent, timeout is 15 seconds.
	curl --request POST --location --max-redirs 0 \
		--header "User-Agent:" --fail --silent --max-time 15 \
		--header "Authorization: Bearer ${bearer}" \
		'https://api.twitter.com/1.1/guest/activate.json' 2>/dev/null |
		sed -nE 's@.*\{"guest_token":"([^"]*)"\}.*@\1@p'
}
54 | |
55 # fetch a feed via HTTP/HTTPS etc. | |
56 # fetch(name, twittername, feedfile) | |
57 fetch() { | |
58 url="https://api.twitter.com/1.1/statuses/user_timeline.json?scr… | |
59 | |
60 # fail on redirects, hide User-Agent, timeout is 15 seconds. | |
61 curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \ | |
62 -H "Authorization: Bearer ${bearer}" \ | |
63 -H "x-guest-token: $token" \ | |
64 "${url}" 2>/dev/null | |
65 } | |
66 | |
# filter fields.
# filter(name)
# Default implementation: pass stdin through to stdout unchanged. The name
# argument is not used here.
filter() {
	cat -
}
72 | |
# merge raw files: unique sort by id, retweetid.
# merge(name, oldfile, newfile)
#   name:    feed name (not used by this default implementation).
#   oldfile: previously stored data (or /dev/null).
#   newfile: freshly fetched data.
# Writes the merged lines to stdout; lines with equal fields 5 and 8 are
# emitted once. Error output of sort is discarded.
merge() {
	# the data is TAB-separated: spell the TAB out with printf instead of
	# an invisible literal character that is easily turned into a space.
	sort -t "$(printf '\t')" -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
}
78 | |
# order by timestamp (descending).
# order(name)
#   name: feed name (not used by this default implementation).
# Reads lines from stdin and writes them to stdout sorted on the first
# field, numerically, highest value first.
order() {
	# the data is TAB-separated: spell the TAB out with printf instead of
	# an invisible literal character that is easily turned into a space.
	sort -t "$(printf '\t')" -k1rn,1
}
84 | |
# process one feed: fetch, convert, filter, merge with the stored data,
# order and store the result as "${tscrapepath}/<name with / replaced by _>".
# _feed(name, feedurl)
# Sets plain (global) variables, so it must only be run in a subshell or as
# a background job; feed() does that. Returns 1 when a stage fails (after
# logging which one), 0 otherwise.
_feed() {
	name="$1"
	filename="$(printf '%s' "$1" | tr '/' '_')"
	feedurl="$2"

	tscrapefile="${tscrapepath}/${filename}"
	tmpfeedfile="${tscrapetmpdir}/${filename}"

	if ! fetch "${name}" "${feedurl}" "${tscrapefile}" > "${tmpfeedfile}.fetch"; then
		log "${name}" "FAIL (FETCH)"
		return 1
	fi

	if ! tscrape < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.tsv"; then
		log "${name}" "FAIL (CONVERT)"
		return 1
	fi
	rm -f "${tmpfeedfile}.fetch"

	if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
		log "${name}" "FAIL (FILTER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.tsv"

	# new feed data is empty: no need for below stages.
	if [ ! -s "${tmpfeedfile}.filter" ]; then
		log "${name}" "OK"
		return 0
	fi

	# if file does not exist yet "merge" with /dev/null.
	if [ -e "${tscrapefile}" ]; then
		oldfile="${tscrapefile}"
	else
		oldfile="/dev/null"
	fi

	if ! merge "${name}" "${oldfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
		log "${name}" "FAIL (MERGE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.filter"

	if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
		log "${name}" "FAIL (ORDER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.merge"

	# copy
	if ! cp "${tmpfeedfile}.order" "${tscrapefile}"; then
		log "${name}" "FAIL (COPY)"
		return 1
	fi
	rm -f "${tmpfeedfile}.order"

	# OK
	log "${name}" "OK"
}

# fetch and parse feed.
# feed(name, feedurl)
#   name:    feed name; also used (with "/" replaced by "_") as file name.
#   feedurl: passed on to fetch() as its second argument.
# Starts the processing as a background job and counts it in ${curjobs}.
# Does nothing once a signal was recorded in ${signo}.
feed() {
	# every ${maxjobs} started jobs, wait for all running jobs to finish:
	# one slow feed stalls the queue, but this is portable.
	[ ${signo} -ne 0 ] && return
	[ $((curjobs % maxjobs)) -eq 0 ] && wait
	# a signal may have arrived while waiting.
	[ ${signo} -ne 0 ] && return
	curjobs=$((curjobs + 1))

	_feed "$@" &
}
155 | |
# cleanup()
# Delete the temporary working directory ${tscrapetmpdir} together with
# everything left inside it.
cleanup() {
	rm -r -f "${tscrapetmpdir}"
}
160 | |
# sighandler(signo)
# Trap handler: record the received signal number in the global ${signo},
# then send TERM to the process group whose id is this shell's PID.
sighandler() {
	signo="$1"
	# this shell must not react to the TERM it is about to send.
	trap '' TERM
	# terminate all running children.
	kill -TERM "-$$"
}
168 | |
169 feeds() { | |
170 echo "Configuration file \"${config}\" is invalid or does not co… | |
171 echo "See tscraperc.example for an example." >&2 | |
172 } | |
173 | |
# get guest token.
token=$(guesttoken)
if [ -z "${token}" ]; then
	echo "Failed to acquire guest token" >&2
	exit 1
fi

# job counter.
curjobs=0
# signal number received for parent.
signo=0
# SIGINT: signal to interrupt parent.
trap -- "sighandler 2" "INT"
# SIGTERM: signal to terminate parent.
trap -- "sighandler 15" "TERM"
# load config file.
loadconfig "$1"
# fetch feeds and store in temporary directory.
tscrapetmpdir="$(mktemp -d '/tmp/tscrape_XXXXXX')"
# without a temporary directory the per-feed files would be written
# directly under "/": stop here instead.
if [ -z "${tscrapetmpdir}" ] || [ ! -d "${tscrapetmpdir}" ]; then
	echo "Failed to create temporary directory" >&2
	exit 1
fi
# make sure path exists.
mkdir -p "${tscrapepath}"
# fetch feeds specified in config file.
feeds
# wait till all feeds are fetched (concurrently).
[ ${signo} -eq 0 ] && wait
# cleanup temporary files etc.
cleanup
# on signal SIGINT and SIGTERM exit with signal number + 128.
[ ${signo} -ne 0 ] && exit $((signo+128))
exit 0