tscrape_update - tscrape - twitter scraper (not working anymore)
git clone git://git.codemadness.org/tscrape
---
tscrape_update (4960B)
---
#!/bin/sh
# update feeds, merge with old feeds.
# NOTE: assumes "tscrape_*" executables are in $PATH.
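# usage: tscrape_update [configfile]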

# defaults
tscrapepath="$HOME/.tscrape/feeds"

# used for processing feeds concurrently: wait until ${maxjobs} feeds are
# finished at a time.
maxjobs=8

# Twitter authentication bearer (seems to be static).
bearer="AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3D…

# guest token.
token=""

# load config (evaluate shellscript).
# loadconfig(configfile)
loadconfig() {
	# allow to specify config via argv[1].
	if [ "$1" != "" ]; then
		# get absolute path of config file.
		config=$(readlink -f "$1")
	else
		# default config location.
		config="$HOME/.tscrape/tscraperc"
	fi

	# config is loaded here to be able to override $tscrapepath or functions.
	if [ -r "${config}" ]; then
		. "${config}"
	else
		echo "Configuration file \"${config}\" does not exist or is not readable." >&2
		echo "See tscraperc.example for an example." >&2
		exit 1
	fi
}
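
# An example tscraperc, a minimal sketch (the account names below are
# hypothetical); it is sourced by loadconfig() and may override variables
# and functions defined in this script:
#
#	#tscrapepath="$HOME/.tscrape/feeds"
#
#	feeds() {
#		feed "nasa" "nasa"
#		feed "natgeo" "NatGeo"
#	}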

# log(name, s)
log() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
}
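
# Example log line (illustrative): the feed name is left-padded to 50
# columns, followed by the status:
#	[13:37:00] nasa                                               OK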

# acquire guest token.
# guesttoken()
guesttoken() {
	# fail on redirects, hide User-Agent, timeout is 15 seconds.
	curl -X POST -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
		-H "Authorization: Bearer ${bearer}" \
		'https://api.twitter.com/1.1/guest/activate.json' 2>/dev/null | \
		sed -nE 's@.*\{"guest_token":"([^"]*)"\}.*@\1@p'
}
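
# The sed expression above extracts the token value from a response of the
# form (illustrative): {"guest_token":"1234567890"}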

# fetch a feed via HTTP/HTTPS etc.
# fetch(name, twittername, feedfile)
fetch() {
	url="https://api.twitter.com/1.1/statuses/user_timeline.json?scr…

	# fail on redirects, hide User-Agent, timeout is 15 seconds.
	curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
		-H "Authorization: Bearer ${bearer}" \
		-H "x-guest-token: $token" \
		"${url}" 2>/dev/null
}

# filter fields.
# filter(name)
filter() {
	cat
}
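
# filter() may be redefined in tscraperc, since the config can override
# functions. A minimal sketch that drops retweets, assuming field 8 of the
# TSV output holds the retweet id (see merge() below):
#
#	filter() {
#		awk -F '\t' '$8 == ""'
#	}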

# merge raw files: unique sort by id, retweetid.
# merge(name, oldfile, newfile)
merge() {
	sort -t '	' -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
}

# order by timestamp (descending).
# order(name)
order() {
	sort -t '	' -k1rn,1
}
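
# NOTE: the sort invocations above imply the TSV field layout: field 1 is
# the UNIX timestamp, field 5 the tweet id and field 8 the retweet id.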

# fetch and parse feed.
# feed(name, feedurl)
feed() {
	# wait until ${maxjobs} are finished: will stall the queue if an item
	# is slow, but it is portable.
	[ ${signo} -ne 0 ] && return
	[ $((curjobs % maxjobs)) -eq 0 ] && wait
	[ ${signo} -ne 0 ] && return
	curjobs=$((curjobs + 1))

	(name="$1"
	filename="$(printf '%s' "$1" | tr '/' '_')"
	feedurl="$2"

	tscrapefile="${tscrapepath}/${filename}"
	tmpfeedfile="${tscrapetmpdir}/${filename}"

	if ! fetch "${name}" "${feedurl}" "${tscrapefile}" > "${tmpfeedfile}.fetch"; then
		log "${name}" "FAIL (FETCH)"
		return
	fi

	if ! tscrape < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.tsv"; then
		log "${name}" "FAIL (CONVERT)"
		return
	fi
	rm -f "${tmpfeedfile}.fetch"

	if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
		log "${name}" "FAIL (FILTER)"
		return
	fi
	rm -f "${tmpfeedfile}.tsv"

	# new feed data is empty: no need for below stages.
	if [ ! -s "${tmpfeedfile}.filter" ]; then
		log "${name}" "OK"
		return
	fi

	# if file does not exist yet "merge" with /dev/null.
	if [ -e "${tscrapefile}" ]; then
		oldfile="${tscrapefile}"
	else
		oldfile="/dev/null"
	fi

	if ! merge "${name}" "${oldfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
		log "${name}" "FAIL (MERGE)"
		return
	fi
	rm -f "${tmpfeedfile}.filter"

	if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
		log "${name}" "FAIL (ORDER)"
		return
	fi
	rm -f "${tmpfeedfile}.merge"

	# copy
	if ! cp "${tmpfeedfile}.order" "${tscrapefile}"; then
		log "${name}" "FAIL (COPY)"
		return
	fi
	rm -f "${tmpfeedfile}.order"

	# OK
	log "${name}" "OK"
	) &
}
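
# Each feed() runs as a background subshell through a fixed pipeline:
# fetch (HTTP) -> tscrape (JSON to TSV) -> filter -> merge (dedupe against
# the old file) -> order (newest first) -> copy into place. A failing stage
# logs FAIL (<STAGE>) and aborts that feed without touching the old file.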

cleanup() {
	# remove temporary directory with files.
	rm -rf "${tscrapetmpdir}"
}

sighandler() {
	signo="$1"
	# ignore TERM signal for myself.
	trap -- "" TERM
	# kill all running children >:D
	kill -TERM -$$
}
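
# NOTE: kill(1) with a negative PID (-$$) signals the entire process group,
# so the backgrounded feed() subshells are terminated along with the parent.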

feeds() {
	echo "Configuration file \"${config}\" is invalid or does not contain a \"feeds\" function." >&2
	echo "See tscraperc.example for an example." >&2
}
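
# NOTE: this feeds() is only a fallback; a valid tscraperc is expected to
# redefine it with one feed(name, twittername) call per account, as in the
# example config above.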

# get guest token.
token=$(guesttoken)
if [ -z "${token}" ]; then
	echo "Failed to acquire guest token" >&2
	exit 1
fi

# job counter.
curjobs=0
# signal number received for parent.
signo=0
# SIGINT: signal to interrupt parent.
trap -- "sighandler 2" "INT"
# SIGTERM: signal to terminate parent.
trap -- "sighandler 15" "TERM"
# load config file.
loadconfig "$1"
# fetch feeds and store in temporary directory.
tscrapetmpdir="$(mktemp -d '/tmp/tscrape_XXXXXX')"
# make sure path exists.
mkdir -p "${tscrapepath}"
# fetch feeds specified in config file.
feeds
# wait till all feeds are fetched (concurrently).
[ ${signo} -eq 0 ] && wait
# cleanup temporary files etc.
cleanup
# on signal SIGINT and SIGTERM exit with signal number + 128.
[ ${signo} -ne 0 ] && exit $((signo+128))
exit 0