GopherProxy

	tscrape_update - tscrape - twitter scraper (not working anymore)
	git clone git://git.codemadness.org/tscrape
	Log
	Files
	Refs
	README
	LICENSE
	---
	tscrape_update (4960B)
	---
	1 #!/bin/sh
	2 # update all configured feeds and merge the new items with the stored feeds.
	3 # NOTE: assumes "tscrape_*" executables are in $PATH.
	4
	5 # defaults; the config file sourced by loadconfig() can override them.
	6 tscrapepath="$HOME/.tscrape/feeds"
	7
	8 # used for processing feeds concurrently: after every ${maxjobs} started feeds
	9 # feed() waits for all running jobs to finish before starting more.
	10 maxjobs=8
	11
	12 # Twitter authentication bearer (seems to be static), sent in the Authorization header.
	13 bearer="AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3D…
	14
	15 # guest token.
	16 token=""
	17
	# Load the configuration by sourcing it as a shell script.
	# loadconfig(configfile)
	#   configfile: path to the config file; when empty the default
	#               "$HOME/.tscrape/tscraperc" is used.
	# Sets the global ${config} to the path that was used.
	# Exits the script with status 1 when the file is not readable.
	loadconfig() {
		# allow to specify config via argv[1].
		if [ "$1" != "" ]; then
			# get absolute path of config file.
			config=$(readlink -f "$1")
		else
			# default config location.
			config="$HOME/.tscrape/tscraperc"
		fi

		# the config is sourced into this shell, so anything it defines
		# (for example $tscrapepath) replaces the defaults set earlier.
		if [ -r "${config}" ]; then
			. "${config}"
		else
			echo "Configuration file \"${config}\" does not exist or is not readable." >&2
			echo "See tscraperc.example for an example." >&2
			exit 1
		fi
	}
	39
	# Print a timestamped status line to stderr.
	# log(name, s)
	#   name: feed name; padded or truncated to exactly 50 characters.
	#   s:    status text, for example "OK" or "FAIL (FETCH)".
	log() {
		printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
	}
	44
	# Acquire a guest token from the Twitter API.
	# guesttoken()
	# Reads the global ${bearer}. Writes the value of the "guest_token" field
	# of the JSON response to stdout; writes nothing when the request fails or
	# the response does not contain that field.
	guesttoken() {
		# fail on redirects, hide User-Agent, timeout is 15 seconds.
		curl -X POST -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
			-H "Authorization: Bearer ${bearer}" \
			'https://api.twitter.com/1.1/guest/activate.json' 2>/dev/null | \
			sed -nE 's@.*\{"guest_token":"([^"]*)"\}.*@\1@p'
	}
	54
	55 # fetch a feed via HTTP/HTTPS: requests ${url} with curl and writes the response to stdout.
	56 # fetch(name, twittername, feedfile)
	# uses the globals ${bearer} and ${token} for the request headers.
	# NOTE(review): the url assignment below is truncated in this listing, so
	# which of the arguments it uses cannot be confirmed from here.
	57 fetch() {
	58 url="https://api.twitter.com/1.1/statuses/user_timeline.json?scr…
	59
	60 # fail on redirects, hide User-Agent, timeout is 15 seconds.
	61 curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
	62 -H "Authorization: Bearer ${bearer}" \
	63 -H "x-guest-token: $token" \
	64 "${url}" 2>/dev/null
	65 }
	66
	# Filter the parsed feed data: reads lines from stdin, writes to stdout.
	# filter(name)
	#   name: feed name (unused by this default implementation).
	# The default copies its input unchanged.
	filter() {
		cat
	}
	72
	# Merge raw files: unique sort by id (field 5) and retweetid (field 8).
	# merge(name, oldfile, newfile)
	#   name:    feed name (unused here).
	#   oldfile: previously stored feed data (or /dev/null).
	#   newfile: newly fetched feed data.
	# Writes the merged lines to stdout.
	merge() {
		# fields are TAB-separated; the TAB is produced with printf so that
		# it cannot be lost to whitespace conversion.
		sort -t "$(printf '\t')" -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
	}
	78
	# Order by timestamp (field 1, numeric, descending).
	# order(name)
	#   name: feed name (unused here).
	# Reads lines from stdin, writes the sorted lines to stdout.
	order() {
		# fields are TAB-separated; the TAB is produced with printf so that
		# it cannot be lost to whitespace conversion.
		sort -t "$(printf '\t')" -k1rn,1
	}
	84
	# Fetch and parse a feed, then merge it into the stored feed file.
	# feed(name, feedurl)
	#   name:    feed name; also used as file name with '/' replaced by '_'.
	#   feedurl: passed on to fetch() as its second argument.
	# Uses the globals maxjobs, curjobs, signo, tscrapepath and tscrapetmpdir.
	# The stages run in a background subshell; each stage writes to its own
	# temporary file which is removed once the next stage has consumed it.
	feed() {
		# wait until ${maxjobs} are finished: will stall the queue if a feed
		# is slow, but it is portable.
		[ "${signo}" -ne 0 ] && return
		[ $((curjobs % maxjobs)) -eq 0 ] && wait
		[ "${signo}" -ne 0 ] && return
		curjobs=$((curjobs + 1))

		(name="$1"
		filename="$(printf '%s' "$1" | tr '/' '_')"
		feedurl="$2"

		tscrapefile="${tscrapepath}/${filename}"
		tmpfeedfile="${tscrapetmpdir}/${filename}"

		if ! fetch "${name}" "${feedurl}" "${tscrapefile}" > "${tmpfeedfile}.fetch"; then
			log "${name}" "FAIL (FETCH)"
			return
		fi

		if ! tscrape < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.tsv"; then
			log "${name}" "FAIL (CONVERT)"
			return
		fi
		rm -f "${tmpfeedfile}.fetch"

		if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
			log "${name}" "FAIL (FILTER)"
			return
		fi
		rm -f "${tmpfeedfile}.tsv"

		# new feed data is empty: no need for below stages.
		if [ ! -s "${tmpfeedfile}.filter" ]; then
			log "${name}" "OK"
			return
		fi

		# if file does not exist yet "merge" with /dev/null.
		if [ -e "${tscrapefile}" ]; then
			oldfile="${tscrapefile}"
		else
			oldfile="/dev/null"
		fi

		if ! merge "${name}" "${oldfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
			log "${name}" "FAIL (MERGE)"
			return
		fi
		rm -f "${tmpfeedfile}.filter"

		if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
			log "${name}" "FAIL (ORDER)"
			return
		fi
		rm -f "${tmpfeedfile}.merge"

		# copy
		if ! cp "${tmpfeedfile}.order" "${tscrapefile}"; then
			log "${name}" "FAIL (COPY)"
			return
		fi
		rm -f "${tmpfeedfile}.order"

		# OK
		log "${name}" "OK"
		) &
	}
	155
	# Remove the temporary directory named by the global ${tscrapetmpdir},
	# including all files in it.
	# cleanup()
	cleanup() {
		# remove temporary directory with files.
		rm -rf "${tscrapetmpdir}"
	}
	160
	# Signal handler: record the signal and terminate the running jobs.
	# sighandler(signo)
	#   signo: number of the received signal; stored in the global ${signo}.
	sighandler() {
		signo="$1"
		# ignore TERM signal for myself: the kill below targets the whole
		# process group, which includes this shell.
		trap -- "" TERM
		# kill all running children: a negative PID addresses the process group.
		kill -TERM -$$
	}
	168
	# feeds()
	# Placeholder that only prints an error about the configuration file to
	# stderr. The config sourced by loadconfig() is loaded before feeds is
	# called, so presumably it is expected to redefine this function.
	# NOTE(review): the first message is truncated in this listing.
	169 feeds() {
	170 echo "Configuration file \"${config}\" is invalid or does not co…
	171 echo "See tscraperc.example for an example." >&2
	172 }
	173
	# Main script: acquire a guest token, load the config, process all feeds
	# concurrently and clean up. Exits with 128 + signal number when
	# interrupted by SIGINT or SIGTERM, otherwise with 0.

	# get guest token.
	token=$(guesttoken)
	if [ -z "${token}" ]; then
		echo "Failed to acquire guest token" >&2
		exit 1
	fi

	# job counter.
	curjobs=0
	# signal number received for parent.
	signo=0
	# SIGINT: signal to interrupt parent.
	trap -- "sighandler 2" "INT"
	# SIGTERM: signal to terminate parent.
	trap -- "sighandler 15" "TERM"
	# load config file.
	loadconfig "$1"
	# fetch feeds and store in temporary directory.
	# stop when the directory cannot be created: otherwise the temporary
	# files would be written relative to "/".
	tscrapetmpdir="$(mktemp -d '/tmp/tscrape_XXXXXX')" || exit 1
	# make sure path exists.
	mkdir -p "${tscrapepath}"
	# fetch feeds specified in config file.
	feeds
	# wait till all feeds are fetched (concurrently).
	[ "${signo}" -eq 0 ] && wait
	# cleanup temporary files etc.
	cleanup
	# on signal SIGINT and SIGTERM exit with signal number + 128.
	[ "${signo}" -ne 0 ] && exit $((signo+128))
	exit 0