#!/bin/sh
# update feeds, merge with old feeds.
# NOTE: assumes "tscrape_*" executables are in $PATH.

# defaults
# directory in which the resulting feed files are stored (one file per feed);
# the config file may override it.
tscrapepath="$HOME/.tscrape/feeds"

# used for processing feeds concurrently: wait until ${maxjobs} amount of
# feeds are finished at a time.
maxjobs=8

# Twitter authentication bearer (seems to be static).
# sent in the "Authorization: Bearer" header by guesttoken() and fetch().
bearer="AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"

# guest token.
# filled in at startup from guesttoken() and sent in the "x-guest-token"
# header by fetch().
token=""

# load config (evaluate shellscript).
# The config file is sourced into the current shell, which allows it to
# override $tscrapepath or functions.
# Sets the global $config to the path of the config file that is used.
# Exits with status 1 when the file does not exist or is not readable.
# loadconfig(configfile)
loadconfig() {
	# allow to specify config via argv[1].
	if [ "$1" != "" ]; then
		# get absolute path of config file: "." searches $PATH for names
		# without a slash.
		config=$(readlink -f "$1" 2>/dev/null)
		# readlink failed (for example a missing directory component):
		# keep the name that was given so the error message below shows
		# it instead of an empty string.
		if [ -z "${config}" ]; then
			case "$1" in
			*/*) config="$1" ;;
			*) config="./$1" ;;
			esac
		fi
	else
		# default config location.
		config="$HOME/.tscrape/tscraperc"
	fi

	# config is loaded here to be able to override $tscrapepath or functions.
	if [ -r "${config}" ]; then
		. "${config}"
	else
		# printf: echo may interpret backslashes in the path.
		printf 'Configuration file "%s" does not exist or is not readable.\n' "${config}" >&2
		printf '%s\n' "See tscraperc.example for an example." >&2
		exit 1
	fi
}

# write a status line "[HH:MM:SS] name s" to stderr.
# the name column is padded and truncated to exactly 50 characters.
# log(name, s)
log() {
	# the positional parameters are local to the function: reuse them to
	# put the timestamp in front of the name and the status text.
	set -- "$(date +'%H:%M:%S')" "$1" "$2"
	printf '[%s] %-50.50s %s\n' "$@" 1>&2
}

# acquire guest token.
# POSTs to the guest/activate endpoint using ${bearer} and writes the value of
# the "guest_token" field of the response to stdout (nothing when the response
# does not contain it).
# guesttoken()
guesttoken() {
	# curl options: fail on redirects, hide User-Agent, timeout is 15 seconds.
	set -- -X POST -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
		-H "Authorization: Bearer ${bearer}" \
		'https://api.twitter.com/1.1/guest/activate.json'

	# extract the token value from the JSON response.
	curl "$@" 2>/dev/null |
		sed -nE 's@.*\{"guest_token":"([^"]*)"\}.*@\1@p'
}

# fetch a feed via HTTP/HTTPS etc.
# requests the user timeline of twittername and writes the response to stdout.
# name and feedfile are not used by this default implementation.
# fetch(name, twittername, feedfile)
fetch() {
	url="https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=$2&tweet_mode=extended&count=100&include_rts=1"

	# curl options: fail on redirects, hide User-Agent, timeout is 15
	# seconds. authenticate with the bearer and the guest token.
	set -- -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
		-H "Authorization: Bearer ${bearer}" \
		-H "x-guest-token: $token" \
		"${url}"

	curl "$@" 2>/dev/null
}

# filter fields.
# reads the feed data from stdin and writes the filtered data to stdout.
# this default implementation passes the data through unchanged.
# filter(name)
filter() {
	cat -
}

# merge raw files: unique sort by id, retweetid.
# the files are TAB-separated; the sort keys are fields 5 and 8.
# the merged lines are written to stdout, error messages of sort are hidden.
# merge(name, oldfile, newfile)
merge() {
	sort -u -t "$(printf '\t')" -k 5,5 -k 8,8 "$2" "$3" 2>/dev/null
}

# order by timestamp (descending).
# reads TAB-separated lines from stdin and sorts them numerically, in reverse,
# on the first field; the result is written to stdout.
# order(name)
order() {
	sort -t "$(printf '\t')" -k 1rn,1
}

# fetch and parse feed.
# Runs the stages fetch, tscrape, filter, merge, order and copy for one feed in
# a background subshell. Each stage writes an intermediate file in
# ${tscrapetmpdir}, which is removed after the next stage succeeded. The final
# data is copied to ${tscrapepath}/<name with "/" replaced by "_">.
# The outcome is reported with log() as "OK" or "FAIL (<STAGE>)"; when a stage
# fails the remaining stages are skipped.
# feed(name, feedurl)
feed() {
	# a signal was received: do not start new jobs.
	if [ ${signo} -ne 0 ]; then
		return
	fi
	# wait until ${maxjobs} are finished: will stall the queue if an item
	# is slow, but it is portable.
	if [ $((curjobs % maxjobs)) -eq 0 ]; then
		wait
	fi
	# a signal may have been received while waiting.
	if [ ${signo} -ne 0 ]; then
		return
	fi
	curjobs=$((curjobs + 1))

	(
		name="$1"
		feedurl="$2"
		# the name is used as a filename: it may not contain a "/".
		filename="$(printf '%s' "$1" | tr '/' '_')"

		tscrapefile="${tscrapepath}/${filename}"
		tmpfeedfile="${tscrapetmpdir}/${filename}"

		# download the raw data.
		fetch "${name}" "${feedurl}" "${tscrapefile}" > "${tmpfeedfile}.fetch" || {
			log "${name}" "FAIL (FETCH)"
			return
		}

		# convert the raw data to TSV.
		tscrape < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.tsv" || {
			log "${name}" "FAIL (CONVERT)"
			return
		}
		rm -f "${tmpfeedfile}.fetch"

		filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter" || {
			log "${name}" "FAIL (FILTER)"
			return
		}
		rm -f "${tmpfeedfile}.tsv"

		# new feed data is empty: no need for below stages.
		[ -s "${tmpfeedfile}.filter" ] || {
			log "${name}" "OK"
			return
		}

		# if file does not exist yet "merge" with /dev/null.
		oldfile="/dev/null"
		if [ -e "${tscrapefile}" ]; then
			oldfile="${tscrapefile}"
		fi

		merge "${name}" "${oldfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge" || {
			log "${name}" "FAIL (MERGE)"
			return
		}
		rm -f "${tmpfeedfile}.filter"

		order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order" || {
			log "${name}" "FAIL (ORDER)"
			return
		}
		rm -f "${tmpfeedfile}.merge"

		# copy the result over the feed file.
		cp "${tmpfeedfile}.order" "${tscrapefile}" || {
			log "${name}" "FAIL (COPY)"
			return
		}
		rm -f "${tmpfeedfile}.order"

		# OK
		log "${name}" "OK"
	) &
}

# remove the temporary directory ${tscrapetmpdir} and all files in it.
# cleanup()
cleanup() {
	rm -r -f "${tscrapetmpdir}"
}

# signal handler: remember the signal number in the global $signo and send
# SIGTERM to the process group with the ID of this shell.
# sighandler(signo)
sighandler() {
	signo=$1
	# ignore TERM signal for myself.
	trap -- '' TERM
	# kill all running children >:D
	kill -TERM "-$$"
}

# default feeds function: it is expected to be replaced by a feeds function
# from the config file. When it is called it reports on stderr that the
# loaded config file ${config} does not provide one.
# feeds()
feeds() {
	# printf: echo may interpret backslashes in the path of the config file.
	printf 'Configuration file "%s" is invalid or does not contain a "feeds" function.\n' "${config}" >&2
	printf '%s\n' "See tscraperc.example for an example." >&2
}

# main: load the config, acquire a guest token, process all feeds from the
# config concurrently and clean up.
# exit status: 0 on success, 1 on a setup failure, signal number + 128 when
# interrupted by SIGINT or SIGTERM.

# job counter.
curjobs=0
# signal number received for parent.
signo=0
# SIGINT: signal to interrupt parent.
trap -- "sighandler 2" "INT"
# SIGTERM: signal to terminate parent.
trap -- "sighandler 15" "TERM"
# load config file.
loadconfig "$1"
# get guest token: this is done after loading the config file so the config
# is validated before any request is made and so it is able to override
# $bearer or guesttoken(), or to set $token itself.
if [ -z "${token}" ]; then
	token=$(guesttoken)
fi
# interrupted while acquiring the token: exit with signal number + 128.
[ ${signo} -ne 0 ] && exit $((signo+128))
if [ -z "${token}" ]; then
	echo "Failed to acquire guest token" >&2
	exit 1
fi
# fetch feeds and store in temporary directory.
# an empty ${tscrapetmpdir} would make feed() write its files below "/".
if ! tscrapetmpdir="$(mktemp -d '/tmp/tscrape_XXXXXX')" || [ -z "${tscrapetmpdir}" ]; then
	echo "Failed to create temporary directory" >&2
	exit 1
fi
# make sure path exists.
if ! mkdir -p "${tscrapepath}"; then
	cleanup
	exit 1
fi
# fetch feeds specified in config file.
feeds
# wait till all feeds are fetched (concurrently).
[ ${signo} -eq 0 ] && wait
# cleanup temporary files etc.
cleanup
# on signal SIGINT and SIGTERM exit with signal number + 128.
[ ${signo} -ne 0 ] && exit $((signo+128))
exit 0
