sfeed

simple feed reader - forked from git.codemadness.org/sfeed
git clone git://src.gearsix.net/sfeed

sfeed_update (5664B)


#!/bin/sh
# update feeds, merge with old feeds.
# NOTE: assumes "sfeed_*" executables are in $PATH.

# defaults
sfeedpath="$HOME/.sfeed/feeds"

# used for processing feeds concurrently: wait until ${maxjobs} feeds have
# finished before starting more.
maxjobs=8

# load config (evaluate shellscript).
# loadconfig(configfile)
loadconfig() {
	# allow specifying the config via argv[1].
	if [ "$1" != "" ]; then
		# get the absolute path of the config file, required for including.
		config="$1"
		path=$(readlink -f "${config}" 2>/dev/null)
	else
		# default config location.
		config="$HOME/.sfeed/sfeedrc"
		path="${config}"
	fi

	# config is loaded here to be able to override $sfeedpath or functions.
	if [ -r "${path}" ]; then
		. "${path}"
	else
		printf "Configuration file \"%s\" cannot be read.\n" "${config}" >&2
		echo "See the sfeedrc.example file or the sfeedrc(5) man page for an example." >&2
		exit 1
	fi
}
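
# For reference, a minimal sfeedrc: a shellscript that must define a feeds()
# function which calls feed(name, feedurl, [basesiteurl], [encoding]) per
# feed, and may override $sfeedpath, $maxjobs or any of the hook functions
# below. The name and URL here are only illustrative:
#
#	#sfeedpath="$HOME/.sfeed/feeds"
#	#maxjobs=16
#	feeds() {
#		feed "example" "https://example.org/atom.xml"
#	}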

# log(name, s)
log() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2"
}

# log_error(name, s)
log_error() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
	# set error exit status indicator for parallel jobs.
	rm -f "${sfeedtmpdir}/ok"
}

# fetch a feed via HTTP/HTTPS etc.
# fetch(name, url, feedfile)
fetch() {
	# fail on redirects, hide User-Agent, timeout is 15 seconds.
	curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
		"$2" 2>/dev/null
}
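
# Example alternative fetcher (hypothetical sfeedrc override): use wget
# instead of curl. Note: unlike the curl line above, wget follows redirects
# by default.
#fetch() {
#	wget -q --timeout=15 -O - "$2" 2>/dev/null
#}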

# convert encoding from one encoding to another.
# convertencoding(name, from, to)
convertencoding() {
	if [ "$2" != "" ] && [ "$3" != "" ] && [ "$2" != "$3" ]; then
		iconv -cs -f "$2" -t "$3" 2> /dev/null
	else
		# no conversion needed, just pass the input through.
		cat
	fi
}
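
# The source encoding is normally auto-detected in _feed() below; it can be
# forced per feed from sfeedrc via the optional fourth feed() argument, for
# example (hypothetical feed):
#	feed "legacy site" "http://example.org/rss.xml" "" "iso-8859-1"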

# parse and convert input, by default XML to the sfeed(5) TSV format.
# parse(name, feedurl, basesiteurl)
parse() {
	sfeed "$3"
}

# filter fields.
# filter(name)
filter() {
	cat
}
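
# Example (hypothetical sfeedrc override): drop items whose title matches a
# pattern. Input is the TAB-separated sfeed(5) format; field 2 is the title.
#filter() {
#	awk -F '\t' 'tolower($2) !~ /sponsored/'
#}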

# merge raw files: unique sort by id, title, link.
# merge(name, oldfile, newfile)
merge() {
	sort -t '	' -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null
}
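
# note: the sort keys above select the sfeed(5) TSV fields: 1=timestamp,
# 2=title, 3=link, 6=id.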

# order by timestamp (descending).
# order(name)
order() {
	sort -t '	' -k1rn,1
}
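
# Example (hypothetical sfeedrc override): keep only the 50 newest items.
# Note that items beyond this limit are dropped from the feed file for good.
#order() {
#	sort -t '	' -k1rn,1 | head -n 50
#}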

# internal handler to fetch and process a feed.
# _feed(name, feedurl, [basesiteurl], [encoding])
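# stages: fetch -> convertencoding -> parse -> filter -> merge -> order -> copy.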
_feed() {
	name="$1"
	feedurl="$2"
	basesiteurl="$3"
	encoding="$4"

	filename="$(printf '%s' "${name}" | tr '/' '_')"
	sfeedfile="${sfeedpath}/${filename}"
	tmpfeedfile="${sfeedtmpdir}/feeds/${filename}"

	# if the file does not exist yet, create it.
	[ -e "${sfeedfile}" ] || touch "${sfeedfile}" 2>/dev/null

	if ! fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfile}.fetch"; then
		log_error "${name}" "FAIL (FETCH)"
		return 1
	fi

	# try to detect the encoding (if not specified). if detection fails assume utf-8.
	[ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfile}.fetch")

	if ! convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.utf8"; then
		log_error "${name}" "FAIL (ENCODING)"
		return 1
	fi
	rm -f "${tmpfeedfile}.fetch"

	# if basesiteurl is empty then use feedurl.
	if ! parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" < "${tmpfeedfile}.utf8" > "${tmpfeedfile}.tsv"; then
		log_error "${name}" "FAIL (PARSE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.utf8"

	if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
		log_error "${name}" "FAIL (FILTER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.tsv"

	# new feed data is empty: the stages below are not needed.
	if [ ! -s "${tmpfeedfile}.filter" ]; then
		log "${name}" "OK"
		return 0
	fi

	if ! merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
		log_error "${name}" "FAIL (MERGE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.filter"

	if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
		log_error "${name}" "FAIL (ORDER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.merge"

	# copy the ordered result over the old feed file.
	if ! cp "${tmpfeedfile}.order" "${sfeedfile}"; then
		log_error "${name}" "FAIL (COPY)"
		return 1
	fi
	rm -f "${tmpfeedfile}.order"

	# OK
	log "${name}" "OK"
	return 0
}

# fetch and process a feed in parallel.
# feed(name, feedurl, [basesiteurl], [encoding])
feed() {
	# wait until ${maxjobs} jobs are finished: this can stall the queue if
	# one item is slow, but it is portable.
	[ ${signo} -ne 0 ] && return
	[ $((curjobs % maxjobs)) -eq 0 ] && wait
	[ ${signo} -ne 0 ] && return
	curjobs=$((curjobs + 1))

	_feed "$@" &
}

cleanup() {
	# remove temporary directory with feed files.
	rm -rf "${sfeedtmpdir}"
}

sighandler() {
	signo="$1"
	# ignore TERM signal for myself.
	trap -- "" TERM
	# kill all running children >:D
	kill -TERM -$$
}
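
# note: the "-" in -$$ above targets the whole process group, so the
# backgrounded _feed jobs receive SIGTERM as well.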

feeds() {
	printf "Configuration file \"%s\" is invalid or does not contain a \"feeds\" function.\n" "${config}" >&2
	echo "See sfeedrc.example for an example." >&2
}

main() {
	# job counter.
	curjobs=0
	# signal number received for parent.
	signo=0
	# SIGINT: signal to interrupt parent.
	trap -- "sighandler 2" "INT"
	# SIGTERM: signal to terminate parent.
	trap -- "sighandler 15" "TERM"
	# load config file.
	loadconfig "$1"
	# fetch feeds and store in temporary directory.
	sfeedtmpdir="$(mktemp -d '/tmp/sfeed_XXXXXX')"
	mkdir -p "${sfeedtmpdir}/feeds"
	touch "${sfeedtmpdir}/ok"
	# make sure path exists.
	mkdir -p "${sfeedpath}"
	# fetch feeds specified in config file.
	feeds
	# wait until all feeds are fetched (concurrently).
	[ ${signo} -eq 0 ] && wait
	# check the error exit status indicator for parallel jobs.
	[ -f "${sfeedtmpdir}/ok" ]
	status=$?
	# cleanup temporary files etc.
	cleanup
	# on signal SIGINT and SIGTERM exit with signal number + 128.
	[ ${signo} -ne 0 ] && exit $((signo+128))
	exit ${status}
}

    225 [ "${SFEED_UPDATE_INCLUDE}" = "1" ] || main "$@"