sfeed

simple feed reader - forked from git.codemadness.org/sfeed
git clone git://src.gearsix.net/sfeed | sfeed.zip
Log | Files | Refs | Atom | README | LICENSE

sfeed_update (raw) (6730B)


   1 #!/bin/sh
   2 # update feeds, merge with old feeds.
   3 # NOTE: assumes "sfeed_*" executables are in $PATH.
   4 
   5 # defaults
   6 sfeedpath="$HOME/.sfeed/feeds"
   7 
   8 # used for processing feeds concurrently: wait until ${maxjobs} amount of
   9 # feeds are finished at a time.
  10 maxjobs=16
  11 
  12 # load config (evaluate shellscript).
  13 # loadconfig(configfile)
  14 loadconfig() {
  15 	# allow to specify config via argv[1].
  16 	if [ "$1" != "" ]; then
  17 		# get absolute path of config file required for including.
  18 		config="$1"
  19 		configpath=$(readlink -f "${config}" 2>/dev/null)
  20 	else
  21 		# default config location.
  22 		config="$HOME/.sfeed/sfeedrc"
  23 		configpath="${config}"
  24 	fi
  25 
  26 	# config is loaded here to be able to override $sfeedpath or functions.
  27 	if [ -r "${configpath}" ] && [ -f "${configpath}" ]; then
  28 		. "${configpath}"
  29 	else
  30 		printf "Configuration file \"%s\" cannot be read.\n" "${config}" >&2
  31 		echo "See the sfeedrc.example file or the sfeedrc(5) man page for an example." >&2
  32 		die
  33 	fi
  34 }
  35 
  36 # log(name, s)
  37 log() {
  38 	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2"
  39 }
  40 
  41 # log_error(name, s)
  42 log_error() {
  43 	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
  44 	# set error exit status indicator for parallel jobs.
  45 	rm -f "${sfeedtmpdir}/ok"
  46 }
  47 
  48 # fetch a feed via HTTP/HTTPS etc.
  49 # fetch(name, url, feedfile)
  50 fetch() {
  51 	# fail on redirects, hide User-Agent, timeout is 15 seconds.
  52 	curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
  53 		"$2" 2>/dev/null
  54 }
  55 
  56 # convert encoding from one encoding to another.
  57 # convertencoding(name, from, to)
  58 convertencoding() {
  59 	if [ "$2" != "" ] && [ "$3" != "" ] && [ "$2" != "$3" ]; then
  60 		iconv -cs -f "$2" -t "$3" 2> /dev/null
  61 	else
  62 		# else no convert, just output.
  63 		cat
  64 	fi
  65 }
  66 
  67 # parse and convert input, by default XML to the sfeed(5) TSV format.
  68 # parse(name, feedurl, basesiteurl)
  69 parse() {
  70 	sfeed "$3"
  71 }
  72 
  73 # filter fields.
  74 # filter(name, url)
  75 filter() {
  76 	cat
  77 }
  78 
  79 # merge raw files: unique sort by id, title, link.
  80 # merge(name, oldfile, newfile)
  81 merge() {
  82 	sort -t '	' -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null
  83 }
  84 
  85 # order by timestamp (descending).
  86 # order(name, url)
  87 order() {
  88 	sort -t '	' -k1rn,1 2>/dev/null
  89 }
  90 
  91 # internal handler to fetch and process a feed.
  92 # _feed(name, feedurl, [basesiteurl], [encoding])
  93 _feed() {
  94 	name="$1"
  95 	feedurl="$2"
  96 	basesiteurl="$3"
  97 	encoding="$4"
  98 
  99 	filename="$(printf '%s' "${name}" | tr '/' '_')"
 100 	sfeedfile="${sfeedpath}/${filename}"
 101 	tmpfeedfile="${sfeedtmpdir}/feeds/${filename}"
 102 
 103 	# if file does not exist yet create it.
 104 	[ -e "${sfeedfile}" ] || touch "${sfeedfile}" 2>/dev/null
 105 
 106 	if ! fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfile}.fetch"; then
 107 		log_error "${name}" "FAIL (FETCH)"
 108 		return 1
 109 	fi
 110 
 111 	# try to detect encoding (if not specified). if detecting the encoding fails assume utf-8.
 112 	[ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfile}.fetch")
 113 
 114 	if ! convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.utf8"; then
 115 		log_error "${name}" "FAIL (ENCODING)"
 116 		return 1
 117 	fi
 118 	rm -f "${tmpfeedfile}.fetch"
 119 
 120 	# if baseurl is empty then use feedurl.
 121 	if ! parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" < "${tmpfeedfile}.utf8" > "${tmpfeedfile}.tsv"; then
 122 		log_error "${name}" "FAIL (PARSE)"
 123 		return 1
 124 	fi
 125 	rm -f "${tmpfeedfile}.utf8"
 126 
 127 	if ! filter "${name}" "${feedurl}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
 128 		log_error "${name}" "FAIL (FILTER)"
 129 		return 1
 130 	fi
 131 	rm -f "${tmpfeedfile}.tsv"
 132 
 133 	# new feed data is empty: no need for below stages.
 134 	if [ ! -s "${tmpfeedfile}.filter" ]; then
 135 		log "${name}" "OK"
 136 		return 0
 137 	fi
 138 
 139 	if ! merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
 140 		log_error "${name}" "FAIL (MERGE)"
 141 		return 1
 142 	fi
 143 	rm -f "${tmpfeedfile}.filter"
 144 
 145 	if ! order "${name}" "${feedurl}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
 146 		log_error "${name}" "FAIL (ORDER)"
 147 		return 1
 148 	fi
 149 	rm -f "${tmpfeedfile}.merge"
 150 
 151 	# copy
 152 	if ! cp "${tmpfeedfile}.order" "${sfeedfile}"; then
 153 		log_error "${name}" "FAIL (COPY)"
 154 		return 1
 155 	fi
 156 	rm -f "${tmpfeedfile}.order"
 157 
 158 	# OK
 159 	log "${name}" "OK"
 160 	return 0
 161 }
 162 
 163 # fetch and process a feed in parallel.
 164 # feed(name, feedurl, [basesiteurl], [encoding])
 165 feed() {
 166 	# Output job parameters for xargs.
 167 	# Specify fields as a single parameter separated by a NUL byte.
 168 	# The parameter is split into fields later by the child process, this
 169 	# allows using xargs with empty fields across many implementations.
 170 	printf '%s\037%s\037%s\037%s\037%s\037%s\0' \
 171 		"${config}" "${sfeedtmpdir}" "$1" "$2" "$3" "$4"
 172 }
 173 
 174 # cleanup()
 175 cleanup() {
 176 	# remove temporary directory with feed files.
 177 	rm -rf "${sfeedtmpdir}"
 178 }
 179 
 180 # die(statuscode)
 181 die() {
 182 	statuscode="${1:-1}" # default: exit 1
 183 	# cleanup temporary files etc.
 184 	cleanup
 185 	exit "${statuscode}"
 186 }
 187 
 188 # sighandler(signo)
 189 sighandler() {
 190 	signo="$1"
 191 	# ignore TERM signal for myself.
 192 	trap -- "" TERM
 193 	# kill all running children >:D
 194 	kill -TERM -$$
 195 }
 196 
 197 # feeds()
 198 feeds() {
 199 	printf "Configuration file \"%s\" is invalid or does not contain a \"feeds\" function.\n" "${config}" >&2
 200 	echo "See sfeedrc.example for an example." >&2
 201 	die
 202 }
 203 
 204 # runfeeds()
 205 runfeeds() {
 206 	# print feeds for parallel processing with xargs.
 207 	feeds > "${sfeedtmpdir}/jobs" || die
 208 	SFEED_UPDATE_CHILD="1" xargs -x -0 -P "${maxjobs}" -n 1 \
 209 		"$(readlink -f "${argv0}")" < "${sfeedtmpdir}/jobs"
 210 }
 211 
 212 # main(args...)
 213 main() {
 214 	# signal number received for parent.
 215 	signo=0
 216 	# SIGINT: signal to interrupt parent.
 217 	trap -- "sighandler 2" "INT"
 218 	# SIGTERM: signal to terminate parent.
 219 	trap -- "sighandler 15" "TERM"
 220 	# load config file.
 221 	loadconfig "$1"
 222 	# fetch feeds and store in temporary directory.
 223 	sfeedtmpdir="$(mktemp -d "${TMPDIR:-/tmp}/sfeed_XXXXXX")" || die
 224 	mkdir -p "${sfeedtmpdir}/feeds"
 225 	touch "${sfeedtmpdir}/ok" || die
 226 	# make sure path exists.
 227 	mkdir -p "${sfeedpath}"
 228 	# run and process the feeds.
 229 	runfeeds
 230 	statuscode=$?
 231 
 232 	# check error exit status indicator for parallel jobs.
 233 	[ -f "${sfeedtmpdir}/ok" ] || statuscode=1
 234 	# on signal SIGINT and SIGTERM exit with signal number + 128.
 235 	[ ${signo} -ne 0 ] && die $((signo+128))
 236 	die ${statuscode}
 237 }
 238 
 239 # process a single feed.
 240 # parameters are: config, tmpdir, name, feedurl, basesiteurl, encoding
 241 if [ "${SFEED_UPDATE_CHILD}" = "1" ]; then
 242 	[ "$1" = "" ] && exit 0 # must have an argument set
 243 	# IFS is "\037"
 244 	printf '%s\n' "$1" | \
 245 	while IFS="" read -r _config _tmpdir _name _feedurl _basesiteurl _encoding; do
 246 		loadconfig "${_config}"
 247 		sfeedtmpdir="${_tmpdir}"
 248 		_feed "${_name}" "${_feedurl}" "${_basesiteurl}" "${_encoding}"
 249 		exit "$?"
 250 	done
 251 	exit 0
 252 fi
 253 
 254 # ...else parent mode:
 255 argv0="$0" # store $0, in the zsh shell $0 is the name of the function.
 256 [ "${SFEED_UPDATE_INCLUDE}" = "1" ] || main "$@"