sfeed

simple feed reader - forked from https://git.codemadness.org/sfeed
git clone git://src.gearsix.net/sfeed

sfeed_update (raw) (5664B)


#!/bin/sh
# update feeds, merge with old feeds.
# NOTE: assumes "sfeed_*" executables are in $PATH.

# defaults
sfeedpath="$HOME/.sfeed/feeds"

# used for processing feeds concurrently: wait until ${maxjobs} feeds are
# finished at a time.
maxjobs=8
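# note: like sfeedpath above, maxjobs can be overridden from the sfeedrc
# config file, since the config is sourced after these defaults are set.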

# load config (evaluate shellscript).
# loadconfig(configfile)
loadconfig() {
	# allow specifying the config via argv[1].
	if [ "$1" != "" ]; then
		# get the absolute path of the config file, required for including.
		config="$1"
		path=$(readlink -f "${config}" 2>/dev/null)
	else
		# default config location.
		config="$HOME/.sfeed/sfeedrc"
		path="${config}"
	fi

	# config is loaded here to be able to override $sfeedpath or functions.
	if [ -r "${path}" ]; then
		. "${path}"
	else
		printf "Configuration file \"%s\" cannot be read.\n" "${config}" >&2
		echo "See the sfeedrc.example file or the sfeedrc(5) man page for an example." >&2
		exit 1
	fi
}

# log(name, s)
log() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2"
}

# log_error(name, s)
log_error() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
	# set the error exit status indicator for parallel jobs.
	rm -f "${sfeedtmpdir}/ok"
}

# fetch a feed via HTTP/HTTPS etc.
# fetch(name, url, feedfile)
fetch() {
	# fail on redirects, hide User-Agent, timeout is 15 seconds.
	curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
		"$2" 2>/dev/null
}
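
# example: fetch() is one of the functions an sfeedrc may redefine, since the
# config is sourced after these defaults (see loadconfig above). a minimal
# sketch using wget instead of curl; the flags shown are illustrative, not
# part of this script:
#
#	fetch() {
#		wget -q --timeout=15 -O - "$2" 2>/dev/null
#	}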

# convert text from one encoding to another.
# convertencoding(name, from, to)
convertencoding() {
	if [ "$2" != "" ] && [ "$3" != "" ] && [ "$2" != "$3" ]; then
		iconv -cs -f "$2" -t "$3" 2> /dev/null
	else
		# no conversion needed, just pass the input through.
		cat
	fi
}

# parse and convert input, by default XML to the sfeed(5) TSV format.
# parse(name, feedurl, basesiteurl)
parse() {
	sfeed "$3"
}

# filter fields.
# filter(name)
filter() {
	cat
}
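
# example: filter() receives the feed name as $1 and the sfeed(5) TSV lines on
# stdin, so an sfeedrc can redefine it to drop or rewrite items. a hedged
# sketch; the feed name and pattern are hypothetical:
#
#	filter() {
#		case "$1" in
#		"some feed")
#			# drop items whose title (field 2) matches a pattern.
#			awk -F '\t' '$2 !~ /sponsored/' ;;
#		*)
#			cat ;;
#		esac
#	}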

# merge raw files: unique sort by id, title, link.
# merge(name, oldfile, newfile)
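# (in the sfeed(5) TSV format field 1 is the timestamp, field 2 the title,
#  field 3 the link and field 6 the id, hence the sort keys used below.)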
merge() {
	sort -t '	' -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null
}

# order by timestamp (descending).
# order(name)
order() {
	sort -t '	' -k1rn,1
}

# internal handler to fetch and process a feed.
# _feed(name, feedurl, [basesiteurl], [encoding])
_feed() {
	name="$1"
	feedurl="$2"
	basesiteurl="$3"
	encoding="$4"

	filename="$(printf '%s' "${name}" | tr '/' '_')"
	sfeedfile="${sfeedpath}/${filename}"
	tmpfeedfile="${sfeedtmpdir}/feeds/${filename}"

	# if the file does not exist yet, create it.
	[ -e "${sfeedfile}" ] || touch "${sfeedfile}" 2>/dev/null

	if ! fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfile}.fetch"; then
		log_error "${name}" "FAIL (FETCH)"
		return 1
	fi

	# try to detect the encoding (if not specified). if detection fails assume utf-8.
	[ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfile}.fetch")

	if ! convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.utf8"; then
		log_error "${name}" "FAIL (ENCODING)"
		return 1
	fi
	rm -f "${tmpfeedfile}.fetch"

	# if basesiteurl is empty then use feedurl.
	if ! parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" < "${tmpfeedfile}.utf8" > "${tmpfeedfile}.tsv"; then
		log_error "${name}" "FAIL (PARSE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.utf8"

	if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
		log_error "${name}" "FAIL (FILTER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.tsv"

	# new feed data is empty: no need for the stages below.
	if [ ! -s "${tmpfeedfile}.filter" ]; then
		log "${name}" "OK"
		return 0
	fi

	if ! merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
		log_error "${name}" "FAIL (MERGE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.filter"

	if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
		log_error "${name}" "FAIL (ORDER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.merge"

	# copy the new merged and ordered feed over the old feed file.
	if ! cp "${tmpfeedfile}.order" "${sfeedfile}"; then
		log_error "${name}" "FAIL (COPY)"
		return 1
	fi
	rm -f "${tmpfeedfile}.order"

	# OK
	log "${name}" "OK"
	return 0
}

# fetch and process a feed in parallel.
# feed(name, feedurl, [basesiteurl], [encoding])
feed() {
	# wait until ${maxjobs} are finished: this will stall the queue if an
	# item is slow, but it is portable.
	[ ${signo} -ne 0 ] && return
	[ $((curjobs % maxjobs)) -eq 0 ] && wait
	[ ${signo} -ne 0 ] && return
	curjobs=$((curjobs + 1))

	_feed "$@" &
}
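
# example: an sfeedrc is a plain shellscript that defines a feeds() function
# which calls feed() once per feed, using the signature documented above. a
# minimal sketch; the names and URLs are only illustrative:
#
#	#sfeedpath="$HOME/.sfeed/feeds"
#
#	feeds() {
#		feed "codemadness" "https://codemadness.org/atom.xml"
#		feed "example" "https://example.org/feed.xml" "https://example.org" "utf-8"
#	}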

cleanup() {
	# remove the temporary directory with feed files.
	rm -rf "${sfeedtmpdir}"
}

sighandler() {
	signo="$1"
	# ignore TERM signal for myself.
	trap -- "" TERM
	# kill all running children >:D
	kill -TERM -$$
}

# default feeds(): used if the loaded config does not define a feeds() function.
feeds() {
	printf "Configuration file \"%s\" is invalid or does not contain a \"feeds\" function.\n" "${config}" >&2
	echo "See sfeedrc.example for an example." >&2
}

main() {
	# job counter.
	curjobs=0
	# signal number received for parent.
	signo=0
	# SIGINT: signal to interrupt parent.
	trap -- "sighandler 2" "INT"
	# SIGTERM: signal to terminate parent.
	trap -- "sighandler 15" "TERM"
	# load the config file.
	loadconfig "$1"
	# fetch feeds and store them in a temporary directory.
	sfeedtmpdir="$(mktemp -d '/tmp/sfeed_XXXXXX')"
	mkdir -p "${sfeedtmpdir}/feeds"
	touch "${sfeedtmpdir}/ok"
	# make sure the feeds path exists.
	mkdir -p "${sfeedpath}"
	# fetch the feeds specified in the config file.
	feeds
	# wait until all feeds are fetched (concurrently).
	[ ${signo} -eq 0 ] && wait
	# check the error exit status indicator for parallel jobs.
	[ -f "${sfeedtmpdir}/ok" ]
	status=$?
	# cleanup temporary files etc.
	cleanup
	# on signals SIGINT and SIGTERM exit with the signal number + 128.
	[ ${signo} -ne 0 ] && exit $((signo+128))
	exit ${status}
}

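# when SFEED_UPDATE_INCLUDE is set to "1" only define the functions above and
# do not run main, so other scripts can source this file and reuse them.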
[ "${SFEED_UPDATE_INCLUDE}" = "1" ] || main "$@"
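
Usage sketch (assuming sfeed and its helper tools are installed in $PATH and an
sfeedrc config exists; the file names below are only illustrative):

	sfeed_update                      # update using the default $HOME/.sfeed/sfeedrc
	sfeed_update /path/to/sfeedrc     # or pass a config file explicitly
	sfeed_plain ~/.sfeed/feeds/*      # format the merged TSV feed files for reading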