# sfeed_update
#!/bin/sh
# update feeds, merge with old feeds.
# NOTE: assumes "sfeed_*" executables are in $PATH.
# NOTE: the functions below (fetch, convertencoding, parse, ...) are default
# hooks; the sfeedrc config file may redefine any of them.

# defaults
sfeedpath="$HOME/.sfeed/feeds"

# used for processing feeds concurrently: wait until ${maxjobs} amount of
# feeds are finished at a time.
maxjobs=8

# load config (evaluate shellscript).
# loadconfig(configfile)
loadconfig() {
	# allow to specify config via argv[1].
	if [ "$1" != "" ]; then
		# get absolute path of config file required for including.
		config="$1"
		path=$(readlink -f "${config}" 2>/dev/null)
	else
		# default config location.
		config="$HOME/.sfeed/sfeedrc"
		path="${config}"
	fi

	# config is loaded here to be able to override $sfeedpath or functions.
	if [ -r "${path}" ]; then
		. "${path}"
	else
		printf "Configuration file \"%s\" cannot be read.\n" "${config}" >&2
		echo "See the sfeedrc.example file or the sfeedrc(5) man page for an example." >&2
		exit 1
	fi
}

# log a status line to stdout.
# log(name, s)
log() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2"
}

# log an error line to stderr.
# log_error(name, s)
log_error() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
	# set error exit status indicator for parallel jobs.
	rm -f "${sfeedtmpdir}/ok"
}

# fetch a feed via HTTP/HTTPS etc.
# fetch(name, url, feedfile)
fetch() {
	# fail on redirects (-L with --max-redirs 0 makes a redirect an
	# error), hide User-Agent, timeout is 15 seconds.
	curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
		"$2" 2>/dev/null
}

# convert encoding from one encoding to another.
# convertencoding(name, from, to)
convertencoding() {
	if [ "$2" != "" ] && [ "$3" != "" ] && [ "$2" != "$3" ]; then
		# -c: skip invalid sequences, -s: suppress warnings.
		iconv -cs -f "$2" -t "$3" 2> /dev/null
	else
		# else no convert, just output.
		cat
	fi
}

# parse and convert input, by default XML to the sfeed(5) TSV format.
# parse(name, feedurl, basesiteurl)
parse() {
	sfeed "$3"
}

# filter fields.
# filter(name)
filter() {
	cat
}

# merge raw files: unique sort by id, title, link.
# merge(name, oldfile, newfile)
merge() {
	# sfeed(5) fields are separated by the TAB character; build the
	# separator with printf so it is unambiguous.
	sort -t "$(printf '\t')" -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null
}

# order by timestamp (descending).
# order(name)
order() {
	sort -t "$(printf '\t')" -k1rn,1
}

# internal handler to fetch and process a feed.
# _feed(name, feedurl, [basesiteurl], [encoding])
_feed() {
	name="$1"
	feedurl="$2"
	basesiteurl="$3"
	encoding="$4"

	# the feed name doubles as its filename: replace '/' so it cannot
	# escape the feeds directory.
	filename="$(printf '%s' "${name}" | tr '/' '_')"
	sfeedfile="${sfeedpath}/${filename}"
	tmpfeedfile="${sfeedtmpdir}/feeds/${filename}"

	# if file does not exist yet create it.
	[ -e "${sfeedfile}" ] || touch "${sfeedfile}" 2>/dev/null

	if ! fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfile}.fetch"; then
		log_error "${name}" "FAIL (FETCH)"
		return 1
	fi

	# try to detect encoding (if not specified). if detecting the encoding fails assume utf-8.
	[ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfile}.fetch")

	if ! convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.utf8"; then
		log_error "${name}" "FAIL (ENCODING)"
		return 1
	fi
	rm -f "${tmpfeedfile}.fetch"

	# if baseurl is empty then use feedurl.
	if ! parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" < "${tmpfeedfile}.utf8" > "${tmpfeedfile}.tsv"; then
		log_error "${name}" "FAIL (PARSE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.utf8"

	if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
		log_error "${name}" "FAIL (FILTER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.tsv"

	# new feed data is empty: no need for below stages.
	if [ ! -s "${tmpfeedfile}.filter" ]; then
		log "${name}" "OK"
		return 0
	fi

	if ! merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
		log_error "${name}" "FAIL (MERGE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.filter"

	if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
		log_error "${name}" "FAIL (ORDER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.merge"

	# copy the result over the old feed file.
	if ! cp "${tmpfeedfile}.order" "${sfeedfile}"; then
		log_error "${name}" "FAIL (COPY)"
		return 1
	fi
	rm -f "${tmpfeedfile}.order"

	# OK
	log "${name}" "OK"
	return 0
}

# fetch and process a feed in parallel.
# feed(name, feedurl, [basesiteurl], [encoding])
feed() {
	# wait until ${maxjobs} are finished: will stall the queue if an item
	# is slow, but it is portable.
	[ ${signo} -ne 0 ] && return
	[ $((curjobs % maxjobs)) -eq 0 ] && wait
	[ ${signo} -ne 0 ] && return
	curjobs=$((curjobs + 1))

	_feed "$@" &
}

cleanup() {
	# remove temporary directory with feed files.
	rm -rf "${sfeedtmpdir}"
}

# record the received signal and terminate all children.
# sighandler(signo)
sighandler() {
	signo="$1"
	# ignore TERM signal for myself.
	trap -- "" TERM
	# kill all running children >:D
	kill -TERM -$$
}

# fallback: a valid sfeedrc config overrides this function.
feeds() {
	printf "Configuration file \"%s\" is invalid or does not contain a \"feeds\" function.\n" "${config}" >&2
	echo "See sfeedrc.example for an example." >&2
}

main() {
	# job counter.
	curjobs=0
	# signal number received for parent.
	signo=0
	# SIGINT: signal to interrupt parent.
	trap -- "sighandler 2" "INT"
	# SIGTERM: signal to terminate parent.
	trap -- "sighandler 15" "TERM"
	# load config file.
	loadconfig "$1"
	# fetch feeds and store in temporary directory.
	sfeedtmpdir="$(mktemp -d '/tmp/sfeed_XXXXXX')"
	mkdir -p "${sfeedtmpdir}/feeds"
	touch "${sfeedtmpdir}/ok"
	# make sure path exists.
	mkdir -p "${sfeedpath}"
	# fetch feeds specified in config file.
	feeds
	# wait till all feeds are fetched (concurrently).
	[ ${signo} -eq 0 ] && wait
	# check error exit status indicator for parallel jobs.
	[ -f "${sfeedtmpdir}/ok" ]
	status=$?
	# cleanup temporary files etc.
	cleanup
	# on signal SIGINT and SIGTERM exit with signal number + 128.
	[ ${signo} -ne 0 ] && exit $((signo+128))
	exit ${status}
}

# allow sourcing this file (e.g. by sfeed_update_xargs) without running main.
[ "${SFEED_UPDATE_INCLUDE}" = "1" ] || main "$@"