#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
#

#
# rssread - a simple RSS2.0 reader with RSS to XHTML to
# plaintext conversion.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# Print all arguments, joined into one line, on stderr.
function printmsg
{
	print -u2 "$*"
}

# Debug hook: a no-op unless the printmsg call below is uncommented.
function debugmsg
{
#	printmsg "$*"
true
}

# Print "<progname>: <message>" on stderr and terminate the script with
# exit code 1. Reads the global variable "progname".
function fatal_error
{
	print -u2 "${progname}: $*"
	exit 1
}

# urlconnection_t - minimal URL fetcher type for "file", "http" and
# "https" URLs.
#
# Typical use (as done by do_rssread):
#	urlconnection_t hc
#	hc.init_url "http://host/path"
#	hc.cat_url		# writes the document body to stdout
typeset -T urlconnection_t=(
	# public
	typeset user_agent="ksh93/urlconnection_t"

	# private variables
	typeset protocol	# part of the URL before "://"
	typeset path1		# part of the URL after "://"
	typeset host		# host (still including ":port" until cat_url strips it)
	typeset path		# path without the leading "/"
	typeset port

	compound netfd=(
		integer in=-1  # incoming traffic
		integer out=-1 # outgoing traffic
	)

	# only used for https
	compound ssl=(
		compound fifo=(
			typeset dir=""
			typeset in=""
			typeset out=""
		)
		integer openssl_client_pid=-1
	)

	# parse HTTP return code, cookies etc.
	#
	# Reads the status line and the headers from stdin and stores
	# "statuscode", "statusmsg" and - when the matching header is
	# present - "content_type", "content_length" and
	# "transfer_encoding" in the compound variable named by $1.
	# Returns 1 if the status line is malformed, 0 otherwise.
	function parse_http_response
	{
		nameref response="$1"
		typeset h statuscode statusmsg i

		# we use '\r' as additional IFS to filter the final '\r'
		IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
		[[ "$h" != ~(Eil)HTTP/.* ]]          && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
		# "+" (not "*"): an empty status code must be rejected, too
		[[ "$statuscode" != ~(Elr)[0-9]+ ]]  && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }
		response.statuscode="$statuscode"
		response.statusmsg="$statusmsg"

		# skip remaining headers
		while IFS='' read -r i ; do
			# a line consisting only of '\r' ends the header section
			[[ "$i" == $'\r' ]] && break

			# strip '\r' at the end
			i="${i/~(Er)$'\r'/}"

			case "$i" in
				~(Eli)Content-Type:.*)
					response.content_type="${i/~(El).*:[[:blank:]]*/}"
					;;
				~(Eli)Content-Length:[[:blank:]]*[0-9]*)
					integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
					;;
				~(Eli)Transfer-Encoding:.*)
					response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
					;;
			esac
		done

		return 0
	}

	# Copy the HTTP body from stdin to stdout.
	# $1 is the transfer encoding; for "chunked" each chunk is read
	# according to its hexadecimal size line, anything else is copied
	# verbatim with cat.
	function cat_http_body
	{
		typeset emode="$1"
		typeset hexchunksize="0"
		integer chunksize=0

		if [[ "${emode}" == "chunked" ]] ; then
			# -r: never treat a backslash in the size line as an escape
			while IFS=$'\n' read -r hexchunksize ; do
				hexchunksize="${hexchunksize//$'\r'/}"
				# skip the empty line which follows each chunk
				[[ "${hexchunksize}" != "" ]] || continue
				[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break
				(( chunksize=$( printf "16#%s\n" "${hexchunksize}" ) ))
				# a chunk size of 0 marks the end of the body
				(( chunksize > 0 )) || break
				dd bs=1 count="${chunksize}" 2>/dev/null
			done
		else
			cat
		fi

		return 0
	}

	# Split the URL in $1 into protocol/path1 and - for http and
	# https - host, path and port. If the URL carries no explicit
	# port then "port" ends up equal to "host"; cat_url relies on
	# that to detect the missing port.
	function init_url
	{
		_.protocol="${1%://*}"
		_.path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

		if [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			_.host="${_.path1%%/*}"
			if [[ "${_.path1}" == */* ]] ; then
				_.path="${_.path1#*/}"
			else
				# URL without any path ("http://host"): request
				# the root document instead of "/<host>"
				_.path=""
			fi
			_.port="${_.host##*:}"
		fi

		return 0
	}

	# close connection
	#
	# Closes the network file descriptors and, for https, waits for
	# the openssl child and removes the temporary fifo directory.
	# Returns 1 if the openssl child failed, 0 otherwise.
	function close_connection
	{
		integer ret=0
		# remember both values before they are reset below, otherwise
		# a descriptor shared by "in" and "out" would be closed twice
		integer fd_in=${_.netfd.in}
		integer fd_out=${_.netfd.out}

		if (( fd_in != -1 )) ; then
			redirect {_.netfd.in}<&-
			(( _.netfd.in=-1 ))
		fi

		if (( fd_out != -1 && fd_out != fd_in )) ; then
			redirect {_.netfd.out}<&-
		fi
		(( _.netfd.out=-1 ))

		if [[ "${_.protocol}" == "https" ]] ; then
			# do not return early on failure - the fifo directory
			# below must be removed in any case
			wait ${_.ssl.openssl_client_pid} || { print -u2 -f "%s: openssl failed.\n" "$0" ; ret=1 ; }
			(( _.ssl.openssl_client_pid=-1 ))

			if [[ "${_.ssl.fifo.dir}" != "" ]] ; then
				rm -r "${_.ssl.fifo.dir}"
				_.ssl.fifo.dir=""
			fi
		fi

		return ${ret}
	}

	# Open the connection to ${_.host}:${_.port}.
	# http: opens /dev/tcp/<host>/<port> read/write and uses the same
	# descriptor for both directions.
	# https: creates two fifos in a temporary directory and starts an
	# "openssl s_client" child connected to them.
	# Returns 1 if the plain TCP connection could not be opened.
	function open_connection
	{
		if [[ "${_.protocol}" == "https" ]] ; then
			_.ssl.fifo.dir="$(mktemp -t -d)"
			_.ssl.fifo.in="${_.ssl.fifo.dir}/in"
			_.ssl.fifo.out="${_.ssl.fifo.dir}/out"

			# Use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			set -o errexit

			mkfifo "${_.ssl.fifo.in}" "${_.ssl.fifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${_.host}:${_.port}" <"${_.ssl.fifo.in}" >>"${_.ssl.fifo.out}" &

			_.ssl.openssl_client_pid=$!
		else
			redirect {_.netfd.in}<> "/dev/tcp/${_.host}/${_.port}"
			# this function gets no arguments, so report host and port
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${_.host}:${_.port}" ; return 1 ; }
			(( _.netfd.out=_.netfd.in ))
		fi
		return 0
	}

	# Send the request text in $1, followed by one more "\r\n".
	# "print" is used without -r, i.e. the "\r\n" sequences in the
	# request are expanded here. For https the request is written to
	# the input fifo and the output fifo is then opened as netfd.in.
	function send_request
	{
		typeset request="$1"

		set -o errexit

		if [[ "${_.protocol}" == "https" ]] ; then
			print -n -- "${request}\r\n" >>	"${_.ssl.fifo.in}"

			redirect {_.netfd.in}< "${_.ssl.fifo.out}"
		else
			print -n -- "${request}\r\n" >&${_.netfd.out}
		fi
		return 0
	}

	# Write the document of the URL set via init_url to stdout.
	# file: cat the local file and return cat's exit code.
	# http/https: send a GET request and write the response body.
	# Returns 1 for unsupported protocols, failed prechecks or a
	# failed open_connection.
	function cat_url
	{
		typeset request	# keep the request text out of the global scope

		if [[ "${_.protocol}" == "file" ]] ; then
			cat "${_.path1}"
			return $?
		elif [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			compound httpresponse # http response

			# If URL did not contain a port number in the host part then look at the
			# protocol to get the port number
			if [[ "${_.port}" == "${_.host}" ]] ; then
				case "${_.protocol}" in
					"http")  _.port=80 ;;
					"https") _.port=443 ;;
					*)       _.port="$(getent services "${_.protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
				esac
			else
				_.host="${_.host%:*}"
			fi

			printmsg "protocol=${_.protocol} port=${_.port} host=${_.host} path=${_.path}"

			# prechecks
			# (an empty path is fine, it requests the root document "/")
			[[ "${_.protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
			[[ "${_.port}"     != "" ]] || { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
			[[ "${_.host}"     != "" ]] || { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }

			_.open_connection || return 1

			# send HTTP request
			request="GET /${_.path} HTTP/1.1\r\n"
			request+="Host: ${_.host}\r\n"
			request+="User-Agent: ${_.user_agent}\r\n"
			request+="Connection: close\r\n"
			_.send_request "${request}\r\n"

			# collect response and send it to stdout
			{
				_.parse_http_response httpresponse
				_.cat_http_body "${httpresponse.transfer_encoding}"
			} <&${_.netfd.in}

			_.close_connection

			return 0
		else
			return 1
		fi
		# notreached
	}
)

# Filter: copy stdin to stdout and replace "&name;" entities.
#
# - known names (nbsp, lt, le, gt, ge, amp, quot, apos) are replaced
#   by their ASCII text
# - "&#NNN;" (decimal) and "&#xHHH;" (hexadecimal) are replaced by the
#   matching character via printf "\u[...]"
# - a name made only of hexadecimal digits is treated as a character
#   code, too (behaviour kept from the original implementation)
# - anything else is printed as "ENT=|name|"
# - a "&" which is not followed by a well-formed entity is passed
#   through unchanged
function html_entity_to_ascii
{
	typeset entity
	typeset c
	typeset value
	typeset digits
	typeset terminated # bool: true/false (did we see the closing ";" ?)

	# Todo: Add more HTML/MathML entities here
	# Note we use a static variable (typeset -S) here to make sure we
	# don't lose the cache data between calls
	typeset -S -A entity_cache=(
		# entity to ascii (fixme: add UTF-8 transliterations)
		["nbsp"]=' '
		["lt"]='<'
		["le"]='<='
		["gt"]='>'
		["ge"]='>='
		["amp"]='&'
		["quot"]='"'
		["apos"]="'"
	)

	while IFS='' read -r -N 1 c ; do
		if [[ "$c" != "&" ]] ; then
			print -n -r -- "${c}"
			continue
		fi

		# collect the entity name up to the closing ';'
		entity=""
		terminated=false
		while IFS='' read -r -N 1 c ; do
			case "$c" in
				";")
					terminated=true
					break
					;;
				~(Eilr)[a-z0-9#])
					entity+="$c"
					continue
					;;
				*)
#					debugmsg "error &${entity}${c}#"

					# not an entity - pass the text through
					# unchanged, including the leading '&'
					print -n -r -- "&${entity}${c}"
					entity=""
					continue 2
					;;
			esac
		done

		# "&;" or a lone '&' at the end of the input: nothing to decode
		if [[ "${entity}" == "" ]] ; then
			print -n -r -- "&"
			if ${terminated} ; then
				print -n -r -- ";"
			fi
			continue
		fi

		value=""
		if [[ "${entity_cache["${entity}"]}" != "" ]] ; then
#			debugmsg "match #${entity}# = #${entity_cache["${entity}"]}#"
			value="${entity_cache["${entity}"]}"
		else
			if [[ "${entity}" == '#'[xX]+([[:xdigit:]]) ]] ; then
				# hexadecimal character reference ("&#x41;")
				value="${ printf "\u[${entity:2:8}]" ; }"
			elif [[ "${entity}" == '#'+([0-9]) ]] ; then
				# decimal literal
				# strip leading zeros ("&#039;") so that printf
				# cannot take the number for an octal value
				digits="${entity:1:8}"
				digits="${digits##+(0)}"
				[[ "${digits}" == "" ]] && digits="0"
				value="${ printf "\u[${ printf "%x" "${digits}" ; }]" ; }"
			elif [[ "${entity:0:7}" == ~(Eilr)[0-9a-f]* ]] ; then
				# hexadecimal literal
				value="${ printf "\u[${entity:0:7}]" ; }"
			else
				# unknown literal - pass-through
				value="ENT=|${entity}|"
			fi

			entity_cache["${entity}"]="${value}"

#			debugmsg "lookup #${entity}# = #${entity_cache["${entity}"]}#"
		fi

		printf "%s" "${value}"
	done

	return 0
}

# dumb xhtml handler - no CSS, tables, images, iframes or nested
# structures are supported (and we assume that the input is correct
# xhtml). The code was written in a trial-and-error manner and should be
# rewritten to parse xhtml correctly.
#
# xml_tok callback. Arguments:
#	$1	name of the callback array (also used as state storage)
#	$2	event type (tag_begin, tag_end, tag_text, document_start,
#		document_end)
#	$3	tag name or text content
function handle_html
{
	# we can't use global variables here when multiple callbacks use the same
	# callback function - but we can use the callback associative array for
	# variable storage instead
	nameref callbacks=${1}
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				br)	printf "\n" ;;
				hr)	printf "\n-------------------------------------\n" ;;
				pre)	callbacks["html_pre"]='true' ;;
				p)	printf "\n" ;;
			esac
			;;

		tag_end)
			case "${tag_value}" in
				pre)	callbacks["html_pre"]='false' ;;
			esac
			;;

		tag_text)
			# callbacks["html_pre"] holds the command name "true" or
			# "false" and is executed as the condition
			if ${callbacks["html_pre"]} ; then
				printf "%s" "${tag_value}"
			else
				# compress spaces/newlines/tabs/etc.
				printf "%s" "${tag_value//+([\n\r\t\v[:space:][:blank:]])/ }"
			fi
			;;

		document_start)
			callbacks["html_pre"]='false'
			;;
		document_end) ;;
	esac

	return 0
}

# xml_tok callback for RSS data; same arguments as handle_html.
# Text content is collected in callbacks["textbuf"], stored in the
# global associative array "item" when a known tag ends, and the whole
# item is printed as plain text when </item> is seen.
function handle_rss
{
	# we can't use global variables here when multiple callbacks use the same
	# callback function - but we can use the callback associative array for
	# variable storage instead
	nameref callbacks=${1}
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				item)
					# reset every field printed at </item>, otherwise
					# an item without <author>/<pubDate> would show
					# the values of the previous item
					item["title"]=""
					item["link"]=""
					item["tag"]=""
					item["description"]=""
					item["author"]=""
					item["pubDate"]=""
					;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_end)
			case "${tag_value}" in
				item)
					# note that each RSS item needs to be converted separately from RSS to HTML to plain text
					# to make sure that the state of one RSS item doesn't affect others
					(
						printf $"<br />#### RSS item: title: %s ####" "${item["title"]}"
						printf $"<br />## author: %s" "${item["author"]}"
						printf $"<br />## link:   %s" "${item["link"]}"
						printf $"<br />## date:   %s" "${item["pubDate"]}"
						printf $"<br />## begin description:"
						printf $"<br />%s<br />" "${item["description"]}"
						printf $"<br />## end description<br />"
						print # extra newline to make sure the sed pipeline gets flushed
					) |
						html_entity_to_ascii |	# convert XML entities (e.g. decode RSS content to HTML code)
						xml_tok "xhtmltok_cb" |	# convert HTML to plain text
						html_entity_to_ascii	# convert HTML entities
					;;
				title)                item["title"]="${callbacks["textbuf"]}"        ; callbacks["textbuf"]="" ;;
				link)                 item["link"]="${callbacks["textbuf"]}"         ; callbacks["textbuf"]="" ;;
				dc:creator | author)  item["author"]="${callbacks["textbuf"]}"       ; callbacks["textbuf"]="" ;;
				dc:date | pubDate)    item["pubDate"]="${callbacks["textbuf"]}"      ; callbacks["textbuf"]="" ;;
				description)          item["description"]="${callbacks["textbuf"]}"  ; callbacks["textbuf"]="" ;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_text)
			callbacks["textbuf"]+="${tag_value}"
			;;
		document_start) ;;
		document_end) ;;
	esac
	return 0
}

# Simple XML tokenizer: reads XML from stdin character by character
# and calls the functions stored in the associative array named by $1:
#	callbacks["document_start"]	before the first character is read
#	callbacks["tag_text"]		text between tags ($3 = text)
#	callbacks["tag_comment"]	comment ($3 = comment text)
#	callbacks["tag_begin"]		start tag ($3 = name, $4 = attributes)
#	callbacks["tag_end"]		end tag ($3 = name)
#	callbacks["document_end"]	after the input is exhausted
# Every callback receives the array name as $1 and the event type as
# $2. Unset callbacks are skipped.
function xml_tok
{
	typeset buf=""
	typeset namebuf=""
	typeset attrbuf=""
	typeset c=""
	typeset isendtag # bool: true/false
	typeset issingletag # bool: true/false (used for tags like "<br />")
	nameref callbacks=${1}

	[[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

	while IFS='' read -r -N 1 c ; do
		isendtag=false

		if [[ "$c" == "<" ]] ; then
			# flush any text content
			if [[ "$buf" != "" ]] ; then
				[[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
				buf=""
			fi

			# a '/' directly after '<' marks an end tag
			IFS='' read -r -N 1 c
			if [[ "$c" == "/" ]] ; then
				isendtag=true
			else
				buf="$c"
			fi
			# read the remainder of the tag up to (excluding) '>'
			IFS='' read -r -d '>' c
			buf+="$c"

			# handle comments
			if [[ "$buf" == ~(El)!-- ]] ; then
				# did we read the comment completely ?
				if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
					# the '>' we stopped at was part of the comment text
					buf+=">"
					while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
						IFS='' read -r -N 1 c || break
						buf+="$c"
					done
				fi

				[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
				buf=""
				continue
			fi

			# check if the tag starts and ends at the same time (like "<br />")
			if [[ "${buf}" == ~(Er).*/ ]] ; then
				issingletag=true
				buf="${buf%*/}"
			else
				issingletag=false
			fi

			# check if the tag has attributes (e.g. space after name)
			if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
				namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
				attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
			else
				namebuf="$buf"
				attrbuf=""
			fi

			if ${isendtag} ; then
				[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
			else
				[[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

				# handle tags like <br/> (which are start- and end-tag in one piece)
				if ${issingletag} ; then
					[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
				fi
			fi
			buf=""
		else
			buf+="$c"
		fi
	done

	[[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

	print # final newline to make filters like "sed" happy
}

# return the value of LC_MESSAGES needed for subprocesses which
# want to run in a different locale/encoding
# (first non-empty value of LC_ALL, LC_MESSAGES, LANG; "C" otherwise)
function get_lc_messages
{
	[[ "${LC_ALL}"       != "" ]] && { print "${LC_ALL}"      ; return 0 ; }
	[[ "${LC_MESSAGES}"  != "" ]] && { print "${LC_MESSAGES}" ; return 0 ; }
	[[ "${LANG}"         != "" ]] && { print "${LANG}"        ; return 0 ; }
	print "C" ; return 0
}

# Fetch the RSS document at URL $1 and print it as plain text on
# stdout; progress information goes to stderr.
function do_rssread
{
	typeset data	# fetched RSS document (kept out of the global scope)

	# set unicode locale since RSS is encoded in UTF-8
	# (and make sure $LC_MESSAGES is set to the parent
	# process's locale that all error messages are using
	# the callers locale/encoding)
	export \
		LC_MESSAGES="${ get_lc_messages ; }" \
		LC_MONETARY="en_US.UTF-8" \
		LC_NUMERIC="en_US.UTF-8" \
		LC_COLLATE="en_US.UTF-8" \
		LC_CTYPE="en_US.UTF-8" \
		LC_TIME="en_US.UTF-8" \
		LANG="en_US.UTF-8"

	# return non-zero exit code for this function if the rss processing below fails
	set -o errexit

	urlconnection_t hc
	hc.user_agent="rssread/ksh93(ssl) (2010-03-27; $(uname -s -r -p))"
	hc.init_url "$1"

	# need extra newline after cat_url to terminate line with $'\n'
	# to make "xml_tok" happy
	data="${ hc.cat_url ; print ; }"

	print -u2 -f "# Got %d lines of RSS data, processing...\n" "${ wc -l <<< "${data}" ; }"

	xml_tok "rsstok_cb" <<< "${data}"

	return 0
}

# Print the usage message generated by getopts and exit with code 2.
function usage
{
	OPTIND=0
	getopts -a "${progname}" "${rssread_usage}" OPT '-?'
	exit 2
}

# make sure we use the ksh93 builtin versions
builtin basename
builtin cat
builtin mkfifo

typeset -A rsstok_cb # callbacks for xml_tok
rsstok_cb["tag_begin"]="handle_rss"
rsstok_cb["tag_end"]="handle_rss"
rsstok_cb["tag_text"]="handle_rss"
rsstok_cb["textbuf"]=""

typeset -A xhtmltok_cb # callbacks for xml_tok
xhtmltok_cb["tag_begin"]="handle_html"
xhtmltok_cb["tag_end"]="handle_html"
xhtmltok_cb["tag_text"]="handle_html"
xhtmltok_cb["textbuf"]=""
xhtmltok_cb["html_pre"]='false'

# fields of the RSS item currently being collected by handle_rss
typeset -A item

typeset -A bookmark_urls

# "random" urls for testing
bookmark_urls=(
	["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=(%22ksh93%22%7C%22ksh+93%22+%7C+%22korn93%22+%7C+%22korn+93%22)&ie=utf-8&num=100&output=rss"
	# some Sun staff/sites
	["blogs_sun_com"]="http://blogs.sun.com/main/feed/entries/rss"
	["bigadmin"]="http://www.sun.com/bigadmin/content/rss/motd.xml"
	["bigadmin_scripts"]="https://www.sun.com/bigadmin/content/rss/scripts.xml"
	["jmcp"]="http://www.jmcp.homeunix.com/roller/jmcp/feed/entries/rss"
	["katakai"]="http://blogs.sun.com/katakai/feed/entries/rss"
	["alanc"]="http://blogs.sun.com/alanc/feed/entries/rss"
	["planetsun"]="http://www.planetsun.org/rss20.xml"
	["planetsolaris"]="http://www.planetsolaris.org/rss20.xml"
	["planetopensolaris"]="http://planet.opensolaris.org/rss20.xml"
	["theregister_uk"]="http://www.theregister.co.uk/headlines.rss"
	["heise"]="http://www.heise.de/newsticker/heise.rdf"
	["slashdot"]="http://rss.slashdot.org/Slashdot/slashdot"
	["wikipedia_command_shells"]="http://en.wikipedia.org/w/index.php?title=Comparison_of_command_shells&feed=rss&action=history"
)

typeset progname="${ basename "${0}" ; }"

typeset -r rssread_usage=$'+
[-?\n@(#)\$Id: rssread (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b RSS to plain text converter
	which fetches RSS streams via HTTP and converts them from
	RSS to HTML to plain text in the current locale/encoding.]
[I:noiconv?Do not convert data from UTF-8 to current locale/encoding.]

[ url ]

[+SEE ALSO?\bksh93\b(1), \bshnote\b(1)]
'

typeset noiconv=false

while getopts -a "${progname}" "${rssread_usage}" OPT ; do
#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		I)    noiconv=true  ;;
		+I)   noiconv=false ;;
		*)    usage ;;
	esac
done
shift $((OPTIND-1))

typeset url="$1"

if [[ "${url}" == "" ]] ; then
	fatal_error $"No url given."
fi

# the argument may be the name of one of the bookmarks above
if [[ "${bookmark_urls[${url}]}" != "" ]] ; then
	printmsg $"Using bookmark ${url} = ${bookmark_urls[${url}]}"
	url="${bookmark_urls[${url}]}"
fi

if ${noiconv} ; then
	do_rssread "${url}"
else
	do_rssread "${url}" | iconv -f "UTF-8" - -
fi

exit 0
#EOF.