#!/usr/bin/ksh93
# xref: /titanic_51/usr/src/lib/libshell/common/scripts/rssread.sh (revision 3e14f97f673e8a630f076077de35afdd43dc1587)

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
#

#
# rssread - a simple RSS2.0 reader with RSS to XHTML to
# plaintext conversion.
#

# The tools in /usr/bin on Solaris are not POSIX-conformant, so the
# /usr/xpg6/bin and /usr/xpg4/bin directories are searched first.
PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin
export PATH

# printmsg - join all arguments into one string and write it, followed
# by a newline, to standard error
function printmsg
{
	print "$*" >&2
}

# debugmsg - debug output hook; currently disabled, so it prints
# nothing and always succeeds
function debugmsg
{
	# remove the comment marker on the next line to enable debug output
#	printmsg "$*"
	return 0
}

# fatal_error - write "<progname>: <arguments>" to standard error and
# terminate the shell with exit code 1
function fatal_error
{
	print "${progname}: $*" >&2
	exit 1
}

# urlconnection_t - small URL fetcher type for file://, http:// and
# https:// URLs.
# Usage: declare an instance, optionally set "user_agent", call
# "init_url <url>" and then "cat_url", which writes the body of the
# resource to stdout. https is handled by an "openssl s_client" child
# process which is connected through two FIFOs in a temporary directory.
typeset -T urlconnection_t=(
	# public
	typeset user_agent="ksh93/urlconnection_t"

	# private variables
	typeset protocol
	typeset path1
	typeset host
	typeset path
	typeset port

	compound netfd=(
		integer in=-1  # incoming traffic
		integer out=-1 # outgoing traffic
	)

	# only used for https
	compound ssl=(
		compound fifo=(
			typeset dir=""
			typeset in=""
			typeset out=""
		)
		integer openssl_client_pid=-1
	)

	# parse HTTP status line and headers read from stdin
	# $1 is the name of a compound variable which receives the members
	# "statuscode" and "statusmsg" and - if the matching header is
	# present - "content_type", "content_length" and "transfer_encoding".
	# Returns 1 if the status line is malformed, 0 otherwise.
	function parse_http_response
	{
		nameref response="$1"
		typeset h statuscode statusmsg i

		# we use '\r' as additional IFS to filter the final '\r'
		IFS=$' \t\r' read -r h statuscode statusmsg # read HTTP/1.[01] <code>
		[[ "$h" != ~(Eil)HTTP/.* ]]         && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
		# at least one digit is required (an empty status code is invalid)
		[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }
		response.statuscode="$statuscode"
		response.statusmsg="$statusmsg"

		# walk over the remaining headers; an empty line (with or
		# without the '\r') ends the header section
		while IFS='' read -r i ; do
			[[ "$i" == "" || "$i" == $'\r' ]] && break

			# strip '\r' at the end
			i="${i/~(Er)$'\r'/}"

			case "$i" in
				~(Eli)Content-Type:.*)
					response.content_type="${i/~(El).*:[[:blank:]]*/}"
					;;
				~(Eli)Content-Length:[[:blank:]]*[0-9]*)
					integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
					;;
				~(Eli)Transfer-Encoding:.*)
					response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
					;;
			esac
		done

		return 0
	}

	# copy the HTTP body from stdin to stdout
	# $1 is the transfer encoding; for "chunked" the chunk size lines
	# are removed and only the chunk data is copied, anything else is
	# copied unchanged.
	function cat_http_body
	{
		typeset emode="$1"
		typeset hexchunksize="0"
		integer chunksize=0

		if [[ "${emode}" == "chunked" ]] ; then
			while IFS=$'\n' read -r hexchunksize ; do
				hexchunksize="${hexchunksize//$'\r'/}"
				# skip the empty line which follows each chunk
				[[ "${hexchunksize}" != "" ]] || continue
				# stop at anything which is not a plain hexadecimal size
				[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break
				(( chunksize=$( printf "16#%s\n" "${hexchunksize}" )  ))
				# a chunk of size 0 marks the end of the body
				(( chunksize > 0 )) || break
				dd bs=1 count="${chunksize}" 2>/dev/null
			done
		else
			cat
		fi

		return 0
	}

	# split the URL given as $1 into protocol, host, path and port
	# Note that "host" still contains the ":port" suffix (if any) and
	# that "port" equals "host" if the URL has no explicit port;
	# "cat_url" sorts this out before connecting.
	function init_url
	{
		_.protocol="${1%://*}"
		_.path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

		if  [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			_.host="${_.path1%%/*}"
			_.path="${_.path1#*/}"
			_.port="${_.host##*:}"
		fi

		return 0
	}

	# close connection
	# Closes the network file descriptors and - for https - waits for
	# the openssl child and removes the temporary FIFO directory.
	# Returns 1 if the openssl child reported a failure, 0 otherwise.
	function close_connection
	{
		# remember both descriptors before they are reset: for plain
		# http "in" and "out" are the same descriptor which must be
		# closed only once
		integer fd_in=${_.netfd.in}
		integer fd_out=${_.netfd.out}
		integer ret=0

		if (( fd_in != -1 )) ; then
			redirect {_.netfd.in}<&-
			(( _.netfd.in=-1 ))
		fi

		if (( fd_out != -1 )) ; then
			if (( fd_out != fd_in )) ; then
				redirect {_.netfd.out}<&-
			fi
			(( _.netfd.out=-1 ))
		fi

		if [[ "${_.protocol}" == "https" ]] ; then
			wait ${_.ssl.openssl_client_pid} || ret=1
			(( _.ssl.openssl_client_pid=-1 ))

			# remove the FIFO directory even if openssl failed
			if [[ "${_.ssl.fifo.dir}" != "" ]] ; then
				rm -r -- "${_.ssl.fifo.dir}"
				_.ssl.fifo.dir=""
			fi

			(( ret == 0 )) || { print -u2 -f "%s: openssl failed.\n" "$0" ; return 1 ; }
		fi

		return 0
	}

	# open the connection to ${_.host}:${_.port}
	# http uses a /dev/tcp redirection, https starts an asynchronous
	# "openssl s_client" child which reads from/writes to two FIFOs.
	# Returns 1 if the connection could not be set up.
	function open_connection
	{
		if [[ "${_.protocol}" == "https" ]] ; then
			_.ssl.fifo.dir="$(mktemp -t -d)" || { print -u2 -f "%s: Could not create temporary directory\n" "$0" ; return 1 ; }
			_.ssl.fifo.in="${_.ssl.fifo.dir}/in"
			_.ssl.fifo.out="${_.ssl.fifo.dir}/out"

			# Use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			set -o errexit

			mkfifo "${_.ssl.fifo.in}" "${_.ssl.fifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${_.host}:${_.port}" <"${_.ssl.fifo.in}" >>"${_.ssl.fifo.out}" &

			_.ssl.openssl_client_pid=$!
		else
			redirect {_.netfd.in}<> "/dev/tcp/${_.host}/${_.port}"
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${_.host}:${_.port}" ; return 1 ; }
			(( _.netfd.out=_.netfd.in ))
		fi
		return 0
	}

	# send the request text given as $1 to the server
	# For https this also opens the FIFO which carries the answer of
	# the openssl child and stores its descriptor in netfd.in.
	function send_request
	{
		typeset request="$1"

		set -o errexit

		if [[ "${_.protocol}" == "https" ]] ; then
				print -n -- "${request}\r\n" >>	"${_.ssl.fifo.in}"

				redirect {_.netfd.in}< "${_.ssl.fifo.out}"
		else
				print -n -- "${request}\r\n" >&${_.netfd.out}
		fi
		return 0
	}

	# write the content of the URL set via "init_url" to stdout
	# Returns 1 for unsupported protocols, failed prechecks or if the
	# connection cannot be opened; for "file" the exit code of cat.
	function cat_url
	{
		if [[ "${_.protocol}" == "file" ]] ; then
			cat "${_.path1}"
			return $?
		elif [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			compound httpresponse # http response
			typeset request       # http request text

			# If URL did not contain a port number in the host part then look at the
			# protocol to get the port number
			if [[ "${_.port}" == "${_.host}" ]] ; then
				case "${_.protocol}" in
					"http")  _.port=80 ;;
					"https") _.port=443 ;;
					*)       _.port="$(getent services "${_.protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
				esac
			else
				_.host="${_.host%:*}"
			fi

			printmsg "protocol=${_.protocol} port=${_.port} host=${_.host} path=${_.path}"

			# prechecks
			[[ "${_.protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
			[[ "${_.port}"     != "" ]] || { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
			[[ "${_.host}"     != "" ]] || { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
			[[ "${_.path}"     != "" ]] || { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

			_.open_connection || return 1

			# send HTTP request
			request="GET /${_.path} HTTP/1.1\r\n"
			request+="Host: ${_.host}\r\n"
			request+="User-Agent: ${_.user_agent}\r\n"
			request+="Connection: close\r\n"
			_.send_request "${request}\r\n"

			# collect response and send it to stdout
			{
				_.parse_http_response httpresponse
				_.cat_http_body "${httpresponse.transfer_encoding}"
			} <&${_.netfd.in}

			_.close_connection

			return 0
		else
			return 1
		fi
		# notreached
	}
)

# html_entity_to_ascii - filter which copies stdin to stdout and replaces
# XML/HTML character entities with the characters they stand for.
# Supported are the names listed in "entity_cache", decimal references
# ("&#NNN;"), hexadecimal references ("&#xHHH;") and - as in earlier
# versions of this function - names which consist of hexadecimal
# digits only. Other names are passed through as "ENT=|name|".
# A "&" which does not start a well-formed entity is copied unchanged.
# Always returns 0.
function html_entity_to_ascii
{
	typeset entity
	typeset c
	typeset value
	typeset terminated # bool: true/false (did we see the closing ';' ?)

	# Todo: Add more HTML/MathML entities here
	# Note we use a static variable (typeset -S) here to make sure we
	# don't lose the cache data between calls
	typeset -S -A entity_cache=(
		# entity to ascii (fixme: add UTF-8 transliterations)
		["nbsp"]=' '
		["lt"]='<'
		["le"]='<='
		["gt"]='>'
		["ge"]='>='
		["amp"]='&'
		["quot"]='"'
		["apos"]="'"
	)

	while IFS='' read -r -N 1 c ; do
		if [[ "$c" != "&" ]] ; then
			print -n -r -- "${c}"
			continue
		fi

		# collect the entity name up to the closing ';'
		entity=""
		terminated=false
		while IFS='' read -r -N 1 c ; do
			case "$c" in
				";")
					terminated=true
					break
					;;
				~(Eilr)[a-z0-9#])
					entity+="$c"
					continue
					;;
				*)
#					debugmsg "error &${entity}${c}#"

					# not an entity - emit the text unchanged,
					# including the leading '&'
					print -n -r -- "&${entity}${c}"
					entity=""
					continue 2
					;;
			esac
		done

		# input ended in the middle of an entity - emit what we have
		if [[ "${terminated}" != "true" ]] ; then
			print -n -r -- "&${entity}"
			break
		fi

		# "&;" is not an entity (and an empty string cannot be used
		# as subscript for the cache)
		if [[ "${entity}" == "" ]] ; then
			print -n -r -- "&;"
			continue
		fi

		value=""
		if [[ "${entity_cache["${entity}"]}" != "" ]] ; then
#			debugmsg "match #${entity}# = #${entity_cache["${entity}"]}#"
			value="${entity_cache["${entity}"]}"
		else
			case "${entity}" in
				~(Elri)[#]x[0-9a-f]+)
					# hexadecimal character reference ("&#x41;")
					value="${ printf "\u[${entity:2:8}]" ; }"
					;;
				~(Elr)[#][0-9]+)
					# decimal character reference ("&#65;"); "10#" makes
					# sure that leading zeros do not select octal
					value="${ printf "\u[${ printf "%x" "$(( 10#${entity:1:8} ))" ; }]" ; }"
					;;
				~(Elri)[0-9a-f]+)
					# hexadecimal literal without '#'
					value="${ printf "\u[${entity:0:7}]" ; }"
					;;
				*)
					# unknown literal - pass-through
					value="ENT=|${entity}|"
					;;
			esac

			entity_cache["${entity}"]="${value}"

#			debugmsg "lookup #${entity}# = #${entity_cache["${entity}"]}#"
		fi

		printf "%s" "${value}"
	done

	return 0
}

# dumb xhtml handler - no CSS,  tables, images, iframes or nested
# structures are supported (and we assume that the input is correct
# xhtml). The code was written in a trial&&error manner and should be
# rewritten to parse xhtml correctly.
#
# Callback for "xml_tok" which writes a plain text rendering to stdout.
# Arguments:
#   $1 - name of the callback associative array (also used to keep the
#        "html_pre" state between calls)
#   $2 - event type (tag_begin, tag_end, tag_text, document_start,
#        document_end)
#   $3 - tag name or text, depending on the event type
# Always returns 0.
function handle_html
{
    # we can't use global variables here when multiple callbacks use the same
    # callback function - but we can use the callback associative array for
    # variable storage instead
    nameref callbacks=${1}
    typeset tag_type="$2"
    typeset tag_value="$3"

    case "${tag_type}" in
        tag_begin)
            case "${tag_value}" in
                br | p) printf "\n" ;;
                hr)     printf "\n-------------------------------------\n" ;;
                pre)    callbacks["html_pre"]='true' ;;
            esac
            ;;

        tag_end)
            case "${tag_value}" in
                pre) callbacks["html_pre"]='false' ;;
            esac
            ;;

        tag_text)
            # compare the flag as a string instead of executing it as a command
            if [[ "${callbacks["html_pre"]}" == "true" ]] ; then
                # preformatted text is copied unchanged
                printf "%s" "${tag_value}"
            else
                # compress spaces/newlines/tabs/etc.
                printf "%s" "${tag_value//+([[:space:]])/ }"
            fi
            ;;

        document_start)
            callbacks["html_pre"]='false'
            ;;
        document_end) ;;
    esac

    return 0
}

# handle_rss - callback for "xml_tok" which collects the fields of each
# RSS item in the global associative array "item" and, at the end of an
# item, writes it as plain text to stdout.
# Arguments:
#   $1 - name of the callback associative array (its "textbuf" element
#        collects the text of the current tag)
#   $2 - event type (tag_begin, tag_end, tag_text, document_start,
#        document_end)
#   $3 - tag name or text, depending on the event type
# Always returns 0.
function handle_rss
{
	# we can't use global variables here when multiple callbacks use the same
	# callback function - but we can use the callback associative array for
	# variable storage instead
	nameref callbacks=${1}
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				item)
					# reset every field which is printed at the end of
					# the item so that nothing from the previous item
					# is carried over
					item["title"]=""
					item["link"]=""
					item["tag"]=""
					item["author"]=""
					item["pubDate"]=""
					item["description"]=""
					;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_end)
			case "${tag_value}" in
				item)
					# note that each RSS item needs to be converted separately from RSS to HTML to plain text
					# to make sure that the state of one RSS item doesn't affect others
					(
						printf $"<br />#### RSS item: title: %s ####" "${item["title"]}"
						printf $"<br />## author: %s" "${item["author"]}"
						printf $"<br />## link:   %s" "${item["link"]}"
						printf $"<br />## date:   %s" "${item["pubDate"]}"
						printf $"<br />## begin description:"
						printf $"<br />%s<br />" "${item["description"]}"
						printf $"<br />## end description<br />"
						print # extra newline to make sure the sed pipeline gets flushed
					) |
						html_entity_to_ascii |	# convert XML entities (e.g. decode RSS content to HTML code)
						xml_tok "xhtmltok_cb" |	# convert HTML to plain text
						html_entity_to_ascii	# convert HTML entities
					;;
				title)                item["title"]="${callbacks["textbuf"]}"       ;;
				link)                 item["link"]="${callbacks["textbuf"]}"        ;;
				dc:creator | author)  item["author"]="${callbacks["textbuf"]}"      ;;
				dc:date | pubDate)    item["pubDate"]="${callbacks["textbuf"]}"     ;;
				description)          item["description"]="${callbacks["textbuf"]}" ;;
			esac
			# the collected text belongs to the tag which just ended
			callbacks["textbuf"]=""
			;;
		tag_text)
			callbacks["textbuf"]+="${tag_value}"
			;;
		document_start) ;;
		document_end) ;;
	esac
	return 0
}

# xml_tok - simple XML tokenizer
# Reads XML from stdin and calls the functions registered in the
# associative array whose name is passed as $1. The array elements
# "document_start", "tag_begin", "tag_end", "tag_text", "tag_comment"
# and "document_end" hold the name of the function to call for that
# event (events without an entry are skipped). Each function is called
# with the array name, the event type and the tag name or text;
# "tag_begin" additionally gets the attribute string and "document_end"
# gets "exit_success".
# A final newline is written to stdout after the document was processed.
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    [[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    while IFS='' read -r -N 1 c ; do
        isendtag=false

        if [[ "$c" == "<" ]] ; then
            # flush any text content
            if [[ "$buf" != "" ]] ; then
                [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
                buf=""
            fi

            # a '<' at the very end of the input does not start a tag
            IFS='' read -r -N 1 c || break
            if [[ "$c" == "/" ]] ; then
                isendtag=true
            else
                buf="$c"
            fi
            IFS='' read -r -d '>' c
            buf+="$c"

            # handle comments
            if [[ "$buf" == ~(El)!-- ]] ; then
                # did we read the comment completely ?
                if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
                    # no - the '>' we stopped at is part of the comment
                    # text; read on until the terminator "-->" was seen
                    buf+=">"
                    while [[ "$buf" != '!--'*'-->' ]] ; do
                        IFS='' read -r -N 1 c || break
                        buf+="$c"
                    done
                    # drop the final '>' to get the form "!--text--" again
                    buf="${buf%>}"
                fi

                # pass the text between "!--" and "--" to the callback
                [[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
                buf=""
                continue
            fi

            # check if the tag starts and ends at the same time (like "<br />")
            if [[ "${buf}" == ~(Er).*/ ]] ; then
                issingletag=true
                buf="${buf%*/}"
            else
                issingletag=false
            fi

            # check if the tag has attributes (e.g. space after name)
            if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
                namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
                attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
            else
                namebuf="$buf"
                attrbuf=""
            fi

            if ${isendtag} ; then
                [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            else
                [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if ${issingletag} ; then
                    [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
                fi
            fi
            buf=""
        else
            buf+="$c"
        fi
    done

    [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}

# print the value of LC_MESSAGES needed for subprocesses which
# want to run in a different locale/encoding: the first non-empty
# value of $LC_ALL, $LC_MESSAGES and $LANG, or "C" if all are empty
function get_lc_messages
{
	print "${LC_ALL:-${LC_MESSAGES:-${LANG:-C}}}"
	return 0
}

# do_rssread - fetch the RSS document at the URL given as $1 and write
# its items as plain text to stdout; a line with the number of fetched
# lines is written to stderr.
function do_rssread
{
	typeset data # RSS document as returned by the server

	# set unicode locale since RSS is encoded in UTF-8
	# (and make sure $LC_MESSAGES is set to the parent
	# process's locale that all error messages are using
	# the callers locale/encoding)
	export \
		LC_MESSAGES="${ get_lc_messages ; }" \
		LC_MONETARY="en_US.UTF-8" \
		LC_NUMERIC="en_US.UTF-8" \
		LC_COLLATE="en_US.UTF-8" \
		LC_CTYPE="en_US.UTF-8" \
		LC_TIME="en_US.UTF-8" \
		LANG="en_US.UTF-8"

	# return non-zero exit code for this function if the rss processing below fails
	set -o errexit

	urlconnection_t hc
	hc.user_agent="rssread/ksh93(ssl) (2010-03-27; $(uname -s -r -p))"
	hc.init_url "$1"

	# need extra newline after cat_url to terminate line with $'\n'
	# to make "xml_tok" happy
	data="${ hc.cat_url ; print ; }"

	print -u2 -f "# Got %d lines of RSS data, processing...\n" "${ wc -l <<< "${data}" ; }"

	xml_tok "rsstok_cb" <<< "${data}"

	return 0
}

# usage - let getopts process the single argument '-?' against the
# option description in ${rssread_usage} (in ksh93 this prints the
# usage message) and terminate the script with exit code 2
function usage
{
	# restart option processing from the beginning
	OPTIND=0
	getopts -a "${progname}" "${rssread_usage}" OPT '-?'

	exit 2
}

# make sure we use the ksh93 builtin versions
builtin basename cat mkfifo

# callbacks for xml_tok: RSS level ("textbuf" is scratch storage used
# by the callback function)
typeset -A rsstok_cb=(
	["tag_begin"]="handle_rss"
	["tag_end"]="handle_rss"
	["tag_text"]="handle_rss"
	["textbuf"]=""
)

# callbacks for xml_tok: XHTML level ("textbuf" and "html_pre" are
# storage for the callback function)
typeset -A xhtmltok_cb=(
	["tag_begin"]="handle_html"
	["tag_end"]="handle_html"
	["tag_text"]="handle_html"
	["textbuf"]=""
	["html_pre"]='false'
)

# fields of the RSS item which is currently being read
typeset -A item

# "random" urls for testing; the key can be used instead of an url
typeset -A bookmark_urls=(
	["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=(%22ksh93%22%7C%22ksh+93%22+%7C+%22korn93%22+%7C+%22korn+93%22)&ie=utf-8&num=100&output=rss"
	# some Sun staff/sites
	["blogs_sun_com"]="http://blogs.sun.com/main/feed/entries/rss"
	["bigadmin"]="http://www.sun.com/bigadmin/content/rss/motd.xml"
	["bigadmin_scripts"]="https://www.sun.com/bigadmin/content/rss/scripts.xml"
	["jmcp"]="http://www.jmcp.homeunix.com/roller/jmcp/feed/entries/rss"
	["katakai"]="http://blogs.sun.com/katakai/feed/entries/rss"
	["alanc"]="http://blogs.sun.com/alanc/feed/entries/rss"
	["planetsun"]="http://www.planetsun.org/rss20.xml"
	["planetsolaris"]="http://www.planetsolaris.org/rss20.xml"
	["planetopensolaris"]="http://planet.opensolaris.org/rss20.xml"
	["theregister_uk"]="http://www.theregister.co.uk/headlines.rss"
	["heise"]="http://www.heise.de/newsticker/heise.rdf"
	["slashdot"]="http://rss.slashdot.org/Slashdot/slashdot"
	["wikipedia_command_shells"]="http://en.wikipedia.org/w/index.php?title=Comparison_of_command_shells&feed=rss&action=history"
)

# name of this script, used as prefix for messages
typeset progname="${ basename "${0}" ; }"

# option description for getopts (also the source of the usage message)
typeset -r rssread_usage=$'+
[-?\n@(#)\$Id: rssread (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b RSS to plain text converter
        which fetches RSS streams via HTTP and converts them from
	RSS to HTML to plain text in the current locale/encoding.]
[I:noiconv?Do not convert data from UTF-8 to current locale/encoding.]

[ url ]

[+SEE ALSO?\bksh93\b(1), \bshnote\b(1)]
'

typeset noiconv=false

while getopts -a "${progname}" "${rssread_usage}" OPT ; do
#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		I)    noiconv=true  ;;
		+I)   noiconv=false ;;
		*)    usage ;;
	esac
done
shift $((OPTIND-1))

typeset url="$1"

if [[ "${url}" == "" ]] ; then
	fatal_error $"No url given."
fi

# the argument may be the name of a bookmark instead of an url; the
# subscript is quoted since the argument is arbitrary user input
if [[ "${bookmark_urls["${url}"]}" != "" ]] ; then
	printmsg $"Using bookmark ${url} = ${bookmark_urls["${url}"]}"
	url="${bookmark_urls["${url}"]}"
fi

if ${noiconv} ; then
	do_rssread "${url}"
else
	# the output of do_rssread is UTF-8 and is passed through iconv
	do_rssread "${url}" | iconv -f "UTF-8" - -
fi

exit 0
#EOF.