xref: /titanic_44/usr/src/lib/libshell/common/scripts/rssread.sh (revision 99025f2ef467cfbc67c1a5ac8f7316ec638206ad)
1#!/usr/bin/ksh93
2
3#
4# CDDL HEADER START
5#
6# The contents of this file are subject to the terms of the
7# Common Development and Distribution License (the "License").
8# You may not use this file except in compliance with the License.
9#
10# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11# or http://www.opensolaris.org/os/licensing.
12# See the License for the specific language governing permissions
13# and limitations under the License.
14#
15# When distributing Covered Code, include this CDDL HEADER in each
16# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17# If applicable, add the following below this CDDL HEADER, with the
18# fields enclosed by brackets "[]" replaced with your own identifying
19# information: Portions Copyright [yyyy] [name of copyright owner]
20#
21# CDDL HEADER END
22#
23
24#
25# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
26# Use is subject to license terms.
27#
28
29#
30# rssread - a simple RSS2.0 reader with RSS to XHTML to
31# plaintext conversion.
32#
33
# The tools in /usr/bin on Solaris are not POSIX-conformant, so the
# XPG6/XPG4 directories have to be searched first.
PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin
export PATH
36
# Write a diagnostic message to stderr.
# Arguments: all arguments are joined (via "$*") into a single line.
function printmsg
{
	# -r and "--" make print emit the text verbatim: without them
	# backslash sequences inside the message (e.g. parts of a URL)
	# would be expanded and a message starting with "-" would be
	# parsed as an option.
	print -u2 -r -- "$*"
}
41
# Debug output hook. The call that would forward the message to
# printmsg is commented out, so the function prints nothing and
# always succeeds.
function debugmsg
{
	# printmsg "$*"
	return 0
}
47
# Print an error message prefixed with the program name to stderr and
# terminate the script with exit code 1.
# Globals:   progname (read)
# Arguments: all arguments are joined (via "$*") into the message text.
function fatal_error
{
	# -r and "--": print the message verbatim (no backslash expansion,
	# no option parsing of a message that starts with "-")
	print -u2 -r -- "${progname}: $*"
	exit 1
}
53
# Parse the status line and the headers of an HTTP response read from
# stdin; reading stops after the empty line which ends the header
# section, so the body is left unread on the same descriptor.
# Arguments: $1 - name of the variable which receives the members
#                 "statuscode" and "statusmsg" and, when the matching
#                 header is present, "content_type", "content_length"
#                 and "transfer_encoding"
# Returns:   0 on success, 1 if the status line is malformed (a message
#            is written to stderr)
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]]         && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	# "+" (not "*"): an empty status code must be rejected, too
	[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# scan the remaining headers
	while IFS='' read -r i ; do
		# strip '\r' at the end
		i="${i/~(Er)$'\r'/}"

		# an empty line (sent as "\r\n" or as bare "\n") terminates
		# the header section
		[[ "$i" == "" ]] && break

		# the header name is removed with "[^:]*:" so that a ':'
		# inside the header value is preserved
		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${i/~(El)[^:]*:[[:blank:]]*/}"
				;;
			~(Elri)Content-Length:[[:blank:]]*[0-9]+)
				# only a purely numeric value reaches the integer assignment
				integer response.content_length="${i/~(El)[^:]*:[[:blank:]]*/}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${i/~(El)[^:]*:[[:blank:]]*/}"
				;;
		esac
	done

	return 0
}
89
# Copy an HTTP message body from stdin to stdout.
# Arguments: $1 - transfer encoding; "chunked" selects decoding of the
#                 chunked transfer coding, any other value copies the
#                 input unchanged
# Returns:   always 0
function cat_http_body
{
	typeset emode="$1"
	typeset sizeline=""
	typeset -i chunksize=0

	if [[ "${emode}" != "chunked" ]] ; then
		cat
		return 0
	fi

	# Each chunk is "<hex size>[;extensions]\r\n<data>\r\n"; a chunk of
	# size 0 ends the body.
	while IFS='' read -r sizeline ; do
		sizeline=${sizeline%$'\r'}	# strip the '\r' of the line end
		sizeline=${sizeline%%;*}	# strip optional chunk extensions

		# the "\r\n" which follows the data of the previous chunk
		# shows up here as an empty line - skip it instead of
		# treating it as the end of the body
		[[ "${sizeline}" == "" ]] && continue

		# anything but hex digits means the stream is not valid
		[[ "${sizeline}" != *[!0-9abcdefABCDEF]* ]] || break

		(( chunksize = 16#${sizeline} ))
		(( chunksize > 0 )) || break

		# copy exactly ${chunksize} bytes of chunk data
		dd bs=1 count="${chunksize}" 2>/dev/null
	done

	return 0
}
108
# Fetch a URL via HTTP/1.1 over ksh93's /dev/tcp and write the response
# body to stdout.
# Arguments: $1 - URL ("protocol://host[:port][/path]")
# Outputs:   response body on stdout, diagnostics on stderr
# Returns:   0 on success, 1 if the URL is incomplete, the connection
#            cannot be opened or the response header is invalid
function cat_http
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	typeset host="${path1%%/*}"
	typeset path=""
	typeset port="${host##*:}"
	typeset request		# kept local so it does not leak into the caller
	typeset -l tenc		# transfer encoding, folded to lower case

	integer netfd
	typeset -C httpresponse # http response

	# Everything after the first "/" is the path. A URL without "/"
	# behind the host keeps an empty path, i.e. "/" is requested.
	[[ "${path1}" == */* ]] && path="${path1#*/}"

	# If URL did not contain a port number in the host part then look at the
	# protocol to get the port number
	if [[ "${port}" == "${host}" ]] ; then
		case "${protocol}" in
			"http") port=80 ;;
			*)      port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
		esac
	else
		host="${host%:*}"
	fi

	printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

	# prechecks (an empty path is valid and requests "/")
	[[ "${protocol}" == "" ]] && { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
	[[ "${port}"     == "" ]] && { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
	[[ "${host}"     == "" ]] && { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }

	# open TCP channel
	redirect {netfd}<>"/dev/tcp/${host}/${port}"
	(( $? != 0 )) && { print -u2 -f "%s: Couldn't open %s\n" "$0" "${1}" ; return 1 ; }

	# send HTTP request; the line ends are literal CR LF and the text
	# is written with "print -r" so that backslashes in the URL are
	# sent unmodified
	request="GET /${path} HTTP/1.1"$'\r\n'
	request+="Host: ${host}"$'\r\n'
	request+="User-Agent: rssread/ksh93 (2008-10-14; $(uname -s -r -p))"$'\r\n'
	request+="Connection: close"$'\r\n'
	print -n -r -- "${request}"$'\r\n' >&${netfd}

	# collect response and send it to stdout
	if ! parse_http_response httpresponse <&${netfd} ; then
		# no valid HTTP header - do not pass the data on as body
		redirect {netfd}<&-
		return 1
	fi
	# transfer-coding names are case-insensitive
	tenc="${httpresponse.transfer_encoding}"
	cat_http_body "${tenc}" <&${netfd}

	# close connection
	redirect {netfd}<&-

	return 0
}
160
# Filter: copy stdin to stdout and replace HTML/XML character
# references ("&name;", "&#decimal;", "&#xhex;") with the characters
# they stand for. Text which only looks like the start of a reference
# (e.g. the "&" in "AT&T" or in "a=1&b=2") is passed through unchanged.
# Unknown named entities are written as "ENT=|name|".
# Returns: always 0
function html_entity_to_ascii
{
	typeset c		# character read from stdin
	typeset entity=""	# reference text collected so far (without "&" and ";")
	typeset value		# replacement text of a completed reference
	typeset in_entity=false	# bool: true while between "&" and ";"

	# Todo: Add more HTML/MathML entities here
	# Note we use a static variable (typeset -S) here to make sure we
	# don't lose the cache data between calls
	typeset -S -A entity_cache=(
		# entity to ascii (fixme: add UTF-8 transliterations)
		["nbsp"]=' '
		["lt"]='<'
		["le"]='<='
		["gt"]='>'
		["ge"]='>='
		["amp"]='&'
		["quot"]='"'
		["apos"]="'"
	)

	while IFS='' read -r -N 1 c ; do
		if ! ${in_entity} ; then
			if [[ "$c" == "&" ]] ; then
				in_entity=true
				entity=""
			else
				print -n -r -- "${c}"
			fi
			continue
		fi

		# we are behind a "&" - collect the reference up to the ";"
		case "$c" in
			~(Eilr)[a-z0-9#])
				entity+="$c"
				continue
				;;
			"&")
				# the previous "&" did not start a reference: emit it
				# together with the collected text; this "&" may start
				# a new reference, so stay in entity mode
				print -n -r -- "&${entity}"
				entity=""
				continue
				;;
			";")
				in_entity=false
				;;
			*)
				# not a reference - pass the text through including
				# the leading "&"
				print -n -r -- "&${entity}${c}"
				in_entity=false
				continue
				;;
		esac

		# "&;" is no reference (and an empty string is not usable
		# as array subscript)
		if [[ "${entity}" == "" ]] ; then
			print -n -r -- "&;"
			continue
		fi

		value=""
		if [[ "${entity_cache["${entity}"]}" != "" ]] ; then
			value="${entity_cache["${entity}"]}"
		else
			if [[ "${entity}" == ~(Elri)#x[0-9a-f]+ ]] ; then
				# hexadecimal character reference ("&#x41;")
				value="${ printf "\u[${entity:2:7}]" ; }"
			elif [[ "${entity}" == ~(Elr)#[0-9]+ ]] ; then
				# decimal character reference ("&#65;"); "10#" forces
				# base 10 so that leading zeros ("&#038;") are not
				# taken as an octal prefix
				value="${ printf "\u[${ printf "%x" "$(( 10#${entity:1:8} ))" ; }]" ; }"
			elif [[ "${entity:0:7}" == ~(Eilr)[0-9a-f]+ ]] ; then
				# hexadecimal literal
				value="${ printf "\u[${entity:0:7}]" ; }"
			else
				# unknown literal - pass-through
				value="ENT=|${entity}|"
			fi

			entity_cache["${entity}"]="${value}"
		fi

		printf "%s" "${value}"
	done

	# the input ended inside an unterminated reference - emit it as text
	${in_entity} && print -n -r -- "&${entity}"

	return 0
}
236
# Minimal xhtml-to-plain-text callback for xml_tok. CSS, tables, images,
# iframes and nested structures are not supported and the input is
# assumed to be correct xhtml. (The code was written in a trial&&error
# manner and should be rewritten to parse xhtml correctly.)
# Arguments: $1 - name of the callback array; it doubles as state
#                 storage (key "html_pre") because globals cannot be
#                 used when several tokenizers share this function
#            $2 - event type (tag_begin, tag_end, tag_text,
#                 document_start, document_end)
#            $3 - tag name or text, depending on the event
# Returns:   always 0
function handle_html
{
	typeset -n callbacks="${1}"
	typeset event="$2"
	typeset payload="$3"

	if [[ "${event}" == "tag_text" ]] ; then
		if ${callbacks["html_pre"]} ; then
			# inside <pre>: keep the text as it is
			printf "%s" "${payload}"
		else
			# compress spaces/newlines/tabs/etc.
			printf "%s" "${payload//+([\n\r\t\v[:space:][:blank:]])/ }"
		fi
	elif [[ "${event}" == "tag_begin" ]] ; then
		case "${payload}" in
			br | p) printf "\n" ;;
			hr)     printf "\n-------------------------------------\n" ;;
			pre)    callbacks["html_pre"]='true' ;;
		esac
	elif [[ "${event}" == "tag_end" ]] ; then
		if [[ "${payload}" == "pre" ]] ; then
			callbacks["html_pre"]='false'
		fi
	elif [[ "${event}" == "document_start" ]] ; then
		callbacks["html_pre"]='false'
	fi
	# document_end and unknown events need no action

	return 0
}
283
# xml_tok callback which collects the fields of each RSS <item> and, at
# the end of the item, prints it as plain text (RSS -> HTML -> text).
# Globals:   item (written) - fields of the item currently being read
# Arguments: $1 - name of the callback array; its key "textbuf" is used
#                 to accumulate the text of the current element (globals
#                 cannot be used when several tokenizers share this
#                 function)
#            $2 - event type (tag_begin, tag_end, tag_text, ...)
#            $3 - tag name or text, depending on the event
# Returns:   always 0
function handle_rss
{
	typeset -n callbacks="${1}"
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				item)
					# reset every field which is printed at the end
					# of the item, otherwise an item without e.g.
					# <author> or <pubDate> would show the values of
					# the previous item
					item["title"]=""
					item["link"]=""
					item["author"]=""
					item["pubDate"]=""
					item["tag"]=""
					item["description"]=""
					;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_end)
			case "${tag_value}" in
				item)
					# note that each RSS item needs to be converted separately from RSS to HTML to plain text
					# to make sure that the state of one RSS item doesn't affect others
					(
						printf $"<br />#### RSS item: title: %s ####" "${item["title"]}"
						printf $"<br />## author: %s" "${item["author"]}"
						printf $"<br />## link:   %s" "${item["link"]}"
						printf $"<br />## date:   %s" "${item["pubDate"]}"
						printf $"<br />## begin description:"
						printf $"<br />%s<br />" "${item["description"]}"
						printf $"<br />## end description<br />"
						print # extra newline to make sure the sed pipeline gets flushed
					) |
						html_entity_to_ascii |	# convert XML entities (e.g. decode RSS content to HTML code)
						xml_tok "xhtmltok_cb" |	# convert HTML to plain text
						html_entity_to_ascii	# convert HTML entities
					;;
				title)                item["title"]="${callbacks["textbuf"]}"       ;;
				link)                 item["link"]="${callbacks["textbuf"]}"        ;;
				dc:creator | author)  item["author"]="${callbacks["textbuf"]}"      ;;
				dc:date | pubDate)    item["pubDate"]="${callbacks["textbuf"]}"     ;;
				description)          item["description"]="${callbacks["textbuf"]}" ;;
			esac
			# the collected text belongs to the element which just ended
			callbacks["textbuf"]=""
			;;
		tag_text)
			callbacks["textbuf"]+="${tag_value}"
			;;
		document_start) ;;
		document_end) ;;
	esac
	return 0
}
340
# Simple XML tokenizer: reads XML from stdin one character at a time
# and reports what it finds through callbacks.
# Arguments: $1 - name of an associative array which maps the event
#                 names "document_start", "tag_begin", "tag_end",
#                 "tag_text", "tag_comment" and "document_end" to the
#                 command to run for that event; events without an
#                 entry are ignored. Each callback is called as
#                 <command> <array name> <event> [<name or text> [<attributes>]]
# Outputs:   whatever the callbacks print, followed by one newline
function xml_tok
{
	typeset text=""		# pending text or raw content of the current tag
	typeset tagname=""
	typeset attrs=""
	typeset ch=""
	typeset closing		# bool: true/false ("</name>")
	typeset selfclosing	# bool: true/false (used for tags like "<br />")
	typeset -n callbacks="${1}"

	if [[ -n "${callbacks["document_start"]}" ]] ; then
		${callbacks["document_start"]} "${1}" "document_start"
	fi

	while IFS='' read -r -N 1 ch ; do
		closing=false

		# everything outside of "<...>" is collected as text
		if [[ "${ch}" != "<" ]] ; then
			text+="${ch}"
			continue
		fi

		# a tag starts - flush any text content first
		if [[ "${text}" != "" ]] ; then
			if [[ -n "${callbacks["tag_text"]}" ]] ; then
				${callbacks["tag_text"]} "${1}" "tag_text" "${text}"
			fi
			text=""
		fi

		# the first character decides whether this is an end tag,
		# the remainder is read up to the closing ">"
		IFS='' read -r -N 1 ch
		if [[ "${ch}" == "/" ]] ; then
			closing=true
		else
			text="${ch}"
		fi
		IFS='' read -r -d '>' ch
		text+="${ch}"

		# handle comments
		if [[ "${text}" == ~(El)!-- ]] ; then
			# a ">" inside the comment ended the read too early -
			# continue until the comment terminator has been seen
			if [[ "${text}" != ~(Elr)!--.*-- ]] ; then
				text+=">"
				while [[ "${text}" != ~(Elr)!--.*-- ]] ; do
					IFS='' read -r -N 1 ch || break
					text+="${ch}"
				done
			fi

			if [[ -n "${callbacks["tag_comment"]}" ]] ; then
				${callbacks["tag_comment"]} "${1}" "tag_comment" "${text:3:${#text}-5}"
			fi
			text=""
			continue
		fi

		# check if the tag starts and ends at the same time (like "<br />")
		selfclosing=false
		if [[ "${text}" == ~(Er).*/ ]] ; then
			selfclosing=true
			text="${text%*/}"
		fi

		# check if the tag has attributes (e.g. space after name)
		if [[ "${text}" == ~(E)[[:space:][:blank:]] ]] ; then
			tagname="${text%%~(E)[[:space:][:blank:]].*}"
			attrs="${text#~(E).*[[:space:][:blank:]]}"
		else
			tagname="${text}"
			attrs=""
		fi

		# start tags (including tags like <br/>) report "tag_begin"
		if ! ${closing} ; then
			if [[ -n "${callbacks["tag_begin"]}" ]] ; then
				${callbacks["tag_begin"]} "${1}" "tag_begin" "${tagname}" "${attrs}"
			fi
		fi

		# end tags and tags like <br/> (which are start- and end-tag
		# in one piece) report "tag_end"
		if ${closing} || ${selfclosing} ; then
			if [[ -n "${callbacks["tag_end"]}" ]] ; then
				${callbacks["tag_end"]} "${1}" "tag_end" "${tagname}"
			fi
		fi
		text=""
	done

	if [[ -n "${callbacks["document_end"]}" ]] ; then
		${callbacks["document_end"]} "${1}" "document_end" "exit_success"
	fi

	print # final newline to make filters like "sed" happy
}
425
# Print the locale name which subprocesses running in a different
# locale/encoding should use for LC_MESSAGES: the first non-empty value
# of LC_ALL, LC_MESSAGES and LANG, or "C" if none of them is set.
# Returns: always 0
function get_lc_messages
{
	typeset candidate

	for candidate in "${LC_ALL}" "${LC_MESSAGES}" "${LANG}" "C" ; do
		if [[ "${candidate}" != "" ]] ; then
			print "${candidate}"
			return 0
		fi
	done

	return 0
}
435
# Fetch the RSS feed at the given URL and print its items as plain text.
# Arguments: $1 - URL of the feed
# Returns:   always 0
function do_rssread
{
	# remember the locale of the messages before anything is changed
	typeset caller_messages="${ get_lc_messages ; }"
	typeset utf8_locale="en_US.UTF-8"

	# set unicode locale since RSS is encoded in UTF-8
	# (and make sure $LC_MESSAGES is set to the parent
	# process's locale so that all error messages are using
	# the caller's locale/encoding)
	export LC_MESSAGES="${caller_messages}"
	export LC_MONETARY="${utf8_locale}"
	export LC_NUMERIC="${utf8_locale}"
	export LC_COLLATE="${utf8_locale}"
	export LC_CTYPE="${utf8_locale}"
	export LC_TIME="${utf8_locale}"
	export LANG="${utf8_locale}"

	# "xml_tok" wants its input terminated with $'\n', therefore an
	# extra newline is sent after the output of cat_http
	{
		cat_http "$1"
		print
	} | xml_tok "rsstok_cb"

	return 0
}
457
# Let getopts print the usage message generated from ${rssread_usage}
# and terminate the script with exit code 2.
# Globals: progname, rssread_usage (read); OPTIND, OPT (written)
function usage
{
	# restart option processing and hand "-?" to getopts as the
	# only argument
	OPTIND=0
	getopts -a "${progname}" "${rssread_usage}" OPT "-?"
	exit 2
}
464
# use the versions of these tools which are built into ksh93
builtin basename
builtin cat

# callback table for xml_tok while the RSS document itself is parsed;
# "textbuf" is scratch storage used by the callback
typeset -A rsstok_cb=(
	["tag_begin"]="handle_rss"
	["tag_end"]="handle_rss"
	["tag_text"]="handle_rss"
	["textbuf"]=""
)

# callback table for xml_tok while the HTML of an item is converted to
# plain text; "textbuf" and "html_pre" are storage for the callback
typeset -A xhtmltok_cb=(
	["tag_begin"]="handle_html"
	["tag_end"]="handle_html"
	["tag_text"]="handle_html"
	["textbuf"]=""
	["html_pre"]='false'
)

# fields of the RSS item which is currently being read
typeset -A item
483
# bookmarks: short names which may be given instead of a URL
# ("random" urls for testing)
typeset -A bookmark_urls=(
	["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=(%22ksh93%22%7C%22ksh+93%22+%7C+%22korn93%22+%7C+%22korn+93%22)&ie=utf-8&num=100&output=rss"

	# OpenSolaris.org sites
	["ksh93_integration"]="http://www.opensolaris.org/rss/os/project/ksh93-integration/announcements/rss2.xml"
	["shell"]="http://www.opensolaris.org/rss/os/project/shell/announcements/rss2.xml"
	["systemz"]="http://www.opensolaris.org/rss/os/project/systemz/announcements/rss2.xml"

	# some Sun staff/sites
	["blogs_sun_com"]="http://blogs.sun.com/main/feed/entries/rss"
	["bigadmin"]="http://www.sun.com/bigadmin/content/rss/motd.xml"
	["jmcp"]="http://www.jmcp.homeunix.com/roller/jmcp/feed/entries/rss"
	["katakai"]="http://blogs.sun.com/katakai/feed/entries/rss"
	["alanc"]="http://blogs.sun.com/alanc/feed/entries/rss"
	["planetsun"]="http://www.planetsun.org/rss20.xml"
	["planetsolaris"]="http://www.planetsolaris.org/rss20.xml"
	["planetopensolaris"]="http://planet.opensolaris.org/rss20.xml"
	["theregister_uk"]="http://www.theregister.co.uk/headlines.rss"
	["heise"]="http://www.heise.de/newsticker/heise.rdf"
	["slashdot"]="http://rss.slashdot.org/Slashdot/slashdot"
)
506
# name of this script (without directory), used in messages
typeset progname
progname="$(basename "${0}")"

# option description in the format used by "getopts -a" (it also
# generates the usage message)
typeset -r rssread_usage=$'+
[-?\n@(#)\$Id: rssread (Roland Mainz) 2008-11-10 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b RSS to plain text converter
        which fetches RSS streams via HTTP and converts them from
	RSS to HTML to plain text in the current locale/encoding.]
[I:noiconv?Do not convert data from UTF-8 to current locale/encoding.]

[ url ]

[+SEE ALSO?\bksh93\b(1), \bshnote\b(1)]
'
523
# bool: true if the output shall stay in UTF-8 (option -I)
typeset noiconv=false

# parse the command line options
while getopts -a "${progname}" "${rssread_usage}" OPT ; do
#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case "${OPT}" in
		I)
			noiconv=true
			;;
		+I)
			noiconv=false
			;;
		*)
			usage
			;;
	esac
done
shift $((OPTIND-1))

# the first operand is either a URL or the name of a bookmark
typeset url="$1"

[[ "${url}" != "" ]] || fatal_error $"No url given."

# replace a bookmark name with the URL stored for it
if [[ "${bookmark_urls[${url}]}" != "" ]] ; then
	printmsg $"Using bookmark ${url} = ${bookmark_urls[${url}]}"
	url="${bookmark_urls[${url}]}"
fi

# read the feed; unless -I was given the UTF-8 output is passed
# through iconv
if ! ${noiconv} ; then
	do_rssread "${url}" | iconv -f "UTF-8" - -
else
	do_rssread "${url}"
fi

exit 0
554#EOF.
555