xref: /titanic_44/usr/src/lib/libshell/common/scripts/rssread.sh (revision 55f5292c612446ce6f93ddd248c0019b5974618b)
1#!/usr/bin/ksh93
2
3#
4# CDDL HEADER START
5#
6# The contents of this file are subject to the terms of the
7# Common Development and Distribution License (the "License").
8# You may not use this file except in compliance with the License.
9#
10# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11# or http://www.opensolaris.org/os/licensing.
12# See the License for the specific language governing permissions
13# and limitations under the License.
14#
15# When distributing Covered Code, include this CDDL HEADER in each
16# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17# If applicable, add the following below this CDDL HEADER, with the
18# fields enclosed by brackets "[]" replaced with your own identifying
19# information: Portions Copyright [yyyy] [name of copyright owner]
20#
21# CDDL HEADER END
22#
23
24#
25# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
26# Use is subject to license terms.
27#
28
29#
30# rssread - a simple RSS2.0 reader with RSS to XHTML to
31# plaintext conversion.
32#
33
# The POSIX-conformant tools on Solaris live in /usr/xpg6/bin and
# /usr/xpg4/bin (the versions in /usr/bin are not conformant), so those
# directories are searched first.
PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin
export PATH
36
# Write all arguments, joined into a single line, to stderr.
function printmsg
{
	print "$*" >&2
}
41
# Debug output hook; currently a no-op that always succeeds.
function debugmsg
{
	# Debug output is switched off. To enable it, replace the
	# statement below with:  printmsg "$*"
	return 0
}
47
# Print "<progname>: <message>" to stderr and terminate the script
# with exit code 1.
function fatal_error
{
	print "${progname}: $*" >&2
	exit 1
}
53
# urlconnection_t - minimal URL fetcher for "file", "http" and "https" URLs.
#
# Usage: create an instance, call "init_url <url>" and then "cat_url",
# which writes the document body to stdout. Plain http uses a
# /dev/tcp/<host>/<port> redirection; https is tunnelled through an
# "openssl s_client" child process which is attached via two FIFOs.
typeset -T urlconnection_t=(
	# public
	typeset user_agent="ksh93/urlconnection_t"

	# private variables
	typeset protocol
	typeset path1
	typeset host
	typeset path
	typeset port

	compound netfd=(
		integer in=-1  # incoming traffic
		integer out=-1 # outgoing traffic
	)

	# only used for https
	compound ssl=(
		compound fifo=(
			typeset dir=""
			typeset in=""
			typeset out=""
		)
		integer openssl_client_pid=-1
	)

	# Parse the HTTP status line and the headers read from stdin.
	# $1 is the name of a compound variable which receives the fields
	# "statuscode", "statusmsg" and - when the matching header is
	# present - "content_type", "content_length", "transfer_encoding".
	# Returns 1 if the status line is malformed, otherwise 0.
	function parse_http_response
	{
		nameref response="$1"
		typeset h statuscode statusmsg i

		# we use '\r' as additional IFS to filter the final '\r'
		IFS=$' \t\r' read -r h statuscode statusmsg # read HTTP/1.[01] <code>
		[[ "$h" != ~(Eil)HTTP/.* ]]         && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
		# "+" (not "*"): an empty status code is invalid, too
		[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }
		response.statuscode="$statuscode"
		response.statusmsg="$statusmsg"

		# walk over the remaining headers until the empty line which
		# separates them from the body
		while IFS='' read -r i ; do
			[[ "$i" == $'\r' ]] && break

			# strip '\r' at the end
			i="${i/~(Er)$'\r'/}"

			case "$i" in
				~(Eli)Content-Type:.*)
					response.content_type="${i/~(El).*:[[:blank:]]*/}"
					;;
				~(Eli)Content-Length:[[:blank:]]*[0-9]*)
					integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
					;;
				~(Eli)Transfer-Encoding:.*)
					response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
					;;
			esac
		done

		return 0
	}

	# Copy the HTTP body from stdin to stdout.
	# $1 is the transfer encoding; "chunked" bodies are de-chunked,
	# everything else is passed through unchanged.
	function cat_http_body
	{
		typeset emode="$1"
		typeset hexchunksize="0"
		integer chunksize=0

		if [[ "${emode}" == "chunked" ]] ; then
			while IFS=$'\n' read -r hexchunksize ; do
				hexchunksize="${hexchunksize//$'\r'/}"
				# drop optional chunk extensions ("<size>;name=value")
				hexchunksize="${hexchunksize%%;*}"
				# skip the empty line which follows each chunk
				[[ "${hexchunksize}" != "" ]] || continue
				[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break
				(( chunksize=16#${hexchunksize} ))
				# a chunk of size 0 marks the end of the body
				(( chunksize > 0 )) || break
				dd bs=1 count="${chunksize}" 2>/dev/null
			done
		else
			cat
		fi

		return 0
	}

	# Split the URL given as $1 into protocol, host, port and path.
	# If the URL carries no explicit port then "port" ends up being
	# equal to "host"; cat_url detects this and fills in the default.
	function init_url
	{
		_.protocol="${1%://*}"
		_.path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

		if  [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			_.host="${_.path1%%/*}"
			_.path="${_.path1#*/}"
			_.port="${_.host##*:}"
		fi

		return 0
	}

	# Close the connection; for https also reap the openssl child and
	# remove the temporary FIFO directory.
	# Returns 1 if the openssl child reported a failure, otherwise 0.
	function close_connection
	{
		typeset openssl_ok=true

		if (( _.netfd.in != -1 )) ; then
			redirect {_.netfd.in}<&-
			(( _.netfd.in=-1 ))
		fi

		if (( _.netfd.in != _.netfd.out && _.netfd.out != -1 )) ; then
			redirect {_.netfd.out}<&-
			((  _.netfd.out=-1 ))
		fi

		if [[ "${_.protocol}" == "https" ]] ; then
			wait "${_.ssl.openssl_client_pid}" || openssl_ok=false
			(( _.ssl.openssl_client_pid=-1 ))

			# remove the FIFO directory even when openssl failed,
			# otherwise it would be left behind
			if [[ "${_.ssl.fifo.dir}" != "" ]] ; then
				rm -r -- "${_.ssl.fifo.dir}"
			fi
			_.ssl.fifo.dir=""

			if ! ${openssl_ok} ; then
				print -u2 -f "%s: openssl failed.\n" "$0"
				return 1
			fi
		fi

		return 0
	}

	# Open the connection to ${_.host}:${_.port}.
	# https: create two FIFOs in a temporary directory and start an
	# asynchronous "openssl s_client" between them.
	# http: open a read/write file descriptor on /dev/tcp/<host>/<port>.
	# Returns 1 on failure.
	function open_connection
	{
		if [[ "${_.protocol}" == "https" ]] ; then
			_.ssl.fifo.dir="$(mktemp -d)"
			if [[ ! -d "${_.ssl.fifo.dir}" ]] ; then
				_.ssl.fifo.dir=""
				print -u2 -f "%s: Could not create temporary directory\n" "$0"
				return 1
			fi
			_.ssl.fifo.in="${_.ssl.fifo.dir}/in"
			_.ssl.fifo.out="${_.ssl.fifo.dir}/out"

			# Use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			set -o errexit

			mkfifo "${_.ssl.fifo.in}" "${_.ssl.fifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${_.host}:${_.port}" <"${_.ssl.fifo.in}" >>"${_.ssl.fifo.out}" &

			_.ssl.openssl_client_pid=$!
		else
			redirect {_.netfd.in}<> "/dev/tcp/${_.host}/${_.port}"
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${_.host}:${_.port}" ; return 1 ; }
			(( _.netfd.out=_.netfd.in ))
		fi
		return 0
	}

	# Send the request text given as $1 (a final "\r\n" is appended).
	# For https the request is written to the openssl input FIFO and
	# the read side of the output FIFO becomes ${_.netfd.in}.
	function send_request
	{
		typeset request="$1"

		set -o errexit

		if [[ "${_.protocol}" == "https" ]] ; then
				print -n -- "${request}\r\n" >>	"${_.ssl.fifo.in}"

				redirect {_.netfd.in}< "${_.ssl.fifo.out}"
		else
				print -n -- "${request}\r\n" >&${_.netfd.out}
		fi
		return 0
	}

	# Write the document behind the URL set via init_url to stdout.
	# Returns 0 on success and non-zero if the protocol is unsupported,
	# the URL is incomplete or the connection cannot be opened.
	function cat_url
	{
		typeset request # local; must not leak into the caller's scope

		if [[ "${_.protocol}" == "file" ]] ; then
			cat "${_.path1}"
			return $?
		elif [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			compound httpresponse # http response

			# If URL did not contain a port number in the host part then look at the
			# protocol to get the port number
			if [[ "${_.port}" == "${_.host}" ]] ; then
				case "${_.protocol}" in
					"http")  _.port=80 ;;
					"https") _.port=443 ;;
					*)       _.port="$(getent services "${_.protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
				esac
			else
				_.host="${_.host%:*}"
			fi

			printmsg "protocol=${_.protocol} port=${_.port} host=${_.host} path=${_.path}"

			# prechecks
			[[ "${_.protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
			[[ "${_.port}"     != "" ]] || { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
			[[ "${_.host}"     != "" ]] || { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
			[[ "${_.path}"     != "" ]] || { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

			# The status is tested in a separate statement (and not via
			# "||") so that "errexit" stays effective inside the call
			_.open_connection
			(( $? == 0 )) || return 1

			# send HTTP request
			request="GET /${_.path} HTTP/1.1\r\n"
			request+="Host: ${_.host}\r\n"
			request+="User-Agent: ${_.user_agent}\r\n"
			request+="Connection: close\r\n"
			_.send_request "${request}\r\n"

			# collect response and send it to stdout
			{
				_.parse_http_response httpresponse
				_.cat_http_body "${httpresponse.transfer_encoding}"
			} <&${_.netfd.in}

			_.close_connection

			return 0
		else
			return 1
		fi
		# notreached
	}
)
271
# Filter: copy stdin to stdout and replace HTML/XML character
# references by the characters they stand for.
#
# Handled forms:
#   &name;    names listed in "entity_cache" below
#   &#NNN;    decimal character reference
#   &#xHHH;   hexadecimal character reference
#   &HHH;     bare hex digits (non-standard, kept for compatibility)
# Unknown names are emitted as "ENT=|name|". An '&' which does not start
# a reference (e.g. the '&' in a URL query string) is passed through
# unchanged.
function html_entity_to_ascii
{
	typeset entity
	typeset c
	typeset value
	typeset terminated # bool: true/false - did we see the closing ';' ?

	# Todo: Add more HTML/MathML entities here
	# Note we use a static variable (typeset -S) here to make sure we
	# don't lose the cache data between calls
	typeset -S -A entity_cache=(
		# entity to ascii (fixme: add UTF-8 transliterations)
		["nbsp"]=' '
		["lt"]='<'
		["le"]='<='
		["gt"]='>'
		["ge"]='>='
		["amp"]='&'
		["quot"]='"'
		["apos"]="'"
	)

	while IFS='' read -r -N 1 c ; do
		if [[ "$c" != "&" ]] ; then
			print -n -r -- "${c}"
			continue
		fi

		# collect the characters between '&' and ';'
		entity=""
		terminated=false
		while IFS='' read -r -N 1 c ; do
			case "$c" in
				";")
					terminated=true
					break
					;;
				"&")
					# the previous '&' was plain text, this
					# one may start a real reference
					print -n -r -- "&${entity}"
					entity=""
					;;
				~(Eilr)[a-z0-9#])
					entity+="$c"
					;;
				*)
					# not a reference after all - emit the
					# text unchanged, including the '&'
					print -n -r -- "&${entity}${c}"
					continue 2
					;;
			esac
		done

		# input ended in the middle of a reference
		if ! ${terminated} ; then
			print -n -r -- "&${entity}"
			break
		fi

		# "&;" carries no name at all
		if [[ "${entity}" == "" ]] ; then
			print -n -r -- "&;"
			continue
		fi

		value=""
		if [[ "${entity_cache["${entity}"]}" != "" ]] ; then
			value="${entity_cache["${entity}"]}"
		else
			if [[ "${entity}" == ~(Elr)#[0-9]+ ]] ; then
				# decimal literal; "10#" makes sure that zero-padded
				# values like "&#039;" are not taken as octal
				value="${ printf "\u[${ printf "%x" "$(( 10#${entity:1:8} ))" ; }]" ; }"
			elif [[ "${entity}" == ~(Elri)#x[0-9a-f]+ ]] ; then
				# hexadecimal literal ("&#x20ac;")
				value="${ printf "\u[${entity:2:7}]" ; }"
			elif [[ "${entity:0:7}" == ~(Eilr)[0-9a-f]+ ]] ; then
				# bare hexadecimal digits (non-standard form)
				value="${ printf "\u[${entity:0:7}]" ; }"
			else
				# unknown literal - pass-through
				value="ENT=|${entity}|"
			fi

			entity_cache["${entity}"]="${value}"
		fi

		printf "%s" "${value}"
	done

	return 0
}
347
348# dumb xhtml handler - no CSS,  tables, images, iframes or nested
349# structures are supported (and we assume that the input is correct
350# xhtml). The code was written in a trial&&error manner and should be
351# rewritten to parse xhtml correctly.
# xml_tok callback which renders (simple) xhtml as plain text.
#   $1 - name of the callback array; its "html_pre" slot keeps the
#        "inside <pre>" state between calls (a global variable cannot be
#        used because several tokenizers may share this function)
#   $2 - event type (tag_begin, tag_end, tag_text, document_start, ...)
#   $3 - tag name or text, depending on the event type
# Always returns 0.
function handle_html
{
    nameref callbacks=${1}
    typeset event="$2"
    typeset payload="$3"

    if [[ "${event}" == "tag_begin" ]] ; then
        case "${payload}" in
            br | p) printf "\n" ;;
            hr)     printf "\n-------------------------------------\n" ;;
            pre)    callbacks["html_pre"]='true' ;;
        esac
    elif [[ "${event}" == "tag_end" ]] ; then
        if [[ "${payload}" == "pre" ]] ; then
            callbacks["html_pre"]='false'
        fi
    elif [[ "${event}" == "tag_text" ]] ; then
        if ${callbacks["html_pre"]} ; then
            # preformatted text is copied verbatim
            printf "%s" "${payload}"
        else
            # compress spaces/newlines/tabs/etc.
            printf "%s" "${payload//+([\n\r\t\v[:space:][:blank:]])/ }"
        fi
    elif [[ "${event}" == "document_start" ]] ; then
        callbacks["html_pre"]='false'
    fi
    # all other events (e.g. document_end) need no action

    return 0
}
394
# xml_tok callback which collects the fields of each RSS <item> in the
# global associative array "item" and prints the item as plain text
# when its closing tag is seen.
#   $1 - name of the callback array; its "textbuf" slot accumulates the
#        text of the element which is currently being read (a global
#        variable cannot be used because several tokenizers may share
#        this function)
#   $2 - event type (tag_begin, tag_end, tag_text, ...)
#   $3 - tag name or text, depending on the event type
# Always returns 0.
function handle_rss
{
	nameref callbacks=${1}
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				item)
					# start each item with a clean slate, otherwise
					# fields which are missing in this item (e.g.
					# author or date) would show the values of the
					# previous item
					item["title"]=""
					item["link"]=""
					item["tag"]=""
					item["author"]=""
					item["pubDate"]=""
					item["description"]=""
					;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_end)
			case "${tag_value}" in
				item)
					# note that each RSS item needs to be converted separately from RSS to HTML to plain text
					# to make sure that the state of one RSS item doesn't affect others
					(
						printf $"<br />#### RSS item: title: %s ####" "${item["title"]}"
						printf $"<br />## author: %s" "${item["author"]}"
						printf $"<br />## link:   %s" "${item["link"]}"
						printf $"<br />## date:   %s" "${item["pubDate"]}"
						printf $"<br />## begin description:"
						printf $"<br />%s<br />" "${item["description"]}"
						printf $"<br />## end description<br />"
						print # extra newline to make sure the sed pipeline gets flushed
					) |
						html_entity_to_ascii |	# convert XML entities (e.g. decode RSS content to HTML code)
						xml_tok "xhtmltok_cb" |	# convert HTML to plain text
						html_entity_to_ascii	# convert HTML entities
					;;
				title)                item["title"]="${callbacks["textbuf"]}"       ;;
				link)                 item["link"]="${callbacks["textbuf"]}"        ;;
				dc:creator | author)  item["author"]="${callbacks["textbuf"]}"      ;;
				dc:date | pubDate)    item["pubDate"]="${callbacks["textbuf"]}"     ;;
				description)          item["description"]="${callbacks["textbuf"]}" ;;
			esac
			# the collected text belongs to the element which just ended
			callbacks["textbuf"]=""
			;;
		tag_text)
			callbacks["textbuf"]+="${tag_value}"
			;;
		document_start) ;;
		document_end) ;;
	esac
	return 0
}
451
# Simple XML/xhtml tokenizer: reads the document from stdin and reports
# its pieces to callback functions.
#   $1 - name of an associative array which maps event names to the
#        names of the functions to call. Events: document_start,
#        tag_begin, tag_end, tag_text, tag_comment, document_end.
# Each callback is called as
#        <function> <array name> <event> [<name or text> [<attributes>]]
# Comments and CDATA sections may contain '>'; the text of a CDATA
# section is reported as "tag_text". Text which follows the last tag of
# the input is not reported.
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    [[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    while IFS='' read -r -N 1 c ; do
        # everything up to the next '<' is text
        if [[ "$c" != "<" ]] ; then
            buf+="$c"
            continue
        fi

        isendtag=false

        # flush any text content
        if [[ "$buf" != "" ]] ; then
            [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
            buf=""
        fi

        # input ends right after '<' - there is no tag to report
        IFS='' read -r -N 1 c || break
        if [[ "$c" == "/" ]] ; then
            isendtag=true
        else
            buf="$c"
        fi
        IFS='' read -r -d '>' c
        buf+="$c"

        # handle comments
        if [[ "$buf" == '!--'* ]] ; then
            # a '>' inside of the comment ended the read above too early;
            # keep reading up to the next '>' until the text ends with "--"
            while [[ "$buf" != '!--'*'--' ]] ; do
                buf+=">"
                IFS='' read -r -d '>' c || { buf+="$c" ; break ; }
                buf+="$c"
            done

            # strip the leading "!--" and (if complete) the trailing "--"
            if [[ "$buf" == '!--'*'--' ]] ; then
                c="${buf:3:${#buf}-5}"
            else
                c="${buf:3}"
            fi
            [[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "$c"
            buf=""
            continue
        fi

        # handle CDATA sections - their content is plain text, even if
        # it contains '<' or '>'
        if [[ "$buf" == '![CDATA['* ]] ; then
            while [[ "$buf" != '![CDATA['*']]' ]] ; do
                buf+=">"
                IFS='' read -r -d '>' c || { buf+="$c" ; break ; }
                buf+="$c"
            done

            # strip the leading "![CDATA[" and (if complete) the trailing "]]"
            if [[ "$buf" == '![CDATA['*']]' ]] ; then
                c="${buf:8:${#buf}-10}"
            else
                c="${buf:8}"
            fi
            [[ "$c" != "" && ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$c"
            buf=""
            continue
        fi

        # check if the tag starts and ends at the same time (like "<br />")
        if [[ "${buf}" == */ ]] ; then
            issingletag=true
            buf="${buf%/}"
        else
            issingletag=false
        fi

        # check if the tag has attributes (e.g. space after name)
        if [[ "$buf" == *[[:space:]]* ]] ; then
            namebuf="${buf%%[[:space:]]*}" # text before the first whitespace
            attrbuf="${buf#*[[:space:]]}"  # text after the first whitespace
        else
            namebuf="$buf"
            attrbuf=""
        fi

        if ${isendtag} ; then
            [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
        else
            [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

            # handle tags like <br/> (which are start- and end-tag in one piece)
            if ${issingletag} ; then
                [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            fi
        fi
        buf=""
    done

    [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}
536
537# return the value of LC_MESSAGES needed for subprocesses which
538# want to run in a different locale/encoding
# Print the locale name which is in effect for messages: the first
# non-empty value of $LC_ALL, $LC_MESSAGES and $LANG, or "C" if all of
# them are empty. Always returns 0.
function get_lc_messages
{
	typeset candidate

	for candidate in "${LC_ALL}" "${LC_MESSAGES}" "${LANG}" ; do
		if [[ "${candidate}" != "" ]] ; then
			print "${candidate}"
			return 0
		fi
	done

	print "C"
	return 0
}
546
# Fetch the RSS document from the URL given as $1 and write it as
# plain text (encoded in UTF-8) to stdout; progress messages go to
# stderr.
function do_rssread
{
	# raw feed data; declared local so that it does not leak into
	# (or overwrite a variable of) the global scope
	typeset data

	# set unicode locale since RSS is encoded in UTF-8
	# (and make sure $LC_MESSAGES is set to the parent
	# process's locale that all error messages are using
	# the callers locale/encoding)
	export \
		LC_MESSAGES="${ get_lc_messages ; }" \
		LC_MONETARY="en_US.UTF-8" \
		LC_NUMERIC="en_US.UTF-8" \
		LC_COLLATE="en_US.UTF-8" \
		LC_CTYPE="en_US.UTF-8" \
		LC_TIME="en_US.UTF-8" \
		LANG="en_US.UTF-8"

	# return non-zero exit code for this function if the rss processing below fails
	set -o errexit

	urlconnection_t hc
	hc.user_agent="rssread/ksh93(ssl) (2009-08-14; $(uname -s -r -p))"
	hc.init_url "$1"

	# need extra newline after cat_url to terminate line with $'\n'
	# to make "xml_tok" happy
	data="${ hc.cat_url ; print ; }"

	print -u2 -f "# Got %d lines of RSS data, processing...\n" "${ wc -l <<< "${data}" ; }"

	xml_tok "rsstok_cb" <<< "${data}"

	return 0
}
579
# Print the usage message which getopts generates from the option
# specification in ${rssread_usage} and leave the script with exit
# code 2.
function usage
{
	# restart option parsing, then hand getopts the pseudo option
	# '-?' as the argument to parse
	OPTIND=0
	getopts -a "$progname" "$rssread_usage" OPT '-?'

	exit 2
}
586
# make sure we use the ksh93 builtin versions
builtin basename
builtin cat
builtin mkfifo

# callbacks for xml_tok when it reads the RSS document;
# the "textbuf" slot is working storage of handle_rss
typeset -A rsstok_cb=(
	["tag_begin"]="handle_rss"
	["tag_end"]="handle_rss"
	["tag_text"]="handle_rss"
	["textbuf"]=""
)

# callbacks for xml_tok when it reads the xhtml of one RSS item;
# the "html_pre" slot is working storage of handle_html
typeset -A xhtmltok_cb=(
	["tag_begin"]="handle_html"
	["tag_end"]="handle_html"
	["tag_text"]="handle_html"
	["textbuf"]=""
	["html_pre"]='false'
)

# fields of the RSS item which is currently being read
typeset -A item

# bookmark name -> URL ("random" urls for testing)
typeset -A bookmark_urls=(
	["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=(%22ksh93%22%7C%22ksh+93%22+%7C+%22korn93%22+%7C+%22korn+93%22)&ie=utf-8&num=100&output=rss"
	# OpenSolaris.org sites
	["ksh93_integration"]="http://www.opensolaris.org/rss/os/project/ksh93-integration/announcements/rss2.xml"
	["ksh93_integration_ssl"]="https://www.opensolaris.org/rss/os/project/ksh93-integration/announcements/rss2.xml"
	["shell"]="http://www.opensolaris.org/rss/os/project/shell/announcements/rss2.xml"
	["systemz"]="http://www.opensolaris.org/rss/os/project/systemz/announcements/rss2.xml"
	["systemz_ssl"]="https://www.opensolaris.org/rss/os/project/systemz/announcements/rss2.xml"
	# some Sun staff/sites
	["blogs_sun_com"]="http://blogs.sun.com/main/feed/entries/rss"
	["bigadmin"]="http://www.sun.com/bigadmin/content/rss/motd.xml"
	["bigadmin_scripts"]="https://www.sun.com/bigadmin/content/rss/scripts.xml"
	["jmcp"]="http://www.jmcp.homeunix.com/roller/jmcp/feed/entries/rss"
	["katakai"]="http://blogs.sun.com/katakai/feed/entries/rss"
	["alanc"]="http://blogs.sun.com/alanc/feed/entries/rss"
	["planetsun"]="http://www.planetsun.org/rss20.xml"
	["planetsolaris"]="http://www.planetsolaris.org/rss20.xml"
	["planetopensolaris"]="http://planet.opensolaris.org/rss20.xml"
	["theregister_uk"]="http://www.theregister.co.uk/headlines.rss"
	["heise"]="http://www.heise.de/newsticker/heise.rdf"
	["slashdot"]="http://rss.slashdot.org/Slashdot/slashdot"
	["wikipedia_command_shells"]="http://en.wikipedia.org/w/index.php?title=Comparison_of_command_shells&feed=rss&action=history"
)
633
# name of this script, used as prefix of messages
typeset progname="${ basename "${0}" ; }"

# option specification (and manual page) for getopts
typeset -r rssread_usage=$'+
[-?\n@(#)\$Id: rssread (Roland Mainz) 2009-08-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b RSS to plain text converter
        which fetches RSS streams via HTTP and converts them from
	RSS to HTML to plain text in the current locale/encoding.]
[I:noiconv?Do not convert data from UTF-8 to current locale/encoding.]

[ url ]

[+SEE ALSO?\bksh93\b(1), \bshnote\b(1)]
'

typeset noiconv=false

while getopts -a "${progname}" "${rssread_usage}" OPT ; do
#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		I)    noiconv=true  ;;
		+I)   noiconv=false ;;
		*)    usage ;;
	esac
done
shift $((OPTIND-1))

typeset url="$1"

if [[ "${url}" == "" ]] ; then
	fatal_error $"No url given."
fi

# the argument may be the name of a bookmark instead of a URL
if [[ "${bookmark_urls["${url}"]}" != "" ]] ; then
	printmsg $"Using bookmark ${url} = ${bookmark_urls["${url}"]}"
	url="${bookmark_urls["${url}"]}"
fi

if ${noiconv} ; then
	do_rssread "${url}"
else
	do_rssread "${url}" | iconv -f "UTF-8" - -
fi

# pass on the status of the command above (with iconv: the status of
# iconv) instead of reporting success unconditionally
exit $?
#EOF.
682