xref: /titanic_44/usr/src/lib/libshell/common/scripts/rssread.sh (revision 3f7d54a6b84904c8f4d8daa4c7b577bede7df8b9)
1#!/usr/bin/ksh93
2
3#
4# CDDL HEADER START
5#
6# The contents of this file are subject to the terms of the
7# Common Development and Distribution License (the "License").
8# You may not use this file except in compliance with the License.
9#
10# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11# or http://www.opensolaris.org/os/licensing.
12# See the License for the specific language governing permissions
13# and limitations under the License.
14#
15# When distributing Covered Code, include this CDDL HEADER in each
16# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17# If applicable, add the following below this CDDL HEADER, with the
18# fields enclosed by brackets "[]" replaced with your own identifying
19# information: Portions Copyright [yyyy] [name of copyright owner]
20#
21# CDDL HEADER END
22#
23
24#
25# Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
26#
27
28#
29# rssread - a simple RSS2.0 reader with RSS to XHTML to
30# plaintext conversion.
31#
32
# Put the XPG6/XPG4 tool directories first: on Solaris the utilities in
# /usr/bin are not POSIX-conformant.
PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin
export PATH
35
# Write all arguments, joined into one line, to stderr.
function printmsg
{
	typeset text="$*"

	print -u2 "${text}"
}
40
# Debug message hook. Disabled by default: prints nothing and succeeds.
function debugmsg
{
	# enable the following line to get debug messages on stderr
#	printmsg "$*"
	return 0
}
46
# Print an error message, prefixed with ${progname}, on stderr and
# terminate the script with exit code 1.
function fatal_error
{
	typeset msg="${progname}: $*"

	print -u2 "${msg}"
	exit 1
}
52
# urlconnection_t - small URL reader type for the "file", "http" and
# "https" protocols.  Plain http uses a /dev/tcp redirection; https is
# tunnelled through an "openssl s_client" child which is connected to
# the shell via two FIFOs.
#
# Usage sketch:  urlconnection_t c ; c.init_url "<url>" ; c.cat_url
typeset -T urlconnection_t=(
	# public
	typeset user_agent="ksh93/urlconnection_t"

	# private variables
	typeset protocol	# URL scheme (text before "://")
	typeset path1		# everything after "://"
	typeset host		# host part (still carries ":port" until cat_url strips it)
	typeset path		# text after the first "/" of path1
	typeset port		# text after the last ":" of host (== host if there is none)

	compound netfd=(
		integer in=-1  # incoming traffic
		integer out=-1 # outgoing traffic
	)

	# only used for https
	compound ssl=(
		compound fifo=(
			typeset dir=""
			typeset in=""
			typeset out=""
		)
		integer openssl_client_pid=-1
	)

	# parse HTTP return code, cookies etc.
	# $1 is the name of a compound variable which receives the fields
	# "statuscode", "statusmsg" and (when the matching header is present)
	# "content_type", "content_length" and "transfer_encoding".
	# Reads the status line and the headers from stdin; returns 1 if the
	# status line is malformed, 0 otherwise.
	function parse_http_response
	{
		nameref response="$1"
		typeset h statuscode statusmsg i

		# we use '\r' as additional IFS to filter the final '\r'
		IFS=$' \t\r' read -r h statuscode statusmsg # read HTTP/1.[01] <code>
		[[ "$h" != ~(Eil)HTTP/.* ]]         && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
		[[ "$statuscode" != ~(Elr)[0-9]* ]] && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }
		response.statuscode="$statuscode"
		response.statusmsg="$statusmsg"

		# skip remaining headers
		while IFS='' read -r i ; do
			# a line consisting only of '\r' ends the header section
			[[ "$i" == $'\r' ]] && break

			# strip '\r' at the end
			i="${i/~(Er)$'\r'/}"

			case "$i" in
				~(Eli)Content-Type:.*)
					response.content_type="${i/~(El).*:[[:blank:]]*/}"
					;;
				~(Eli)Content-Length:[[:blank:]]*[0-9]*)
					integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
					;;
				~(Eli)Transfer-Encoding:.*)
					response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
					;;
			esac
		done

		return 0
	}

	# Copy the HTTP body from stdin to stdout.
	# $1 is the transfer encoding; "chunked" bodies are de-chunked,
	# everything else is passed through unchanged.
	function cat_http_body
	{
		typeset emode="$1"
		typeset hexchunksize="0"
		integer chunksize=0

		if [[ "${emode}" == "chunked" ]] ; then
			while IFS=$'\n' read hexchunksize ; do
				hexchunksize="${hexchunksize//$'\r'/}"
				# skip the empty line which follows each chunk
				[[ "${hexchunksize}" != "" ]] || continue
				[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break
				(( chunksize=$( printf "16#%s\n" "${hexchunksize}" )  ))
				# a chunk of size 0 terminates the body
				(( chunksize > 0 )) || break
				dd bs=1 count="${chunksize}" 2>/dev/null
			done
		else
			cat
		fi

		return 0
	}

	# Split the URL given as $1 into protocol/host/path/port.
	# The port is resolved later by cat_url (it equals the host string
	# when the URL carries no explicit ":port").
	function init_url
	{
		_.protocol="${1%://*}"
		_.path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

		if  [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			_.host="${_.path1%%/*}"
			_.path="${_.path1#*/}"
			_.port="${_.host##*:}"
		fi

		return 0
	}

	# close connection
	# Closes the network file descriptor(s); for https it additionally
	# waits for the openssl child and removes the FIFO directory.
	# Returns 1 if the openssl child failed, 0 otherwise.
	function close_connection
	{
		integer ret=0
		# remember the descriptors before resetting them, otherwise a
		# descriptor shared by "in" and "out" would be closed twice
		integer infd="${_.netfd.in}"
		integer outfd="${_.netfd.out}"

		if (( infd != -1 )) ; then
			redirect {_.netfd.in}<&-
			(( _.netfd.in=-1 ))
		fi

		if (( outfd != -1 )) ; then
			if (( outfd != infd )) ; then
				redirect {_.netfd.out}<&-
			fi
			((  _.netfd.out=-1 ))
		fi

		if [[ "${_.protocol}" == "https" ]] ; then
			wait ${_.ssl.openssl_client_pid} || ret=1
			(( _.ssl.openssl_client_pid=-1 ))

			# always remove the FIFO directory, even if openssl failed
			rm -r "${_.ssl.fifo.dir}"
			_.ssl.fifo.dir=""

			if (( ret != 0 )) ; then
				print -u2 -f "%s: openssl failed.\n" "$0"
				return 1
			fi
		fi

		return 0
	}

	# Open the connection to ${_.host}:${_.port}.
	# https: create the FIFO pair and start the openssl child;
	# http:  open /dev/tcp/<host>/<port> for reading and writing.
	function open_connection
	{
		if [[ "${_.protocol}" == "https" ]] ; then
			_.ssl.fifo.dir="$(mktemp -t -d)"
			_.ssl.fifo.in="${_.ssl.fifo.dir}/in"
			_.ssl.fifo.out="${_.ssl.fifo.dir}/out"

			# Use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			set -o errexit

			mkfifo "${_.ssl.fifo.in}" "${_.ssl.fifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${_.host}:${_.port}" <"${_.ssl.fifo.in}" >>"${_.ssl.fifo.out}" &

			_.ssl.openssl_client_pid=$!
		else
			redirect {_.netfd.in}<> "/dev/tcp/${_.host}/${_.port}"
			# report the target even when the caller passed no argument
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${1:-${_.host}:${_.port}}" ; return 1 ; }
			(( _.netfd.out=_.netfd.in ))
		fi
		return 0
	}

	# Send the request text given as $1 (a final "\r\n" is appended).
	# For https the request is written to the openssl input FIFO and the
	# output FIFO is then opened as the incoming descriptor.
	function send_request
	{
		typeset request="$1"

		set -o errexit

		if [[ "${_.protocol}" == "https" ]] ; then
			print -n -- "${request}\r\n" >>	"${_.ssl.fifo.in}"

			redirect {_.netfd.in}< "${_.ssl.fifo.out}"
		else
			print -n -- "${request}\r\n" >&${_.netfd.out}
		fi
		return 0
	}

	# Write the content behind the URL set via init_url to stdout.
	# "file" URLs are handed to cat; http/https URLs are fetched with a
	# "GET ... HTTP/1.1" request.  Returns 1 for unknown protocols, for
	# missing URL parts or if the connection cannot be opened.
	function cat_url
	{
		if [[ "${_.protocol}" == "file" ]] ; then
			cat "${_.path1}"
			return $?
		elif [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			compound httpresponse # http response
			typeset request       # keep the request text local to this function

			# If URL did not contain a port number in the host part then look at the
			# protocol to get the port number
			if [[ "${_.port}" == "${_.host}" ]] ; then
				case "${_.protocol}" in
					"http")  _.port=80 ;;
					"https") _.port=443 ;;
					*)       _.port="$(getent services "${_.protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
				esac
			else
				_.host="${_.host%:*}"
			fi

			printmsg "protocol=${_.protocol} port=${_.port} host=${_.host} path=${_.path}"

			# prechecks
			[[ "${_.protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
			[[ "${_.port}"     != "" ]] || { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
			[[ "${_.host}"     != "" ]] || { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
			[[ "${_.path}"     != "" ]] || { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

			_.open_connection || return 1

			# send HTTP request
			request="GET /${_.path} HTTP/1.1\r\n"
			request+="Host: ${_.host}\r\n"
			request+="User-Agent: ${_.user_agent}\r\n"
			request+="Connection: close\r\n"
			_.send_request "${request}\r\n"

			# collect response and send it to stdout
			{
				_.parse_http_response httpresponse
				_.cat_http_body "${httpresponse.transfer_encoding}"
			} <&${_.netfd.in}

			_.close_connection

			return 0
		else
			return 1
		fi
		# notreached
	}
)
270
# Filter: read text from stdin, replace HTML/XML character entities and
# write the result to stdout.
#
# Handled forms:
#   &name;   - names listed in the entity cache below
#   &#NNN;   - decimal character reference
#   &#xHHH;  - hexadecimal character reference
#   &HHH;    - bare hexadecimal digits (kept for backward compatibility)
# Unknown entities are passed through as "ENT=|name|".  A '&' which does
# not start a ';'-terminated entity is copied to the output unchanged.
# Always returns 0.
function html_entity_to_ascii
{
	typeset entity
	typeset c
	typeset value
	typeset terminated # bool: true if the entity was closed by ';'

	# Todo: Add more HTML/MathML entities here
	# Note we use a static variable (typeset -S) here to make sure we
	# don't lose the cache data between calls
	typeset -S -A entity_cache=(
		# entity to ascii (fixme: add UTF-8 transliterations)
		["nbsp"]=' '
		["lt"]='<'
		["le"]='<='
		["gt"]='>'
		["ge"]='>='
		["amp"]='&'
		["quot"]='"'
		["apos"]="'"
	)

	while IFS='' read -r -N 1 c ; do
		if [[ "$c" != "&" ]] ; then
			print -n -r -- "${c}"
			continue
		fi

		# collect the entity name up to the closing ';'
		entity=""
		terminated=false
		while IFS='' read -r -N 1 c ; do
			case "$c" in
				";")
					terminated=true
					break
					;;
				~(Eilr)[a-z0-9#])
					entity+="$c"
					continue
					;;
				*)
#					debugmsg "error &${entity}${c}#"

					# not an entity after all: emit the text as it
					# was read, including the leading '&'
					print -n -r -- "&${entity}${c}"
					entity=""
					continue 2
					;;
			esac
		done

		# input ended in the middle of "&..." - emit it unchanged
		if ! ${terminated} ; then
			print -n -r -- "&${entity}"
			continue
		fi

		# "&;" carries no entity name (and would be an empty array subscript)
		if [[ "${entity}" == "" ]] ; then
			print -n -r -- "&;"
			continue
		fi

		value=""
		if [[ "${entity_cache["${entity}"]}" != "" ]] ; then
#			debugmsg "match #${entity}# = #${entity_cache["${entity}"]}#"
			value="${entity_cache["${entity}"]}"
		else
			if [[ "${entity:0:2}" == '#'[xX] ]] ; then
				# hexadecimal character reference ("&#x41;")
				value="${ printf "\u[${entity:2:8}]" ; }"
			elif [[ "${entity:0:1}" == "#" ]] ; then
				# decimal literal
				value="${ printf "\u[${ printf "%x" "${entity:1:8}" ; }]" ; }"
			elif [[ "${entity:0:7}" == ~(Eilr)[0-9a-f]* ]] ; then
				# hexadecimal literal
				value="${ printf "\u[${entity:0:7}]" ; }"
			else
				# unknown literal - pass-through
				value="ENT=|${entity}|"
			fi

			# remember the result for later calls
			entity_cache["${entity}"]="${value}"

#			debugmsg "lookup #${entity}# = #${entity_cache["${entity}"]}#"
		fi

		printf "%s" "${value}"
	done

	return 0
}
346
# Minimal xhtml-to-text callback for xml_tok.  CSS, tables, images,
# iframes and nested structures are not supported and the input is
# assumed to be correct xhtml.  (Written by trial and error; should be
# replaced by a proper xhtml parser.)
#
# $1 - name of the callback array (also used to keep the "html_pre" state)
# $2 - event type (tag_begin, tag_end, tag_text, document_start, document_end)
# $3 - tag name or text, depending on the event type
function handle_html
{
    # State cannot live in global variables because several callback
    # tables may share this function; it is kept in the callback array.
    nameref callbacks=${1}
    typeset tag_type="$2"
    typeset tag_value="$3"
    typeset text

    if [[ "${tag_type}" == "tag_begin" ]] ; then
        case "${tag_value}" in
            br | p) printf "\n" ;;
            hr)     printf "\n-------------------------------------\n" ;;
            pre)    callbacks["html_pre"]='true' ;;
        esac
    elif [[ "${tag_type}" == "tag_end" ]] ; then
        if [[ "${tag_value}" == "pre" ]] ; then
            callbacks["html_pre"]='false'
        fi
    elif [[ "${tag_type}" == "tag_text" ]] ; then
        text="${tag_value}"
        if ! ${callbacks["html_pre"]} ; then
            # outside of <pre>: compress spaces/newlines/tabs/etc.
            text="${tag_value//+([\n\r\t\v[:space:][:blank:]])/ }"
        fi
        printf "%s" "${text}"
    elif [[ "${tag_type}" == "document_start" ]] ; then
        callbacks["html_pre"]='false'
    fi
    # document_end and unknown events: nothing to do

    return 0
}
393
# xml_tok callback which collects the fields of each RSS <item> in the
# global associative array "item" and, at </item>, prints the item as
# plain text (RSS -> HTML -> plain text).
#
# $1 - name of the callback array (its "textbuf" entry collects text)
# $2 - event type (tag_begin, tag_end, tag_text, document_start, document_end)
# $3 - tag name or text, depending on the event type
function handle_rss
{
	# we can't use global variables here when multiple callbacks use the same
	# callback function - but we can use the callback associative array for
	# variable storage instead
	nameref callbacks=${1}
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				item)
					# reset every field which is printed at </item>, otherwise
					# an item without <author>/<pubDate> would show the
					# values of the previous item
					item["title"]=""
					item["link"]=""
					item["tag"]=""
					item["description"]=""
					item["author"]=""
					item["pubDate"]=""
					;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_end)
			case "${tag_value}" in
				item)
					# note that each RSS item needs to be converted separately from RSS to HTML to plain text
					# to make sure that the state of one RSS item doesn't affect others
					(
						printf $"<br />#### RSS item: title: %s ####" "${item["title"]}"
						printf $"<br />## author: %s" "${item["author"]}"
						printf $"<br />## link:   %s" "${item["link"]}"
						printf $"<br />## date:   %s" "${item["pubDate"]}"
						printf $"<br />## begin description:"
						printf $"<br />%s<br />" "${item["description"]}"
						printf $"<br />## end description<br />"
						print # extra newline to make sure the sed pipeline gets flushed
					) |
						html_entity_to_ascii |	# convert XML entities (e.g. decode RSS content to HTML code)
						xml_tok "xhtmltok_cb" |	# convert HTML to plain text
						html_entity_to_ascii	# convert HTML entities
					;;
				title)                item["title"]="${callbacks["textbuf"]}"       ;;
				link)                 item["link"]="${callbacks["textbuf"]}"        ;;
				dc:creator | author)  item["author"]="${callbacks["textbuf"]}"      ;;
				dc:date | pubDate)    item["pubDate"]="${callbacks["textbuf"]}"     ;;
				description)          item["description"]="${callbacks["textbuf"]}" ;;
			esac
			# the text collected for this element has been consumed
			callbacks["textbuf"]=""
			;;
		tag_text)
			callbacks["textbuf"]+="${tag_value}"
			;;
		document_start) ;;
		document_end) ;;
	esac
	return 0
}
450
# Simple XML tokenizer: reads XML from stdin character by character and
# reports what it finds through the callback table whose name is $1.
# Table entries used (each optional, value = name of the handler):
#   document_start, document_end, tag_begin, tag_end, tag_text, tag_comment
# Handlers are called as:  <handler> <table name> <event> [<value> [<attributes>]]
function xml_tok
{
    typeset text=""      # collected text or raw tag content
    typeset tagname=""
    typeset tagattrs=""
    typeset ch=""
    typeset closing      # bool: true/false ("</foo>")
    typeset selfclosing  # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    [[ -n "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    while IFS='' read -r -N 1 ch ; do
        closing=false

        # ordinary character: just collect it
        if [[ "$ch" != "<" ]] ; then
            text+="$ch"
            continue
        fi

        # a tag starts here - flush any text content first
        if [[ "$text" != "" ]] ; then
            [[ -n "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$text"
            text=""
        fi

        # the first character after '<' tells whether this is an end tag
        IFS='' read -r -N 1 ch
        if [[ "$ch" == "/" ]] ; then
            closing=true
        else
            text="$ch"
        fi
        # read the rest of the tag up to (not including) '>'
        IFS='' read -r -d '>' ch
        text+="$ch"

        # handle comments
        if [[ "$text" == ~(El)!-- ]] ; then
            # did we read the comment completely ?
            if [[ "$text" != ~(Elr)!--.*-- ]] ; then
                # no, the '>' we stopped at was part of the comment text
                text+=">"
                while [[ "$text" != ~(Elr)!--.*-- ]] ; do
                    IFS='' read -r -N 1 ch || break
                    text+="$ch"
                done
            fi

            [[ -n "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${text:3:${#text}-5}"
            text=""
            continue
        fi

        # check if the tag starts and ends at the same time (like "<br />")
        if [[ "${text}" == ~(Er).*/ ]] ; then
            selfclosing=true
            text="${text%*/}"
        else
            selfclosing=false
        fi

        # check if the tag has attributes (e.g. space after name)
        if [[ "$text" == ~(E)[[:space:][:blank:]] ]] ; then
            tagname="${text%%~(E)[[:space:][:blank:]].*}"
            tagattrs="${text#~(E).*[[:space:][:blank:]]}"
        else
            tagname="$text"
            tagattrs=""
        fi

        if ${closing} ; then
            [[ -n "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$tagname"
        else
            [[ -n "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$tagname" "$tagattrs"

            # handle tags like <br/> (which are start- and end-tag in one piece)
            if ${selfclosing} ; then
                [[ -n "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$tagname"
            fi
        fi
        text=""
    done

    [[ -n "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}
535
# Print the locale name that subprocesses running in a different
# locale/encoding should use for LC_MESSAGES: the first non-empty value
# of LC_ALL, LC_MESSAGES and LANG, or "C" if all of them are empty.
function get_lc_messages
{
	typeset candidate

	for candidate in "${LC_ALL}" "${LC_MESSAGES}" "${LANG}" ; do
		if [[ "${candidate}" != "" ]] ; then
			print "${candidate}"
			return 0
		fi
	done

	print "C"
	return 0
}
545
# Fetch the RSS document behind the URL given as $1 and print its items
# as plain text on stdout (progress information goes to stderr).
function do_rssread
{
	# raw RSS document; declared here so it does not leak into the
	# global scope
	typeset data

	# set unicode locale since RSS is encoded in UTF-8
	# (and make sure $LC_MESSAGES is set to the parent
	# process's locale that all error messages are using
	# the callers locale/encoding)
	export \
		LC_MESSAGES="${ get_lc_messages ; }" \
		LC_MONETARY="en_US.UTF-8" \
		LC_NUMERIC="en_US.UTF-8" \
		LC_COLLATE="en_US.UTF-8" \
		LC_CTYPE="en_US.UTF-8" \
		LC_TIME="en_US.UTF-8" \
		LANG="en_US.UTF-8"

	# return non-zero exit code for this function if the rss processing below fails
	set -o errexit

	urlconnection_t hc
	hc.user_agent="rssread/ksh93(ssl) (2010-03-27; $(uname -s -r -p))"
	hc.init_url "$1"

	# need extra newline after cat_url to terminate line with $'\n'
	# to make "xml_tok" happy
	data="${ hc.cat_url ; print ; }"

	print -u2 -f "# Got %d lines of RSS data, processing...\n" "${ wc -l <<< "${data}" ; }"

	xml_tok "rsstok_cb" <<< "${data}"

	return 0
}
578
# Have getopts emit the usage message described by ${rssread_usage}
# (requested through the '-?' argument) and leave the script with exit
# code 2.
function usage
{
	# restart option parsing before handing '-?' to getopts
	OPTIND=0
	getopts -a "${progname}" "${rssread_usage}" OPT '-?'

	exit 2
}
585
# make sure we use the ksh93 builtin versions
builtin basename
builtin cat
builtin mkfifo

# callbacks for xml_tok used while reading the RSS document
# ("textbuf" collects the text of the element currently being read)
typeset -A rsstok_cb=(
	["tag_begin"]="handle_rss"
	["tag_end"]="handle_rss"
	["tag_text"]="handle_rss"
	["textbuf"]=""
)

# callbacks for xml_tok used while converting xhtml to plain text
# ("html_pre" holds 'true' while inside a <pre> element)
typeset -A xhtmltok_cb=(
	["tag_begin"]="handle_html"
	["tag_end"]="handle_html"
	["tag_text"]="handle_html"
	["textbuf"]=""
	["html_pre"]='false'
)

# fields of the RSS item currently being read (filled by handle_rss)
typeset -A item
605
# bookmark name -> feed URL; a bookmark name may be given on the command
# line instead of a URL ("random" urls for testing)
typeset -A bookmark_urls=(
	["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=(%22ksh93%22%7C%22ksh+93%22+%7C+%22korn93%22+%7C+%22korn+93%22)&ie=utf-8&num=100&output=rss"
	# some Sun staff/sites
	["blogs_sun_com"]="http://blogs.sun.com/main/feed/entries/rss"
	["bigadmin"]="http://www.sun.com/bigadmin/content/rss/motd.xml"
	["bigadmin_scripts"]="https://www.sun.com/bigadmin/content/rss/scripts.xml"
	["jmcp"]="http://www.jmcp.homeunix.com/roller/jmcp/feed/entries/rss"
	["katakai"]="http://blogs.sun.com/katakai/feed/entries/rss"
	["alanc"]="http://blogs.sun.com/alanc/feed/entries/rss"
	["planetsun"]="http://www.planetsun.org/rss20.xml"
	["planetsolaris"]="http://www.planetsolaris.org/rss20.xml"
	["planetopensolaris"]="http://planet.opensolaris.org/rss20.xml"
	["theregister_uk"]="http://www.theregister.co.uk/headlines.rss"
	["heise"]="http://www.heise.de/newsticker/heise.rdf"
	["slashdot"]="http://rss.slashdot.org/Slashdot/slashdot"
	["wikipedia_command_shells"]="http://en.wikipedia.org/w/index.php?title=Comparison_of_command_shells&feed=rss&action=history"
)
626
# name of this script without its directory part (used in messages)
typeset progname
progname="$(basename "${0}")"

# option/usage description handed to getopts
typeset -r rssread_usage=$'+
[-?\n@(#)\$Id: rssread (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b RSS to plain text converter
        which fetches RSS streams via HTTP and converts them from
	RSS to HTML to plain text in the current locale/encoding.]
[I:noiconv?Do not convert data from UTF-8 to current locale/encoding.]

[ url ]

[+SEE ALSO?\bksh93\b(1), \bshnote\b(1)]
'
643
# bool: true = print the UTF-8 data as-is, false = convert via iconv
typeset noiconv=false

# parse command line options
while getopts -a "${progname}" "${rssread_usage}" OPT ; do
#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		I)
			noiconv=true
			;;
		+I)
			noiconv=false
			;;
		*)
			usage
			;;
	esac
done
shift $(( OPTIND - 1 ))

# the first remaining argument is the url (or the name of a bookmark)
typeset url="$1"

[[ "${url}" != "" ]] || fatal_error $"No url given."

# replace a bookmark name by the url stored for it
if [[ "${bookmark_urls[${url}]}" != "" ]] ; then
	printmsg $"Using bookmark ${url} = ${bookmark_urls[${url}]}"
	url="${bookmark_urls[${url}]}"
fi

if ! ${noiconv} ; then
	do_rssread "${url}" | iconv -f "UTF-8" - -
else
	do_rssread "${url}"
fi

exit 0
674#EOF.
675