#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
# Use is subject to license terms.
#

#
# rssread - a simple RSS2.0 reader with RSS to XHTML to
# plaintext conversion.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# printmsg - write all arguments (joined into one string) as a single
# line to stderr.
# "-r --" makes sure that backslashes or a leading "-" in the message
# (e.g. in an URL) are printed literally and not interpreted by "print".
function printmsg
{
	print -u2 -r -- "$*"
}

# debugmsg - debug output hook; disabled by default (prints nothing and
# returns success). Uncomment the "printmsg" line to enable it.
function debugmsg
{
#	printmsg "$*"
true
}

# fatal_error - print "<progname>: <message>" to stderr and terminate
# the script with exit code 1.
# Reads the global variable "progname".
function fatal_error
{
	print -u2 -r -- "${progname}: $*"
	exit 1
}

# parse HTTP return code, cookies etc.
# parse_http_response - read the HTTP status line and the response
# headers from stdin and store the interesting parts in the compound
# variable whose name is passed as $1:
#   .statuscode, .statusmsg, .content_type, .content_length,
#   .transfer_encoding
# Reading stops after the empty line which terminates the header block,
# e.g. stdin is then positioned at the start of the body.
# Returns 0 on success and 1 if the status line is malformed.
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]]         && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	# note: "+" (not "*") - an empty status code is not valid
	[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# skip remaining headers
	while IFS='' read -r i ; do
		# end of header block (accept both CRLF and bare LF terminated lines)
		[[ "$i" == $'\r' || "$i" == "" ]] && break

		# strip '\r' at the end
		i="${i/~(Er)$'\r'/}"

		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Content-Length:[[:blank:]]*[0-9]*)
				integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
				;;
		esac
	done

	return 0
}

# cat_http_body - copy the HTTP body from stdin to stdout.
# $1 is the transfer encoding: for "chunked" the chunk framing is
# removed, for everything else stdin is copied verbatim.
# Always returns 0.
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	integer chunksize=0

	if [[ "${emode}" == "chunked" ]] ; then
		while IFS='' read -r hexchunksize ; do
			# strip the final '\r' and any chunk extension (";name=value")
			hexchunksize="${hexchunksize%$'\r'}"
			hexchunksize="${hexchunksize%%;*}"

			# the data of each chunk is followed by CRLF which shows
			# up here as an empty line before the next chunk size
			[[ "${hexchunksize}" == "" ]] && continue

			# stop at garbage or at the terminating chunk of size 0
			[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break
			(( chunksize=16#${hexchunksize} ))
			(( chunksize > 0 )) || break

			dd bs=1 count="${chunksize}" 2>/dev/null
		done
	else
		cat
	fi

	return 0
}

# cat_http - fetch the URL given as $1 via HTTP/1.1 (using ksh93's
# /dev/tcp/ support) and write the response body to stdout.
# Returns 0 on success and 1 if the URL is incomplete, the connection
# could not be opened or the response header could not be parsed.
function cat_http
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	typeset host="${path1%%/*}"
	typeset path=""
	typeset port="${host##*:}"
	typeset request       # keep the request buffer local to this function
	typeset crlf=$'\r\n'

	integer netfd
	typeset -C httpresponse # http response

	# only take a path from the URL if there is one - without this check
	# an URL like "http://host" would use the host name as path
	if [[ "${path1}" == */* ]] ; then
		path="${path1#*/}"
	fi

	# If URL did not contain a port number in the host part then look at the
	# protocol to get the port number
	if [[ "${port}" == "${host}" ]] ; then
		case "${protocol}" in
			"http") port=80 ;;
			*)      port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
		esac
	else
		host="${host%:*}"
	fi

	printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

	# prechecks
	[[ "${protocol}" == "" ]] && { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
	[[ "${port}"     == "" ]] && { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
	[[ "${host}"     == "" ]] && { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
	[[ "${path}"     == "" ]] && { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

	# open TCP channel
	redirect {netfd}<>"/dev/tcp/${host}/${port}"
	(( $? != 0 )) && { print -u2 -f "%s: Couldn't open %s\n" "$0" "${1}" ; return 1 ; }

	# send HTTP request
	# (the request is printed with "-r" and uses real CR/LF characters
	# that backslashes in the URL are sent unmodified)
	request="GET /${path} HTTP/1.1${crlf}"
	request+="Host: ${host}${crlf}"
	request+="User-Agent: rssread/ksh93 (2008-10-14; $(uname -s -r -p))${crlf}"
	request+="Connection: close${crlf}"
	print -n -r -- "${request}${crlf}" >&${netfd}

	# collect response and send it to stdout
	if ! parse_http_response httpresponse <&${netfd} ; then
		# don't leak the connection if the response is garbage
		redirect {netfd}<&-
		return 1
	fi
	cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

	# close connection
	redirect {netfd}<&-

	return 0
}

# html_entity_to_ascii - filter which copies stdin to stdout and
# replaces HTML/XML entities ("&name;", "&#123;", "&#x7b;") with the
# matching characters. Unknown entities are passed through as
# "ENT=|name|", text which only looks like an entity (e.g. the "&" in
# "AT&T rocks") is passed through unmodified.
# Always returns 0.
function html_entity_to_ascii
{
	typeset entity
	typeset c
	typeset value
	typeset numref

	# Todo: Add more HTML/MathML entities here
	# Note we use a static variable (typeset -S) here to make sure we
	# don't lose the cache data between calls
	typeset -S -A entity_cache=(
		# entity to ascii (fixme: add UTF-8 transliterations)
		["nbsp"]=' '
		["lt"]='<'
		["le"]='<='
		["gt"]='>'
		["ge"]='>='
		["amp"]='&'
		["quot"]='"'
		["apos"]="'"
	)

	while IFS='' read -r -N 1 c ; do
		if [[ "$c" != "&" ]] ; then
			print -n -r -- "${c}"
			continue
		fi

		# collect the entity name up to the terminating ';'
		entity=""
		while IFS='' read -r -N 1 c ; do
			case "$c" in
				";")
					break
					;;
				~(Eilr)[a-z0-9#])
					entity+="$c"
					continue
					;;
				*)
#					debugmsg "error &${entity}${c}#"

					# not an entity - pass the text through, including
					# the '&' which started it
					print -n -r -- "&${entity}${c}"
					entity=""
					continue 2
					;;
			esac
		done

		value=""
		if [[ "${entity_cache["${entity}"]}" != "" ]] ; then
#			debugmsg "match #${entity}# = #${entity_cache["${entity}"]}#"
			value="${entity_cache["${entity}"]}"
		else
			if [[ "${entity:0:1}" == "#" ]] ; then
				numref="${entity:1}"
				if [[ "${numref}" == ~(Elri)x[0-9a-f]+ ]] ; then
					# hexadecimal character reference (e.g. "&#x41;")
					value="${ printf "\u[${numref:1:7}]" ; }"
				elif [[ "${numref}" == ~(Elr)[0-9]+ ]] ; then
					# decimal character reference (e.g. "&#65;"); force
					# base 10 that leading zeros (e.g. "&#038;") are not
					# interpreted as octal
					value="${ printf "\u[${ printf "%x" "$(( 10#${numref:0:8} ))" ; }]" ; }"
				else
					# malformed numeric reference - pass-through
					value="ENT=|${entity}|"
				fi
			elif [[ "${entity:0:7}" == ~(Eilr)[0-9a-f]+ ]] ; then
				# hexadecimal literal
				value="${ printf "\u[${entity:0:7}]" ; }"
			else
				# unknown literal - pass-through
				value="ENT=|${entity}|"
			fi

			entity_cache["${entity}"]="${value}"

#			debugmsg "lookup #${entity}# = #${entity_cache["${entity}"]}#"
		fi

		printf "%s" "${value}"
	done

	return 0
}

# dumb xhtml handler - no CSS, tables, images, iframes or nested
# structures are supported (and we assume that the input is correct
# xhtml). The code was written in a trial&&error manner and should be
# rewritten to parse xhtml correctly.
#
# Callback for xml_tok:
#   $1 - name of the callback array (also used as state storage)
#   $2 - event type (tag_begin, tag_end, tag_text, document_start,
#        document_end)
#   $3 - tag name or text
# Writes the plain text to stdout and always returns 0.
function handle_html
{
	# we can't use global variables here when multiple callbacks use the same
	# callback function - but we can use the callback associative array for
	# variable storage instead
	nameref callbacks=${1}
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				br)  printf "\n" ;;
				hr)  printf "\n-------------------------------------\n" ;;
				pre) callbacks["html_pre"]='true' ;;
				p)   printf "\n" ;;
			esac
			;;

		tag_end)
			case "${tag_value}" in
				pre) callbacks["html_pre"]='false' ;;
			esac
			;;

		tag_text)
			# "html_pre" holds "true" or "false" and is executed as command
			if ${callbacks["html_pre"]} ; then
				printf "%s" "${tag_value}"
			else
				# compress spaces/newlines/tabs/etc.
				# ([:space:] already covers newline, CR, tab, vertical tab
				# and blanks)
				printf "%s" "${tag_value//+([[:space:]])/ }"
			fi
			;;

		document_start)
			callbacks["html_pre"]='false'
			;;
		document_end) ;;
	esac

	return 0
}

# handle_rss - xml_tok callback for the RSS stream (same arguments as
# handle_html). The text of the interesting tags is collected in the
# global associative array "item"; at the end of each <item> the
# collected data are converted from HTML to plain text and written to
# stdout. Always returns 0.
function handle_rss
{
	# we can't use global variables here when multiple callbacks use the same
	# callback function - but we can use the callback associative array for
	# variable storage instead
	nameref callbacks=${1}
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				item)
					# reset all fields printed at the end of the item that
					# data of a previous item can't show up in this one
					item["title"]=""
					item["link"]=""
					item["tag"]=""
					item["description"]=""
					item["author"]=""
					item["pubDate"]=""
					;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_end)
			case "${tag_value}" in
				item)
					# note that each RSS item needs to be converted separately from RSS to HTML to plain text
					# to make sure that the state of one RSS item doesn't affect others
					(
						printf $"<br />#### RSS item: title: %s ####" "${item["title"]}"
						printf $"<br />## author: %s" "${item["author"]}"
						printf $"<br />## link:   %s" "${item["link"]}"
						printf $"<br />## date:   %s" "${item["pubDate"]}"
						printf $"<br />## begin description:"
						printf $"<br />%s<br />" "${item["description"]}"
						printf $"<br />## end description<br />"
						print # extra newline to make sure the sed pipeline gets flushed
					) |
						html_entity_to_ascii |	# convert XML entities (e.g. decode RSS content to HTML code)
						xml_tok "xhtmltok_cb" |	# convert HTML to plain text
						html_entity_to_ascii	# convert HTML entities
					;;
				title)                item["title"]="${callbacks["textbuf"]}"       ;;
				link)                 item["link"]="${callbacks["textbuf"]}"        ;;
				dc:creator | author)  item["author"]="${callbacks["textbuf"]}"      ;;
				dc:date | pubDate)    item["pubDate"]="${callbacks["textbuf"]}"     ;;
				description)          item["description"]="${callbacks["textbuf"]}" ;;
			esac
			# the text buffer is consumed at the end of every tag
			callbacks["textbuf"]=""
			;;
		tag_text)
			callbacks["textbuf"]+="${tag_value}"
			;;
		document_start) ;;
		document_end) ;;
	esac
	return 0
}

# xml_tok - simple XML tokenizer which reads the document from stdin
# and calls the callbacks stored in the associative array whose name is
# passed as $1. Supported keys: "document_start", "document_end",
# "tag_begin", "tag_end", "tag_text" and "tag_comment"; missing
# callbacks are skipped. Each callback is called with the name of the
# array, the event type and (depending on the event) tag name/text and
# attribute string.
function xml_tok
{
	typeset buf=""
	typeset namebuf=""
	typeset attrbuf=""
	typeset c=""
	typeset isendtag # bool: true/false
	typeset issingletag # bool: true/false (used for tags like "<br />")
	nameref callbacks=${1}

	[[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

	while IFS='' read -r -N 1 c ; do
		isendtag=false

		if [[ "$c" == "<" ]] ; then
			# flush any text content
			if [[ "$buf" != "" ]] ; then
				[[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
				buf=""
			fi

			# the first character after '<' decides whether this is an end tag
			IFS='' read -r -N 1 c
			if [[ "$c" == "/" ]] ; then
				isendtag=true
			else
				buf="$c"
			fi
			# read the remaining tag content up to (and excluding) '>'
			IFS='' read -r -d '>' c
			buf+="$c"

			# handle comments
			if [[ "$buf" == ~(El)!-- ]] ; then
				# did we read the comment completely ?
				if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
					# no, the '>' was part of the comment text - put it
					# back and continue reading until the buffer ends with "--"
					buf+=">"
					while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
						IFS='' read -r -N 1 c || break
						buf+="$c"
					done
				fi

				[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
				buf=""
				continue
			fi

			# check if the tag starts and ends at the same time (like "<br />")
			if [[ "${buf}" == ~(Er).*/ ]] ; then
				issingletag=true
				buf="${buf%*/}"
			else
				issingletag=false
			fi

			# check if the tag has attributes (e.g. space after name)
			if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
				namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
				attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
			else
				namebuf="$buf"
				attrbuf=""
			fi

			if ${isendtag} ; then
				[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
			else
				[[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

				# handle tags like <br/> (which are start- and end-tag in one piece)
				if ${issingletag} ; then
					[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
				fi
			fi
			buf=""
		else
			buf+="$c"
		fi
	done

	[[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

	print # final newline to make filters like "sed" happy
}

# return the value of LC_MESSAGES needed for subprocesses which
# want to run in a different locale/encoding
# (first non-empty value of ${LC_ALL}, ${LC_MESSAGES}, ${LANG} or "C")
function get_lc_messages
{
	[[ "${LC_ALL}"       != "" ]] && { print "${LC_ALL}"      ; return 0 ; }
	[[ "${LC_MESSAGES}"  != "" ]] && { print "${LC_MESSAGES}" ; return 0 ; }
	[[ "${LANG}"         != "" ]] && { print "${LANG}"        ; return 0 ; }
	print "C" ; return 0
}

# do_rssread - fetch the RSS feed at URL $1 and write it as plain text
# (encoded in UTF-8) to stdout. Always returns 0.
function do_rssread
{
	# set unicode locale since RSS is encoded in UTF-8
	# (and make sure $LC_MESSAGES is set to the parent
	# process's locale that all error messages are using
	# the callers locale/encoding)
	export \
		LC_MESSAGES="${ get_lc_messages ; }" \
		LC_MONETARY="en_US.UTF-8" \
		LC_NUMERIC="en_US.UTF-8" \
		LC_COLLATE="en_US.UTF-8" \
		LC_CTYPE="en_US.UTF-8" \
		LC_TIME="en_US.UTF-8" \
		LANG="en_US.UTF-8"

	# need extra newline after cat_http to terminate line with $'\n'
	# to make "xml_tok" happy
	{ cat_http "$1" ; print ; } |
		xml_tok "rsstok_cb"
	return 0
}

# usage - let getopts print the usage message and exit with code 2
function usage
{
	OPTIND=0
	getopts -a "${progname}" "${rssread_usage}" OPT '-?'
	exit 2
}

# make sure we use the ksh93 builtin versions
builtin basename
builtin cat

typeset -A rsstok_cb # callbacks for xml_tok
rsstok_cb["tag_begin"]="handle_rss"
rsstok_cb["tag_end"]="handle_rss"
rsstok_cb["tag_text"]="handle_rss"
rsstok_cb["textbuf"]=""

typeset -A xhtmltok_cb # callbacks for xml_tok
xhtmltok_cb["tag_begin"]="handle_html"
xhtmltok_cb["tag_end"]="handle_html"
xhtmltok_cb["tag_text"]="handle_html"
xhtmltok_cb["textbuf"]=""
xhtmltok_cb["html_pre"]='false'

# data of the RSS item which is currently being read (see handle_rss)
typeset -A item

typeset -A bookmark_urls

# "random" urls for testing
bookmark_urls=(
	["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=(%22ksh93%22%7C%22ksh+93%22+%7C+%22korn93%22+%7C+%22korn+93%22)&ie=utf-8&num=100&output=rss"
	# OpenSolaris.org sites
	["ksh93_integration"]="http://www.opensolaris.org/rss/os/project/ksh93-integration/announcements/rss2.xml"
	["shell"]="http://www.opensolaris.org/rss/os/project/shell/announcements/rss2.xml"
	["systemz"]="http://www.opensolaris.org/rss/os/project/systemz/announcements/rss2.xml"
	# some Sun staff/sites
	["blogs_sun_com"]="http://blogs.sun.com/main/feed/entries/rss"
	["bigadmin"]="http://www.sun.com/bigadmin/content/rss/motd.xml"
	["jmcp"]="http://www.jmcp.homeunix.com/roller/jmcp/feed/entries/rss"
	["katakai"]="http://blogs.sun.com/katakai/feed/entries/rss"
	["alanc"]="http://blogs.sun.com/alanc/feed/entries/rss"
	["planetsun"]="http://www.planetsun.org/rss20.xml"
	["planetsolaris"]="http://www.planetsolaris.org/rss20.xml"
	["planetopensolaris"]="http://planet.opensolaris.org/rss20.xml"
	["theregister_uk"]="http://www.theregister.co.uk/headlines.rss"
	["heise"]="http://www.heise.de/newsticker/heise.rdf"
	["slashdot"]="http://rss.slashdot.org/Slashdot/slashdot"
)

typeset progname="${ basename "${0}" ; }"

typeset -r rssread_usage=$'+
[-?\n@(#)\$Id: rssread (Roland Mainz) 2008-11-10 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b RSS to plain text converter
	which fetches RSS streams via HTTP and converts them from
	RSS to HTML to plain text in the current locale/encoding.]
[I:noiconv?Do not convert data from UTF-8 to current locale/encoding.]

[ url ]

[+SEE ALSO?\bksh93\b(1), \bshnote\b(1)]
'

typeset noiconv=false

while getopts -a "${progname}" "${rssread_usage}" OPT ; do
#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		I)    noiconv=true  ;;
		+I)   noiconv=false ;;
		*)    usage ;;
	esac
done
shift $((OPTIND-1))

typeset url="$1"

if [[ "${url}" == "" ]] ; then
	fatal_error $"No url given."
fi

# the argument may be the name of a bookmark instead of an URL
# (the subscript is quoted that special characters in the argument are
# not interpreted as part of the subscript)
if [[ "${bookmark_urls["${url}"]}" != "" ]] ; then
	printmsg $"Using bookmark ${url} = ${bookmark_urls["${url}"]}"
	url="${bookmark_urls["${url}"]}"
fi

if ${noiconv} ; then
	do_rssread "${url}"
else
	# convert the UTF-8 output to the encoding of the current locale
	do_rssread "${url}" | iconv -f "UTF-8" - -
fi

exit 0
#EOF.