#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
#

#
# rssread - a simple RSS2.0 reader with RSS to XHTML to
# plaintext conversion.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

function printmsg
{
	print -u2 "$*"
}

function debugmsg
{
#	printmsg "$*"
	true
}

function fatal_error
{
	print -u2 "${progname}: $*"
	exit 1
}

typeset -T urlconnection_t=(
	# public
	typeset user_agent="ksh93/urlconnection_t"

	# private variables
	typeset protocol
	typeset path1
	typeset host
	typeset path
	typeset port

	compound netfd=(
		integer in=-1	# incoming traffic
		integer out=-1	# outgoing traffic
	)

	# only used for https
	compound ssl=(
		compound fifo=(
			typeset dir=""
			typeset in=""
			typeset out=""
		)
		integer openssl_client_pid=-1
	)

	# parse HTTP return code, cookies etc.
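	# Illustrative sketch (values assumed): after parse_http_response below
	# has filled a caller-supplied compound "response", its fields may look
	# like statuscode=200, statusmsg='OK', content_type='text/xml',
	# content_length=1234 and transfer_encoding='chunked'.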
	function parse_http_response
	{
		nameref response="$1"
		typeset h statuscode statusmsg i

		# we use '\r' as additional IFS to filter the final '\r'
		IFS=$' \t\r' read -r h statuscode statusmsg	# read HTTP/1.[01] <code>
		[[ "$h" != ~(Eil)HTTP/.* ]] && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
		[[ "$statuscode" != ~(Elr)[0-9]* ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; }
		response.statuscode="$statuscode"
		response.statusmsg="$statusmsg"

		# skip remaining headers
		while IFS='' read -r i ; do
			[[ "$i" == $'\r' ]] && break

			# strip '\r' at the end
			i="${i/~(Er)$'\r'/}"

			case "$i" in
				~(Eli)Content-Type:.*)
					response.content_type="${i/~(El).*:[[:blank:]]*/}"
					;;
				~(Eli)Content-Length:[[:blank:]]*[0-9]*)
					integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
					;;
				~(Eli)Transfer-Encoding:.*)
					response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
					;;
			esac
		done

		return 0
	}

	function cat_http_body
	{
		typeset emode="$1"
		typeset hexchunksize="0"
		integer chunksize=0

		if [[ "${emode}" == "chunked" ]] ; then
			while IFS=$'\n' read hexchunksize ; do
				hexchunksize="${hexchunksize//$'\r'/}"
				[[ "${hexchunksize}" != "" ]] || continue
				[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break
				(( chunksize=$( printf "16#%s\n" "${hexchunksize}" ) ))
				(( chunksize > 0 )) || break
				dd bs=1 count="${chunksize}" 2>/dev/null
			done
		else
			cat
		fi

		return 0
	}

	function init_url
	{
		_.protocol="${1%://*}"
		_.path1="${1#*://}"	# "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

		if [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			_.host="${_.path1%%/*}"
			_.path="${_.path1#*/}"
			_.port="${_.host##*:}"
		fi

		return 0
	}

	# close connection
	function close_connection
	{
		integer ret

		if (( _.netfd.in != -1 )) ; then
			redirect {_.netfd.in}<&-
			(( _.netfd.in=-1 ))
		fi

		if (( _.netfd.in != _.netfd.out && _.netfd.out != -1 )) ; then
			redirect {_.netfd.out}<&-
			(( _.netfd.out=-1 ))
		fi

		if [[ "${_.protocol}" == "https" ]] ; then
			wait ${_.ssl.openssl_client_pid} || { print -u2 -f "%s: openssl failed.\n" "$0" ; return 1 ; }
			(( _.ssl.openssl_client_pid=-1 ))

			rm -r "${_.ssl.fifo.dir}"
			_.ssl.fifo.dir=""
		fi

		return 0
	}

	function open_connection
	{
		if [[ "${_.protocol}" == "https" ]] ; then
			_.ssl.fifo.dir="$(mktemp -t -d)"
			_.ssl.fifo.in="${_.ssl.fifo.dir}/in"
			_.ssl.fifo.out="${_.ssl.fifo.dir}/out"

			# Use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			set -o errexit

			mkfifo "${_.ssl.fifo.in}" "${_.ssl.fifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${_.host}:${_.port}" <"${_.ssl.fifo.in}" >>"${_.ssl.fifo.out}" &

			_.ssl.openssl_client_pid=$!
		else
			redirect {_.netfd.in}<> "/dev/tcp/${_.host}/${_.port}"
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${_.host}:${_.port}" ; return 1 ; }
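			# Note: "/dev/tcp/<host>/<port>" above is a ksh93 pseudo-device;
			# the redirection opens a bidirectional TCP socket, so the same
			# file descriptor is reused for reading and writing below.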
			(( _.netfd.out=_.netfd.in ))
		fi
		return 0
	}

	function send_request
	{
		typeset request="$1"

		set -o errexit

		if [[ "${_.protocol}" == "https" ]] ; then
			print -n -- "${request}\r\n" >> "${_.ssl.fifo.in}"

			redirect {_.netfd.in}< "${_.ssl.fifo.out}"
		else
			print -n -- "${request}\r\n" >&${_.netfd.out}
		fi
		return 0
	}

	function cat_url
	{
		if [[ "${_.protocol}" == "file" ]] ; then
			cat "${_.path1}"
			return $?
		elif [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			compound httpresponse	# http response

			# If the URL did not contain a port number in the host part
			# then look at the protocol to get the port number
			if [[ "${_.port}" == "${_.host}" ]] ; then
				case "${_.protocol}" in
					"http")  _.port=80 ;;
					"https") _.port=443 ;;
					*) _.port="$(getent services "${_.protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
				esac
			else
				_.host="${_.host%:*}"
			fi

			printmsg "protocol=${_.protocol} port=${_.port} host=${_.host} path=${_.path}"

			# prechecks
			[[ "${_.protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
			[[ "${_.port}" != "" ]] || { print -u2 -f "%s: port not set.\n" "$0" ; return 1 ; }
			[[ "${_.host}" != "" ]] || { print -u2 -f "%s: host not set.\n" "$0" ; return 1 ; }
			[[ "${_.path}" != "" ]] || { print -u2 -f "%s: path not set.\n" "$0" ; return 1 ; }

			_.open_connection || return 1

			# send HTTP request
			request="GET /${_.path} HTTP/1.1\r\n"
			request+="Host: ${_.host}\r\n"
			request+="User-Agent: ${_.user_agent}\r\n"
			request+="Connection: close\r\n"
			_.send_request "${request}\r\n"

			# collect response and send it to stdout
			{
				_.parse_http_response httpresponse
				_.cat_http_body "${httpresponse.transfer_encoding}"
			} <&${_.netfd.in}

			_.close_connection

			return 0
		else
			return 1
		fi
		# notreached
	}
)

function html_entity_to_ascii
{
	typeset buf
	typeset entity
	typeset c
	typeset value

	# Todo: Add more HTML/MathML entities here
	# Note we use a static variable (typeset -S) here to make sure we
	# don't lose the cache data between calls
	typeset -S -A entity_cache=(
		# entity to ascii (fixme: add UTF-8 transliterations)
		["nbsp"]=' '
		["lt"]='<'
		["le"]='<='
		["gt"]='>'
		["ge"]='>='
		["amp"]='&'
		["quot"]='"'
		["apos"]="'"
	)

	buf=""
	while IFS='' read -r -N 1 c ; do
		if [[ "$c" != "&" ]] ; then
			print -n -r -- "${c}"
			continue
		fi

		entity=""
		while IFS='' read -r -N 1 c ; do
			case "$c" in
				";")
					break
					;;
				~(Eilr)[a-z0-9#])
					entity+="$c"
					continue
					;;
				*)
#					debugmsg "error &${entity}${c}#"

					print -n -r -- "${entity}${c}"
					entity=""
					continue 2
					;;
			esac
		done

		value=""
		if [[ "${entity_cache["${entity}"]}" != "" ]] ; then
#			debugmsg "match #${entity}# = #${entity_cache["${entity}"]}#"
			value="${entity_cache["${entity}"]}"
		else
			if [[ "${entity:0:1}" == "#" ]] ; then
				# decimal literal
				value="${ printf "\u[${ printf "%x" "${entity:1:8}" ; }]" ; }"
			elif [[ "${entity:0:7}" == ~(Eilr)[0-9a-f]* ]] ; then
				# hexadecimal literal
				value="${ printf "\u[${entity:0:7}]" ; }"
			else
				# unknown literal - pass-through
				value="ENT=|${entity}|"
			fi
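
			# remember the computed value; "entity_cache" is declared with
			# "typeset -S" above, so the mapping persists between calls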
			entity_cache["${entity}"]="${value}"

#			debugmsg "lookup #${entity}# = #${entity_cache["${entity}"]}#"
		fi

		printf "%s" "${value}"
	done

	return 0
}

# dumb xhtml handler - no CSS, tables, images, iframes or nested
# structures are supported (and we assume that the input is correct
# xhtml). The code was written in a trial-and-error manner and should be
# rewritten to parse xhtml correctly.
function handle_html
{
	# we can't use global variables here because multiple callbacks may use
	# the same callback function - but we can use the callback associative
	# array for variable storage instead
	nameref callbacks=${1}
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				br)	printf "\n" ;;
				hr)	printf "\n-------------------------------------\n" ;;
				pre)	callbacks["html_pre"]='true' ;;
				p)	printf "\n" ;;
			esac
			;;

		tag_end)
			case "${tag_value}" in
				pre)	callbacks["html_pre"]='false' ;;
			esac
			;;

		tag_text)
			if ${callbacks["html_pre"]} ; then
				printf "%s" "${tag_value}"
			else
				# compress spaces/newlines/tabs/etc.
				printf "%s" "${tag_value//+([\n\r\t\v[:space:][:blank:]])/ }"
			fi
			;;

		document_start)
			callbacks["html_pre"]='false'
			;;
		document_end) ;;
	esac

	return 0
}

function handle_rss
{
	# we can't use global variables here because multiple callbacks may use
	# the same callback function - but we can use the callback associative
	# array for variable storage instead
	nameref callbacks=${1}
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				item)
					item["title"]=""
					item["link"]=""
					item["tag"]=""
					item["description"]=""
					;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_end)
			case "${tag_value}" in
				item)
					# note that each RSS item needs to be converted separately
					# from RSS to HTML to plain text to make sure that the
					# state of one RSS item doesn't affect others
					(
						printf $"<br />#### RSS item: title: %s ####" "${item["title"]}"
						printf $"<br />## author: %s" "${item["author"]}"
						printf $"<br />## link: %s" "${item["link"]}"
						printf $"<br />## date: %s" "${item["pubDate"]}"
						printf $"<br />## begin description:"
						printf $"<br />%s<br />" "${item["description"]}"
						printf $"<br />## end description<br />"
						print # extra newline to make sure the sed pipeline gets flushed
					) |
						html_entity_to_ascii |	# convert XML entities (e.g. decode RSS content to HTML code)
						xml_tok "xhtmltok_cb" |	# convert HTML to plain text
						html_entity_to_ascii	# convert HTML entities
					;;
				title)		item["title"]="${callbacks["textbuf"]}"       ; callbacks["textbuf"]="" ;;
				link)		item["link"]="${callbacks["textbuf"]}"        ; callbacks["textbuf"]="" ;;
				dc:creator | author)	item["author"]="${callbacks["textbuf"]}"  ; callbacks["textbuf"]="" ;;
				dc:date | pubDate)	item["pubDate"]="${callbacks["textbuf"]}" ; callbacks["textbuf"]="" ;;
				description)	item["description"]="${callbacks["textbuf"]}" ; callbacks["textbuf"]="" ;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_text)
			callbacks["textbuf"]+="${tag_value}"
			;;
		document_start) ;;
		document_end) ;;
	esac
	return 0
}
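
# Callback contract assumed by the handlers above and implemented by xml_tok
# below (sketch): each registered callback is invoked as
#	<handler> <name-of-callback-array> <event> [<value> [<attributes>]]
# where <event> is one of document_start, tag_begin, tag_end, tag_text,
# tag_comment or document_end.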

function xml_tok
{
	typeset buf=""
	typeset namebuf=""
	typeset attrbuf=""
	typeset c=""
	typeset isendtag	# bool: true/false
	typeset issingletag	# bool: true/false (used for tags like "<br />")
	nameref callbacks=${1}

	[[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

	while IFS='' read -r -N 1 c ; do
		isendtag=false

		if [[ "$c" == "<" ]] ; then
			# flush any text content
			if [[ "$buf" != "" ]] ; then
				[[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
				buf=""
			fi

			IFS='' read -r -N 1 c
			if [[ "$c" == "/" ]] ; then
				isendtag=true
			else
				buf="$c"
			fi
			IFS='' read -r -d '>' c
			buf+="$c"

			# handle comments
			if [[ "$buf" == ~(El)!-- ]] ; then
				# did we read the comment completely ?
				if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
					buf+=">"
					while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
						IFS='' read -r -N 1 c || break
						buf+="$c"
					done
				fi

				[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
				buf=""
				continue
			fi

			# check if the tag starts and ends at the same time (like "<br />")
			if [[ "${buf}" == ~(Er).*/ ]] ; then
				issingletag=true
				buf="${buf%*/}"
			else
				issingletag=false
			fi

			# check if the tag has attributes (e.g. space after name)
			if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
				namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
				attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
			else
				namebuf="$buf"
				attrbuf=""
			fi

			if ${isendtag} ; then
				[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
			else
				[[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

				# handle tags like <br/> (which are start- and end-tag in one piece)
				if ${issingletag} ; then
					[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
				fi
			fi
			buf=""
		else
			buf+="$c"
		fi
	done

	[[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

	print # final newline to make filters like "sed" happy
}
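
# Usage sketch for xml_tok (illustration only): tokenise an XHTML document
# with the "xhtmltok_cb" callback table defined further below, e.g.
#	xml_tok "xhtmltok_cb" <page.xhtml
# ("page.xhtml" is a placeholder file name, not part of this script).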

# return the value of LC_MESSAGES needed for subprocesses which
# want to run in a different locale/encoding
function get_lc_messages
{
	[[ "${LC_ALL}" != "" ]] && { print "${LC_ALL}" ; return 0 ; }
	[[ "${LC_MESSAGES}" != "" ]] && { print "${LC_MESSAGES}" ; return 0 ; }
	[[ "${LANG}" != "" ]] && { print "${LANG}" ; return 0 ; }
	print "C" ; return 0
}

function do_rssread
{
	# set a Unicode locale since RSS is encoded in UTF-8
	# (and make sure $LC_MESSAGES is set to the parent process's locale
	# so that all error messages use the caller's locale/encoding)
	export \
		LC_MESSAGES="${ get_lc_messages ; }" \
		LC_MONETARY="en_US.UTF-8" \
		LC_NUMERIC="en_US.UTF-8" \
		LC_COLLATE="en_US.UTF-8" \
		LC_CTYPE="en_US.UTF-8" \
		LC_TIME="en_US.UTF-8" \
		LANG="en_US.UTF-8"

	# return a non-zero exit code for this function if the RSS processing below fails
	set -o errexit

	urlconnection_t hc
	hc.user_agent="rssread/ksh93(ssl) (2010-03-27; $(uname -s -r -p))"
	hc.init_url "$1"

	# need an extra newline after cat_url to terminate the last line with $'\n'
	# to make "xml_tok" happy
	data="${ hc.cat_url ; print ; }"

	print -u2 -f "# Got %d lines of RSS data, processing...\n" "${ wc -l <<< "${data}" ; }"

	xml_tok "rsstok_cb" <<< "${data}"

	return 0
}

function usage
{
	OPTIND=0
	getopts -a "${progname}" "${rssread_usage}" OPT '-?'
	exit 2
}

# make sure we use the ksh93 builtin versions
builtin basename
builtin cat
builtin mkfifo

typeset -A rsstok_cb		# callbacks for xml_tok
rsstok_cb["tag_begin"]="handle_rss"
rsstok_cb["tag_end"]="handle_rss"
rsstok_cb["tag_text"]="handle_rss"
rsstok_cb["textbuf"]=""

typeset -A xhtmltok_cb		# callbacks for xml_tok
xhtmltok_cb["tag_begin"]="handle_html"
xhtmltok_cb["tag_end"]="handle_html"
xhtmltok_cb["tag_text"]="handle_html"
xhtmltok_cb["textbuf"]=""
xhtmltok_cb["html_pre"]='false'

typeset -A item

typeset -A bookmark_urls

# "random" urls for testing
bookmark_urls=(
	["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=(%22ksh93%22%7C%22ksh+93%22+%7C+%22korn93%22+%7C+%22korn+93%22)&ie=utf-8&num=100&output=rss"
	# some Sun staff/sites
	["blogs_sun_com"]="http://blogs.sun.com/main/feed/entries/rss"
	["bigadmin"]="http://www.sun.com/bigadmin/content/rss/motd.xml"
	["bigadmin_scripts"]="https://www.sun.com/bigadmin/content/rss/scripts.xml"
	["jmcp"]="http://www.jmcp.homeunix.com/roller/jmcp/feed/entries/rss"
	["katakai"]="http://blogs.sun.com/katakai/feed/entries/rss"
	["alanc"]="http://blogs.sun.com/alanc/feed/entries/rss"
	["planetsun"]="http://www.planetsun.org/rss20.xml"
	["planetsolaris"]="http://www.planetsolaris.org/rss20.xml"
	["planetopensolaris"]="http://planet.opensolaris.org/rss20.xml"
	["theregister_uk"]="http://www.theregister.co.uk/headlines.rss"
	["heise"]="http://www.heise.de/newsticker/heise.rdf"
	["slashdot"]="http://rss.slashdot.org/Slashdot/slashdot"
	["wikipedia_command_shells"]="http://en.wikipedia.org/w/index.php?title=Comparison_of_command_shells&feed=rss&action=history"
)

typeset progname="${ basename "${0}" ; }"

typeset -r rssread_usage=$'+
[-?\n@(#)\$Id: rssread (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b is an RSS to plain text converter
	which fetches RSS streams via HTTP and converts them from
	RSS to HTML to plain text in the current locale/encoding.]
[I:noiconv?Do not convert data from UTF-8 to current locale/encoding.]

[ url ]

[+SEE ALSO?\bksh93\b(1), \bshnote\b(1)]
'
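
# Main program flow (descriptive sketch): parse the options against the
# self-documenting getopts usage string above, optionally resolve a bookmark
# name via bookmark_urls, then run do_rssread and, unless -I was given,
# pipe its UTF-8 output through iconv into the current locale/encoding.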

typeset noiconv=false

while getopts -a "${progname}" "${rssread_usage}" OPT ; do
#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		I)	noiconv=true  ;;
		+I)	noiconv=false ;;
		*)	usage ;;
	esac
done
shift $((OPTIND-1))

typeset url="$1"

if [[ "${url}" == "" ]] ; then
	fatal_error $"No url given."
fi

if [[ "${bookmark_urls[${url}]}" != "" ]] ; then
	printmsg $"Using bookmark ${url} = ${bookmark_urls[${url}]}"
	url="${bookmark_urls[${url}]}"
fi

if ${noiconv} ; then
	do_rssread "${url}"
else
	do_rssread "${url}" | iconv -f "UTF-8" - -
fi

exit 0
#EOF.
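
# Example invocations (illustration only, assuming the script is installed
# as "rssread"; the bookmark names come from bookmark_urls above and the
# feed URLs date from 2010, so they may no longer resolve):
#	rssread heise
#	rssread -I http://www.planetsun.org/rss20.xml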