#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#

#
# rssread - a simple RSS2.0 reader with RSS to XHTML to
# plaintext conversion.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# printmsg - write all arguments (joined into one string) to stderr
function printmsg
{
	print -u2 "$*"
}

# debugmsg - debug output hook; currently a no-op because the
# "printmsg" call is commented out
function debugmsg
{
#	printmsg "$*"
	true
}

# fatal_error - write "<progname>: <message>" to stderr and terminate
# the script with exit code 1
function fatal_error
{
	print -u2 "${progname}: $*"
	exit 1
}

#
# urlconnection_t - ksh93 type which fetches the content of a
# "file://", "http://" or "https://" URL and writes it to stdout.
#
# Usage:
#	urlconnection_t hc
#	hc.init_url "http://host/path"
#	hc.cat_url
#
typeset -T urlconnection_t=(
	# public
	typeset user_agent="ksh93/urlconnection_t"

	# private variables
	typeset protocol
	typeset path1
	typeset host
	typeset path
	typeset port

	compound netfd=(
		integer in=-1  # incoming traffic
		integer out=-1 # outgoing traffic
	)

	# only used for https
	compound ssl=(
		compound fifo=(
			typeset dir=""
			typeset in=""
			typeset out=""
		)
		integer openssl_client_pid=-1
	)

	# parse HTTP return code, cookies etc.
	# Reads the status line and the headers from stdin and stores
	# "statuscode", "statusmsg" and (when present) "content_type",
	# "content_length" and "transfer_encoding" in the compound
	# variable whose name is passed as $1.
	# Returns 1 if the status line is malformed, 0 otherwise.
	function parse_http_response
	{
		nameref response="$1"
		typeset h statuscode statusmsg i

		# we use '\r' as additional IFS to filter the final '\r'
		IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
		[[ "$h" != ~(Eil)HTTP/.* ]] && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
		# "+" (not "*") - an empty status code must be rejected, too
		[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; }
		response.statuscode="$statuscode"
		response.statusmsg="$statusmsg"

		# skip remaining headers
		while IFS='' read -r i ; do
			# an empty line ends the header section (tolerate a
			# bare '\n' in addition to the regular '\r\n')
			[[ "$i" == $'\r' || "$i" == "" ]] && break

			# strip '\r' at the end
			i="${i/~(Er)$'\r'/}"

			case "$i" in
				~(Eli)Content-Type:.*)
					response.content_type="${i/~(El).*:[[:blank:]]*/}"
					;;
				~(Eli)Content-Length:[[:blank:]]*[0-9]*)
					integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
					;;
				~(Eli)Transfer-Encoding:.*)
					response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
					;;
			esac
		done

		return 0
	}

	# copy the HTTP body from stdin to stdout
	# $1 is the transfer encoding; for "chunked" the chunk size lines
	# are decoded and only the chunk payload is copied, everything
	# else is passed through unchanged.
	function cat_http_body
	{
		typeset emode="$1"
		typeset hexchunksize="0"
		integer chunksize=0

		if [[ "${emode}" == "chunked" ]] ; then
			# "-r": never interpret backslashes in the size line
			while IFS=$'\n' read -r hexchunksize ; do
				hexchunksize="${hexchunksize//$'\r'/}"
				# skip the empty line which follows each chunk
				[[ "${hexchunksize}" != "" ]] || continue
				[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break
				(( chunksize=16#${hexchunksize} ))
				# a chunk size of 0 marks the end of the body
				(( chunksize > 0 )) || break
				dd bs=1 count="${chunksize}" 2>/dev/null
			done
		else
			cat
		fi

		return 0
	}

	# split the URL given as $1 into protocol, host, port and path
	# (host/path/port are only set for http and https URLs; if the
	# URL contains no explicit port then "port" equals "host" here
	# and "cat_url" picks the default port of the protocol)
	function init_url
	{
		_.protocol="${1%://*}"
		_.path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

		if [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			_.host="${_.path1%%/*}"
			_.path="${_.path1#*/}"
			_.port="${_.host##*:}"
		fi

		return 0
	}

	# close connection
	# Closes the network file descriptors and (https only) waits for
	# the openssl child and removes the temporary fifo directory.
	# Returns 1 if the openssl child failed, 0 otherwise.
	function close_connection
	{
		integer ret=0

		# close the outgoing descriptor only if it is not shared with
		# the incoming one (plain http uses one descriptor for both
		# directions and it must not be closed twice)
		if (( _.netfd.out != -1 && _.netfd.out != _.netfd.in )) ; then
			redirect {_.netfd.out}<&-
		fi
		(( _.netfd.out=-1 ))

		if (( _.netfd.in != -1 )) ; then
			redirect {_.netfd.in}<&-
			(( _.netfd.in=-1 ))
		fi

		if [[ "${_.protocol}" == "https" ]] ; then
			wait ${_.ssl.openssl_client_pid} || ret=1
			(( _.ssl.openssl_client_pid=-1 ))

			# always remove the temporary fifo directory, even if
			# openssl failed (the path must be quoted, not wrapped
			# in literal '"' characters)
			if [[ "${_.ssl.fifo.dir}" != "" ]] ; then
				rm -r "${_.ssl.fifo.dir}"
			fi
			_.ssl.fifo.dir=""

			if (( ret != 0 )) ; then
				print -u2 -f "%s: openssl failed.\n" "$0"
				return 1
			fi
		fi

		return 0
	}

	# open the connection to ${_.host}:${_.port}
	# http:  opens /dev/tcp/<host>/<port> for reading and writing
	# https: starts an asynchronous "openssl s_client" child which is
	#        connected via two fifos in a temporary directory
	function open_connection
	{
		if [[ "${_.protocol}" == "https" ]] ; then
			_.ssl.fifo.dir="$(mktemp -d)"
			_.ssl.fifo.in="${_.ssl.fifo.dir}/in"
			_.ssl.fifo.out="${_.ssl.fifo.dir}/out"

			# Use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			set -o errexit

			mkfifo "${_.ssl.fifo.in}" "${_.ssl.fifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${_.host}:${_.port}" <"${_.ssl.fifo.in}" >>"${_.ssl.fifo.out}" &

			_.ssl.openssl_client_pid=$!
		else
			redirect {_.netfd.in}<> "/dev/tcp/${_.host}/${_.port}"
			# report the optional argument if one was given, otherwise
			# the host:port pair we tried to connect to
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${1:-${_.host}:${_.port}}" ; return 1 ; }
			(( _.netfd.out=_.netfd.in ))
		fi
		return 0
	}

	# send the request text given as $1 (followed by '\r\n') to the server
	# (for https this also opens the fifo with the decrypted server
	# response as incoming file descriptor)
	function send_request
	{
		typeset request="$1"

		set -o errexit

		if [[ "${_.protocol}" == "https" ]] ; then
			print -n -- "${request}\r\n" >> "${_.ssl.fifo.in}"

			redirect {_.netfd.in}< "${_.ssl.fifo.out}"
		else
			print -n -- "${request}\r\n" >&${_.netfd.out}
		fi
		return 0
	}

	# write the content of the URL set via "init_url" to stdout
	# Returns the exit code of "cat" for file URLs, 1 for unsupported
	# protocols, incomplete URLs or connection failures and 0 otherwise.
	function cat_url
	{
		typeset request # keep the request text local to this function

		if [[ "${_.protocol}" == "file" ]] ; then
			cat "${_.path1}"
			return $?
		elif [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			compound httpresponse # http response

			# If URL did not contain a port number in the host part then look at the
			# protocol to get the port number
			if [[ "${_.port}" == "${_.host}" ]] ; then
				case "${_.protocol}" in
					"http")  _.port=80 ;;
					"https") _.port=443 ;;
					*)       _.port="$(getent services "${_.protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
				esac
			else
				_.host="${_.host%:*}"
			fi

			printmsg "protocol=${_.protocol} port=${_.port} host=${_.host} path=${_.path}"

			# prechecks
			[[ "${_.protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
			[[ "${_.port}" != "" ]] || { print -u2 -f "%s: port not set.\n" "$0" ; return 1 ; }
			[[ "${_.host}" != "" ]] || { print -u2 -f "%s: host not set.\n" "$0" ; return 1 ; }
			[[ "${_.path}" != "" ]] || { print -u2 -f "%s: path not set.\n" "$0" ; return 1 ; }

			_.open_connection || return 1

			# send HTTP request
			request="GET /${_.path} HTTP/1.1\r\n"
			request+="Host: ${_.host}\r\n"
			request+="User-Agent: ${_.user_agent}\r\n"
			request+="Connection: close\r\n"
			_.send_request "${request}\r\n"

			# collect response and send it to stdout
			{
				_.parse_http_response httpresponse
				_.cat_http_body "${httpresponse.transfer_encoding}"
			} <&${_.netfd.in}

			_.close_connection

			return 0
		else
			return 1
		fi
		# notreached
	}
)

#
# html_entity_to_ascii - filter which copies stdin to stdout and
# replaces entities ("&name;", "&#decimal;", "&#xhex;") with the
# matching character(s). An "&" which does not start an entity is
# passed through unchanged.
#
function html_entity_to_ascii
{
	typeset entity
	typeset c
	typeset value

	# Todo: Add more HTML/MathML entities here
	# Note we use a static variable (typeset -S) here to make sure we
	# don't lose the cache data between calls
	typeset -S -A entity_cache=(
		# entity to ascii (fixme: add UTF-8 transliterations)
		["nbsp"]=' '
		["lt"]='<'
		["le"]='<='
		["gt"]='>'
		["ge"]='>='
		["amp"]='&'
		["quot"]='"'
		["apos"]="'"
	)

	while IFS='' read -r -N 1 c ; do
		if [[ "$c" != "&" ]] ; then
			print -n -r -- "${c}"
			continue
		fi

		# collect the entity name up to the terminating ';'
		entity=""
		while IFS='' read -r -N 1 c ; do
			case "$c" in
				";")
					break
					;;
				~(Eilr)[a-z0-9#])
					entity+="$c"
					continue
					;;
				*)
#					debugmsg "error &${entity}${c}#"

					# not an entity - pass the text through
					# unchanged (including the leading '&')
					print -n -r -- "&${entity}${c}"
					entity=""
					continue 2
					;;
			esac
		done

		value=""
		if [[ "${entity_cache["${entity}"]}" != "" ]] ; then
#			debugmsg "match #${entity}# = #${entity_cache["${entity}"]}#"
			value="${entity_cache["${entity}"]}"
		else
			if [[ "${entity:0:1}" == "#" ]] ; then
				if [[ "${entity:1:1}" == [xX] ]] ; then
					# hexadecimal character reference (e.g. "&#x20ac;")
					value="${ printf "\u[${entity:2:8}]" ; }"
				else
					# decimal literal
					value="${ printf "\u[${ printf "%x" "${entity:1:8}" ; }]" ; }"
				fi
			elif [[ "${entity:0:7}" == ~(Eilr)[0-9a-f]* ]] ; then
				# hexadecimal literal
				value="${ printf "\u[${entity:0:7}]" ; }"
			else
				# unknown literal - pass-through
				value="ENT=|${entity}|"
			fi

			entity_cache["${entity}"]="${value}"

#			debugmsg "lookup #${entity}# = #${entity_cache["${entity}"]}#"
		fi

		printf "%s" "${value}"
	done

	return 0
}

# dumb xhtml handler - no CSS, tables, images, iframes or nested
# structures are supported (and we assume that the input is correct
# xhtml). The code was written in a trial&&error manner and should be
# rewritten to parse xhtml correctly.
#
# handle_html - "xml_tok" callback which converts (simple) xhtml to
# plain text on stdout.
#	$1 - name of the callback array (also used as state storage)
#	$2 - event type (tag_begin, tag_end, tag_text, document_start,
#	     document_end)
#	$3 - tag name or text content
#
function handle_html
{
	# we can't use global variables here when multiple callbacks use the same
	# callback function - but we can use the callback associative array for
	# variable storage instead
	nameref callbacks=${1}
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				br)  printf "\n" ;;
				hr)  printf "\n-------------------------------------\n" ;;
				pre) callbacks["html_pre"]='true' ;;
				p)   printf "\n" ;;
			esac
			;;

		tag_end)
			case "${tag_value}" in
				pre) callbacks["html_pre"]='false' ;;
			esac
			;;

		tag_text)
			# text within <pre> is copied unchanged
			if ${callbacks["html_pre"]} ; then
				printf "%s" "${tag_value}"
			else
				# compress spaces/newlines/tabs/etc.
				printf "%s" "${tag_value//+([\n\r\t\v[:space:][:blank:]])/ }"
			fi
			;;

		document_start)
			callbacks["html_pre"]='false'
			;;
		document_end) ;;
	esac

	return 0
}

#
# handle_rss - "xml_tok" callback which collects the fields of each
# RSS item in the global associative array "item" and prints the
# item as plain text when its end tag is reached.
#	$1 - name of the callback array (also used as state storage)
#	$2 - event type (tag_begin, tag_end, tag_text, document_start,
#	     document_end)
#	$3 - tag name or text content
#
function handle_rss
{
	# we can't use global variables here when multiple callbacks use the same
	# callback function - but we can use the callback associative array for
	# variable storage instead
	nameref callbacks=${1}
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				item)
					# reset all fields which are printed at the end of
					# the item to make sure that an item without e.g.
					# an author does not inherit the value of the
					# previous item
					item["title"]=""
					item["link"]=""
					item["tag"]=""
					item["author"]=""
					item["pubDate"]=""
					item["description"]=""
					;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_end)
			case "${tag_value}" in
				item)
					# note that each RSS item needs to be converted separately from RSS to HTML to plain text
					# to make sure that the state of one RSS item doesn't affect others
					(
						printf $"<br />#### RSS item: title: %s ####" "${item["title"]}"
						printf $"<br />## author: %s" "${item["author"]}"
						printf $"<br />## link: %s" "${item["link"]}"
						printf $"<br />## date: %s" "${item["pubDate"]}"
						printf $"<br />## begin description:"
						printf $"<br />%s<br />" "${item["description"]}"
						printf $"<br />## end description<br />"
						print # extra newline to make sure the sed pipeline gets flushed
					) |
						html_entity_to_ascii |	# convert XML entities (e.g. decode RSS content to HTML code)
						xml_tok "xhtmltok_cb" |	# convert HTML to plain text
						html_entity_to_ascii	# convert HTML entities
					;;
				title)                item["title"]="${callbacks["textbuf"]}"       ; callbacks["textbuf"]="" ;;
				link)                 item["link"]="${callbacks["textbuf"]}"        ; callbacks["textbuf"]="" ;;
				dc:creator | author)  item["author"]="${callbacks["textbuf"]}"      ; callbacks["textbuf"]="" ;;
				dc:date | pubDate)    item["pubDate"]="${callbacks["textbuf"]}"     ; callbacks["textbuf"]="" ;;
				description)          item["description"]="${callbacks["textbuf"]}" ; callbacks["textbuf"]="" ;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_text)
			callbacks["textbuf"]+="${tag_value}"
			;;
		document_start) ;;
		document_end) ;;
	esac
	return 0
}

#
# xml_tok - simple XML tokenizer which reads XML from stdin and calls
# the callback functions stored in the associative array whose name
# is passed as $1. The array keys "document_start", "tag_text",
# "tag_comment", "tag_begin", "tag_end" and "document_end" hold the
# name of the function to call for that event (empty/unset entries
# are skipped). Each callback gets the array name, the event name
# and the event data (text, comment, tag name + attributes) as
# arguments.
#
function xml_tok
{
	typeset buf=""
	typeset namebuf=""
	typeset attrbuf=""
	typeset c=""
	typeset isendtag # bool: true/false
	typeset issingletag # bool: true/false (used for tags like "<br />")
	nameref callbacks=${1}

	[[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

	while IFS='' read -r -N 1 c ; do
		isendtag=false

		if [[ "$c" == "<" ]] ; then
			# flush any text content
			if [[ "$buf" != "" ]] ; then
				[[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
				buf=""
			fi

			# read the tag content up to the closing '>' (a leading
			# '/' marks an end tag and is not part of the name)
			IFS='' read -r -N 1 c
			if [[ "$c" == "/" ]] ; then
				isendtag=true
			else
				buf="$c"
			fi
			IFS='' read -r -d '>' c
			buf+="$c"

			# handle comments
			if [[ "$buf" == ~(El)!-- ]] ; then
				# did we read the comment completely ?
				if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
					# the '>' we stopped at was part of the comment
					# text - continue until the buffer ends with "--"
					buf+=">"
					while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
						IFS='' read -r -N 1 c || break
						buf+="$c"
					done
				fi

				# pass the comment text without the leading "!--" and the trailing "--"
				[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
				buf=""
				continue
			fi

			# check if the tag starts and ends at the same time (like "<br />")
			if [[ "${buf}" == ~(Er).*/ ]] ; then
				issingletag=true
				buf="${buf%*/}"
			else
				issingletag=false
			fi

			# check if the tag has attributes (e.g. space after name)
			if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
				namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
				attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
			else
				namebuf="$buf"
				attrbuf=""
			fi

			if ${isendtag} ; then
				[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
			else
				[[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

				# handle tags like <br/> (which are start- and end-tag in one piece)
				if ${issingletag} ; then
					[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
				fi
			fi
			buf=""
		else
			buf+="$c"
		fi
	done

	[[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

	print # final newline to make filters like "sed" happy
}

# return the value of LC_MESSAGES needed for subprocesses which
# want to run in a different locale/encoding
# (first non-empty value of LC_ALL, LC_MESSAGES and LANG or "C" if
# none of them is set)
function get_lc_messages
{
	[[ "${LC_ALL}" != "" ]] && { print "${LC_ALL}" ; return 0 ; }
	[[ "${LC_MESSAGES}" != "" ]] && { print "${LC_MESSAGES}" ; return 0 ; }
	[[ "${LANG}" != "" ]] && { print "${LANG}" ; return 0 ; }
	print "C" ; return 0
}

#
# do_rssread - fetch the RSS feed from the URL given as $1 and write
# it as plain text (encoded in UTF-8) to stdout
#
function do_rssread
{
	typeset data # RSS data fetched from the URL (local to this function)

	# set unicode locale since RSS is encoded in UTF-8
	# (and make sure $LC_MESSAGES is set to the parent
	# process's locale that all error messages are using
	# the callers locale/encoding)
	export \
		LC_MESSAGES="${ get_lc_messages ; }" \
		LC_MONETARY="en_US.UTF-8" \
		LC_NUMERIC="en_US.UTF-8" \
		LC_COLLATE="en_US.UTF-8" \
		LC_CTYPE="en_US.UTF-8" \
		LC_TIME="en_US.UTF-8" \
		LANG="en_US.UTF-8"

	# return non-zero exit code for this function if the rss processing below fails
	set -o errexit

	urlconnection_t hc
	hc.user_agent="rssread/ksh93(ssl) (2009-08-14; $(uname -s -r -p))"
	hc.init_url "$1"

	# need extra newline after cat_url to terminate line with $'\n'
	# to make "xml_tok" happy
	data="${ hc.cat_url ; print ; }"

	print -u2 -f "# Got %d lines of RSS data, processing...\n" "${ wc -l <<< "${data}" ; }"

	xml_tok "rsstok_cb" <<< "${data}"

	return 0
}

# usage - print the usage message generated from "rssread_usage"
# and terminate the script with exit code 2
function usage
{
	OPTIND=0
	getopts -a "${progname}" "${rssread_usage}" OPT '-?'
	exit 2
}

# make sure we use the ksh93 builtin versions
builtin basename
builtin cat
builtin mkfifo

typeset -A rsstok_cb # callbacks for xml_tok
rsstok_cb["tag_begin"]="handle_rss"
rsstok_cb["tag_end"]="handle_rss"
rsstok_cb["tag_text"]="handle_rss"
rsstok_cb["textbuf"]=""

typeset -A xhtmltok_cb # callbacks for xml_tok
xhtmltok_cb["tag_begin"]="handle_html"
xhtmltok_cb["tag_end"]="handle_html"
xhtmltok_cb["tag_text"]="handle_html"
xhtmltok_cb["textbuf"]=""
xhtmltok_cb["html_pre"]='false'

# fields of the RSS item which is currently processed by "handle_rss"
typeset -A item

# short names which can be used on the command line instead of a URL
typeset -A bookmark_urls

# "random" urls for testing
bookmark_urls=(
	["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=(%22ksh93%22%7C%22ksh+93%22+%7C+%22korn93%22+%7C+%22korn+93%22)&ie=utf-8&num=100&output=rss"
	# OpenSolaris.org sites
	["ksh93_integration"]="http://www.opensolaris.org/rss/os/project/ksh93-integration/announcements/rss2.xml"
	["ksh93_integration_ssl"]="https://www.opensolaris.org/rss/os/project/ksh93-integration/announcements/rss2.xml"
	["shell"]="http://www.opensolaris.org/rss/os/project/shell/announcements/rss2.xml"
	["systemz"]="http://www.opensolaris.org/rss/os/project/systemz/announcements/rss2.xml"
	["systemz_ssl"]="https://www.opensolaris.org/rss/os/project/systemz/announcements/rss2.xml"
	# some Sun staff/sites
	["blogs_sun_com"]="http://blogs.sun.com/main/feed/entries/rss"
	["bigadmin"]="http://www.sun.com/bigadmin/content/rss/motd.xml"
	["bigadmin_scripts"]="https://www.sun.com/bigadmin/content/rss/scripts.xml"
	["jmcp"]="http://www.jmcp.homeunix.com/roller/jmcp/feed/entries/rss"
	["katakai"]="http://blogs.sun.com/katakai/feed/entries/rss"
	["alanc"]="http://blogs.sun.com/alanc/feed/entries/rss"
	["planetsun"]="http://www.planetsun.org/rss20.xml"
	["planetsolaris"]="http://www.planetsolaris.org/rss20.xml"
	["planetopensolaris"]="http://planet.opensolaris.org/rss20.xml"
	["theregister_uk"]="http://www.theregister.co.uk/headlines.rss"
	["heise"]="http://www.heise.de/newsticker/heise.rdf"
	["slashdot"]="http://rss.slashdot.org/Slashdot/slashdot"
	["wikipedia_command_shells"]="http://en.wikipedia.org/w/index.php?title=Comparison_of_command_shells&feed=rss&action=history"
)

typeset progname="${ basename "${0}" ; }"

typeset -r rssread_usage=$'+
[-?\n@(#)\$Id: rssread (Roland Mainz) 2009-08-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b RSS to plain text converter
	which fetches RSS streams via HTTP and converts them from
	RSS to HTML to plain text in the current locale/encoding.]
[I:noiconv?Do not convert data from UTF-8 to current locale/encoding.]

[ url ]

[+SEE ALSO?\bksh93\b(1), \bshnote\b(1)]
'

typeset noiconv=false

while getopts -a "${progname}" "${rssread_usage}" OPT ; do
#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		I)  noiconv=true  ;;
		+I) noiconv=false ;;
		*)  usage ;;
	esac
done
shift $((OPTIND-1))

typeset url="$1"

if [[ "${url}" == "" ]] ; then
	fatal_error $"No url given."
fi

# replace a bookmark name with the matching URL
if [[ "${bookmark_urls[${url}]}" != "" ]] ; then
	printmsg $"Using bookmark ${url} = ${bookmark_urls[${url}]}"
	url="${bookmark_urls[${url}]}"
fi

if ${noiconv} ; then
	do_rssread "${url}"
else
	# convert the UTF-8 output to the encoding of the current locale
	do_rssread "${url}" | iconv -f "UTF-8" - -
fi

exit 0
#EOF.