#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
#

#
# rssread - a simple RSS2.0 reader with RSS to XHTML to
# plaintext conversion.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# printmsg - write all arguments as one line to stderr
function printmsg
{
	print -u2 "$*"
}

# debugmsg - debug output hook; disabled by default (uncomment the
# "printmsg" line to enable it)
function debugmsg
{
#	printmsg "$*"
	true
}

# fatal_error - print "<progname>: <message>" to stderr and exit with code 1
function fatal_error
{
	print -u2 "${progname}: $*"
	exit 1
}

#
# urlconnection_t - ksh93 type which fetches the content of a
# "file://", "http://" or "https://" URL and writes it to stdout.
# Usage: "init_url <url>" followed by "cat_url".
#
typeset -T urlconnection_t=(
	# public
	typeset user_agent="ksh93/urlconnection_t"

	# private variables
	typeset protocol
	typeset path1
	typeset host
	typeset path
	typeset port

	compound netfd=(
		integer in=-1	# incoming traffic
		integer out=-1	# outgoing traffic
	)

	# only used for https
	compound ssl=(
		compound fifo=(
			typeset dir=""
			typeset in=""
			typeset out=""
		)
		integer openssl_client_pid=-1
	)

	# parse HTTP return code, cookies etc.
	# Reads the status line and the headers from stdin and stores
	# "statuscode", "statusmsg" and (if present) "content_type",
	# "content_length" and "transfer_encoding" in the compound
	# variable whose name is passed as $1.
	# Returns 1 if the status line is malformed, otherwise 0.
	function parse_http_response
	{
		nameref response="$1"
		typeset h statuscode statusmsg i

		# we use '\r' as additional IFS to filter the final '\r'
		IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
		[[ "$h" != ~(Eil)HTTP/.* ]] && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
		# require at least one digit (an empty status code is invalid, too)
		[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; }
		response.statuscode="$statuscode"
		response.statusmsg="$statusmsg"

		# skip remaining headers
		while IFS='' read -r i ; do
			# a line consisting only of '\r' terminates the header section
			[[ "$i" == $'\r' ]] && break

			# strip '\r' at the end
			i="${i/~(Er)$'\r'/}"

			case "$i" in
				~(Eli)Content-Type:.*)
					response.content_type="${i/~(El).*:[[:blank:]]*/}"
					;;
				~(Eli)Content-Length:[[:blank:]]*[0-9]*)
					integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
					;;
				~(Eli)Transfer-Encoding:.*)
					response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
					;;
			esac
		done

		return 0
	}

	# Copy the HTTP body from stdin to stdout.
	# $1 is the transfer encoding; for "chunked" the chunk size lines
	# are decoded and only the chunk payload is copied, any other value
	# copies stdin unmodified.
	function cat_http_body
	{
		typeset emode="$1"
		typeset hexchunksize="0"
		integer chunksize=0

		if [[ "${emode}" == "chunked" ]] ; then
			while IFS=$'\n' read -r hexchunksize ; do
				hexchunksize="${hexchunksize//$'\r'/}"
				# skip the empty line which follows each chunk
				[[ "${hexchunksize}" != "" ]] || continue
				[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break
				(( chunksize=$( printf "16#%s\n" "${hexchunksize}" ) ))
				# a chunk size of zero marks the end of the body
				(( chunksize > 0 )) || break
				dd bs=1 count="${chunksize}" 2>/dev/null
			done
		else
			cat
		fi

		return 0
	}

	# Split the URL given as $1 into protocol, host, port and path.
	# If the URL contains no port then "port" is set to the same value
	# as "host"; "cat_url" detects this and picks the protocol default.
	function init_url
	{
		_.protocol="${1%://*}"
		_.path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

		if [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			_.host="${_.path1%%/*}"
			# a URL without any '/' after the host has no path
			# (otherwise the host name would be used as path)
			if [[ "${_.path1}" == */* ]] ; then
				_.path="${_.path1#*/}"
			else
				_.path=""
			fi
			_.port="${_.host##*:}"
		fi

		return 0
	}

	# close connection
	# Closes the network file descriptors and, for https, waits for
	# the openssl child and removes the temporary FIFO directory.
	# Returns 1 if the openssl child failed, otherwise 0.
	function close_connection
	{
		# remember both descriptors first - "in" and "out" may be the
		# same descriptor which must be closed only once
		integer fd_in=${_.netfd.in}
		integer fd_out=${_.netfd.out}
		integer wait_status=0

		if (( fd_in != -1 )) ; then
			redirect {_.netfd.in}<&-
		fi

		if (( fd_out != -1 && fd_out != fd_in )) ; then
			redirect {_.netfd.out}<&-
		fi

		(( _.netfd.in=-1 ))
		(( _.netfd.out=-1 ))

		if [[ "${_.protocol}" == "https" ]] ; then
			wait "${_.ssl.openssl_client_pid}" || wait_status=$?
			(( _.ssl.openssl_client_pid=-1 ))

			# always remove the FIFO directory, even if openssl failed
			if [[ "${_.ssl.fifo.dir}" != "" ]] ; then
				rm -r -- "${_.ssl.fifo.dir}"
				_.ssl.fifo.dir=""
			fi

			if (( wait_status != 0 )) ; then
				print -u2 -f "%s: openssl failed.\n" "$0"
				return 1
			fi
		fi

		return 0
	}

	# Open the connection to ${_.host}:${_.port}.
	# http:  opens /dev/tcp/<host>/<port> for reading and writing
	# https: starts an asynchronous "openssl s_client" child which is
	#        connected via two FIFOs in a temporary directory
	# Returns 1 on failure, otherwise 0.
	function open_connection
	{
		if [[ "${_.protocol}" == "https" ]] ; then
			_.ssl.fifo.dir="$(mktemp -t -d)"
			if (( $? != 0 )) || [[ "${_.ssl.fifo.dir}" == "" ]] ; then
				print -u2 -f "%s: Could not create temporary directory.\n" "$0"
				_.ssl.fifo.dir=""
				return 1
			fi
			_.ssl.fifo.in="${_.ssl.fifo.dir}/in"
			_.ssl.fifo.out="${_.ssl.fifo.dir}/out"

			# Use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			set -o errexit

			mkfifo "${_.ssl.fifo.in}" "${_.ssl.fifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${_.host}:${_.port}" <"${_.ssl.fifo.in}" >>"${_.ssl.fifo.out}" &

			_.ssl.openssl_client_pid=$!
		else
			redirect {_.netfd.in}<> "/dev/tcp/${_.host}/${_.port}"
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${_.host}:${_.port}" ; return 1 ; }
			(( _.netfd.out=_.netfd.in ))
		fi
		return 0
	}

	# Send the request text $1 (followed by "\r\n") over the open
	# connection. For https this also opens the read side of the
	# connection (the FIFO written by the openssl child).
	function send_request
	{
		typeset request="$1"

		set -o errexit

		if [[ "${_.protocol}" == "https" ]] ; then
			print -n -- "${request}\r\n" >> "${_.ssl.fifo.in}"

			redirect {_.netfd.in}< "${_.ssl.fifo.out}"
		else
			print -n -- "${request}\r\n" >&${_.netfd.out}
		fi
		return 0
	}

	# Write the content of the URL set via "init_url" to stdout.
	# Returns 0 on success, the exit code of "cat" for file:// URLs
	# and 1 for unsupported protocols or failed prechecks.
	function cat_url
	{
		if [[ "${_.protocol}" == "file" ]] ; then
			cat "${_.path1}"
			return $?
		elif [[ "${_.protocol}" == ~(Elr)http(|s) ]] ; then
			compound httpresponse # http response
			typeset request       # keep the request text local to this method

			# If URL did not contain a port number in the host part then look at the
			# protocol to get the port number
			if [[ "${_.port}" == "${_.host}" ]] ; then
				case "${_.protocol}" in
					"http")  _.port=80 ;;
					"https") _.port=443 ;;
					*)       _.port="$(getent services "${_.protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
				esac
			else
				_.host="${_.host%:*}"
			fi

			printmsg "protocol=${_.protocol} port=${_.port} host=${_.host} path=${_.path}"

			# prechecks
			[[ "${_.protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
			[[ "${_.port}"     != "" ]] || { print -u2 -f "%s: port not set.\n" "$0" ; return 1 ; }
			[[ "${_.host}"     != "" ]] || { print -u2 -f "%s: host not set.\n" "$0" ; return 1 ; }
			[[ "${_.path}"     != "" ]] || { print -u2 -f "%s: path not set.\n" "$0" ; return 1 ; }

			_.open_connection || return 1

			# send HTTP request
			request="GET /${_.path} HTTP/1.1\r\n"
			request+="Host: ${_.host}\r\n"
			request+="User-Agent: ${_.user_agent}\r\n"
			request+="Connection: close\r\n"
			_.send_request "${request}\r\n"

			# collect response and send it to stdout
			{
				_.parse_http_response httpresponse
				_.cat_http_body "${httpresponse.transfer_encoding}"
			} <&${_.netfd.in}

			_.close_connection

			return 0
		else
			return 1
		fi
		# notreached
	}
)

#
# html_entity_to_ascii - filter which copies stdin to stdout and
# replaces entities ("&name;", "&#<decimal>;", "&#x<hex>;") with the
# matching character. Unknown entities are written as "ENT=|name|".
#
function html_entity_to_ascii
{
	typeset entity
	typeset c
	typeset value

	# Todo: Add more HTML/MathML entities here
	# Note we use a static variable (typeset -S) here to make sure we
	# don't lose the cache data between calls
	typeset -S -A entity_cache=(
		# entity to ascii (fixme: add UTF-8 transliterations)
		["nbsp"]=' '
		["lt"]='<'
		["le"]='<='
		["gt"]='>'
		["ge"]='>='
		["amp"]='&'
		["quot"]='"'
		["apos"]="'"
	)

	while IFS='' read -r -N 1 c ; do
		if [[ "$c" != "&" ]] ; then
			print -n -r -- "${c}"
			continue
		fi

		# collect the entity name up to the terminating ';'
		entity=""
		while IFS='' read -r -N 1 c ; do
			case "$c" in
				";")
					break
					;;
				~(Eilr)[a-z0-9#])
					entity+="$c"
					continue
					;;
				*)
#					debugmsg "error &${entity}${c}#"

					# not an entity - pass the text through unmodified,
					# including the '&' which was already consumed
					print -n -r -- "&${entity}${c}"
					entity=""
					continue 2
					;;
			esac
		done

		value=""
		if [[ "${entity_cache["${entity}"]}" != "" ]] ; then
#			debugmsg "match #${entity}# = #${entity_cache["${entity}"]}#"
			value="${entity_cache["${entity}"]}"
		else
			if [[ "${entity}" == '#'[xX]+([[:xdigit:]]) ]] ; then
				# hexadecimal character reference (e.g. "&#x41;")
				value="${ printf "\u[${entity:2:6}]" ; }"
			elif [[ "${entity:0:1}" == "#" ]] ; then
				# decimal literal
				value="${ printf "\u[${ printf "%x" "${entity:1:8}" ; }]" ; }"
			elif [[ "${entity:0:7}" == ~(Eilr)[0-9a-f]* ]] ; then
				# hexadecimal literal
				value="${ printf "\u[${entity:0:7}]" ; }"
			else
				# unknown literal - pass-through
				value="ENT=|${entity}|"
			fi

			entity_cache["${entity}"]="${value}"

#			debugmsg "lookup #${entity}# = #${entity_cache["${entity}"]}#"
		fi

		printf "%s" "${value}"
	done

	return 0
}

# dumb xhtml handler - no CSS, tables, images, iframes or nested
# structures are supported (and we assume that the input is correct
# xhtml). The code was written in a trial&&error manner and should be
# rewritten to parse xhtml correctly.
# Arguments (as passed by xml_tok):
#   $1 - name of the callback array, $2 - event type, $3 - tag name/text
function handle_html
{
	# we can't use global variables here when multiple callbacks use the same
	# callback function - but we can use the callback associative array for
	# variable storage instead
	nameref callbacks=${1}
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				br)  printf "\n" ;;
				hr)  printf "\n-------------------------------------\n" ;;
				pre) callbacks["html_pre"]='true' ;;
				p)   printf "\n" ;;
			esac
			;;

		tag_end)
			case "${tag_value}" in
				pre) callbacks["html_pre"]='false' ;;
			esac
			;;

		tag_text)
			if ${callbacks["html_pre"]} ; then
				printf "%s" "${tag_value}"
			else
				# compress spaces/newlines/tabs/etc.
				# ([:space:] already covers newline, carriage return, tab,
				# vertical tab and blanks - listing "\n\r\t\v" inside the
				# bracket expression risks matching the plain letters)
				printf "%s" "${tag_value//+([[:space:]])/ }"
			fi
			;;

		document_start)
			callbacks["html_pre"]='false'
			;;
		document_end) ;;
	esac

	return 0
}

# RSS handler - collects the fields of each <item> in the global "item"
# array and prints the item as plain text when its end tag is seen.
# Arguments (as passed by xml_tok):
#   $1 - name of the callback array, $2 - event type, $3 - tag name/text
function handle_rss
{
	# we can't use global variables here when multiple callbacks use the same
	# callback function - but we can use the callback associative array for
	# variable storage instead
	nameref callbacks=${1}
	typeset tag_type="$2"
	typeset tag_value="$3"

	case "${tag_type}" in
		tag_begin)
			case "${tag_value}" in
				item)
					# reset all fields printed below so that values of
					# the previous item don't leak into this one
					item["title"]=""
					item["link"]=""
					item["tag"]=""
					item["description"]=""
					item["author"]=""
					item["pubDate"]=""
					;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_end)
			case "${tag_value}" in
				item)
					# note that each RSS item needs to be converted separately from RSS to HTML to plain text
					# to make sure that the state of one RSS item doesn't affect others
					(
						printf $"<br />#### RSS item: title: %s ####" "${item["title"]}"
						printf $"<br />## author: %s" "${item["author"]}"
						printf $"<br />## link:   %s" "${item["link"]}"
						printf $"<br />## date:   %s" "${item["pubDate"]}"
						printf $"<br />## begin description:"
						printf $"<br />%s<br />" "${item["description"]}"
						printf $"<br />## end description<br />"
						print # extra newline to make sure the sed pipeline gets flushed
					) |
						html_entity_to_ascii |	# convert XML entities (e.g. decode RSS content to HTML code)
						xml_tok "xhtmltok_cb" |	# convert HTML to plain text
						html_entity_to_ascii	# convert HTML entities
					;;
				title)                item["title"]="${callbacks["textbuf"]}"       ; callbacks["textbuf"]="" ;;
				link)                 item["link"]="${callbacks["textbuf"]}"        ; callbacks["textbuf"]="" ;;
				dc:creator | author)  item["author"]="${callbacks["textbuf"]}"      ; callbacks["textbuf"]="" ;;
				dc:date | pubDate)    item["pubDate"]="${callbacks["textbuf"]}"     ; callbacks["textbuf"]="" ;;
				description)          item["description"]="${callbacks["textbuf"]}" ; callbacks["textbuf"]="" ;;
			esac
			callbacks["textbuf"]=""
			;;
		tag_text)
			callbacks["textbuf"]+="${tag_value}"
			;;
		document_start) ;;
		document_end) ;;
	esac
	return 0
}

#
# xml_tok - simple XML tokenizer. Reads XML from stdin and calls the
# handlers stored in the associative array whose name is passed as $1.
# Each handler is called as: <handler> <arrayname> <event> [<value>...]
# with the events "document_start", "tag_begin" (name, attributes),
# "tag_end" (name), "tag_text" (text), "tag_comment" (comment text)
# and "document_end". Events without a handler entry are ignored.
#
function xml_tok
{
	typeset buf=""
	typeset namebuf=""
	typeset attrbuf=""
	typeset c=""
	typeset isendtag    # bool: true/false
	typeset issingletag # bool: true/false (used for tags like "<br />")
	nameref callbacks=${1}

	[[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

	while IFS='' read -r -N 1 c ; do
		isendtag=false

		if [[ "$c" == "<" ]] ; then
			# flush any text content
			if [[ "$buf" != "" ]] ; then
				[[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
				buf=""
			fi

			IFS='' read -r -N 1 c
			if [[ "$c" == "/" ]] ; then
				isendtag=true
			else
				buf="$c"
			fi
			IFS='' read -r -d '>' c
			buf+="$c"

			# handle comments
			if [[ "$buf" == ~(El)!-- ]] ; then
				# did we read the comment completely ?
				if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
					# the '>' we stopped at was part of the comment text;
					# continue reading until the real terminator "-->" was
					# consumed (otherwise the final '>' would be reported
					# as text later)
					buf+=">"
					while [[ "$buf" != '!--'*'-->' ]] ; do
						IFS='' read -r -N 1 c || break
						buf+="$c"
					done
					# strip the '>' of the terminator again
					buf="${buf%>}"
				fi

				[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
				buf=""
				continue
			fi

			# check if the tag starts and ends at the same time (like "<br />")
			if [[ "${buf}" == ~(Er).*/ ]] ; then
				issingletag=true
				buf="${buf%*/}"
			else
				issingletag=false
			fi

			# check if the tag has attributes (e.g. space after name)
			if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
				namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
				attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
			else
				namebuf="$buf"
				attrbuf=""
			fi

			if ${isendtag} ; then
				[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
			else
				[[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

				# handle tags like <br/> (which are start- and end-tag in one piece)
				if ${issingletag} ; then
					[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
				fi
			fi
			buf=""
		else
			buf+="$c"
		fi
	done

	[[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

	print # final newline to make filters like "sed" happy
}

# return the value of LC_MESSAGES needed for subprocesses which
# want to run in a different locale/encoding
function get_lc_messages
{
	[[ "${LC_ALL}"       != "" ]] && { print "${LC_ALL}"      ; return 0 ; }
	[[ "${LC_MESSAGES}"  != "" ]] && { print "${LC_MESSAGES}" ; return 0 ; }
	[[ "${LANG}"         != "" ]] && { print "${LANG}"        ; return 0 ; }
	print "C" ; return 0
}

# do_rssread - fetch the RSS feed at URL $1 and write it as plain
# text (UTF-8) to stdout
function do_rssread
{
	typeset data # RSS data fetched from the URL (local to this function)

	# set unicode locale since RSS is encoded in UTF-8
	# (and make sure $LC_MESSAGES is set to the parent
	# process's locale that all error messages are using
	# the callers locale/encoding)
	export \
		LC_MESSAGES="${ get_lc_messages ; }" \
		LC_MONETARY="en_US.UTF-8" \
		LC_NUMERIC="en_US.UTF-8" \
		LC_COLLATE="en_US.UTF-8" \
		LC_CTYPE="en_US.UTF-8" \
		LC_TIME="en_US.UTF-8" \
		LANG="en_US.UTF-8"

	# return non-zero exit code for this function if the rss processing below fails
	set -o errexit

	urlconnection_t hc
	hc.user_agent="rssread/ksh93(ssl) (2010-03-27; $(uname -s -r -p))"
	hc.init_url "$1"

	# need extra newline after cat_url to terminate line with $'\n'
	# to make "xml_tok" happy
	data="${ hc.cat_url ; print ; }"

	print -u2 -f "# Got %d lines of RSS data, processing...\n" "${ wc -l <<< "${data}" ; }"

	xml_tok "rsstok_cb" <<< "${data}"

	return 0
}

# usage - print the usage message generated by getopts and exit with code 2
function usage
{
	OPTIND=0
	getopts -a "${progname}" "${rssread_usage}" OPT '-?'
	exit 2
}

# make sure we use the ksh93 builtin versions
builtin basename
builtin cat
builtin mkfifo

typeset -A rsstok_cb # callbacks for xml_tok
rsstok_cb["tag_begin"]="handle_rss"
rsstok_cb["tag_end"]="handle_rss"
rsstok_cb["tag_text"]="handle_rss"
rsstok_cb["textbuf"]=""

typeset -A xhtmltok_cb # callbacks for xml_tok
xhtmltok_cb["tag_begin"]="handle_html"
xhtmltok_cb["tag_end"]="handle_html"
xhtmltok_cb["tag_text"]="handle_html"
xhtmltok_cb["textbuf"]=""
xhtmltok_cb["html_pre"]='false'

typeset -A item

typeset -A bookmark_urls

# "random" urls for testing
bookmark_urls=(
	["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=(%22ksh93%22%7C%22ksh+93%22+%7C+%22korn93%22+%7C+%22korn+93%22)&ie=utf-8&num=100&output=rss"
	# some Sun staff/sites
	["blogs_sun_com"]="http://blogs.sun.com/main/feed/entries/rss"
	["bigadmin"]="http://www.sun.com/bigadmin/content/rss/motd.xml"
	["bigadmin_scripts"]="https://www.sun.com/bigadmin/content/rss/scripts.xml"
	["jmcp"]="http://www.jmcp.homeunix.com/roller/jmcp/feed/entries/rss"
	["katakai"]="http://blogs.sun.com/katakai/feed/entries/rss"
	["alanc"]="http://blogs.sun.com/alanc/feed/entries/rss"
	["planetsun"]="http://www.planetsun.org/rss20.xml"
	["planetsolaris"]="http://www.planetsolaris.org/rss20.xml"
	["planetopensolaris"]="http://planet.opensolaris.org/rss20.xml"
	["theregister_uk"]="http://www.theregister.co.uk/headlines.rss"
	["heise"]="http://www.heise.de/newsticker/heise.rdf"
	["slashdot"]="http://rss.slashdot.org/Slashdot/slashdot"
	["wikipedia_command_shells"]="http://en.wikipedia.org/w/index.php?title=Comparison_of_command_shells&feed=rss&action=history"
)

typeset progname="${ basename "${0}" ; }"

typeset -r rssread_usage=$'+
[-?\n@(#)\$Id: rssread (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b RSS to plain text converter
	which fetches RSS streams via HTTP and converts them from
	RSS to HTML to plain text in the current locale/encoding.]
[I:noiconv?Do not convert data from UTF-8 to current locale/encoding.]

[ url ]

[+SEE ALSO?\bksh93\b(1), \bshnote\b(1)]
'

typeset noiconv=false

while getopts -a "${progname}" "${rssread_usage}" OPT ; do
#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		I)  noiconv=true  ;;
		+I) noiconv=false ;;
		*)  usage ;;
	esac
done
shift $((OPTIND-1))

typeset url="$1"

if [[ "${url}" == "" ]] ; then
	fatal_error $"No url given."
fi

# the argument may be the name of a bookmark instead of a URL
# (the subscript is quoted since URLs contain characters like '[', ']' or '(')
if [[ "${bookmark_urls["${url}"]}" != "" ]] ; then
	printmsg $"Using bookmark ${url} = ${bookmark_urls["${url}"]}"
	url="${bookmark_urls["${url}"]}"
fi

if ${noiconv} ; then
	do_rssread "${url}"
else
	do_rssread "${url}" | iconv -f "UTF-8" - -
fi

exit 0
#EOF.