#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

#
# fatal_error - print an error message on stderr and terminate the script
#
# Arguments: all arguments are joined into the message text
# Globals:   progname (read) - used as message prefix
# Returns:   never; exits with status 1
#
function fatal_error
{
    print -u 2 "${progname}: $*"
    exit 1
}

#
# attrstrtoattrarray - split a tag attribute string into an array
#
# Arguments:
#   $1 - attribute string, e.g.: foo=bar baz="a b" huz='x' flag
#   $2 - name of the array variable to fill (accessed via nameref)
#
# Each array element is a compound variable with a "name" member and,
# when the attribute has a "=", a "value" member (surrounding quotes
# removed).
# Calls fatal_error after more than 1000 entries; this also stops the
# loop when the remaining input cannot be matched as an attribute.
#
function attrstrtoattrarray
{
#set -o xtrace
    typeset s="$1"
    nameref aa=$2 # attribute array
    integer aa_count=0
    typeset nextattr
    integer currattrlen=0
    typeset tagstr
    typeset tagval

    while (( ${#s} > 0 )) ; do
        # skip whitespaces
        # (currattrlen still holds the length of the attribute handled
        # in the previous iteration, so the scan starts right after it
        # and the assignment below drops attribute + whitespace at once)
        while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
            (( currattrlen++ ))
        done
        s="${s:currattrlen:${#s}}"

        # anything left ?
        (( ${#s} == 0 )) && break

        # Pattern tests:
        #x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
        #x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
        #x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\']*\'}"
        #x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
        # All pattern combined via eregex (w|x|y|z):
        #x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\')}"
        #
        # The single-quoted alternative must stop at the next single
        # quote ([^\']); excluding double quotes there would let the
        # longest match run across several single-quoted attributes.
        nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\'|[[:alnum:]_-:]*)}"
        currattrlen=$(( ${#s} - ${#nextattr}))

        # add entry
        tagstr="${s:0:currattrlen}"
        if [[ "${tagstr}" == *=* ]] ; then
            # normal case: attribute with value

            tagval="${tagstr#*=}"

            # strip quotes ('' or "")
            if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
                tagval="${tagval:1:${#tagval}-2}"
            fi

            aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
        else
            # special case for HTML where you have something like <foo baz>
            aa[${aa_count}]=( name="${tagstr}" )
        fi
        (( aa_count++ ))
        (( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
    done
}


#
# handle_document - xml_tok callback which builds the document tree
#
# Arguments:
#   $1 - name of the callback array; its "arg_tree" entry names the
#        compound variable which receives the tree
#   $2 - event type: tag_begin, tag_end, tag_text, tag_comment,
#        document_start or document_end
#   $3 - tag name (tag_begin/tag_end) or text (tag_text/tag_comment)
#   $4 - raw attribute string (tag_begin only)
#
# Globals:
#   stack (read/written) - stack.items[stack.pos] holds the variable
#        path of the node list currently being filled; the paths are
#        written relative to the local nameref "doc"
#
function handle_document
{
#set -o xtrace
    nameref callbacks=${1}
    typeset tag_type="${2}"
    typeset tag_value="${3}"
    typeset tag_attributes="${4}"
    # "doc" is not used by name below, but the paths stored in
    # stack.items start with "doc." and are resolved through it
    nameref doc=${callbacks["arg_tree"]}
    nameref nodepath="${stack.items[stack.pos]}"
    # "<path>.nodes" + "num" names the counter next to the node list
    nameref nodesnum="${stack.items[stack.pos]}num"

    case "${tag_type}" in
        tag_begin)
            nodepath[${nodesnum}]+=(
                typeset tagtype="element"
                typeset tagname="${tag_value}"
                typeset -A tagattributes=( )
                typeset -A nodes=( )
                integer nodesnum=0
            )

            # fill attributes
            if [[ "${tag_attributes}" != "" ]] ; then
                attrstrtoattrarray "${tag_attributes}" "nodepath[${nodesnum}].tagattributes"
            fi

            # descend: children of this element are added to its own
            # "nodes" list until the matching tag_end pops the stack
            (( stack.pos++ ))
            stack.items[stack.pos]="${stack.items[stack.pos-1]}[${nodesnum}].nodes"
            (( nodesnum++ ))
            ;;
        tag_end)
            (( stack.pos-- ))
            ;;
        tag_text)
            nodepath[${nodesnum}]+=(
                typeset tagtype="text"
                typeset tagvalue="${tag_value}"
            )
            (( nodesnum++ ))
            ;;
        tag_comment)
            nodepath[${nodesnum}]+=(
                typeset tagtype="comment"
                typeset tagvalue="${tag_value}"
            )
            (( nodesnum++ ))
            ;;
        document_start)
            ;;
        document_end)
            ;;
    esac

#   print "xmltok: '${tag_type}' = '${tag_value}'"
}

#
# xml_tok - minimal XML/HTML tokenizer, reads the document from stdin
#
# Arguments:
#   $1 - name of an associative array of callbacks; the entries
#        "document_start", "document_end", "tag_begin", "tag_end",
#        "tag_text" and "tag_comment" are looked up and empty entries
#        are skipped
#
# Each callback is invoked as
#   <callback> <$1> <event> [<name or text> [<attribute string>]]
# ("document_end" receives "exit_success" as third argument).
# Outputs: one empty line on stdout after the document was processed.
#
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    [[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    while IFS='' read -r -N 1 c ; do
        isendtag=false

        if [[ "$c" == "<" ]] ; then
            # flush any text content
            if [[ "$buf" != "" ]] ; then
                [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
                buf=""
            fi

            IFS='' read -r -N 1 c
            if [[ "$c" == "/" ]] ; then
                isendtag=true
            else
                buf="$c"
            fi
            # read the remainder of the tag (the '>' itself is consumed)
            IFS='' read -r -d '>' c
            buf+="$c"

            # handle comments
            if [[ "$buf" == ~(El)!-- ]] ; then
                # did we read the comment completely ?
                if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
                    # no, the '>' which ended the read above was part
                    # of the comment text; put it back and read on
                    # until the real terminator "-->" was seen, so that
                    # the closing '>' does not stay in the input and
                    # show up as text later
                    buf+=">"
                    while [[ "$buf" != '!--'*'-->' ]] ; do
                        IFS='' read -r -N 1 c || break
                        buf+="$c"
                    done
                    # drop the closing '>' again (buf then ends with
                    # "--" like in the completely-read case)
                    [[ "$buf" == '!--'*'-->' ]] && buf="${buf%'>'}"
                fi

                # cut off the leading "!--" and the trailing "--"
                [[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
                buf=""
                continue
            fi

            # check if the tag starts and ends at the same time (like "<br />")
            if [[ "${buf}" == ~(Er).*/ ]] ; then
                issingletag=true
                buf="${buf%*/}"
            else
                issingletag=false
            fi

            # check if the tag has attributes (e.g. space after name)
            if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
                namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
                attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
            else
                namebuf="$buf"
                attrbuf=""
            fi

            if ${isendtag} ; then
                [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            else
                [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if ${issingletag} ; then
                    [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
                fi
            fi
            buf=""
        else
            buf+="$c"
        fi
    done

    [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}

#
# print_sample1_xml - write the built-in sample document to stdout
#
# NOTE(review): whitespace inside the here-document ends up in the text
# nodes of the parsed tree; confirm the indentation against the
# upstream sample if the exact node values matter.
#
function print_sample1_xml
{
cat <<EOF
<br />
<score-partwise instrument="flute1">
    <identification>
        <kaiman>nocrocodile</kaiman>
    </identification>
    <!-- a comment -->
    <partlist>
        <foo>myfootext</foo>
        <bar>mybartext</bar>
        <snap />
        <!-- another
             comment -->
        <ttt>myttttext</ttt>
    </partlist>
</score-partwise>
EOF
}

#
# usage - hand the usage description to getopts with the '-?' option
# (presumably to have getopts print the usage message) and exit with
# status 2
#
# Globals: progname, xmldocumenttree1_usage (read); OPTIND (reset to 0)
#
function usage
{
    OPTIND=0
    getopts -a "${progname}" "${xmldocumenttree1_usage}" OPT '-?'
    exit 2
}

# program start
builtin basename
builtin cat
builtin date
builtin uname

typeset progname="${ basename "${0}" ; }"

typeset -r xmldocumenttree1_usage=$'+
[-?\n@(#)\$Id: xmldocumenttree1 (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?xmldocumenttree1 - XML tree demo]
[+DESCRIPTION?\bxmldocumenttree\b is a small ksh93 compound variable demo
    which reads a XML input file, converts it into an internal
    variable tree representation and outputs it in the format
    specified by viewmode (either "list", "namelist" or "tree").]

file viewmode

[+SEE ALSO?\bksh93\b(1)]
'

while getopts -a "${progname}" "${xmldocumenttree1_usage}" OPT ; do
#   printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
    case ${OPT} in
        *) usage ;;
    esac
done
shift $((OPTIND-1))

typeset xmlfile="$1"
typeset viewmode="$2"

if [[ "${xmlfile}" == "" ]] ; then
    fatal_error $"No file given."
fi

if [[ "${viewmode}" != ~(Elr)(list|namelist|tree) ]] ; then
    fatal_error $"Invalid view mode \"${viewmode}\"."
fi

# document tree filled by handle_document
typeset -C xdoc
typeset -A xdoc.nodes
integer xdoc.nodesnum=0

# stack of variable paths; "doc" is the nameref which handle_document
# binds to the tree variable named by document_cb["arg_tree"]
typeset -C stack
typeset -a stack.items=( [0]="doc.nodes" )
integer stack.pos=0

# setup callbacks for xml_tok
typeset -A document_cb # callbacks for xml_tok
document_cb["document_start"]="handle_document"
document_cb["document_end"]="handle_document"
document_cb["tag_begin"]="handle_document"
document_cb["tag_end"]="handle_document"
document_cb["tag_text"]="handle_document"
document_cb["tag_comment"]="handle_document"
# argument for "handle_document"
document_cb["arg_tree"]="xdoc"


# xml_tok is the last stage of each pipeline below; the tree it builds
# in "xdoc" is read after the pipeline, so it has to run in this shell
if [[ "${xmlfile}" == "#sample1" ]] ; then
    print_sample1_xml | xml_tok document_cb
elif [[ "${xmlfile}" == "#sample2" ]] ; then
    /usr/sfw/bin/wget \
        --user-agent='ksh93_xmldocumenttree' \
        --output-document=- \
        'http://www.google.com/custom?q=gummi+bears' |
        /usr/bin/iconv -f "ISO8859-1" |
        xml_tok document_cb
else
    cat "${xmlfile}" | xml_tok document_cb
fi

print -u2 "#parsing completed."

case "${viewmode}" in
    list)
        set | egrep "xdoc.*(tagname|tagtype|tagval|tagattributes)" | fgrep -v ']=$'
        ;;
    namelist)
        typeset + | egrep "xdoc.*(tagname|tagtype|tagval|tagattributes)"
        ;;
    tree)
        print -- "${xdoc}"
        ;;
    *)
        fatal_error $"Invalid view mode \"${viewmode}\"."
        ;;
esac

print -u2 "#done."

exit 0
# EOF.