xref: /titanic_51/usr/src/lib/libshell/common/scripts/xmldocumenttree1.sh (revision 95c2d3023b88b9097d9822eb47ace5466e6d1cf4)
1#!/usr/bin/ksh93
2
3#
4# CDDL HEADER START
5#
6# The contents of this file are subject to the terms of the
7# Common Development and Distribution License (the "License").
8# You may not use this file except in compliance with the License.
9#
10# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11# or http://www.opensolaris.org/os/licensing.
12# See the License for the specific language governing permissions
13# and limitations under the License.
14#
15# When distributing Covered Code, include this CDDL HEADER in each
16# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17# If applicable, add the following below this CDDL HEADER, with the
18# fields enclosed by brackets "[]" replaced with your own identifying
19# information: Portions Copyright [yyyy] [name of copyright owner]
20#
21# CDDL HEADER END
22#
23
24#
25# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
26# Use is subject to license terms.
27#
28
# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are
# not POSIX-conformant. The XPG directories are listed first so that their
# versions are found before the ones in /bin and /usr/bin.
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin
31
#
# fatal_error - print an error message to stderr and terminate the script
#
# Arguments: $* - message words (joined with the first character of IFS)
# Globals:   progname (read) - used as prefix of the message
# Exit:      never returns; the shell exits with status 1
#
function fatal_error
{
	# "printf %s" emits the text verbatim. "print" without -r would
	# interpret backslash sequences, and the message may contain user
	# input (e.g. an invalid view mode).
	printf '%s: %s\n' "${progname}" "$*" >&2
	exit 1
}
37
#
# attrstrtoattrarray - split the attribute string of a tag into an array
# of compound variables
#
# Arguments:
#   $1 - attribute string, e.g.: instrument="flute1" selected
#   $2 - name of the array which receives the result (bound via nameref).
#        Entry i becomes ( name=... value=... ) for an attribute with a
#        value and ( name=... ) for an attribute without one.
# Exit:
#   Calls fatal_error (which terminates the script) when more than 1000
#   entries were produced; this also stops the loop if the input cannot
#   be consumed.
#
function attrstrtoattrarray
{
#set -o xtrace
    typeset s="$1"
    nameref aa=$2 # attribute array
    integer aa_count=0
    typeset nextattr
    integer currattrlen=0
    typeset tagstr
    typeset tagval

    while (( ${#s} > 0 )) ; do
        # skip whitespaces
        # Note: "currattrlen" still holds the length of the attribute
        # consumed in the previous round, so the scan starts right behind
        # that attribute and the assignment below drops both the old
        # attribute and the whitespace following it.
        while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
            (( currattrlen++ ))
        done
        s="${s:currattrlen:${#s}}"

        # anything left ?
        (( ${#s} == 0 )) && break

        # Pattern tests:
        #x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
        #x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
        #x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\']*\'}"
        #x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
        # All pattern combined via eregex (w|x|y|z).
        # The alternative for single-quoted values must stop at the next
        # single quote ([^\']*); stopping only at a double quote would
        # swallow all following single-quoted attributes, too.
        nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\'|[[:alnum:]_-:]*)}"
        currattrlen=$(( ${#s} - ${#nextattr}))

        # add entry
        tagstr="${s:0:currattrlen}"
        if [[ "${tagstr}" == *=* ]] ; then
            # normal case: attribute with value

            tagval="${tagstr#*=}"

            # strip quotes ('' or "")
            if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
                tagval="${tagval:1:${#tagval}-2}"
            fi

            aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
        else
            # special case for HTML where you have something like <foo baz>
            aa[${aa_count}]=( name="${tagstr}" )
        fi
        (( aa_count++ ))
        (( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
    done
}
91
92
#
# handle_document - xml_tok callback which builds the document tree
#
# Arguments:
#   $1 - name of the callback array; its "arg_tree" entry holds the name
#        of the compound variable which receives the tree
#   $2 - event type: tag_begin, tag_end, tag_text, tag_comment,
#        document_start or document_end
#   $3 - tag name (tag_begin/tag_end) or text (tag_text/tag_comment)
#   $4 - attribute string (tag_begin only)
# Globals:
#   stack (read/written) - stack.items[stack.pos] holds the NAME of the
#   node array which currently receives new nodes
#
function handle_document
{
#set -o xtrace
    nameref callbacks=${1}
    typeset tag_type="${2}"
    typeset tag_value="${3}"
    typeset tag_attributes="${4}"
    # "doc" is never used directly below, but the names stored in
    # stack.items start with "doc." and resolve through this nameref -
    # do not remove it.
    nameref doc=${callbacks["arg_tree"]}
    # node array on top of the stack and its entry counter ("<path>num").
    # Both are bound here, i.e. they keep pointing to the parent even
    # after stack.pos is changed below.
    nameref nodepath="${stack.items[stack.pos]}"
    nameref nodesnum="${stack.items[stack.pos]}num"

    case "${tag_type}" in
        tag_begin)
            # append a new element node; its "nodes"/"nodesnum" members
            # follow the same "<path>"/"<path>num" naming used above
            nodepath[${nodesnum}]+=(
                typeset tagtype="element"
                typeset tagname="${tag_value}"
                typeset -A tagattributes=( )
                typeset -A nodes=( )
                integer nodesnum=0
            )

            # fill attributes
            if [[ "${tag_attributes}" != "" ]] ; then
                attrstrtoattrarray "${tag_attributes}" "nodepath[${nodesnum}].tagattributes"
            fi

            # push the child array of the new element onto the stack...
            (( stack.pos++ ))
            stack.items[stack.pos]="${stack.items[stack.pos-1]}[${nodesnum}].nodes"
            # ...and only then advance the counter of the parent (the
            # line above still needs the index of the new element)
            (( nodesnum++ ))
            ;;
        tag_end)
            # pop; the tag name is not compared with the opening tag
            # NOTE(review): there is no check against stack.pos dropping
            # below 0 for unbalanced end tags.
            (( stack.pos-- ))
            ;;
        tag_text)
            nodepath[${nodesnum}]+=(
                typeset tagtype="text"
                typeset tagvalue="${tag_value}"
            )
            (( nodesnum++ ))
            ;;
        tag_comment)
            nodepath[${nodesnum}]+=(
                typeset tagtype="comment"
                typeset tagvalue="${tag_value}"
            )
            (( nodesnum++ ))
            ;;
        document_start)
            ;;
        document_end)
            ;;
    esac

#    print "xmltok: '${tag_type}' = '${tag_value}'"
}
148
#
# xml_tok - minimal XML/HTML tokenizer
#
# Reads the document from stdin character by character and reports the
# tokens through callbacks.
#
# Arguments:
#   $1 - name of an associative array of callbacks. Keys used here:
#        document_start, document_end, tag_begin, tag_end, tag_text and
#        tag_comment; empty/unset entries are skipped. A callback is
#        invoked as
#            <callback> <$1> <event> [<value> [<attribute string>]]
# Output:
#   One empty line on stdout after the input was consumed.
#
# NOTE(review): text is only flushed when the next "<" is seen, so text
# following the last tag of the input is not reported.
#
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    [[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    while IFS='' read -r -N 1 c ; do
        isendtag=false

        if [[ "$c" == "<" ]] ; then
            # flush any text content
            if [[ "$buf" != "" ]] ; then
                [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
                buf=""
            fi

            # first character after "<" tells whether this is an end tag
            IFS='' read -r -N 1 c
            if [[ "$c" == "/" ]] ; then
                isendtag=true
            else
                buf="$c"
            fi
            # read the remainder of the tag (the ">" itself is dropped)
            IFS='' read -r -d '>' c
            buf+="$c"

            # handle comments
            if [[ "$buf" == ~(El)!-- ]] ; then
                # did we read the comment completely ?
                if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
                    # No - the ">" which ended the read above is part
                    # of the comment text. Continue up to (and including)
                    # the real terminator "-->" so that its ">" does not
                    # leak into the text which follows the comment.
                    buf+=">"
                    while [[ "$buf" != "!--"*"-->" ]] ; do
                        IFS='' read -r -N 1 c || break
                        buf+="$c"
                    done
                    # drop the ">" of the terminator; buf then has the
                    # same "!--...--" shape as in the single-read case
                    if [[ "$buf" == "!--"*"-->" ]] ; then
                        buf="${buf:0:${#buf}-1}"
                    fi
                fi

                # pass the text between "!--" and "--"
                [[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
                buf=""
                continue
            fi

            # check if the tag starts and ends at the same time (like "<br />")
            if [[ "${buf}" == ~(Er).*/ ]] ; then
                issingletag=true
                buf="${buf%*/}"
            else
                issingletag=false
            fi

            # check if the tag has attributes (e.g. space after name)
            if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
                namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
                attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
            else
                namebuf="$buf"
                attrbuf=""
            fi

            if ${isendtag} ; then
                [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            else
                [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if ${issingletag} ; then
                    [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
                fi
            fi
            buf=""
        else
            buf+="$c"
        fi
    done

    [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}
233
#
# print_sample1_xml - write the built-in sample document (used for the
# pseudo file name "#sample1") to stdout
#
function print_sample1_xml
{
    # quoted delimiter: the text below is emitted verbatim
    cat <<'SAMPLE1_XML'
<br />
<score-partwise instrument="flute1">
        <identification>
            <kaiman>nocrocodile</kaiman>
        </identification>
        <!-- a comment -->
        <partlist>
            <foo>myfootext</foo>
            <bar>mybartext</bar>
            <snap />
            <!-- another
                 comment -->
            <ttt>myttttext</ttt>
        </partlist>
</score-partwise>
SAMPLE1_XML
}
254
#
# usage - show the usage message and terminate
#
# Globals: progname, xmldocumenttree1_usage (read); OPTIND, OPT (written)
# Exit:    never returns; the shell exits with status 2
#
function usage
{
    # Restart option processing and run getopts over a lone "-?"
    # argument instead of the script's own arguments (the ksh93 getopts
    # builtin is expected to answer this with the usage text - see
    # ksh93(1)).
    OPTIND=0
    getopts -a "${progname}" "${xmldocumenttree1_usage}" OPT "-?"
    exit 2
}
261
# program start

# Request the shell's built-in versions of these utilities.
# NOTE(review): "date" and "uname" are not used anywhere in this script.
builtin basename
builtin cat
builtin date
builtin uname

# name of this script without directory part (prefix for messages);
# ${ cmd ; } is the ksh93 form of command substitution
typeset progname="${ basename "${0}" ; }"

# Usage/manual text handed to getopts (see the option loop and usage()).
# This is runtime data in the format expected by the ksh93 getopts
# builtin - keep it byte-exact.
typeset -r xmldocumenttree1_usage=$'+
[-?\n@(#)\$Id: xmldocumenttree1 (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?xmldocumenttree1 - XML tree demo]
[+DESCRIPTION?\bxmldocumenttree\b is a small ksh93 compound variable demo
        which reads a XML input file, converts it into an internal
        variable tree representation and outputs it in the format
        specified by viewmode (either "list", "namelist" or "tree").]

file viewmode

[+SEE ALSO?\bksh93\b(1)]
'
283
# Option processing: whatever getopts hands back is answered with the
# usage message (the usage string appears to declare no options of its
# own).
while getopts -a "${progname}" "${xmldocumenttree1_usage}" OPT ; do
    usage
done
shift $(( OPTIND - 1 ))

# positional arguments: <file> <viewmode>
typeset xmlfile="$1"
typeset viewmode="$2"

# both arguments are mandatory; fatal_error does not return
[[ "${xmlfile}" != "" ]] || fatal_error $"No file given."
[[ "${viewmode}" == ~(Elr)(list|namelist|tree) ]] || fatal_error $"Invalid view mode \"${viewmode}\"."
302
# Root of the document tree: "nodes" receives the top-level nodes and
# "nodesnum" counts them (handle_document appends "num" to the name of a
# node array to find its counter).
typeset -C xdoc
typeset -A xdoc.nodes
integer xdoc.nodesnum=0

# Stack of the NAMES of the node arrays which are currently open;
# stack.items[stack.pos] is the array receiving new nodes.
# The names start with "doc", not "xdoc": handle_document binds a
# nameref called "doc" to the variable named in document_cb["arg_tree"]
# and resolves the stored names through it.
typeset -C stack
typeset -a stack.items=( [0]="doc.nodes" )
integer stack.pos=0

# setup callbacks for xml_tok
typeset -A document_cb # callbacks for xml_tok
document_cb["document_start"]="handle_document"
document_cb["document_end"]="handle_document"
document_cb["tag_begin"]="handle_document"
document_cb["tag_end"]="handle_document"
document_cb["tag_text"]="handle_document"
document_cb["tag_comment"]="handle_document"
# argument for "handle_document"
document_cb["arg_tree"]="xdoc"
321
322
# Feed the selected input to the tokenizer.
# xml_tok has to run in the current shell environment, otherwise the
# tree built in "xdoc" is lost; the pipelines below rely on ksh93 running
# the last component of a pipeline in the current shell (see ksh93(1)).
if [[ "${xmlfile}" == "#sample1" ]] ; then
    # built-in sample document
    print_sample1_xml | xml_tok document_cb
elif [[ "${xmlfile}" == "#sample2" ]] ; then
    # sample fetched from the network (converted from ISO8859-1)
    /usr/sfw/bin/wget \
        --user-agent='ksh93_xmldocumenttree' \
        --output-document=- \
        'http://www.google.com/custom?q=gummi+bears' |
        /usr/bin/iconv -f "ISO8859-1" |
        xml_tok document_cb
else
    # Stop here if the file cannot be read instead of printing an empty
    # tree and exiting with status 0.
    [[ -r "${xmlfile}" ]] || fatal_error $"Cannot read file \"${xmlfile}\"."
    xml_tok document_cb < "${xmlfile}"
fi
335
print -u2 "#parsing completed."

# Output of the tree in the requested format.
# (No helper variables are introduced here on purpose: "list" and
# "namelist" filter the shell's own variable listing.)
if [[ "${viewmode}" == "list" ]] ; then
    # variable assignments of the tree, without entries ending in "]="
    set | egrep "xdoc.*(tagname|tagtype|tagval|tagattributes)" | fgrep -v ']=$'
elif [[ "${viewmode}" == "namelist" ]] ; then
    # names of the tree variables only
    typeset + | egrep "xdoc.*(tagname|tagtype|tagval|tagattributes)"
elif [[ "${viewmode}" == "tree" ]] ; then
    # the compound variable as a whole
    print -- "${xdoc}"
else
    # already rejected during argument validation; kept as safeguard
    fatal_error $"Invalid view mode \"${viewmode}\"."
fi

print -u2 "#done."

exit 0
# EOF.
357