#!/usr/bin/ksh93
# xref: /titanic_52/usr/src/cmd/ast/libshell/common/scripts/xmldocumenttree1.sh (revision 906afcb89d0412cc073b95c2d701a804a8cdb62c)

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin in front of the search path
# because the tools in /usr/bin are not POSIX-conformant.
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin
30
# Print an error message to stderr and terminate the script.
#   $*       - message words (joined using the first character of IFS)
#   progname - global; used as prefix of the message
# Does not return; the shell exits with status 1.
function fatal_error
{
	# A fixed printf format prints the message verbatim; "print" without
	# "-r" would expand backslash sequences contained in the message
	# (which may include user-supplied text such as the view mode).
	printf '%s: %s\n' "${progname}" "$*" >&2
	exit 1
}
36
# Split the attribute part of a tag into an array of compound variables.
#   $1 - attribute string, e.g. 'foo="bar" baz=1 huz'
#   $2 - name of the array variable which receives the result; every
#        element gets a member "name" and, for attributes written as
#        name=value, a member "value" (one pair of enclosing '' or ""
#        quotes is removed from the value)
# Calls fatal_error when more than 1000 entries have been collected.
function attrstrtoattrarray
{
#set -o xtrace
    typeset s="$1"
    nameref aa=$2 # attribute array
    integer aa_count=0
    typeset nextattr
    integer currattrlen=0
    typeset tagstr
    typeset tagval

    while (( ${#s} > 0 )) ; do
        # skip whitespaces; currattrlen still holds the length of the
        # attribute consumed by the previous iteration (0 on the first
        # pass), so that attribute is cut off together with the blanks
        while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
            (( currattrlen++ ))
        done
        s="${s:currattrlen:${#s}}"

        # anything left ?
        (( ${#s} == 0 )) && break

        # Pattern tests:
        #x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
        #x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
        #x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\']*\'}"
        #x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
        # All pattern combined via eregex (w|x|y|z).
        # The single-quoted alternative must stop at the next "'" (not at
        # the next '"'), otherwise "a='1' b='2'" is taken as one attribute.
        nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\'|[[:alnum:]_-:]*)}"
        currattrlen=$(( ${#s} - ${#nextattr}))

        if (( currattrlen == 0 )) ; then
            # Nothing matched (the string starts with a character which
            # cannot begin an attribute): skip this one character so the
            # loop always makes progress instead of collecting empty
            # entries until the assertion below fires.
            currattrlen=1
            continue
        fi

        # add entry
        tagstr="${s:0:currattrlen}"
        if [[ "${tagstr}" == *=* ]] ; then
            # normal case: attribute with value

            tagval="${tagstr#*=}"

            # strip quotes ('' or "")
            if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
                tagval="${tagval:1:${#tagval}-2}"
            fi

            aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
        else
            # special case for HTML where you have something like <foo baz>
            aa[${aa_count}]=( name="${tagstr}" )
        fi
        (( aa_count++ ))
        (( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
    done
}
90
91
# Callback for xml_tok: builds the document tree from tokenizer events.
#   $1 - name of the callback array; its element "arg_tree" holds the
#        name of the compound variable which receives the tree
#   $2 - event type: tag_begin, tag_end, tag_text, tag_comment,
#        document_start or document_end
#   $3 - tag name (tag_begin/tag_end) or text/comment content
#   $4 - raw attribute string (tag_begin only, may be empty)
# Globals: "stack" - stack.items[] holds the variable path of the node
#        list for each open element, stack.pos is the current depth.
function handle_document
{
#set -o xtrace
    nameref callbacks=${1}
    typeset tag_type="${2}"
    typeset tag_value="${3}"
    typeset tag_attributes="${4}"
    # "doc" is not referenced directly below, but the paths stored in
    # stack.items[] start with "doc." and are resolved via this nameref
    nameref doc=${callbacks["arg_tree"]}
    # node list of the element which is currently open ...
    nameref nodepath="${stack.items[stack.pos]}"
    # ... and its counter: appending "num" to "<path>.nodes" gives the
    # sibling variable "<path>.nodesnum"
    nameref nodesnum="${stack.items[stack.pos]}num"

    case "${tag_type}" in
        tag_begin)
            nodepath[${nodesnum}]+=(
                typeset tagtype="element"
                typeset tagname="${tag_value}"
                compound -A tagattributes
                compound -A nodes
                integer nodesnum=0
            )

            # fill attributes
            if [[ "${tag_attributes}" != "" ]] ; then
                attrstrtoattrarray "${tag_attributes}" "nodepath[${nodesnum}].tagattributes"
            fi

            # descend: the new element's node list becomes the current
            # one (the path must be built before nodesnum is incremented)
            (( stack.pos++ ))
            stack.items[stack.pos]="${stack.items[stack.pos-1]}[${nodesnum}].nodes"
            (( nodesnum++ ))
            ;;
        tag_end)
            # go back to the parent; a stray end tag in malformed input
            # must not move the position below the document root
            if (( stack.pos > 0 )) ; then
                (( stack.pos-- ))
            fi
            ;;
        tag_text)
            nodepath[${nodesnum}]+=(
                typeset tagtype="text"
                typeset tagvalue="${tag_value}"
            )
            (( nodesnum++ ))
            ;;
        tag_comment)
            nodepath[${nodesnum}]+=(
                typeset tagtype="comment"
                typeset tagvalue="${tag_value}"
            )
            (( nodesnum++ ))
            ;;
        document_start)
            ;;
        document_end)
            ;;
    esac

#    print "xmltok: '${tag_type}' = '${tag_value}'"
}
147
# Simple XML/HTML tokenizer: reads the document from stdin and reports
# the pieces it finds through callbacks.
#   $1 - name of an associative array which maps the event names
#        document_start, document_end, tag_begin, tag_end, tag_text and
#        tag_comment to the command to run; empty entries are skipped
# Each callback is called with $1 (the array name) and the event name,
# followed by:
#   tag_text/tag_comment - the text or the comment content
#   tag_begin            - the tag name and the raw attribute string
#   tag_end              - the tag name
#   document_end         - the string "exit_success"
# A tag written like "<br />" produces tag_begin followed by tag_end.
# An empty line is written to stdout when the input is exhausted.
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    [[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    # the input is consumed one character at a time
    while IFS='' read -r -N 1 c ; do
        isendtag=false

        if [[ "$c" == "<" ]] ; then
            # flush any text content
            if [[ "$buf" != "" ]] ; then
                [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
                buf=""
            fi

            # the first character after "<" tells whether this is an end tag
            IFS='' read -r -N 1 c
            if [[ "$c" == "/" ]] ; then
                isendtag=true
            else
                buf="$c"
            fi
            # read the remainder of the tag (the ">" itself is consumed
            # but not stored)
            IFS='' read -r -d '>' c
            buf+="$c"

            # handle comments
            if [[ "$buf" == ~(El)!-- ]] ; then
                # did we read the comment completely ?
                if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
                    # no, the ">" we stopped at was part of the comment
                    # text: put it back and read on until the real
                    # terminator "-->" was seen
                    buf+=">"
                    while [[ "$buf" != '!--'*'-->' ]] ; do
                        IFS='' read -r -N 1 c || break
                        buf+="$c"
                    done
                    # Remove the terminating ">" again so that it neither
                    # stays in the input (where it would become part of
                    # the next text node) nor in the comment value.
                    if [[ "$buf" == '!--'*'-->' ]] ; then
                        buf="${buf:0:${#buf}-1}"
                    fi
                fi

                # pass the content between "!--" and "--"
                [[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
                buf=""
                continue
            fi

            # check if the tag starts and ends at the same time (like "<br />")
            if [[ "${buf}" == ~(Er).*/ ]] ; then
                issingletag=true
                buf="${buf%*/}"
            else
                issingletag=false
            fi

            # check if the tag has attributes (e.g. space after name)
            if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
                namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
                attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
            else
                namebuf="$buf"
                attrbuf=""
            fi

            if ${isendtag} ; then
                [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            else
                [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if ${issingletag} ; then
                    [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
                fi
            fi
            buf=""
        else
            # plain text: collect it until the next tag starts
            buf+="$c"
        fi
    done

    [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}
232
# Write the built-in sample document (selected with the file name
# "#sample1") to stdout.
function print_sample1_xml
{
# The delimiter is quoted so that the document is emitted literally,
# without any expansion taking place inside of it.
cat <<'EOF'
<br />
<score-partwise instrument="flute1">
        <identification>
            <kaiman>nocrocodile</kaiman>
        </identification>
        <!-- a comment -->
        <partlist>
            <foo>myfootext</foo>
            <bar>mybartext</bar>
            <snap />
            <!-- another
                 comment -->
            <ttt>myttttext</ttt>
        </partlist>
</score-partwise>
EOF
}
253
# Show the usage message and terminate the script with exit status 2.
# Globals read: progname, xmldocumenttree1_usage (the getopts option
# description).
function usage
{
    # Restart option processing and hand getopts the single argument
    # "-?"; with the ksh93 getopts this produces the usage message from
    # the option description.
    OPTIND=0
    getopts -a "${progname}" "${xmldocumenttree1_usage}" OPT '-?'
    exit 2
}
260
# program start
# (only the builtins which are used below are requested)
builtin basename
builtin cat

typeset progname="${ basename "${0}" ; }"

typeset -r xmldocumenttree1_usage=$'+
[-?\n@(#)\$Id: xmldocumenttree1 (Roland Mainz) 2009-05-09 \$\n]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?xmldocumenttree1 - XML tree demo]
[+DESCRIPTION?\bxmldocumenttree\b is a small ksh93 compound variable demo
        which reads a XML input file, converts it into an internal
        variable tree representation and outputs it in the format
        specified by viewmode (either "list", "namelist", "tree" or "compacttree").]

file viewmode

[+SEE ALSO?\bksh93\b(1)]
'

# the script defines no options of its own: everything getopts hands
# back ends in usage()
while getopts -a "${progname}" "${xmldocumenttree1_usage}" OPT ; do
#    printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
    case ${OPT} in
        *)    usage ;;
    esac
done
shift $((OPTIND-1))

typeset xmlfile="$1"
typeset viewmode="$2"

if [[ "${xmlfile}" == "" ]] ; then
    fatal_error $"No file given."
fi

if [[ "${viewmode}" != ~(Elr)(list|namelist|tree|compacttree) ]] ; then
    fatal_error $"Invalid view mode \"${viewmode}\"."
fi

# the document tree filled by handle_document
compound xdoc
compound -A xdoc.nodes
integer xdoc.nodesnum=0

# stack of node-list paths, one per open element; "doc" is the nameref
# handle_document binds to the tree variable
compound stack
typeset -a stack.items=( [0]="doc.nodes" )
integer stack.pos=0

# setup callbacks for xml_tok
typeset -A document_cb # callbacks for xml_tok
document_cb["document_start"]="handle_document"
document_cb["document_end"]="handle_document"
document_cb["tag_begin"]="handle_document"
document_cb["tag_end"]="handle_document"
document_cb["tag_text"]="handle_document"
document_cb["tag_comment"]="handle_document"
# argument for "handle_document"
document_cb["arg_tree"]="xdoc"


# xml_tok is always the last command of the pipeline; ksh93 runs that
# command in the current shell, which is what lets the callbacks fill
# "xdoc" for the output code below.
if [[ "${xmlfile}" == "#sample1" ]] ; then
    print_sample1_xml | xml_tok document_cb
elif [[ "${xmlfile}" == "#sample2" ]] ; then
    /usr/sfw/bin/wget \
        --user-agent='ksh93_xmldocumenttree' \
        --output-document=- \
        'http://www.google.com/custom?q=gummi+bears' |
        /usr/bin/iconv -f "ISO8859-1" |
        xml_tok document_cb
else
    # fail early instead of parsing nothing and reporting success
    # ("-" is passed through to cat unchecked)
    if [[ "${xmlfile}" != "-" && ! -r "${xmlfile}" ]] ; then
        fatal_error $"Cannot read file \"${xmlfile}\"."
    fi
    cat "${xmlfile}" | xml_tok document_cb
fi

print -u2 "#parsing completed."

case "${viewmode}" in
    list)
        # variable assignments of the tree, without the empty array entries
        set | grep -E "xdoc.*(tagname|tagtype|tagval|tagattributes)" | grep -F -v ']=$'
        ;;
    namelist)
        typeset + | grep -E "xdoc.*(tagname|tagtype|tagval|tagattributes)"
        ;;
    tree)
        print -v xdoc
        ;;
    compacttree)
        print -C xdoc
        ;;
       *)
        fatal_error $"Invalid view mode \"${viewmode}\"."
        ;;
esac

print -u2 "#done."

exit 0
# EOF.