xref: /titanic_53/usr/src/cmd/ast/libshell/common/scripts/crawlsrccomments.sh (revision 906afcb89d0412cc073b95c2d701a804a8cdb62c)
1*906afcb8SAndy Fiddaman#!/usr/bin/ksh93
2*906afcb8SAndy Fiddaman
3*906afcb8SAndy Fiddaman#
4*906afcb8SAndy Fiddaman# CDDL HEADER START
5*906afcb8SAndy Fiddaman#
6*906afcb8SAndy Fiddaman# The contents of this file are subject to the terms of the
7*906afcb8SAndy Fiddaman# Common Development and Distribution License (the "License").
8*906afcb8SAndy Fiddaman# You may not use this file except in compliance with the License.
9*906afcb8SAndy Fiddaman#
10*906afcb8SAndy Fiddaman# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11*906afcb8SAndy Fiddaman# or http://www.opensolaris.org/os/licensing.
12*906afcb8SAndy Fiddaman# See the License for the specific language governing permissions
13*906afcb8SAndy Fiddaman# and limitations under the License.
14*906afcb8SAndy Fiddaman#
15*906afcb8SAndy Fiddaman# When distributing Covered Code, include this CDDL HEADER in each
16*906afcb8SAndy Fiddaman# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17*906afcb8SAndy Fiddaman# If applicable, add the following below this CDDL HEADER, with the
18*906afcb8SAndy Fiddaman# fields enclosed by brackets "[]" replaced with your own identifying
19*906afcb8SAndy Fiddaman# information: Portions Copyright [yyyy] [name of copyright owner]
20*906afcb8SAndy Fiddaman#
21*906afcb8SAndy Fiddaman# CDDL HEADER END
22*906afcb8SAndy Fiddaman#
23*906afcb8SAndy Fiddaman
24*906afcb8SAndy Fiddaman#
25*906afcb8SAndy Fiddaman# Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
26*906afcb8SAndy Fiddaman#
27*906afcb8SAndy Fiddaman
# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin first in PATH because the
# tools in /usr/bin are not POSIX-conformant.
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g. ',' instead of
# '.' in de_DE.* locales). This needs to be set _before_ any
# floating-point constants are defined in this script.
#
# LC_ALL would override LC_NUMERIC, so when it is set its value is copied
# to the individual categories below and LC_ALL itself is removed.
if [[ "${LC_ALL}" != "" ]] ; then
    export \
        LC_MONETARY="${LC_ALL}" \
        LC_MESSAGES="${LC_ALL}" \
        LC_COLLATE="${LC_ALL}" \
        LC_CTYPE="${LC_ALL}"
    unset LC_ALL
fi
export LC_NUMERIC=C
44*906afcb8SAndy Fiddaman
# Constant values for the tokenizer/parser code: a read-only compound
# variable whose members are referenced as e.g. ${ch.newline}.
compound -r ch=(
	newline=$'\n'
	tab=$'\t'
	formfeed=$'\f'
)
51*906afcb8SAndy Fiddaman
# Print an error message to stderr, prefixed with the program name, and
# terminate the script with exit status 1.
# Globals:   progname (read) - not set in this part of the file
# Arguments: $* - message text (all arguments joined into one string)
function fatal_error
{
	print -u2 "${progname}: $*"
	exit 1
}
57*906afcb8SAndy Fiddaman
# Print an informational message to stderr.
# Arguments: $* - message text (all arguments joined into one string)
function printmsg
{
	print -u2 "$*"
}
62*906afcb8SAndy Fiddaman
63*906afcb8SAndy Fiddaman
# Split an XML/HTML attribute string into an array of compound entries.
#
# Arguments:
#   $1 - attribute string, e.g.: foo=bar baz="a b" huz='x' flag
#   $2 - name of the array variable which receives the result; each
#        element is a compound with a "name" member and, for attributes
#        of the form name=value, a "value" member (surrounding '' or ""
#        quotes are stripped from the value)
#
# Terminates the script via fatal_error if more than 1000 entries are
# produced (assertion against runaway parsing).
function attrstrtoattrarray
{
#set -o xtrace
    typeset s="$1"
    nameref aa=$2 # attribute array
    integer aa_count=0
    typeset nextattr
    # length of the attribute parsed in the previous iteration; the
    # whitespace skip below starts from this offset so that the previous
    # attribute and the blanks following it are removed in one step
    integer currattrlen=0
    typeset tagstr
    typeset tagval

    while (( ${#s} > 0 )) ; do
        # skip whitespaces
        while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
            (( currattrlen++ ))
        done
        s="${s:currattrlen:${#s}}"

        # anything left ?
        (( ${#s} == 0 )) && break

        # Pattern tests:
        #x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
        #x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
        #x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\"]*\'}"
        #x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
        # All pattern combined via eregex (w|x|y|z):
        #x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\')}"
        nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\'|[[:alnum:]_-:]*)}"
        # "nextattr" is what remains after the leading attribute, so the
        # difference in length is the length of that attribute
        currattrlen=$(( ${#s} - ${#nextattr}))

        # add entry
        tagstr="${s:0:currattrlen}"
        if [[ "${tagstr}" == *=* ]] ; then
            # normal case: attribute with value

            tagval="${tagstr#*=}"

            # strip quotes ('' or "")
            if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
                tagval="${tagval:1:${#tagval}-2}"
            fi

            aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
        else
            # special case for HTML where you have something like <foo baz>
            aa[${aa_count}]=( name="${tagstr}" )
        fi
        (( aa_count++ ))
        (( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
    done
}
117*906afcb8SAndy Fiddaman
# XML document handler: tokenizer callback which records comments.
#
# Arguments:
#   $1 - name of the callbacks array (its "arg_tree" entry names the
#        document tree variable)
#   $2 - event type; only "tag_comment" is acted upon here
#   $3 - event value (for "tag_comment": the comment text)
#   $4 - attribute string (not used by this handler)
#
# NOTE(review): this function reads a variable "stack" (stack.items[],
# stack.pos) which is not defined in this part of the file; presumably it
# is a global maintained elsewhere - confirm before relying on it.
function handle_xml_document
{
#set -o xtrace
    nameref callbacks=${1}
    typeset tag_type="${2}"
    typeset tag_value="${3}"
    typeset tag_attributes="${4}"
    nameref doc=${callbacks["arg_tree"]}
    # current node and its child counter, both named by the top of "stack"
    nameref nodepath="${stack.items[stack.pos]}"
    nameref nodesnum="${stack.items[stack.pos]}num"

    case "${tag_type}" in
        tag_comment)
            # append a "comment" child node holding the comment text
            nodepath[${nodesnum}]+=(
                typeset tagtype="comment"
                typeset tagvalue="${tag_value}"
            )
            (( nodesnum++ ))
            ;;
    esac

#    print "xmltok: '${tag_type}' = '${tag_value}'"
}
142*906afcb8SAndy Fiddaman
# Simple XML/HTML tokenizer which reads the document from stdin and
# reports what it finds through callbacks.
#
# Arguments:
#   $1 - name of an associative array of callbacks. The entries looked up
#        are "document_start", "tag_text", "tag_comment", "tag_begin",
#        "tag_end" and "document_end"; empty/missing entries are skipped.
#        Each callback is invoked as:
#            <callback> <$1> <event name> [<value> [<attribute string>]]
#
# Output: a single newline is printed to stdout after the document end.
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    [[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    # read the input one character at a time
    while IFS='' read -r -N 1 c ; do
        isendtag=false

        if [[ "$c" == "<" ]] ; then
            # flush any text content
            if [[ "$buf" != "" ]] ; then
                [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
                buf=""
            fi

            # the first character after '<' tells whether this is an end tag
            IFS='' read -r -N 1 c
            if [[ "$c" == "/" ]] ; then
                isendtag=true
            else
                buf="$c"
            fi
            # read the remainder of the tag up to (excluding) the next '>'
            IFS='' read -r -d '>' c
            buf+="$c"

            # handle comments
            if [[ "$buf" == ~(El)!-- ]] ; then
                # did we read the comment completely ?
                if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
                    # the '>' which stopped the read was part of the
                    # comment text; put it back and continue reading
                    # until the buffer ends with "--"
                    buf+=">"
                    while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
                        IFS='' read -r -N 1 c || break
                        buf+="$c"
                    done
                fi

                # strip the leading "!--" and the trailing "--"
                [[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
                buf=""
                continue
            fi

            # check if the tag starts and ends at the same time (like "<br />")
            if [[ "${buf}" == ~(Er).*/ ]] ; then
                issingletag=true
                buf="${buf%*/}"
            else
                issingletag=false
            fi

            # check if the tag has attributes (e.g. space after name)
            if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
                namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
                attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
            else
                namebuf="$buf"
                attrbuf=""
            fi

            if ${isendtag} ; then
                [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            else
                [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if ${issingletag} ; then
                    [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
                fi
            fi
            buf=""
        else
            # plain text: collect until the next tag starts
            buf+="$c"
        fi
    done

    [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}
227*906afcb8SAndy Fiddaman
# Enumerate comments in a shell (or shell-like) script.
#
# Consecutive lines starting with '#' are collected into one comment
# (leading '#' removed, lines joined with newlines); any other line ends
# the current comment.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array variable which receives the comments
#   $3 - comment limit; scanning stops once the number of stored comments
#        exceeds this value
#        NOTE(review): the "ca > max_num_comments" test lets one more
#        comment than the limit through - confirm whether that is intended
# Globals: ch (read) - uses ${ch.newline}
# Returns: 0
function enumerate_comments_shell
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset line="" # keep the read buffer local to this function
	typeset comment=""

	while (( res == 0 )) ; do
		# A failing read (EOF) still runs the loop body once more: "line"
		# is then either empty or holds a last line without newline.
		line=""
		IFS='' read -r line || res=$?

		if [[ "${line}" == ~(El)#.* ]] ; then
			comment+="${line#\#}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	# A comment on the very last line of a file which lacks the final
	# newline is still pending here; store it instead of dropping it.
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
		comment=""
	fi

	return 0
}
262*906afcb8SAndy Fiddaman
263*906afcb8SAndy Fiddaman
# Enumerate comments in a troff document.
#
# Consecutive lines starting with optional dots followed by \" are
# collected into one comment (that prefix removed, lines joined with
# newlines); any other line ends the current comment.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array variable which receives the comments
#   $3 - comment limit; scanning stops once the number of stored comments
#        exceeds this value
#        NOTE(review): the "ca > max_num_comments" test lets one more
#        comment than the limit through - confirm whether that is intended
# Globals: ch (read) - uses ${ch.newline}
# Returns: 0
function enumerate_comments_troff
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset line="" # keep the read buffer local to this function
	typeset comment=""

	while (( res == 0 )) ; do
		# A failing read (EOF) still runs the loop body once more: "line"
		# is then either empty or holds a last line without newline.
		line=""
		IFS='' read -r line || res=$?

		if [[ "${line}" == ~(El)\.*\\\" ]] ; then
			comment+="${line#~(El)\.*\\\"}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	# A comment on the very last line of a file which lacks the final
	# newline is still pending here; store it instead of dropping it.
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
		comment=""
	fi

	return 0
}
298*906afcb8SAndy Fiddaman
299*906afcb8SAndy Fiddaman
# Enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.).
#
# The file content is scanned character by character with a small state
# machine (C comment, C++ comment, single-/double-quoted literal).
# A "//" comment which starts in the same column as a "//" comment on the
# line directly above is appended to that previous comment instead of
# being stored as a new one.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array variable which receives the comments
#   $3 - comment limit; scanning stops once the number of stored comments
#        exceeds this value
#   $4 - maximum number of characters of the file to scan; longer files
#        are truncated (a warning is printed to stderr)
# Globals: ch (read) - uses ${ch.newline}
# Returns: 0 on success, 1 if the scan ended inside a comment or inside a
#          double-quoted literal, or with unflushed comment text
function enumerate_comments_cpp
{
	set -o errexit
#	set -o nounset

	integer err=0

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"
	integer ca=0 # index in "comment_array"

	typeset content
	integer content_length

	integer file_pos # file position
	compound line_pos=(
		integer x=0 # X position in line
		integer y=0 # Y position in line (line number)
	)
	typeset c c2

	typeset comment

	compound state=(
		# C comment state
		typeset in_c_comment=false
		# C++ comment state
		compound cxx=(
			typeset in_comment=false
			typeset comment_continued=false
			# position of current //-pos
			compound comment_pos=(
				integer x=-1
				integer y=-1
			)
			# position of previous //-pos
			compound comment_prev_pos=(
				integer x=-1
				integer y=-1
			)
		)
		# literal state
		typeset in_sq_literal=false # single-quote literal
		typeset in_dq_literal=false # double-quote literal
	)

	content="$(< "${input_file}")"

	# Truncate file to "max_filesize_for_scan" characters.
	# This was originally added to work around a performance problem with
	# the ${str:offset:chunksize} operator which scales badly in ksh93
	# version 's' with the number of characters
	if (( ${#content} > max_filesize_for_scan )) ; then
		# (the variable name is passed as the %d argument)
		print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
			"${input_file}" \
			max_filesize_for_scan
		content="${content:0:max_filesize_for_scan}"
	fi
	content_length=${#content}

	# Iterate through the source code. The last character
	# (when file_pos == content_length) will be empty to indicate
	# EOF (this is needed for cases like when
	# a C++ comment is not terminated by a newline... ;-/)
	for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
		# c2 = current character plus lookahead, c = current character
		c2="${content:file_pos:2}"
		c="${c2:0:1}"

		if [[ "$c" == "${ch.newline}" ]] ; then
			(( line_pos.x=0, line_pos.y++ ))
		else
			(( line_pos.x++ ))
		fi

		if ${state.in_c_comment} ; then
			if [[ "$c2" == "*/" ]] ; then
				# skip the second character of the terminator, too
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=false

				# flush comment text
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.cxx.in_comment} ; then
			# a C++ comment ends at the newline or at EOF (empty "c")
			if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
				state.cxx.in_comment=false

				# flush comment text
				if ${state.cxx.comment_continued} ; then
					comment_array[ca-1]+="${ch.newline}${comment}"
				else
					comment_array[ca++]="${comment}"
				fi
				# remember where this comment started for the
				# continuation check of the next "//" comment
				(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
				   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.in_sq_literal} ; then
			# an unescaped quote closes the literal
			if [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=false
			fi
		elif ${state.in_dq_literal} ; then
			if [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=false
			fi
		else
			if [[ "$c2" == "/*" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=true
				comment=""
			elif [[ "$c2" == "//" ]] ; then
				(( file_pos++, line_pos.x++ ))
				# same column as the "//" on the previous line ?
				if (( state.cxx.comment_prev_pos.x == line_pos.x && \
					state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
					state.cxx.comment_continued=true
				else
					state.cxx.comment_continued=false
				fi
				(( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
				state.cxx.in_comment=true
				comment=""
			elif [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=true
			elif [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=true
			fi
		fi
	done

	if [[ "$comment" != "" ]] ; then
		print -u2 "## ERROR: Comment text buffer not empty at EOF."
		err=1
	fi

	if ${state.in_c_comment} ; then
		print -u2 "## ERROR: C comment did not close before EOF."
		err=1
	fi

	if ${state.cxx.in_comment} ; then
		print -u2 "## ERROR: C++ comment did not close before EOF."
		err=1
	fi

	if ${state.in_dq_literal} ; then
		print -u2 "## ERROR: Double-quoted literal did not close before EOF."
		err=1
	fi

	# We treat this one only as warning since things like "foo.html.cpp" may
	# trigger this condition accidentally
	if ${state.in_sq_literal} ; then
		print -u2 "## WARNING: Single-quoted literal did not close before EOF."
	fi

	return $err
}
475*906afcb8SAndy Fiddaman
# Determine the file type.
#
# Arguments:
#   $1 - file name
#   $2 - name of the variable which receives the detected format
#        (e.g. "c_source", "makefile", "troff", "shell", "html", ...);
#        it is only assigned when a format was recognized
# Returns: 0 if a format was recognized, 1 if the file is not a plain,
#          readable file or if no rule matched
function get_file_format
{
	set -o errexit

	typeset filename="$1"
	nameref file_format="$2"

	typeset fileeval # evaluation result of /usr/bin/file

	# check whether "filename" is a plain, readable file
	[[ ! -f "$filename" ]] && return 1
	[[ ! -r "$filename" ]] && return 1

	# In theory this code would exclusively look at the contents of
	# the file to figure out its file format - unfortunately
	# /usr/bin/file is virtually useless (the heuristics, matching
	# and output unreliable) for many file formats and therefore
	# we have to do a multi-stage approach which looks
	# at the file's content if possible and at the filename
	# otherwise. Fun... ;-(

	# pass one: Find matches for file formats where /usr/bin/file
	# is known to be unreliable:
	case "$filename" in
		*.[ch] | *.cpp | *.cc | *.cxx | *.hxx)
			file_format="c_source"
			return 0
			;;
		*Imakefile)
			file_format="imakefile"
			return 0
			;;
		*Makefile)
			file_format="makefile"
			return 0
			;;
	esac

	# pass two: match by file content via /usr/bin/file
	# (run in the "C" locale so that the output matches the patterns below)
	fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
	case "$fileeval" in
		~(E)roff)
			file_format="troff"
			return 0
			;;
		~(E)html\ document)
			file_format="html"
			return 0
			;;
		~(E)sgml\ document)
			file_format="sgml"
			return 0
			;;
		~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
			file_format="shell"
			return 0
			;;
		~(E)executable.*/perl\ script)
			file_format="perl"
			return 0
			;;
	esac

	# pass three: fallback to filename matching
	case "$filename" in
		*.man)
			file_format="troff"
			return 0
			;;
		*.html)
			file_format="html"
			return 0
			;;
		*.sgml)
			file_format="sgml"
			return 0
			;;
		*.xml)
			file_format="xml"
			return 0
			;;
		*.png)
			file_format="image_png"
			return 0
			;;
		*.xcf)
			file_format="image_xcf"
			return 0
			;;
		*.shar)
			file_format="archive_shell"
			return 0
			;;
		*.sh)
			file_format="shell"
			return 0
			;;
		*.pcf)
			file_format="font_pcf"
			return 0
			;;
		*.bdf)
			file_format="font_bdf"
			return 0
			;;
		*.pmf)
			file_format="font_pmf"
			return 0
			;;
		*.ttf | *.otf)
			file_format="font_ttf"
			return 0
			;;
		*.pfa | *.pfb)
			file_format="font_postscript"
			return 0
			;;
	esac

	# no rule matched
	return 1
}
598*906afcb8SAndy Fiddaman
# Create the record for one file: checksums, detected file format and
# (for supported formats) the list of comments found in the file.
#
# Arguments:
#   $1 - name of the associative array of records; the entry is stored
#        under the file name as key
#   $2 - file name
#   $3 - comment limit, passed on to the enumerate_comments_* function
#   $4 - maximum number of characters to scan, passed on to the
#        enumerate_comments_* function (only the CPP variant reads it)
# Returns: 0 on success (including formats without a comment parser),
#          1 if the file format could not be determined
function extract_comments
{
	set -o errexit

	nameref records="$1"
	typeset filename="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"

	typeset datatype=""

	records[${filename}]=(
		typeset filename="$filename"

		typeset fileformat_found="false" # "true" or "false"
		typeset file_format=""

		typeset -A hashsum

		typeset comments_parsed="false" # "true" or "false"
		typeset -a comments
	)

	records[${filename}].hashsum["md5"]="$(sum  -x md5  < "$filename")"
	records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

	if get_file_format "$filename" datatype ; then
		records[${filename}].fileformat_found="true"
		records[${filename}].file_format="$datatype"
	else
		return 1
	fi

	# "comments_parsed" is only set when the parser reported success
	case "$datatype" in
		c_source|imakefile)
			enumerate_comments_cpp "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		shell|makefile)
			enumerate_comments_shell "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		troff)
			enumerate_comments_troff "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		# NOTE: Disabled for now
		#xml|html|sgml)
		#	enumerate_comments_xml "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
		#		records[${filename}].comments_parsed=true
		#	;;
	esac

	return 0
}
654*906afcb8SAndy Fiddaman
#
# parse_http_response - parse HTTP status line and headers from stdin
#
# Arguments:
#	$1	name of a compound variable which receives the members
#		"statuscode" and "statusmsg" and, if the matching header
#		is present, "content_type", "content_length" and
#		"transfer_encoding"
#
# Reads up to and including the empty line which ends the header
# section; the message body is left unread on stdin.
#
# Returns 0 on success and 1 (plus a message on stderr) if the status
# line is malformed.
#
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i

	# we use '\r' as additional IFS to filter the final '\r'
	# (a failing read - e.g. EOF - leaves $h empty and is reported below)
	IFS=$' \t\r' read -r h statuscode statusmsg || true # read HTTP/1.[01] <code>
	if [[ "$h" != ~(Eil)HTTP/.* ]] ; then
		print -u2 -f $"%s: HTTP/ header missing\n" "$0"
		return 1
	fi
	# "+" and not "*": an empty status code is invalid, too
	if [[ "$statuscode" != ~(Elr)[0-9]+ ]] ; then
		print -u2 -f $"%s: invalid status code\n"  "$0"
		return 1
	fi
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# scan the remaining headers
	while IFS='' read -r i ; do
		# strip '\r' at the end
		i="${i/~(Er)$'\r'/}"

		# An empty line ends the header section. Testing after the
		# '\r' was stripped handles both CRLF and bare LF line ends.
		if [[ "$i" == "" ]] ; then
			break
		fi

		# The header name is removed with "[^:]*:" (and not ".*:")
		# to keep values intact which contain a ':' themselves.
		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${i/~(El)[^:]*:[[:blank:]]*/}"
				;;
			# fully anchored: only a plain decimal number may reach
			# the arithmetic evaluation done by "integer"
			~(Elri)Content-Length:[[:blank:]]*[0-9]+[[:blank:]]*)
				integer response.content_length="${i/~(El)[^:]*:[[:blank:]]*/}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${i/~(El)[^:]*:[[:blank:]]*/}"
				;;
		esac
	done

	return 0
}
690*906afcb8SAndy Fiddaman
#
# cat_http_body - copy a HTTP message body from stdin to stdout
#
# Arguments:
#	$1	transfer encoding of the body; "chunked" (in any case)
#		selects the chunked decoder, everything else copies
#		stdin verbatim
#
# Always returns 0.
#
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	typeset crlf
	integer chunksize=0

	# names of transfer codings are case-insensitive
	if [[ "${emode}" == ~(Elri)chunked ]] ; then
		# Each chunk is "<hex size>[;extensions]\r\n<data>\r\n"; the
		# list is terminated by a chunk of size 0.
		while IFS=$'\r' read -r hexchunksize ; do
			# drop optional chunk extensions
			hexchunksize="${hexchunksize%%;*}"

			[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break
			(( (chunksize=16#${hexchunksize}) > 0 )) || break

			# bs=1 makes sure exactly ${chunksize} bytes are taken
			dd bs=1 count="${chunksize}" 2>/dev/null

			# Consume the CRLF which follows the chunk data, otherwise
			# the next "read" sees an empty size line and the loop
			# stops after the first chunk.
			IFS='' read -r crlf || break
		done
	else
		cat
	fi

	return 0
}
709*906afcb8SAndy Fiddaman
#
# cat_url - write the content behind an URL to stdout
#
# Arguments:
#	$1	URL of the form file://<path>, http://<host>[:<port>]/<path>
#		or https://<host>[:<port>]/<path>
#
# http requests use a /dev/tcp redirection, https requests are sent
# through an "openssl s_client" child which is connected via two FIFOs
# in a temporary directory (removed again by an EXIT trap).
#
# Returns 0 on success and 1 for unsupported protocols, incomplete
# URLs, a failed connect or an invalid HTTP response. The https branch
# turns on "errexit" and leaves at the first failing command.
#
function cat_url
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	if [[ "${protocol}" == "file" ]] ; then
		cat "${path1}"
		return $?
	elif [[ "${protocol}" == ~(Elr)http(|s) ]] ; then
		typeset host="${path1%%/*}"
		typeset path=""
		typeset port="${host##*:}"
		typeset request # local, do not leak the request text to the caller

		integer netfd
		compound httpresponse # http response

		# Only take a path from the URL if it has one. Without this
		# test "http://host" yields path="host" and a request for
		# "/host"; now the "path not set" precheck below reports it.
		if [[ "${path1}" == */* ]] ; then
			path="${path1#*/}"
		fi

		# If URL did not contain a port number in the host part then look at the
		# protocol to get the port number
		if [[ "${port}" == "${host}" ]] ; then
			case "${protocol}" in
				"http")  port=80 ;;
				"https") port=443 ;;
				*)       port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
			esac
		else
			host="${host%:*}"
		fi

		printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

		# prechecks
		[[ "${protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
		[[ "${port}"     != "" ]] || { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
		[[ "${host}"     != "" ]] || { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
		[[ "${path}"     != "" ]] || { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

		# open TCP channel
		if [[ "${protocol}" == "https" ]] ; then
			compound sslfifo
			sslfifo.dir="$(mktemp -d)"
			sslfifo.in="${sslfifo.dir}/in"
			sslfifo.out="${sslfifo.dir}/out"

			# register an EXIT trap and use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			trap "rm -r \"${sslfifo.dir}\"" EXIT
			set -o errexit

			mkfifo "${sslfifo.in}" "${sslfifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${host}:${port}" <"${sslfifo.in}" >>"${sslfifo.out}" &

			# send HTTP request ("print" turns the "\r\n" sequences into CR+LF)
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93(ssl) (2010-03-27; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >>	"${sslfifo.in}"

			# collect response and send it to stdout
			{
				parse_http_response httpresponse
				cat_http_body "${httpresponse.transfer_encoding}"
			} <"${sslfifo.out}"

			# the message gets the function name for its "%s" and the
			# error is returned like all other errors of this function
			wait || { print -u2 -f "%s: openssl failed.\n" "$0" ; return 1 ; }

			return 0
		else
			redirect {netfd}<> "/dev/tcp/${host}/${port}"
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${1}" ; return 1 ; }

			# send HTTP request ("print" turns the "\r\n" sequences into CR+LF)
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93 (2010-03-27; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >&${netfd}

			# collect response and send it to stdout; do not copy a
			# body if the response header could not be parsed
			if ! parse_http_response httpresponse <&${netfd} ; then
				redirect {netfd}<&-
				return 1
			fi
			cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

			# close connection
			redirect {netfd}<&-

			return 0
		fi
	else
		return 1
	fi
	# notreached
}
804*906afcb8SAndy Fiddaman
#
# print_stats - count the records by category and print the counters
#
# Arguments:
#	$1	(optional) name of the associative array which holds the
#		records; without argument a variable called "records" is
#		used, as before
#
# The counters are collected in a compound variable which is written
# to stdout with "print -v". Always returns 0.
#
function print_stats
{
	set -o errexit

	nameref recs="${1:-records}"
	typeset i

	# gather some statistics
	compound stats=(
		integer files_with_comments=0
		integer files_without_comments=0

		integer files_without_known_format=0

		integer files_with_license_info=0
		integer files_without_license_info=0

		integer total_num_files=0
	)

	# Notes:
	# - the order of the records does not matter for counting, so the
	#   keys are used directly (quoted, file names may contain blanks)
	# - the flags are compared as strings and not executed as commands;
	#   "license_info_found" may be missing in a record
	# - "+= 1" is used since "(( x++ ))" has a non-zero exit status when
	#   x was 0, which is an error under "errexit"
	for i in "${!recs[@]}" ; do
		if [[ "${recs[$i].comments_parsed}" == "true" ]] ; then
			(( stats.files_with_comments += 1 ))
		else
			(( stats.files_without_comments += 1 ))
		fi

		if [[ "${recs[$i].fileformat_found}" != "true" ]] ; then
			(( stats.files_without_known_format += 1 ))
		fi

		if [[ "${recs[$i].license_info_found}" == "true" ]] ; then
			(( stats.files_with_license_info += 1 ))
		else
			(( stats.files_without_license_info += 1 ))
		fi

		(( stats.total_num_files += 1 ))
	done

	print -v stats
	return 0
}
845*906afcb8SAndy Fiddaman
846*906afcb8SAndy Fiddaman
#
# print_comments_plain - print all matching comments, file by file
#
# Arguments:
#	$1	name of the associative array which holds the records
#	$2	name of the compound variable with the filter options
#		(members filepattern.accept, filepattern.reject,
#		commentpattern.accept and commentpattern.reject)
#
# Records are processed sorted by file name. Each record which passes
# the file filter gets its "license_info_found" member set to "true"
# or "false", depending on whether one of its comments matched.
# Always returns 0.
#
function print_comments_plain
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset s match
	typeset -a sorted_names

	# Read the sorted keys line by line instead of word-splitting the
	# output of sort(1): file names may contain blanks or pattern
	# characters. (printf emits one empty line if there are no keys.)
	printf "%s\n" "${!records[@]}" | sort | while IFS='' read -r i ; do
		if [[ "$i" != "" ]] ; then
			sorted_names+=( "$i" )
		fi
	done
	if (( ${#sorted_names[@]} == 0 )) ; then
		return 0
	fi

	for i in "${sorted_names[@]}" ; do
		nameref node=records[$i]

		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		# the flags come from the database file: compare them as
		# strings, do not execute them as commands
		if [[ "${node.comments_parsed}" != "true" ]] ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			s="${node.comments[$j]}"
			match=false

			# a comment must match the accept pattern and must not
			# match the reject pattern
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if [[ "${match}" == "true" ]] ; then
				printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
				printf "%s\n" "$s"
				node.license_info_found=true
			fi
		done

		if [[ "${node.license_info_found}" != "true" ]] ; then
			printf "## no match found in '%s'," "${node.filename}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	return 0
}
904*906afcb8SAndy Fiddaman
#
# print_comments_duplicates_compressed - print each distinct comment
# once, followed by the list of files which contain it
#
# Arguments:
#	$1	name of the associative array which holds the records
#	$2	name of the compound variable with the filter options
#		(members filepattern.accept, filepattern.reject,
#		commentpattern.accept and commentpattern.reject)
#
# Comments are treated as duplicates if they have the same MD5 sum
# after conversion to lower case and after runs of whitespace and
# punctuation were replaced. Each record which passes the file filter
# gets its "license_info_found" member set to "true" or "false".
# Always returns 0.
#
function print_comments_duplicates_compressed
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset s match hash
	typeset -l hashstring # lowercase
	typeset -a sorted_names
	typeset -A hashed_comments

	# Read the sorted keys line by line instead of word-splitting the
	# output of sort(1): file names may contain blanks or pattern
	# characters. (printf emits one empty line if there are no keys.)
	printf "%s\n" "${!records[@]}" | sort | while IFS='' read -r i ; do
		if [[ "$i" != "" ]] ; then
			sorted_names+=( "$i" )
		fi
	done
	if (( ${#sorted_names[@]} == 0 )) ; then
		return 0
	fi

	for i in "${sorted_names[@]}" ; do
		nameref node=records[$i]

		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		# the flags come from the database file: compare them as
		# strings, do not execute them as commands
		if [[ "${node.comments_parsed}" != "true" ]] ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			s="${node.comments[$j]}"
			match=false

			# a comment must match the accept pattern and must not
			# match the reject pattern
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if [[ "${match}" == "true" ]] ; then
				# compress the comment (e.g. convert whitespaces and '.,:;()"' to newline characters) ...
				hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
				# ... and then create a MD5 hash from this string
				hash="$(sum -x md5 <<<"${hashstring}")"

				nameref hc_node=hashed_comments[${hash}]

				if [[ "${hc_node}" == "" ]] ; then
					# build node if there isn't one yet
					typeset -a hc_node.fileids
					typeset    hc_node.comment="$s"
				fi

				hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )

				node.license_info_found=true
			fi
		done

		if [[ "${node.license_info_found}" != "true" ]] ; then
			printf "## no match found in "
			printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	# print comments and all fileids (filename+hash sums) which include this comment
	for i in "${!hashed_comments[@]}" ; do
		printf "\f## The comment (ID=%s) ..." "${i}"
		printf "\n-- snip --"
		printf "\n%s" "${hashed_comments[${i}].comment}"
		printf "\n-- snip --"
		printf "\n... applies to the following files:\n"
		printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array member
	done

	return 0
}
991*906afcb8SAndy Fiddaman
#
# do_crawl - implementation of the "crawl" subcommand
#
# Arguments:
#	$1	name of the subcommand (skipped)
#	$2...	options as described by ${do_crawl_usage}
#
# Reads file names from stdin (one per line), collects a record for
# each of them via extract_comments and writes the result as compound
# variable to "crawlsrccomments_extracted_comments.cpv" in the current
# directory. Returns 0.
#
function do_crawl
{
	set -o errexit

	typeset i

	compound options=(
		integer max_filesize_for_scan=$((256*1024))
		integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
	)

	shift
	while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
		printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			S)	options.max_filesize_for_scan="${OPTARG}"  ;;
			N)	options.max_num_comments="${OPTARG}"  ;;
			*)	usage do_crawl_usage ;;
		esac
	done
	shift $((OPTIND-1))

	compound scan=(
		typeset -A records
	)

	# Read filenames from stdin. IFS='' and "-r" keep leading/trailing
	# blanks and backslashes of the names intact.
	while IFS='' read -r i ; do
		# an empty line is not a file name
		if [[ "$i" == "" ]] ; then
			continue
		fi

		printf "## scanning %s ...\n" "$i"
		# files which cannot be processed are skipped
		extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
	done

	# print compound variable array (we strip the "typeset -A records" for now)
	print -v scan >"crawlsrccomments_extracted_comments.cpv"

	print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

	return 0
}
1029*906afcb8SAndy Fiddaman
#
# do_getcomments - implementation of the "getcomments" subcommand
#
# Arguments:
#	$1	name of the subcommand (skipped)
#	$2...	options as described by ${do_getcomments_usage}
#
# Loads the database written by "crawl" (a local file or a http/https
# URL, optionally bzip2 or gzip compressed), prints the comments which
# pass the filters and, if requested, statistics. Temporary files are
# removed by an EXIT trap. Returns 0.
#
function do_getcomments
{
	set -o errexit

	# vars
	compound scan
	typeset database
	typeset tmp

	compound options=(
		typeset database="crawlsrccomments_extracted_comments.cpv"

		typeset print_stats=false
		typeset zapduplicates=false
		compound filepattern=(
			typeset accept="*"
			typeset reject=""
		)
		compound commentpattern=(
			typeset accept="~(Ei)(license|copyright)"
			typeset reject=""
		)
	)

	shift
	while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
	#    printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			c)	options.commentpattern.accept="${OPTARG}" ;;
			C)	options.commentpattern.reject="${OPTARG}" ;;
			D)	options.database="${OPTARG}" ;;
			l)	options.filepattern.accept="${OPTARG}" ;;
			L)	options.filepattern.reject="${OPTARG}" ;;
			S)	options.print_stats=true ;;
			+S)	options.print_stats=false ;;
			Z)	options.zapduplicates=true ;;
			+Z)	options.zapduplicates=false ;;
			*)	usage do_getcomments_usage ;;
		esac
	done
	shift $((OPTIND-1))

	# array of temporary files which should be cleaned-up upon exit
	typeset -a tmpfiles
	trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

	# Note: the temporary files are created with mktemp(1) since
	# predictable names in /tmp can be prepared by other users
	# (e.g. as symlink to a file which then gets overwritten).

	# Support for HTTP URLs
	if [[ "${options.database}" == ~(El)(http|https)://.* ]] ; then
		database="$(mktemp "/tmp/extract_license_cat_url_XXXXXX")"
		tmpfiles+=( "${database}" )
		print -u2 "# Loading URL..."
		cat_url "${options.database}" >"${database}"
		print -u2 "# Loading URL done."
	else
		database="${options.database}"
	fi

	if [[ ! -r "${database}" ]] ; then
		fatal_error "Can't read ${database}."
	fi

	# Support for compressed database files
	case "$(LC_ALL=C /usr/bin/file "${database}")" in
		*bzip2*)
			tmp="$(mktemp "/tmp/extract_license_bzcat_XXXXXX")"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (bzip2) ..."
			bzcat <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
		*gzip*)
			tmp="$(mktemp "/tmp/extract_license_gzcat_XXXXXX")"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (gzip) ..."
			gunzip -c <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
	esac

	# Read compound variable which contain all recorded comments
	print -u2 "# reading records..."
	read -C scan <"${database}" || fatal_error 'Error reading data.'
	print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

	# print comments
	print -u2 "# processing data..."
	print "## comments start:"
	if "${options.zapduplicates}" ; then
		print_comments_duplicates_compressed scan.records options
	else
		print_comments_plain scan.records options
	fi
	print "## comments end"
	print -u2 "# processing data done."

	if "${options.print_stats}" ; then
		# "scan" is local to this function, so the records have to be
		# handed over by name
		print_stats scan.records
	fi

	return 0
}
1133*906afcb8SAndy Fiddaman
#
# usage - print the usage message for an option description and exit
#
# Arguments:
#	$1	name of the variable which holds the getopts option
#		description
#
# Does not return; the script is left with exit code 2.
#
function usage
{
	nameref usagemsg="$1"

	# restart option processing and let getopts handle the option "-?"
	OPTIND=0
	getopts -a "${progname}" "${usagemsg}" OPT '-?'

	exit 2
}
1141*906afcb8SAndy Fiddaman
# getopts option description (and manual page text) of the
# "getcomments" subcommand, see do_getcomments
typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utility script which extracts
	license information from the "\bgetcomments\b"-database
	file created by \bcrawl\b. The script allows various
	filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
	called \bcrawl\b will scan all source files, extract
	the comments and store this information in a "database" file called
	"crawlsrccomments_extracted_comments.cpv" and then \bgetcomments\b allows
	queries on this database.]
[D:database?Database file for input (either file, http:// or https://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1165*906afcb8SAndy Fiddaman
# getopts option description (and manual page text) of the "crawl"
# subcommand, see do_crawl
typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bgetcomments\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
	Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1182*906afcb8SAndy Fiddaman
# getopts option description (and manual page text) of the main
# command; the subcommands have their own descriptions
typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bgetcomments\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1199*906afcb8SAndy Fiddaman
1200*906afcb8SAndy Fiddaman
# program start

# Use the builtin versions of these utilities; only "sum" is
# mandatory, the others fall back to the commands found in ${PATH}.
builtin basename
builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."

# exit at the first error we hit
set -o errexit

typeset progname="${ basename "${0}" ; }"

# The main command has no options of its own, each option found by
# getopts is answered with the usage message.
while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
	usage crawlsrccomments_usage
done
shift $((OPTIND-1))

# the first operand selects the subcommand, which is implemented by
# the function "do_<subcommand>"
typeset cmd="$1"

case "$cmd" in
	crawl | getcomments)
		progname+=" ${cmd}"
		"do_${cmd}" "$@"
		exit $?
		;;
	*)
		usage crawlsrccomments_usage
		;;
esac

fatal_error "not reached."
# EOF.
1242