xref: /titanic_51/usr/src/lib/libshell/common/scripts/crawlsrccomments.sh (revision fc51f9bbbff02dbd8c3adf640b1a184ceeb58fa5)
1#!/usr/bin/ksh93
2
3#
4# CDDL HEADER START
5#
6# The contents of this file are subject to the terms of the
7# Common Development and Distribution License (the "License").
8# You may not use this file except in compliance with the License.
9#
10# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11# or http://www.opensolaris.org/os/licensing.
12# See the License for the specific language governing permissions
13# and limitations under the License.
14#
15# When distributing Covered Code, include this CDDL HEADER in each
16# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17# If applicable, add the following below this CDDL HEADER, with the
18# fields enclosed by brackets "[]" replaced with your own identifying
19# information: Portions Copyright [yyyy] [name of copyright owner]
20#
21# CDDL HEADER END
22#
23
24#
25# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
26# Use is subject to license terms.
27#
28
# Solaris keeps its POSIX-conformant tools in /usr/xpg6/bin and /usr/xpg4/bin
# (the versions in /usr/bin are not conformant), so these directories have
# to come first in the search path.
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# All math has to run with the "C" numeric locale to avoid problems with
# alternative radix point representations (e.g. ',' instead of '.' in
# de_DE.*-locales). This needs to be set _before_ any floating-point
# constants are defined in this script.
# A non-empty LC_ALL is copied to the other categories and then removed,
# leaving LC_NUMERIC free to be set on its own.
if [[ -n "${LC_ALL}" ]] ; then
	export LC_MONETARY="${LC_ALL}"
	export LC_MESSAGES="${LC_ALL}"
	export LC_COLLATE="${LC_ALL}"
	export LC_CTYPE="${LC_ALL}"
	unset LC_ALL
fi
export LC_NUMERIC=C

# character constants for the tokenizer/parser code
typeset -r ch=(
	newline=$'\n'
	tab=$'\t'
	formfeed=$'\f'
)
52
# Print an error message (all arguments joined into one line and prefixed
# with the program name) to stderr and terminate the script with exit
# code 1.
function fatal_error
{
	typeset message="${progname}: $*"

	print -u2 "${message}"
	exit 1
}
58
# Write all arguments, joined into a single line, to stderr.
function printmsg
{
	typeset text="$*"

	print -u2 "${text}"
}
63
64
# Split an XML/HTML attribute string into an array of attributes.
#
# Arguments:
#   $1 - attribute string, e.g. 'foo="bar" baz=1 qux'
#   $2 - name of the array which receives the result; every element is a
#        compound variable with a "name" member and - if the attribute
#        has one - a "value" member
#
# Surrounding '' or "" quotes are stripped from the values. Collecting
# more than 1000 attributes is treated as a parser failure and ends the
# script via fatal_error.
function attrstrtoattrarray
{
#set -o xtrace
    typeset s="$1"
    nameref aa=$2 # attribute array
    integer aa_count=0
    typeset nextattr
    integer currattrlen=0
    typeset tagstr
    typeset tagval

    while (( ${#s} > 0 )) ; do
        # Skip whitespaces. From the second round on "currattrlen" still
        # holds the length of the attribute parsed in the previous round,
        # which means this also steps over that attribute.
        while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
            (( currattrlen++ ))
        done
        s="${s:currattrlen:${#s}}"

        # anything left ?
        (( ${#s} == 0 )) && break

        # Pattern tests:
        #x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
        #x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
        #x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\']*\'}"
        #x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
        # All pattern combined via eregex (w|x|y|z).
        # The single-quote alternative has to stop at the next single
        # quote ("[^\']"); excluding the double quote there would let one
        # match run across several single-quoted attributes.
        nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\'|[[:alnum:]_-:]*)}"
        currattrlen=$(( ${#s} - ${#nextattr}))

        # add entry
        tagstr="${s:0:currattrlen}"
        if [[ "${tagstr}" == *=* ]] ; then
            # normal case: attribute with value

            tagval="${tagstr#*=}"

            # strip quotes ('' or "")
            if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
                tagval="${tagval:1:${#tagval}-2}"
            fi

            aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
        else
            # special case for HTML where you have something like <foo baz>
            aa[${aa_count}]=( name="${tagstr}" )
        fi
        (( aa_count++ ))
        (( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
    done
}
118
# XML document handler: tokenizer callback which stores comment tokens in
# the node referenced by the top entry of the "stack" variable (which is
# not defined in this function).
#
# Arguments:
#   $1 - name of the callback table
#   $2 - token type; only "tag_comment" is processed
#   $3 - token value (the comment text)
#   $4 - attribute string (not used here)
function handle_xml_document
{
#set -o xtrace
	nameref callbacks=${1}
	typeset token_type="${2}"
	typeset token_value="${3}"
	typeset token_attributes="${4}"
	nameref doc=${callbacks["arg_tree"]}
	nameref nodepath="${stack.items[stack.pos]}"
	nameref nodesnum="${stack.items[stack.pos]}num"

	# all token types except comments are ignored
	if [[ "${token_type}" == "tag_comment" ]] ; then
		nodepath[${nodesnum}]+=(
			typeset tagtype="comment"
			typeset tagvalue="${token_value}"
		)
		(( nodesnum++ ))
	fi

#	print "xmltok: '${token_type}' = '${token_value}'"
}
143
# Minimal XML/HTML tokenizer.
#
# Reads a document from stdin one character at a time and reports the
# tokens through the callback table whose name is passed as $1. The table
# entries "document_start", "tag_text", "tag_comment", "tag_begin",
# "tag_end" and "document_end" are used when they are non-empty; each
# callback is invoked with the table name, the token type and the token
# data (text, comment body or tag name; "tag_begin" additionally gets the
# raw attribute string).
# A newline is written to stdout once the input is exhausted.
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    [[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    while IFS='' read -r -N 1 c ; do
        isendtag=false

        if [[ "$c" == "<" ]] ; then
            # flush any text content
            if [[ "$buf" != "" ]] ; then
                [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
                buf=""
            fi

            # a leading '/' marks an end tag and is not part of the name
            IFS='' read -r -N 1 c
            if [[ "$c" == "/" ]] ; then
                isendtag=true
            else
                buf="$c"
            fi
            # read the remainder of the tag (the '>' is consumed but not stored)
            IFS='' read -r -d '>' c
            buf+="$c"

            # handle comments
            if [[ "$buf" == ~(El)!-- ]] ; then
                # did we read the comment completely ?
                if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
                    # No - the '>' which ended the read above is part of
                    # the comment text. Put it back and continue reading
                    # until the real terminator "-->" has been seen.
                    buf+=">"
                    while [[ "$buf" != '!--'*'-->' ]] ; do
                        IFS='' read -r -N 1 c || break
                        buf+="$c"
                    done
                    # Remove the terminating '>' again so that "buf" has
                    # the same "!--...--" layout as in the single-read
                    # case. Stopping at "--" instead would leave the '>'
                    # in the input and make it part of the next text token.
                    if [[ "$buf" == '!--'*'-->' ]] ; then
                        buf="${buf%?}"
                    fi
                fi

                # pass the text between "!--" and the trailing "--"
                [[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
                buf=""
                continue
            fi

            # check if the tag starts and ends at the same time (like "<br />")
            if [[ "${buf}" == ~(Er).*/ ]] ; then
                issingletag=true
                buf="${buf%*/}"
            else
                issingletag=false
            fi

            # check if the tag has attributes (e.g. space after name)
            if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
                namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
                attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
            else
                namebuf="$buf"
                attrbuf=""
            fi

            if ${isendtag} ; then
                [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            else
                [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if ${issingletag} ; then
                    [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
                fi
            fi
            buf=""
        else
            buf+="$c"
        fi
    done

    [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}
228
# Enumerate comments in a shell (or shell-like) script.
#
# Consecutive lines which start with '#' are collected into one comment;
# the leading '#' is removed from every line and the lines are joined
# with newlines.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - maximum number of comments to collect
#
# Always returns 0.
function enumerate_comments_shell
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset comment=""
	typeset line="" # line buffer (kept local to this function)

	# The loop body runs one more time after "read" hit EOF so that a
	# last line without newline and a pending comment are still handled.
	while (( res == 0 )) ; do
		IFS='' read -r line
		(( res=$? ))

		if [[ "${line}" == ~(El)#.* ]] ; then
			comment+="${line#\#}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				# "ca" is the number of comments stored so far
				if (( ca >= max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	return 0
}
263
264
# Enumerate comments in a troff document.
#
# Consecutive comment lines (lines which match the \" comment pattern
# below) are collected into one comment; the comment introducer is removed
# from every line and the lines are joined with newlines.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - maximum number of comments to collect
#
# Always returns 0.
function enumerate_comments_troff
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset comment=""
	typeset line="" # line buffer (kept local to this function)

	# The loop body runs one more time after "read" hit EOF so that a
	# last line without newline and a pending comment are still handled.
	while (( res == 0 )) ; do
		IFS='' read -r line
		(( res=$? ))

		if [[ "${line}" == ~(El)\.*\\\" ]] ; then
			comment+="${line#~(El)\.*\\\"}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				# "ca" is the number of comments stored so far
				if (( ca >= max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	return 0
}
299
300
# Enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.)
#
# The file content is scanned character by character with a small state
# machine which knows about C comments, C++ comments and single- and
# double-quoted literals. C++ comments which start in the same column on
# directly following lines are merged into one comment.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - maximum number of comments to collect
#   $4 - maximum number of characters to scan (longer files are truncated
#        and a warning is printed)
#
# Returns 0 on success and 1 if the scan ended in an inconsistent state
# (unterminated comment or double-quoted literal, pending comment text).
function enumerate_comments_cpp
{
	set -o errexit
#	set -o nounset

	integer err=0

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"
	integer ca=0 # index in "comment_array"

	typeset content
	integer content_length

	integer file_pos # file position
	typeset line_pos=(
		integer x=0 # X position in line
		integer y=0 # Y position in line (line number)
	)
	typeset c c2

	typeset comment

	typeset state=(
		# C comment state
		typeset in_c_comment=false
		# C++ comment state
		typeset cxx=(
			typeset in_comment=false
			typeset comment_continued=false
			# position of current //-pos
			typeset comment_pos=(
				integer x=-1
				integer y=-1
			)
			# position of previous //-pos
			typeset comment_prev_pos=(
				integer x=-1
				integer y=-1
			)
		)
		# literal state
		typeset in_sq_literal=false # single-quote literal
		typeset in_dq_literal=false # double-quote literal
	)

	content="$(< "${input_file}")"

	# Truncate file to "max_filesize_for_scan" characters.
	# This was originally added to work around a performance problem with
	# the ${str:offset:chunksize} operator which scales badly in ksh93
	# version 's' with the number of characters
	if (( ${#content} > max_filesize_for_scan )) ; then
		print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
			"${input_file}" \
			max_filesize_for_scan
		content="${content:0:max_filesize_for_scan}"
	fi
	content_length=${#content}

	# Iterate through the source code. The last character
	# (when file_pos == content_length) will be empty to indicate
	# EOF (this is needed for cases like when
	# a C++ comment is not terminated by a newline... ;-/)
	for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
		c2="${content:file_pos:2}"
		c="${c2:0:1}"

		if [[ "$c" == "${ch.newline}" ]] ; then
			(( line_pos.x=0, line_pos.y++ ))
		else
			(( line_pos.x++ ))
		fi

		if ${state.in_c_comment} ; then
			if [[ "$c2" == "*/" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=false

				# flush comment text
				comment_array[ca++]="${comment}"
				comment=""

				# "ca" is the number of comments stored so far
				if (( ca >= max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.cxx.in_comment} ; then
			if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
				state.cxx.in_comment=false

				# flush comment text (a continued comment is appended to
				# the previous array entry instead of creating a new one)
				if ${state.cxx.comment_continued} ; then
					comment_array[ca-1]+="${ch.newline}${comment}"
				else
					comment_array[ca++]="${comment}"
				fi
				(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
				   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				comment=""

				if (( ca >= max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.in_sq_literal} || ${state.in_dq_literal} ; then
			if [[ "$c" == '\' ]] ; then
				# A backslash escapes the next character. Skipping that
				# character here makes sure that an escaped quote does
				# not end the literal and that an escaped backslash
				# right in front of the closing quote (e.g. '\\') does
				# not hide that quote.
				(( file_pos++ ))
				if [[ "${c2:1:1}" == "${ch.newline}" ]] ; then
					(( line_pos.x=0, line_pos.y++ ))
				else
					(( line_pos.x++ ))
				fi
			elif ${state.in_sq_literal} && [[ "$c" == "'" ]] ; then
				state.in_sq_literal=false
			elif ${state.in_dq_literal} && [[ "$c" == '"' ]] ; then
				state.in_dq_literal=false
			fi
		else
			if [[ "$c2" == "/*" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=true
				comment=""
			elif [[ "$c2" == "//" ]] ; then
				(( file_pos++, line_pos.x++ ))
				# a "//" in the same column as the "//" on the previous
				# line continues that comment
				if (( state.cxx.comment_prev_pos.x == line_pos.x && \
					state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
					state.cxx.comment_continued=true
				else
					state.cxx.comment_continued=false
				fi
				(( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
				state.cxx.in_comment=true
				comment=""
			elif [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=true
			elif [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=true
			fi
		fi
	done

	if [[ "$comment" != "" ]] ; then
		print -u2 "## ERROR: Comment text buffer not empty at EOF."
		err=1
	fi

	if ${state.in_c_comment} ; then
		print -u2 "## ERROR: C comment did not close before EOF."
		err=1
	fi

	if ${state.cxx.in_comment} ; then
		print -u2 "## ERROR: C++ comment did not close before EOF."
		err=1
	fi

	if ${state.in_dq_literal} ; then
		print -u2 "## ERROR: Double-quoted literal did not close before EOF."
		err=1
	fi

	# We treat this one only as warning since things like "foo.html.cpp" may
	# trigger this condition accidentally
	if ${state.in_sq_literal} ; then
		print -u2 "## WARNING: Single-quoted literal did not close before EOF."
	fi

	return $err
}
476
# Determine the file type.
#
# Arguments:
#   $1 - name of the file to classify
#   $2 - name of the variable which receives the format name
#
# Returns 0 if a format was found and 1 otherwise (including the case
# that $1 is not a plain, readable file). The output variable is only
# written on success.
function get_file_format
{
	set -o errexit

	typeset filename="$1"
	nameref file_format="$2"

	typeset fileeval # evaluation result of /usr/bin/file
	typeset detected="" # format name found by the current pass

	# only plain, readable files can be classified
	[[ -f "$filename" && -r "$filename" ]] || return 1

	# In theory this code would exclusively look at the contents of
	# the file to figure out its file format - unfortunately
	# /usr/bin/file is virtually useless (the heuristics, matching
	# and output unreliable) for many file formats and therefore
	# we have to do a multi-stage approach which looks
	# at the file's content if possible and at the filename
	# otherwise. Fun... ;-(

	# pass one: Find matches for file formats where /usr/bin/file
	# is known to be unreliable:
	case "$filename" in
		*.[ch] | *.cpp | *.cc | *.cxx | *.hxx)
			detected="c_source" ;;
		*Imakefile)
			detected="imakefile" ;;
		*Makefile)
			detected="makefile" ;;
	esac
	if [[ -n "$detected" ]] ; then
		file_format="$detected"
		return 0
	fi

	# pass two: match by file content via /usr/bin/file
	fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
	case "$fileeval" in
		~(E)roff)
			detected="troff" ;;
		~(E)html\ document)
			detected="html" ;;
		~(E)sgml\ document)
			detected="sgml" ;;
		~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
			detected="shell" ;;
		~(E)executable.*/perl\ script)
			detected="perl" ;;
	esac
	if [[ -n "$detected" ]] ; then
		file_format="$detected"
		return 0
	fi

	# pass three: fallback to filename matching
	case "$filename" in
		*.man)		detected="troff" ;;
		*.html)		detected="html" ;;
		*.sgml)		detected="sgml" ;;
		*.xml)		detected="xml" ;;
		*.png)		detected="image_png" ;;
		*.xcf)		detected="image_xcf" ;;
		*.shar)		detected="archive_shell" ;;
		*.sh)		detected="shell" ;;
		*.pcf)		detected="font_pcf" ;;
		*.bdf)		detected="font_bdf" ;;
		*.pmf)		detected="font_pmf" ;;
		*.ttf | *.otf)	detected="font_ttf" ;;
		*.pfa | *.pfb)	detected="font_postscript" ;;
	esac
	if [[ -n "$detected" ]] ; then
		file_format="$detected"
		return 0
	fi

	return 1
}
599
# Create the record for one file: hash sums, detected file format and
# (for formats with a comment parser) the comments found in the file.
#
# Arguments:
#   $1 - name of the associative array of records; the new record is
#        stored under the file name
#   $2 - file name
#   $3 - maximum number of comments (passed on to the comment parser)
#   $4 - maximum number of characters to scan (passed on to the comment
#        parser)
#
# Returns 1 if the file format could not be determined and 0 otherwise.
function extract_comments
{
	set -o errexit

	nameref records="$1"
	typeset filename="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"

	typeset datatype=""
	typeset enumerator="" # name of the comment parser for this format

	records[${filename}]=(
		typeset filename="$filename"

		typeset fileformat_found="false" # "true" or "false"
		typeset file_format=""

		typeset -A hashsum

		typeset comments_parsed="false" # "true" or "false"
		typeset -a comments
	)

	records[${filename}].hashsum["md5"]="$(sum  -x md5  < "$filename")"
	records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

	# without a known file format there is nothing more to do
	get_file_format "$filename" datatype || return 1
	records[${filename}].fileformat_found="true"
	records[${filename}].file_format="$datatype"

	# select the comment parser which matches the format
	case "$datatype" in
		c_source|imakefile)	enumerator="enumerate_comments_cpp" ;;
		shell|makefile)		enumerator="enumerate_comments_shell" ;;
		troff)			enumerator="enumerate_comments_troff" ;;
		# NOTE: Disabled for now
		#xml|html|sgml)		enumerator="enumerate_comments_xml" ;;
	esac

	# formats without parser keep comments_parsed="false"
	if [[ -n "${enumerator}" ]] ; then
		"${enumerator}" "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
			records[${filename}].comments_parsed=true
	fi

	return 0
}
655
# Parse the HTTP status line and the response headers from stdin.
#
# Arguments:
#   $1 - name of the compound variable which receives the result; the
#        members "statuscode" and "statusmsg" are always set,
#        "content_type", "content_length" and "transfer_encoding" only if
#        the matching header is present
#
# Reading stops after the blank line which ends the header block, i.e.
# the input is left positioned at the start of the body.
# Returns 1 if the status line is missing or malformed and 0 otherwise.
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]]         && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	# at least one digit is required - an empty status code is invalid, too
	[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# skip remaining headers
	while IFS='' read -r i ; do
		# the header block ends with an empty line (normally "\r\n"; a
		# bare "\n" is accepted as well)
		[[ "$i" == $'\r' || "$i" == "" ]] && break

		# strip '\r' at the end
		i="${i/~(Er)$'\r'/}"

		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Content-Length:[[:blank:]]*[0-9]*)
				integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
				;;
		esac
	done

	return 0
}
691
# Copy a HTTP response body from stdin to stdout.
#
# Arguments:
#   $1 - transfer encoding; "chunked" enables decoding of the chunked
#        transfer coding, any other value copies stdin unchanged
#
# Always returns 0.
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	typeset chunktrailer
	integer chunksize=0

	if [[ "${emode}" == "chunked" ]] ; then
		# Each chunk is "<size in hex>\r\n<data>\r\n"; a chunk with
		# size 0 ends the body. The size has to consist of at least one
		# hex digit - everything else ends the loop.
		while IFS=$'\r' read -r hexchunksize &&
			[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] &&
			(( chunksize=16#${hexchunksize} )) && (( chunksize > 0 )) ; do
			dd bs=1 count="${chunksize}" 2>/dev/null

			# Skip the line end which follows the chunk data; without
			# this the next "read" would return an empty line instead
			# of the size of the next chunk.
			IFS='' read -r chunktrailer || break
		done
	else
		cat
	fi

	return 0
}
710
# Fetch a URL via HTTP/1.1 and write the response body to stdout.
#
# Arguments:
#   $1 - URL in the form protocol://host[:port]/path
#
# The port defaults to 80 for "http"; for other protocols it is looked up
# via getent(1). A progress line is written to stderr.
# Returns 1 if the URL is incomplete, the connection cannot be opened or
# the response headers cannot be parsed, and 0 otherwise.
function cat_http
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	typeset host="${path1%%/*}"
	typeset path=""
	typeset port="${host##*:}"
	typeset request # text of the HTTP request

	integer netfd
	typeset -C httpresponse # http response

	# Only URLs with a '/' behind the host part have a path (otherwise
	# stripping "*/" would leave the host name as path).
	if [[ "${path1}" == */* ]] ; then
		path="${path1#*/}"
	fi

	# If URL did not contain a port number in the host part then look at the
	# protocol to get the port number
	if [[ "${port}" == "${host}" ]] ; then
		case "${protocol}" in
			"http") port=80 ;;
			*)      port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
		esac
	else
		host="${host%:*}"
	fi

	printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

	# prechecks
	[[ "${protocol}" == "" ]] && { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
	[[ "${port}"     == "" ]] && { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
	[[ "${host}"     == "" ]] && { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
	[[ "${path}"     == "" ]] && { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

	# open TCP channel
	redirect {netfd}<>"/dev/tcp/${host}/${port}"
	(( $? != 0 )) && { print -u2 -f "%s: Couldn't open %s\n" "$0" "${1}" ; return 1 ; }

	# send HTTP request ("print" turns the "\r\n" sequences into CR+LF)
	request="GET /${path} HTTP/1.1\r\n"
	request+="Host: ${host}\r\n"
	request+="User-Agent: crawlsrccomments/ksh93 (2008-06-14; $(uname -s -r -p))\r\n"
	request+="Connection: close\r\n"
	print -n -- "${request}\r\n" >&${netfd}

	# collect response and send it to stdout
	if ! parse_http_response httpresponse <&${netfd} ; then
		# no usable response - do not leave the connection open
		redirect {netfd}<&-
		return 1
	fi
	cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

	# close connection
	redirect {netfd}<&-

	return 0
}
762
# Print statistics about a set of file records to stdout.
#
# Arguments:
#   $1 - (optional) name of the associative array of records. Without
#        this argument the variable "records" visible from this function
#        is used.
#
# The record members "comments_parsed", "fileformat_found" and
# "license_info_found" are expected to hold "true" or "false"; a missing
# member counts as "false".
# Always returns 0.
function print_stats
{
	set -o errexit

	if (( $# > 0 )) && [[ "$1" != "records" ]] ; then
		nameref records=$1
	fi
	typeset i

	# gather some statistics
	typeset stats=(
		integer files_with_comments=0
		integer files_without_comments=0

		integer files_without_known_format=0

		integer files_with_license_info=0
		integer files_without_license_info=0

		integer total_num_files=0
	)

	# The keys are read line by line to keep file names with blanks or
	# pattern characters intact.
	# The counters use "+= 1" because "(( x++ ))" returns a non-zero exit
	# code when x was 0, which ends the script under "errexit".
	printf "%s\n" "${!records[@]}" | sort | while IFS='' read -r i ; do
		# printf emits one empty line if there are no records at all
		[[ "$i" == "" ]] && continue

		if [[ "${records[$i].comments_parsed}" == "true" ]] ; then
			(( stats.files_with_comments += 1 ))
		else
			(( stats.files_without_comments += 1 ))
		fi

		if [[ "${records[$i].fileformat_found}" != "true" ]] ; then
			(( stats.files_without_known_format += 1 ))
		fi

		if [[ "${records[$i].license_info_found}" == "true" ]] ; then
			(( stats.files_with_license_info += 1 ))
		else
			(( stats.files_without_license_info += 1 ))
		fi

		(( stats.total_num_files += 1 ))
	done

	printf "%B\n" stats
	return 0
}
803
804
# Print all matching comments, file by file (sorted by record key).
#
# Arguments:
#   $1 - name of the associative array of file records
#   $2 - name of the options compound variable; the members
#        filepattern.accept/reject and commentpattern.accept/reject are
#        used as shell patterns
#
# Every record which passes the file filter gets its "license_info_found"
# member set to "true" or "false". Files without any matching comment are
# reported with a "## no match found" line.
# Always returns 0.
function print_comments_plain
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j

	# The keys are read line by line to keep file names with blanks or
	# pattern characters intact.
	printf "%s\n" "${!records[@]}" | sort | while IFS='' read -r i ; do
		# printf emits one empty line if there are no records at all
		[[ "$i" == "" ]] && continue

		nameref node="records[$i]"

		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if [[ "${node.comments_parsed}" != "true" ]] ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# a comment has to match the accept pattern and must not
			# match the reject pattern
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if "${match}" ; then
				printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
				printf "%s\n" "$s"
				node.license_info_found=true
			fi
		done

		if [[ "${node.license_info_found}" != "true" ]] ; then
			printf "## no match found in '%s'," "${node.filename}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	return 0
}
862
# Print all matching comments with similar/duplicate comments combined.
#
# Arguments:
#   $1 - name of the associative array of file records
#   $2 - name of the options compound variable; the members
#        filepattern.accept/reject and commentpattern.accept/reject are
#        used as shell patterns
#
# Each matching comment is reduced to a lowercase form with runs of
# separator characters collapsed, and the MD5 sum of that form is used as
# comment ID. Every distinct ID is printed once together with the list of
# files (name and hash sums) which contain such a comment.
# Every record which passes the file filter gets its "license_info_found"
# member set to "true" or "false". Files without any matching comment are
# reported with a "## no match found" line.
# Always returns 0.
function print_comments_duplicates_compressed
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset hash
	typeset -A hashed_comments

	# The keys are read line by line to keep file names with blanks or
	# pattern characters intact.
	printf "%s\n" "${!records[@]}" | sort | while IFS='' read -r i ; do
		# printf emits one empty line if there are no records at all
		[[ "$i" == "" ]] && continue

		nameref node="records[$i]"

		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if [[ "${node.comments_parsed}" != "true" ]] ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# a comment has to match the accept pattern and must not
			# match the reject pattern
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi


			if "${match}" ; then
				typeset -l hashstring # lowercase

				# compress the comment (e.g. convert whitespaces and '.,:;()"' to newline characters) ...
				hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
				# ... and then create a MD5 hash from this string
				hash="$(sum -x md5 <<<"${hashstring}")"

				nameref hc_node=hashed_comments[${hash}]

				if [[ "${hc_node}" == "" ]] ; then
					# build node if there isn't one yet
					typeset -a hc_node.fileids
					typeset    hc_node.comment="$s"
				fi

				hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )

				node.license_info_found=true
			fi
		done

		if [[ "${node.license_info_found}" != "true" ]] ; then
			printf "## no match found in "
			printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	# print comments and all fileids (filename+hash sums) which include this comment
	for i in "${!hashed_comments[@]}" ; do
		printf "\f## The comment (ID=%s) ..." "${i}"
		printf "\n-- snip --"
		printf "\n%s" "${hashed_comments[${i}].comment}"
		printf "\n-- snip --"
		printf "\n... applies to the following files:\n"
		printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array member
	done

	return 0
}
949
# "crawl" sub-command: read file names from stdin (one per line), extract
# the comments of every file and write the collected records to
# "crawlsrccomments_extracted_comments.cpv" in the current directory.
#
# Arguments: the sub-command name followed by the options described in
# "do_crawl_usage".
# Always returns 0; files which cannot be processed are skipped.
function do_crawl
{
	set -o errexit

	typeset options=(
		integer max_filesize_for_scan=$((256*1024))
		integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
	)
	typeset i

	# drop the sub-command name
	shift
	while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
		printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			S)	options.max_filesize_for_scan="${OPTARG}"  ;;
			N)	options.max_num_comments="${OPTARG}"  ;;
			*)	usage do_crawl_usage ;;
		esac
	done
	shift $((OPTIND-1))

	typeset scan=(
		typeset -A records
	)

	# read filenames from stdin (taken literally: backslashes and
	# leading/trailing blanks are part of the name)
	while IFS='' read -r i ; do
		printf "## scanning %s ...\n" "$i"
		extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
	done

	# print compound variable array (we strip the "typeset -A records" for now)
	printf "%B\n" scan |
		sed $'s/^#.*$//;s/^\(//;s/^\)//;s/^\ttypeset -A records=\(//;s/^\t\)//' >"crawlsrccomments_extracted_comments.cpv"

	print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

	return 0
}
988
# "getcomments" sub-command: load a comment database and print the
# comments which pass the file and comment filters.
#
# Arguments: the sub-command name followed by the options described in
# "do_getcomments_usage".
#
# The database may be a local file or a http:// URL and may be compressed
# with bzip2 or gzip. Temporary files are removed by an EXIT trap.
# Returns 0; an unreadable database ends the script via fatal_error.
function do_getcomments
{
	set -o errexit

	# vars
	typeset scan=(
		typeset -A records
	)
	typeset database
	typeset tmp

	typeset options=(
		typeset database="crawlsrccomments_extracted_comments.cpv"

		typeset print_stats=false
		typeset zapduplicates=false
		typeset filepattern=(
			typeset accept="*"
			typeset reject=""
		)
		typeset commentpattern=(
			typeset accept="~(Ei)(license|copyright)"
			typeset reject=""
		)
	)

	# drop the sub-command name
	shift
	while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
	#    printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			c)	options.commentpattern.accept="${OPTARG}" ;;
			C)	options.commentpattern.reject="${OPTARG}" ;;
			D)	options.database="${OPTARG}" ;;
			l)	options.filepattern.accept="${OPTARG}" ;;
			L)	options.filepattern.reject="${OPTARG}" ;;
			S)	options.print_stats=true ;;
			+S)	options.print_stats=false ;;
			Z)	options.zapduplicates=true ;;
			+Z)	options.zapduplicates=false ;;
			*)	usage do_getcomments_usage ;;
		esac
	done
	shift $((OPTIND-1))

	# array of temporary files which should be cleaned-up upon exit
	typeset -a tmpfiles
	trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

	# Temporary files are created via mktemp(1) to get unpredictable names
	# (fixed names in /tmp could be prepared by other users).

	# Support for HTTP URLs
	if [[ "${options.database}" == ~(El)http://.* ]] ; then
		database="$(mktemp "/tmp/extract_license_cat_http_XXXXXX")"
		tmpfiles+=( "${database}" )
		print -u2 "# Loading URL..."
		cat_http "${options.database}" >"${database}"
		print -u2 "# Loading URL done."
	else
		database="${options.database}"
	fi

	if [[ ! -r "${database}" ]] ; then
		fatal_error "Can't read ${database}."
	fi

	# Support for compressed database files
	case "$(LC_ALL=C /usr/bin/file "${database}")" in
		*bzip2*)
			tmp="$(mktemp "/tmp/extract_license_bzcat_XXXXXX")"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (bzip2) ..."
			bzcat <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
		*gzip*)
			tmp="$(mktemp "/tmp/extract_license_gunzip_XXXXXX")"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (gzip) ..."
			gunzip -c <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
	esac

	# Read compound variable which contain all recorded comments
	# (the database file holds the array content without the enclosing
	# parentheses, so these are added again here)
	print -u2 "# reading records..."
	{
		printf "("
		cat "${database}"
		printf ")\n"
	} | read -C scan.records || fatal_error 'Error reading data.'
	print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

	# print comments
	print -u2 "# processing data..."
	print "## comments start:"
	if "${options.zapduplicates}" ; then
		print_comments_duplicates_compressed scan.records options
	else
		print_comments_plain scan.records options
	fi
	print "## comments end"
	print -u2 "# processing data done."

	if "${options.print_stats}" ; then
		# hand over the records which were just processed
		print_stats scan.records
	fi

	return 0
}
1098
# Print the usage message and leave the script with exit code 2.
#
# Arguments:
#   $1 - name of the variable which holds the getopts usage string
function usage
{
	nameref usagemsg="$1"

	# restart the option parsing and let getopts handle a '-?' option
	# for the given usage string
	OPTIND=0
	getopts -a "${progname}" "${usagemsg}" OPT '-?'

	exit 2
}
1106
# getopts usage string (options and help text) of the "getcomments"
# sub-command
typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utilty script which extracts
	license information from the "\bgetcomments\b"-database
	file created by \bcrawl\b. The script allows various
	filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
	called \bcrawl\b will scan all source files, extract
	the comments and stores this information in a "database" file called
	"crawlsrccomments_extracted_comments.cpv" and then \bgetcomments\b allows
	queries on this database.]
[D:database?Database file for input (either file or http://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1129
# getopts usage string (options and help text) of the "crawl" sub-command
typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utilty script which reads
	a list of source code files from stdin, determinates the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bgetcomments\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
	Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1145
# getopts usage string (help text) of the main program
typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utilty script which reads
	a list of source code files from stdin, determinates the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bgetcomments\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1161
1162
# program start

# make the tools used by this script available as shell builtins
builtin basename
builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."

# exit at the first error we hit
set -o errexit

typeset progname="${ basename "${0}" ; }"

# The main program has no options of its own: whatever getopts accepts
# here only results in the usage message.
while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
	# printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	usage crawlsrccomments_usage
done
shift $((OPTIND-1))

typeset cmd="$1"

# Dispatch to the sub-command. "$@" still starts with the sub-command
# name; the do_* functions remove it themselves.
if [[ "$cmd" == "crawl" || "$cmd" == "getcomments" ]] ; then
	progname+=" ${cmd}"
	"do_${cmd}" "$@"
	exit $?
else
	usage crawlsrccomments_usage
fi

fatal_error "not reached."
# EOF.
1204