xref: /titanic_44/usr/src/lib/libshell/common/scripts/crawlsrccomments.sh (revision 1cb875ae88fb9463b368e725c2444776595895cb)
1#!/usr/bin/ksh93
2
3#
4# CDDL HEADER START
5#
6# The contents of this file are subject to the terms of the
7# Common Development and Distribution License (the "License").
8# You may not use this file except in compliance with the License.
9#
10# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11# or http://www.opensolaris.org/os/licensing.
12# See the License for the specific language governing permissions
13# and limitations under the License.
14#
15# When distributing Covered Code, include this CDDL HEADER in each
16# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17# If applicable, add the following below this CDDL HEADER, with the
18# fields enclosed by brackets "[]" replaced with your own identifying
19# information: Portions Copyright [yyyy] [name of copyright owner]
20#
21# CDDL HEADER END
22#
23
24#
25# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
26# Use is subject to license terms.
27#
28
# The POSIX-conformant versions of the standard tools live in
# /usr/xpg6/bin and /usr/xpg4/bin on Solaris (the ones in /usr/bin are
# not conformant), so these directories are searched first.
PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin
export PATH

# All math must run in the "C" locale; otherwise locales which use a
# different radix point (e.g. ',' instead of '.' in de_DE.*) break
# floating-point handling. This has to happen _before_ any floating-point
# constant is defined in this script. If LC_ALL is set it would override
# LC_NUMERIC, so it is split into the individual categories and then
# removed.
if [[ -n "${LC_ALL}" ]] ; then
    export LC_MONETARY="${LC_ALL}"
    export LC_MESSAGES="${LC_ALL}"
    export LC_COLLATE="${LC_ALL}"
    export LC_CTYPE="${LC_ALL}"
    unset LC_ALL
fi
LC_NUMERIC=C
export LC_NUMERIC

# read-only character constants used by the tokenizer/parser code
compound -r ch=(
	newline=$'\n'
	tab=$'\t'
	formfeed=$'\f'
)
52
# Write "<progname>: <message>" to stderr and terminate the script with
# exit code 1. All arguments are joined into a single message.
function fatal_error
{
	print "${progname}: $*" >&2
	exit 1
}
58
# Write a diagnostic message (all arguments joined) to stderr.
function printmsg
{
	print "$*" >&2
}
63
64
# Split a tag attribute string into an array of compound entries.
#
# Arguments:
#   $1 - attribute string, e.g. 'foo=bar baz="x y" huz'
#   $2 - name of the array which receives the entries; each entry is a
#        compound with a "name" member and, for attributes which have a
#        value, a "value" member (surrounding '' or "" quotes removed)
#
# Calls fatal_error (terminating the script) if more than 1000 entries
# are produced; this acts as an assertion against a runaway loop.
function attrstrtoattrarray
{
#set -o xtrace
    typeset s="$1"
    nameref aa=$2 # attribute array
    integer aa_count=0
    typeset nextattr
    integer currattrlen=0
    typeset tagstr
    typeset tagval

    while (( ${#s} > 0 )) ; do
        # "s" still starts with the attribute consumed by the previous
        # iteration and "currattrlen" is its length, i.e. the offset where
        # scanning continues (0 in the first iteration).
        # skip whitespaces
        while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
            (( currattrlen++ ))
        done
        s="${s:currattrlen:${#s}}"

        # anything left ?
        (( ${#s} == 0 )) && break

        # Pattern tests:
        #x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
        #x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
        #x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\"]*\'}"
        #x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
        # All pattern combined via eregex (w|x|y|z):
        #x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\')}"
        # NOTE(review): the single-quote alternative excludes '"' rather
        # than "'" inside the value, so it looks like it can run across
        # several single-quoted attributes - TODO confirm and fix.
        nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\'|[[:alnum:]_-:]*)}"
        # length of the attribute which was stripped from the front of "s"
        currattrlen=$(( ${#s} - ${#nextattr}))

        # add entry
        tagstr="${s:0:currattrlen}"
        if [[ "${tagstr}" == *=* ]] ; then
            # normal case: attribute with value

            tagval="${tagstr#*=}"

            # strip quotes ('' or "")
            if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
                tagval="${tagval:1:${#tagval}-2}"
            fi

            aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
        else
            # special case for HTML where you have something like <foo baz>
            aa[${aa_count}]=( name="${tagstr}" )
        fi
        (( aa_count++ ))
        (( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
    done
}
118
# XML document handler (callback for the XML tokenizer).
#
# Arguments:
#   $1 - name of the callbacks array (its "arg_tree" entry names the
#        document tree variable)
#   $2 - token type; only "tag_comment" is acted upon
#   $3 - token value (for comments: the comment text)
#   $4 - attribute string of the tag (currently unused)
#
# Comments are appended to the node referenced by the top entry of the
# "stack" variable, which must be provided by the caller's environment.
function handle_xml_document
{
#set -o xtrace
    nameref callbacks=${1}
    typeset tag_type="${2}"
    typeset tag_value="${3}"
    typeset tag_attributes="${4}"
    nameref doc=${callbacks["arg_tree"]}
    nameref nodepath="${stack.items[stack.pos]}"
    nameref nodesnum="${stack.items[stack.pos]}num"

    # record comment tokens, ignore every other token type
    if [[ "${tag_type}" == "tag_comment" ]] ; then
        nodepath[${nodesnum}]+=(
            typeset tagtype="comment"
            typeset tagvalue="${tag_value}"
        )
        (( nodesnum++ ))
    fi

#    print "xmltok: '${tag_type}' = '${tag_value}'"
}
143
# Simple XML/HTML tokenizer: reads a document from stdin one character
# at a time and reports tokens through callback functions.
#
# Arguments:
#   $1 - name of an array which maps the event names "document_start",
#        "tag_text", "tag_comment", "tag_begin", "tag_end" and
#        "document_end" to the names of the functions to call; events
#        without an entry are skipped. Each callback receives "$1", the
#        event name and the event data (text, comment body or tag name;
#        "tag_begin" additionally gets the raw attribute string).
#
# A final newline is written to stdout once the input is exhausted.
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    if [[ -n "${callbacks["document_start"]}" ]] ; then
        ${callbacks["document_start"]} "${1}" "document_start"
    fi

    while IFS='' read -r -N 1 c ; do
        # everything which does not open a tag is collected as text
        if [[ "$c" != "<" ]] ; then
            buf+="$c"
            continue
        fi

        isendtag=false

        # flush any text content collected so far
        if [[ -n "$buf" ]] ; then
            if [[ -n "${callbacks["tag_text"]}" ]] ; then
                ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
            fi
            buf=""
        fi

        # the first character after '<' tells whether this is an end tag
        IFS='' read -r -N 1 c
        if [[ "$c" == "/" ]] ; then
            isendtag=true
        else
            buf="$c"
        fi
        # read the remainder of the tag up to (excluding) the next '>'
        IFS='' read -r -d '>' c
        buf+="$c"

        # handle comments
        if [[ "$buf" == ~(El)!-- ]] ; then
            # A '>' inside the comment text ends the read above early;
            # in that case keep reading until the buffer ends with "--".
            if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
                buf+=">"
                while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
                    IFS='' read -r -N 1 c || break
                    buf+="$c"
                done
            fi

            # pass the comment text without the leading "!--" and trailing "--"
            if [[ -n "${callbacks["tag_comment"]}" ]] ; then
                ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
            fi
            buf=""
            continue
        fi

        # check if the tag starts and ends at the same time (like "<br />")
        issingletag=false
        if [[ "${buf}" == ~(Er).*/ ]] ; then
            issingletag=true
            buf="${buf%*/}"
        fi

        # check if the tag has attributes (e.g. space after name)
        if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
            namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
            attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
        else
            namebuf="$buf"
            attrbuf=""
        fi

        if ${isendtag} ; then
            if [[ -n "${callbacks["tag_end"]}" ]] ; then
                ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            fi
        else
            if [[ -n "${callbacks["tag_begin"]}" ]] ; then
                ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"
            fi

            # handle tags like <br/> (which are start- and end-tag in one piece)
            if ${issingletag} && [[ -n "${callbacks["tag_end"]}" ]] ; then
                ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            fi
        fi
        buf=""
    done

    if [[ -n "${callbacks["document_end"]}" ]] ; then
        ${callbacks["document_end"]} "${1}" "document_end" "exit_success"
    fi

    print # final newline to make filters like "sed" happy
}
228
# Enumerate comments in a shell (or shell-like) script.
#
# Consecutive lines starting with '#' form one comment; the leading '#'
# is removed and the lines are joined with newlines.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - maximum number of comments to collect
#
# Always returns 0.
function enumerate_comments_shell
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	typeset line="" # kept local so it does not leak into the global scope
	typeset comment=""

	# The second half of the condition processes a final line which is
	# not terminated by a newline.
	while IFS='' read -r line || [[ "${line}" != "" ]] ; do
		if [[ "${line}" == ~(El)#.* ]] ; then
			comment+="${line#\#}${ch.newline}"
		elif [[ "${comment}" != "" ]] ; then
			# first non-comment line after a comment block: flush it
			comment_array[ca++]="${comment}"
			comment=""

			if (( ca >= max_num_comments )) ; then
				break
			fi
		fi
	done <"${input_file}"

	# flush a comment block which runs up to the end of the file
	if [[ "${comment}" != "" ]] ; then
		comment_array[ca++]="${comment}"
	fi

	return 0
}
263
264
# Enumerate comments in a troff document.
#
# Consecutive lines which start with optional dots followed by \" form
# one comment; that prefix is removed and the lines are joined with
# newlines.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - maximum number of comments to collect
#
# Always returns 0.
function enumerate_comments_troff
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	typeset line="" # kept local so it does not leak into the global scope
	typeset comment=""

	# The second half of the condition processes a final line which is
	# not terminated by a newline.
	while IFS='' read -r line || [[ "${line}" != "" ]] ; do
		if [[ "${line}" == ~(El)\.*\\\" ]] ; then
			comment+="${line#~(El)\.*\\\"}${ch.newline}"
		elif [[ "${comment}" != "" ]] ; then
			# first non-comment line after a comment block: flush it
			comment_array[ca++]="${comment}"
			comment=""

			if (( ca >= max_num_comments )) ; then
				break
			fi
		fi
	done <"${input_file}"

	# flush a comment block which runs up to the end of the file
	if [[ "${comment}" != "" ]] ; then
		comment_array[ca++]="${comment}"
	fi

	return 0
}
299
300
# Enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.)
#
# The file content is scanned character by character with a small state
# machine which tracks C comments (/* ... */), C++ comments (// ...) and
# single-/double-quoted literals (so comment markers inside literals are
# ignored). C++ comments on directly following lines which start in the
# same column are merged into one comment.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - maximum number of comments to collect
#   $4 - maximum number of characters of the file to scan
#
# Returns 0 on success and 1 if the scan ended inside a comment or a
# double-quoted literal (an unterminated single-quoted literal only
# produces a warning).
function enumerate_comments_cpp
{
	set -o errexit
#	set -o nounset

	integer err=0

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"
	integer ca=0 # index in "comment_array"

	typeset content
	integer content_length

	integer file_pos # file position
	compound line_pos=(
		integer x=0 # X position in line
		integer y=0 # Y position in line (line number)
	)
	typeset c c2
	typeset prevc # character in front of a quote character

	typeset comment

	compound state=(
		# C comment state
		typeset in_c_comment=false
		# C++ comment state
		compound cxx=(
			typeset in_comment=false
			typeset comment_continued=false
			# position of current //-pos
			compound comment_pos=(
				integer x=-1
				integer y=-1
			)
			# position of previous //-pos
			compound comment_prev_pos=(
				integer x=-1
				integer y=-1
			)
		)
		# literal state
		typeset in_sq_literal=false # single-quote literal
		typeset in_dq_literal=false # double-quote literal
	)

	content="$(< "${input_file}")"

	# Truncate file to "max_filesize_for_scan" characters.
	# This was originally added to work around a performance problem with
	# the ${str:offset:chunksize} operator which scales badly in ksh93
	# version 's' with the number of characters
	if (( ${#content} > max_filesize_for_scan )) ; then
		print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
			"${input_file}" \
			max_filesize_for_scan
		content="${content:0:max_filesize_for_scan}"
	fi
	content_length=${#content}

	# Iterate through the source code. The last character
	# (when file_pos == content_length) will be empty to indicate
	# EOF (this is needed for cases like when
	# a C++ comment is not terminated by a newline... ;-/)
	for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
		c2="${content:file_pos:2}"
		c="${c2:0:1}"

		if [[ "$c" == "${ch.newline}" ]] ; then
			(( line_pos.x=0, line_pos.y++ ))
		else
			(( line_pos.x++ ))
		fi

		if ${state.in_c_comment} ; then
			if [[ "$c2" == "*/" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=false

				# flush comment text
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca >= max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.cxx.in_comment} ; then
			if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
				state.cxx.in_comment=false

				# flush comment text; a continued comment is appended
				# to the previous entry instead of creating a new one
				if ${state.cxx.comment_continued} ; then
					comment_array[ca-1]+="${ch.newline}${comment}"
				else
					comment_array[ca++]="${comment}"
				fi
				(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
				   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				comment=""

				if (( ca >= max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.in_sq_literal} || ${state.in_dq_literal} ; then
			if [[ "$c" == '\' ]] ; then
				# Escape sequence: skip the escaped character so that it
				# can neither terminate the literal (\' or \") nor be
				# mistaken for the start of another escape (\\).
				(( file_pos += 1 ))
				if [[ "${c2:1:1}" == "${ch.newline}" ]] ; then
					(( line_pos.x=0, line_pos.y += 1 ))
				else
					(( line_pos.x += 1 ))
				fi
			elif ${state.in_sq_literal} && [[ "$c" == "'" ]] ; then
				state.in_sq_literal=false
			elif ${state.in_dq_literal} && [[ "$c" == '"' ]] ; then
				state.in_dq_literal=false
			fi
		else
			if [[ "$c2" == "/*" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=true
				comment=""
			elif [[ "$c2" == "//" ]] ; then
				(( file_pos++, line_pos.x++ ))
				# a "//" in the same column as the one in the previous
				# line continues that comment
				if (( state.cxx.comment_prev_pos.x == line_pos.x && \
					state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
					state.cxx.comment_continued=true
				else
					state.cxx.comment_continued=false
				fi
				(( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
				state.cxx.in_comment=true
				comment=""
			elif [[ "$c" == "'" || "$c" == '"' ]] ; then
				# A quote which follows a backslash does not open a
				# literal. There is no previous character at the start
				# of the content (a negative offset would wrap around
				# to the end of the string).
				prevc=""
				if (( file_pos > 0 )) ; then
					prevc="${content:file_pos-1:1}"
				fi
				if [[ "${prevc}" != '\' ]] ; then
					if [[ "$c" == "'" ]] ; then
						state.in_sq_literal=true
					else
						state.in_dq_literal=true
					fi
				fi
			fi
		fi
	done

	if [[ "$comment" != "" ]] ; then
		print -u2 "## ERROR: Comment text buffer not empty at EOF."
		err=1
	fi

	if ${state.in_c_comment} ; then
		print -u2 "## ERROR: C comment did not close before EOF."
		err=1
	fi

	if ${state.cxx.in_comment} ; then
		print -u2 "## ERROR: C++ comment did not close before EOF."
		err=1
	fi

	if ${state.in_dq_literal} ; then
		print -u2 "## ERROR: Double-quoted literal did not close before EOF."
		err=1
	fi

	# We treat this one only as warning since things like "foo.html.cpp" may
	# trigger this condition accidentally
	if ${state.in_sq_literal} ; then
		print -u2 "## WARNING: Single-quoted literal did not close before EOF."
	fi

	return $err
}
476
# Determine the format of a file.
#
# Arguments:
#   $1 - name of the file
#   $2 - name of the variable which receives the format name
#
# Returns 0 if a format was found and 1 otherwise (including the case
# that $1 is not a plain, readable file); the output variable is only
# written on success.
function get_file_format
{
	set -o errexit

	typeset filename="$1"
	nameref file_format="$2"

	typeset fileeval # evaluation result of /usr/bin/file

	# only plain, readable files can be examined
	[[ -f "$filename" && -r "$filename" ]] || return 1

	# In theory the content of the file alone would decide about the
	# format. Unfortunately the heuristics, matching and output of
	# /usr/bin/file are unreliable for many formats, so a multi-stage
	# approach is used which looks at the content where possible and at
	# the file name otherwise.

	# pass one: formats where /usr/bin/file is known to be unreliable
	# are matched by file name
	case "$filename" in
		*.[ch] | *.cpp | *.cc | *.cxx | *.hxx)
			file_format="c_source" ; return 0 ;;
		*Imakefile)
			file_format="imakefile" ; return 0 ;;
		*Makefile)
			file_format="makefile" ; return 0 ;;
	esac

	# pass two: match by file content via /usr/bin/file
	fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
	case "$fileeval" in
		~(E)roff)
			file_format="troff" ; return 0 ;;
		~(E)html\ document)
			file_format="html" ; return 0 ;;
		~(E)sgml\ document)
			file_format="sgml" ; return 0 ;;
		~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
			file_format="shell" ; return 0 ;;
		~(E)executable.*/perl\ script)
			file_format="perl" ; return 0 ;;
	esac

	# pass three: fall back to file name matching
	case "$filename" in
		*.man)
			file_format="troff" ; return 0 ;;
		*.html)
			file_format="html" ; return 0 ;;
		*.sgml)
			file_format="sgml" ; return 0 ;;
		*.xml)
			file_format="xml" ; return 0 ;;
		*.png)
			file_format="image_png" ; return 0 ;;
		*.xcf)
			file_format="image_xcf" ; return 0 ;;
		*.shar)
			file_format="archive_shell" ; return 0 ;;
		*.sh)
			file_format="shell" ; return 0 ;;
		*.pcf)
			file_format="font_pcf" ; return 0 ;;
		*.bdf)
			file_format="font_bdf" ; return 0 ;;
		*.pmf)
			file_format="font_pmf" ; return 0 ;;
		*.ttf | *.otf)
			file_format="font_ttf" ; return 0 ;;
		*.pfa | *.pfb)
			file_format="font_postscript" ; return 0 ;;
	esac

	# no pass produced a match
	return 1
}
599
# Create the record for one file and fill it with hash sums, the
# detected file format and (for supported formats) the file's comments.
#
# Arguments:
#   $1 - name of the associative array of records (indexed by file name)
#   $2 - name of the file to scan
#   $3 - maximum number of comments to collect
#   $4 - maximum number of characters to scan (passed to the enumerators)
#
# Returns 1 if the file format could not be determined, otherwise 0.
function extract_comments
{
	set -o errexit

	nameref records="$1"
	typeset filename="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"

	typeset datatype=""
	typeset enumerator="" # name of the function which collects the comments

	records[${filename}]=(
		typeset filename="$filename"

		typeset fileformat_found="false" # "true" or "false"
		typeset file_format=""

		typeset -A hashsum

		typeset comments_parsed="false" # "true" or "false"
		typeset -a comments
	)

	records[${filename}].hashsum["md5"]="$(sum  -x md5  < "$filename")"
	records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

	if ! get_file_format "$filename" datatype ; then
		return 1
	fi
	records[${filename}].fileformat_found="true"
	records[${filename}].file_format="$datatype"

	# pick the comment enumerator which matches the file format
	case "$datatype" in
		c_source|imakefile)
			enumerator="enumerate_comments_cpp"
			;;
		shell|makefile)
			enumerator="enumerate_comments_shell"
			;;
		troff)
			enumerator="enumerate_comments_troff"
			;;
		# NOTE: Disabled for now
		#xml|html|sgml)
		#	enumerator="enumerate_comments_xml"
		#	;;
	esac

	# formats without an enumerator keep comments_parsed="false"
	if [[ -n "${enumerator}" ]] ; then
		if "${enumerator}" "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} ; then
			records[${filename}].comments_parsed=true
		fi
	fi

	return 0
}
655
# Parse the status line and the headers of a HTTP response read from
# stdin (up to and including the empty line which ends the headers).
#
# Arguments:
#   $1 - name of a compound variable which receives the members
#        "statuscode" and "statusmsg" and, if the matching headers are
#        present, "content_type", "content_length" and
#        "transfer_encoding"
#
# Returns 1 (after printing a message to stderr) if the status line does
# not start with "HTTP/" or has no numeric status code, otherwise 0.
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]]         && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	# "+" (not "*"): an empty status code must be rejected, too
	[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# process remaining headers; a line which only consists of '\r'
	# marks the end of the header section
	while IFS='' read -r i ; do
		[[ "$i" == $'\r' ]] && break

		# strip '\r' at the end
		i="${i/~(Er)$'\r'/}"

		# store the header value (the text after "<name>:" and blanks)
		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Content-Length:[[:blank:]]*[0-9]*)
				integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
				;;
		esac
	done

	return 0
}
691
# Copy the body of a HTTP response from stdin to stdout.
#
# Arguments:
#   $1 - transfer encoding; "chunked" enables decoding of the chunked
#        encoding, any other value copies stdin unchanged
#
# Always returns 0.
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	integer chunksize=0

	if [[ "${emode}" == "chunked" ]] ; then
		# Each chunk is "<hex size>\r\n<data>\r\n"; a chunk of size 0
		# ends the body.
		while IFS=$'\r' read -r hexchunksize ; do
			# The "\r\n" which follows the data of the previous chunk
			# shows up as an empty line here - skip it instead of
			# treating it as the end of the body.
			[[ "${hexchunksize}" == "" ]] && continue

			# stop at anything which is not a plain hexadecimal number
			[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break

			# a chunk size of 0 marks the end of the body
			(( (chunksize=16#${hexchunksize}) > 0 )) || break

			dd bs=1 count="${chunksize}" 2>/dev/null
		done
	else
		cat
	fi

	return 0
}
710
# Write the content of an URL to stdout.
#
# Arguments:
#   $1 - URL; supported protocols are "file", "http" and "https"
#        (https is handled by an "openssl s_client" child process which
#        is connected via two FIFOs in a temporary directory)
#
# Returns the exit code of "cat" for file URLs, 0 after a http/https
# transfer and 1 for unsupported protocols, incomplete URLs or a failed
# connection. Exits the script with code 1 if the openssl child fails.
function cat_url
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	if [[ "${protocol}" == "file" ]] ; then
		cat "${path1}"
		return $?
	elif [[ "${protocol}" == ~(Elr)http(|s) ]] ; then
		typeset host="${path1%%/*}"
		# NOTE(review): for an URL without any '/' after the host this
		# leaves the host name in "path" - TODO confirm intended handling.
		typeset path="${path1#*/}"
		typeset port="${host##*:}"
		typeset request # HTTP request text (kept local to this function)

		integer netfd
		compound httpresponse # http response

		# If URL did not contain a port number in the host part then look at the
		# protocol to get the port number
		if [[ "${port}" == "${host}" ]] ; then
			case "${protocol}" in
				"http")  port=80 ;;
				"https") port=443 ;;
				*)       port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
			esac
		else
			host="${host%:*}"
		fi

		printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

		# prechecks
		[[ "${protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
		[[ "${port}"     != "" ]] || { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
		[[ "${host}"     != "" ]] || { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
		[[ "${path}"     != "" ]] || { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

		# open TCP channel
		if [[ "${protocol}" == "https" ]] ; then
			compound sslfifo
			sslfifo.dir="$(mktemp -d)"
			sslfifo.in="${sslfifo.dir}/in"
			sslfifo.out="${sslfifo.dir}/out"

			# register an EXIT trap and use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			trap "rm -r \"${sslfifo.dir}\"" EXIT
			set -o errexit

			mkfifo "${sslfifo.in}" "${sslfifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${host}:${port}" <"${sslfifo.in}" >>"${sslfifo.out}" &

			# send HTTP request ("print" expands the "\r\n" sequences)
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93(ssl) (2009-05-08; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >>	"${sslfifo.in}"

			# collect response and send it to stdout
			{
				parse_http_response httpresponse
				cat_http_body "${httpresponse.transfer_encoding}"
			} <"${sslfifo.out}"

			# the format string needs "$0" as argument for its "%s"
			wait || { print -u2 -f "%s: openssl failed.\n" "$0" ; exit 1 ; }

			return 0
		else
			redirect {netfd}<> "/dev/tcp/${host}/${port}"
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${1}" ; return 1 ; }

			# send HTTP request ("print" expands the "\r\n" sequences)
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93 (2009-05-08; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >&${netfd}

			# collect response and send it to stdout
			parse_http_response httpresponse <&${netfd}
			cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

			# close connection
			redirect {netfd}<&-

			return 0
		fi
	else
		return 1
	fi
	# notreached
}
805
# Gather some statistics about the scanned files and print them (as
# compound variable) to stdout.
#
# Arguments:
#   $1 - (optional) name of the associative array of file records; if
#        omitted, a variable called "records" which is visible to this
#        function is used
#
# A record counts as "with license info" only if its
# "license_info_found" member is "true" (records without that member
# count as "without").
#
# Always returns 0.
function print_stats
{
	set -o errexit

	if (( $# > 0 )) ; then
		nameref records=$1
	fi
	typeset i

	# gather some statistics
	compound stats=(
		integer files_with_comments=0
		integer files_without_comments=0

		integer files_without_known_format=0

		integer files_with_license_info=0
		integer files_without_license_info=0

		integer total_num_files=0
	)

	# Notes:
	# - The order of the records does not matter for counting, so the
	#   array indexes are used directly (this also keeps file names which
	#   contain whitespace intact).
	# - The flags are compared as strings since a missing flag cannot be
	#   run as a command.
	# - "+= 1" is used instead of "++" because "(( x++ ))" returns a
	#   non-zero exit code when x was 0, which would trigger "errexit".
	for i in "${!records[@]}" ; do
		if [[ "${records[$i].comments_parsed}" == "true" ]] ; then
			(( stats.files_with_comments += 1 ))
		else
			(( stats.files_without_comments += 1 ))
		fi

		if [[ "${records[$i].fileformat_found}" != "true" ]] ; then
			(( stats.files_without_known_format += 1 ))
		fi

		if [[ "${records[$i].license_info_found}" == "true" ]] ; then
			(( stats.files_with_license_info += 1 ))
		else
			(( stats.files_without_license_info += 1 ))
		fi

		(( stats.total_num_files += 1 ))
	done

	print -v stats
	return 0
}
846
847
# Print all comments which pass the filters, one after another, in the
# order of the sorted file names.
#
# Arguments:
#   $1 - name of the associative array of file records
#   $2 - name of the options compound; "filepattern.accept/reject" and
#        "commentpattern.accept/reject" hold the shell patterns used
#        for filtering (an empty pattern disables that filter)
#
# Side effect: sets "license_info_found" in every record which passes
# the file name filters.
#
# Always returns 0.
function print_comments_plain
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j

	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		nameref node=records[$i]

		# file name filters: skip files which do not match the "accept"
		# pattern or which match the "reject" pattern
		if [[ -n "${options.filepattern.accept}" && "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ -n "${options.filepattern.reject}" && "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		# nothing to print for files without parsed comments
		"${node.comments_parsed}" || continue

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# a comment must match the "accept" pattern and must not
			# match the "reject" pattern ("reject" wins)
			if [[ -n "${options.commentpattern.accept}" && "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ -n "${options.commentpattern.reject}" && "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			"${match}" || continue

			printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
			printf "%s\n" "$s"
			node.license_info_found=true
		done

		# report files in which no comment matched
		if ! "${node.license_info_found}" ; then
			printf "## no match found in '%s'," "${node.filename}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	return 0
}
905
# Print all comments which pass the filters, but print each distinct
# comment only once, followed by the list of files which contain it.
#
# Two comments are treated as the same if the MD5 sums of their
# "compressed" forms (lowercase, with runs of whitespace and punctuation
# replaced by a newline) are equal.
#
# Arguments:
#   $1 - name of the associative array of file records
#   $2 - name of the options compound; "filepattern.accept/reject" and
#        "commentpattern.accept/reject" hold the shell patterns used
#        for filtering (an empty pattern disables that filter)
#
# Side effect: sets "license_info_found" in every record which passes
# the file name filters.
#
# Always returns 0.
function print_comments_duplicates_compressed
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset hash # MD5 sum of the compressed comment (kept local)
	typeset -A hashed_comments

	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		nameref node=records[$i]

		# file name filters
		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if ! "${node.comments_parsed}" ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# a comment must match the "accept" pattern and must not
			# match the "reject" pattern ("reject" wins)
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if "${match}" ; then
				typeset -l hashstring # lowercase

				# compress the comment (e.g. convert whitespaces and '.,:;()"' to newline characters) ...
				hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
				# ... and then create a MD5 hash from this string
				hash="$(sum -x md5 <<<"${hashstring}")"

				nameref hc_node=hashed_comments[${hash}]

				if [[ "${hc_node}" == "" ]] ; then
					# build node if there isn't one yet; the text of the
					# first comment seen for this hash is the one printed
					typeset -a hc_node.fileids
					typeset    hc_node.comment="$s"
				fi

				hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )

				node.license_info_found=true
			fi
		done

		# report files in which no comment matched
		if ! "${node.license_info_found}" ; then
			printf "## no match found in "
			printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	# print comments and all fileids (filename+hash sums) which include this comment
	for i in "${!hashed_comments[@]}" ; do
		printf "\f## The comment (ID=%s) ..." "${i}"
		printf "\n-- snip --"
		printf "\n%s" "${hashed_comments[${i}].comment}"
		printf "\n-- snip --"
		printf "\n... applies to the following files:\n"
		printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array member
	done

	return 0
}
992
# "crawl" sub-command: read file names from stdin (one per line), extract
# the comments of each file and write all records as compound variable
# to the file "crawlsrccomments_extracted_comments.cpv" in the current
# directory.
#
# Arguments: the sub-command name followed by its options
#   -S <n>  maximum number of characters to scan per file
#   -N <n>  maximum number of comments to collect per file
#
# Always returns 0 (files which cannot be processed are skipped).
function do_crawl
{
	set -o errexit

	typeset OPT # current option (set by getopts)
	typeset i   # current file name

	compound options=(
		integer max_filesize_for_scan=$((256*1024))
		integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
	)

	# drop the sub-command name
	shift
	while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
		printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			S)	options.max_filesize_for_scan="${OPTARG}"  ;;
			N)	options.max_num_comments="${OPTARG}"  ;;
			*)	usage do_crawl_usage ;;
		esac
	done
	shift $((OPTIND-1))

	compound scan=(
		typeset -A records
	)

	# read filenames from stdin; IFS='' and "-r" keep leading/trailing
	# blanks and backslashes in the file names intact
	while IFS='' read -r i ; do
		printf "## scanning %s ...\n" "$i"
		extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
	done

	# print compound variable array
	print -v scan >"crawlsrccomments_extracted_comments.cpv"

	print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

	return 0
}
1030
# "getcomments" sub-command: load the database of extracted comments
# (a local file or a http/https URL, optionally compressed with bzip2 or
# gzip), print the comments which pass the filters and optionally print
# statistics.
#
# Arguments: the sub-command name followed by its options
#   -c/-C <pattern>  accept/reject pattern for comments
#   -l/-L <pattern>  accept/reject pattern for file names
#   -D <file|url>    database to read
#   -S / +S          enable/disable printing of statistics
#   -Z / +Z          enable/disable merging of duplicate comments
#
# Temporary files are created via mktemp(1) and removed by an EXIT trap.
# Calls fatal_error if the database cannot be read. Returns 0 on success.
function do_getcomments
{
	set -o errexit

	# vars
	compound scan
	typeset database
	typeset tmp
	typeset OPT # current option (set by getopts)

	compound options=(
		typeset database="crawlsrccomments_extracted_comments.cpv"

		typeset print_stats=false
		typeset zapduplicates=false
		compound filepattern=(
			typeset accept="*"
			typeset reject=""
		)
		compound commentpattern=(
			typeset accept="~(Ei)(license|copyright)"
			typeset reject=""
		)
	)

	# drop the sub-command name
	shift
	while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
	#    printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			c)	options.commentpattern.accept="${OPTARG}" ;;
			C)	options.commentpattern.reject="${OPTARG}" ;;
			D)	options.database="${OPTARG}" ;;
			l)	options.filepattern.accept="${OPTARG}" ;;
			L)	options.filepattern.reject="${OPTARG}" ;;
			S)	options.print_stats=true ;;
			+S)	options.print_stats=false ;;
			Z)	options.zapduplicates=true ;;
			+Z)	options.zapduplicates=false ;;
			*)	usage do_getcomments_usage ;;
		esac
	done
	shift $((OPTIND-1))

	# array of temporary files which should be cleaned-up upon exit
	typeset -a tmpfiles
	trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

	# Support for HTTP URLs
	if [[ "${options.database}" == ~(El)(http|https)://.* ]] ; then
		# mktemp creates the file with an unpredictable name, which
		# avoids clashes and symlink attacks in /tmp
		database="$(mktemp "/tmp/extract_license_cat_url_XXXXXX")" || fatal_error "Can't create temporary file."
		tmpfiles+=( "${database}" )
		print -u2 "# Loading URL..."
		cat_url "${options.database}" >"${database}"
		print -u2 "# Loading URL done."
	else
		database="${options.database}"
	fi

	if [[ ! -r "${database}" ]] ; then
		fatal_error "Can't read ${database}."
	fi

	# Support for compressed database files
	case "$(LC_ALL=C /usr/bin/file "${database}")" in
		*bzip2*)
			tmp="$(mktemp "/tmp/extract_license_bzcat_XXXXXX")" || fatal_error "Can't create temporary file."
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (bzip2) ..."
			bzcat <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
		*gzip*)
			tmp="$(mktemp "/tmp/extract_license_gunzip_XXXXXX")" || fatal_error "Can't create temporary file."
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (gzip) ..."
			gunzip -c <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
	esac

	# Read compound variable which contain all recorded comments
	print -u2 "# reading records..."
	read -C scan <"${database}" || fatal_error 'Error reading data.'
	print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

	# print comments
	print -u2 "# processing data..."
	print "## comments start:"
	if "${options.zapduplicates}" ; then
		print_comments_duplicates_compressed scan.records options
	else
		print_comments_plain scan.records options
	fi
	print "## comments end"
	print -u2 "# processing data done."

	if "${options.print_stats}" ; then
		# "scan" is local to this function, so the name of the record
		# array is handed over explicitly
		print_stats scan.records
	fi

	return 0
}
1134
# Print the help text generated from a getopts usage specification and
# terminate the script.
#   $1 - NAME of the variable holding the getopts usage string
#        (e.g. crawlsrccomments_usage), not the string itself
# Reads the global ${progname} as the command name passed to getopts.
# Does not return: the script is ended with an explicit "exit 2".
function usage
{
	typeset -n usage_spec="$1"

	# Restart option parsing from scratch and hand getopts the
	# pseudo-option "-?" so that it reports the usage message.
	OPTIND=0
	getopts -a "${progname}" "${usage_spec}" OPT '-?'
	exit 2
}
1142
# getopts usage specification (ksh93 self-documenting option syntax) for the
# "getcomments" sub-command. It drives the option parsing loop in
# do_getcomments and is rendered as help text via "usage do_getcomments_usage".
# The description refers to the query step as "getcomments", the name under
# which the command dispatcher of this script actually provides it.
typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2009-05-09 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utility script which extracts
	license information from the "\bgetcomments\b"-database
	file created by \bcrawl\b. The script allows various
	filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
	called \bcrawl\b will scan all source files, extract
	the comments and stores this information in a "database" file called
	"crawlsrccomments_extracted_comments.cpv" and then \bgetcomments\b allows
	queries on this database.]
[D:database?Database file for input (either file, http:// or https://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1165
# getopts usage specification (ksh93 self-documenting option syntax) for the
# "crawl" sub-command; presumably consumed by do_crawl, which is defined
# outside this section of the file (verify there).
# The description names "getcomments" as the follow-up processing step since
# that is the query command the dispatcher of this script provides.
typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2009-05-09 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bgetcomments\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
	Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1181
# getopts usage specification (ksh93 self-documenting option syntax) for the
# top-level command. It is used by the global option loop at program start
# and shown via "usage crawlsrccomments_usage" when no valid sub-command is
# given. The text after the blank line is the free-form argument synopsis.
# The description names "getcomments" as the follow-up processing step since
# that is the query sub-command this script dispatches to.
typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2009-05-09 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bgetcomments\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1197
1198
# program start

# Load the basename builtin first; it is used right below to derive the
# program name.
builtin basename

# Determine the program name before anything can call fatal_error:
# fatal_error prefixes its message with ${progname}, so setting it only
# after the builtin checks would yield a message without a program name
# when the sum builtin is missing.
typeset progname="${ basename "${0}" ; }"

# Remaining builtins used by the script. Only a missing "sum" is treated as
# fatal here; the other builtin commands are not checked.
builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."

# exit at the first error we hit
set -o errexit

# Parse the global options. The top-level usage specification defines no
# options of its own, so whatever getopts hands to the loop body ends up in
# the usage message.
while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
	# printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		*)	usage crawlsrccomments_usage ;;
	esac
done
shift $((OPTIND-1))

# The first remaining argument selects the sub-command. It is deliberately
# left in "$@" when the handler is called (do_getcomments shifts it away
# itself before parsing its own options).
typeset cmd="$1"

case "$cmd" in
	"crawl")
		# Make messages and help output show "<progname> crawl".
		progname+=" ${cmd}"
		do_crawl "$@"
		exit $?
		;;
	"getcomments")
		# Make messages and help output show "<progname> getcomments".
		progname+=" ${cmd}"
		do_getcomments "$@"
		exit $?
		;;
	*)
		# Missing or unknown sub-command: usage does not return.
		usage crawlsrccomments_usage
		;;
esac

# Every case branch above terminates the script.
fatal_error "not reached."
# EOF.
1240