xref: /titanic_50/usr/src/lib/libshell/common/scripts/crawlsrccomments.sh (revision 134a1f4e3289b54e0f980e9cf05352e419a60bee)
1#!/usr/bin/ksh93
2
3#
4# CDDL HEADER START
5#
6# The contents of this file are subject to the terms of the
7# Common Development and Distribution License (the "License").
8# You may not use this file except in compliance with the License.
9#
10# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11# or http://www.opensolaris.org/os/licensing.
12# See the License for the specific language governing permissions
13# and limitations under the License.
14#
15# When distributing Covered Code, include this CDDL HEADER in each
16# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17# If applicable, add the following below this CDDL HEADER, with the
18# fields enclosed by brackets "[]" replaced with your own identifying
19# information: Portions Copyright [yyyy] [name of copyright owner]
20#
21# CDDL HEADER END
22#
23
24#
25# Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
26#
27
# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g. ',' instead of
# '.' in de_DE.* locales). This needs to be set _before_ any
# floating-point constants are defined in this script.
# LC_ALL would override LC_NUMERIC, so its value is copied to the other
# categories and LC_ALL itself is removed.
if [[ -n "${LC_ALL}" ]] ; then
    export LC_MONETARY="${LC_ALL}"
    export LC_MESSAGES="${LC_ALL}"
    export LC_COLLATE="${LC_ALL}"
    export LC_CTYPE="${LC_ALL}"
    unset LC_ALL
fi
export LC_NUMERIC=C
44
# Read-only compound variable with the character constants used by the
# tokenizer/parser code (referenced as ${ch.newline}, ${ch.tab}, ...)
typeset -C -r ch=(
	newline=$'\n'
	tab=$'\t'
	formfeed=$'\f'
)
51
# Print "<progname>: <message>" on stderr and terminate the script
# with exit code 1. All arguments are joined into one message.
function fatal_error
{
	typeset message="$*"

	print -u2 "${progname}: ${message}"
	exit 1
}
57
# Print all arguments (joined into one line) on stderr.
function printmsg
{
	typeset text="$*"

	print -u2 "${text}"
}
62
63
# Split an XML/HTML attribute string into an array of compound entries.
#
# Arguments:
#   $1 - attribute string, e.g.: foo=bar baz="a b" huz='c d' flag
#   $2 - name of the array which receives one entry per attribute:
#        ( name=... value=... ) for "name=value" attributes (surrounding
#        '' or "" quotes are stripped from the value) and ( name=... )
#        for value-less attributes like the "baz" in <foo baz>
#
# Aborts via fatal_error if more than 1000 entries are produced (this
# guards against an endless loop on input the patterns cannot consume).
function attrstrtoattrarray
{
#set -o xtrace
    typeset s="$1"
    nameref aa=$2 # attribute array
    integer aa_count=0
    typeset nextattr
    integer currattrlen=0
    typeset tagstr
    typeset tagval

    while (( ${#s} > 0 )) ; do
        # "currattrlen" still holds the length of the attribute parsed in
        # the previous round (0 in the first round); skip the whitespaces
        # following it and then cut both from the front of "s"
        while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
            (( currattrlen++ ))
        done
        s="${s:currattrlen:${#s}}"

        # anything left ?
        (( ${#s} == 0 )) && break

        # Pattern tests:
        #x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
        #x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
        #x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\']*\'}"
        #x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
        # All pattern combined via eregex (w|x|y|z):
        #x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\')}"
        #
        # Note: the single-quoted alternative must stop at the next single
        # quote ([^\']*); stopping only at a double quote would let the
        # longest-match "##" operator swallow all following attributes.
        nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\'|[[:alnum:]_-:]*)}"
        currattrlen=$(( ${#s} - ${#nextattr}))

        # add entry
        tagstr="${s:0:currattrlen}"
        if [[ "${tagstr}" == *=* ]] ; then
            # normal case: attribute with value

            tagval="${tagstr#*=}"

            # strip quotes ('' or "")
            if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
                tagval="${tagval:1:${#tagval}-2}"
            fi

            aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
        else
            # special case for HTML where you have something like <foo baz>
            aa[${aa_count}]=( name="${tagstr}" )
        fi
        (( aa_count++ ))
        (( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
    done
}
117
118# XML document handler
# Callback for xml_tok: collects the comments of a document.
#
# Arguments:
#   $1 - name of the callbacks array (its "arg_tree" entry names the
#        document tree variable)
#   $2 - event type (only "tag_comment" is acted upon)
#   $3 - event value (for "tag_comment": the comment text)
#   $4 - attribute string (currently unused)
#
# Uses the variable "stack" from the enclosing scope to find the node
# list (and its element counter "<node>num") which receives the comment.
function handle_xml_document
{
#set -o xtrace
    nameref callbacks=${1}
    typeset tag_type="${2}"
    typeset tag_value="${3}"
    typeset tag_attributes="${4}"
    nameref doc=${callbacks["arg_tree"]}
    nameref nodepath="${stack.items[stack.pos]}"
    nameref nodesnum="${stack.items[stack.pos]}num"

    if [[ "${tag_type}" == "tag_comment" ]] ; then
        # append a new comment node at the current tree position
        nodepath[${nodesnum}]+=(
            typeset tagtype="comment"
            typeset tagvalue="${tag_value}"
        )
        (( nodesnum++ ))
    fi

#    print "xmltok: '${tag_type}' = '${tag_value}'"
}
142
# Simple XML/SGML/HTML tokenizer: reads a document from stdin and calls
# the handlers registered in a callbacks array.
#
# Arguments:
#   $1 - name of an associative array which maps the event names
#        "document_start", "tag_text", "tag_comment", "tag_begin",
#        "tag_end" and "document_end" to the command to call; events
#        without an entry are ignored.
#
# Each handler is called as: <handler> <callbacks-name> <event> [value...]
#   tag_text:     value = text between two tags
#   tag_comment:  value = text between "<!--" and "-->"
#   tag_begin:    values = tag name, attribute string
#   tag_end:      value = tag name
#   document_end: value = "exit_success"
#
# Prints one newline on stdout at the end.
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    [[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    while IFS='' read -r -N 1 c ; do
        isendtag=false

        if [[ "$c" == "<" ]] ; then
            # flush any text content
            if [[ "$buf" != "" ]] ; then
                [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
                buf=""
            fi

            # first character after '<' decides whether this is an end tag
            IFS='' read -r -N 1 c
            if [[ "$c" == "/" ]] ; then
                isendtag=true
            else
                buf="$c"
            fi
            # read the remainder up to (and consuming) the next '>'
            IFS='' read -r -d '>' c
            buf+="$c"

            # handle comments
            if [[ "$buf" == ~(El)!-- ]] ; then
                # did we read the comment completely ?
                if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
                    # No - the '>' we stopped at was part of the comment
                    # text. Continue until the real terminator "-->" was
                    # read (reading only up to "--" would leave the final
                    # '>' in the input where it would show up as text).
                    buf+=">"
                    while [[ "$buf" != '!--'*'-->' ]] ; do
                        IFS='' read -r -N 1 c || break
                        buf+="$c"
                    done
                    # drop the final '>' so that "buf" ends with "--" like
                    # in the case where the first read got the whole comment
                    if [[ "$buf" == '!--'*'-->' ]] ; then
                        buf="${buf%?}"
                    fi
                fi

                # pass the text between "!--" and "--" to the handler
                [[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
                buf=""
                continue
            fi

            # check if the tag starts and ends at the same time (like "<br />")
            if [[ "${buf}" == ~(Er).*/ ]] ; then
                issingletag=true
                buf="${buf%*/}"
            else
                issingletag=false
            fi

            # check if the tag has attributes (e.g. space after name)
            if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
                namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
                attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
            else
                namebuf="$buf"
                attrbuf=""
            fi

            if ${isendtag} ; then
                [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            else
                [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if ${issingletag} ; then
                    [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
                fi
            fi
            buf=""
        else
            buf+="$c"
        fi
    done

    [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}
227
228# enumerate comments in a shell (or shell-like) script
# Collect the comments of a shell (or shell-like) script.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments; consecutive
#        lines starting with '#' form one comment (leading '#' removed,
#        lines joined with newlines)
#   $3 - maximum number of comments to collect
#
# Returns: always 0
function enumerate_comments_shell
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset line=""
	typeset comment=""

	while (( res == 0 )) ; do
		# A failing read (EOF) still leaves the text of an unterminated
		# last line in "line", therefore the loop body runs once more.
		IFS='' read -r line || (( res=$? ))

		if [[ "${line}" == ~(El)#.* ]] ; then
			comment+="${line#\#}${ch.newline}"
		elif [[ "$comment" != "" ]] ; then
			# first non-comment line after a comment: store the comment
			comment_array[ca++]="${comment}"
			comment=""

			if (( ca >= max_num_comments )) ; then
				break
			fi
		fi
	done <"${input_file}"

	# A comment which ends in the (unterminated) last line of the file
	# has not been stored by the loop above.
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
	fi

	return 0
}
262
263
264# enumerate comments in a troff document
# Collect the comments of a troff document.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments; consecutive
#        lines starting with optional dots followed by \" form one
#        comment (that prefix is removed, lines joined with newlines)
#   $3 - maximum number of comments to collect
#
# Returns: always 0
function enumerate_comments_troff
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset line=""
	typeset comment=""

	while (( res == 0 )) ; do
		# A failing read (EOF) still leaves the text of an unterminated
		# last line in "line", therefore the loop body runs once more.
		IFS='' read -r line || (( res=$? ))

		if [[ "${line}" == ~(El)\.*\\\" ]] ; then
			comment+="${line#~(El)\.*\\\"}${ch.newline}"
		elif [[ "$comment" != "" ]] ; then
			# first non-comment line after a comment: store the comment
			comment_array[ca++]="${comment}"
			comment=""

			if (( ca >= max_num_comments )) ; then
				break
			fi
		fi
	done <"${input_file}"

	# A comment which ends in the (unterminated) last line of the file
	# has not been stored by the loop above.
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
	fi

	return 0
}
298
299
300# enumerate comments in files which are preprocessed by
301# CPP (e.g. C, C++, Imakefile etc.)
# Collect the C (/* ... */) and C++ (// ...) comments of a file.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comment texts (without
#        the comment delimiters). A "//" comment which starts in the
#        same column as a "//" comment in the line before is appended
#        to that previous entry.
#   $3 - maximum number of comments to collect
#   $4 - maximum number of characters to scan; longer files are
#        truncated (with a warning on stderr)
#
# Text inside single- or double-quoted literals is not searched for
# comments.
#
# Returns: 0 on success, 1 if the scan ended inside a comment or inside
# a double-quoted literal (an unterminated single-quoted literal only
# triggers a warning).
function enumerate_comments_cpp
{
	set -o errexit
#	set -o nounset

	integer err=0

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"
	integer ca=0 # index in "comment_array"

	typeset content
	integer content_length

	integer file_pos # file position
	compound line_pos=(
		integer x=0 # X position in line
		integer y=0 # Y position in line (line number)
	)
	typeset c c2

	typeset comment

	compound state=(
		# C comment state
		typeset in_c_comment=false
		# C++ comment state
		compound cxx=(
			typeset in_comment=false
			typeset comment_continued=false
			# position of current //-pos
			compound comment_pos=(
				integer x=-1
				integer y=-1
			)
			# position of previous //-pos
			compound comment_prev_pos=(
				integer x=-1
				integer y=-1
			)
		)
		# literal state
		typeset in_sq_literal=false # single-quote literal
		typeset in_dq_literal=false # double-quote literal
	)

	content="$(< "${input_file}")"

	# Truncate file to "max_filesize_for_scan" characters.
	# This was originally added to work around a performance problem with
	# the ${str:offset:chunksize} operator which scales badly in ksh93
	# version 's' with the number of characters
	if (( ${#content} > max_filesize_for_scan )) ; then
		print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
			"${input_file}" \
			max_filesize_for_scan
		content="${content:0:max_filesize_for_scan}"
	fi
	content_length=${#content}

	# Iterate through the source code. The last character
	# (when file_pos == content_length) will be empty to indicate
	# EOF (this is needed for cases like when
	# a C++ comment is not terminated by a newline... ;-/)
	for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
		# "c" is the current character, "c2" the current plus the next
		# one (used to detect the two-character comment delimiters)
		c2="${content:file_pos:2}"
		c="${c2:0:1}"

		if [[ "$c" == "${ch.newline}" ]] ; then
			(( line_pos.x=0, line_pos.y++ ))
		else
			(( line_pos.x++ ))
		fi

		if ${state.in_c_comment} ; then
			if [[ "$c2" == "*/" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=false

				# flush comment text
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca >= max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.cxx.in_comment} ; then
			if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
				state.cxx.in_comment=false

				# flush comment text
				if ${state.cxx.comment_continued} ; then
					# continuation of the "//" comment in the previous line
					comment_array[ca-1]+="${ch.newline}${comment}"
					(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
					   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				else
					comment_array[ca++]="${comment}"
					(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
					   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				fi
				comment=""

				if (( ca >= max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.in_sq_literal} ; then
			# a quote which follows a backslash does not end the literal
			if [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=false
			fi
		elif ${state.in_dq_literal} ; then
			if [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=false
			fi
		else
			if [[ "$c2" == "/*" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=true
				comment=""
			elif [[ "$c2" == "//" ]] ; then
				(( file_pos++, line_pos.x++ ))
				# same column as the "//" in the line directly above ?
				if (( state.cxx.comment_prev_pos.x == line_pos.x && \
					state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
					state.cxx.comment_continued=true
				else
					state.cxx.comment_continued=false
				fi
				(( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
				state.cxx.in_comment=true
				comment=""
			elif [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=true
			elif [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=true
			fi
		fi
	done

	if [[ "$comment" != "" ]] ; then
		print -u2 "## ERROR: Comment text buffer not empty at EOF."
		err=1
	fi

	if ${state.in_c_comment} ; then
		print -u2 "## ERROR: C comment did not close before EOF."
		err=1
	fi

	if ${state.cxx.in_comment} ; then
		print -u2 "## ERROR: C++ comment did not close before EOF."
		err=1
	fi

	if ${state.in_dq_literal} ; then
		print -u2 "## ERROR: Double-quoted literal did not close before EOF."
		err=1
	fi

	# We treat this one only as warning since things like "foo.html.cpp" may
	# trigger this condition accidentally
	if ${state.in_sq_literal} ; then
		print -u2 "## WARNING: Single-quoted literal did not close before EOF."
	fi

	return $err
}
475
476# determine file type
# Determine the format of a file.
#
# Arguments:
#   $1 - file name
#   $2 - name of the variable which receives the format name (e.g.
#        "c_source", "makefile", "shell", "troff", "html", ...)
#
# Returns: 0 if the format was recognized, 1 if the file is not a plain
# readable file or its format is unknown (the output variable is left
# untouched in that case).
function get_file_format
{
	set -o errexit

	typeset filename="$1"
	nameref file_format="$2"

	typeset fileeval # evaluation result of /usr/bin/file
	typeset detected="" # format found by the current pass ("" = none)

	# check whether "filename" is a plain, readable file
	if [[ ! -f "$filename" || ! -r "$filename" ]] ; then
		return 1
	fi

	# In theory this code would exclusively look at the contents of
	# the file to figure out its file format - unfortunately
	# /usr/bin/file is virtually useless (the heuristics, matching
	# and output unreliable) for many file formats and therefore
	# we have to do a multi-stage approach which looks
	# at the file's content if possible and at the filename
	# otherwise. Fun... ;-(

	# pass one: Find matches for file formats where /usr/bin/file
	# is known to be unreliable:
	case "$filename" in
		*.[ch] | *.cpp | *.cc | *.cxx | *.hxx)	detected="c_source"  ;;
		*Imakefile)				detected="imakefile" ;;
		*Makefile)				detected="makefile"  ;;
	esac
	if [[ "$detected" != "" ]] ; then
		file_format="$detected"
		return 0
	fi

	# pass two: match by file content via /usr/bin/file
	fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
	case "$fileeval" in
		~(E)roff)
			detected="troff"
			;;
		~(E)html\ document)
			detected="html"
			;;
		~(E)sgml\ document)
			detected="sgml"
			;;
		~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
			detected="shell"
			;;
		~(E)executable.*/perl\ script)
			detected="perl"
			;;
	esac
	if [[ "$detected" != "" ]] ; then
		file_format="$detected"
		return 0
	fi

	# pass three: fallback to filename matching
	case "$filename" in
		*.man)		detected="troff"           ;;
		*.html)		detected="html"            ;;
		*.sgml)		detected="sgml"            ;;
		*.xml)		detected="xml"             ;;
		*.png)		detected="image_png"       ;;
		*.xcf)		detected="image_xcf"       ;;
		*.shar)		detected="archive_shell"   ;;
		*.sh)		detected="shell"           ;;
		*.pcf)		detected="font_pcf"        ;;
		*.bdf)		detected="font_bdf"        ;;
		*.pmf)		detected="font_pmf"        ;;
		*.ttf | *.otf)	detected="font_ttf"        ;;
		*.pfa | *.pfb)	detected="font_postscript" ;;
		*)		return 1                   ;;
	esac

	file_format="$detected"
	return 0
}
598
# Create the record for one file: checksums, file format and (if the
# format is supported) the comments found in the file.
#
# Arguments:
#   $1 - name of the associative array of records (indexed by file name)
#   $2 - file name
#   $3 - maximum number of comments to collect per file
#   $4 - maximum number of characters to scan per file
#
# Returns: 1 if the file format could not be determined, otherwise 0
# (also when the comment scanner itself reported a failure; in that case
# "comments_parsed" stays "false").
function extract_comments
{
	set -o errexit

	nameref records="$1"
	typeset filename="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"

	typeset datatype=""
	typeset enumerator="" # name of the comment scanner for "datatype"

	records[${filename}]=(
		typeset filename="$filename"

		typeset fileformat_found="false" # "true" or "false"
		typeset file_format=""

		typeset -A hashsum

		typeset comments_parsed="false" # "true" or "false"
		typeset -a comments
	)

	records[${filename}].hashsum["md5"]="$(sum  -x md5  < "$filename")"
	records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

	if ! get_file_format "$filename" datatype ; then
		return 1
	fi
	records[${filename}].fileformat_found="true"
	records[${filename}].file_format="$datatype"

	# pick the scanner which matches the comment syntax of the format
	case "$datatype" in
		c_source|imakefile)	enumerator="enumerate_comments_cpp"   ;;
		shell|makefile)		enumerator="enumerate_comments_shell" ;;
		troff)			enumerator="enumerate_comments_troff" ;;
		# NOTE: Disabled for now
		#xml|html|sgml)		enumerator="enumerate_comments_xml"   ;;
	esac

	if [[ "$enumerator" != "" ]] ; then
		if "$enumerator" "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} ; then
			records[${filename}].comments_parsed=true
		fi
	fi

	return 0
}
654
655# parse HTTP return code, cookies etc.
# Parse the status line and the headers of a HTTP response read from
# stdin. The input is consumed up to and including the blank line which
# ends the headers, i.e. the body is the next thing to read.
#
# Arguments:
#   $1 - name of a compound variable which receives "statuscode",
#        "statusmsg" and (if the headers are present) "content_type",
#        "content_length" and "transfer_encoding"
#
# Returns: 0 on success, 1 (with a message on stderr) if the status line
# is missing or malformed.
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]]         && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	# at least one digit is required (an empty status code is invalid)
	[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# skip remaining headers
	while IFS='' read -r i ; do
		# The headers end with an empty line; accept a bare LF as well
		# as CRLF, otherwise the loop would swallow the body.
		[[ "$i" == $'\r' || "$i" == "" ]] && break

		# strip '\r' at the end
		i="${i/~(Er)$'\r'/}"

		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Content-Length:[[:blank:]]*[0-9]*)
				integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
				;;
		esac
	done

	return 0
}
690
# Copy a HTTP response body from stdin to stdout.
#
# Arguments:
#   $1 - transfer encoding; "chunked" decodes the chunked transfer
#        coding, any other value copies stdin unchanged
#
# In chunked mode copying stops at the zero-sized last chunk or at the
# first line which is not a valid chunk size; trailers are not read.
#
# Returns: always 0
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	typeset chunkend
	integer chunksize=0

	if [[ "${emode}" == "chunked" ]] ; then
		# Each chunk is "<hex size>[;extensions]\r\n<data>\r\n"
		while IFS=$'\r' read -r hexchunksize ; do
			# ignore chunk extensions
			hexchunksize="${hexchunksize%%;*}"
			[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break

			chunksize=$(( 16#${hexchunksize} ))
			(( chunksize > 0 )) || break

			# bs=1 makes sure exactly "chunksize" bytes are consumed
			dd bs=1 count="${chunksize}" 2>/dev/null

			# Consume the line end which follows the chunk data, otherwise
			# the next size line would be read as an empty string and the
			# loop would stop after the first chunk.
			IFS='' read -r chunkend || break
		done
	else
		cat
	fi

	return 0
}
709
# Write the content of an URL to stdout.
#
# Arguments:
#   $1 - URL; supported are file://<path>, http://<host>[:<port>]/<path>
#        and https://<host>[:<port>]/<path> (https is handled by an
#        "openssl s_client" child process connected via two FIFOs)
#
# Returns: the exit code of cat(1) for file:// URLs; for http/https 0 on
# success and 1 if a part of the URL is missing or the connection could
# not be opened; 1 for all other protocols.
function cat_url
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	if [[ "${protocol}" == "file" ]] ; then
		cat "${path1}"
		return $?
	elif [[ "${protocol}" == ~(Elr)http(|s) ]] ; then
		typeset host="${path1%%/*}"
		typeset path="${path1#*/}"
		typeset port="${host##*:}"
		typeset request # HTTP request text

		integer netfd
		compound httpresponse # http response

		# If URL did not contain a port number in the host part then look at the
		# protocol to get the port number
		if [[ "${port}" == "${host}" ]] ; then
			case "${protocol}" in
				"http")  port=80 ;;
				"https") port=443 ;;
				*)       port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
			esac
		else
			host="${host%:*}"
		fi

		printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

		# prechecks
		[[ "${protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
		[[ "${port}"     != "" ]] || { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
		[[ "${host}"     != "" ]] || { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
		[[ "${path}"     != "" ]] || { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

		# open TCP channel
		if [[ "${protocol}" == "https" ]] ; then
			compound sslfifo
			sslfifo.dir="$(mktemp -d)"
			sslfifo.in="${sslfifo.dir}/in"
			sslfifo.out="${sslfifo.dir}/out"

			# register an EXIT trap and use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			trap "rm -r \"${sslfifo.dir}\"" EXIT
			set -o errexit

			mkfifo "${sslfifo.in}" "${sslfifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${host}:${port}" <"${sslfifo.in}" >>"${sslfifo.out}" &

			# send HTTP request
			# (the "\r\n" sequences are expanded by "print" below)
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93(ssl) (2010-03-27; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >>	"${sslfifo.in}"

			# collect response and send it to stdout
			{
				parse_http_response httpresponse
				cat_http_body "${httpresponse.transfer_encoding}"
			} <"${sslfifo.out}"

			wait || { print -u2 -f "%s: openssl failed.\n" "$0" ; exit 1 ; }

			return 0
		else
			redirect {netfd}<> "/dev/tcp/${host}/${port}"
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${1}" ; return 1 ; }

			# send HTTP request
			# (the "\r\n" sequences are expanded by "print" below)
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93 (2010-03-27; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >&${netfd}

			# collect response and send it to stdout
			parse_http_response httpresponse <&${netfd}
			cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

			# close connection
			redirect {netfd}<&-

			return 0
		fi
	else
		return 1
	fi
	# notreached
}
804
# Print statistics about a set of file records as compound variable
# ("print -v") on stdout.
#
# Arguments:
#   $1 - (optional) name of the associative array of records. If no
#        argument is given the function uses whatever variable named
#        "records" is visible in its own scope.
#
# For each record the fields "comments_parsed", "fileformat_found" and
# "license_info_found" are evaluated; only the value "true" counts as
# true (a missing field counts as false).
#
# Returns: always 0
function print_stats
{
	set -o errexit

	if (( $# > 0 )) ; then
		nameref records="$1"
	fi
	typeset i

	# gather some statistics
	compound stats=(
		integer files_with_comments=0
		integer files_without_comments=0

		integer files_without_known_format=0

		integer files_with_license_info=0
		integer files_without_license_info=0

		integer total_num_files=0
	)

	# Notes:
	# - the order does not matter for counting, therefore the keys are
	#   used directly (this also keeps file names with blanks intact)
	# - "+= 1" is used instead of "++" because (( x++ )) returns a
	#   non-zero exit code if x was 0, which is fatal under "errexit"
	for i in "${!records[@]}" ; do
		if [[ "${records[$i].comments_parsed}" == "true" ]] ; then
			(( stats.files_with_comments += 1 ))
		else
			(( stats.files_without_comments += 1 ))
		fi

		if [[ "${records[$i].fileformat_found}" != "true" ]] ; then
			(( stats.files_without_known_format += 1 ))
		fi

		if [[ "${records[$i].license_info_found}" == "true" ]] ; then
			(( stats.files_with_license_info += 1 ))
		else
			(( stats.files_without_license_info += 1 ))
		fi

		(( stats.total_num_files += 1 ))
	done

	print -v stats
	return 0
}
845
846
# Print all comments which pass the filters, one after another.
#
# Arguments:
#   $1 - name of the associative array of records
#   $2 - name of the options compound variable; used are the shell
#        patterns filepattern.accept/.reject (matched against the file
#        name) and commentpattern.accept/.reject (matched against the
#        comment text). An empty pattern disables the respective test.
#
# Side effect: sets "license_info_found" (true/false) in each record
# which passes the file name filters.
#
# Returns: always 0
function print_comments_plain
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset s       # text of the current comment
	typeset match   # "true" if the current comment should be printed

	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		nameref node=records[$i]

		# file name filters
		if [[ "${options.filepattern.accept}" != "" && "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" && "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if ! "${node.comments_parsed}" ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			s="${node.comments[$j]}"

			# a comment is printed if it matches the "accept" pattern
			# and does not match the "reject" pattern
			match=false
			if [[ "${options.commentpattern.accept}" != "" && "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" && "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if [[ "${match}" == "true" ]] ; then
				printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
				printf "%s\n" "$s"
				node.license_info_found=true
			fi
		done

		if ! "${node.license_info_found}" ; then
			printf "## no match found in '%s'," "${node.filename}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	return 0
}
904
# Print all comments which pass the filters, but print each distinct
# comment only once, followed by the list of files which contain it.
# Comments are treated as identical if the MD5 sum of their "compressed"
# lowercase form (see the pattern below) is identical.
#
# Arguments:
#   $1 - name of the associative array of records
#   $2 - name of the options compound variable; used are the shell
#        patterns filepattern.accept/.reject (matched against the file
#        name) and commentpattern.accept/.reject (matched against the
#        comment text). An empty pattern disables the respective test.
#
# Side effect: sets "license_info_found" (true/false) in each record
# which passes the file name filters.
#
# Returns: always 0
function print_comments_duplicates_compressed
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset s                 # text of the current comment
	typeset match             # "true" if the current comment is accepted
	typeset -l hashstring     # lowercase, compressed comment text
	typeset hash              # MD5 sum of "hashstring"
	typeset -A hashed_comments
	integer num_hashed_comments

	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		nameref node=records[$i]

		# file name filters
		if [[ "${options.filepattern.accept}" != "" && "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" && "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if ! "${node.comments_parsed}" ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			s="${node.comments[$j]}"

			# a comment is used if it matches the "accept" pattern
			# and does not match the "reject" pattern
			match=false
			if [[ "${options.commentpattern.accept}" != "" && "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" && "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if [[ "${match}" == "true" ]] ; then
				# compress the comment (e.g. convert whitespaces and '.,:;()"' to newline characters) ...
				hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
				# ... and then create a MD5 hash from this string
				hash="$(sum -x md5 <<<"${hashstring}")"

				nameref hc_node=hashed_comments[${hash}]

				if [[ "${hc_node}" == "" ]] ; then
					# build node if there isn't one yet
					typeset -a hc_node.fileids
					typeset    hc_node.comment="$s"
				fi

				hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )

				node.license_info_found=true
			fi
		done

		if ! "${node.license_info_found}" ; then
			printf "## no match found in "
			printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	# print comments and all fileids (filename+hash sums) which include this comment
	for i in "${!hashed_comments[@]}" ; do
		printf "\f## The comment (ID=%s) ..." "${i}"
		printf "\n-- snip --"
		printf "\n%s" "${hashed_comments[${i}].comment}"
		printf "\n-- snip --"
		printf "\n... applies to the following files:\n"
		printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array member
	done

	return 0
}
991
# "crawl" sub-command: reads file names (one per line) from stdin,
# extracts the comments of each file and writes the collected records as
# compound variable to "crawlsrccomments_extracted_comments.cpv" in the
# current directory.
#
# Arguments: the sub-command name (dropped) followed by the options
# described by "do_crawl_usage":
#   -S <n>  maximum number of characters scanned per file
#   -N <n>  maximum number of comments collected per file
#
# Returns: always 0 (files which cannot be processed are skipped)
function do_crawl
{
	set -o errexit

	typeset i # current file name

	compound options=(
		integer max_filesize_for_scan=$((256*1024))
		integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
	)

	shift
	while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
		printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			S)	options.max_filesize_for_scan="${OPTARG}"  ;;
			N)	options.max_num_comments="${OPTARG}"  ;;
			*)	usage do_crawl_usage ;;
		esac
	done
	shift $((OPTIND-1))

	compound scan=(
		typeset -A records
	)

	# read filenames from stdin
	# (IFS='' and -r keep leading/trailing blanks and backslashes in
	# the file names intact)
	while IFS='' read -r i ; do
		# an empty line is not a file name
		[[ "$i" == "" ]] && continue

		printf "## scanning %s ...\n" "$i"
		extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
	done

	# print compound variable "scan" (which contains the records array)
	print -v scan >"crawlsrccomments_extracted_comments.cpv"

	print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

	return 0
}
1029
# "getcomments" sub-command: loads a database written by do_crawl (a
# local file or a http/https URL, optionally bzip2- or gzip-compressed)
# and prints the comments which pass the filters.
#
# Arguments: the sub-command name (dropped) followed by the options
# described by "do_getcomments_usage":
#   -c/-C <pattern>  accept/reject pattern for comment texts
#   -l/-L <pattern>  accept/reject pattern for file names
#   -D <file|url>    database to read
#   -S / +S          print statistics: on/off
#   -Z / +Z          print each distinct comment only once: on/off
#
# Returns: 0 on success; exits via fatal_error if the database cannot
# be read.
function do_getcomments
{
	set -o errexit

	# vars
	compound scan
	typeset database
	typeset tmp

	compound options=(
		typeset database="crawlsrccomments_extracted_comments.cpv"

		typeset print_stats=false
		typeset zapduplicates=false
		compound filepattern=(
			typeset accept="*"
			typeset reject=""
		)
		compound commentpattern=(
			typeset accept="~(Ei)(license|copyright)"
			typeset reject=""
		)
	)

	shift
	while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
	#    printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			c)	options.commentpattern.accept="${OPTARG}" ;;
			C)	options.commentpattern.reject="${OPTARG}" ;;
			D)	options.database="${OPTARG}" ;;
			l)	options.filepattern.accept="${OPTARG}" ;;
			L)	options.filepattern.reject="${OPTARG}" ;;
			S)	options.print_stats=true ;;
			+S)	options.print_stats=false ;;
			Z)	options.zapduplicates=true ;;
			+Z)	options.zapduplicates=false ;;
			*)	usage do_getcomments_usage ;;
		esac
	done
	shift $((OPTIND-1))

	# array of temporary files which should be cleaned-up upon exit
	typeset -a tmpfiles
	trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

	# Note: All temporary files are created via mktemp(1); predictable
	# names in /tmp could be abused by other users (e.g. via symlinks).

	# Support for HTTP URLs
	if [[ "${options.database}" == ~(El)(http|https)://.* ]] ; then
		database="$(mktemp "/tmp/extract_license_cat_url.XXXXXX")"
		tmpfiles+=( "${database}" )
		print -u2 "# Loading URL..."
		cat_url "${options.database}" >"${database}"
		print -u2 "# Loading URL done."
	else
		database="${options.database}"
	fi

	if [[ ! -r "${database}" ]] ; then
		fatal_error "Can't read ${database}."
	fi

	# Support for compressed database files
	case "$(LC_ALL=C /usr/bin/file "${database}")" in
		*bzip2*)
			tmp="$(mktemp "/tmp/extract_license_bzcat.XXXXXX")"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (bzip2) ..."
			bzcat <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
		*gzip*)
			tmp="$(mktemp "/tmp/extract_license_gunzip.XXXXXX")"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (gzip) ..."
			gunzip -c <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
	esac

	# Read compound variable which contain all recorded comments
	print -u2 "# reading records..."
	read -C scan <"${database}" || fatal_error 'Error reading data.'
	print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

	# print comments
	print -u2 "# processing data..."
	print "## comments start:"
	if "${options.zapduplicates}" ; then
		print_comments_duplicates_compressed scan.records options
	else
		print_comments_plain scan.records options
	fi
	print "## comments end"
	print -u2 "# processing data done."

	if "${options.print_stats}" ; then
		# hand over the record set by name ("scan" is local to this
		# function and not visible inside other functions)
		print_stats scan.records
	fi

	return 0
}
1133
#
# usage - show the help text of one (sub-)command and terminate.
#
# Arguments: $1 - NAME of the variable which holds the getopts usage
#                 string (handed over by name, not by value)
# Globals:   progname (read), OPTIND and OPT (written)
# Exit:      never returns; the script always ends with status 2
#
function usage
{
	typeset -n usage_text="$1"

	# restart option parsing and let getopts process a synthetic "-?"
	# argument against the selected usage string
	OPTIND=0
	getopts -a "${progname}" "${usage_text}" OPT '-?'

	exit 2
}
1141
# getopts usage/manual string of the "getcomments" sub-command.
# It is handed to getopts while parsing the getcomments options and is
# passed by name to the usage function to print the help text.
typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utility script which extracts
	license information from the "\bgetcomments\b"-database
	file created by \bcrawl\b. The script allows various
	filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
	called \bcrawl\b will scan all source files, extract
	the comments and stores this information in a "database" file called
	"crawlsrccomments_extracted_comments.cpv" and then \bgetcomments\b allows
	queries on this database.]
[D:database?Database file for input (either file, http:// or https://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1165
# getopts usage/manual string of the "crawl" sub-command
# (options: -S/--scanmaxcharacters and -N/--maxnumcomments).
typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bgetcomments\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
	Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum number of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1182
# getopts usage/manual string of the top-level command; the main
# option loop parses against it and any option ends up in the usage output.
typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bgetcomments\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
1199
1200
# program start

# basename is loaded first because the program name is derived with it
builtin basename

# fatal_error prefixes its message with ${progname}, so the name has to
# be known before the first place where fatal_error may be called
typeset progname="${ basename "${0}" ; }"

# Load the remaining utilities as builtins. Only a missing "sum" is
# treated as fatal here; the other attempts are deliberately unchecked
# (errexit is not active yet).
builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."

# exit at the first error we hit
set -o errexit

# The top-level command accepts no options of its own: whatever getopts
# reports (including the help requests) is answered with the usage text.
while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
	# printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		*)	usage crawlsrccomments_usage ;;
	esac
done
shift $((OPTIND-1))

# first remaining argument selects the sub-command
typeset cmd="$1"

# Dispatch to the sub-command. "$@" still contains the sub-command name
# as its first element when it is handed to the handler function.
case "${cmd}" in
	"crawl")
		progname+=" ${cmd}"
		do_crawl "$@"
		exit $?
		;;
	"getcomments")
		progname+=" ${cmd}"
		do_getcomments "$@"
		exit $?
		;;
	*)
		# unknown or missing sub-command
		usage crawlsrccomments_usage
		;;
esac

# every case arm above terminates the script
fatal_error "not reached."
# EOF.
1242