common/scripts/crawlsrccomments.sh

#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g. ',' instead of
# '.' in de_DE.*-locales). This needs to be set _before_ any
# floating-point constants are defined in this script.
#
# If LC_ALL is set, its value is copied to the individual categories
# listed below and LC_ALL itself is unset; LC_NUMERIC is then forced
# to "C" in every case.
if [[ "${LC_ALL}" != "" ]] ; then
    export \
        LC_MONETARY="${LC_ALL}" \
        LC_MESSAGES="${LC_ALL}" \
        LC_COLLATE="${LC_ALL}" \
        LC_CTYPE="${LC_ALL}"
        unset LC_ALL
fi
export LC_NUMERIC=C

# Constant values for tokenizer/parser stuff.
# Read-only compound variable; members are referenced as
# ${ch.newline}, ${ch.tab} and ${ch.formfeed}.
compound -r ch=(
	newline=$'\n'
	tab=$'\t'
	formfeed=$'\f'
)

# Print an error message to stderr, prefixed with the program name,
# and terminate the script with exit status 1.
#
# Arguments: $* - message text (arguments are joined with blanks)
# Globals:   progname (read)
function fatal_error
{
	# printf '%s' emits the text verbatim; "print" without -r/-- would
	# interpret backslash sequences and a leading '-' in the message.
	printf '%s: %s\n' "${progname}" "$*" >&2
	exit 1
}

# Print an informational message to stderr.
#
# Arguments: $* - message text (arguments are joined with blanks)
function printmsg
{
	# printf '%s' keeps the message verbatim (no backslash interpretation,
	# no option parsing when the text starts with '-')
	printf '%s\n' "$*" >&2
}


# Split an XML/HTML attribute string (e.g. 'foo="bar" baz=1 qux') into
# an array of compound entries.
#
# Arguments:
#   $1 - attribute string
#   $2 - name of the array which receives the entries; each entry is a
#        compound with "name" and (if the attribute has one) "value"
#
# Calls fatal_error if more than 1000 entries are produced (this also
# terminates the loop when the input contains characters the attribute
# pattern cannot consume).
function attrstrtoattrarray
{
#set -o xtrace
    typeset s="$1"
    nameref aa=$2 # attribute array
    integer aa_count=0
    typeset nextattr
    # length of the attribute parsed in the previous round; it is carried
    # over into the next iteration where it is used as the start offset
    # for the whitespace skipping
    integer currattrlen=0
    typeset tagstr
    typeset tagval

    while (( ${#s} > 0 )) ; do
        # skip whitespaces (starting behind the previous attribute)
        while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
            (( currattrlen++ ))
        done
        # drop the previous attribute and the whitespace following it
        s="${s:currattrlen:${#s}}"

        # anything left ?
        (( ${#s} == 0 )) && break

        # Pattern tests:
        #x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
        #x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
        #x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\"]*\'}"
        #x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
        # All pattern combined via eregex (w|x|y|z):
        #x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\'|[[:alnum:]_-:]*)}"
        # NOTE(review): the single-quoted alternative excludes '"' rather
        # than "'" from the value - looks unintended, confirm before changing.
        nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\'|[[:alnum:]_-:]*)}"
        currattrlen=$(( ${#s} - ${#nextattr}))

        # add entry
        tagstr="${s:0:currattrlen}"
        if [[ "${tagstr}" == *=* ]] ; then
            # normal case: attribute with value

            tagval="${tagstr#*=}"

            # strip quotes ('' or "")
            if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
                tagval="${tagval:1:${#tagval}-2}"
            fi

            aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
        else
            # special case for HTML where you have something like <foo baz>
            aa[${aa_count}]=( name="${tagstr}" )
        fi
        (( aa_count++ ))
        (( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
    done
}

# Callback for the XML tokenizer which stores comment nodes.
#
# Arguments:
#   $1 - name of the callback table (its "arg_tree" entry names the document tree)
#   $2 - event type (only "tag_comment" is handled here)
#   $3 - event value (comment text)
#   $4 - attribute string
# Uses the variable "stack" which is not defined in this function.
function handle_xml_document
{
#set -o xtrace
    nameref callbacks=${1}
    typeset tag_type="${2}" tag_value="${3}" tag_attributes="${4}"
    nameref doc=${callbacks["arg_tree"]}
    nameref nodepath="${stack.items[stack.pos]}"
    nameref nodesnum="${stack.items[stack.pos]}num"

    # all events except comments are ignored
    if [[ "${tag_type}" == "tag_comment" ]] ; then
        # append a comment node at the current stack position
        nodepath[${nodesnum}]+=(
            typeset tagtype="comment"
            typeset tagvalue="${tag_value}"
        )
        (( nodesnum++ ))
    fi

#    print "xmltok: '${tag_type}' = '${tag_value}'"
}

# Simple XML/HTML tokenizer which reads the document from stdin
# (character by character) and reports events via callback functions.
#
# Arguments:
#   $1 - name of an array of callback function names indexed by event
#        type: "document_start", "tag_text", "tag_begin", "tag_end",
#        "tag_comment" and "document_end". Entries which are empty
#        are skipped.
#
# Each callback is called with the name of the callback table ($1),
# the event type and the event-specific values (text, tag name,
# attribute string, comment text).
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    [[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    # main loop: read one character per iteration
    while IFS='' read -r -N 1 c ; do
        isendtag=false

        if [[ "$c" == "<" ]] ; then
	    # flush any text content
            if [[ "$buf" != "" ]] ; then
                [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
                buf=""
            fi

            # look at the first character after '<' to detect end tags
            IFS='' read -r -N 1 c
            if [[ "$c" == "/" ]] ; then
                isendtag=true
            else
                buf="$c"
            fi
            # read the remainder of the tag up to (and excluding) the next '>'
            IFS='' read -r -d '>' c
            buf+="$c"

	    # handle comments
	    if [[ "$buf" == ~(El)!-- ]] ; then
	        # did we read the comment completely ?
	        if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
		    # the '>' which ended the read above was part of the
		    # comment text - put it back and continue reading
		    # until the buffer ends with "--"
		    buf+=">"
	            while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
		        IFS='' read -r -N 1 c || break
		        buf+="$c"
		    done
		fi

		# pass the comment text without the leading "!--" and trailing "--"
		[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
		buf=""
		continue
	    fi

	    # check if the tag starts and ends at the same time (like "<br />")
	    if [[ "${buf}" == ~(Er).*/ ]] ; then
	        issingletag=true
		buf="${buf%*/}"
	    else
	        issingletag=false
	    fi

	    # check if the tag has attributes (e.g. space after name)
	    if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
	        namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
                attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
            else
	        namebuf="$buf"
		attrbuf=""
	    fi

            if ${isendtag} ; then
                [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            else
                [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if ${issingletag} ; then
                    [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
                fi
            fi
            buf=""
        else
            # plain text: collect until the next '<'
            buf+="$c"
        fi
    done

    [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}

# enumerate comments in a shell (or shell-like) script
#
# Consecutive lines which start with '#' (in the first column) form
# one comment; the leading '#' of each line is removed and the lines
# are joined with newlines.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - maximum number of comments to collect
# Globals:   ch.newline (read)
# Returns:   always 0
function enumerate_comments_shell
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0 # exit code of the last "read"

	typeset comment=""
	typeset line=""

	while (( res == 0 && ca < max_num_comments )) ; do
		# at EOF "read" fails but "line" may still contain the
		# content of an unterminated last line
		IFS='' read -r line || res=$?

		if [[ "${line}" == ~(El)#.* ]] ; then
			comment+="${line#\#}${ch.newline}"

			# Keep collecting unless EOF was reached; at EOF the
			# pending comment must still be flushed below
			if (( res == 0 )) ; then
				continue
			fi
		fi

		# flush comment text
		if [[ "$comment" != "" ]] ; then
			comment_array[ca++]="${comment}"
			comment=""
		fi
	done <"${input_file}"

	return 0
}


# enumerate comments in a troff document
#
# Consecutive lines which start with the troff comment marker (\"
# optionally preceded by dots, e.g. .\") form one comment; the marker
# is removed from each line and the lines are joined with newlines.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - maximum number of comments to collect
# Globals:   ch.newline (read)
# Returns:   always 0
function enumerate_comments_troff
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0 # exit code of the last "read"

	typeset comment=""
	typeset line=""

	while (( res == 0 && ca < max_num_comments )) ; do
		# at EOF "read" fails but "line" may still contain the
		# content of an unterminated last line
		IFS='' read -r line || res=$?

		if [[ "${line}" == ~(El)\.*\\\" ]] ; then
			comment+="${line#~(El)\.*\\\"}${ch.newline}"

			# Keep collecting unless EOF was reached; at EOF the
			# pending comment must still be flushed below
			if (( res == 0 )) ; then
				continue
			fi
		fi

		# flush comment text
		if [[ "$comment" != "" ]] ; then
			comment_array[ca++]="${comment}"
			comment=""
		fi
	done <"${input_file}"

	return 0
}


# enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.)
#
# The file content is scanned character by character with a small
# state machine which tracks C comments (/* ... */), C++ comments
# (// ...) and single-/double-quoted literals. C++ comments on
# directly following lines which start in the same column are merged
# into one comment entry.
#
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - maximum number of comments to collect
#   $4 - maximum number of characters of the file which are scanned
# Globals:   ch.newline (read)
# Returns:   0 on success, 1 if the scan ended inside a comment or
#            inside a double-quoted literal (or with unflushed text)
function enumerate_comments_cpp
{
	set -o errexit
#	set -o nounset

	integer err=0

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"
	integer ca=0 # index in "comment_array"

	typeset content
	integer content_length

	integer file_pos # file position
	compound line_pos=(
		integer x=0 # X position in line
		integer y=0 # Y position in line (line number)
	)
	typeset c c2

	typeset comment

	# quote escape detection (see main loop)
	typeset quote_escaped=false
	integer num_backslashes=0

	compound state=(
		# C comment state
		typeset in_c_comment=false
		# C++ comment state
		compound cxx=(
			typeset in_comment=false
			typeset comment_continued=false
			# position of current //-pos
			compound comment_pos=(
				integer x=-1
				integer y=-1
			)
			# position of previous //-pos
			compound comment_prev_pos=(
				integer x=-1
				integer y=-1
			)
		)
		# literal state
		typeset in_sq_literal=false # single-quote literal
		typeset in_dq_literal=false # double-quote literal
	)

	content="$(< "${input_file}")"

	# Truncate file to "max_filesize_for_scan" characters.
	# This was originally added to work around a performance problem with
	# the ${str:offset:chunksize} operator which scales badly in ksh93
	# version 's' with the number of characters
	if (( ${#content} > max_filesize_for_scan )) ; then
		# note: the variable name is passed for the "%d" argument as in
		# the original code
		print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
			"${input_file}" \
			max_filesize_for_scan
		content="${content:0:max_filesize_for_scan}"
	fi
	content_length=${#content}

	# Iterate through the source code. The last character
	# (when file_pos == content_length) will be empty to indicate
	# EOF (this is needed for cases like when
	# a C++ comment is not terminated by a newline... ;-/)
	for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
		c2="${content:file_pos:2}"
		c="${c2:0:1}"

		if [[ "$c" == "${ch.newline}" ]] ; then
			(( line_pos.x=0, line_pos.y++ ))
		else
			(( line_pos.x++ ))
		fi

		# A quote character is escaped if it is preceded by an odd
		# number of backslashes (e.g. the 2nd quote in '\'' is escaped
		# but the last quote in '\\' is not). The position check
		# prevents negative offsets at the beginning of the file.
		quote_escaped=false
		if [[ "$c" == "'" || "$c" == '"' ]] ; then
			num_backslashes=0
			while (( (file_pos-1-num_backslashes) >= 0 )) && \
				[[ "${content:file_pos-1-num_backslashes:1}" == '\' ]] ; do
				(( num_backslashes+=1 ))
			done
			if (( (num_backslashes % 2) == 1 )) ; then
				quote_escaped=true
			fi
		fi

		if ${state.in_c_comment} ; then
			if [[ "$c2" == "*/" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=false

				# flush comment text
				comment_array[ca++]="${comment}"
				comment=""
			else
				comment+="$c"
			fi
		elif ${state.cxx.in_comment} ; then
			if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
				state.cxx.in_comment=false

				# flush comment text (a continued comment is
				# appended to the previous entry)
				if ${state.cxx.comment_continued} ; then
					comment_array[ca-1]+="${ch.newline}${comment}"
				else
					comment_array[ca++]="${comment}"
				fi
				(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
				   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				comment=""
			else
				comment+="$c"
			fi
		elif ${state.in_sq_literal} ; then
			if [[ "$c" == "'" ]] && ! ${quote_escaped} ; then
				state.in_sq_literal=false
			fi
		elif ${state.in_dq_literal} ; then
			if [[ "$c" == '"' ]] && ! ${quote_escaped} ; then
				state.in_dq_literal=false
			fi
		else
			if [[ "$c2" == "/*" ]] ; then
				# The limit is checked when a new comment starts
				# so that exactly "max_num_comments" entries are
				# collected
				if (( ca >= max_num_comments )) ; then
					break
				fi

				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=true
				comment=""
			elif [[ "$c2" == "//" ]] ; then
				(( file_pos++, line_pos.x++ ))
				# same column as the "//" in the previous line ?
				if (( state.cxx.comment_prev_pos.x == line_pos.x && \
					state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
					state.cxx.comment_continued=true
				else
					state.cxx.comment_continued=false

					# A continuation line extends the last
					# entry and is therefore still accepted
					# when the limit is reached; a new
					# comment is not.
					if (( ca >= max_num_comments )) ; then
						break
					fi
				fi
				(( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
				state.cxx.in_comment=true
				comment=""
			elif [[ "$c" == "'" ]] && ! ${quote_escaped} ; then
				state.in_sq_literal=true
			elif [[ "$c" == '"' ]] && ! ${quote_escaped} ; then
				state.in_dq_literal=true
			fi
		fi
	done

	if [[ "$comment" != "" ]] ; then
		print -u2 "## ERROR: Comment text buffer not empty at EOF."
		err=1
	fi

	if ${state.in_c_comment} ; then
		print -u2 "## ERROR: C comment did not close before EOF."
		err=1
	fi

	if ${state.cxx.in_comment} ; then
		print -u2 "## ERROR: C++ comment did not close before EOF."
		err=1
	fi

	if ${state.in_dq_literal} ; then
		print -u2 "## ERROR: Double-quoted literal did not close before EOF."
		err=1
	fi

	# We treat this one only as warning since things like "foo.html.cpp" may
	# trigger this condition accidentally
	if ${state.in_sq_literal} ; then
		print -u2 "## WARNING: Single-quoted literal did not close before EOF."
	fi

	return $err
}

# determine file type
#
# Arguments:
#   $1 - filename
#   $2 - name of the variable which receives the file format string
#        (e.g. "c_source", "shell", "troff", "makefile", ...)
# Returns:   0 if a format was found, 1 if the file is not a plain,
#            readable file or the format could not be determined
function get_file_format
{
	set -o errexit

	typeset filename="$1"
	nameref file_format="$2"

	typeset fileeval # evaluation result of /usr/bin/file

	# check whether "filename" is a plain, readable file
	if [[ ! -f "$filename" || ! -r "$filename" ]] ; then
		return 1
	fi

	# In theory this code would exclusively look at the contents of
	# the file to figure out its file format - unfortunately
	# /usr/bin/file is virtually useless (the heuristics, matching
	# and output unreliable) for many file formats and therefore
	# we have to do a multi-stage approach which looks
	# at the file's content if possible and at the filename
	# otherwise. Fun... ;-(

	# pass one: Find matches for file formats where /usr/bin/file
	# is known to be unreliable:
	case "$filename" in
		*.[ch] | *.cpp | *.cc | *.cxx | *.hxx | *.hpp | *.hh)
			file_format="c_source"
			return 0
			;;
		*Imakefile)
			file_format="imakefile"
			return 0
			;;
		# matches "Makefile", "makefile", "GNUmakefile" etc.
		# (must come after the "Imakefile" pattern)
		*[Mm]akefile)
			file_format="makefile"
			return 0
			;;
	esac

	# pass two: match by file content via /usr/bin/file
	# (the ~(E) patterns are extended regular expressions which are
	# matched against the output of /usr/bin/file)
	fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
	case "$fileeval" in
		~(E)roff)
			file_format="troff"
			return 0
			;;
		~(E)html\ document)
			file_format="html"
			return 0
			;;
		~(E)sgml\ document)
			file_format="sgml"
			return 0
			;;
		~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
			file_format="shell"
			return 0
			;;
		~(E)executable.*/perl\ script)
			file_format="perl"
			return 0
			;;
	esac

	# pass three: fallback to filename matching
	case "$filename" in
		*.man)
			file_format="troff"
			return 0
			;;
		*.html)
			file_format="html"
			return 0
			;;
		*.sgml)
			file_format="sgml"
			return 0
			;;
		*.xml)
			file_format="xml"
			return 0
			;;
		*.png)
			file_format="image_png"
			return 0
			;;
		*.xcf)
			file_format="image_xcf"
			return 0
			;;
		*.shar)
			file_format="archive_shell"
			return 0
			;;
		*.sh)
			file_format="shell"
			return 0
			;;
		*.pcf)
			file_format="font_pcf"
			return 0
			;;
		*.bdf)
			file_format="font_bdf"
			return 0
			;;
		*.pmf)
			file_format="font_pmf"
			return 0
			;;
		*.ttf | *.otf)
			file_format="font_ttf"
			return 0
			;;
		*.pfa | *.pfb)
			file_format="font_postscript"
			return 0
			;;
	esac

	# unknown file format
	return 1
}

# Create the record for one file (hash sums, file format) and fill it
# with the comments found in that file.
#
# Arguments:
#   $1 - name of the associative array of records (indexed by filename)
#   $2 - filename
#   $3 - maximum number of comments
#   $4 - maximum number of characters to scan
# Returns:   1 if the file format could not be determined, otherwise 0
function extract_comments
{
	set -o errexit

	nameref records="$1"
	typeset filename="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"

	typeset datatype=""
	typeset enumerator="" # name of the matching enumerate_comments_*() function

	# create an empty record for this file
	records[${filename}]=(
		typeset filename="$filename"

		typeset fileformat_found="false" # "true" or "false"
		typeset file_format=""

		typeset -A hashsum

		typeset comments_parsed="false" # "true" or "false"
		typeset -a comments
	)

	records[${filename}].hashsum["md5"]="$(sum  -x md5  < "$filename")"
	records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

	# without a known file format there is nothing more to do
	if ! get_file_format "$filename" datatype ; then
		return 1
	fi
	records[${filename}].fileformat_found="true"
	records[${filename}].file_format="$datatype"

	# pick the comment enumerator for this file format
	case "$datatype" in
		c_source|imakefile)
			enumerator="enumerate_comments_cpp"
			;;
		shell|makefile)
			enumerator="enumerate_comments_shell"
			;;
		troff)
			enumerator="enumerate_comments_troff"
			;;
		# NOTE: Disabled for now
		#xml|html|sgml)
		#	enumerator="enumerate_comments_xml"
		#	;;
	esac

	if [[ "${enumerator}" != "" ]] ; then
		if "${enumerator}" "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} ; then
			records[${filename}].comments_parsed=true
		fi
	fi

	return 0
}

# parse HTTP return code, cookies etc.
#
# Reads the HTTP status line and the header lines from stdin (up to
# and including the empty line which terminates the header block).
#
# Arguments:
#   $1 - name of the compound variable which receives "statuscode",
#        "statusmsg" and - if the matching headers are present -
#        "content_type", "content_length" and "transfer_encoding"
# Returns:   0 on success, 1 if the status line is malformed
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i
	typeset value # value part of the current header line

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]]         && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	# the status code must have at least one digit
	[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# skip remaining headers
	while IFS='' read -r i ; do
		# strip '\r' at the end
		i="${i%$'\r'}"

		# an empty line (with or without '\r') ends the header block
		[[ "$i" == "" ]] && break

		# value = text after the first ':' without leading blanks
		value="${i#*:}"
		value="${value##+([[:blank:]])}"

		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${value}"
				;;
			~(Eli)Content-Length:[[:blank:]]*[0-9]*)
				integer response.content_length="${value}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${value}"
				;;
		esac
	done

	return 0
}

# Copy a HTTP message body from stdin to stdout.
#
# Arguments:
#   $1 - transfer encoding; "chunked" enables decoding of the chunked
#        transfer coding, any other value copies stdin unmodified
# Returns:   always 0
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	typeset crlf=""
	typeset -i chunksize=0

	if [[ "${emode}" != "chunked" ]] ; then
		cat
		return 0
	fi

	# Each chunk consists of "<hex size>[;extensions]\r\n<data>\r\n";
	# a chunk with size 0 terminates the body.
	while IFS='' read -r hexchunksize ; do
		hexchunksize="${hexchunksize%$'\r'}"	# strip '\r' at the end
		hexchunksize="${hexchunksize%%;*}"	# strip chunk extensions

		[[ "${hexchunksize}" == +([0-9a-fA-F]) ]] || break
		(( (chunksize = 16#${hexchunksize}) > 0 )) || break

		dd bs=1 count="${chunksize}" 2>/dev/null

		# consume the "\r\n" which follows the chunk data, otherwise
		# the next size line would be read as an empty line
		IFS='' read -r crlf || break
	done

	return 0
}

# Write the content of a URL to stdout.
#
# Supported protocols are "file", "http" (via /dev/tcp) and "https"
# (via an "openssl s_client" child process connected through FIFOs).
#
# Arguments:
#   $1 - URL (e.g. "http://foo.bat.net/x/y.html")
# Returns:   0 on success, non-zero on failure or unsupported protocol
function cat_url
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"
	typeset request=""

	if [[ "${protocol}" == "file" ]] ; then
		cat "${path1}"
		return $?
	elif [[ "${protocol}" == ~(Elr)http(|s) ]] ; then
		typeset host="${path1%%/*}"
		typeset path=""
		typeset port="${host##*:}"

		integer netfd
		compound httpresponse # http response

		# Path is everything after the first '/'; an URL without a path
		# (e.g. "http://host" or "http://host/") requests the root ("/")
		if [[ "${path1}" == */* ]] ; then
			path="${path1#*/}"
		fi

		# If URL did not contain a port number in the host part then look at the
		# protocol to get the port number
		if [[ "${port}" == "${host}" ]] ; then
			case "${protocol}" in
				"http")  port=80 ;;
				"https") port=443 ;;
				*)       port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
			esac
		else
			host="${host%:*}"
		fi

		printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

		# prechecks (an empty path is valid, see above)
		[[ "${protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
		[[ "${port}"     != "" ]] || { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
		[[ "${host}"     != "" ]] || { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }

		# open TCP channel
		if [[ "${protocol}" == "https" ]] ; then
			compound sslfifo
			sslfifo.dir="$(mktemp -d)"
			sslfifo.in="${sslfifo.dir}/in"
			sslfifo.out="${sslfifo.dir}/out"

			# register an EXIT trap and use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			trap "rm -r \"${sslfifo.dir}\"" EXIT
			set -o errexit

			mkfifo "${sslfifo.in}" "${sslfifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${host}:${port}" <"${sslfifo.in}" >>"${sslfifo.out}" &

			# send HTTP request
			# (the "\r\n" sequences are expanded by "print" below)
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93(ssl) (2010-03-27; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >>	"${sslfifo.in}"

			# collect response and send it to stdout
			{
				parse_http_response httpresponse
				cat_http_body "${httpresponse.transfer_encoding}"
			} <"${sslfifo.out}"

			wait || { print -u2 -f "%s: openssl failed.\n" "$0" ; exit 1 ; }

			return 0
		else
			redirect {netfd}<> "/dev/tcp/${host}/${port}"
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${1}" ; return 1 ; }

			# send HTTP request
			# (the "\r\n" sequences are expanded by "print" below)
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93 (2010-03-27; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >&${netfd}

			# collect response and send it to stdout
			parse_http_response httpresponse <&${netfd}
			cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

			# close connection
			redirect {netfd}<&-

			return 0
		fi
	else
		return 1
	fi
	# notreached
}

# Print statistics about the records (number of files with/without
# comments, known file format and license information) to stdout.
#
# Arguments:
#   $1 - (optional) name of the associative array of records; without
#        this argument a variable called "records" must be visible
# Returns:   always 0
function print_stats
{
	set -o errexit

	if (( $# > 0 )) ; then
		nameref records="$1"
	fi

	typeset i

	# gather some statistics
	compound stats=(
		integer files_with_comments=0
		integer files_without_comments=0

		integer files_without_known_format=0

		integer files_with_license_info=0
		integer files_without_license_info=0

		integer total_num_files=0
	)

	# The flags are compared as strings (instead of being executed as
	# "true"/"false" commands) since the records may come from an
	# external database file.
	for i in "${!records[@]}" ; do
		if [[ "${records[$i].comments_parsed}" == "true" ]] ; then
			(( stats.files_with_comments+=1 ))
		else
			(( stats.files_without_comments+=1 ))
		fi

		if [[ "${records[$i].fileformat_found}" != "true" ]] ; then
			(( stats.files_without_known_format+=1 ))
		fi

		if [[ "${records[$i].license_info_found}" == "true" ]] ; then
			(( stats.files_with_license_info+=1 ))
		else
			(( stats.files_without_license_info+=1 ))
		fi

		(( stats.total_num_files+=1 ))
	done

	print -v stats
	return 0
}


# Print all comments which match the comment patterns, one block per
# comment, for all files which match the file patterns.
#
# Arguments:
#   $1 - name of the associative array of records (indexed by filename)
#   $2 - name of the options compound variable (uses
#        filepattern.accept/reject and commentpattern.accept/reject)
#
# Side effect: sets "license_info_found" in each processed record.
# Returns:   always 0
function print_comments_plain
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j

	# Iterate over the sorted filenames; the list is read line by line
	# to keep filenames with blanks intact
	while IFS='' read -r i ; do
		# (an empty list results in one empty line)
		[[ "$i" != "" ]] || continue

		nameref node="records[$i]"

		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		# Flags from the records are compared as strings (instead of
		# being executed) since the database may come from an
		# external source
		if [[ "${node.comments_parsed}" != "true" ]] ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# "reject" overrides "accept"
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if "${match}" ; then
				printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
				printf "%s\n" "$s"
				node.license_info_found=true
			fi
		done

		if [[ "${node.license_info_found}" != "true" ]] ; then
			printf "## no match found in '%s'," "${node.filename}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done <<<"$(printf "%s\n" "${!records[@]}" | sort)"

	return 0
}

# Print all comments which match the comment patterns; similar
# comments are combined and printed once, followed by the list of
# files (filename + hash sums) which contain them.
#
# Arguments:
#   $1 - name of the associative array of records (indexed by filename)
#   $2 - name of the options compound variable (uses
#        filepattern.accept/reject and commentpattern.accept/reject)
#
# Side effect: sets "license_info_found" in each processed record.
# Returns:   always 0
function print_comments_duplicates_compressed
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset hash
	typeset -A hashed_comments
	integer num_hashed_comments

	# Iterate over the sorted filenames; the list is read line by line
	# to keep filenames with blanks intact
	while IFS='' read -r i ; do
		# (an empty list results in one empty line)
		[[ "$i" != "" ]] || continue

		nameref node="records[$i]"

		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		# Flags from the records are compared as strings (instead of
		# being executed) since the database may come from an
		# external source
		if [[ "${node.comments_parsed}" != "true" ]] ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# "reject" overrides "accept"
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi


			if "${match}" ; then
				typeset -l hashstring # lowercase

				# compress the comment (e.g. convert whitespaces and '.,:;()"' to newline characters) ...
				hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
				# ... and then create a MD5 hash from this string
				hash="$(sum -x md5 <<<"${hashstring}")"

				nameref hc_node=hashed_comments[${hash}]

				if [[ "${hc_node}" == "" ]] ; then
					# build node if there isn't one yet
					typeset -a hc_node.fileids
					typeset    hc_node.comment="$s"
				fi

				hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )

				node.license_info_found=true
			fi
		done

		if [[ "${node.license_info_found}" != "true" ]] ; then
			printf "## no match found in "
			printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done <<<"$(printf "%s\n" "${!records[@]}" | sort)"

	# print comments and all fileids (filename+hash sums) which include this comment
	for i in "${!hashed_comments[@]}" ; do
		printf "\f## The comment (ID=%s) ..." "${i}"
		printf "\n-- snip --"
		printf "\n%s" "${hashed_comments[${i}].comment}"
		printf "\n-- snip --"
		printf "\n... applies to the following files:\n"
		printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array member
	done

	return 0
}

# "crawl" sub-command: read a list of filenames from stdin (one per
# line), extract the comments of each file and write the collected
# records to "crawlsrccomments_extracted_comments.cpv".
#
# Arguments: the command line of the script, starting with the
#            sub-command name (which is skipped)
# Returns:   always 0
function do_crawl
{
	set -o errexit

	typeset i

	compound options=(
		integer max_filesize_for_scan=$((256*1024))
		integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
	)

	shift
	while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
		printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			S)	options.max_filesize_for_scan="${OPTARG}"  ;;
			N)	options.max_num_comments="${OPTARG}"  ;;
			*)	usage do_crawl_usage ;;
		esac
	done
	shift $((OPTIND-1))

	compound scan=(
		typeset -A records
	)

	# read filenames from stdin
	# (IFS='' and -r keep blanks and backslashes in filenames intact)
	while IFS='' read -r i ; do
		# skip empty lines
		[[ "$i" != "" ]] || continue

		printf "## scanning %s ...\n" "$i"
		# failures for single files are not fatal
		extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
	done

	# print compound variable array (we strip the "typeset -A records" for now)
	print -v scan >"crawlsrccomments_extracted_comments.cpv"

	print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

	return 0
}

# "getcomments" sub-command: load the database written by the "crawl"
# sub-command (plain file, bzip2/gzip compressed file or http(s) URL)
# and print the comments which match the given patterns.
#
# Arguments: the command line of the script, starting with the
#            sub-command name (which is skipped)
# Returns:   always 0 (errors terminate the script)
function do_getcomments
{
	set -o errexit

	# vars
	compound scan
	typeset database
	typeset tmp

	compound options=(
		typeset database="crawlsrccomments_extracted_comments.cpv"

		typeset print_stats=false
		typeset zapduplicates=false
		compound filepattern=(
			typeset accept="*"
			typeset reject=""
		)
		compound commentpattern=(
			typeset accept="~(Ei)(license|copyright)"
			typeset reject=""
		)
	)

	shift
	while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
	#    printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			c)	options.commentpattern.accept="${OPTARG}" ;;
			C)	options.commentpattern.reject="${OPTARG}" ;;
			D)	options.database="${OPTARG}" ;;
			l)	options.filepattern.accept="${OPTARG}" ;;
			L)	options.filepattern.reject="${OPTARG}" ;;
			S)	options.print_stats=true ;;
			+S)	options.print_stats=false ;;
			Z)	options.zapduplicates=true ;;
			+Z)	options.zapduplicates=false ;;
			*)	usage do_getcomments_usage ;;
		esac
	done
	shift $((OPTIND-1))

	# array of temporary files which should be cleaned-up upon exit
	typeset -a tmpfiles
	trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

	# Support for HTTP URLs
	# (temporary files are created via mktemp to avoid predictable
	# filenames in a world-writable directory)
	if [[ "${options.database}" == ~(El)(http|https)://.* ]] ; then
		database="$(mktemp)"
		tmpfiles+=( "${database}" )
		print -u2 "# Loading URL..."
		cat_url "${options.database}" >"${database}"
		print -u2 "# Loading URL done."
	else
		database="${options.database}"
	fi

	if [[ ! -r "${database}" ]] ; then
		fatal_error "Can't read ${database}."
	fi

	# Support for compressed database files
	case "$(LC_ALL=C /usr/bin/file "${database}")" in
		*bzip2*)
			tmp="$(mktemp)"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (bzip2) ..."
			bzcat <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
		*gzip*)
			tmp="$(mktemp)"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (gzip) ..."
			gunzip -c <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
	esac

	# Read compound variable which contain all recorded comments
	print -u2 "# reading records..."
	read -C scan <"${database}" || fatal_error 'Error reading data.'
	print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

	# print comments
	print -u2 "# processing data..."
	print "## comments start:"
	if "${options.zapduplicates}" ; then
		print_comments_duplicates_compressed scan.records options
	else
		print_comments_plain scan.records options
	fi
	print "## comments end"
	print -u2 "# processing data done."

	if "${options.print_stats}" ; then
		# pass the record array by name ("scan" is local to this function)
		print_stats scan.records
	fi

	return 0
}

# Print the usage message for the given getopts option string and
# terminate the script with exit status 2.
#
# Arguments:
#   $1 - name of the variable which holds the getopts usage string
# Globals:   progname (read), OPTIND/OPT (written)
function usage
{
	typeset -n usagemsg="$1"

	# restart option processing and let getopts handle '-?'
	OPTIND=0
	getopts -a "${progname}" "${usagemsg}" OPT '-?'
	exit 2
}

# getopts option/usage specification for the "getcomments" sub-command
typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utility script which extracts
	license information from the "\bgetcomments\b"-database
	file created by \bcrawl\b. The script allows various
	filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
	called \bcrawl\b will scan all source files, extract
	the comments and stores this information in a "database" file called
	"crawlsrccomments_extracted_comments.cpv" and then \bextract_license\b allows
	queries on this database.]
[D:database?Database file for input (either file, http:// or https://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

# getopts option/usage specification for the "crawl" sub-command
typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bextract_license\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
	Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum number of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

# getopts usage specification for the main command (sub-command selection)
typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bextract_license\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'


# program start

# Load the utilities below as shell builtins.
# "sum" is required (it is used to calculate the md5/sha1 hash sums),
# so the script stops if that builtin is not available.
builtin basename
builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."

# exit at the first error we hit
set -o errexit

typeset progname="${ basename "${0}" ; }"

# The main command has no options of its own; every option which is
# returned by getopts here results in the usage message.
while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
	# printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		*)	usage crawlsrccomments_usage ;;
	esac
done
shift $((OPTIND-1))

# the first remaining argument selects the sub-command
typeset cmd="$1"

# Dispatch to the sub-command; all arguments (including the sub-command
# name itself) are passed along and the sub-command name is appended
# to "progname".
case "$cmd" in
	"crawl")
		progname+=" ${cmd}"
		do_crawl "$@"
		exit $?
		;;
	"getcomments")
		progname+=" ${cmd}"
		do_getcomments "$@"
		exit $?
		;;
	*)
		usage crawlsrccomments_usage
		;;
esac

fatal_error "not reached."
# EOF.