#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g. ',' instead of
# '.' in de_DE.*-locales). This needs to be set _before_ any
# floating-point constants are defined in this script.
if [[ "${LC_ALL}" != "" ]] ; then
	# LC_ALL overrides every LC_* category, so copy it into the
	# individual categories and drop it; otherwise the LC_NUMERIC
	# setting below would have no effect.
	export \
		LC_MONETARY="${LC_ALL}" \
		LC_MESSAGES="${LC_ALL}" \
		LC_COLLATE="${LC_ALL}" \
		LC_CTYPE="${LC_ALL}"
	unset LC_ALL
fi
export LC_NUMERIC=C

# constants values for tokenizer/parser stuff
compound -r ch=(
	newline=$'\n'
	tab=$'\t'
	formfeed=$'\f'
)

# Print "<progname>: <message>" to stderr and terminate the script
# with exit code 1.
function fatal_error
{
	print -u2 "${progname}: $*"
	exit 1
}

# Print a message to stderr.
function printmsg
{
	print -u2 "$*"
}

# Split an attribute string (name=value pairs, optionally quoted with
# '' or "", or bare names) into an array of compound entries.
#   $1 - attribute string
#   $2 - name of the array variable which receives the entries; each
#        entry has a "name" field and (if a value was given) a "value"
#        field
function attrstrtoattrarray
{
#set -o xtrace
	typeset s="$1"
	nameref aa=$2 # attribute array
	integer aa_count=0
	typeset nextattr
	integer currattrlen=0
	typeset tagstr
	typeset tagval

	while (( ${#s} > 0 )) ; do
		# skip whitespaces
		# (currattrlen still holds the length of the attribute
		# handled in the previous iteration, so this also drops
		# that attribute from the front of "s")
		while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
			(( currattrlen++ ))
		done
		s="${s:currattrlen:${#s}}"

		# anything left ?
		(( ${#s} == 0 )) && break

		# Pattern tests:
		#x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
		#x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
		#x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\']*\'}"
		#x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
		# All pattern combined via eregex (w|x|y|z):
		#x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\')}"
		#
		# The single-quoted alternative must stop at the next single
		# quote; excluding only '"' there lets the longest-prefix
		# match run across several single-quoted attributes.
		nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\'|[[:alnum:]_-:]*)}"
		currattrlen=$(( ${#s} - ${#nextattr}))

		# add entry
		tagstr="${s:0:currattrlen}"
		if [[ "${tagstr}" == *=* ]] ; then
			# normal case: attribute with value

			tagval="${tagstr#*=}"

			# strip quotes ('' or "")
			if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
				tagval="${tagval:1:${#tagval}-2}"
			fi

			aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
		else
			# special case for HTML where an attribute may be
			# given as a bare name without a value
			aa[${aa_count}]=( name="${tagstr}" )
		fi
		(( aa_count++ ))
		(( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
	done

	# do not leak the status of the last loop test to the caller
	return 0
}

# XML document handler
# Callback for xml_tok: stores comment tags in the current tree node.
#   $1 - name of the callbacks array
#   $2 - tag type (only "tag_comment" is handled)
#   $3 - tag value
#   $4 - tag attributes (stored locally, not used further)
# NOTE(review): "stack" is not defined anywhere in this file as far as
# visible here - presumably the caller has to provide it; confirm.
function handle_xml_document
{
#set -o xtrace
	nameref callbacks=${1}
	typeset tag_type="${2}"
	typeset tag_value="${3}"
	typeset tag_attributes="${4}"

	nameref doc=${callbacks["arg_tree"]}

	nameref nodepath="${stack.items[stack.pos]}"
	nameref nodesnum="${stack.items[stack.pos]}num"

	case "${tag_type}" in
		tag_comment)
			nodepath[${nodesnum}]+=(
				typeset tagtype="comment"
				typeset tagvalue="${tag_value}"
			)
			(( nodesnum++ ))
			;;
	esac

#	print "xmltok: '${tag_type}' = '${tag_value}'"
}

# Simple XML/HTML tokenizer: reads the document from stdin and calls
# the handlers registered in the callbacks array for the events
# "document_start", "tag_text", "tag_comment", "tag_begin", "tag_end"
# and "document_end". Each handler gets the callbacks array name, the
# event name and the event-specific arguments.
#   $1 - name of the associative array with the callback functions
function xml_tok
{
	typeset buf=""
	typeset namebuf=""
	typeset attrbuf=""
	typeset c=""
	typeset isendtag # bool: true/false
	typeset issingletag # bool: true/false (used for tags which end with "/")
	nameref callbacks=${1}

	[[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

	while IFS='' read -r -N 1 c ; do
		isendtag=false

		if [[ "$c" == "<" ]] ; then
			# flush any text content
			if [[ "$buf" != "" ]] ; then
				[[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
				buf=""
			fi

			# a '/' directly after '<' marks an end tag
			IFS='' read -r -N 1 c
			if [[ "$c" == "/" ]] ; then
				isendtag=true
			else
				buf="$c"
			fi
			# read the remainder of the tag up to (excluding) '>'
			IFS='' read -r -d '>' c
			buf+="$c"

			# handle comments
			if [[ "$buf" == ~(El)!-- ]] ; then
				# did we read the comment completely ?
				if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
					# the '>' consumed above was part of the
					# comment text; keep reading until "--"
					buf+=">"
					while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
						IFS='' read -r -N 1 c || break
						buf+="$c"
					done
				fi

				# strip the leading "!--" and the trailing "--"
				[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
				buf=""
				continue
			fi

			# check if the tag starts and ends at the same time
			# (i.e. the tag text ends with a '/')
			if [[ "${buf}" == ~(Er).*/ ]] ; then
				issingletag=true
				buf="${buf%*/}"
			else
				issingletag=false
			fi

			# check if the tag has attributes (e.g. space after name)
			if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
				namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
				attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
			else
				namebuf="$buf"
				attrbuf=""
			fi

			if ${isendtag} ; then
				[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
			else
				[[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

				# handle tags which are start- and end-tag in one piece
				if ${issingletag} ; then
					[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
				fi
			fi
			buf=""
		else
			buf+="$c"
		fi
	done

	[[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

	print # final newline to make filters like "sed" happy
}

# enumerate comments in a shell (or shell-like) script
# Consecutive lines starting with '#' are joined (without the leading
# '#') into one comment.
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - maximum number of comments to store
# Always returns 0.
function enumerate_comments_shell
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset line	# keep the line buffer local to this function
	typeset comment=""

	while (( res == 0 )) ; do
		# At EOF "read" fails; the (empty or partial) last line is
		# still processed below so that a pending comment is flushed.
		# "|| res=$?" keeps the failing read harmless under errexit.
		line=""
		IFS='' read -r line || res=$?

		if [[ "${line}" == ~(El)#.* ]] ; then
			comment+="${line#\#}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				# stop once "max_num_comments" comments are stored
				if (( ca >= max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	# A file whose last line is a comment without a trailing newline
	# leaves that comment in the buffer - store it, too.
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
	fi

	return 0
}

# enumerate comments in a troff document
# Consecutive comment lines (matching ~(El)\.*\\\") are joined, without
# the comment marker, into one comment.
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - maximum number of comments to store
# Always returns 0.
function enumerate_comments_troff
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset line	# keep the line buffer local to this function
	typeset comment=""

	while (( res == 0 )) ; do
		# see enumerate_comments_shell: the last (empty or partial)
		# line is processed even though "read" reports EOF
		line=""
		IFS='' read -r line || res=$?

		if [[ "${line}" == ~(El)\.*\\\" ]] ; then
			comment+="${line#~(El)\.*\\\"}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				# stop once "max_num_comments" comments are stored
				if (( ca >= max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	# flush a comment which ends at EOF without a trailing newline
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
	fi

	return 0
}

# enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.)
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - maximum number of comments to store
#   $4 - maximum number of characters of the file which are scanned
# C comments (/* ... */) are stored one per array entry; a "//" comment
# which starts at the same column in the line directly below the previous
# "//" comment is appended to that previous entry.
# Returns 0 on success and 1 if the scan ended inside a comment or a
# double-quoted literal.
function enumerate_comments_cpp
{
	set -o errexit
#	set -o nounset

	integer err=0

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"
	integer ca=0 # index in "comment_array"

	typeset content
	integer content_length

	integer file_pos # file position
	compound line_pos=(
		integer x=0 # X position in line
		integer y=0 # Y position in line (line number)
	)
	typeset c c2

	typeset comment

	compound state=(
		# C comment state
		typeset in_c_comment=false
		# C++ comment state
		compound cxx=(
			typeset in_comment=false
			typeset comment_continued=false
			# position of current //-pos
			compound comment_pos=(
				integer x=-1
				integer y=-1
			)
			# position of previous //-pos
			compound comment_prev_pos=(
				integer x=-1
				integer y=-1
			)
		)
		# literal state
		typeset in_sq_literal=false # single-quote literal
		typeset in_dq_literal=false # double-quote literal
	)

	content="$(< "${input_file}")"

	# Truncate file to "max_filesize_for_scan" characters.
	# This was originally added to work around a performance problem with
	# the ${str:offset:chunksize} operator which scales badly in ksh93
	# version 's' with the number of characters
	if (( ${#content} > max_filesize_for_scan )) ; then
		print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
			"${input_file}" \
			max_filesize_for_scan
		content="${content:0:max_filesize_for_scan}"
	fi
	content_length=${#content}

	# Iterate through the source code. The last character
	# (when file_pos == content_length) will be empty to indicate
	# EOF (this is needed for cases like when
	# a C++ comment is not terminated by a newline... ;-/)
	for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
		c2="${content:file_pos:2}"
		c="${c2:0:1}"

		if [[ "$c" == "${ch.newline}" ]] ; then
			(( line_pos.x=0, line_pos.y++ ))
		else
			(( line_pos.x++ ))
		fi

		if ${state.in_c_comment} ; then
			if [[ "$c2" == "*/" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=false

				# flush comment text
				comment_array[ca++]="${comment}"
				comment=""

				# stop once "max_num_comments" comments are stored
				if (( ca >= max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.cxx.in_comment} ; then
			if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
				state.cxx.in_comment=false

				# flush comment text
				if ${state.cxx.comment_continued} ; then
					comment_array[ca-1]+="${ch.newline}${comment}"
				else
					comment_array[ca++]="${comment}"
				fi
				# remember where this "//" started so that a "//" in the
				# same column of the next line is treated as continuation
				(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
				   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				comment=""

				# stop once "max_num_comments" comments are stored
				if (( ca >= max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.in_sq_literal} ; then
			if [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=false
			fi
		elif ${state.in_dq_literal} ; then
			if [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=false
			fi
		else
			if [[ "$c2" == "/*" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=true
				comment=""
			elif [[ "$c2" == "//" ]] ; then
				(( file_pos++, line_pos.x++ ))
				if (( state.cxx.comment_prev_pos.x == line_pos.x && \
					state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
					state.cxx.comment_continued=true
				else
					state.cxx.comment_continued=false
				fi
				(( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
				state.cxx.in_comment=true
				comment=""
			elif [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=true
			elif [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=true
			fi
		fi
	done

	if [[ "$comment" != "" ]] ; then
		print -u2 "## ERROR: Comment text buffer not empty at EOF."
		err=1
	fi

	if ${state.in_c_comment} ; then
		print -u2 "## ERROR: C comment did not close before EOF."
		err=1
	fi

	if ${state.cxx.in_comment} ; then
		print -u2 "## ERROR: C++ comment did not close before EOF."
		err=1
	fi

	if ${state.in_dq_literal} ; then
		print -u2 "## ERROR: Double-quoted literal did not close before EOF."
		err=1
	fi

	# We treat this one only as warning since things like "foo.html.cpp" may
	# trigger this condition accidently
	if ${state.in_sq_literal} ; then
		print -u2 "## WARNING: Single-quoted literal did not close before EOF."
	fi

	return $err
}

# determine file type
#   $1 - file name
#   $2 - name of the variable which receives the format name
# Returns 0 if a format was found and 1 otherwise (including the case
# that the file is not a readable plain file).
function get_file_format
{
	set -o errexit

	typeset filename="$1"
	nameref file_format="$2"

	typeset fileeval # evaluation result of /usr/bin/file

	# check whether "filename" is a plain, readable file
	[[ ! -f "$filename" ]] && return 1
	[[ ! -r "$filename" ]] && return 1

	# In theory this code would exclusively look at the contents of
	# the file to figure out it's file format - unfortunately
	# /usr/bin/file is virtually useless (the heuristics, matching
	# and output unreliable) for many file formats and therefore
	# we have to do a multi-stage approach which looks
	# at the file's content if possible and at the filename
	# otherwise. Fun... ;-(

	# pass one: Find matches for file formats where /usr/bin/file
	# is known to be unreliable:
	case "$filename" in
		*.[ch] | *.cpp | *.cc | *.cxx | *.hxx)
			file_format="c_source"
			return 0
			;;
		*Imakefile)
			file_format="imakefile"
			return 0
			;;
		*Makefile)
			file_format="makefile"
			return 0
			;;
	esac

	# pass two: match by file content via /usr/bin/file
	fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
	case "$fileeval" in
		~(E)roff)
			file_format="troff"
			return 0
			;;
		~(E)html\ document)
			file_format="html"
			return 0
			;;
		~(E)sgml\ document)
			file_format="sgml"
			return 0
			;;
		~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
			file_format="shell"
			return 0
			;;
		~(E)executable.*/perl\ script)
			file_format="perl"
			return 0
			;;
	esac

	# pass three: fallhack to filename matching
	case "$filename" in
		*.man)
			file_format="troff"
			return 0
			;;
		*.html)
			file_format="html"
			return 0
			;;
		*.sgml)
			file_format="sgml"
			return 0
			;;
		*.xml)
			file_format="xml"
			return 0
			;;
		*.png)
			file_format="image_png"
			return 0
			;;
		*.xcf)
			file_format="image_xcf"
			return 0
			;;
		*.shar)
			file_format="archive_shell"
			return 0
			;;
		*.sh)
			file_format="shell"
			return 0
			;;
		*.pcf)
			file_format="font_pcf"
			return 0
			;;
		*.bdf)
			file_format="font_bdf"
			return 0
			;;
		*.pmf)
			file_format="font_pmf"
			return 0
			;;
		*.ttf | *.otf)
			file_format="font_ttf"
			return 0
			;;
		*.pfa | *.pfb)
			file_format="font_postscript"
			return 0
			;;
	esac

	return 1
}

# Create the record for one file (name, format, md5/sha1 hash sums) and
# fill in its comments if the file format has a comment enumerator.
#   $1 - name of the associative array with the records
#   $2 - file name (also used as array subscript)
#   $3 - maximum number of comments
#   $4 - maximum number of characters to scan
# Returns 1 if the file format could not be determined, 0 otherwise.
function extract_comments
{
	set -o errexit

	nameref records="$1"
	typeset filename="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"

	typeset datatype=""

	records[${filename}]=(
		typeset filename="$filename"

		typeset fileformat_found="false" # "true" or "false"
		typeset file_format=""

		typeset -A hashsum

		typeset comments_parsed="false" # "true" or "false"
		typeset -a comments
	)

	records[${filename}].hashsum["md5"]="$(sum -x md5 < "$filename")"
	records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

	if get_file_format "$filename" datatype ; then
		records[${filename}].fileformat_found="true"
		records[${filename}].file_format="$datatype"
	else
		return 1
	fi

	# "comments_parsed" is only set if the enumerator reported success
	case "$datatype" in
		c_source|imakefile)
			enumerate_comments_cpp "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		shell|makefile)
			enumerate_comments_shell "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		troff)
			enumerate_comments_troff "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		# NOTE: Disabled for now
		#xml|html|sgml)
		#	enumerate_comments_xml "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
		#		records[${filename}].comments_parsed=true
		#	;;
	esac

	return 0
}

# parse HTTP return code, cookies etc.
# Reads the status line and the headers from stdin.
#   $1 - name of the compound variable which receives "statuscode",
#        "statusmsg" and (if present) "content_type", "content_length"
#        and "transfer_encoding"
# Returns 1 if the status line is malformed, 0 otherwise.
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]] && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	# at least one digit is required (an empty status code is invalid)
	[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# skip remaining headers
	while IFS='' read -r i ; do
		[[ "$i" == $'\r' ]] && break

		# strip '\r' at the end
		i="${i/~(Er)$'\r'/}"

		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Content-Length:[[:blank:]]*[0-9]*)
				integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
				;;
		esac
	done

	return 0
}

# Copy the HTTP body from stdin to stdout.
#   $1 - transfer encoding; "chunked" enables chunk decoding, everything
#        else copies the data unmodified
# Always returns 0.
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	integer chunksize=0

	if [[ "${emode}" == "chunked" ]] ; then
		# '\r' as IFS strips the '\r' of the CRLF line terminator
		while IFS=$'\r' read -r hexchunksize ; do
			# The data of each chunk is followed by CRLF, which
			# shows up here as an empty line - skip it instead of
			# treating it as the end of the body.
			[[ "${hexchunksize}" != "" ]] || continue

			# anything which is not a plain hex number ends the body
			[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break
			chunksize=$(( 16#${hexchunksize} ))

			# a chunk of size 0 terminates the body
			(( chunksize > 0 )) || break

			dd bs=1 count="${chunksize}" 2>/dev/null
		done
	else
		cat
	fi

	return 0
}

# Write the content of a file://, http:// or https:// URL to stdout.
#   $1 - URL
# Returns 0 on success and 1 on error or for unsupported protocols.
function cat_url
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	if [[ "${protocol}" == "file" ]] ; then
		cat "${path1}"
		return $?
	elif [[ "${protocol}" == ~(Elr)http(|s) ]] ; then
		typeset host="${path1%%/*}"
		typeset path=""
		typeset port="${host##*:}"
		typeset request # HTTP request text

		integer netfd
		compound httpresponse # http response

		# Only URLs with a '/' after the host part carry a path
		# (without this test the host name would be used as path)
		if [[ "${path1}" == */* ]] ; then
			path="${path1#*/}"
		fi

		# If URL did not contain a port number in the host part then look at the
		# protocol to get the port number
		if [[ "${port}" == "${host}" ]] ; then
			case "${protocol}" in
				"http")  port=80 ;;
				"https") port=443 ;;
				*)       port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
			esac
		else
			host="${host%:*}"
		fi

		printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

		# prechecks
		[[ "${protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
		[[ "${port}"     != "" ]] || { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
		[[ "${host}"     != "" ]] || { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
		[[ "${path}"     != "" ]] || { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

		# open TCP channel
		if [[ "${protocol}" == "https" ]] ; then
			compound sslfifo
			sslfifo.dir="$(mktemp -d)"
			sslfifo.in="${sslfifo.dir}/in"
			sslfifo.out="${sslfifo.dir}/out"

			# register an EXIT trap and use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			trap "rm -r \"${sslfifo.dir}\"" EXIT
			set -o errexit

			mkfifo "${sslfifo.in}" "${sslfifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${host}:${port}" <"${sslfifo.in}" >>"${sslfifo.out}" &

			# send HTTP request
			# ("print" expands the "\r\n" escape sequences)
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93(ssl) (2009-05-08; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >> "${sslfifo.in}"

			# collect response and send it to stdout
			{
				parse_http_response httpresponse
				cat_http_body "${httpresponse.transfer_encoding}"
			} <"${sslfifo.out}"

			wait || { print -u2 -f "%s: openssl failed.\n" "$0" ; exit 1 ; }

			return 0
		else
			redirect {netfd}<> "/dev/tcp/${host}/${port}"
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${1}" ; return 1 ; }

			# send HTTP request
			# ("print" expands the "\r\n" escape sequences)
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93 (2009-05-08; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >&${netfd}

			# collect response and send it to stdout
			parse_http_response httpresponse <&${netfd}
			cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

			# close connection
			redirect {netfd}<&-

			return 0
		fi
	else
		return 1
	fi
	# notreached
}

# Print statistics about the records as compound variable to stdout.
#   $1 - name of the associative array with the records (optional for
#        backwards compatibility; without it a variable called "records"
#        must be visible to this function)
function print_stats
{
	set -o errexit

	# ksh93 functions defined with "function" do not see the local
	# variables of their caller, so the array has to be passed by name
	if (( $# > 0 )) ; then
		nameref records=$1
	fi
	typeset i

	# gather some statistics
	compound stats=(
		integer files_with_comments=0
		integer files_without_comments=0

		integer files_without_known_format=0

		integer files_with_license_info=0
		integer files_without_license_info=0

		integer total_num_files=0
	)

	# Notes:
	# - The flags are compared as strings since a record which was
	#   skipped by the file filters has no "license_info_found" field.
	# - "+= 1" is used since "(( x++ ))" returns a non-zero status
	#   when x was 0, which aborts the script under errexit.
	for i in "${!records[@]}" ; do
		if [[ "${records[$i].comments_parsed}" == "true" ]] ; then
			(( stats.files_with_comments += 1 ))
		else
			(( stats.files_without_comments += 1 ))
		fi

		if [[ "${records[$i].fileformat_found}" != "true" ]] ; then
			(( stats.files_without_known_format += 1 ))
		fi

		if [[ "${records[$i].license_info_found}" == "true" ]] ; then
			(( stats.files_with_license_info += 1 ))
		else
			(( stats.files_without_license_info += 1 ))
		fi

		(( stats.total_num_files += 1 ))
	done

	print -v stats
	return 0
}

# Print all comments which pass the filters, one form-feed separated
# section per comment, and a "## no match found" line for each file
# without matching comment. Sets "license_info_found" in each processed
# record.
#   $1 - name of the associative array with the records
#   $2 - name of the compound variable with the filter options
function print_comments_plain
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j

	# Read the sorted file names line by line so that names with
	# whitespace or wildcard characters stay intact (ksh93 runs the
	# last command of a pipeline in the current shell).
	printf "%s\n" "${!records[@]}" | sort | while IFS='' read -r i ; do
		# "printf" emits one empty line if there are no records
		[[ "$i" != "" ]] || continue

		nameref node=records[$i]

		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if ! "${node.comments_parsed}" ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			# the reject pattern overrides the accept pattern
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if "${match}" ; then
				printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
				printf "%s\n" "$s"
				node.license_info_found=true
			fi
		done

		if ! "${node.license_info_found}" ; then
			printf "## no match found in '%s'," "${node.filename}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	return 0
}

# Like print_comments_plain, but comments which are equal after
# normalisation (lowercase, whitespace and punctuation collapsed) are
# printed only once, followed by the list of files they appear in.
#   $1 - name of the associative array with the records
#   $2 - name of the compound variable with the filter options
function print_comments_duplicates_compressed
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset -A hashed_comments
	integer num_hashed_comments
	typeset -l hashstring # lowercase
	typeset hash

	# see print_comments_plain for the reason to use "while read" here
	printf "%s\n" "${!records[@]}" | sort | while IFS='' read -r i ; do
		# "printf" emits one empty line if there are no records
		[[ "$i" != "" ]] || continue

		nameref node=records[$i]

		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if ! "${node.comments_parsed}" ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			# the reject pattern overrides the accept pattern
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if "${match}" ; then
				# compress the comment (e.g. convert whitespaces and '.,:;()"' to newline characters) ...
				# (newline, CR, tab and VT are covered by [:space:];
				# NOTE(review): the former "\n\r\t\v" in this bracket
				# expression presumably matched the plain letters
				# n, r, t and v - confirm)
				hashstring="${s//+([*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
				# ... and then create a MD5 hash from this string
				hash="$(sum -x md5 <<<"${hashstring}")"

				nameref hc_node=hashed_comments[${hash}]

				if [[ "${hc_node}" == "" ]] ; then
					# build node if there isn't one yet
					typeset -a hc_node.fileids
					typeset hc_node.comment="$s"
				fi

				hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )

				node.license_info_found=true
			fi
		done

		if ! "${node.license_info_found}" ; then
			printf "## no match found in "
			printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	# print comments and all fileids (filename+hash sums) which include this comment
	for i in "${!hashed_comments[@]}" ; do
		printf "\f## The comment (ID=%s) ..." "${i}"
		printf "\n-- snip --"
		printf "\n%s" "${hashed_comments[${i}].comment}"
		printf "\n-- snip --"
		printf "\n... applies to the following files:\n"
		printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array member
	done

	return 0
}

# "crawl" sub-command: reads file names from stdin (one per line),
# extracts their comments and writes the resulting compound variable to
# "crawlsrccomments_extracted_comments.cpv" in the current directory.
# Arguments: the sub-command name followed by its options.
function do_crawl
{
	set -o errexit

	typeset i

	compound options=(
		integer max_filesize_for_scan=$((256*1024))
		integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
	)

	shift
	while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
		printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			S)	options.max_filesize_for_scan="${OPTARG}" ;;
			N)	options.max_num_comments="${OPTARG}" ;;
			*)	usage do_crawl_usage ;;
		esac
	done
	shift $((OPTIND-1))

	compound scan=(
		typeset -A records
	)

	# read filenames from stdin
	# (IFS='' and -r keep leading/trailing blanks and backslashes
	# in the file names intact)
	while IFS='' read -r i ; do
		printf "## scanning %s ...\n" "$i"
		# a failure for one file must not stop the crawl
		extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
	done

	# print compound variable array (we strip the "typeset -A records" for now)
	print -v scan >"crawlsrccomments_extracted_comments.cpv"

	print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

	return 0
}

# "getcomments" sub-command: loads the database written by "crawl"
# (plain file, bzip2/gzip compressed file or http(s) URL) and prints the
# comments which pass the filters given via options.
# Arguments: the sub-command name followed by its options.
function do_getcomments
{
	set -o errexit

	# vars
	compound scan
	typeset database
	typeset tmp

	compound options=(
		typeset database="crawlsrccomments_extracted_comments.cpv"

		typeset print_stats=false
		typeset zapduplicates=false
		compound filepattern=(
			typeset accept="*"
			typeset reject=""
		)
		compound commentpattern=(
			typeset accept="~(Ei)(license|copyright)"
			typeset reject=""
		)
	)

	shift
	while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
	#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			c)	options.commentpattern.accept="${OPTARG}" ;;
			C)	options.commentpattern.reject="${OPTARG}" ;;
			D)	options.database="${OPTARG}" ;;
			l)	options.filepattern.accept="${OPTARG}" ;;
			L)	options.filepattern.reject="${OPTARG}" ;;
			S)	options.print_stats=true ;;
			+S)	options.print_stats=false ;;
			Z)	options.zapduplicates=true ;;
			+Z)	options.zapduplicates=false ;;
			*)	usage do_getcomments_usage ;;
		esac
	done
	shift $((OPTIND-1))

	# array of temporary files which should be cleaned-up upon exit
	typeset -a tmpfiles
	trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

	# Temporary files are created via mktemp(1) since predictable
	# names in /tmp can be abused by other users (e.g. via symlinks).

	# Support for HTTP URLs
	if [[ "${options.database}" == ~(El)(http|https)://.* ]] ; then
		database="$(mktemp "/tmp/extract_license_cat_url_XXXXXX")"
		tmpfiles+=( "${database}" )
		print -u2 "# Loading URL..."
		cat_url "${options.database}" >"${database}"
		print -u2 "# Loading URL done."
	else
		database="${options.database}"
	fi

	if [[ ! -r "${database}" ]] ; then
		fatal_error "Can't read ${database}."
	fi

	# Support for compressed database files
	case "$(LC_ALL=C /usr/bin/file "${database}")" in
		*bzip2*)
			tmp="$(mktemp "/tmp/extract_license_bzcat_XXXXXX")"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (bzip2) ..."
			bzcat <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
		*gzip*)
			tmp="$(mktemp "/tmp/extract_license_gunzip_XXXXXX")"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (gzip) ..."
			gunzip -c <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
	esac

	# Read compound variable which contain all recorded comments
	print -u2 "# reading records..."
	read -C scan <"${database}" || fatal_error 'Error reading data.'
	print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

	# print comments
	print -u2 "# processing data..."
	print "## comments start:"
	if "${options.zapduplicates}" ; then
		print_comments_duplicates_compressed scan.records options
	else
		print_comments_plain scan.records options
	fi
	print "## comments end"
	print -u2 "# processing data done."

	if "${options.print_stats}" ; then
		# the records must be passed by name since "scan" is local
		# to this function
		print_stats scan.records
	fi

	return 0
}

# Print the usage message of the given usage string and exit with code 2.
#   $1 - name of the variable which holds the getopts usage string
function usage
{
	nameref usagemsg=$1
	OPTIND=0
	getopts -a "${progname}" "${usagemsg}" OPT '-?'
	exit 2
}

typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2009-05-09 \$\n]
[-author?Roland Mainz ]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utilty script which extracts
	license information from the "\bgetcomments\b"-database
	file created by \bcrawl\b. The script allows various
	filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
	called \bcrawl\b will scan all source files, extract
	the comments and stores this information in a "database" file called
	"crawlsrccomments_extracted_comments.cpv" and then \bextract_license\b allows
	queries on this database.]
[D:database?Database file for input (either file, http:// or https://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2009-05-09 \$\n]
[-author?Roland Mainz ]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utilty script which reads
	a list of source code files from stdin, determinates the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bextract_license\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
	Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2009-05-09 \$\n]
[-author?Roland Mainz ]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utilty script which reads
	a list of source code files from stdin, determinates the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bextract_license\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

# program start
# (load the builtin versions of the utilities used by this script;
# "sum" is required for the md5/sha1 hash sums)
builtin basename
builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."
# From here on abort at the first command which fails
set -o errexit

typeset progname="${ basename "${0}" ; }"

# The top-level command has no options of its own, so everything
# getopts hands back ends up in the usage handler.
while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
	usage crawlsrccomments_usage
done
shift $((OPTIND-1))

typeset cmd="$1"

# Dispatch to the sub-command implementation ("do_crawl" or
# "do_getcomments"); the sub-command name is still the first argument
# and is removed by the implementation itself.
if [[ "$cmd" == "crawl" || "$cmd" == "getcomments" ]] ; then
	progname+=" ${cmd}"
	"do_${cmd}" "$@"
	exit $?
fi

# missing or unknown sub-command
usage crawlsrccomments_usage

fatal_error "not reached."
# EOF.