#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g. ',' instead of
# '.' in de_DE.*-locales). This needs to be set _before_ any
# floating-point constants are defined in this script.
# LC_ALL would override LC_NUMERIC, so it is split into the individual
# categories first and then unset.
if [[ "${LC_ALL}" != "" ]] ; then
	export \
		LC_MONETARY="${LC_ALL}" \
		LC_MESSAGES="${LC_ALL}" \
		LC_COLLATE="${LC_ALL}" \
		LC_CTYPE="${LC_ALL}"
	unset LC_ALL
fi
export LC_NUMERIC=C

# constants values for tokenizer/parser stuff
typeset -r ch=(
	newline=$'\n'
	tab=$'\t'
	formfeed=$'\f'
)

# Print "<progname>: <message>" to stderr and terminate the script with
# exit code 1.
function fatal_error
{
	print -u2 "${progname}: $*"
	exit 1
}

# Print a message to stderr.
function printmsg
{
	print -u2 "$*"
}


# Split an attribute string (e.g. 'foo="bar" baz=1 qux') into an array of
# compound variables.
# Arguments:
#	$1 - attribute string
#	$2 - name of the array which receives one ( name=... value=... )
#	     entry per attribute (entries without '=' only get "name")
# Calls fatal_error when more than 1000 entries were produced (assert
# which also guards against input the tokenizer pattern cannot consume).
function attrstrtoattrarray
{
#set -o xtrace
	typeset s="$1"
	nameref aa=$2 # attribute array
	integer aa_count=0
	typeset nextattr
	integer currattrlen=0
	typeset tagstr
	typeset tagval

	while (( ${#s} > 0 )) ; do
		# skip whitespaces
		# ("currattrlen" still holds the length of the attribute
		# consumed in the previous round, so this also skips over it)
		while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
			(( currattrlen++ ))
		done
		s="${s:currattrlen:${#s}}"

		# anything left ?
		(( ${#s} == 0 )) && break

		# Pattern tests:
		#x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
		#x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
		#x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\"]*\'}"
		#x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
		# All pattern combined via eregex (w|x|y|z):
		#x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\')}"
		nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\'|[[:alnum:]_-:]*)}"
		currattrlen=$(( ${#s} - ${#nextattr}))

		# add entry
		tagstr="${s:0:currattrlen}"
		if [[ "${tagstr}" == *=* ]] ; then
			# normal case: attribute with value

			tagval="${tagstr#*=}"

			# strip quotes ('' or "")
			if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
				tagval="${tagval:1:${#tagval}-2}"
			fi

			aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
		else
			# special case for HTML where you have something like <foo baz>
			aa[${aa_count}]=( name="${tagstr}" )
		fi
		(( aa_count++ ))
		(( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
	done
}

# XML document handler
# Callback for xml_tok; only "tag_comment" events are recorded.
# Arguments:
#	$1 - name of the callback table (associative array)
#	$2 - event type (e.g. "tag_comment")
#	$3 - tag value (comment text for "tag_comment")
#	$4 - tag attributes (unused here)
# NOTE(review): this function reads a variable "stack" which is not
# defined anywhere in this part of the file - confirm where it is
# supposed to come from before enabling the XML code path.
function handle_xml_document
{
#set -o xtrace
	nameref callbacks=${1}
	typeset tag_type="${2}"
	typeset tag_value="${3}"
	typeset tag_attributes="${4}"
	nameref doc=${callbacks["arg_tree"]}
	nameref nodepath="${stack.items[stack.pos]}"
	nameref nodesnum="${stack.items[stack.pos]}num"

	case "${tag_type}" in
		tag_comment)
			nodepath[${nodesnum}]+=(
				typeset tagtype="comment"
				typeset tagvalue="${tag_value}"
			)
			(( nodesnum++ ))
			;;
	esac

#	print "xmltok: '${tag_type}' = '${tag_value}'"
}

# Simple XML/HTML tokenizer: reads the document from stdin character by
# character and invokes the callbacks registered in the table named by $1.
# Callback keys used: "document_start", "tag_text", "tag_comment",
# "tag_begin", "tag_end" and "document_end"; unset/empty entries are
# skipped. Each callback receives the table name, the event name and
# (depending on the event) the text/tag name and the attribute string.
function xml_tok
{
	typeset buf=""
	typeset namebuf=""
	typeset attrbuf=""
	typeset c=""
	typeset isendtag # bool: true/false
	typeset issingletag # bool: true/false (used for tags like "<br />")
	nameref callbacks=${1}

	[[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

	while IFS='' read -r -N 1 c ; do
		isendtag=false

		if [[ "$c" == "<" ]] ; then
			# flush any text content
			if [[ "$buf" != "" ]] ; then
				[[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
				buf=""
			fi

			# first character after '<' decides between start- and end-tag
			IFS='' read -r -N 1 c
			if [[ "$c" == "/" ]] ; then
				isendtag=true
			else
				buf="$c"
			fi
			# read the remainder of the tag up to (excluding) '>'
			IFS='' read -r -d '>' c
			buf+="$c"

			# handle comments
			if [[ "$buf" == ~(El)!-- ]] ; then
				# did we read the comment completely ?
				# (a '>' inside the comment ends the read above early)
				if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
					buf+=">"
					# NOTE(review): this loop stops once "buf" ends in
					# "--"; the closing '>' looks like it stays unread
					# and ends up in the following text - confirm.
					while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
						IFS='' read -r -N 1 c || break
						buf+="$c"
					done
				fi

				# pass the comment text without the "!--" prefix and "--" suffix
				[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
				buf=""
				continue
			fi

			# check if the tag starts and ends at the same time (like "<br />")
			if [[ "${buf}" == ~(Er).*/ ]] ; then
				issingletag=true
				buf="${buf%*/}"
			else
				issingletag=false
			fi

			# check if the tag has attributes (e.g. space after name)
			if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
				namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
				attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
			else
				namebuf="$buf"
				attrbuf=""
			fi

			if ${isendtag} ; then
				[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
			else
				[[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

				# handle tags like <br/> (which are start- and end-tag in one piece)
				if ${issingletag} ; then
					[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
				fi
			fi
			buf=""
		else
			buf+="$c"
		fi
	done

	[[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

	print # final newline to make filters like "sed" happy
}

# enumerate comments in a shell (or shell-like) script
# Consecutive lines starting with '#' are joined into one comment
# (leading '#' stripped, lines separated by newlines).
# Arguments:
#	$1 - input file
#	$2 - name of the indexed array which receives the comments
#	$3 - number of comments after which scanning stops (scanning
#	     ends once more than this many comments were stored)
# Returns 0.
function enumerate_comments_shell
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset comment=""
	typeset line="" # current input line (local, must not leak to the caller)

	# The loop body also runs for the final (failing) read so that an
	# unterminated last line is still processed.
	while (( res == 0 )) ; do
		IFS='' read -r line
		(( res=$? ))

		if [[ "${line}" == ~(El)#.* ]] ; then
			comment+="${line#\#}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	# Flush a comment which was still being collected at EOF (happens
	# when the last line is a comment without trailing newline).
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
	fi

	return 0
}


# enumerate comments in a troff document
# Consecutive lines matching the troff comment pattern (dots followed
# by \") are joined into one comment (prefix stripped, lines separated
# by newlines).
# Arguments:
#	$1 - input file
#	$2 - name of the indexed array which receives the comments
#	$3 - number of comments after which scanning stops (scanning
#	     ends once more than this many comments were stored)
# Returns 0.
function enumerate_comments_troff
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset comment=""
	typeset line="" # current input line (local, must not leak to the caller)

	# The loop body also runs for the final (failing) read so that an
	# unterminated last line is still processed.
	while (( res == 0 )) ; do
		IFS='' read -r line
		(( res=$? ))

		if [[ "${line}" == ~(El)\.*\\\" ]] ; then
			comment+="${line#~(El)\.*\\\"}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	# Flush a comment which was still being collected at EOF (happens
	# when the last line is a comment without trailing newline).
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
	fi

	return 0
}


# enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.)
# Scanner for C (/* ... */) and C++ (// ...) comments which skips
# single- and double-quoted literals. "//" comments which start in the
# same column on directly following lines are merged into one comment.
# Arguments:
#	$1 - input file
#	$2 - name of the indexed array which receives the comments
#	$3 - number of comments after which scanning stops (scanning
#	     ends once more than this many comments were stored)
#	$4 - maximum number of characters of the file which are scanned
# Returns 0 on success and 1 if the file ended inside a comment or a
# double-quoted literal (diagnostics are written to stderr).
function enumerate_comments_cpp
{
	set -o errexit
#	set -o nounset

	integer err=0

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"
	integer ca=0 # index in "comment_array"

	typeset content
	integer content_length

	integer file_pos # file position
	typeset line_pos=(
		integer x=0 # X position in line
		integer y=0 # Y position in line (line number)
	)
	typeset c c2

	typeset comment

	typeset state=(
		# C comment state
		typeset in_c_comment=false
		# C++ comment state
		typeset cxx=(
			typeset in_comment=false
			typeset comment_continued=false
			# position of current //-pos
			typeset comment_pos=(
				integer x=-1
				integer y=-1
			)
			# position of previous //-pos
			typeset comment_prev_pos=(
				integer x=-1
				integer y=-1
			)
		)
		# literal state
		typeset in_sq_literal=false # single-quote literal
		typeset in_dq_literal=false # double-quote literal
	)

	content="$(< "${input_file}")"

	# Truncate file to "max_filesize_for_scan" characters.
	# This was originally added to work around a performance problem with
	# the ${str:offset:chunksize} operator which scales badly in ksh93
	# version 's' with the number of characters
	if (( ${#content} > max_filesize_for_scan )) ; then
		print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
			"${input_file}" \
			max_filesize_for_scan
		content="${content:0:max_filesize_for_scan}"
	fi
	content_length=${#content}

	# Iterate through the source code. The last character
	# (when file_pos == content_length) will be empty to indicate
	# EOF (this is needed for cases like when
	# a C++ comment is not terminated by a newline... ;-/)
	for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
		# "c" is the current character, "c2" the current plus the next one
		c2="${content:file_pos:2}"
		c="${c2:0:1}"

		if [[ "$c" == "${ch.newline}" ]] ; then
			(( line_pos.x=0, line_pos.y++ ))
		else
			(( line_pos.x++ ))
		fi

		if ${state.in_c_comment} ; then
			if [[ "$c2" == "*/" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=false

				# flush comment text
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.cxx.in_comment} ; then
			if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
				state.cxx.in_comment=false

				# flush comment text
				if ${state.cxx.comment_continued} ; then
					# continuation of the previous "//" comment:
					# append to the entry stored last
					comment_array[ca-1]+="${ch.newline}${comment}"
					(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
					   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				else
					comment_array[ca++]="${comment}"
					(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
					   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				fi
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.in_sq_literal} ; then
			if [[ "$c" == '\' ]] ; then
				# Backslash escape: consume the escaped character, too.
				# Only looking at the previous character (as an earlier
				# version did) fails for literals like '\\' where the
				# closing quote follows an escaped backslash.
				(( file_pos+=1 ))
				if [[ "${c2:1:1}" == "${ch.newline}" ]] ; then
					(( line_pos.x=0, line_pos.y+=1 ))
				else
					(( line_pos.x+=1 ))
				fi
			elif [[ "$c" == "'" ]] ; then
				state.in_sq_literal=false
			fi
		elif ${state.in_dq_literal} ; then
			if [[ "$c" == '\' ]] ; then
				# Backslash escape: consume the escaped character, too
				# (see single-quote literal handling above).
				(( file_pos+=1 ))
				if [[ "${c2:1:1}" == "${ch.newline}" ]] ; then
					(( line_pos.x=0, line_pos.y+=1 ))
				else
					(( line_pos.x+=1 ))
				fi
			elif [[ "$c" == '"' ]] ; then
				state.in_dq_literal=false
			fi
		else
			if [[ "$c2" == "/*" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=true
				comment=""
			elif [[ "$c2" == "//" ]] ; then
				(( file_pos++, line_pos.x++ ))
				# same column, directly following line ==> continuation
				if (( state.cxx.comment_prev_pos.x == line_pos.x && \
					state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
					state.cxx.comment_continued=true
				else
					state.cxx.comment_continued=false
				fi
				(( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
				state.cxx.in_comment=true
				comment=""
			# A quote preceded by a backslash does not open a literal.
			# The "file_pos == 0" guard avoids the negative offset -1
			# which would look at the last character of the file.
			elif [[ "$c" == "'" ]] && { (( file_pos == 0 )) || [[ "${content:file_pos-1:1}" != '\' ]] ; } ; then
				state.in_sq_literal=true
			elif [[ "$c" == '"' ]] && { (( file_pos == 0 )) || [[ "${content:file_pos-1:1}" != '\' ]] ; } ; then
				state.in_dq_literal=true
			fi
		fi
	done

	if [[ "$comment" != "" ]] ; then
		print -u2 "## ERROR: Comment text buffer not empty at EOF."
		err=1
	fi

	if ${state.in_c_comment} ; then
		print -u2 "## ERROR: C comment did not close before EOF."
		err=1
	fi

	if ${state.cxx.in_comment} ; then
		print -u2 "## ERROR: C++ comment did not close before EOF."
		err=1
	fi

	if ${state.in_dq_literal} ; then
		print -u2 "## ERROR: Double-quoted literal did not close before EOF."
		err=1
	fi

	# We treat this one only as warning since things like "foo.html.cpp" may
	# trigger this condition accidentally
	if ${state.in_sq_literal} ; then
		print -u2 "## WARNING: Single-quoted literal did not close before EOF."
	fi

	return $err
}

# determine file type
# Arguments:
#	$1 - filename
#	$2 - name of the variable which receives the format name
#	     (e.g. "c_source", "shell", "troff", ...)
# Returns 0 if a format was determined and 1 otherwise (including the
# case that $1 is not a readable plain file).
function get_file_format
{
	set -o errexit

	typeset filename="$1"
	nameref file_format="$2"

	typeset fileeval # evaluation result of /usr/bin/file

	# check whether "filename" is a plain, readable file
	[[ ! -f "$filename" ]] && return 1
	[[ ! -r "$filename" ]] && return 1

	# In theory this code would exclusively look at the contents of
	# the file to figure out its file format - unfortunately
	# /usr/bin/file is virtually useless (the heuristics, matching
	# and output unreliable) for many file formats and therefore
	# we have to do a multi-stage approach which looks
	# at the file's content if possible and at the filename
	# otherwise. Fun... ;-(

	# pass one: Find matches for file formats where /usr/bin/file
	# is known to be unreliable:
	case "$filename" in
		*.[ch] | *.cpp | *.cc | *.cxx | *.hxx)
			file_format="c_source"
			return 0
			;;
		*Imakefile)
			file_format="imakefile"
			return 0
			;;
		*Makefile)
			file_format="makefile"
			return 0
			;;
	esac

	# pass two: match by file content via /usr/bin/file
	fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
	case "$fileeval" in
		~(E)roff)
			file_format="troff"
			return 0
			;;
		~(E)html\ document)
			file_format="html"
			return 0
			;;
		~(E)sgml\ document)
			file_format="sgml"
			return 0
			;;
		~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
			file_format="shell"
			return 0
			;;
		~(E)executable.*/perl\ script)
			file_format="perl"
			return 0
			;;
	esac

	# pass three: fallback to filename matching
	case "$filename" in
		*.man)
			file_format="troff"
			return 0
			;;
		*.html)
			file_format="html"
			return 0
			;;
		*.sgml)
			file_format="sgml"
			return 0
			;;
		*.xml)
			file_format="xml"
			return 0
			;;
		*.png)
			file_format="image_png"
			return 0
			;;
		*.xcf)
			file_format="image_xcf"
			return 0
			;;
		*.shar)
			file_format="archive_shell"
			return 0
			;;
		*.sh)
			file_format="shell"
			return 0
			;;
		*.pcf)
			file_format="font_pcf"
			return 0
			;;
		*.bdf)
			file_format="font_bdf"
			return 0
			;;
		*.pmf)
			file_format="font_pmf"
			return 0
			;;
		*.ttf | *.otf)
			file_format="font_ttf"
			return 0
			;;
		*.pfa | *.pfb)
			file_format="font_postscript"
			return 0
			;;
	esac

	return 1
}

# Create the record for one file: hash sums, detected file format and
# (for formats with a comment scanner) the list of comments.
# Arguments:
#	$1 - name of the associative array of records (indexed by filename)
#	$2 - filename
#	$3 - comment limit, passed on to the comment scanner
#	$4 - maximum number of characters to scan, passed on to the scanner
# Returns 1 if the file format could not be determined, 0 otherwise
# ("comments_parsed" in the record tells whether scanning succeeded).
function extract_comments
{
	set -o errexit

	nameref records="$1"
	typeset filename="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"

	typeset datatype=""

	records[${filename}]=(
		typeset filename="$filename"

		typeset fileformat_found="false" # "true" or "false"
		typeset file_format=""

		typeset -A hashsum

		typeset comments_parsed="false" # "true" or "false"
		typeset -a comments
	)

	records[${filename}].hashsum["md5"]="$(sum -x md5 < "$filename")"
	records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

	if get_file_format "$filename" datatype ; then
		records[${filename}].fileformat_found="true"
		records[${filename}].file_format="$datatype"
	else
		return 1
	fi

	# dispatch to the comment scanner which matches the file format
	case "$datatype" in
		c_source|imakefile)
			enumerate_comments_cpp "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		shell|makefile)
			enumerate_comments_shell "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		troff)
			enumerate_comments_troff "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		# NOTE: Disabled for now
		#xml|html|sgml)
		#	enumerate_comments_xml "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
		#		records[${filename}].comments_parsed=true
		#	;;
	esac

	return 0
}

# parse HTTP return code, cookies etc.
# Reads the HTTP status line and the response headers from stdin.
# Arguments:
#	$1 - name of the compound variable which receives "statuscode",
#	     "statusmsg" and (if the headers are present) "content_type",
#	     "content_length" and "transfer_encoding"
# Returns 1 if the status line is malformed, 0 otherwise.
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]] && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	[[ "$statuscode" != ~(Elr)[0-9]* ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# skip remaining headers
	while IFS='' read -r i ; do
		# an empty line (only '\r') ends the header section
		[[ "$i" == $'\r' ]] && break

		# strip '\r' at the end
		i="${i/~(Er)$'\r'/}"

		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Content-Length:[[:blank:]]*[0-9]*)
				integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
				;;
		esac
	done

	return 0
}

# Copy the HTTP response body from stdin to stdout.
# Arguments:
#	$1 - transfer encoding; "chunked" enables decoding of the chunked
#	     transfer coding, anything else copies stdin verbatim
# Returns 0.
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	typeset chunkend="" # receives the CRLF which terminates the chunk data
	integer chunksize=0

	if [[ "${emode}" == "chunked" ]] ; then
		# Each chunk is "<hex size>CRLF<data>CRLF"; a size of 0 ends the body.
		# At least one hex digit is required so that an empty line ends
		# the loop instead of reaching the arithmetic expression.
		while IFS=$'\r' read hexchunksize &&
			[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] &&
			(( chunksize=16#${hexchunksize} )) && (( chunksize > 0 )) ; do
			dd bs=1 count="${chunksize}" 2>/dev/null
			# Consume the CRLF after the chunk data - otherwise it would
			# be read as the (empty) size of the next chunk and the body
			# would be cut off after the first chunk.
			IFS='' read -r chunkend || break
		done
	else
		cat
	fi

	return 0
}

# Fetch a http:// URL via ksh93's /dev/tcp and write the response body
# to stdout.
# Arguments:
#	$1 - URL ("<protocol>://<host>[:<port>]/<path>")
# Returns 1 if the URL is incomplete or the connection cannot be opened,
# 0 otherwise.
function cat_http
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	typeset host="${path1%%/*}"
	typeset path="${path1#*/}"
	typeset port="${host##*:}"

	integer netfd
	typeset -C httpresponse # http response
	typeset request # HTTP request text (local, must not leak to the caller)

	# If URL did not contain a port number in the host part then look at the
	# protocol to get the port number
	if [[ "${port}" == "${host}" ]] ; then
		case "${protocol}" in
			"http") port=80 ;;
			*) port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
		esac
	else
		host="${host%:*}"
	fi

	printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

	# prechecks
	[[ "${protocol}" == "" ]] && { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
	[[ "${port}" == "" ]] && { print -u2 -f "%s: port not set.\n" "$0" ; return 1 ; }
	[[ "${host}" == "" ]] && { print -u2 -f "%s: host not set.\n" "$0" ; return 1 ; }
	[[ "${path}" == "" ]] && { print -u2 -f "%s: path not set.\n" "$0" ; return 1 ; }

	# open TCP channel
	redirect {netfd}<>"/dev/tcp/${host}/${port}"
	(( $? != 0 )) && { print -u2 -f "%s: Couldn't open %s\n" "$0" "${1}" ; return 1 ; }

	# send HTTP request
	# ("print" expands the "\r\n" escape sequences when sending)
	request="GET /${path} HTTP/1.1\r\n"
	request+="Host: ${host}\r\n"
	request+="User-Agent: crawlsrccomments/ksh93 (2008-06-14; $(uname -s -r -p))\r\n"
	request+="Connection: close\r\n"
	print -n -- "${request}\r\n" >&${netfd}

	# collect response and send it to stdout
	parse_http_response httpresponse <&${netfd}
	cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

	# close connection
	redirect {netfd}<&-

	return 0
}

# Print statistics about the records to stdout (as compound variable).
# Arguments:
#	$1 - (optional) name of the associative array of records; without
#	     argument a variable called "records" visible to this function
#	     is used
# Returns 0.
function print_stats
{
	set -o errexit

	# Bind to the record array of the caller - its variables are local
	# to the calling function and not visible here by name.
	if (( $# > 0 )) ; then
		nameref records=$1
	fi
	typeset i

	# gather some statistics
	typeset stats=(
		integer files_with_comments=0
		integer files_without_comments=0

		integer files_without_known_format=0

		integer files_with_license_info=0
		integer files_without_license_info=0

		integer total_num_files=0
	)

	# Note: The counters are incremented via "+=1" since (( x++ )) returns
	# a non-zero exit code for x == 0 which would trigger "errexit".
	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		if "${records[$i].comments_parsed}" ; then
			(( stats.files_with_comments+=1 ))
		else
			(( stats.files_without_comments+=1 ))
		fi

		if ! "${records[$i].fileformat_found}" ; then
			(( stats.files_without_known_format+=1 ))
		fi

		# "license_info_found" is only set for records which were
		# processed by one of the print_comments_* functions
		if "${records[$i].license_info_found:-false}" ; then
			(( stats.files_with_license_info+=1 ))
		else
			(( stats.files_without_license_info+=1 ))
		fi

		(( stats.total_num_files+=1 ))
	done

	printf "%B\n" stats
	return 0
}


# Print all comments which pass the file and comment filters, one
# section per matching comment, and report files without any match.
# Sets "license_info_found" in each record which passes the file filters.
# Arguments:
#	$1 - name of the associative array of records
#	$2 - name of the options compound variable (uses
#	     filepattern.accept/reject and commentpattern.accept/reject)
# Returns 0.
function print_comments_plain
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j

	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		nameref node=records[$i]

		# file filters
		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if ! "${node.comments_parsed}" ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# comment filters ("reject" wins over "accept")
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if "${match}" ; then
				printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
				printf "%s\n" "$s"
				node.license_info_found=true
			fi
		done

		if ! "${node.license_info_found}" ; then
			printf "## no match found in '%s'," "${node.filename}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	return 0
}

# Like print_comments_plain but comments which only differ in
# whitespace, punctuation or case are printed once, followed by the
# list of files (with hash sums) which contain them.
# Arguments:
#	$1 - name of the associative array of records
#	$2 - name of the options compound variable (uses
#	     filepattern.accept/reject and commentpattern.accept/reject)
# Returns 0.
function print_comments_duplicates_compressed
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset hash # MD5 of the normalized comment (local, must not leak)
	typeset -A hashed_comments
	integer num_hashed_comments

	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		nameref node=records[$i]

		# file filters
		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if ! "${node.comments_parsed}" ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# comment filters ("reject" wins over "accept")
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi


			if "${match}" ; then
				typeset -l hashstring # lowercase

				# compress the comment (e.g. convert whitespaces and '.,:;()"' to newline characters) ...
				hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
				# ... and then create a MD5 hash from this string
				hash="$(sum -x md5 <<<"${hashstring}")"

				nameref hc_node=hashed_comments[${hash}]

				if [[ "${hc_node}" == "" ]] ; then
					# build node if there isn't one yet
					typeset -a hc_node.fileids
					typeset hc_node.comment="$s"
				fi

				hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )

				node.license_info_found=true
			fi
		done

		if ! "${node.license_info_found}" ; then
			printf "## no match found in "
			printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	# print comments and all fileids (filename+hash sums) which include this comment
	for i in "${!hashed_comments[@]}" ; do
		printf "\f## The comment (ID=%s) ..." "${i}"
		printf "\n-- snip --"
		printf "\n%s" "${hashed_comments[${i}].comment}"
		printf "\n-- snip --"
		printf "\n... applies to the following files:\n"
		printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array member
	done

	return 0
}

# "crawl" subcommand: reads filenames from stdin (one per line),
# extracts the comments of each file and writes all records to
# "crawlsrccomments_extracted_comments.cpv" in the current directory.
# Arguments: the script's arguments (the subcommand name in $1 is
# skipped, the remainder is parsed according to "do_crawl_usage").
# Returns 0.
function do_crawl
{
	set -o errexit

	typeset options=(
		integer max_filesize_for_scan=$((256*1024))
		integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
	)
	typeset i

	shift
	while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
		printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			S)	options.max_filesize_for_scan="${OPTARG}" ;;
			N)	options.max_num_comments="${OPTARG}" ;;
			*)	usage do_crawl_usage ;;
		esac
	done
	shift $((OPTIND-1))

	typeset scan=(
		typeset -A records
	)

	# read filenames from stdin
	# (IFS='' and -r keep blanks and backslashes in filenames intact)
	while IFS='' read -r i ; do
		printf "## scanning %s ...\n" "$i"
		# failures for single files must not abort the crawl
		extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
	done

	# print compound variable array (we strip the "typeset -A records" for now)
	printf "%B\n" scan |
		sed $'s/^#.*$//;s/^\(//;s/^\)//;s/^\ttypeset -A records=\(//;s/^\t\)//' >"crawlsrccomments_extracted_comments.cpv"

	print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

	return 0
}

# "getcomments" subcommand: loads the database written by "crawl"
# (plain file, bzip2/gzip compressed file or http:// URL) and prints
# the comments which pass the configured filters.
# Arguments: the script's arguments (the subcommand name in $1 is
# skipped, the remainder is parsed according to "do_getcomments_usage").
# Returns 0; calls fatal_error if the database cannot be read.
function do_getcomments
{
	set -o errexit

	# vars
	typeset scan=(
		typeset -A records
	)
	typeset database
	typeset tmp

	typeset options=(
		typeset database="crawlsrccomments_extracted_comments.cpv"

		typeset print_stats=false
		typeset zapduplicates=false
		typeset filepattern=(
			typeset accept="*"
			typeset reject=""
		)
		typeset commentpattern=(
			typeset accept="~(Ei)(license|copyright)"
			typeset reject=""
		)
	)

	shift
	while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
	#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			c)	options.commentpattern.accept="${OPTARG}" ;;
			C)	options.commentpattern.reject="${OPTARG}" ;;
			D)	options.database="${OPTARG}" ;;
			l)	options.filepattern.accept="${OPTARG}" ;;
			L)	options.filepattern.reject="${OPTARG}" ;;
			S)	options.print_stats=true ;;
			+S)	options.print_stats=false ;;
			Z)	options.zapduplicates=true ;;
			+Z)	options.zapduplicates=false ;;
			*)	usage do_getcomments_usage ;;
		esac
	done
	shift $((OPTIND-1))

	# array of temporary files which should be cleaned-up upon exit
	# NOTE(review): the temporary file names below are predictable
	# (/tmp + PIDs) - consider creating them via mktemp(1).
	typeset -a tmpfiles
	trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

	# Support for HTTP URLs
	if [[ "${options.database}" == ~(El)http://.* ]] ; then
		database="/tmp/extract_license_cat_http_${PPID}_$$.tmp"
		tmpfiles+=( "${database}" )
		print -u2 "# Loading URL..."
		cat_http "${options.database}" >"${database}"
		print -u2 "# Loading URL done."
	else
		database="${options.database}"
	fi

	if [[ ! -r "${database}" ]] ; then
		fatal_error "Can't read ${database}."
	fi

	# Support for compressed database files
	case "$(LC_ALL=C /usr/bin/file "${database}")" in
		*bzip2*)
			tmp="/tmp/extract_license_bzcat_${PPID}_$$.tmp"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (bzip2) ..."
			bzcat <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
		*gzip*)
			tmp="/tmp/extract_license_bzcat_${PPID}_$$.tmp"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (gzip) ..."
			gunzip -c <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
	esac

	# Read compound variable which contain all recorded comments
	# (the surrounding "(" ... ")" stripped by "crawl" are added again)
	print -u2 "# reading records..."
	{
		printf "("
		cat "${database}"
		printf ")\n"
	} | read -C scan.records || fatal_error 'Error reading data.'
	print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

	# print comments
	print -u2 "# processing data..."
	print "## comments start:"
	if "${options.zapduplicates}" ; then
		print_comments_duplicates_compressed scan.records options
	else
		print_comments_plain scan.records options
	fi
	print "## comments end"
	print -u2 "# processing data done."

	if "${options.print_stats}" ; then
		# pass the records by name - "scan" is local to this function
		print_stats scan.records
	fi

	return 0
}

# Print the usage message for the getopts option string stored in the
# variable named by $1 and exit with code 2.
function usage
{
	nameref usagemsg=$1
	OPTIND=0
	getopts -a "${progname}" "${usagemsg}" OPT '-?'
	exit 2
}

# getopts option/usage description for the "getcomments" subcommand
typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utilty script which extracts
	license information from the "\bgetcomments\b"-database
	file created by \bcrawl\b. The script allows various
	filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
	called \bcrawl\b will scan all source files, extract
	the comments and stores this information in a "database" file called
	"crawlsrccomments_extracted_comments.cpv" and then \bextract_license\b allows
	queries on this database.]
[D:database?Database file for input (either file or http://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

# getopts option/usage description for the "crawl" subcommand
typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utilty script which reads
	a list of source code files from stdin, determinates the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bextract_license\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
	Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

# getopts option/usage description for the script itself
typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utilty script which reads
	a list of source code files from stdin, determinates the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bextract_license\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'


# program start
# (use the builtin versions of these utilities; "sum" is required for
# the md5/sha1 hash sums)
builtin basename
builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."

# exit at the first error we hit
set -o errexit

typeset progname="${ basename "${0}" ; }"

while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		*)	usage crawlsrccomments_usage ;;
	esac
done
shift $((OPTIND-1))

# the first non-option argument selects the subcommand; the subcommand
# functions skip it themselves via "shift"
typeset cmd="$1"

case "$cmd" in
	"crawl")
		progname+=" ${cmd}"
		do_crawl "$@"
		exit $?
		;;
	"getcomments")
		progname+=" ${cmd}"
		do_getcomments "$@"
		exit $?
		;;
	*)
		usage crawlsrccomments_usage
		;;
esac

fatal_error "not reached."
# EOF.