#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g. ',' instead of
# '.' in de_DE.*-locales). This needs to be set _before_ any
# floating-point constants are defined in this script.
# LC_ALL overrides every other LC_* variable (including LC_NUMERIC), so
# spread its value over the categories we want to keep localised and
# then drop it; LC_NUMERIC is pinned to "C" afterwards.
if [[ "${LC_ALL}" != "" ]] ; then
	export \
		LC_MONETARY="${LC_ALL}" \
		LC_MESSAGES="${LC_ALL}" \
		LC_COLLATE="${LC_ALL}" \
		LC_CTYPE="${LC_ALL}"
	unset LC_ALL
fi
export LC_NUMERIC=C

# constants values for tokenizer/parser stuff
compound -r ch=(
	newline=$'\n'
	tab=$'\t'
	formfeed=$'\f'
)

# Print "<progname>: <message>" to stderr and terminate the script
# with exit code 1.
function fatal_error
{
	print -u2 "${progname}: $*"
	exit 1
}

# Print a diagnostic message to stderr.
function printmsg
{
	print -u2 "$*"
}


# Split a tag attribute string (e.g. 'foo="bar" baz=1 qux') into an
# indexed array of compound variables.
#
# Arguments:
#   $1 - attribute string
#   $2 - name of the array which receives the entries; each entry gets
#        a "name" member and, if the attribute has a '=', a "value"
#        member (surrounding '' or "" quotes are stripped)
#
# Aborts via fatal_error if more than 1000 entries are produced (this
# also catches input on which the tokenizer pattern makes no progress).
function attrstrtoattrarray
{
#set -o xtrace
	typeset s="$1"
	nameref aa=$2 # attribute array
	integer aa_count=0
	typeset nextattr
	integer currattrlen=0
	typeset tagstr
	typeset tagval

	while (( ${#s} > 0 )) ; do
		# skip whitespaces (on the first pass "currattrlen" is 0, on
		# later passes it is the length of the attribute just consumed)
		while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
			(( currattrlen++ ))
		done
		s="${s:currattrlen:${#s}}"

		# anything left ?
		(( ${#s} == 0 )) && break

		# Pattern tests:
		#x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
		#x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
		#x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\"]*\'}"
		#x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
		# All pattern combined via eregex (w|x|y|z):
		#x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\')}"
		nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\'|[[:alnum:]_-:]*)}"
		currattrlen=$(( ${#s} - ${#nextattr}))

		# add entry
		tagstr="${s:0:currattrlen}"
		if [[ "${tagstr}" == *=* ]] ; then
			# normal case: attribute with value

			tagval="${tagstr#*=}"

			# strip quotes ('' or "")
			if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
				tagval="${tagval:1:${#tagval}-2}"
			fi

			aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
		else
			# special case for HTML where you have something like <foo baz>
			aa[${aa_count}]=( name="${tagstr}" )
		fi
		(( aa_count++ ))
		(( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
	done
}

# XML document handler
#
# Callback in the calling convention used by xml_tok:
#   $1 - name of the callbacks array
#   $2 - event type (only "tag_comment" is handled here)
#   $3 - event value (the comment text)
#   $4 - attribute string (unused for comments)
#
# NOTE(review): this function reads a variable "stack" (stack.items,
# stack.pos) which is not defined anywhere in the visible part of this
# file, and "doc" is bound but never used - confirm before re-enabling
# the XML code path.
function handle_xml_document
{
#set -o xtrace
	nameref callbacks=${1}
	typeset tag_type="${2}"
	typeset tag_value="${3}"
	typeset tag_attributes="${4}"
	nameref doc=${callbacks["arg_tree"]}
	nameref nodepath="${stack.items[stack.pos]}"
	nameref nodesnum="${stack.items[stack.pos]}num"

	case "${tag_type}" in
		tag_comment)
			nodepath[${nodesnum}]+=(
				typeset tagtype="comment"
				typeset tagvalue="${tag_value}"
			)
			(( nodesnum++ ))
			;;
	esac

#	print "xmltok: '${tag_type}' = '${tag_value}'"
}

# Simple XML/HTML tokenizer: reads the document from stdin one character
# at a time and invokes the callbacks stored in the associative array
# whose name is passed as $1.
#
# Callback keys used: document_start, tag_text, tag_comment, tag_begin,
# tag_end, document_end. A key which is empty/unset is skipped. Each
# callback is called as:
#   <callback> <callbacks-array-name> <event> [<value> [<attributes>]]
function xml_tok
{
	typeset buf=""
	typeset namebuf=""
	typeset attrbuf=""
	typeset c=""
	typeset isendtag # bool: true/false
	typeset issingletag # bool: true/false (used for tags like "<br />")
	nameref callbacks=${1}

	[[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

	while IFS='' read -r -N 1 c ; do
		isendtag=false

		if [[ "$c" == "<" ]] ; then
			# flush any text content
			if [[ "$buf" != "" ]] ; then
				[[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
				buf=""
			fi

			IFS='' read -r -N 1 c
			if [[ "$c" == "/" ]] ; then
				isendtag=true
			else
				buf="$c"
			fi
			# read the remainder of the tag up to (and excluding) '>'
			IFS='' read -r -d '>' c
			buf+="$c"

			# handle comments
			if [[ "$buf" == ~(El)!-- ]] ; then
				# did we read the comment completely ?
				# (a '>' inside the comment ends the read above early)
				if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
					buf+=">"
					while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
						IFS='' read -r -N 1 c || break
						buf+="$c"
					done
				fi

				# pass the text between "!--" and "--"
				[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
				buf=""
				continue
			fi

			# check if the tag starts and ends at the same time (like "<br />")
			if [[ "${buf}" == ~(Er).*/ ]] ; then
				issingletag=true
				buf="${buf%*/}"
			else
				issingletag=false
			fi

			# check if the tag has attributes (e.g. space after name)
			if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
				namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
				attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
			else
				namebuf="$buf"
				attrbuf=""
			fi

			if ${isendtag} ; then
				[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
			else
				[[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

				# handle tags like <br/> (which are start- and end-tag in one piece)
				if ${issingletag} ; then
					[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
				fi
			fi
			buf=""
		else
			buf+="$c"
		fi
	done

	[[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

	print # final newline to make filters like "sed" happy
}

# enumerate comments in a shell (or shell-like) script
#
# Arguments:
#   $1 - input file
#   $2 - name of the indexed array which receives the comments
#   $3 - maximum number of comments to store
#
# Consecutive lines starting with '#' form one comment; the leading '#'
# is removed and every line is terminated with a newline. Always
# returns 0.
function enumerate_comments_shell
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset line=""
	typeset comment=""

	# stop at EOF/read error or as soon as the comment limit is reached
	while (( res == 0 && ca < max_num_comments )) ; do
		IFS='' read -r line
		(( res=$? ))

		if [[ "${line}" == ~(El)#.* ]] ; then
			comment+="${line#\#}${ch.newline}"
		elif [[ "$comment" != "" ]] ; then
			comment_array[ca++]="${comment}"
			comment=""
		fi
	done <"${input_file}"

	# A comment on the last line of a file without a trailing newline
	# is still pending here (read fails but fills "line"): store it.
	if [[ "$comment" != "" ]] && (( ca < max_num_comments )) ; then
		comment_array[ca++]="${comment}"
	fi

	return 0
}


# enumerate comments in a troff document
#
# Arguments:
#   $1 - input file
#   $2 - name of the indexed array which receives the comments
#   $3 - maximum number of comments to store
#
# Consecutive lines starting with optional dots followed by \" form
# one comment; that prefix is removed and every line is terminated with
# a newline. Always returns 0.
function enumerate_comments_troff
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset line=""
	typeset comment=""

	# stop at EOF/read error or as soon as the comment limit is reached
	while (( res == 0 && ca < max_num_comments )) ; do
		IFS='' read -r line
		(( res=$? ))

		if [[ "${line}" == ~(El)\.*\\\" ]] ; then
			comment+="${line#~(El)\.*\\\"}${ch.newline}"
		elif [[ "$comment" != "" ]] ; then
			comment_array[ca++]="${comment}"
			comment=""
		fi
	done <"${input_file}"

	# store a comment which was still pending at EOF (last line
	# without trailing newline)
	if [[ "$comment" != "" ]] && (( ca < max_num_comments )) ; then
		comment_array[ca++]="${comment}"
	fi

	return 0
}


# enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.)
#
# Arguments:
#   $1 - input file
#   $2 - name of the indexed array which receives the comments
#   $3 - maximum number of comments to store
#   $4 - maximum number of characters of the file which are scanned
#
# C comments (/* ... */) are stored one per array element. C++ comments
# (// ...) which start in the same column on consecutive lines are
# joined (separated by a newline) into the previously stored element.
# Comment markers inside '...' and "..." literals are ignored.
#
# Returns 1 if the scan ended inside a comment or a double-quoted
# literal, or with unflushed comment text, otherwise 0.
function enumerate_comments_cpp
{
	set -o errexit
#	set -o nounset

	integer err=0

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"
	integer ca=0 # index in "comment_array"

	typeset content
	integer content_length

	integer file_pos # file position
	compound line_pos=(
		integer x=0 # X position in line
		integer y=0 # Y position in line (line number)
	)
	typeset c c2

	typeset comment

	compound state=(
		# C comment state
		typeset in_c_comment=false
		# C++ comment state
		compound cxx=(
			typeset in_comment=false
			typeset comment_continued=false
			# position of current //-pos
			compound comment_pos=(
				integer x=-1
				integer y=-1
			)
			# position of previous //-pos
			compound comment_prev_pos=(
				integer x=-1
				integer y=-1
			)
		)
		# literal state
		typeset in_sq_literal=false # single-quote literal
		typeset in_dq_literal=false # double-quote literal
		# true if the previous character inside a literal was an
		# unescaped backslash (i.e. the current one is escaped)
		typeset literal_escaped=false
	)

	content="$(< "${input_file}")"

	# Truncate file to "max_filesize_for_scan" characters.
	# This was originally added to work around a performance problem with
	# the ${str:offset:chunksize} operator which scales badly in ksh93
	# version 's' with the number of characters
	if (( ${#content} > max_filesize_for_scan )) ; then
		print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
			"${input_file}" \
			max_filesize_for_scan
		content="${content:0:max_filesize_for_scan}"
	fi
	content_length=${#content}

	# Iterate through the source code. The last character
	# (when file_pos == content_length) will be empty to indicate
	# EOF (this is needed for cases like when
	# a C++ comment is not terminated by a newline... ;-/)
	for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
		c2="${content:file_pos:2}"
		c="${c2:0:1}"

		if [[ "$c" == "${ch.newline}" ]] ; then
			(( line_pos.x=0, line_pos.y++ ))
		else
			(( line_pos.x++ ))
		fi

		if ${state.in_c_comment} ; then
			if [[ "$c2" == "*/" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=false

				# flush comment text
				comment_array[ca++]="${comment}"
				comment=""
			else
				comment+="$c"
			fi
		elif ${state.cxx.in_comment} ; then
			if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
				state.cxx.in_comment=false

				# flush comment text
				if ${state.cxx.comment_continued} ; then
					comment_array[ca-1]+="${ch.newline}${comment}"
				else
					comment_array[ca++]="${comment}"
				fi
				(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
					state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				comment=""
			else
				comment+="$c"
			fi
		elif ${state.in_sq_literal} ; then
			# Track escapes explicitly: looking only at the previous
			# character mistakes the closing quote of '\\' for an
			# escaped quote.
			if ${state.literal_escaped} ; then
				state.literal_escaped=false
			elif [[ "$c" == '\' ]] ; then
				state.literal_escaped=true
			elif [[ "$c" == "'" ]] ; then
				state.in_sq_literal=false
			fi
		elif ${state.in_dq_literal} ; then
			if ${state.literal_escaped} ; then
				state.literal_escaped=false
			elif [[ "$c" == '\' ]] ; then
				state.literal_escaped=true
			elif [[ "$c" == '"' ]] ; then
				state.in_dq_literal=false
			fi
		else
			if [[ "$c2" == "/*" ]] ; then
				# limit reached: do not start another comment
				if (( ca >= max_num_comments )) ; then
					break
				fi
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=true
				comment=""
			elif [[ "$c2" == "//" ]] ; then
				(( file_pos++, line_pos.x++ ))
				if (( state.cxx.comment_prev_pos.x == line_pos.x && \
					state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
					state.cxx.comment_continued=true
				else
					state.cxx.comment_continued=false
				fi
				# limit reached: only continuation lines of the last
				# stored comment are still accepted
				if ! ${state.cxx.comment_continued} && (( ca >= max_num_comments )) ; then
					break
				fi
				(( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
				state.cxx.in_comment=true
				comment=""
			# The (( file_pos == 0 )) test avoids ${content:-1:1}, which
			# in ksh93 addresses the LAST character of the string.
			elif [[ "$c" == "'" ]] && { (( file_pos == 0 )) || [[ "${content:file_pos-1:1}" != '\' ]] ; } ; then
				state.in_sq_literal=true
				state.literal_escaped=false
			elif [[ "$c" == '"' ]] && { (( file_pos == 0 )) || [[ "${content:file_pos-1:1}" != '\' ]] ; } ; then
				state.in_dq_literal=true
				state.literal_escaped=false
			fi
		fi
	done

	if [[ "$comment" != "" ]] ; then
		print -u2 "## ERROR: Comment text buffer not empty at EOF."
		err=1
	fi

	if ${state.in_c_comment} ; then
		print -u2 "## ERROR: C comment did not close before EOF."
		err=1
	fi

	if ${state.cxx.in_comment} ; then
		print -u2 "## ERROR: C++ comment did not close before EOF."
		err=1
	fi

	if ${state.in_dq_literal} ; then
		print -u2 "## ERROR: Double-quoted literal did not close before EOF."
		err=1
	fi

	# We treat this one only as warning since things like "foo.html.cpp" may
	# trigger this condition accidentally
	if ${state.in_sq_literal} ; then
		print -u2 "## WARNING: Single-quoted literal did not close before EOF."
	fi

	return $err
}

# determine file type
#
# Arguments:
#   $1 - file name
#   $2 - name of the variable which receives the format identifier
#        (e.g. "c_source", "shell", "troff", ...)
#
# Returns 0 if a format was determined, 1 if the file is not a plain
# readable file or the format is unknown.
function get_file_format
{
	set -o errexit

	typeset filename="$1"
	nameref file_format="$2"

	typeset fileeval # evaluation result of /usr/bin/file

	# check whether "filename" is a plain, readable file
	[[ ! -f "$filename" ]] && return 1
	[[ ! -r "$filename" ]] && return 1

	# In theory this code would exclusively look at the contents of
	# the file to figure out its file format - unfortunately
	# /usr/bin/file is virtually useless (the heuristics, matching
	# and output unreliable) for many file formats and therefore
	# we have to do a multi-stage approach which looks
	# at the file's content if possible and at the filename
	# otherwise. Fun... ;-(

	# pass one: Find matches for file formats where /usr/bin/file
	# is known to be unreliable:
	case "$filename" in
		*.[ch] | *.cpp | *.cc | *.cxx | *.hxx)
			file_format="c_source"
			return 0
			;;
		*Imakefile)
			file_format="imakefile"
			return 0
			;;
		*Makefile)
			file_format="makefile"
			return 0
			;;
	esac

	# pass two: match by file content via /usr/bin/file
	fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
	case "$fileeval" in
		~(E)roff)
			file_format="troff"
			return 0
			;;
		~(E)html\ document)
			file_format="html"
			return 0
			;;
		~(E)sgml\ document)
			file_format="sgml"
			return 0
			;;
		~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
			file_format="shell"
			return 0
			;;
		~(E)executable.*/perl\ script)
			file_format="perl"
			return 0
			;;
	esac

	# pass three: fallback to filename matching
	case "$filename" in
		*.man)
			file_format="troff"
			return 0
			;;
		*.html)
			file_format="html"
			return 0
			;;
		*.sgml)
			file_format="sgml"
			return 0
			;;
		*.xml)
			file_format="xml"
			return 0
			;;
		*.png)
			file_format="image_png"
			return 0
			;;
		*.xcf)
			file_format="image_xcf"
			return 0
			;;
		*.shar)
			file_format="archive_shell"
			return 0
			;;
		*.sh)
			file_format="shell"
			return 0
			;;
		*.pcf)
			file_format="font_pcf"
			return 0
			;;
		*.bdf)
			file_format="font_bdf"
			return 0
			;;
		*.pmf)
			file_format="font_pmf"
			return 0
			;;
		*.ttf | *.otf)
			file_format="font_ttf"
			return 0
			;;
		*.pfa | *.pfb)
			file_format="font_postscript"
			return 0
			;;
	esac

	return 1
}

# Create the record for one file in the associative array "records"
# (hash sums, detected file format, extracted comments).
#
# Arguments:
#   $1 - name of the associative array of records (indexed by filename)
#   $2 - file name
#   $3 - maximum number of comments to extract
#   $4 - maximum number of characters to scan (used by the CPP scanner)
#
# Returns 1 if the file format could not be determined (the record is
# still created, with fileformat_found="false"), otherwise 0.
# "comments_parsed" is only set to true if the format-specific
# enumerator succeeded.
function extract_comments
{
	set -o errexit

	nameref records="$1"
	typeset filename="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"

	typeset datatype=""

	records[${filename}]=(
		typeset filename="$filename"

		typeset fileformat_found="false" # "true" or "false"
		typeset file_format=""

		typeset -A hashsum

		typeset comments_parsed="false" # "true" or "false"
		typeset -a comments
	)

	records[${filename}].hashsum["md5"]="$(sum -x md5 < "$filename")"
	records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

	if get_file_format "$filename" datatype ; then
		records[${filename}].fileformat_found="true"
		records[${filename}].file_format="$datatype"
	else
		return 1
	fi

	case "$datatype" in
		c_source|imakefile)
			enumerate_comments_cpp "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		shell|makefile)
			enumerate_comments_shell "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		troff)
			enumerate_comments_troff "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		# NOTE: Disabled for now
		#xml|html|sgml)
		#	enumerate_comments_xml "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
		#		records[${filename}].comments_parsed=true
		#	;;
	esac

	return 0
}

# parse HTTP return code, cookies etc.
#
# Reads the HTTP status line and the headers from stdin (up to and
# including the empty line which ends the header section) and stores
# the results in the compound variable whose name is passed as $1:
# statuscode, statusmsg and - if the matching headers are present -
# content_type, content_length and transfer_encoding.
#
# Returns 1 if the status line is malformed, otherwise 0.
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]] && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	# require at least one digit (an empty status code is invalid)
	[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# skip remaining headers
	while IFS='' read -r i ; do
		[[ "$i" == $'\r' ]] && break

		# strip '\r' at the end
		i="${i/~(Er)$'\r'/}"

		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Content-Length:[[:blank:]]*[0-9]*)
				integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
				;;
		esac
	done

	return 0
}

# Copy a HTTP message body from stdin to stdout.
#
# Arguments:
#   $1 - transfer encoding; "chunked" selects chunked decoding, any
#        other value copies stdin unmodified
#
# Always returns 0.
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	typeset chunktrailer
	integer chunksize=0

	if [[ "${emode}" == "chunked" ]] ; then
		# each chunk is "<hex size>[;extensions]\r\n<data>\r\n"; a
		# chunk of size 0 ends the body
		while IFS=$'\r' read -r hexchunksize ; do
			# drop optional chunk extensions
			hexchunksize="${hexchunksize%%;*}"
			[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break
			(( (chunksize=$( printf "16#%s\n" "${hexchunksize}" )) > 0 )) || break

			dd bs=1 count="${chunksize}" 2>/dev/null

			# consume the CRLF which terminates the chunk data,
			# otherwise it would be read as the (empty) size of the
			# next chunk and end the loop after the first chunk
			IFS='' read -r chunktrailer || break
		done
	else
		cat
	fi

	return 0
}

# Write the content of an URL to stdout.
#
# Arguments:
#   $1 - URL; supported schemes are file://, http:// and https://
#        (https is handled via an "openssl s_client" child process)
#
# Returns 0 on success, 1 for unsupported/malformed URLs or if the
# connection could not be opened; for file:// the exit code of "cat"
# is returned.
function cat_url
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	if [[ "${protocol}" == "file" ]] ; then
		cat "${path1}"
		return $?
	elif [[ "${protocol}" == ~(Elr)http(|s) ]] ; then
		typeset host="${path1%%/*}"
		typeset path=""
		typeset port="${host##*:}"
		typeset request

		integer netfd
		compound httpresponse # http response

		# only URLs with a '/' after the host part have a path (without
		# this test the host name itself would be used as path)
		if [[ "${path1}" == */* ]] ; then
			path="${path1#*/}"
		fi

		# If URL did not contain a port number in the host part then look at the
		# protocol to get the port number
		if [[ "${port}" == "${host}" ]] ; then
			case "${protocol}" in
				"http") port=80 ;;
				"https") port=443 ;;
				*) port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
			esac
		else
			host="${host%:*}"
		fi

		printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

		# prechecks
		[[ "${protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
		[[ "${port}" != "" ]] || { print -u2 -f "%s: port not set.\n" "$0" ; return 1 ; }
		[[ "${host}" != "" ]] || { print -u2 -f "%s: host not set.\n" "$0" ; return 1 ; }
		[[ "${path}" != "" ]] || { print -u2 -f "%s: path not set.\n" "$0" ; return 1 ; }

		# open TCP channel
		if [[ "${protocol}" == "https" ]] ; then
			compound sslfifo
			sslfifo.dir="$(mktemp -d)"
			sslfifo.in="${sslfifo.dir}/in"
			sslfifo.out="${sslfifo.dir}/out"

			# register an EXIT trap and use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			trap "rm -r \"${sslfifo.dir}\"" EXIT
			set -o errexit

			mkfifo "${sslfifo.in}" "${sslfifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${host}:${port}" <"${sslfifo.in}" >>"${sslfifo.out}" &

			# send HTTP request
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93(ssl) (2010-03-27; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >> "${sslfifo.in}"

			# collect response and send it to stdout
			{
				parse_http_response httpresponse
				cat_http_body "${httpresponse.transfer_encoding}"
			} <"${sslfifo.out}"

			wait || { print -u2 -f "%s: openssl failed.\n" "$0" ; exit 1 ; }

			return 0
		else
			redirect {netfd}<> "/dev/tcp/${host}/${port}"
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${1}" ; return 1 ; }

			# send HTTP request
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93 (2010-03-27; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >&${netfd}

			# collect response and send it to stdout
			parse_http_response httpresponse <&${netfd}
			cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

			# close connection
			redirect {netfd}<&-

			return 0
		fi
	else
		return 1
	fi
	# notreached
}

# Print statistics (as compound variable) about a set of records.
#
# Arguments:
#   $1 - (optional) name of the associative array of records; without
#        argument a variable called "records" visible from this
#        function is used
#
# The "license_info_found" member is set by the print_comments_*
# functions, i.e. one of them has to run first.
function print_stats
{
	set -o errexit

	typeset i

	if (( $# > 0 )) ; then
		nameref records=$1
	fi

	# gather some statistics
	compound stats=(
		integer files_with_comments=0
		integer files_without_comments=0

		integer files_without_known_format=0

		integer files_with_license_info=0
		integer files_without_license_info=0

		integer total_num_files=0
	)

	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		# the members hold "true"/"false" and are run as commands
		if "${records[$i].comments_parsed}" ; then
			(( stats.files_with_comments++ ))
		else
			(( stats.files_without_comments++ ))
		fi

		if ! "${records[$i].fileformat_found}" ; then
			(( stats.files_without_known_format++ ))
		fi

		if "${records[$i].license_info_found}" ; then
			(( stats.files_with_license_info++ ))
		else
			(( stats.files_without_license_info++ ))
		fi

		(( stats.total_num_files++ ))
	done

	print -v stats
	return 0
}


# Print all comments which pass the comment filters, file by file.
#
# Arguments:
#   $1 - name of the associative array of records
#   $2 - name of the options compound variable (uses
#        filepattern.accept/reject and commentpattern.accept/reject)
#
# Sets "license_info_found" in every record which passes the file
# filters. Always returns 0.
function print_comments_plain
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j

	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		nameref node=records[$i]

		if [[ "${options.filepattern.accept}" != "" ]] && \
			[[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
			[[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if ! "${node.comments_parsed}" ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# "reject" is tested last and therefore wins over "accept"
			if [[ "${options.commentpattern.accept}" != "" ]] && \
				[[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
				[[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if "${match}" ; then
				printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
				printf "%s\n" "$s"
				node.license_info_found=true
			fi
		done

		if ! "${node.license_info_found}" ; then
			printf "## no match found in '%s'," "${node.filename}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	return 0
}

# Like print_comments_plain, but comments which only differ in case,
# whitespace and punctuation are printed once, followed by the list of
# files (name + hash sums) which contain them.
#
# Arguments:
#   $1 - name of the associative array of records
#   $2 - name of the options compound variable
#
# Sets "license_info_found" in every record which passes the file
# filters. Always returns 0.
function print_comments_duplicates_compressed
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset hash
	typeset -A hashed_comments
	integer num_hashed_comments

	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		nameref node=records[$i]

		if [[ "${options.filepattern.accept}" != "" ]] && \
			[[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
			[[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if ! "${node.comments_parsed}" ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			typeset s="${node.comments[$j]}"
			typeset match=false

			# "reject" is tested last and therefore wins over "accept"
			if [[ "${options.commentpattern.accept}" != "" ]] && \
				[[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
				[[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi


			if "${match}" ; then
				typeset -l hashstring # lowercase

				# compress the comment (e.g. convert whitespaces and '.,:;()"' to newline characters) ...
				hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
				# ... and then create a MD5 hash from this string
				hash="$(sum -x md5 <<<"${hashstring}")"

				nameref hc_node=hashed_comments[${hash}]

				if [[ "${hc_node}" == "" ]] ; then
					# build node if there isn't one yet
					typeset -a hc_node.fileids
					typeset hc_node.comment="$s"
				fi

				hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )

				node.license_info_found=true
			fi
		done

		if ! "${node.license_info_found}" ; then
			printf "## no match found in "
			printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	# print comments and all fileids (filename+hash sums) which include this comment
	for i in "${!hashed_comments[@]}" ; do
		printf "\f## The comment (ID=%s) ..." "${i}"
		printf "\n-- snip --"
		printf "\n%s" "${hashed_comments[${i}].comment}"
		printf "\n-- snip --"
		printf "\n... applies to the following files:\n"
		printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array member
	done

	return 0
}

# "crawl" sub-command: read file names from stdin (one per line),
# extract the comments of every file and write the result as compound
# variable to "crawlsrccomments_extracted_comments.cpv" in the current
# directory.
#
# Arguments: the script's arguments, starting with the sub-command name.
function do_crawl
{
	set -o errexit

	typeset i

	compound options=(
		integer max_filesize_for_scan=$((256*1024))
		integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
	)

	shift
	while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
		printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			S) options.max_filesize_for_scan="${OPTARG}" ;;
			N) options.max_num_comments="${OPTARG}" ;;
			*) usage do_crawl_usage ;;
		esac
	done
	shift $((OPTIND-1))

	compound scan=(
		typeset -A records
	)

	# read filenames from stdin (IFS=''/-r: keep blanks and
	# backslashes in file names intact)
	while IFS='' read -r i ; do
		printf "## scanning %s ...\n" "$i"
		extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
	done

	# print compound variable array (we strip the "typeset -A records" for now)
	print -v scan >"crawlsrccomments_extracted_comments.cpv"

	print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

	return 0
}

# "getcomments" sub-command: load a database written by "crawl" (plain,
# bzip2- or gzip-compressed file, or http(s):// URL), print the comments
# which pass the filters and optionally print statistics.
#
# Arguments: the script's arguments, starting with the sub-command name.
function do_getcomments
{
	set -o errexit

	# vars
	compound scan
	typeset database
	typeset tmp

	compound options=(
		typeset database="crawlsrccomments_extracted_comments.cpv"

		typeset print_stats=false
		typeset zapduplicates=false
		compound filepattern=(
			typeset accept="*"
			typeset reject=""
		)
		compound commentpattern=(
			typeset accept="~(Ei)(license|copyright)"
			typeset reject=""
		)
	)

	shift
	while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
	#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			c) options.commentpattern.accept="${OPTARG}" ;;
			C) options.commentpattern.reject="${OPTARG}" ;;
			D) options.database="${OPTARG}" ;;
			l) options.filepattern.accept="${OPTARG}" ;;
			L) options.filepattern.reject="${OPTARG}" ;;
			S) options.print_stats=true ;;
			+S) options.print_stats=false ;;
			Z) options.zapduplicates=true ;;
			+Z) options.zapduplicates=false ;;
			*) usage do_getcomments_usage ;;
		esac
	done
	shift $((OPTIND-1))

	# array of temporary files which should be cleaned-up upon exit
	typeset -a tmpfiles
	trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

	# Support for HTTP URLs
	if [[ "${options.database}" == ~(El)(http|https)://.* ]] ; then
		# mktemp instead of a predictable name in the world-writable /tmp
		database="$(mktemp "/tmp/extract_license_cat_url_XXXXXX")" || fatal_error "mktemp failed."
		tmpfiles+=( "${database}" )
		print -u2 "# Loading URL..."
		cat_url "${options.database}" >"${database}"
		print -u2 "# Loading URL done."
	else
		database="${options.database}"
	fi

	if [[ ! -r "${database}" ]] ; then
		fatal_error "Can't read ${database}."
	fi

	# Support for compressed database files
	case "$(LC_ALL=C /usr/bin/file "${database}")" in
		*bzip2*)
			tmp="$(mktemp "/tmp/extract_license_bzcat_XXXXXX")" || fatal_error "mktemp failed."
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (bzip2) ..."
			bzcat <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
		*gzip*)
			tmp="$(mktemp "/tmp/extract_license_gunzip_XXXXXX")" || fatal_error "mktemp failed."
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (gzip) ..."
			gunzip -c <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
	esac

	# Read compound variable which contain all recorded comments
	print -u2 "# reading records..."
	read -C scan <"${database}" || fatal_error 'Error reading data.'
	print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

	# print comments
	print -u2 "# processing data..."
	print "## comments start:"
	if "${options.zapduplicates}" ; then
		print_comments_duplicates_compressed scan.records options
	else
		print_comments_plain scan.records options
	fi
	print "## comments end"
	print -u2 "# processing data done."

	if "${options.print_stats}" ; then
		# "scan" is local to this function, so the records have to be
		# passed by name
		print_stats scan.records
	fi

	return 0
}

# Print the usage message for the getopts usage string stored in the
# variable whose name is passed as $1 and exit with code 2.
function usage
{
	nameref usagemsg=$1
	OPTIND=0
	getopts -a "${progname}" "${usagemsg}" OPT '-?'
	exit 2
}

typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utilty script which extracts
	license information from the "\bgetcomments\b"-database
	file created by \bcrawl\b. The script allows various
	filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
	called \bcrawl\b will scan all source files, extract
	the comments and stores this information in a "database" file called
	"crawlsrccomments_extracted_comments.cpv" and then \bextract_license\b allows
	queries on this database.]
[D:database?Database file for input (either file, http:// or https://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utilty script which reads
	a list of source code files from stdin, determinates the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bextract_license\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
	Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utilty script which reads
	a list of source code files from stdin, determinates the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bextract_license\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'


# program start
builtin basename

# set "progname" before the first possible call of fatal_error (which
# prints it)
typeset progname="${ basename "${0}" ; }"

builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."

# exit at the first error we hit
set -o errexit

while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	case ${OPT} in
		*) usage crawlsrccomments_usage ;;
	esac
done
shift $((OPTIND-1))

typeset cmd="$1"

# dispatch to the sub-command; the sub-command name stays in "$@" and
# is removed by the do_* function itself
case "$cmd" in
	"crawl")
		progname+=" ${cmd}"
		do_crawl "$@"
		exit $?
		;;
	"getcomments")
		progname+=" ${cmd}"
		do_getcomments "$@"
		exit $?
		;;
	*)
		usage crawlsrccomments_usage
		;;
esac

fatal_error "not reached."
# EOF.