#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
# Use is subject to license terms.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g. ',' instead of
# '.' in de_DE.*-locales). This needs to be set _before_ any
# floating-point constants are defined in this script.
# If LC_ALL is set it would override LC_NUMERIC, so it is split up into
# the individual categories and then unset.
if [[ "${LC_ALL}" != "" ]] ; then
    export \
        LC_MONETARY="${LC_ALL}" \
        LC_MESSAGES="${LC_ALL}" \
        LC_COLLATE="${LC_ALL}" \
        LC_CTYPE="${LC_ALL}"
    unset LC_ALL
fi
export LC_NUMERIC=C

# constants values for tokenizer/parser stuff
compound -r ch=(
    newline=$'\n'
    tab=$'\t'
    formfeed=$'\f'
)

# Print "<progname>: <message>" to stderr and terminate the script with
# exit code 1.
function fatal_error
{
    print -u2 "${progname}: $*"
    exit 1
}

# Print a message to stderr.
function printmsg
{
    print -u2 "$*"
}


# Split an attribute string (e.g. 'foo="bar" baz=1 huz') into an array
# of compound variables.
# Arguments:
#   $1 - attribute string
#   $2 - name of the array variable which receives the result; each
#        element gets a "name" member and (if the attribute has a
#        value) a "value" member with surrounding quotes removed
function attrstrtoattrarray
{
#set -o xtrace
    typeset s="$1"
    nameref aa=$2 # attribute array
    integer aa_count=0
    typeset nextattr
    integer currattrlen=0
    typeset tagstr
    typeset tagval

    while (( ${#s} > 0 )) ; do
        # skip whitespaces (this also skips the attribute consumed
        # in the previous iteration since "currattrlen" still holds
        # its length)
        while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
            (( currattrlen++ ))
        done
        s="${s:currattrlen:${#s}}"

        # anything left ?
        (( ${#s} == 0 )) && break

        # Pattern tests:
        #x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
        #x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
        #x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\']*\'}"
        #x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
        # All pattern combined via eregex (w|x|y|z):
        #x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\')}"
        #
        # The single-quoted alternative must stop at the next single
        # quote ([^\']); stopping at a double quote instead would merge
        # "a='x' b='y'" into one attribute because "##" matches greedy.
        # NOTE(review): "_-:" inside the bracket expressions looks like
        # it was meant as the three literals '_', '-' and ':' but is
        # written as a range - left untouched, verify against ksh93.
        nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\'|[[:alnum:]_-:]*)}"
        currattrlen=$(( ${#s} - ${#nextattr}))

        # add entry
        tagstr="${s:0:currattrlen}"
        if [[ "${tagstr}" == *=* ]] ; then
            # normal case: attribute with value

            tagval="${tagstr#*=}"

            # strip quotes ('' or "")
            if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
                tagval="${tagval:1:${#tagval}-2}"
            fi

            aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
        else
            # special case for HTML where you have something like <foo baz>
            aa[${aa_count}]=( name="${tagstr}" )
        fi
        (( aa_count++ ))
        # guards against an endless loop when no progress is made
        (( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
    done
}

# XML document handler
# Callback for xml_tok; only "tag_comment" events are handled, they are
# appended to the node selected via the "stack" variable.
# NOTE(review): "stack" is not defined in this function - presumably it
# is expected to be provided by the caller's environment; confirm.
# Arguments:
#   $1 - name of the callback table (associative array)
#   $2 - event type (e.g. "tag_comment")
#   $3 - event value
#   $4 - attribute string
function handle_xml_document
{
#set -o xtrace
    nameref callbacks=${1}
    typeset tag_type="${2}"
    typeset tag_value="${3}"
    typeset tag_attributes="${4}"
    nameref doc=${callbacks["arg_tree"]}
    nameref nodepath="${stack.items[stack.pos]}"
    nameref nodesnum="${stack.items[stack.pos]}num"

    case "${tag_type}" in
        tag_comment)
            nodepath[${nodesnum}]+=(
                typeset tagtype="comment"
                typeset tagvalue="${tag_value}"
            )
            (( nodesnum++ ))
            ;;
    esac

#    print "xmltok: '${tag_type}' = '${tag_value}'"
}

# Simple XML/HTML tokenizer which reads the document from stdin
# character by character and invokes the callbacks registered in the
# table named by $1. Callback keys used: "document_start", "tag_text",
# "tag_comment", "tag_begin", "tag_end" and "document_end"; empty
# entries are skipped. Each callback receives the table name, the
# event name and the event specific values.
function xml_tok
{
    typeset buf=""
    typeset namebuf=""
    typeset attrbuf=""
    typeset c=""
    typeset isendtag # bool: true/false
    typeset issingletag # bool: true/false (used for tags like "<br />")
    nameref callbacks=${1}

    [[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    while IFS='' read -r -N 1 c ; do
        isendtag=false

        if [[ "$c" == "<" ]] ; then
            # flush any text content
            if [[ "$buf" != "" ]] ; then
                [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
                buf=""
            fi

            # first character after '<' decides whether this is an end tag
            IFS='' read -r -N 1 c
            if [[ "$c" == "/" ]] ; then
                isendtag=true
            else
                buf="$c"
            fi
            # read the remainder of the tag up to (excluding) '>'
            IFS='' read -r -d '>' c
            buf+="$c"

            # handle comments
            if [[ "$buf" == ~(El)!-- ]] ; then
                # did we read the comment completely ?
                # (a '>' inside the comment ends the read above early)
                if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
                    buf+=">"
                    while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
                        IFS='' read -r -N 1 c || break
                        buf+="$c"
                    done
                fi

                # pass the text between "!--" and "--"
                [[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
                buf=""
                continue
            fi

            # check if the tag starts and ends at the same time (like "<br />")
            if [[ "${buf}" == ~(Er).*/ ]] ; then
                issingletag=true
                buf="${buf%*/}"
            else
                issingletag=false
            fi

            # check if the tag has attributes (e.g. space after name)
            if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
                namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
                attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
            else
                namebuf="$buf"
                attrbuf=""
            fi

            if ${isendtag} ; then
                [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
            else
                [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if ${issingletag} ; then
                    [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
                fi
            fi
            buf=""
        else
            buf+="$c"
        fi
    done

    [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    print # final newline to make filters like "sed" happy
}

# enumerate comments in a shell (or shell-like) script
# Consecutive lines which start with '#' are collected into one
# comment (the leading '#' is removed from each line).
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - scanning stops once more than this number of comments
#        has been collected
# Returns: 0
function enumerate_comments_shell
{
    set -o errexit

    typeset input_file="$1"
    nameref comment_array="$2"
    integer max_num_comments="$3"
    integer ca=0 # index in "comment_array"

    integer res=0

    typeset line=""
    typeset comment=""

    while (( res == 0 )) ; do
        # "line" is still processed when read hits EOF because it
        # may hold a last line which has no trailing newline
        IFS='' read -r line
        (( res=$? ))

        if [[ "${line}" == ~(El)#.* ]] ; then
            comment+="${line#\#}${ch.newline}"
        else
            if [[ "$comment" != "" ]] ; then
                comment_array[ca++]="${comment}"
                comment=""

                if (( ca > max_num_comments )) ; then
                    break
                fi
            fi
        fi
    done <"${input_file}"

    # flush a comment which ended at EOF without a trailing newline
    if [[ "$comment" != "" ]] ; then
        comment_array[ca++]="${comment}"
        comment=""
    fi

    return 0
}


# enumerate comments in a troff document
# Consecutive lines which start with optional dots followed by \" are
# collected into one comment (the comment introducer is removed).
# Arguments:
#   $1 - input file
#   $2 - name of the array which receives the comments
#   $3 - scanning stops once more than this number of comments
#        has been collected
# Returns: 0
function enumerate_comments_troff
{
    set -o errexit

    typeset input_file="$1"
    nameref comment_array="$2"
    integer max_num_comments="$3"
    integer ca=0 # index in "comment_array"

    integer res=0

    typeset line=""
    typeset comment=""

    while (( res == 0 )) ; do
        # "line" is still processed when read hits EOF because it
        # may hold a last line which has no trailing newline
        IFS='' read -r line
        (( res=$? ))

        if [[ "${line}" == ~(El)\.*\\\" ]] ; then
            comment+="${line#~(El)\.*\\\"}${ch.newline}"
        else
            if [[ "$comment" != "" ]] ; then
                comment_array[ca++]="${comment}"
                comment=""

                if (( ca > max_num_comments )) ; then
                    break
                fi
            fi
        fi
    done <"${input_file}"

    # flush a comment which ended at EOF without a trailing newline
    if [[ "$comment" != "" ]] ; then
        comment_array[ca++]="${comment}"
        comment=""
    fi

    return 0
}


# enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.)
function enumerate_comments_cpp
{
    # Scans a C/C++-like file character by character and collects the
    # text of /* ... */ and // comments. A "//" comment which starts at
    # the same column in the line directly below the previous "//"
    # comment is appended to that previous entry. Comment markers
    # inside single- or double-quoted literals are ignored.
    # Arguments:
    #   $1 - input file
    #   $2 - name of the array which receives the comments
    #   $3 - scanning stops once more than this number of comments
    #        has been collected
    #   $4 - maximum number of characters of the file to scan
    # Returns: 0 on success, 1 if the file ended inside a comment or
    #          inside a double-quoted literal
    set -o errexit
#    set -o nounset

    integer err=0

    typeset input_file="$1"
    nameref comment_array="$2"
    integer max_num_comments="$3"
    integer max_filesize_for_scan="$4"
    integer ca=0 # index in "comment_array"

    typeset content
    integer content_length

    integer file_pos # file position
    compound line_pos=(
        integer x=0 # X position in line
        integer y=0 # Y position in line (line number)
    )
    typeset c c2
    # Character handled by the previous loop iteration; only used to
    # detect a backslash in front of a quote character. Tracking it
    # here avoids "${content:file_pos-1:1}" which for file_pos == 0
    # yields offset -1 (ksh93 counts negative offsets from the end
    # of the string).
    typeset prevch=""

    typeset comment

    compound state=(
        # C comment state
        typeset in_c_comment=false
        # C++ comment state
        compound cxx=(
            typeset in_comment=false
            typeset comment_continued=false
            # position of current //-pos
            compound comment_pos=(
                integer x=-1
                integer y=-1
            )
            # position of previous //-pos
            compound comment_prev_pos=(
                integer x=-1
                integer y=-1
            )
        )
        # literal state
        typeset in_sq_literal=false # single-quote literal
        typeset in_dq_literal=false # double-quote literal
    )

    content="$(< "${input_file}")"

    # Truncate file to "max_filesize_for_scan" characters.
    # This was originally added to work around a performance problem with
    # the ${str:offset:chunksize} operator which scales badly in ksh93
    # version 's' with the number of characters
    if (( ${#content} > max_filesize_for_scan )) ; then
        print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
            "${input_file}" \
            max_filesize_for_scan
        content="${content:0:max_filesize_for_scan}"
    fi
    content_length=${#content}

    # Iterate through the source code. The last character
    # (when file_pos == content_length) will be empty to indicate
    # EOF (this is needed for cases like when
    # a C++ comment is not terminated by a newline... ;-/)
    for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
        c2="${content:file_pos:2}"
        c="${c2:0:1}"

        if [[ "$c" == "${ch.newline}" ]] ; then
            (( line_pos.x=0, line_pos.y++ ))
        else
            (( line_pos.x++ ))
        fi

        if ${state.in_c_comment} ; then
            if [[ "$c2" == "*/" ]] ; then
                # skip the second character of the terminator
                (( file_pos++, line_pos.x++ ))
                state.in_c_comment=false

                # flush comment text
                comment_array[ca++]="${comment}"
                comment=""

                if (( ca > max_num_comments )) ; then
                    break
                fi
            else
                comment+="$c"
            fi
        elif ${state.cxx.in_comment} ; then
            # a C++ comment ends at newline or at EOF (empty "c")
            if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
                state.cxx.in_comment=false

                # flush comment text
                if ${state.cxx.comment_continued} ; then
                    comment_array[ca-1]+="${ch.newline}${comment}"
                    (( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
                       state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
                else
                    comment_array[ca++]="${comment}"
                    (( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
                       state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
                fi
                comment=""

                if (( ca > max_num_comments )) ; then
                    break
                fi
            else
                comment+="$c"
            fi
        elif ${state.in_sq_literal} ; then
            if [[ "$c" == "'" && "${prevch}" != '\' ]] ; then
                state.in_sq_literal=false
            fi
        elif ${state.in_dq_literal} ; then
            if [[ "$c" == '"' && "${prevch}" != '\' ]] ; then
                state.in_dq_literal=false
            fi
        else
            if [[ "$c2" == "/*" ]] ; then
                (( file_pos++, line_pos.x++ ))
                state.in_c_comment=true
                comment=""
            elif [[ "$c2" == "//" ]] ; then
                (( file_pos++, line_pos.x++ ))
                # same column, one line below the previous //-comment ?
                if (( state.cxx.comment_prev_pos.x == line_pos.x && \
                    state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
                    state.cxx.comment_continued=true
                else
                    state.cxx.comment_continued=false
                fi
                (( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
                state.cxx.in_comment=true
                comment=""
            elif [[ "$c" == "'" && "${prevch}" != '\' ]] ; then
                state.in_sq_literal=true
            elif [[ "$c" == '"' && "${prevch}" != '\' ]] ; then
                state.in_dq_literal=true
            fi
        fi

        # Iterations which consumed two characters ("/*", "*/", "//")
        # leave '/' or '*' here while the real predecessor is the
        # second character of that pair - neither is a backslash so
        # the escape tests above are not affected.
        prevch="$c"
    done

    if [[ "$comment" != "" ]] ; then
        print -u2 "## ERROR: Comment text buffer not empty at EOF."
        err=1
    fi

    if ${state.in_c_comment} ; then
        print -u2 "## ERROR: C comment did not close before EOF."
        err=1
    fi

    if ${state.cxx.in_comment} ; then
        print -u2 "## ERROR: C++ comment did not close before EOF."
        err=1
    fi

    if ${state.in_dq_literal} ; then
        print -u2 "## ERROR: Double-quoted literal did not close before EOF."
        err=1
    fi

    # We treat this one only as warning since things like "foo.html.cpp" may
    # trigger this condition accidentally
    if ${state.in_sq_literal} ; then
        print -u2 "## WARNING: Single-quoted literal did not close before EOF."
    fi

    return $err
}

# determine file type
# Arguments:
#   $1 - filename
#   $2 - name of the variable which receives the format name
#        (e.g. "c_source", "shell", "troff", ...)
# Returns: 0 if the format was determined, 1 if the file is not a
#          readable plain file or the format is unknown
function get_file_format
{
    set -o errexit

    typeset filename="$1"
    nameref file_format="$2"

    typeset fileeval # evaluation result of /usr/bin/file

    # check whether "filename" is a plain, readable file
    [[ ! -f "$filename" ]] && return 1
    [[ ! -r "$filename" ]] && return 1

    # In theory this code would exclusively look at the contents of
    # the file to figure out its file format - unfortunately
    # /usr/bin/file is virtually useless (the heuristics, matching
    # and output unreliable) for many file formats and therefore
    # we have to do a multi-stage approach which looks
    # at the file's content if possible and at the filename
    # otherwise. Fun... ;-(

    # pass one: Find matches for file formats where /usr/bin/file
    # is known to be unreliable:
    case "$filename" in
        *.[ch] | *.cpp | *.cc | *.cxx | *.hxx)
            file_format="c_source"
            return 0
            ;;
        *Imakefile)
            file_format="imakefile"
            return 0
            ;;
        *Makefile)
            file_format="makefile"
            return 0
            ;;
    esac

    # pass two: match by file content via /usr/bin/file
    fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
    case "$fileeval" in
        ~(E)roff)
            file_format="troff"
            return 0
            ;;
        ~(E)html\ document)
            file_format="html"
            return 0
            ;;
        ~(E)sgml\ document)
            file_format="sgml"
            return 0
            ;;
        ~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
            file_format="shell"
            return 0
            ;;
        ~(E)executable.*/perl\ script)
            file_format="perl"
            return 0
            ;;
    esac

    # pass three: fallback to filename matching
    case "$filename" in
        *.man)
            file_format="troff"
            return 0
            ;;
        *.html)
            file_format="html"
            return 0
            ;;
        *.sgml)
            file_format="sgml"
            return 0
            ;;
        *.xml)
            file_format="xml"
            return 0
            ;;
        *.png)
            file_format="image_png"
            return 0
            ;;
        *.xcf)
            file_format="image_xcf"
            return 0
            ;;
        *.shar)
            file_format="archive_shell"
            return 0
            ;;
        *.sh)
            file_format="shell"
            return 0
            ;;
        *.pcf)
            file_format="font_pcf"
            return 0
            ;;
        *.bdf)
            file_format="font_bdf"
            return 0
            ;;
        *.pmf)
            file_format="font_pmf"
            return 0
            ;;
        *.ttf | *.otf)
            file_format="font_ttf"
            return 0
            ;;
        *.pfa | *.pfb)
            file_format="font_postscript"
            return 0
            ;;
    esac

    return 1
}

# Create the record for one file: hash sums, detected file format and
# (for formats with a comment enumerator) the list of comments.
# Arguments:
#   $1 - name of the associative array of records (indexed by filename)
#   $2 - filename
#   $3 - maximum number of comments
#   $4 - maximum number of characters to scan
# Returns: 0 on success, 1 if the file format could not be determined
function extract_comments
{
    set -o errexit

    nameref records="$1"
    typeset filename="$2"
    integer max_num_comments="$3"
    integer max_filesize_for_scan="$4"

    typeset datatype=""

    records[${filename}]=(
        typeset filename="$filename"

        typeset fileformat_found="false" # "true" or "false"
        typeset file_format=""

        typeset -A hashsum

        typeset comments_parsed="false" # "true" or "false"
        typeset -a comments
    )

    records[${filename}].hashsum["md5"]="$(sum -x md5  < "$filename")"
    records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

    if get_file_format "$filename" datatype ; then
        records[${filename}].fileformat_found="true"
        records[${filename}].file_format="$datatype"
    else
        return 1
    fi

    # "comments_parsed" is only set if the enumerator succeeded
    case "$datatype" in
        c_source|imakefile)
            enumerate_comments_cpp "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
                records[${filename}].comments_parsed=true
            ;;
        shell|makefile)
            enumerate_comments_shell "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
                records[${filename}].comments_parsed=true
            ;;
        troff)
            enumerate_comments_troff "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
                records[${filename}].comments_parsed=true
            ;;
        # NOTE: Disabled for now
        #xml|html|sgml)
        #    enumerate_comments_xml "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
        #        records[${filename}].comments_parsed=true
        #    ;;
    esac

    return 0
}

# parse HTTP return code, cookies etc.
function parse_http_response
{
    # Reads the HTTP status line and the headers from stdin (up to and
    # including the empty line which ends the header section) and
    # stores the result in the compound variable named by $1: members
    # "statuscode" and "statusmsg" plus, if the matching header is
    # present, "content_type", "content_length" and "transfer_encoding".
    # Returns: 0 on success, 1 if the status line is malformed
    nameref response="$1"
    typeset h statuscode statusmsg i

    # we use '\r' as additional IFS to filter the final '\r'
    IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
    [[ "$h" != ~(Eil)HTTP/.* ]]         && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
    # "+" (not "*") since an empty status code is invalid, too
    [[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n"  "$0" ; return 1 ; }
    response.statuscode="$statuscode"
    response.statusmsg="$statusmsg"

    # skip remaining headers
    while IFS='' read -r i ; do
        [[ "$i" == $'\r' ]] && break

        # strip '\r' at the end
        i="${i/~(Er)$'\r'/}"

        case "$i" in
            ~(Eli)Content-Type:.*)
                response.content_type="${i/~(El).*:[[:blank:]]*/}"
                ;;
            ~(Eli)Content-Length:[[:blank:]]*[0-9]*)
                integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
                ;;
            ~(Eli)Transfer-Encoding:.*)
                response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
                ;;
        esac
    done

    return 0
}

# Copy a HTTP message body from stdin to stdout.
# Arguments:
#   $1 - transfer encoding; "chunked" bodies are decoded, everything
#        else is copied unmodified
# Returns: 0
function cat_http_body
{
    typeset emode="$1"
    typeset hexchunksize="0"
    typeset chunkend
    integer chunksize=0

    if [[ "${emode}" == "chunked" ]] ; then
        # Each chunk is "<hex size>\r\n<data>\r\n"; a chunk with
        # size 0 terminates the body.
        while IFS=$'\r' read -r hexchunksize &&
            [[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] &&
            (( chunksize=16#${hexchunksize} )) && (( chunksize > 0 )) ; do
            dd bs=1 count="${chunksize}" 2>/dev/null

            # Consume the "\r\n" which terminates the chunk data,
            # otherwise the next size line would be read as an
            # empty line and decoding would stop after the first
            # chunk.
            IFS='' read -r chunkend || break
        done
    else
        cat
    fi

    return 0
}

# Write the content of a file://, http:// or https:// URL to stdout.
# Arguments:
#   $1 - URL
# Returns: 0 on success, non-zero on failure or unsupported protocol
function cat_url
{
    typeset protocol="${1%://*}"
    typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

    if [[ "${protocol}" == "file" ]] ; then
        cat "${path1}"
        return $?
    elif [[ "${protocol}" == ~(Elr)http(|s) ]] ; then
        typeset host="${path1%%/*}"
        typeset path="${path1#*/}"
        typeset port="${host##*:}"
        typeset request

        integer netfd
        compound httpresponse # http response

        # If URL did not contain a port number in the host part then look at the
        # protocol to get the port number
        if [[ "${port}" == "${host}" ]] ; then
            case "${protocol}" in
                "http")  port=80 ;;
                "https") port=443 ;;
                *)       port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
            esac
        else
            host="${host%:*}"
        fi

        printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

        # prechecks
        [[ "${protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
        [[ "${port}"     != "" ]] || { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
        [[ "${host}"     != "" ]] || { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
        [[ "${path}"     != "" ]] || { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

        # open TCP channel
        if [[ "${protocol}" == "https" ]] ; then
            compound sslfifo
            sslfifo.dir="$(mktemp -d)"
            sslfifo.in="${sslfifo.dir}/in"
            sslfifo.out="${sslfifo.dir}/out"

            # register an EXIT trap and use "errexit" to leave it at the first error
            # (this saves lots of if/fi tests for error checking)
            trap "rm -r \"${sslfifo.dir}\"" EXIT
            set -o errexit

            mkfifo "${sslfifo.in}" "${sslfifo.out}"

            # create async openssl child to handle https
            openssl s_client -quiet -connect "${host}:${port}" <"${sslfifo.in}" >>"${sslfifo.out}" &

            # send HTTP request
            request="GET /${path} HTTP/1.1\r\n"
            request+="Host: ${host}\r\n"
            request+="User-Agent: crawlsrccomments/ksh93(ssl) (2009-05-08; $(uname -s -r -p))\r\n"
            request+="Connection: close\r\n"
            print -n -- "${request}\r\n" >>    "${sslfifo.in}"

            # collect response and send it to stdout
            {
                parse_http_response httpresponse
                cat_http_body "${httpresponse.transfer_encoding}"
            } <"${sslfifo.out}"

            wait || { print -u2 -f "%s: openssl failed.\n" "$0" ; exit 1 ; }

            return 0
        else
            redirect {netfd}<> "/dev/tcp/${host}/${port}"
            (( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${1}" ; return 1 ; }

            # send HTTP request
            request="GET /${path} HTTP/1.1\r\n"
            request+="Host: ${host}\r\n"
            request+="User-Agent: crawlsrccomments/ksh93 (2009-05-08; $(uname -s -r -p))\r\n"
            request+="Connection: close\r\n"
            print -n -- "${request}\r\n" >&${netfd}

            # collect response and send it to stdout
            parse_http_response httpresponse <&${netfd}
            cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

            # close connection
            redirect {netfd}<&-

            return 0
        fi
    else
        return 1
    fi
    # notreached
}

# Print statistics about the records as compound variable to stdout.
# Arguments:
#   $1 - (optional) name of the associative array of records; defaults
#        to a variable called "records" as seen from this function
# Returns: 0
function print_stats
{
    set -o errexit

    nameref recs=${1:-records}
    typeset i

    # gather some statistics
    compound stats=(
        integer files_with_comments=0
        integer files_without_comments=0

        integer files_without_known_format=0

        integer files_with_license_info=0
        integer files_without_license_info=0

        integer total_num_files=0
    )

    # the members tested below hold "true"/"false" and are
    # executed as commands
    for i in $(printf "%s\n" "${!recs[@]}" | sort) ; do
        if "${recs[$i].comments_parsed}" ; then
            (( stats.files_with_comments++ ))
        else
            (( stats.files_without_comments++ ))
        fi

        if ! "${recs[$i].fileformat_found}" ; then
            (( stats.files_without_known_format++ ))
        fi

        if "${recs[$i].license_info_found}" ; then
            (( stats.files_with_license_info++ ))
        else
            (( stats.files_without_license_info++ ))
        fi

        (( stats.total_num_files++ ))
    done

    print -v stats
    return 0
}


# Print all comments which pass the filters, one block per comment.
# Sets the member "license_info_found" of each record which passed
# the file filters.
# Arguments:
#   $1 - name of the associative array of records
#   $2 - name of the options compound variable (members
#        "filepattern.accept", "filepattern.reject",
#        "commentpattern.accept" and "commentpattern.reject" are used)
# Returns: 0
function print_comments_plain
{
    set -o errexit

    nameref records=$1
    nameref options=$2
    typeset i j

    for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
        nameref node=records[$i]

        if [[ "${options.filepattern.accept}" != "" ]] && \
           [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
            continue
        fi
        if [[ "${options.filepattern.reject}" != "" ]] && \
           [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
            continue
        fi

        node.license_info_found=false

        if ! "${node.comments_parsed}" ; then
            continue
        fi

        for j in "${!node.comments[@]}" ; do
            typeset s="${node.comments[$j]}"
            typeset match=false

            # "reject" overrides "accept"
            if [[ "${options.commentpattern.accept}" != "" ]] && \
               [[ "$s" == ${options.commentpattern.accept} ]] ; then
                match=true
            fi
            if [[ "${options.commentpattern.reject}" != "" ]] && \
               [[ "$s" == ${options.commentpattern.reject} ]] ; then
                match=false
            fi

            if "${match}" ; then
                printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
                printf "%s\n" "$s"
                node.license_info_found=true
            fi
        done

        if ! "${node.license_info_found}" ; then
            printf "## no match found in '%s'," "${node.filename}"
            printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
                "${node.comments_parsed}" \
                "${node.fileformat_found}" \
                "${node.file_format}"
        fi
    done

    return 0
}

# Like print_comments_plain but comments which are identical after
# normalisation (lowercase; runs of whitespace and punctuation replaced
# by a newline) are printed only once, followed by the list of files
# which contain them.
# Arguments:
#   $1 - name of the associative array of records
#   $2 - name of the options compound variable
# Returns: 0
function print_comments_duplicates_compressed
{
    set -o errexit

    nameref records=$1
    nameref options=$2
    typeset i j
    typeset hash
    typeset -A hashed_comments

    for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
        nameref node=records[$i]

        if [[ "${options.filepattern.accept}" != "" ]] && \
           [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
            continue
        fi
        if [[ "${options.filepattern.reject}" != "" ]] && \
           [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
            continue
        fi

        node.license_info_found=false

        if ! "${node.comments_parsed}" ; then
            continue
        fi

        for j in "${!node.comments[@]}" ; do
            typeset s="${node.comments[$j]}"
            typeset match=false

            # "reject" overrides "accept"
            if [[ "${options.commentpattern.accept}" != "" ]] && \
               [[ "$s" == ${options.commentpattern.accept} ]] ; then
                match=true
            fi
            if [[ "${options.commentpattern.reject}" != "" ]] && \
               [[ "$s" == ${options.commentpattern.reject} ]] ; then
                match=false
            fi


            if "${match}" ; then
                typeset -l hashstring # lowercase

                # compress the comment (e.g. convert whitespaces and '.,:;()"' to newline characters) ...
                hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
                # ... and then create a MD5 hash from this string
                hash="$(sum -x md5 <<<"${hashstring}")"

                nameref hc_node=hashed_comments[${hash}]

                if [[ "${hc_node}" == "" ]] ; then
                    # build node if there isn't one yet
                    typeset -a hc_node.fileids
                    typeset    hc_node.comment="$s"
                fi

                hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )

                node.license_info_found=true
            fi
        done

        if ! "${node.license_info_found}" ; then
            printf "## no match found in "
            printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
            printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
                "${node.comments_parsed}" \
                "${node.fileformat_found}" \
                "${node.file_format}"
        fi
    done

    # print comments and all fileids (filename+hash sums) which include this comment
    for i in "${!hashed_comments[@]}" ; do
        printf "\f## The comment (ID=%s) ..." "${i}"
        printf "\n-- snip --"
        printf "\n%s" "${hashed_comments[${i}].comment}"
        printf "\n-- snip --"
        printf "\n... applies to the following files:\n"
        printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array member
    done

    return 0
}

# "crawl" sub-command: read filenames from stdin (one per line),
# extract their comments and write the result as compound variable to
# "crawlsrccomments_extracted_comments.cpv".
# Arguments: the command line ($1 is the sub-command name)
function do_crawl
{
    set -o errexit

    typeset i

    compound options=(
        integer max_filesize_for_scan=$((256*1024))
        integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
    )

    shift
    while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
        printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
        case ${OPT} in
            S)    options.max_filesize_for_scan="${OPTARG}"  ;;
            N)    options.max_num_comments="${OPTARG}"  ;;
            *)    usage do_crawl_usage ;;
        esac
    done
    shift $((OPTIND-1))

    compound scan=(
        typeset -A records
    )

    # read filenames from stdin
    # (IFS='' and -r keep leading blanks and backslashes intact)
    while IFS='' read -r i ; do
        printf "## scanning %s ...\n" "$i"
        # failures for a single file must not stop the crawl
        extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
    done

    # print compound variable array (we strip the "typeset -A records" for now)
    print -v scan >"crawlsrccomments_extracted_comments.cpv"

    print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

    return 0
}

# "getcomments" sub-command: load the database written by "crawl"
# (plain, bzip2/gzip compressed or via http(s) URL), print the
# comments which pass the filters and optionally some statistics.
# Arguments: the command line ($1 is the sub-command name)
function do_getcomments
{
    set -o errexit

    # vars
    compound scan
    typeset database
    typeset tmp

    compound options=(
        typeset database="crawlsrccomments_extracted_comments.cpv"

        typeset print_stats=false
        typeset zapduplicates=false
        compound filepattern=(
            typeset accept="*"
            typeset reject=""
        )
        compound commentpattern=(
            typeset accept="~(Ei)(license|copyright)"
            typeset reject=""
        )
    )

    shift
    while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
    #    printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
        case ${OPT} in
            c)    options.commentpattern.accept="${OPTARG}" ;;
            C)    options.commentpattern.reject="${OPTARG}" ;;
            D)    options.database="${OPTARG}" ;;
            l)    options.filepattern.accept="${OPTARG}" ;;
            L)    options.filepattern.reject="${OPTARG}" ;;
            S)    options.print_stats=true ;;
            +S)   options.print_stats=false ;;
            Z)    options.zapduplicates=true ;;
            +Z)   options.zapduplicates=false ;;
            *)    usage do_getcomments_usage ;;
        esac
    done
    shift $((OPTIND-1))

    # array of temporary files which should be cleaned-up upon exit
    typeset -a tmpfiles
    trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

    # Support for HTTP URLs
    if [[ "${options.database}" == ~(El)(http|https)://.* ]] ; then
        # mktemp avoids predictable file names in /tmp
        database="$(mktemp "/tmp/extract_license_cat_url_XXXXXX")"
        tmpfiles+=( "${database}" )
        print -u2 "# Loading URL..."
        cat_url "${options.database}" >"${database}"
        print -u2 "# Loading URL done."
    else
        database="${options.database}"
    fi

    if [[ ! -r "${database}" ]] ; then
        fatal_error "Can't read ${database}."
    fi

    # Support for compressed database files
    case "$(LC_ALL=C /usr/bin/file "${database}")" in
        *bzip2*)
            tmp="$(mktemp "/tmp/extract_license_bzcat_XXXXXX")"
            tmpfiles+=( "${tmp}" )
            print -u2 "# Uncompressing data (bzip2) ..."
            bzcat <"${database}" >"${tmp}"
            print -u2 "# Uncompression done."
            database="${tmp}"
            ;;
        *gzip*)
            tmp="$(mktemp "/tmp/extract_license_gzcat_XXXXXX")"
            tmpfiles+=( "${tmp}" )
            print -u2 "# Uncompressing data (gzip) ..."
            gunzip -c <"${database}" >"${tmp}"
            print -u2 "# Uncompression done."
            database="${tmp}"
            ;;
    esac

    # Read compound variable which contain all recorded comments
    print -u2 "# reading records..."
    read -C scan <"${database}" || fatal_error 'Error reading data.'
    print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

    # print comments
    print -u2 "# processing data..."
    print "## comments start:"
    if "${options.zapduplicates}" ; then
        print_comments_duplicates_compressed scan.records options
    else
        print_comments_plain scan.records options
    fi
    print "## comments end"
    print -u2 "# processing data done."

    if "${options.print_stats}" ; then
        # "scan" is local to this function, therefore the records
        # have to be passed by name
        print_stats scan.records
    fi

    return 0
}

# Print the usage message for the getopts usage string stored in the
# variable named by $1 and exit with code 2.
function usage
{
    nameref usagemsg=$1
    OPTIND=0
    getopts -a "${progname}" "${usagemsg}" OPT '-?'
    exit 2
}

typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2009-05-09 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utilty script which extracts
    license information from the "\bgetcomments\b"-database
    file created by \bcrawl\b. The script allows various
    filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
    called \bcrawl\b will scan all source files, extract
    the comments and stores this information in a "database" file called
    "crawlsrccomments_extracted_comments.cpv" and then \bextract_license\b allows
    queries on this database.]
[D:database?Database file for input (either file, http:// or https://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2009-05-09 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utilty script which reads
    a list of source code files from stdin, determinates the type of
    syntax used by these files and then extracts
    comments from the source code and stores this information into a
    "database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
    be processed by \bextract_license\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
    Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2009-05-09 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utilty script which reads
    a list of source code files from stdin, determinates the type of
    syntax used by these files and then extracts
    comments from the source code and stores this information into a
    "database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
    be processed by \bextract_license\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'


# program start
builtin basename

# set "progname" first since fatal_error (used below) prints it
typeset progname="${ basename "${0}" ; }"

builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."

# exit at the first error we hit
set -o errexit

while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
#    printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
    case ${OPT} in
        *)    usage crawlsrccomments_usage ;;
    esac
done
shift $((OPTIND-1))

typeset cmd="$1"

# dispatch to the sub-command; the sub-command name is appended to
# "progname" so that messages and usage output show it
case "$cmd" in
    "crawl")
        progname+=" ${cmd}"
        do_crawl "$@"
        exit $?
        ;;
    "getcomments")
        progname+=" ${cmd}"
        do_getcomments "$@"
        exit $?
        ;;
    *)
        usage crawlsrccomments_usage
        ;;
esac

fatal_error "not reached."
# EOF.