#!/usr/bin/ksh93

#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
#

# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin

# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g. ',' instead of
# '.' in de_DE.*-locales). This needs to be set _before_ any
# floating-point constants are defined in this script.
if [[ "${LC_ALL}" != "" ]] ; then
	# LC_ALL would override LC_NUMERIC, so spread its value over the
	# other categories and drop it
	export \
		LC_MONETARY="${LC_ALL}" \
		LC_MESSAGES="${LC_ALL}" \
		LC_COLLATE="${LC_ALL}" \
		LC_CTYPE="${LC_ALL}"
	unset LC_ALL
fi
export LC_NUMERIC=C

# constants values for tokenizer/parser stuff
compound -r ch=(
	newline=$'\n'
	tab=$'\t'
	formfeed=$'\f'
)

# Print "<progname>: <message>" to stderr and terminate the script with
# exit code 1. "progname" is not set in this part of the file; it is
# read from the enclosing scope.
function fatal_error
{
	print -u2 "${progname}: $*"
	exit 1
}

# Print all arguments as one message to stderr.
function printmsg
{
	print -u2 "$*"
}


# Split an attribute string into an array of compound variables.
#   $1 - attribute string (e.g.: foo="bar" baz=1 qux)
#   $2 - name of the array which receives one entry per attribute; each
#        entry gets a "name" member and, if the attribute contained a
#        "=", a "value" member with one pair of surrounding quotes removed
# Calls fatal_error once more than 1000 entries have been stored.
function attrstrtoattrarray
{
#set -o xtrace
	typeset s="$1"
	nameref aa=$2 # attribute array
	integer aa_count=0
	typeset nextattr
	integer currattrlen=0
	typeset tagstr
	typeset tagval

	while (( ${#s} > 0 )) ; do
		# skip whitespaces; "currattrlen" still holds the length of the
		# attribute consumed by the previous iteration, so the scan
		# starts right behind it
		while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
			(( currattrlen++ ))
		done
		s="${s:currattrlen:${#s}}"

		# anything left ?
		(( ${#s} == 0 )) && break

		# Pattern tests:
		#x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
		#x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
		#x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\"]*\'}"
		#x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
		# All pattern combined via eregex (w|x|y|z):
		#x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\')}"
		nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\'|[[:alnum:]_-:]*)}"
		currattrlen=$(( ${#s} - ${#nextattr}))

		# add entry
		tagstr="${s:0:currattrlen}"
		if [[ "${tagstr}" == *=* ]] ; then
			# normal case: attribute with value

			tagval="${tagstr#*=}"

			# strip quotes ('' or "")
			if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
				tagval="${tagval:1:${#tagval}-2}"
			fi

			aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
		else
			# special case for HTML where you have something like <foo baz>
			aa[${aa_count}]=( name="${tagstr}" )
		fi
		(( aa_count++ ))
		(( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
	done
}

# XML document handler
# Callback for xml_tok: appends a node for each "tag_comment" event to the
# node array selected by the top of "stack" and ignores all other events.
#   $1 - name of the callbacks array (its "arg_tree" entry names the tree)
#   $2 - event type, $3 - tag value, $4 - attribute string
# "stack" is not defined here; it is read from the enclosing scope.
function handle_xml_document
{
#set -o xtrace
	nameref callbacks=${1}
	typeset tag_type="${2}"
	typeset tag_value="${3}"
	typeset tag_attributes="${4}"
	nameref doc=${callbacks["arg_tree"]}
	nameref nodepath="${stack.items[stack.pos]}"
	nameref nodesnum="${stack.items[stack.pos]}num"

	case "${tag_type}" in
		tag_comment)
			nodepath[${nodesnum}]+=(
				typeset tagtype="comment"
				typeset tagvalue="${tag_value}"
			)
			(( nodesnum++ ))
			;;
	esac

#	print "xmltok: '${tag_type}' = '${tag_value}'"
}

# Tokenize XML-like input read from stdin, character by character.
#   $1 - name of an associative array mapping the event names
#        "document_start", "tag_text", "tag_comment", "tag_begin",
#        "tag_end" and "document_end" to the commands to run
# Each non-empty entry is invoked as: <command> "$1" <event> [args...]
# A single newline is written to stdout once the input is exhausted.
function xml_tok
{
	typeset buf=""
	typeset namebuf=""
	typeset attrbuf=""
	typeset c=""
	typeset isendtag # bool: true/false
	typeset issingletag # bool: true/false (used for tags like "<br />")
	nameref callbacks=${1}

	[[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

	while IFS='' read -r -N 1 c ; do
		isendtag=false

		if [[ "$c" == "<" ]] ; then
			# flush any text content
			if [[ "$buf" != "" ]] ; then
				[[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
				buf=""
			fi

			# a "/" right after "<" marks an end tag
			IFS='' read -r -N 1 c
			if [[ "$c" == "/" ]] ; then
				isendtag=true
			else
				buf="$c"
			fi
			# read the remainder of the tag up to (excluding) ">"
			IFS='' read -r -d '>' c
			buf+="$c"

			# handle comments
			if [[ "$buf" == ~(El)!-- ]] ; then
				# did we read the comment completely ?
				if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
					# the ">" consumed above was part of the comment text
					buf+=">"
					while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
						IFS='' read -r -N 1 c || break
						buf+="$c"
					done
				fi

				# pass the text between the leading "!--" and the trailing "--"
				[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
				buf=""
				continue
			fi

			# check if the tag starts and ends at the same time (like "<br />")
			if [[ "${buf}" == ~(Er).*/ ]] ; then
				issingletag=true
				buf="${buf%*/}"
			else
				issingletag=false
			fi

			# check if the tag has attributes (e.g. space after name)
			if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
				namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
				attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
			else
				namebuf="$buf"
				attrbuf=""
			fi

			if ${isendtag} ; then
				[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
			else
				[[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

				# handle tags like <br/> (which are start- and end-tag in one piece)
				if ${issingletag} ; then
					[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
				fi
			fi
			buf=""
		else
			buf+="$c"
		fi
	done

	[[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

	print # final newline to make filters like "sed" happy
}

# enumerate comments in a shell (or shell-like) script
#   $1 - input file
#   $2 - name of the array receiving one entry per block of consecutive
#        lines which start with "#" (leading "#" removed, lines joined
#        with newlines)
#   $3 - comment limit
# NOTE: the limit is checked after an entry has been stored, so up to
# max_num_comments+1 entries can be collected.
# Always returns 0.
function enumerate_comments_shell
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	typeset line
	typeset comment=""

	# "read" is used as loop condition so that its non-zero exit code at
	# EOF cannot trip "errexit"; the second test keeps a final line
	# which has no trailing newline
	while IFS='' read -r line || [[ "${line}" != "" ]] ; do
		if [[ "${line}" == ~(El)#.* ]] ; then
			comment+="${line#\#}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	# flush a comment block which runs up to EOF
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
	fi

	return 0
}


# enumerate comments in a troff document
#   $1 - input file
#   $2 - name of the array receiving one entry per block of consecutive
#        comment lines (comment prefix removed, lines joined with newlines)
#   $3 - comment limit
# NOTE: the limit is checked after an entry has been stored, so up to
# max_num_comments+1 entries can be collected.
# Always returns 0.
function enumerate_comments_troff
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	typeset line
	typeset comment=""

	# "read" is used as loop condition so that its non-zero exit code at
	# EOF cannot trip "errexit"; the second test keeps a final line
	# which has no trailing newline
	while IFS='' read -r line || [[ "${line}" != "" ]] ; do
		if [[ "${line}" == ~(El)\.*\\\" ]] ; then
			comment+="${line#~(El)\.*\\\"}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	# flush a comment block which runs up to EOF
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
	fi

	return 0
}


# enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.)
#   $1 - input file
#   $2 - name of the array receiving the comment texts; a "//" comment
#        which starts in the same column on the line directly below the
#        previous "//" comment is appended to that previous entry
#   $3 - comment limit (checked after storing, so up to
#        max_num_comments+1 entries can be collected)
#   $4 - maximum number of characters of the file to scan
# Returns 0 on success, 1 if a comment or a double-quoted literal was
# still open at the end of the scanned text.
# NOTE(review): a quote counts as escaped whenever the preceding character
# is a backslash, so a literal ending in an escaped backslash (e.g. '\\')
# is not recognized as closed.
function enumerate_comments_cpp
{
	set -o errexit
#	set -o nounset

	integer err=0

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"
	integer ca=0 # index in "comment_array"

	typeset content
	integer content_length

	integer file_pos # file position
	compound line_pos=(
		integer x=0 # X position in line
		integer y=0 # Y position in line (line number)
	)
	typeset c c2

	typeset comment

	compound state=(
		# C comment state
		typeset in_c_comment=false
		# C++ comment state
		compound cxx=(
			typeset in_comment=false
			typeset comment_continued=false
			# position of current //-pos
			compound comment_pos=(
				integer x=-1
				integer y=-1
			)
			# position of previous //-pos
			compound comment_prev_pos=(
				integer x=-1
				integer y=-1
			)
		)
		# literal state
		typeset in_sq_literal=false # single-quote literal
		typeset in_dq_literal=false # double-quote literal
	)

	content="$(< "${input_file}")"

	# Truncate file to "max_filesize_for_scan" characters.
	# This was originally added to work around a performance problem with
	# the ${str:offset:chunksize} operator which scales badly in ksh93
	# version 's' with the number of characters
	if (( ${#content} > max_filesize_for_scan )) ; then
		print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
			"${input_file}" \
			max_filesize_for_scan
		content="${content:0:max_filesize_for_scan}"
	fi
	content_length=${#content}

	# Iterate through the source code. The last character
	# (when file_pos == content_length) will be empty to indicate
	# EOF (this is needed for cases like when
	# a C++ comment is not terminated by a newline... ;-/)
	for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
		# "c2" is the two-character lookahead, "c" the current character
		c2="${content:file_pos:2}"
		c="${c2:0:1}"

		if [[ "$c" == "${ch.newline}" ]] ; then
			(( line_pos.x=0, line_pos.y++ ))
		else
			(( line_pos.x++ ))
		fi

		if ${state.in_c_comment} ; then
			if [[ "$c2" == "*/" ]] ; then
				# step over the second character of the terminator
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=false

				# flush comment text
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.cxx.in_comment} ; then
			if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
				state.cxx.in_comment=false

				# flush comment text
				if ${state.cxx.comment_continued} ; then
					comment_array[ca-1]+="${ch.newline}${comment}"
					(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
					   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				else
					comment_array[ca++]="${comment}"
					(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
					   state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
				fi
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			else
				comment+="$c"
			fi
		elif ${state.in_sq_literal} ; then
			if [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=false
			fi
		elif ${state.in_dq_literal} ; then
			if [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=false
			fi
		else
			if [[ "$c2" == "/*" ]] ; then
				(( file_pos++, line_pos.x++ ))
				state.in_c_comment=true
				comment=""
			elif [[ "$c2" == "//" ]] ; then
				(( file_pos++, line_pos.x++ ))
				# same column, one line below the previous "//" comment ?
				if (( state.cxx.comment_prev_pos.x == line_pos.x && \
					state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
					state.cxx.comment_continued=true
				else
					state.cxx.comment_continued=false
				fi
				(( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
				state.cxx.in_comment=true
				comment=""
			elif [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_sq_literal=true
			elif [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
				state.in_dq_literal=true
			fi
		fi
	done

	if [[ "$comment" != "" ]] ; then
		print -u2 "## ERROR: Comment text buffer not empty at EOF."
		err=1
	fi

	if ${state.in_c_comment} ; then
		print -u2 "## ERROR: C comment did not close before EOF."
		err=1
	fi

	if ${state.cxx.in_comment} ; then
		print -u2 "## ERROR: C++ comment did not close before EOF."
		err=1
	fi

	if ${state.in_dq_literal} ; then
		print -u2 "## ERROR: Double-quoted literal did not close before EOF."
		err=1
	fi

	# We treat this one only as warning since things like "foo.html.cpp" may
	# trigger this condition accidentally
	if ${state.in_sq_literal} ; then
		print -u2 "## WARNING: Single-quoted literal did not close before EOF."
	fi

	return $err
}

# determine file type
#   $1 - file name
#   $2 - name of the variable receiving the format name (e.g. "c_source",
#        "shell", "troff")
# Returns 0 if a format was determined, 1 if the file is not a readable
# plain file or no rule matched.
function get_file_format
{
	set -o errexit

	typeset filename="$1"
	nameref file_format="$2"

	typeset fileeval # evaluation result of /usr/bin/file

	# check whether "filename" is a plain, readable file
	[[ ! -f "$filename" ]] && return 1
	[[ ! -r "$filename" ]] && return 1

	# In theory this code would exclusively look at the contents of
	# the file to figure out its file format - unfortunately
	# /usr/bin/file is virtually useless (the heuristics, matching
	# and output unreliable) for many file formats and therefore
	# we have to do a multi-stage approach which looks
	# at the file's content if possible and at the filename
	# otherwise. Fun... ;-(

	# pass one: Find matches for file formats where /usr/bin/file
	# is known to be unreliable:
	case "$filename" in
		*.[ch] | *.cpp | *.cc | *.cxx | *.hxx)
			file_format="c_source"
			return 0
			;;
		*Imakefile)
			file_format="imakefile"
			return 0
			;;
		*Makefile)
			file_format="makefile"
			return 0
			;;
	esac

	# pass two: match by file content via /usr/bin/file
	fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
	case "$fileeval" in
		~(E)roff)
			file_format="troff"
			return 0
			;;
		~(E)html\ document)
			file_format="html"
			return 0
			;;
		~(E)sgml\ document)
			file_format="sgml"
			return 0
			;;
		~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
			file_format="shell"
			return 0
			;;
		~(E)executable.*/perl\ script)
			file_format="perl"
			return 0
			;;
	esac

	# pass three: fallback to filename matching
	case "$filename" in
		*.man)
			file_format="troff"
			return 0
			;;
		*.html)
			file_format="html"
			return 0
			;;
		*.sgml)
			file_format="sgml"
			return 0
			;;
		*.xml)
			file_format="xml"
			return 0
			;;
		*.png)
			file_format="image_png"
			return 0
			;;
		*.xcf)
			file_format="image_xcf"
			return 0
			;;
		*.shar)
			file_format="archive_shell"
			return 0
			;;
		*.sh)
			file_format="shell"
			return 0
			;;
		*.pcf)
			file_format="font_pcf"
			return 0
			;;
		*.bdf)
			file_format="font_bdf"
			return 0
			;;
		*.pmf)
			file_format="font_pmf"
			return 0
			;;
		*.ttf | *.otf)
			file_format="font_ttf"
			return 0
			;;
		*.pfa | *.pfb)
			file_format="font_postscript"
			return 0
			;;
	esac

	return 1
}

# Create the record for one file: hash sums, detected file format and -
# for formats with a matching enumerator - the comments found in it.
#   $1 - name of the associative array of records (indexed by file name)
#   $2 - file name
#   $3 - comment limit, passed on to the enumerator
#   $4 - maximum number of characters to scan, passed on to the enumerator
# Returns 1 if the file format could not be determined, 0 otherwise (also
# when the enumerator failed; "comments_parsed" then stays "false").
function extract_comments
{
	set -o errexit

	nameref records="$1"
	typeset filename="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"

	typeset datatype=""

	records[${filename}]=(
		typeset filename="$filename"

		typeset fileformat_found="false" # "true" or "false"
		typeset file_format=""

		typeset -A hashsum

		typeset comments_parsed="false" # "true" or "false"
		typeset -a comments
	)

	records[${filename}].hashsum["md5"]="$(sum -x md5 < "$filename")"
	records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

	if get_file_format "$filename" datatype ; then
		records[${filename}].fileformat_found="true"
		records[${filename}].file_format="$datatype"
	else
		return 1
	fi

	# the enumerators run as part of an "&&" list, so their failure does
	# not trigger "errexit"
	case "$datatype" in
		c_source|imakefile)
			enumerate_comments_cpp "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		shell|makefile)
			enumerate_comments_shell "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		troff)
			enumerate_comments_troff "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
				records[${filename}].comments_parsed=true
			;;
		# NOTE: Disabled for now
		#xml|html|sgml)
		#	enumerate_comments_xml "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
		#		records[${filename}].comments_parsed=true
		#	;;
	esac

	return 0
}

# parse HTTP return code, cookies etc.
#
# parse_http_response - read the HTTP status line and headers from stdin
#
# Arguments:
#	$1	name of a compound variable which receives the members
#		"statuscode" and "statusmsg" and - if the matching
#		header is present - "content_type", "content_length"
#		and "transfer_encoding"
#
# Reading stops after the empty line which ends the header section, so
# the body remains unread on stdin.
# Returns 1 (with a message on stderr) if the status line does not start
# with "HTTP/" or carries no numeric status code, otherwise 0.
#
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]] && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	# "+" and not "*": an empty status code must be rejected, too
	[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# skip remaining headers
	while IFS='' read -r i ; do
		[[ "$i" == $'\r' ]] && break

		# strip '\r' at the end
		i="${i/~(Er)$'\r'/}"

		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${i/~(El).*:[[:blank:]]*/}"
				;;
			# require at least one digit so that a header without
			# a number is not turned into an arithmetic expression
			~(Eli)Content-Length:[[:blank:]]*[0-9]+)
				integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
				;;
		esac
	done

	return 0
}

#
# cat_http_body - copy a HTTP body from stdin to stdout
#
# Arguments:
#	$1	transfer encoding; "chunked" enables decoding of the
#		chunked transfer coding, any other value copies stdin
#		unmodified
#
# Always returns 0.
#
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	typeset chunkend	# receives the (empty) line which ends a chunk
	integer chunksize=0

	if [[ "${emode}" == "chunked" ]] ; then
		# each chunk is "<hexsize>[;extensions]\r\n<data>\r\n", the
		# list is terminated by a chunk of size 0
		while IFS=$'\r' read -r hexchunksize ; do
			# drop optional chunk extensions
			hexchunksize="${hexchunksize%%;*}"

			[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break
			(( (chunksize=16#${hexchunksize}) > 0 )) || break

			dd bs=1 count="${chunksize}" 2>/dev/null

			# consume the "\r\n" which follows the chunk data,
			# otherwise the next size line would be read as an
			# empty string and end the loop after the first chunk
			IFS=$'\r' read -r chunkend || break
		done
	else
		cat
	fi

	return 0
}

#
# cat_url - write the content of a file://, http:// or https:// URL
# to stdout
#
# Arguments:
#	$1	URL
#
# Returns 1 for unsupported protocols, incomplete URLs (no port, host
# or path) and if the TCP connection cannot be opened; for file:// URLs
# the exit code of "cat" is returned.
#
function cat_url
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	if [[ "${protocol}" == "file" ]] ; then
		cat "${path1}"
		return $?
	elif [[ "${protocol}" == ~(Elr)http(|s) ]] ; then
		typeset host="${path1%%/*}"
		typeset path=""
		typeset port="${host##*:}"
		typeset request		# text of the HTTP request

		integer netfd
		compound httpresponse # http response

		# only take a path if the URL has a '/' after the host part,
		# otherwise the host name itself would be requested as path
		if [[ "${path1}" == */* ]] ; then
			path="${path1#*/}"
		fi

		# If URL did not contain a port number in the host part then look at the
		# protocol to get the port number
		if [[ "${port}" == "${host}" ]] ; then
			case "${protocol}" in
				"http")  port=80 ;;
				"https") port=443 ;;
				*)       port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
			esac
		else
			host="${host%:*}"
		fi

		printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

		# prechecks
		[[ "${protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
		[[ "${port}"     != "" ]] || { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
		[[ "${host}"     != "" ]] || { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
		[[ "${path}"     != "" ]] || { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

		# open TCP channel
		if [[ "${protocol}" == "https" ]] ; then
			compound sslfifo
			sslfifo.dir="$(mktemp -d)"
			sslfifo.in="${sslfifo.dir}/in"
			sslfifo.out="${sslfifo.dir}/out"

			# register an EXIT trap and use "errexit" to leave it at the first error
			# (this saves lots of if/fi tests for error checking)
			trap "rm -r \"${sslfifo.dir}\"" EXIT
			set -o errexit

			mkfifo "${sslfifo.in}" "${sslfifo.out}"

			# create async openssl child to handle https
			openssl s_client -quiet -connect "${host}:${port}" <"${sslfifo.in}" >>"${sslfifo.out}" &

			# send HTTP request
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93(ssl) (2010-03-27; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >> "${sslfifo.in}"

			# collect response and send it to stdout
			{
				parse_http_response httpresponse
				cat_http_body "${httpresponse.transfer_encoding}"
			} <"${sslfifo.out}"

			# NOTE(review): "wait" without operands normally reports
			# success regardless of the exit code of the child, so this
			# check looks ineffective. Waiting for the pid of openssl
			# would make it effective - confirm the exit codes of
			# "openssl s_client" before changing this.
			wait || { print -u2 -f "%s: openssl failed.\n" "$0" ; exit 1 ; }

			return 0
		else
			redirect {netfd}<> "/dev/tcp/${host}/${port}"
			(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${1}" ; return 1 ; }

			# send HTTP request
			request="GET /${path} HTTP/1.1\r\n"
			request+="Host: ${host}\r\n"
			request+="User-Agent: crawlsrccomments/ksh93 (2010-03-27; $(uname -s -r -p))\r\n"
			request+="Connection: close\r\n"
			print -n -- "${request}\r\n" >&${netfd}

			# collect response and send it to stdout
			parse_http_response httpresponse <&${netfd}
			cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

			# close connection
			redirect {netfd}<&-

			return 0
		fi
	else
		return 1
	fi
	# notreached
}

#
# print_stats - print a summary of the scanned records to stdout
#
# Arguments:
#	$1	name of the associative array of records (optional; an
#		empty set of records is assumed if it is missing)
#
# The "license_info_found" member of a record is set by
# print_comments_plain or print_comments_duplicates_compressed, records
# which were not processed by them count as "without license info".
#
function print_stats
{
	set -o errexit

	# the records must be handed in by name, variables of the caller
	# are not visible here
	typeset -A norecords
	nameref records=${1:-norecords}
	typeset i

	# gather some statistics
	compound stats=(
		integer files_with_comments=0
		integer files_without_comments=0

		integer files_without_known_format=0

		integer files_with_license_info=0
		integer files_without_license_info=0

		integer total_num_files=0
	)

	# NOTE: we use "+=1" instead of "++" because the post-increment
	# of a zero value returns a non-zero exit code which triggers
	# "errexit"
	for i in "${!records[@]}" ; do
		if [[ "${records[$i].comments_parsed}" == "true" ]] ; then
			(( stats.files_with_comments+=1 ))
		else
			(( stats.files_without_comments+=1 ))
		fi

		if [[ "${records[$i].fileformat_found}" != "true" ]] ; then
			(( stats.files_without_known_format+=1 ))
		fi

		if [[ "${records[$i].license_info_found}" == "true" ]] ; then
			(( stats.files_with_license_info+=1 ))
		else
			(( stats.files_without_license_info+=1 ))
		fi

		(( stats.total_num_files+=1 ))
	done

	print -v stats
	return 0
}


#
# print_comments_plain - print all matching comments, file by file
#
# Arguments:
#	$1	name of the associative array of records
#	$2	name of the compound variable with the options; the
#		members filepattern.accept, filepattern.reject,
#		commentpattern.accept and commentpattern.reject are used
#
# Sets the member "license_info_found" of each record which passes the
# file name filters. Always returns 0.
#
function print_comments_plain
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset s
	typeset match

	# walk over the sorted file names; they are read line by line so
	# that names with blanks or pattern characters are kept intact
	while IFS= read -r i ; do
		# an empty set of records yields one empty line
		if [[ "$i" == "" ]] ; then
			continue
		fi

		nameref node="records[$i]"

		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if ! "${node.comments_parsed}" ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			s="${node.comments[$j]}"
			match=false

			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			# the reject pattern overrides the accept pattern
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if "${match}" ; then
				printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
				printf "%s\n" "$s"
				node.license_info_found=true
			fi
		done

		if ! "${node.license_info_found}" ; then
			printf "## no match found in '%s'," "${node.filename}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done <<<"$(printf "%s\n" "${!records[@]}" | sort)"

	return 0
}

#
# print_comments_duplicates_compressed - print each matching comment
# once, followed by the list of files which contain it
#
# Comments are treated as duplicates if they have the same MD5 sum
# after they were converted to lower case and all runs of whitespace
# and punctuation were replaced by a newline.
#
# Arguments:
#	$1	name of the associative array of records
#	$2	name of the compound variable with the options (same
#		members as used by print_comments_plain)
#
# Sets the member "license_info_found" of each record which passes the
# file name filters. Always returns 0.
#
function print_comments_duplicates_compressed
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset s
	typeset match
	typeset hash
	typeset -l hashstring # lowercase
	typeset -A hashed_comments

	# walk over the sorted file names; they are read line by line so
	# that names with blanks or pattern characters are kept intact
	while IFS= read -r i ; do
		# an empty set of records yields one empty line
		if [[ "$i" == "" ]] ; then
			continue
		fi

		nameref node="records[$i]"

		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		if ! "${node.comments_parsed}" ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			s="${node.comments[$j]}"
			match=false

			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			# the reject pattern overrides the accept pattern
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if "${match}" ; then
				# compress the comment (e.g. convert whitespaces and '.,:;()"' to newline characters) ...
				hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
				# ... and then create a MD5 hash from this string
				hash="$(sum -x md5 <<<"${hashstring}")"

				nameref hc_node=hashed_comments[${hash}]

				if [[ "${hc_node}" == "" ]] ; then
					# build node if there isn't one yet
					typeset -a hc_node.fileids
					typeset    hc_node.comment="$s"
				fi

				hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )

				node.license_info_found=true
			fi
		done

		if ! "${node.license_info_found}" ; then
			printf "## no match found in "
			printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done <<<"$(printf "%s\n" "${!records[@]}" | sort)"

	# print comments and all fileids (filename+hash sums) which include this comment
	for i in "${!hashed_comments[@]}" ; do
		printf "\f## The comment (ID=%s) ..." "${i}"
		printf "\n-- snip --"
		printf "\n%s" "${hashed_comments[${i}].comment}"
		printf "\n-- snip --"
		printf "\n... applies to the following files:\n"
		printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array member
	done

	return 0
}

#
# do_crawl - implementation of the "crawl" subcommand
#
# Reads file names (one per line) from stdin, extracts the comments of
# each file and writes the collected records as compound variable to
# "crawlsrccomments_extracted_comments.cpv" in the current directory.
#
# Arguments:
#	$1	name of the subcommand (skipped)
#	$2...	options, see ${do_crawl_usage}
#
function do_crawl
{
	set -o errexit

	typeset i

	compound options=(
		integer max_filesize_for_scan=$((256*1024))
		integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
	)

	shift
	while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
		printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			S)	options.max_filesize_for_scan="${OPTARG}"  ;;
			N)	options.max_num_comments="${OPTARG}"  ;;
			*)	usage do_crawl_usage ;;
		esac
	done
	shift $((OPTIND-1))

	compound scan=(
		typeset -A records
	)

	# read filenames from stdin ("IFS=" and "-r" keep blanks and
	# backslashes in the names intact)
	while IFS= read -r i ; do
		# an empty line is not a file name
		if [[ "$i" == "" ]] ; then
			continue
		fi

		printf "## scanning %s ...\n" "$i"
		# a file which cannot be processed must not stop the crawl
		extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
	done

	# print compound variable array (we strip the "typeset -A records" for now)
	print -v scan >"crawlsrccomments_extracted_comments.cpv"

	print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

	return 0
}

#
# do_getcomments - implementation of the "getcomments" subcommand
#
# Loads the database written by "crawl" (a file or a http://, https://
# URL, optionally bzip2 or gzip compressed) and prints the comments
# which pass the filters given by the options.
#
# Arguments:
#	$1	name of the subcommand (skipped)
#	$2...	options, see ${do_getcomments_usage}
#
function do_getcomments
{
	set -o errexit

	# vars
	compound scan
	typeset database
	typeset tmp

	compound options=(
		typeset database="crawlsrccomments_extracted_comments.cpv"

		typeset print_stats=false
		typeset zapduplicates=false
		compound filepattern=(
			typeset accept="*"
			typeset reject=""
		)
		compound commentpattern=(
			typeset accept="~(Ei)(license|copyright)"
			typeset reject=""
		)
	)

	shift
	while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
	#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			c)	options.commentpattern.accept="${OPTARG}" ;;
			C)	options.commentpattern.reject="${OPTARG}" ;;
			D)	options.database="${OPTARG}" ;;
			l)	options.filepattern.accept="${OPTARG}" ;;
			L)	options.filepattern.reject="${OPTARG}" ;;
			S)	options.print_stats=true ;;
			+S)	options.print_stats=false ;;
			Z)	options.zapduplicates=true ;;
			+Z)	options.zapduplicates=false ;;
			*)	usage do_getcomments_usage ;;
		esac
	done
	shift $((OPTIND-1))

	# array of temporary files which should be cleaned-up upon exit
	typeset -a tmpfiles
	trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

	# Support for HTTP URLs
	if [[ "${options.database}" == ~(El)(http|https)://.* ]] ; then
		# use mktemp, a predictable name in /tmp could be
		# pre-created by another user
		database="$(mktemp "/tmp/extract_license_cat_url_XXXXXX")"
		tmpfiles+=( "${database}" )
		print -u2 "# Loading URL..."
		cat_url "${options.database}" >"${database}"
		print -u2 "# Loading URL done."
	else
		database="${options.database}"
	fi

	if [[ ! -r "${database}" ]] ; then
		fatal_error "Can't read ${database}."
	fi

	# Support for compressed database files
	case "$(LC_ALL=C /usr/bin/file "${database}")" in
		*bzip2*)
			tmp="$(mktemp "/tmp/extract_license_bzcat_XXXXXX")"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (bzip2) ..."
			bzcat <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
		*gzip*)
			tmp="$(mktemp "/tmp/extract_license_gunzip_XXXXXX")"
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (gzip) ..."
			gunzip -c <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
	esac

	# Read compound variable which contain all recorded comments
	print -u2 "# reading records..."
	read -C scan <"${database}" || fatal_error 'Error reading data.'
	print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

	# print comments
	print -u2 "# processing data..."
	print "## comments start:"
	if "${options.zapduplicates}" ; then
		print_comments_duplicates_compressed scan.records options
	else
		print_comments_plain scan.records options
	fi
	print "## comments end"
	print -u2 "# processing data done."

	if "${options.print_stats}" ; then
		# the records must be passed by name
		print_stats scan.records
	fi

	return 0
}

#
# usage - print the usage message and exit with exit code 2
#
# Arguments:
#	$1	name of the variable which holds the getopts usage string
#
function usage
{
	nameref usagemsg=$1
	OPTIND=0
	getopts -a "${progname}" "${usagemsg}" OPT '-?'
	exit 2
}

typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utilty script which extracts
	license information from the "\bgetcomments\b"-database
	file created by \bcrawl\b. The script allows various
	filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
	called \bcrawl\b will scan all source files, extract
	the comments and stores this information in a "database" file called
	"crawlsrccomments_extracted_comments.cpv" and then \bextract_license\b allows
	queries on this database.]
[D:database?Database file for input (either file, http:// or https://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utilty script which reads
	a list of source code files from stdin, determinates the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bextract_license\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
	Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'

typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utilty script which reads
	a list of source code files from stdin, determinates the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bextract_license\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'


# program start
builtin basename
builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."

# from here on every unchecked failure ends the script
set -o errexit

# name of this script, used as prefix of messages and by getopts
typeset progname="$(basename "${0}")"

# the main program has no options of its own, every option
# ends up in the usage message
while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
	usage crawlsrccomments_usage
done
shift $((OPTIND-1))

typeset cmd="$1"

# run the function which implements the subcommand ("do_crawl" or
# "do_getcomments"); it gets the subcommand name as first argument
case "$cmd" in
	"crawl" | "getcomments")
		progname+=" ${cmd}"
		"do_${cmd}" "$@"
		exit $?
		;;
	*)
		usage crawlsrccomments_usage
		;;
esac

fatal_error "not reached."
# EOF.