1*da2e3ebdSchin /*********************************************************************** 2*da2e3ebdSchin * * 3*da2e3ebdSchin * This software is part of the ast package * 4*da2e3ebdSchin * Copyright (c) 1992-2007 AT&T Knowledge Ventures * 5*da2e3ebdSchin * and is licensed under the * 6*da2e3ebdSchin * Common Public License, Version 1.0 * 7*da2e3ebdSchin * by AT&T Knowledge Ventures * 8*da2e3ebdSchin * * 9*da2e3ebdSchin * A copy of the License is available at * 10*da2e3ebdSchin * http://www.opensource.org/licenses/cpl1.0.txt * 11*da2e3ebdSchin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12*da2e3ebdSchin * * 13*da2e3ebdSchin * Information and Software Systems Research * 14*da2e3ebdSchin * AT&T Research * 15*da2e3ebdSchin * Florham Park NJ * 16*da2e3ebdSchin * * 17*da2e3ebdSchin * Glenn Fowler <gsf@research.att.com> * 18*da2e3ebdSchin * David Korn <dgk@research.att.com> * 19*da2e3ebdSchin * * 20*da2e3ebdSchin ***********************************************************************/ 21*da2e3ebdSchin #pragma prototyped 22*da2e3ebdSchin /* 23*da2e3ebdSchin * uniq 24*da2e3ebdSchin * 25*da2e3ebdSchin * Written by David Korn 26*da2e3ebdSchin */ 27*da2e3ebdSchin 28*da2e3ebdSchin static const char usage[] = 29*da2e3ebdSchin "[-?\n@(#)$Id: uniq (AT&T Research) 2006-08-28 $\n]" 30*da2e3ebdSchin USAGE_LICENSE 31*da2e3ebdSchin "[+NAME?uniq - Report or filter out repeated lines in a file]" 32*da2e3ebdSchin "[+DESCRIPTION?\buniq\b reads an input, comparing adjacent lines, and " 33*da2e3ebdSchin "writing one copy of each input line on the output. The second " 34*da2e3ebdSchin "and succeeding copies of the repeated adjacent lines are not " 35*da2e3ebdSchin "written.]" 36*da2e3ebdSchin "[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes " 37*da2e3ebdSchin "to standard output. If no \ainfile\a is given, or if the \ainfile\a " 38*da2e3ebdSchin "is \b-\b, \buniq\b reads from standard input with the start of " 39*da2e3ebdSchin "the file is defined as the current offset.]" 40*da2e3ebdSchin "[c:count?Output the number of times each line occurred along with " 41*da2e3ebdSchin "the line.]" 42*da2e3ebdSchin "[d:repeated|duplicates?Output the first of each duplicate line.]" 43*da2e3ebdSchin "[D:all-repeated?Output all duplicate lines as a group with an empty " 44*da2e3ebdSchin "line delimiter specified by \adelimit\a:]:?[delimit:=none]" 45*da2e3ebdSchin "{" 46*da2e3ebdSchin "[n:none?Do not delimit duplicate groups.]" 47*da2e3ebdSchin "[p:prepend?Prepend an empty line before each group.]" 48*da2e3ebdSchin "[s:separate?Separate each group with an empty line.]" 49*da2e3ebdSchin "}" 50*da2e3ebdSchin "[f:skip-fields]#[fields?\afields\a is the number of fields to skip over " 51*da2e3ebdSchin "before checking for uniqueness. A field is the minimal string matching " 52*da2e3ebdSchin "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b.]" 53*da2e3ebdSchin "[i:ignore-case?Ignore case in comparisons.]" 54*da2e3ebdSchin "[s:skip-chars]#[chars?\achars\a is the number of characters to skip over " 55*da2e3ebdSchin "before checking for uniqueness. If specified along with \b-f\b, " 56*da2e3ebdSchin "the first \achars\a after the first \afields\a are ignored. If " 57*da2e3ebdSchin "the \achars\a specifies more characters than are on the line, " 58*da2e3ebdSchin "an empty string will be used for comparison.]" 59*da2e3ebdSchin "[u:unique?Output unique lines.]" 60*da2e3ebdSchin "[w:check-chars]#[chars?\achars\a is the number of characters to compare " 61*da2e3ebdSchin "after skipping any specified fields and characters.]" 62*da2e3ebdSchin "\n" 63*da2e3ebdSchin "\n[infile [outfile]]\n" 64*da2e3ebdSchin "\n" 65*da2e3ebdSchin "[+EXIT STATUS?]{" 66*da2e3ebdSchin "[+0?The input file was successfully processed.]" 67*da2e3ebdSchin "[+>0?An error occurred.]" 68*da2e3ebdSchin "}" 69*da2e3ebdSchin "[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]" 70*da2e3ebdSchin ; 71*da2e3ebdSchin 72*da2e3ebdSchin #include <cmd.h> 73*da2e3ebdSchin 74*da2e3ebdSchin #define C_FLAG 1 75*da2e3ebdSchin #define D_FLAG 2 76*da2e3ebdSchin #define U_FLAG 4 77*da2e3ebdSchin 78*da2e3ebdSchin #define CWIDTH 4 79*da2e3ebdSchin #define MAXCNT 9999 80*da2e3ebdSchin 81*da2e3ebdSchin typedef int (*Compare_f)(const char*, const char*, size_t); 82*da2e3ebdSchin 83*da2e3ebdSchin static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare) 84*da2e3ebdSchin { 85*da2e3ebdSchin register int n, f, outsize=0; 86*da2e3ebdSchin register char *cp, *ep, *bufp, *outp; 87*da2e3ebdSchin char *orecp, *sbufp=0, *outbuff; 88*da2e3ebdSchin int reclen,oreclen= -1,count=0,cwidth=0,sep,next; 89*da2e3ebdSchin if(mode&C_FLAG) 90*da2e3ebdSchin cwidth = CWIDTH+1; 91*da2e3ebdSchin while(1) 92*da2e3ebdSchin { 93*da2e3ebdSchin if(bufp = sfgetr(fdin,'\n',0)) 94*da2e3ebdSchin n = sfvalue(fdin); 95*da2e3ebdSchin else if(bufp = sfgetr(fdin,'\n',SF_LASTR)) 96*da2e3ebdSchin { 97*da2e3ebdSchin n = sfvalue(fdin); 98*da2e3ebdSchin bufp = memcpy(fmtbuf(n + 1), bufp, n); 99*da2e3ebdSchin bufp[n++] = '\n'; 100*da2e3ebdSchin } 101*da2e3ebdSchin else 102*da2e3ebdSchin n = 0; 103*da2e3ebdSchin if(n) 104*da2e3ebdSchin { 105*da2e3ebdSchin cp = bufp; 106*da2e3ebdSchin ep = cp + n; 107*da2e3ebdSchin if(f=fields) 108*da2e3ebdSchin while(f-->0 && cp<ep) /* skip over fields */ 109*da2e3ebdSchin { 110*da2e3ebdSchin while(cp<ep && *cp==' ' || *cp=='\t') 111*da2e3ebdSchin cp++; 112*da2e3ebdSchin while(cp<ep && *cp!=' ' && *cp!='\t') 113*da2e3ebdSchin cp++; 114*da2e3ebdSchin } 115*da2e3ebdSchin if(chars) 116*da2e3ebdSchin cp += chars; 117*da2e3ebdSchin if((reclen = n - (cp-bufp)) <=0) 118*da2e3ebdSchin { 119*da2e3ebdSchin reclen = 1; 120*da2e3ebdSchin cp = bufp + sfvalue(fdin)-1; 121*da2e3ebdSchin } 122*da2e3ebdSchin else if(width >= 0 && width < reclen) 123*da2e3ebdSchin reclen = width; 124*da2e3ebdSchin } 125*da2e3ebdSchin else 126*da2e3ebdSchin reclen=-2; 127*da2e3ebdSchin if(reclen==oreclen && (!reclen || !(*compare)(cp,orecp,reclen))) 128*da2e3ebdSchin { 129*da2e3ebdSchin count++; 130*da2e3ebdSchin if (!all) 131*da2e3ebdSchin continue; 132*da2e3ebdSchin next = count; 133*da2e3ebdSchin } 134*da2e3ebdSchin else 135*da2e3ebdSchin { 136*da2e3ebdSchin next = 0; 137*da2e3ebdSchin if(outsize>0) 138*da2e3ebdSchin { 139*da2e3ebdSchin if(((mode&D_FLAG)&&count==0) || ((mode&U_FLAG)&&count)) 140*da2e3ebdSchin { 141*da2e3ebdSchin if(outp!=sbufp) 142*da2e3ebdSchin sfwrite(fdout,outp,0); 143*da2e3ebdSchin } 144*da2e3ebdSchin else 145*da2e3ebdSchin { 146*da2e3ebdSchin if(cwidth) 147*da2e3ebdSchin { 148*da2e3ebdSchin outp[CWIDTH] = ' '; 149*da2e3ebdSchin if(count<MAXCNT) 150*da2e3ebdSchin { 151*da2e3ebdSchin sfsprintf(outp,cwidth,"%*d",CWIDTH,count+1); 152*da2e3ebdSchin outp[CWIDTH] = ' '; 153*da2e3ebdSchin } 154*da2e3ebdSchin else 155*da2e3ebdSchin { 156*da2e3ebdSchin outsize -= (CWIDTH+1); 157*da2e3ebdSchin if(outp!=sbufp) 158*da2e3ebdSchin { 159*da2e3ebdSchin if(!(sbufp=fmtbuf(outsize))) 160*da2e3ebdSchin return(1); 161*da2e3ebdSchin memcpy(sbufp,outp+CWIDTH+1,outsize); 162*da2e3ebdSchin sfwrite(fdout,outp,0); 163*da2e3ebdSchin outp = sbufp; 164*da2e3ebdSchin } 165*da2e3ebdSchin else 166*da2e3ebdSchin outp += CWIDTH+1; 167*da2e3ebdSchin sfprintf(fdout,"%4d ",count+1); 168*da2e3ebdSchin } 169*da2e3ebdSchin } 170*da2e3ebdSchin if(sfwrite(fdout,outp,outsize) != outsize) 171*da2e3ebdSchin return(1); 172*da2e3ebdSchin } 173*da2e3ebdSchin } 174*da2e3ebdSchin } 175*da2e3ebdSchin if(n==0) 176*da2e3ebdSchin break; 177*da2e3ebdSchin if(count = next) 178*da2e3ebdSchin { 179*da2e3ebdSchin if(sfwrite(fdout,outp,outsize) != outsize) 180*da2e3ebdSchin return(1); 181*da2e3ebdSchin if(*all >= 0) 182*da2e3ebdSchin *all = 1; 183*da2e3ebdSchin sep = 0; 184*da2e3ebdSchin } 185*da2e3ebdSchin else 186*da2e3ebdSchin sep = all && *all > 0; 187*da2e3ebdSchin /* save current record */ 188*da2e3ebdSchin if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0) 189*da2e3ebdSchin return(1); 190*da2e3ebdSchin outp = outbuff; 191*da2e3ebdSchin if(outsize < n+cwidth+sep) 192*da2e3ebdSchin { 193*da2e3ebdSchin /* no room in outp, clear lock and use side buffer */ 194*da2e3ebdSchin sfwrite(fdout,outp,0); 195*da2e3ebdSchin if(!(sbufp = outp=fmtbuf(outsize=n+cwidth+sep))) 196*da2e3ebdSchin return(1); 197*da2e3ebdSchin } 198*da2e3ebdSchin else 199*da2e3ebdSchin outsize = n+cwidth+sep; 200*da2e3ebdSchin memcpy(outp+cwidth+sep,bufp,n); 201*da2e3ebdSchin if(sep) 202*da2e3ebdSchin outp[cwidth] = '\n'; 203*da2e3ebdSchin oreclen = reclen; 204*da2e3ebdSchin orecp = outp+cwidth+sep + (cp-bufp); 205*da2e3ebdSchin } 206*da2e3ebdSchin return(0); 207*da2e3ebdSchin } 208*da2e3ebdSchin 209*da2e3ebdSchin int 210*da2e3ebdSchin b_uniq(int argc, char** argv, void* context) 211*da2e3ebdSchin { 212*da2e3ebdSchin register int n, mode=0; 213*da2e3ebdSchin register char *cp; 214*da2e3ebdSchin int fields=0, chars=0, width=-1; 215*da2e3ebdSchin Sfio_t *fpin, *fpout; 216*da2e3ebdSchin int* all = 0; 217*da2e3ebdSchin int sep; 218*da2e3ebdSchin Compare_f compare = (Compare_f)memcmp; 219*da2e3ebdSchin 220*da2e3ebdSchin cmdinit(argc, argv, context, ERROR_CATALOG, 0); 221*da2e3ebdSchin while (n = optget(argv, usage)) switch (n) 222*da2e3ebdSchin { 223*da2e3ebdSchin case 'c': 224*da2e3ebdSchin mode |= C_FLAG; 225*da2e3ebdSchin break; 226*da2e3ebdSchin case 'd': 227*da2e3ebdSchin mode |= D_FLAG; 228*da2e3ebdSchin break; 229*da2e3ebdSchin case 'D': 230*da2e3ebdSchin mode |= D_FLAG; 231*da2e3ebdSchin switch ((int)opt_info.num) 232*da2e3ebdSchin { 233*da2e3ebdSchin case 'p': 234*da2e3ebdSchin sep = 1; 235*da2e3ebdSchin break; 236*da2e3ebdSchin case 's': 237*da2e3ebdSchin sep = 0; 238*da2e3ebdSchin break; 239*da2e3ebdSchin default: 240*da2e3ebdSchin sep = -1; 241*da2e3ebdSchin break; 242*da2e3ebdSchin } 243*da2e3ebdSchin all = &sep; 244*da2e3ebdSchin break; 245*da2e3ebdSchin case 'i': 246*da2e3ebdSchin compare = (Compare_f)strncasecmp; 247*da2e3ebdSchin break; 248*da2e3ebdSchin case 'u': 249*da2e3ebdSchin mode |= U_FLAG; 250*da2e3ebdSchin break; 251*da2e3ebdSchin case 'f': 252*da2e3ebdSchin if(*opt_info.option=='-') 253*da2e3ebdSchin fields = opt_info.num; 254*da2e3ebdSchin else 255*da2e3ebdSchin chars = opt_info.num; 256*da2e3ebdSchin break; 257*da2e3ebdSchin case 's': 258*da2e3ebdSchin chars = opt_info.num; 259*da2e3ebdSchin break; 260*da2e3ebdSchin case 'w': 261*da2e3ebdSchin width = opt_info.num; 262*da2e3ebdSchin break; 263*da2e3ebdSchin case ':': 264*da2e3ebdSchin error(2, "%s", opt_info.arg); 265*da2e3ebdSchin break; 266*da2e3ebdSchin case '?': 267*da2e3ebdSchin error(ERROR_usage(2), "%s", opt_info.arg); 268*da2e3ebdSchin break; 269*da2e3ebdSchin } 270*da2e3ebdSchin argv += opt_info.index; 271*da2e3ebdSchin if(all && (mode&C_FLAG)) 272*da2e3ebdSchin error(2, "-c and -D are mutually exclusive"); 273*da2e3ebdSchin if(error_info.errors) 274*da2e3ebdSchin error(ERROR_usage(2), "%s", optusage(NiL)); 275*da2e3ebdSchin if((cp = *argv) && (argv++,!streq(cp,"-"))) 276*da2e3ebdSchin { 277*da2e3ebdSchin if(!(fpin = sfopen(NiL,cp,"r"))) 278*da2e3ebdSchin error(ERROR_system(1),"%s: cannot open",cp); 279*da2e3ebdSchin } 280*da2e3ebdSchin else 281*da2e3ebdSchin fpin = sfstdin; 282*da2e3ebdSchin if(cp = *argv) 283*da2e3ebdSchin { 284*da2e3ebdSchin argv++; 285*da2e3ebdSchin if(!(fpout = sfopen(NiL,cp,"w"))) 286*da2e3ebdSchin error(ERROR_system(1),"%s: cannot create",cp); 287*da2e3ebdSchin } 288*da2e3ebdSchin else 289*da2e3ebdSchin fpout = sfstdout; 290*da2e3ebdSchin if(*argv) 291*da2e3ebdSchin { 292*da2e3ebdSchin error(2, "too many arguments"); 293*da2e3ebdSchin error(ERROR_usage(2), "%s", optusage(NiL)); 294*da2e3ebdSchin } 295*da2e3ebdSchin error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare); 296*da2e3ebdSchin if(fpin!=sfstdin) 297*da2e3ebdSchin sfclose(fpin); 298*da2e3ebdSchin if(fpout!=sfstdout) 299*da2e3ebdSchin sfclose(fpout); 300*da2e3ebdSchin return(error_info.errors); 301*da2e3ebdSchin } 302*da2e3ebdSchin 303