1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1992-2008 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * * 20 ***********************************************************************/ 21 #pragma prototyped 22 /* 23 * uniq 24 * 25 * Written by David Korn 26 */ 27 28 static const char usage[] = 29 "[-n?\n@(#)$Id: uniq (AT&T Research) 2008-04-24 $\n]" 30 USAGE_LICENSE 31 "[+NAME?uniq - Report or filter out repeated lines in a file]" 32 "[+DESCRIPTION?\buniq\b reads an input, comparing adjacent lines, and " 33 "writing one copy of each input line on the output. The second " 34 "and succeeding copies of the repeated adjacent lines are not " 35 "written.]" 36 "[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes " 37 "to standard output. If no \ainfile\a is given, or if the \ainfile\a " 38 "is \b-\b, \buniq\b reads from standard input with the start of " 39 "the file is defined as the current offset.]" 40 "[c:count?Output the number of times each line occurred along with " 41 "the line.]" 42 "[d:repeated|duplicates?Output the first of each duplicate line.]" 43 "[D:all-repeated?Output all duplicate lines as a group with an empty " 44 "line delimiter specified by \adelimit\a:]:?[delimit:=none]" 45 "{" 46 "[n:none?Do not delimit duplicate groups.]" 47 "[p:prepend?Prepend an empty line before each group.]" 48 "[s:separate?Separate each group with an empty line.]" 49 "}" 50 "[f:skip-fields]#[fields?\afields\a is the number of fields to skip over " 51 "before checking for uniqueness. A field is the minimal string matching " 52 "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b.]" 53 "[i:ignore-case?Ignore case in comparisons.]" 54 "[s:skip-chars]#[chars?\achars\a is the number of characters to skip over " 55 "before checking for uniqueness. If specified along with \b-f\b, " 56 "the first \achars\a after the first \afields\a are ignored. If " 57 "the \achars\a specifies more characters than are on the line, " 58 "an empty string will be used for comparison.]" 59 "[u:unique?Output unique lines.]" 60 "[w:check-chars]#[chars?\achars\a is the number of characters to compare " 61 "after skipping any specified fields and characters.]" 62 "\n" 63 "\n[infile [outfile]]\n" 64 "\n" 65 "[+EXIT STATUS?]{" 66 "[+0?The input file was successfully processed.]" 67 "[+>0?An error occurred.]" 68 "}" 69 "[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]" 70 ; 71 72 #include <cmd.h> 73 74 #define C_FLAG 1 75 #define D_FLAG 2 76 #define U_FLAG 4 77 78 #define CWIDTH 4 79 #define MAXCNT 9999 80 81 typedef int (*Compare_f)(const char*, const char*, size_t); 82 83 static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare) 84 { 85 register int n, f, outsize=0; 86 register char *cp, *ep, *bufp, *outp; 87 char *orecp, *sbufp=0, *outbuff; 88 int reclen,oreclen= -1,count=0,cwidth=0,sep,next; 89 if(mode&C_FLAG) 90 cwidth = CWIDTH+1; 91 while(1) 92 { 93 if(bufp = sfgetr(fdin,'\n',0)) 94 n = sfvalue(fdin); 95 else if(bufp = sfgetr(fdin,'\n',SF_LASTR)) 96 { 97 n = sfvalue(fdin); 98 bufp = memcpy(fmtbuf(n + 1), bufp, n); 99 bufp[n++] = '\n'; 100 } 101 else 102 n = 0; 103 if(n) 104 { 105 cp = bufp; 106 ep = cp + n; 107 if(f=fields) 108 while(f-->0 && cp<ep) /* skip over fields */ 109 { 110 while(cp<ep && *cp==' ' || *cp=='\t') 111 cp++; 112 while(cp<ep && *cp!=' ' && *cp!='\t') 113 cp++; 114 } 115 if(chars) 116 cp += chars; 117 if((reclen = n - (cp-bufp)) <=0) 118 { 119 reclen = 1; 120 cp = bufp + sfvalue(fdin)-1; 121 } 122 else if(width >= 0 && width < reclen) 123 reclen = width; 124 } 125 else 126 reclen=-2; 127 if(reclen==oreclen && (!reclen || !(*compare)(cp,orecp,reclen))) 128 { 129 count++; 130 if (!all) 131 continue; 132 next = count; 133 } 134 else 135 { 136 next = 0; 137 if(outsize>0) 138 { 139 if(((mode&D_FLAG)&&count==0) || ((mode&U_FLAG)&&count)) 140 { 141 if(outp!=sbufp) 142 sfwrite(fdout,outp,0); 143 } 144 else 145 { 146 if(cwidth) 147 { 148 outp[CWIDTH] = ' '; 149 if(count<MAXCNT) 150 { 151 sfsprintf(outp,cwidth,"%*d",CWIDTH,count+1); 152 outp[CWIDTH] = ' '; 153 } 154 else 155 { 156 outsize -= (CWIDTH+1); 157 if(outp!=sbufp) 158 { 159 if(!(sbufp=fmtbuf(outsize))) 160 return(1); 161 memcpy(sbufp,outp+CWIDTH+1,outsize); 162 sfwrite(fdout,outp,0); 163 outp = sbufp; 164 } 165 else 166 outp += CWIDTH+1; 167 sfprintf(fdout,"%4d ",count+1); 168 } 169 } 170 if(sfwrite(fdout,outp,outsize) != outsize) 171 return(1); 172 } 173 } 174 } 175 if(n==0) 176 break; 177 if(count = next) 178 { 179 if(sfwrite(fdout,outp,outsize) != outsize) 180 return(1); 181 if(*all >= 0) 182 *all = 1; 183 sep = 0; 184 } 185 else 186 sep = all && *all > 0; 187 /* save current record */ 188 if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0) 189 return(1); 190 outp = outbuff; 191 if(outsize < n+cwidth+sep) 192 { 193 /* no room in outp, clear lock and use side buffer */ 194 sfwrite(fdout,outp,0); 195 if(!(sbufp = outp=fmtbuf(outsize=n+cwidth+sep))) 196 return(1); 197 } 198 else 199 outsize = n+cwidth+sep; 200 memcpy(outp+cwidth+sep,bufp,n); 201 if(sep) 202 outp[cwidth] = '\n'; 203 oreclen = reclen; 204 orecp = outp+cwidth+sep + (cp-bufp); 205 } 206 return(0); 207 } 208 209 int 210 b_uniq(int argc, char** argv, void* context) 211 { 212 register int n, mode=0; 213 register char *cp; 214 int fields=0, chars=0, width=-1; 215 Sfio_t *fpin, *fpout; 216 int* all = 0; 217 int sep; 218 Compare_f compare = (Compare_f)memcmp; 219 220 cmdinit(argc, argv, context, ERROR_CATALOG, 0); 221 while (n = optget(argv, usage)) switch (n) 222 { 223 case 'c': 224 mode |= C_FLAG; 225 break; 226 case 'd': 227 mode |= D_FLAG; 228 break; 229 case 'D': 230 mode |= D_FLAG; 231 switch ((int)opt_info.num) 232 { 233 case 'p': 234 sep = 1; 235 break; 236 case 's': 237 sep = 0; 238 break; 239 default: 240 sep = -1; 241 break; 242 } 243 all = &sep; 244 break; 245 case 'i': 246 compare = (Compare_f)strncasecmp; 247 break; 248 case 'u': 249 mode |= U_FLAG; 250 break; 251 case 'f': 252 if(*opt_info.option=='-') 253 fields = opt_info.num; 254 else 255 chars = opt_info.num; 256 break; 257 case 's': 258 chars = opt_info.num; 259 break; 260 case 'w': 261 width = opt_info.num; 262 break; 263 case ':': 264 error(2, "%s", opt_info.arg); 265 break; 266 case '?': 267 error(ERROR_usage(2), "%s", opt_info.arg); 268 break; 269 } 270 argv += opt_info.index; 271 if(all && (mode&C_FLAG)) 272 error(2, "-c and -D are mutually exclusive"); 273 if(error_info.errors) 274 error(ERROR_usage(2), "%s", optusage(NiL)); 275 if((cp = *argv) && (argv++,!streq(cp,"-"))) 276 { 277 if(!(fpin = sfopen(NiL,cp,"r"))) 278 error(ERROR_system(1),"%s: cannot open",cp); 279 } 280 else 281 fpin = sfstdin; 282 if(cp = *argv) 283 { 284 argv++; 285 if(!(fpout = sfopen(NiL,cp,"w"))) 286 error(ERROR_system(1),"%s: cannot create",cp); 287 } 288 else 289 fpout = sfstdout; 290 if(*argv) 291 { 292 error(2, "too many arguments"); 293 error(ERROR_usage(2), "%s", optusage(NiL)); 294 } 295 error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare); 296 if(fpin!=sfstdin) 297 sfclose(fpin); 298 if(fpout!=sfstdout) 299 sfclose(fpout); 300 return(error_info.errors); 301 } 302 303