1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1992-2012 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Eclipse Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.eclipse.org/org/documents/epl-v10.html *
11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * *
20 ***********************************************************************/
21 #pragma prototyped
22 /*
23 * uniq
24 *
25 * Written by David Korn
26 */
27
28 static const char usage[] =
29 "[-n?\n@(#)$Id: uniq (AT&T Research) 2009-11-28 $\n]"
30 USAGE_LICENSE
31 "[+NAME?uniq - Report or filter out repeated lines in a file]"
32 "[+DESCRIPTION?\buniq\b reads the input, compares adjacent lines, and "
33 "writes one copy of each input line on the output. The second "
34 "and succeeding copies of the repeated adjacent lines are not "
35 "written.]"
36 "[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes "
37 "to standard output. If no \ainfile\a is given, or if the \ainfile\a "
38 "is \b-\b, \buniq\b reads from standard input with the start of "
39 "the file defined as the current offset.]"
40 "[c:count?Output the number of times each line occurred along with "
41 "the line.]"
42 "[d:repeated|duplicates?Output the first of each duplicate line.]"
43 "[D:all-repeated?Output all duplicate lines as a group with an empty "
44 "line delimiter specified by \adelimit\a:]:?[delimit:=none]"
45 "{"
46 "[n:none?Do not delimit duplicate groups.]"
47 "[p:prepend?Prepend an empty line before each group.]"
48 "[s:separate?Separate each group with an empty line.]"
49 "}"
50 "[f:skip-fields]#[fields?\afields\a is the number of fields to skip over "
51 "before checking for uniqueness. A field is the minimal string matching "
52 "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b. -\anumber\a is equivalent to "
53 "\b--skip-fields\b=\anumber\a.]"
54 "[i:ignore-case?Ignore case in comparisons.]"
55 "[s:skip-chars]#[chars?\achars\a is the number of characters to skip over "
56 "before checking for uniqueness. If specified along with \b-f\b, "
57 "the first \achars\a after the first \afields\a are ignored. If "
58 "the \achars\a specifies more characters than are on the line, "
59 "an empty string will be used for comparison. +\anumber\a is "
60 "equivalent to \b--skip-chars\b=\anumber\a.]"
61 "[u:unique?Output unique lines.]"
62 "[w:check-chars]#[chars?\achars\a is the number of characters to compare "
63 "after skipping any specified fields and characters.]"
64 "\n"
65 "\n[infile [outfile]]\n"
66 "\n"
67 "[+EXIT STATUS?]{"
68 "[+0?The input file was successfully processed.]"
69 "[+>0?An error occurred.]"
70 "}"
71 "[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]"
72 ;
73
74 #include <cmd.h>
75
/* output mode bits, or'ed together in the mode argument of uniq() */
#define C_FLAG		(1<<0)	/* -c: prefix each record with its count */
#define D_FLAG		(1<<1)	/* -d, -D: output duplicated records */
#define U_FLAG		(1<<2)	/* -u: output records that are not repeated */

/* layout of the count column written when C_FLAG is set */
#define CWIDTH		4	/* digits in the count column; a blank follows it */
#define MAXCNT		9999	/* counts below this are formatted in place */

/* memcmp-style comparison applied to the compared part of two records */
typedef int (*Compare_f)(const char*, const char*, size_t);
84
/*
 * Copy records from fdin to fdout, collapsing runs of adjacent records
 * whose compared parts are equal.
 *
 *	fdin	input stream, read one newline terminated record at a time
 *	fdout	output stream
 *	fields	number of leading blank separated fields to skip
 *	chars	number of characters to skip after the fields; values <= 0
 *		skip nothing
 *	width	maximum number of characters to compare, < 0 for no limit
 *	mode	bitmask of C_FLAG, D_FLAG and U_FLAG
 *	all	when non-null every member of a duplicate run is written;
 *		*all > 0 asks for an empty line in front of the next run and
 *		*all is set to 1 once a run has been written, unless it is
 *		negative
 *	compare	memcmp-style function applied to the compared parts
 *
 * Returns 0 on success, 1 if a buffer could not be obtained or a write
 * came up short.
 */
static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare)
{
	int	n, f, outsize = 0, mb = mbwide();
	char	*cp = 0, *ep, *mp, *bufp, *outp = 0;
	char	*orecp = 0, *sbufp = 0, *outbuff;
	int	reclen, oreclen = -1, count = 0, cwidth = 0, sep, next;

	/* with -c each saved record is preceded by CWIDTH digits and a blank */
	if (mode & C_FLAG)
		cwidth = CWIDTH + 1;
	for (;;)
	{
		/* fetch the next record; a final unterminated line gets a newline */
		if ((bufp = sfgetr(fdin, '\n', 0)))
			n = sfvalue(fdin);
		else if ((bufp = sfgetr(fdin, '\n', SF_LASTR)))
		{
			n = sfvalue(fdin);
			if (!(mp = fmtbuf(n + 1)))
				return 1;
			memcpy(mp, bufp, n);
			bufp = mp;
			bufp[n++] = '\n';
		}
		else
			n = 0;
		if (n)
		{
			/* locate the part of the record that takes part in the comparison */
			cp = bufp;
			ep = cp + n;
			for (f = fields; f > 0 && cp < ep; f--)
			{
				/* a field is a run of blanks followed by a run of non-blanks */
				while (cp < ep && (*cp == ' ' || *cp == '\t'))
					cp++;
				while (cp < ep && *cp != ' ' && *cp != '\t')
					cp++;
			}
			if (chars > 0)
			{
				/* never step beyond the end of the record */
				if (mb)
				{
					for (f = chars; f > 0 && cp < ep; f--)
						mbchar(cp);
				}
				else if (chars < ep - cp)
					cp += chars;
				else
					cp = ep;
			}
			if ((reclen = n - (cp - bufp)) <= 0)
			{
				/* nothing left: compare just the trailing newline */
				reclen = 1;
				cp = bufp + n - 1;
			}
			else if (width >= 0 && width < reclen)
			{
				if (mb)
				{
					/* width counts characters, reclen counts bytes */
					reclen = 0;
					mp = cp;
					while (reclen < width && mp < ep)
					{
						reclen++;
						mbchar(mp);
					}
					reclen = mp - cp;
				}
				else
					reclen = width;
			}
		}
		else
			reclen = -2;	/* end of input: matches no saved record */
		if (reclen == oreclen && (!reclen || !(*compare)(cp, orecp, reclen)))
		{
			/* same as the saved record */
			count++;
			if (!all)
				continue;
			next = count;
		}
		else
		{
			/* a new run starts (or input ended): dispose of the saved record */
			next = 0;
			if (outsize > 0)
			{
				if (((mode & D_FLAG) && count == 0) || ((mode & U_FLAG) && count))
				{
					/* -d drops records seen once, -u drops repeated ones */
					if (outp != sbufp)
						sfwrite(fdout, outp, 0);
				}
				else
				{
					if (cwidth)
					{
						/* count holds the number of repeats, so the total is count+1 */
						if (count < 9)
						{
							f = 0;
							while (f < CWIDTH - 1)
								outp[f++] = ' ';
							outp[f++] = '0' + count + 1;
							outp[f] = ' ';
						}
						else if (count < MAXCNT)
						{
							count++;
							f = CWIDTH;
							outp[f--] = ' ';
							do
							{
								outp[f--] = '0' + (count % 10);
							} while (count /= 10);
							while (f >= 0)
								outp[f--] = ' ';
						}
						else
						{
							/* too wide for the column: print the count separately */
							outsize -= (CWIDTH + 1);
							if (outp != sbufp)
							{
								if (!(sbufp = fmtbuf(outsize)))
									return 1;
								memcpy(sbufp, outp + CWIDTH + 1, outsize);
								sfwrite(fdout, outp, 0);
								outp = sbufp;
							}
							else
								outp += CWIDTH + 1;
							sfprintf(fdout, "%4d ", count + 1);
						}
					}
					if (sfwrite(fdout, outp, outsize) != outsize)
						return 1;
				}
			}
		}
		if (n == 0)
			break;
		count = next;
		if (count)
		{
			/* all-repeated mode: emit the previous member of the run now */
			if (sfwrite(fdout, outp, outsize) != outsize)
				return 1;
			if (*all >= 0)
				*all = 1;
			sep = 0;
		}
		else
			sep = all && *all > 0;
		/* save current record */
		if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0)
			return 1;
		outp = outbuff;
		if (outsize < n + cwidth + sep)
		{
			/* no room in outp, clear lock and use side buffer */
			sfwrite(fdout, outp, 0);
			outsize = n + cwidth + sep;
			if (!(sbufp = outp = fmtbuf(outsize)))
				return 1;
		}
		else
			outsize = n + cwidth + sep;
		memcpy(outp + cwidth + sep, bufp, n);
		if (sep)
			outp[cwidth] = '\n';
		oreclen = reclen;
		orecp = outp + cwidth + sep + (cp - bufp);
	}
	return 0;
}
244
245 int
b_uniq(int argc,char ** argv,Shbltin_t * context)246 b_uniq(int argc, char** argv, Shbltin_t* context)
247 {
248 register int mode=0;
249 register char *cp;
250 int fields=0, chars=0, width=-1;
251 Sfio_t *fpin, *fpout;
252 int* all = 0;
253 int sep;
254 Compare_f compare = (Compare_f)memcmp;
255
256 cmdinit(argc, argv, context, ERROR_CATALOG, 0);
257 for (;;)
258 {
259 switch (optget(argv, usage))
260 {
261 case 'c':
262 mode |= C_FLAG;
263 continue;
264 case 'd':
265 mode |= D_FLAG;
266 continue;
267 case 'D':
268 mode |= D_FLAG;
269 switch ((int)opt_info.num)
270 {
271 case 'p':
272 sep = 1;
273 break;
274 case 's':
275 sep = 0;
276 break;
277 default:
278 sep = -1;
279 break;
280 }
281 all = &sep;
282 continue;
283 case 'i':
284 compare = (Compare_f)strncasecmp;
285 continue;
286 case 'u':
287 mode |= U_FLAG;
288 continue;
289 case 'f':
290 if(*opt_info.option=='-')
291 fields = opt_info.num;
292 else
293 chars = opt_info.num;
294 continue;
295 case 's':
296 chars = opt_info.num;
297 continue;
298 case 'w':
299 width = opt_info.num;
300 continue;
301 case ':':
302 error(2, "%s", opt_info.arg);
303 break;
304 case '?':
305 error(ERROR_usage(2), "%s", opt_info.arg);
306 break;
307 }
308 break;
309 }
310 argv += opt_info.index;
311 if(all && (mode&C_FLAG))
312 error(2, "-c and -D are mutually exclusive");
313 if(error_info.errors)
314 error(ERROR_usage(2), "%s", optusage(NiL));
315 if((cp = *argv) && (argv++,!streq(cp,"-")))
316 {
317 if(!(fpin = sfopen(NiL,cp,"r")))
318 error(ERROR_system(1),"%s: cannot open",cp);
319 }
320 else
321 fpin = sfstdin;
322 if(cp = *argv)
323 {
324 argv++;
325 if(!(fpout = sfopen(NiL,cp,"w")))
326 error(ERROR_system(1),"%s: cannot create",cp);
327 }
328 else
329 fpout = sfstdout;
330 if(*argv)
331 {
332 error(2, "too many arguments");
333 error(ERROR_usage(2), "%s", optusage(NiL));
334 }
335 error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare);
336 if(fpin!=sfstdin)
337 sfclose(fpin);
338 if(fpout!=sfstdout)
339 sfclose(fpout);
340 return(error_info.errors);
341 }
342
343