1da2e3ebdSchin /***********************************************************************
2da2e3ebdSchin * *
3da2e3ebdSchin * This software is part of the ast package *
4*3e14f97fSRoger A. Faulkner * Copyright (c) 1992-2010 AT&T Intellectual Property *
5da2e3ebdSchin * and is licensed under the *
6da2e3ebdSchin * Common Public License, Version 1.0 *
77c2fbfb3SApril Chin * by AT&T Intellectual Property *
8da2e3ebdSchin * *
9da2e3ebdSchin * A copy of the License is available at *
10da2e3ebdSchin * http://www.opensource.org/licenses/cpl1.0.txt *
11da2e3ebdSchin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
12da2e3ebdSchin * *
13da2e3ebdSchin * Information and Software Systems Research *
14da2e3ebdSchin * AT&T Research *
15da2e3ebdSchin * Florham Park NJ *
16da2e3ebdSchin * *
17da2e3ebdSchin * Glenn Fowler <gsf@research.att.com> *
18da2e3ebdSchin * David Korn <dgk@research.att.com> *
19da2e3ebdSchin * *
20da2e3ebdSchin ***********************************************************************/
21da2e3ebdSchin #pragma prototyped
22da2e3ebdSchin /*
23da2e3ebdSchin * uniq
24da2e3ebdSchin *
25da2e3ebdSchin * Written by David Korn
26da2e3ebdSchin */
27da2e3ebdSchin
/*
 * Option and operand description handed to optget() by b_uniq().
 * Every string below is runtime data interpreted by optget(), so the
 * text must not be reworded or re-wrapped casually.  Judging by the
 * +NAME / +DESCRIPTION / +EXIT STATUS sections, the same text presumably
 * also supplies the generated help output.
 * USAGE_LICENSE is not defined in this file; it is expected to come from
 * the build environment.
 */
static const char usage[] =
"[-n?\n@(#)$Id: uniq (AT&T Research) 2009-11-28 $\n]"
USAGE_LICENSE
"[+NAME?uniq - Report or filter out repeated lines in a file]"
"[+DESCRIPTION?\buniq\b reads the input, compares adjacent lines, and "
"writes one copy of each input line on the output. The second "
"and succeeding copies of the repeated adjacent lines are not "
"written.]"
"[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes "
"to standard output. If no \ainfile\a is given, or if the \ainfile\a "
"is \b-\b, \buniq\b reads from standard input with the start of "
"the file defined as the current offset.]"
"[c:count?Output the number of times each line occurred along with "
"the line.]"
"[d:repeated|duplicates?Output the first of each duplicate line.]"
"[D:all-repeated?Output all duplicate lines as a group with an empty "
"line delimiter specified by \adelimit\a:]:?[delimit:=none]"
"{"
"[n:none?Do not delimit duplicate groups.]"
"[p:prepend?Prepend an empty line before each group.]"
"[s:separate?Separate each group with an empty line.]"
"}"
"[f:skip-fields]#[fields?\afields\a is the number of fields to skip over "
"before checking for uniqueness. A field is the minimal string matching "
"the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b. -\anumber\a is equivalent to "
"\b--skip-fields\b=\anumber\a.]"
"[i:ignore-case?Ignore case in comparisons.]"
"[s:skip-chars]#[chars?\achars\a is the number of characters to skip over "
"before checking for uniqueness. If specified along with \b-f\b, "
"the first \achars\a after the first \afields\a are ignored. If "
"the \achars\a specifies more characters than are on the line, "
"an empty string will be used for comparison. +\anumber\a is "
"equivalent to \b--skip-chars\b=\anumber\a.]"
"[u:unique?Output unique lines.]"
"[w:check-chars]#[chars?\achars\a is the number of characters to compare "
"after skipping any specified fields and characters.]"
"\n"
"\n[infile [outfile]]\n"
"\n"
"[+EXIT STATUS?]{"
"[+0?The input file was successfully processed.]"
"[+>0?An error occurred.]"
"}"
"[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]"
;
73da2e3ebdSchin
74da2e3ebdSchin #include <cmd.h>
75da2e3ebdSchin
/* Bits of the mode mask that b_uniq() builds from the options and uniq() tests. */
#define C_FLAG 1 /* -c: reserve and fill a count column in front of each line */
#define D_FLAG 2 /* -d and -D: drop lines that were not repeated */
#define U_FLAG 4 /* -u: drop lines that were repeated */

#define CWIDTH 4 /* digits in the count column; uniq() reserves CWIDTH+1 bytes (digits plus a blank) */
#define MAXCNT 9999 /* repeat counts below this are formatted in place; larger ones go through sfprintf() */

/*
 * Key comparison callback: uniq() treats a zero return as "equal".
 * b_uniq() installs memcmp, or strncasecmp for -i.
 */
typedef int (*Compare_f)(const char*, const char*, size_t);
84da2e3ebdSchin
/*
 * Copy fdin to fdout, collapsing runs of adjacent lines whose keys
 * compare equal.
 *
 * fields   number of blank/tab delimited fields to skip before the key
 * chars    number of characters to skip after the fields; values <= 0
 *          skip nothing
 * width    maximum number of characters in the key; < 0 means no limit
 * mode     mask of C_FLAG, D_FLAG and U_FLAG
 * all      non-null for -D: every line of a repeated group is written.
 *          *all > 0 requests an empty line in front of the next saved
 *          record, and *all is set to 1 once a repeated line has been
 *          written unless it is negative
 * compare  returns 0 when two keys of the given length are equal
 *
 * The most recent record is held back (in the reserved output buffer
 * or in a side buffer) until the next record shows whether it was
 * repeated.
 *
 * Returns 0 on success, 1 when a write or a buffer request fails.
 */
static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare)
{
	register int n, f, outsize=0, mb = mbwide();
	register char *cp, *ep, *mp, *bufp, *outp;
	char *orecp, *sbufp=0, *outbuff;
	int reclen,oreclen= -1,count=0,cwidth=0,sep,next;

	if(mode&C_FLAG)
		cwidth = CWIDTH+1;
	while(1)
	{
		/* fetch the next record; n includes the trailing newline */
		if((bufp = sfgetr(fdin,'\n',0)))
			n = sfvalue(fdin);
		else if((bufp = sfgetr(fdin,'\n',SF_LASTR)))
		{
			/* final line without a newline: copy it and append one */
			n = sfvalue(fdin);
			if(!(cp = fmtbuf(n + 1)))
				return(1);
			bufp = memcpy(cp, bufp, n);
			bufp[n++] = '\n';
		}
		else
			n = 0;
		if (n)
		{
			cp = bufp;
			ep = cp + n;
			/* skip over fields: leading blanks, then the non-blank run */
			for (f = fields; f > 0 && cp < ep; f--)
			{
				/* the bound must guard both the space and the tab test */
				while (cp<ep && (*cp==' ' || *cp=='\t'))
					cp++;
				while (cp<ep && *cp!=' ' && *cp!='\t')
					cp++;
			}
			/* skip over characters, never stepping past the record */
			if (chars > 0)
			{
				if (mb)
				{
					for (f = chars; f > 0 && cp < ep; f--)
						mbchar(cp);
				}
				else if (chars < ep - cp)
					cp += chars;
				else
					cp = ep;
			}
			if ((reclen = n - (cp - bufp)) <= 0)
			{
				/* nothing left after skipping: the key is just the newline */
				reclen = 1;
				cp = bufp + n - 1;
			}
			else if (width >= 0 && width < reclen)
			{
				if (mb)
				{
					/* convert a width in characters to a length in bytes */
					reclen = 0;
					mp = cp;
					while (reclen < width && mp < ep)
					{
						reclen++;
						mbchar(mp);
					}
					reclen = mp - cp;
				}
				else
					reclen = width;
			}
		}
		else
			reclen = -2;	/* end of input: can never equal oreclen */
		if(reclen==oreclen && (!reclen || !(*compare)(cp,orecp,reclen)))
		{
			/* same key as the held record */
			count++;
			if (!all)
				continue;
			next = count;
		}
		else
		{
			/* key changed or end of input: settle the held record */
			next = 0;
			if(outsize>0)
			{
				if(((mode&D_FLAG)&&count==0) || ((mode&U_FLAG)&&count))
				{
					/* record is filtered out: only release the reserved buffer */
					if(outp!=sbufp)
						sfwrite(fdout,outp,0);
				}
				else
				{
					if(cwidth)
					{
						if(count<9)
						{
							/* single digit count, blank padded on the left */
							f = 0;
							while(f < CWIDTH-1)
								outp[f++] = ' ';
							outp[f++] = '0' + count + 1;
							outp[f] = ' ';
						}
						else if(count<MAXCNT)
						{
							/* up to CWIDTH digits, filled in right to left */
							count++;
							f = CWIDTH;
							outp[f--] = ' ';
							do
							{
								outp[f--] = '0' + (count % 10);
							} while (count /= 10);
							while (f >= 0)
								outp[f--] = ' ';
						}
						else
						{
							/*
							 * the count does not fit the reserved column:
							 * drop the column and print the count separately
							 */
							outsize -= (CWIDTH+1);
							if(outp!=sbufp)
							{
								if(!(sbufp=fmtbuf(outsize)))
									return(1);
								memcpy(sbufp,outp+CWIDTH+1,outsize);
								sfwrite(fdout,outp,0);
								outp = sbufp;
							}
							else
								outp += CWIDTH+1;
							sfprintf(fdout,"%4d ",count+1);
						}
					}
					if(sfwrite(fdout,outp,outsize) != outsize)
						return(1);
				}
			}
		}
		if(n==0)
			break;
		if((count = next))
		{
			/* -D: write the held member of the group before replacing it */
			if(sfwrite(fdout,outp,outsize) != outsize)
				return(1);
			if(*all >= 0)
				*all = 1;
			sep = 0;
		}
		else
			sep = all && *all > 0;
		/* save current record */
		if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0)
			return(1);
		outp = outbuff;
		if(outsize < n+cwidth+sep)
		{
			/* no room in outp, clear lock and use side buffer */
			sfwrite(fdout,outp,0);
			if(!(sbufp = outp = fmtbuf(outsize=n+cwidth+sep)))
				return(1);
		}
		else
			outsize = n+cwidth+sep;
		memcpy(outp+cwidth+sep,bufp,n);
		if(sep)
			outp[cwidth] = '\n';
		/* remember where the key of the saved copy starts */
		oreclen = reclen;
		orecp = outp+cwidth+sep + (cp-bufp);
	}
	return(0);
}
244da2e3ebdSchin
245da2e3ebdSchin int
b_uniq(int argc,char ** argv,void * context)246da2e3ebdSchin b_uniq(int argc, char** argv, void* context)
247da2e3ebdSchin {
248da2e3ebdSchin register int n, mode=0;
249da2e3ebdSchin register char *cp;
250da2e3ebdSchin int fields=0, chars=0, width=-1;
251da2e3ebdSchin Sfio_t *fpin, *fpout;
252da2e3ebdSchin int* all = 0;
253da2e3ebdSchin int sep;
254da2e3ebdSchin Compare_f compare = (Compare_f)memcmp;
255da2e3ebdSchin
256da2e3ebdSchin cmdinit(argc, argv, context, ERROR_CATALOG, 0);
257da2e3ebdSchin while (n = optget(argv, usage)) switch (n)
258da2e3ebdSchin {
259da2e3ebdSchin case 'c':
260da2e3ebdSchin mode |= C_FLAG;
261da2e3ebdSchin break;
262da2e3ebdSchin case 'd':
263da2e3ebdSchin mode |= D_FLAG;
264da2e3ebdSchin break;
265da2e3ebdSchin case 'D':
266da2e3ebdSchin mode |= D_FLAG;
267da2e3ebdSchin switch ((int)opt_info.num)
268da2e3ebdSchin {
269da2e3ebdSchin case 'p':
270da2e3ebdSchin sep = 1;
271da2e3ebdSchin break;
272da2e3ebdSchin case 's':
273da2e3ebdSchin sep = 0;
274da2e3ebdSchin break;
275da2e3ebdSchin default:
276da2e3ebdSchin sep = -1;
277da2e3ebdSchin break;
278da2e3ebdSchin }
279da2e3ebdSchin all = &sep;
280da2e3ebdSchin break;
281da2e3ebdSchin case 'i':
282da2e3ebdSchin compare = (Compare_f)strncasecmp;
283da2e3ebdSchin break;
284da2e3ebdSchin case 'u':
285da2e3ebdSchin mode |= U_FLAG;
286da2e3ebdSchin break;
287da2e3ebdSchin case 'f':
288da2e3ebdSchin if(*opt_info.option=='-')
289da2e3ebdSchin fields = opt_info.num;
290da2e3ebdSchin else
291da2e3ebdSchin chars = opt_info.num;
292da2e3ebdSchin break;
293da2e3ebdSchin case 's':
294da2e3ebdSchin chars = opt_info.num;
295da2e3ebdSchin break;
296da2e3ebdSchin case 'w':
297da2e3ebdSchin width = opt_info.num;
298da2e3ebdSchin break;
299da2e3ebdSchin case ':':
300da2e3ebdSchin error(2, "%s", opt_info.arg);
301da2e3ebdSchin break;
302da2e3ebdSchin case '?':
303da2e3ebdSchin error(ERROR_usage(2), "%s", opt_info.arg);
304da2e3ebdSchin break;
305da2e3ebdSchin }
306da2e3ebdSchin argv += opt_info.index;
307da2e3ebdSchin if(all && (mode&C_FLAG))
308da2e3ebdSchin error(2, "-c and -D are mutually exclusive");
309da2e3ebdSchin if(error_info.errors)
310da2e3ebdSchin error(ERROR_usage(2), "%s", optusage(NiL));
311da2e3ebdSchin if((cp = *argv) && (argv++,!streq(cp,"-")))
312da2e3ebdSchin {
313da2e3ebdSchin if(!(fpin = sfopen(NiL,cp,"r")))
314da2e3ebdSchin error(ERROR_system(1),"%s: cannot open",cp);
315da2e3ebdSchin }
316da2e3ebdSchin else
317da2e3ebdSchin fpin = sfstdin;
318da2e3ebdSchin if(cp = *argv)
319da2e3ebdSchin {
320da2e3ebdSchin argv++;
321da2e3ebdSchin if(!(fpout = sfopen(NiL,cp,"w")))
322da2e3ebdSchin error(ERROR_system(1),"%s: cannot create",cp);
323da2e3ebdSchin }
324da2e3ebdSchin else
325da2e3ebdSchin fpout = sfstdout;
326da2e3ebdSchin if(*argv)
327da2e3ebdSchin {
328da2e3ebdSchin error(2, "too many arguments");
329da2e3ebdSchin error(ERROR_usage(2), "%s", optusage(NiL));
330da2e3ebdSchin }
331da2e3ebdSchin error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare);
332da2e3ebdSchin if(fpin!=sfstdin)
333da2e3ebdSchin sfclose(fpin);
334da2e3ebdSchin if(fpout!=sfstdout)
335da2e3ebdSchin sfclose(fpout);
336da2e3ebdSchin return(error_info.errors);
337da2e3ebdSchin }
338da2e3ebdSchin
339