xref: /titanic_51/usr/src/lib/libcmd/common/uniq.c (revision 3e14f97f673e8a630f076077de35afdd43dc1587)
1da2e3ebdSchin /***********************************************************************
2da2e3ebdSchin *                                                                      *
3da2e3ebdSchin *               This software is part of the ast package               *
4*3e14f97fSRoger A. Faulkner *          Copyright (c) 1992-2010 AT&T Intellectual Property          *
5da2e3ebdSchin *                      and is licensed under the                       *
6da2e3ebdSchin *                  Common Public License, Version 1.0                  *
77c2fbfb3SApril Chin *                    by AT&T Intellectual Property                     *
8da2e3ebdSchin *                                                                      *
9da2e3ebdSchin *                A copy of the License is available at                 *
10da2e3ebdSchin *            http://www.opensource.org/licenses/cpl1.0.txt             *
11da2e3ebdSchin *         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12da2e3ebdSchin *                                                                      *
13da2e3ebdSchin *              Information and Software Systems Research               *
14da2e3ebdSchin *                            AT&T Research                             *
15da2e3ebdSchin *                           Florham Park NJ                            *
16da2e3ebdSchin *                                                                      *
17da2e3ebdSchin *                 Glenn Fowler <gsf@research.att.com>                  *
18da2e3ebdSchin *                  David Korn <dgk@research.att.com>                   *
19da2e3ebdSchin *                                                                      *
20da2e3ebdSchin ***********************************************************************/
21da2e3ebdSchin #pragma prototyped
22da2e3ebdSchin /*
23da2e3ebdSchin  * uniq
24da2e3ebdSchin  *
25da2e3ebdSchin  * Written by David Korn
26da2e3ebdSchin  */
27da2e3ebdSchin 
28da2e3ebdSchin static const char usage[] =
29*3e14f97fSRoger A. Faulkner "[-n?\n@(#)$Id: uniq (AT&T Research) 2009-11-28 $\n]"
30da2e3ebdSchin USAGE_LICENSE
31da2e3ebdSchin "[+NAME?uniq - Report or filter out repeated lines in a file]"
3234f9b3eeSRoland Mainz "[+DESCRIPTION?\buniq\b reads the input, compares adjacent lines, and "
3334f9b3eeSRoland Mainz 	"writes one copy of each input line on the output.  The second "
34da2e3ebdSchin 	"and succeeding copies of the repeated adjacent lines are not "
35da2e3ebdSchin 	"written.]"
36da2e3ebdSchin "[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes "
37da2e3ebdSchin 	"to standard output.  If no \ainfile\a is given, or if the \ainfile\a "
38da2e3ebdSchin 	"is \b-\b, \buniq\b reads from standard input with the start of "
3934f9b3eeSRoland Mainz 	"the file defined as the current offset.]"
40da2e3ebdSchin "[c:count?Output the number of times each line occurred  along with "
41da2e3ebdSchin 	"the line.]"
42da2e3ebdSchin "[d:repeated|duplicates?Output the first of each duplicate line.]"
43da2e3ebdSchin "[D:all-repeated?Output all duplicate lines as a group with an empty "
44da2e3ebdSchin     "line delimiter specified by \adelimit\a:]:?[delimit:=none]"
45da2e3ebdSchin     "{"
46da2e3ebdSchin         "[n:none?Do not delimit duplicate groups.]"
47da2e3ebdSchin         "[p:prepend?Prepend an empty line before each group.]"
48da2e3ebdSchin         "[s:separate?Separate each group with an empty line.]"
49da2e3ebdSchin     "}"
50da2e3ebdSchin "[f:skip-fields]#[fields?\afields\a is the number of fields to skip over "
51da2e3ebdSchin     "before checking for uniqueness. A field is the minimal string matching "
5234f9b3eeSRoland Mainz     "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b. -\anumber\a is equivalent to "
5334f9b3eeSRoland Mainz     "\b--skip-fields\b=\anumber\a.]"
54da2e3ebdSchin "[i:ignore-case?Ignore case in comparisons.]"
55da2e3ebdSchin "[s:skip-chars]#[chars?\achars\a is the number of characters to skip over "
56da2e3ebdSchin 	"before checking for uniqueness.  If specified along with \b-f\b, "
57da2e3ebdSchin 	"the first \achars\a after the first \afields\a are ignored.  If "
58da2e3ebdSchin 	"the \achars\a specifies more characters than are on the line, "
5934f9b3eeSRoland Mainz 	"an empty string will be used for comparison. +\anumber\a is "
6034f9b3eeSRoland Mainz 	"equivalent to \b--skip-chars\b=\anumber\a.]"
61da2e3ebdSchin "[u:unique?Output unique lines.]"
62da2e3ebdSchin "[w:check-chars]#[chars?\achars\a is the number of characters to compare "
63da2e3ebdSchin 	"after skipping any specified fields and characters.]"
64da2e3ebdSchin "\n"
65da2e3ebdSchin "\n[infile [outfile]]\n"
66da2e3ebdSchin "\n"
67da2e3ebdSchin "[+EXIT STATUS?]{"
68da2e3ebdSchin 	"[+0?The input file was successfully processed.]"
69da2e3ebdSchin 	"[+>0?An error occurred.]"
70da2e3ebdSchin "}"
71da2e3ebdSchin "[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]"
72da2e3ebdSchin ;
73da2e3ebdSchin 
74da2e3ebdSchin #include <cmd.h>
75da2e3ebdSchin 
/* output mode bits, or'ed together into the mode argument of uniq() */
#define C_FLAG	(1<<0)	/* -c: prefix each output line with its count */
#define D_FLAG	(1<<1)	/* -d / -D: output repeated lines */
#define U_FLAG	(1<<2)	/* -u: output unique lines */

/* layout of the -c count column */
#define CWIDTH	4	/* digits formatted in place; one blank follows them */
#define MAXCNT	9999	/* counts from here on are written with sfprintf() */

/* key comparison routine: returns 0 when the first n bytes are equal */
typedef int (*Compare_f)(const char*, const char*, size_t);
84da2e3ebdSchin 
/*
 * Copy fdin to fdout, treating runs of adjacent lines with equal
 * comparison keys as repeats.
 *
 *	fields	number of blank/tab delimited fields to skip before the key
 *	chars	number of characters to skip after the skipped fields
 *	width	maximum number of key characters to compare, <0 for no limit
 *	mode	or of C_FLAG, D_FLAG, U_FLAG
 *	all	0, or a pointer to the -D delimiter state: every line of a
 *		repeated group is written; a value >0 puts an empty line in
 *		front of the next group and a value >=0 is set to 1 once a
 *		group has been written
 *	compare	returns 0 when two keys of the given length are equal
 *
 * The most recent record is held back (in the reserved fdout buffer or,
 * when it does not fit, in an fmtbuf() side buffer) until the following
 * record shows whether it was repeated.
 *
 * Returns 0 on success and 1 if a write or a buffer request fails.
 */
static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare)
{
	register int n, f, outsize=0, mb = mbwide();
	register char *cp, *ep, *mp, *bufp, *outp=0;
	char *orecp=0, *sbufp=0, *outbuff;
	int reclen,oreclen= -1,count=0,cwidth=0,sep,next;

	if(mode&C_FLAG)
		cwidth = CWIDTH+1;	/* room for the count and one blank */
	while(1)
	{
		/* read the next record; n is its length including the newline */
		if(bufp = sfgetr(fdin,'\n',0))
			n = sfvalue(fdin);
		else if(bufp = sfgetr(fdin,'\n',SF_LASTR))
		{
			/* last line has no newline: copy it and supply one */
			n = sfvalue(fdin);
			if(!(mp = fmtbuf(n + 1)))
				return(1);
			bufp = memcpy(mp, bufp, n);
			bufp[n++] = '\n';
		}
		else
			n = 0;	/* end of input */
		if (n)
		{
			/* locate the comparison key: cp is its start, reclen its length */
			cp = bufp;
			ep = cp + n;
			if (f = fields)
				while (f-->0 && cp<ep) /* skip over fields */
				{
					/* the bounds check must guard both the blank and the tab test */
					while (cp<ep && (*cp==' ' || *cp=='\t'))
						cp++;
					while (cp<ep && *cp!=' ' && *cp!='\t')
						cp++;
				}
			if (chars)
			{
				if (mb)
				{
					/* stop at the end of the record instead of decoding beyond it */
					for (f = chars; f > 0 && cp < ep; f--)
						mbchar(cp);
				}
				else if (chars >= ep - cp)
					cp = ep;	/* skip count covers the rest of the record */
				else
					cp += chars;
			}
			if ((reclen = n - (cp - bufp)) <= 0)
			{
				/* everything was skipped: the key is just the newline */
				reclen = 1;
				cp = bufp + n - 1;
			}
			else if (width >= 0 && width < reclen)
			{
				if (mb)
				{
					/* convert a width in characters to a length in bytes */
					reclen = 0;
					mp = cp;
					while (reclen < width && mp < ep)
					{
						reclen++;
						mbchar(mp);
					}
					reclen = mp - cp;
				}
				else
					reclen = width;
			}
		}
		else
			reclen = -2;	/* matches no saved key, so the held record is flushed */
		if(reclen==oreclen && (!reclen || !(*compare)(cp,orecp,reclen)))
		{
			/* same key as the held record */
			count++;
			if (!all)
				continue;
			next = count;
		}
		else
		{
			/* key changed (or end of input): dispose of the held record */
			next = 0;
			if(outsize>0)
			{
				if(((mode&D_FLAG)&&count==0) || ((mode&U_FLAG)&&count))
				{
					/* filtered out: release the reserved buffer without writing */
					if(outp!=sbufp)
						sfwrite(fdout,outp,0);
				}
				else
				{
					if(cwidth)
					{
						/* count holds the number of repeats; count+1 is printed */
						if(count<9)
						{
							/* single digit, right justified */
							f = 0;
							while(f < CWIDTH-1)
								outp[f++] = ' ';
							outp[f++] = '0' + count + 1;
							outp[f] = ' ';
						}
						else if(count<MAXCNT)
						{
							/* up to CWIDTH digits, filled in from the right */
							count++;
							f = CWIDTH;
							outp[f--] = ' ';
							do
							{
								outp[f--] = '0' + (count % 10);
							} while (count /= 10);
							while (f >= 0)
								outp[f--] = ' ';
						}
						else
						{
							/*
							 * the count does not fit in the column: print it
							 * with sfprintf() and write the line without the
							 * space that was left in front of it
							 */
							outsize -= (CWIDTH+1);
							if(outp!=sbufp)
							{
								/* move the line out of the reserved buffer first */
								if(!(sbufp=fmtbuf(outsize)))
									return(1);
								memcpy(sbufp,outp+CWIDTH+1,outsize);
								sfwrite(fdout,outp,0);
								outp = sbufp;
							}
							else
								outp += CWIDTH+1;
							sfprintf(fdout,"%4d ",count+1);
						}
					}
					if(sfwrite(fdout,outp,outsize) != outsize)
						return(1);
				}
			}
		}
		if(n==0)
			break;
		if(count = next)
		{
			/* -D: every line of a repeated group is written */
			if(sfwrite(fdout,outp,outsize) != outsize)
				return(1);
			if(*all >= 0)
				*all = 1;
			sep = 0;
		}
		else
			sep = all && *all > 0;
		/* save current record */
		if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0)
			return(1);
		outp = outbuff;
		if(outsize < n+cwidth+sep)
		{
			/* no room in outp, clear lock and use side buffer */
			sfwrite(fdout,outp,0);
			if(!(sbufp = outp=fmtbuf(outsize=n+cwidth+sep)))
				return(1);
		}
		else
			outsize = n+cwidth+sep;
		memcpy(outp+cwidth+sep,bufp,n);
		if(sep)
			outp[cwidth] = '\n';
		/* remember where the key is inside the saved copy */
		oreclen = reclen;
		orecp = outp+cwidth+sep + (cp-bufp);
	}
	return(0);
}
244da2e3ebdSchin 
245da2e3ebdSchin int
b_uniq(int argc,char ** argv,void * context)246da2e3ebdSchin b_uniq(int argc, char** argv, void* context)
247da2e3ebdSchin {
248da2e3ebdSchin 	register int n, mode=0;
249da2e3ebdSchin 	register char *cp;
250da2e3ebdSchin 	int fields=0, chars=0, width=-1;
251da2e3ebdSchin 	Sfio_t *fpin, *fpout;
252da2e3ebdSchin 	int* all = 0;
253da2e3ebdSchin 	int sep;
254da2e3ebdSchin 	Compare_f compare = (Compare_f)memcmp;
255da2e3ebdSchin 
256da2e3ebdSchin 	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
257da2e3ebdSchin 	while (n = optget(argv, usage)) switch (n)
258da2e3ebdSchin 	{
259da2e3ebdSchin 	    case 'c':
260da2e3ebdSchin 		mode |= C_FLAG;
261da2e3ebdSchin 		break;
262da2e3ebdSchin 	    case 'd':
263da2e3ebdSchin 		mode |= D_FLAG;
264da2e3ebdSchin 		break;
265da2e3ebdSchin 	    case 'D':
266da2e3ebdSchin 		mode |= D_FLAG;
267da2e3ebdSchin 		switch ((int)opt_info.num)
268da2e3ebdSchin 		{
269da2e3ebdSchin 		case 'p':
270da2e3ebdSchin 			sep = 1;
271da2e3ebdSchin 			break;
272da2e3ebdSchin 		case 's':
273da2e3ebdSchin 			sep = 0;
274da2e3ebdSchin 			break;
275da2e3ebdSchin 		default:
276da2e3ebdSchin 			sep = -1;
277da2e3ebdSchin 			break;
278da2e3ebdSchin 		}
279da2e3ebdSchin 		all = &sep;
280da2e3ebdSchin 		break;
281da2e3ebdSchin 	    case 'i':
282da2e3ebdSchin 		compare = (Compare_f)strncasecmp;
283da2e3ebdSchin 		break;
284da2e3ebdSchin 	    case 'u':
285da2e3ebdSchin 		mode |= U_FLAG;
286da2e3ebdSchin 		break;
287da2e3ebdSchin 	    case 'f':
288da2e3ebdSchin 		if(*opt_info.option=='-')
289da2e3ebdSchin 			fields = opt_info.num;
290da2e3ebdSchin 		else
291da2e3ebdSchin 			chars = opt_info.num;
292da2e3ebdSchin 		break;
293da2e3ebdSchin 	    case 's':
294da2e3ebdSchin 		chars = opt_info.num;
295da2e3ebdSchin 		break;
296da2e3ebdSchin 	    case 'w':
297da2e3ebdSchin 		width = opt_info.num;
298da2e3ebdSchin 		break;
299da2e3ebdSchin 	    case ':':
300da2e3ebdSchin 		error(2, "%s", opt_info.arg);
301da2e3ebdSchin 		break;
302da2e3ebdSchin 	    case '?':
303da2e3ebdSchin 		error(ERROR_usage(2), "%s", opt_info.arg);
304da2e3ebdSchin 		break;
305da2e3ebdSchin 	}
306da2e3ebdSchin 	argv += opt_info.index;
307da2e3ebdSchin 	if(all && (mode&C_FLAG))
308da2e3ebdSchin 		error(2, "-c and -D are mutually exclusive");
309da2e3ebdSchin 	if(error_info.errors)
310da2e3ebdSchin 		error(ERROR_usage(2), "%s", optusage(NiL));
311da2e3ebdSchin 	if((cp = *argv) && (argv++,!streq(cp,"-")))
312da2e3ebdSchin 	{
313da2e3ebdSchin 		if(!(fpin = sfopen(NiL,cp,"r")))
314da2e3ebdSchin 			error(ERROR_system(1),"%s: cannot open",cp);
315da2e3ebdSchin 	}
316da2e3ebdSchin 	else
317da2e3ebdSchin 		fpin = sfstdin;
318da2e3ebdSchin 	if(cp = *argv)
319da2e3ebdSchin 	{
320da2e3ebdSchin 		argv++;
321da2e3ebdSchin 		if(!(fpout = sfopen(NiL,cp,"w")))
322da2e3ebdSchin 			error(ERROR_system(1),"%s: cannot create",cp);
323da2e3ebdSchin 	}
324da2e3ebdSchin 	else
325da2e3ebdSchin 		fpout = sfstdout;
326da2e3ebdSchin 	if(*argv)
327da2e3ebdSchin 	{
328da2e3ebdSchin 		error(2, "too many arguments");
329da2e3ebdSchin 		error(ERROR_usage(2), "%s", optusage(NiL));
330da2e3ebdSchin 	}
331da2e3ebdSchin 	error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare);
332da2e3ebdSchin 	if(fpin!=sfstdin)
333da2e3ebdSchin 		sfclose(fpin);
334da2e3ebdSchin 	if(fpout!=sfstdout)
335da2e3ebdSchin 		sfclose(fpout);
336da2e3ebdSchin 	return(error_info.errors);
337da2e3ebdSchin }
338da2e3ebdSchin 
339