xref: /titanic_51/usr/src/lib/libcmd/common/uniq.c (revision da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968)
1*da2e3ebdSchin /***********************************************************************
2*da2e3ebdSchin *                                                                      *
3*da2e3ebdSchin *               This software is part of the ast package               *
4*da2e3ebdSchin *           Copyright (c) 1992-2007 AT&T Knowledge Ventures            *
5*da2e3ebdSchin *                      and is licensed under the                       *
6*da2e3ebdSchin *                  Common Public License, Version 1.0                  *
7*da2e3ebdSchin *                      by AT&T Knowledge Ventures                      *
8*da2e3ebdSchin *                                                                      *
9*da2e3ebdSchin *                A copy of the License is available at                 *
10*da2e3ebdSchin *            http://www.opensource.org/licenses/cpl1.0.txt             *
11*da2e3ebdSchin *         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12*da2e3ebdSchin *                                                                      *
13*da2e3ebdSchin *              Information and Software Systems Research               *
14*da2e3ebdSchin *                            AT&T Research                             *
15*da2e3ebdSchin *                           Florham Park NJ                            *
16*da2e3ebdSchin *                                                                      *
17*da2e3ebdSchin *                 Glenn Fowler <gsf@research.att.com>                  *
18*da2e3ebdSchin *                  David Korn <dgk@research.att.com>                   *
19*da2e3ebdSchin *                                                                      *
20*da2e3ebdSchin ***********************************************************************/
21*da2e3ebdSchin #pragma prototyped
22*da2e3ebdSchin /*
23*da2e3ebdSchin  * uniq
24*da2e3ebdSchin  *
25*da2e3ebdSchin  * Written by David Korn
26*da2e3ebdSchin  */
27*da2e3ebdSchin 
28*da2e3ebdSchin static const char usage[] =
29*da2e3ebdSchin "[-?\n@(#)$Id: uniq (AT&T Research) 2006-08-28 $\n]"
30*da2e3ebdSchin USAGE_LICENSE
31*da2e3ebdSchin "[+NAME?uniq - Report or filter out repeated lines in a file]"
32*da2e3ebdSchin "[+DESCRIPTION?\buniq\b reads an input, comparing adjacent lines, and "
33*da2e3ebdSchin 	"writing one copy of each input line on the output.  The second "
34*da2e3ebdSchin 	"and succeeding copies of the repeated adjacent lines are not "
35*da2e3ebdSchin 	"written.]"
36*da2e3ebdSchin "[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes "
37*da2e3ebdSchin 	"to standard output.  If no \ainfile\a is given, or if the \ainfile\a "
38*da2e3ebdSchin 	"is \b-\b, \buniq\b reads from standard input with  the start of "
39*da2e3ebdSchin 	"the file is defined as the current offset.]"
40*da2e3ebdSchin "[c:count?Output the number of times each line occurred  along with "
41*da2e3ebdSchin 	"the line.]"
42*da2e3ebdSchin "[d:repeated|duplicates?Output the first of each duplicate line.]"
43*da2e3ebdSchin "[D:all-repeated?Output all duplicate lines as a group with an empty "
44*da2e3ebdSchin     "line delimiter specified by \adelimit\a:]:?[delimit:=none]"
45*da2e3ebdSchin     "{"
46*da2e3ebdSchin         "[n:none?Do not delimit duplicate groups.]"
47*da2e3ebdSchin         "[p:prepend?Prepend an empty line before each group.]"
48*da2e3ebdSchin         "[s:separate?Separate each group with an empty line.]"
49*da2e3ebdSchin     "}"
50*da2e3ebdSchin "[f:skip-fields]#[fields?\afields\a is the number of fields to skip over "
51*da2e3ebdSchin     "before checking for uniqueness. A field is the minimal string matching "
52*da2e3ebdSchin     "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b.]"
53*da2e3ebdSchin "[i:ignore-case?Ignore case in comparisons.]"
54*da2e3ebdSchin "[s:skip-chars]#[chars?\achars\a is the number of characters to skip over "
55*da2e3ebdSchin 	"before checking for uniqueness.  If specified along with \b-f\b, "
56*da2e3ebdSchin 	"the first \achars\a after the first \afields\a are ignored.  If "
57*da2e3ebdSchin 	"the \achars\a specifies more characters than are on the line, "
58*da2e3ebdSchin 	"an empty string will be used for comparison.]"
59*da2e3ebdSchin "[u:unique?Output unique lines.]"
60*da2e3ebdSchin "[w:check-chars]#[chars?\achars\a is the number of characters to compare "
61*da2e3ebdSchin 	"after skipping any specified fields and characters.]"
62*da2e3ebdSchin "\n"
63*da2e3ebdSchin "\n[infile [outfile]]\n"
64*da2e3ebdSchin "\n"
65*da2e3ebdSchin "[+EXIT STATUS?]{"
66*da2e3ebdSchin 	"[+0?The input file was successfully processed.]"
67*da2e3ebdSchin 	"[+>0?An error occurred.]"
68*da2e3ebdSchin "}"
69*da2e3ebdSchin "[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]"
70*da2e3ebdSchin ;
71*da2e3ebdSchin 
72*da2e3ebdSchin #include <cmd.h>
73*da2e3ebdSchin 
/* Bits of the mode argument taken by uniq(). */
enum
{
	C_FLAG = 1,	/* -c: prefix each output line with its count */
	D_FLAG = 2,	/* -d / -D: keep only lines that were repeated */
	U_FLAG = 4	/* -u: keep only lines that were not repeated */
};

/* Layout of the count column written when C_FLAG is set. */
enum
{
	CWIDTH = 4,	/* digits reserved for the in-place count */
	MAXCNT = 9999	/* largest count handled by the in-place column */
};

/* Key comparison: returns 0 when the first n bytes of a and b match. */
typedef int (*Compare_f)(const char* a, const char* b, size_t n);
82*da2e3ebdSchin 
83*da2e3ebdSchin static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare)
84*da2e3ebdSchin {
85*da2e3ebdSchin 	register int n, f, outsize=0;
86*da2e3ebdSchin 	register char *cp, *ep, *bufp, *outp;
87*da2e3ebdSchin 	char *orecp, *sbufp=0, *outbuff;
88*da2e3ebdSchin 	int reclen,oreclen= -1,count=0,cwidth=0,sep,next;
89*da2e3ebdSchin 	if(mode&C_FLAG)
90*da2e3ebdSchin 		cwidth = CWIDTH+1;
91*da2e3ebdSchin 	while(1)
92*da2e3ebdSchin 	{
93*da2e3ebdSchin 		if(bufp = sfgetr(fdin,'\n',0))
94*da2e3ebdSchin 			n = sfvalue(fdin);
95*da2e3ebdSchin 		else if(bufp = sfgetr(fdin,'\n',SF_LASTR))
96*da2e3ebdSchin 		{
97*da2e3ebdSchin 			n = sfvalue(fdin);
98*da2e3ebdSchin 			bufp = memcpy(fmtbuf(n + 1), bufp, n);
99*da2e3ebdSchin 			bufp[n++] = '\n';
100*da2e3ebdSchin 		}
101*da2e3ebdSchin 		else
102*da2e3ebdSchin 			n = 0;
103*da2e3ebdSchin 		if(n)
104*da2e3ebdSchin 		{
105*da2e3ebdSchin 			cp = bufp;
106*da2e3ebdSchin 			ep = cp + n;
107*da2e3ebdSchin 			if(f=fields)
108*da2e3ebdSchin 				while(f-->0 && cp<ep) /* skip over fields */
109*da2e3ebdSchin 				{
110*da2e3ebdSchin 					while(cp<ep && *cp==' ' || *cp=='\t')
111*da2e3ebdSchin 						cp++;
112*da2e3ebdSchin 					while(cp<ep && *cp!=' ' && *cp!='\t')
113*da2e3ebdSchin 						cp++;
114*da2e3ebdSchin 				}
115*da2e3ebdSchin 			if(chars)
116*da2e3ebdSchin 				cp += chars;
117*da2e3ebdSchin 			if((reclen = n - (cp-bufp)) <=0)
118*da2e3ebdSchin 			{
119*da2e3ebdSchin 				reclen = 1;
120*da2e3ebdSchin 				cp = bufp + sfvalue(fdin)-1;
121*da2e3ebdSchin 			}
122*da2e3ebdSchin 			else if(width >= 0 && width < reclen)
123*da2e3ebdSchin 				reclen = width;
124*da2e3ebdSchin 		}
125*da2e3ebdSchin 		else
126*da2e3ebdSchin 			reclen=-2;
127*da2e3ebdSchin 		if(reclen==oreclen && (!reclen || !(*compare)(cp,orecp,reclen)))
128*da2e3ebdSchin 		{
129*da2e3ebdSchin 			count++;
130*da2e3ebdSchin 			if (!all)
131*da2e3ebdSchin 				continue;
132*da2e3ebdSchin 			next = count;
133*da2e3ebdSchin 		}
134*da2e3ebdSchin 		else
135*da2e3ebdSchin 		{
136*da2e3ebdSchin 			next = 0;
137*da2e3ebdSchin 			if(outsize>0)
138*da2e3ebdSchin 			{
139*da2e3ebdSchin 				if(((mode&D_FLAG)&&count==0) || ((mode&U_FLAG)&&count))
140*da2e3ebdSchin 				{
141*da2e3ebdSchin 					if(outp!=sbufp)
142*da2e3ebdSchin 						sfwrite(fdout,outp,0);
143*da2e3ebdSchin 				}
144*da2e3ebdSchin 				else
145*da2e3ebdSchin 				{
146*da2e3ebdSchin 					if(cwidth)
147*da2e3ebdSchin 					{
148*da2e3ebdSchin 						outp[CWIDTH] = ' ';
149*da2e3ebdSchin 						if(count<MAXCNT)
150*da2e3ebdSchin 						{
151*da2e3ebdSchin 							sfsprintf(outp,cwidth,"%*d",CWIDTH,count+1);
152*da2e3ebdSchin 							outp[CWIDTH] = ' ';
153*da2e3ebdSchin 						}
154*da2e3ebdSchin 						else
155*da2e3ebdSchin 						{
156*da2e3ebdSchin 							outsize -= (CWIDTH+1);
157*da2e3ebdSchin 							if(outp!=sbufp)
158*da2e3ebdSchin 							{
159*da2e3ebdSchin 								if(!(sbufp=fmtbuf(outsize)))
160*da2e3ebdSchin 									return(1);
161*da2e3ebdSchin 								memcpy(sbufp,outp+CWIDTH+1,outsize);
162*da2e3ebdSchin 								sfwrite(fdout,outp,0);
163*da2e3ebdSchin 								outp = sbufp;
164*da2e3ebdSchin 							}
165*da2e3ebdSchin 							else
166*da2e3ebdSchin 								outp += CWIDTH+1;
167*da2e3ebdSchin 							sfprintf(fdout,"%4d ",count+1);
168*da2e3ebdSchin 						}
169*da2e3ebdSchin 					}
170*da2e3ebdSchin 					if(sfwrite(fdout,outp,outsize) != outsize)
171*da2e3ebdSchin 						return(1);
172*da2e3ebdSchin 				}
173*da2e3ebdSchin 			}
174*da2e3ebdSchin 		}
175*da2e3ebdSchin 		if(n==0)
176*da2e3ebdSchin 			break;
177*da2e3ebdSchin 		if(count = next)
178*da2e3ebdSchin 		{
179*da2e3ebdSchin 			if(sfwrite(fdout,outp,outsize) != outsize)
180*da2e3ebdSchin 				return(1);
181*da2e3ebdSchin 			if(*all >= 0)
182*da2e3ebdSchin 				*all = 1;
183*da2e3ebdSchin 			sep = 0;
184*da2e3ebdSchin 		}
185*da2e3ebdSchin 		else
186*da2e3ebdSchin 			sep = all && *all > 0;
187*da2e3ebdSchin 		/* save current record */
188*da2e3ebdSchin 		if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0)
189*da2e3ebdSchin 			return(1);
190*da2e3ebdSchin 		outp = outbuff;
191*da2e3ebdSchin 		if(outsize < n+cwidth+sep)
192*da2e3ebdSchin 		{
193*da2e3ebdSchin 			/* no room in outp, clear lock and use side buffer */
194*da2e3ebdSchin 			sfwrite(fdout,outp,0);
195*da2e3ebdSchin 			if(!(sbufp = outp=fmtbuf(outsize=n+cwidth+sep)))
196*da2e3ebdSchin 				return(1);
197*da2e3ebdSchin 		}
198*da2e3ebdSchin 		else
199*da2e3ebdSchin 			outsize = n+cwidth+sep;
200*da2e3ebdSchin 		memcpy(outp+cwidth+sep,bufp,n);
201*da2e3ebdSchin 		if(sep)
202*da2e3ebdSchin 			outp[cwidth] = '\n';
203*da2e3ebdSchin 		oreclen = reclen;
204*da2e3ebdSchin 		orecp = outp+cwidth+sep + (cp-bufp);
205*da2e3ebdSchin 	}
206*da2e3ebdSchin 	return(0);
207*da2e3ebdSchin }
208*da2e3ebdSchin 
209*da2e3ebdSchin int
210*da2e3ebdSchin b_uniq(int argc, char** argv, void* context)
211*da2e3ebdSchin {
212*da2e3ebdSchin 	register int n, mode=0;
213*da2e3ebdSchin 	register char *cp;
214*da2e3ebdSchin 	int fields=0, chars=0, width=-1;
215*da2e3ebdSchin 	Sfio_t *fpin, *fpout;
216*da2e3ebdSchin 	int* all = 0;
217*da2e3ebdSchin 	int sep;
218*da2e3ebdSchin 	Compare_f compare = (Compare_f)memcmp;
219*da2e3ebdSchin 
220*da2e3ebdSchin 	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
221*da2e3ebdSchin 	while (n = optget(argv, usage)) switch (n)
222*da2e3ebdSchin 	{
223*da2e3ebdSchin 	    case 'c':
224*da2e3ebdSchin 		mode |= C_FLAG;
225*da2e3ebdSchin 		break;
226*da2e3ebdSchin 	    case 'd':
227*da2e3ebdSchin 		mode |= D_FLAG;
228*da2e3ebdSchin 		break;
229*da2e3ebdSchin 	    case 'D':
230*da2e3ebdSchin 		mode |= D_FLAG;
231*da2e3ebdSchin 		switch ((int)opt_info.num)
232*da2e3ebdSchin 		{
233*da2e3ebdSchin 		case 'p':
234*da2e3ebdSchin 			sep = 1;
235*da2e3ebdSchin 			break;
236*da2e3ebdSchin 		case 's':
237*da2e3ebdSchin 			sep = 0;
238*da2e3ebdSchin 			break;
239*da2e3ebdSchin 		default:
240*da2e3ebdSchin 			sep = -1;
241*da2e3ebdSchin 			break;
242*da2e3ebdSchin 		}
243*da2e3ebdSchin 		all = &sep;
244*da2e3ebdSchin 		break;
245*da2e3ebdSchin 	    case 'i':
246*da2e3ebdSchin 		compare = (Compare_f)strncasecmp;
247*da2e3ebdSchin 		break;
248*da2e3ebdSchin 	    case 'u':
249*da2e3ebdSchin 		mode |= U_FLAG;
250*da2e3ebdSchin 		break;
251*da2e3ebdSchin 	    case 'f':
252*da2e3ebdSchin 		if(*opt_info.option=='-')
253*da2e3ebdSchin 			fields = opt_info.num;
254*da2e3ebdSchin 		else
255*da2e3ebdSchin 			chars = opt_info.num;
256*da2e3ebdSchin 		break;
257*da2e3ebdSchin 	    case 's':
258*da2e3ebdSchin 		chars = opt_info.num;
259*da2e3ebdSchin 		break;
260*da2e3ebdSchin 	    case 'w':
261*da2e3ebdSchin 		width = opt_info.num;
262*da2e3ebdSchin 		break;
263*da2e3ebdSchin 	    case ':':
264*da2e3ebdSchin 		error(2, "%s", opt_info.arg);
265*da2e3ebdSchin 		break;
266*da2e3ebdSchin 	    case '?':
267*da2e3ebdSchin 		error(ERROR_usage(2), "%s", opt_info.arg);
268*da2e3ebdSchin 		break;
269*da2e3ebdSchin 	}
270*da2e3ebdSchin 	argv += opt_info.index;
271*da2e3ebdSchin 	if(all && (mode&C_FLAG))
272*da2e3ebdSchin 		error(2, "-c and -D are mutually exclusive");
273*da2e3ebdSchin 	if(error_info.errors)
274*da2e3ebdSchin 		error(ERROR_usage(2), "%s", optusage(NiL));
275*da2e3ebdSchin 	if((cp = *argv) && (argv++,!streq(cp,"-")))
276*da2e3ebdSchin 	{
277*da2e3ebdSchin 		if(!(fpin = sfopen(NiL,cp,"r")))
278*da2e3ebdSchin 			error(ERROR_system(1),"%s: cannot open",cp);
279*da2e3ebdSchin 	}
280*da2e3ebdSchin 	else
281*da2e3ebdSchin 		fpin = sfstdin;
282*da2e3ebdSchin 	if(cp = *argv)
283*da2e3ebdSchin 	{
284*da2e3ebdSchin 		argv++;
285*da2e3ebdSchin 		if(!(fpout = sfopen(NiL,cp,"w")))
286*da2e3ebdSchin 			error(ERROR_system(1),"%s: cannot create",cp);
287*da2e3ebdSchin 	}
288*da2e3ebdSchin 	else
289*da2e3ebdSchin 		fpout = sfstdout;
290*da2e3ebdSchin 	if(*argv)
291*da2e3ebdSchin 	{
292*da2e3ebdSchin 		error(2, "too many arguments");
293*da2e3ebdSchin 		error(ERROR_usage(2), "%s", optusage(NiL));
294*da2e3ebdSchin 	}
295*da2e3ebdSchin 	error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare);
296*da2e3ebdSchin 	if(fpin!=sfstdin)
297*da2e3ebdSchin 		sfclose(fpin);
298*da2e3ebdSchin 	if(fpout!=sfstdout)
299*da2e3ebdSchin 		sfclose(fpout);
300*da2e3ebdSchin 	return(error_info.errors);
301*da2e3ebdSchin }
302*da2e3ebdSchin 
303