xref: /titanic_41/usr/src/lib/libcmd/common/uniq.c (revision 4bff34e37def8a90f9194d81bc345c52ba20086a)
1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *           Copyright (c) 1992-2007 AT&T Knowledge Ventures            *
5 *                      and is licensed under the                       *
6 *                  Common Public License, Version 1.0                  *
7 *                      by AT&T Knowledge Ventures                      *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *            http://www.opensource.org/licenses/cpl1.0.txt             *
11 *         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                  David Korn <dgk@research.att.com>                   *
19 *                                                                      *
20 ***********************************************************************/
21 #pragma prototyped
22 /*
23  * uniq
24  *
25  * Written by David Korn
26  */
27 
28 static const char usage[] =
29 "[-?\n@(#)$Id: uniq (AT&T Research) 2006-08-28 $\n]"
30 USAGE_LICENSE
31 "[+NAME?uniq - Report or filter out repeated lines in a file]"
32 "[+DESCRIPTION?\buniq\b reads an input, comparing adjacent lines, and "
33 	"writing one copy of each input line on the output.  The second "
34 	"and succeeding copies of the repeated adjacent lines are not "
35 	"written.]"
36 "[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes "
37 	"to standard output.  If no \ainfile\a is given, or if the \ainfile\a "
38 	"is \b-\b, \buniq\b reads from standard input with  the start of "
39 	"the file is defined as the current offset.]"
40 "[c:count?Output the number of times each line occurred  along with "
41 	"the line.]"
42 "[d:repeated|duplicates?Output the first of each duplicate line.]"
43 "[D:all-repeated?Output all duplicate lines as a group with an empty "
44     "line delimiter specified by \adelimit\a:]:?[delimit:=none]"
45     "{"
46         "[n:none?Do not delimit duplicate groups.]"
47         "[p:prepend?Prepend an empty line before each group.]"
48         "[s:separate?Separate each group with an empty line.]"
49     "}"
50 "[f:skip-fields]#[fields?\afields\a is the number of fields to skip over "
51     "before checking for uniqueness. A field is the minimal string matching "
52     "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b.]"
53 "[i:ignore-case?Ignore case in comparisons.]"
54 "[s:skip-chars]#[chars?\achars\a is the number of characters to skip over "
55 	"before checking for uniqueness.  If specified along with \b-f\b, "
56 	"the first \achars\a after the first \afields\a are ignored.  If "
57 	"the \achars\a specifies more characters than are on the line, "
58 	"an empty string will be used for comparison.]"
59 "[u:unique?Output unique lines.]"
60 "[w:check-chars]#[chars?\achars\a is the number of characters to compare "
61 	"after skipping any specified fields and characters.]"
62 "\n"
63 "\n[infile [outfile]]\n"
64 "\n"
65 "[+EXIT STATUS?]{"
66 	"[+0?The input file was successfully processed.]"
67 	"[+>0?An error occurred.]"
68 "}"
69 "[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]"
70 ;
71 
72 #include <cmd.h>
73 
/*
 * bits of the uniq() mode argument, set from the command line in b_uniq()
 */
#define C_FLAG	(1<<0)	/* -c: reserve and fill a count column		*/
#define D_FLAG	(1<<1)	/* -d/-D: drop records that were not repeated	*/
#define U_FLAG	(1<<2)	/* -u: drop records that were repeated		*/

/*
 * the count column is CWIDTH digits followed by one space
 * counts below MAXCNT are formatted in place inside that column
 */
#define CWIDTH	4
#define MAXCNT	9999

/*
 * key comparison: returns 0 when the first size_t bytes of both keys match
 */
typedef int (*Compare_f)(const char*, const char*, size_t);
82 
83 static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare)
84 {
85 	register int n, f, outsize=0;
86 	register char *cp, *ep, *bufp, *outp;
87 	char *orecp, *sbufp=0, *outbuff;
88 	int reclen,oreclen= -1,count=0,cwidth=0,sep,next;
89 	if(mode&C_FLAG)
90 		cwidth = CWIDTH+1;
91 	while(1)
92 	{
93 		if(bufp = sfgetr(fdin,'\n',0))
94 			n = sfvalue(fdin);
95 		else if(bufp = sfgetr(fdin,'\n',SF_LASTR))
96 		{
97 			n = sfvalue(fdin);
98 			bufp = memcpy(fmtbuf(n + 1), bufp, n);
99 			bufp[n++] = '\n';
100 		}
101 		else
102 			n = 0;
103 		if(n)
104 		{
105 			cp = bufp;
106 			ep = cp + n;
107 			if(f=fields)
108 				while(f-->0 && cp<ep) /* skip over fields */
109 				{
110 					while(cp<ep && *cp==' ' || *cp=='\t')
111 						cp++;
112 					while(cp<ep && *cp!=' ' && *cp!='\t')
113 						cp++;
114 				}
115 			if(chars)
116 				cp += chars;
117 			if((reclen = n - (cp-bufp)) <=0)
118 			{
119 				reclen = 1;
120 				cp = bufp + sfvalue(fdin)-1;
121 			}
122 			else if(width >= 0 && width < reclen)
123 				reclen = width;
124 		}
125 		else
126 			reclen=-2;
127 		if(reclen==oreclen && (!reclen || !(*compare)(cp,orecp,reclen)))
128 		{
129 			count++;
130 			if (!all)
131 				continue;
132 			next = count;
133 		}
134 		else
135 		{
136 			next = 0;
137 			if(outsize>0)
138 			{
139 				if(((mode&D_FLAG)&&count==0) || ((mode&U_FLAG)&&count))
140 				{
141 					if(outp!=sbufp)
142 						sfwrite(fdout,outp,0);
143 				}
144 				else
145 				{
146 					if(cwidth)
147 					{
148 						outp[CWIDTH] = ' ';
149 						if(count<MAXCNT)
150 						{
151 							sfsprintf(outp,cwidth,"%*d",CWIDTH,count+1);
152 							outp[CWIDTH] = ' ';
153 						}
154 						else
155 						{
156 							outsize -= (CWIDTH+1);
157 							if(outp!=sbufp)
158 							{
159 								if(!(sbufp=fmtbuf(outsize)))
160 									return(1);
161 								memcpy(sbufp,outp+CWIDTH+1,outsize);
162 								sfwrite(fdout,outp,0);
163 								outp = sbufp;
164 							}
165 							else
166 								outp += CWIDTH+1;
167 							sfprintf(fdout,"%4d ",count+1);
168 						}
169 					}
170 					if(sfwrite(fdout,outp,outsize) != outsize)
171 						return(1);
172 				}
173 			}
174 		}
175 		if(n==0)
176 			break;
177 		if(count = next)
178 		{
179 			if(sfwrite(fdout,outp,outsize) != outsize)
180 				return(1);
181 			if(*all >= 0)
182 				*all = 1;
183 			sep = 0;
184 		}
185 		else
186 			sep = all && *all > 0;
187 		/* save current record */
188 		if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0)
189 			return(1);
190 		outp = outbuff;
191 		if(outsize < n+cwidth+sep)
192 		{
193 			/* no room in outp, clear lock and use side buffer */
194 			sfwrite(fdout,outp,0);
195 			if(!(sbufp = outp=fmtbuf(outsize=n+cwidth+sep)))
196 				return(1);
197 		}
198 		else
199 			outsize = n+cwidth+sep;
200 		memcpy(outp+cwidth+sep,bufp,n);
201 		if(sep)
202 			outp[cwidth] = '\n';
203 		oreclen = reclen;
204 		orecp = outp+cwidth+sep + (cp-bufp);
205 	}
206 	return(0);
207 }
208 
209 int
210 b_uniq(int argc, char** argv, void* context)
211 {
212 	register int n, mode=0;
213 	register char *cp;
214 	int fields=0, chars=0, width=-1;
215 	Sfio_t *fpin, *fpout;
216 	int* all = 0;
217 	int sep;
218 	Compare_f compare = (Compare_f)memcmp;
219 
220 	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
221 	while (n = optget(argv, usage)) switch (n)
222 	{
223 	    case 'c':
224 		mode |= C_FLAG;
225 		break;
226 	    case 'd':
227 		mode |= D_FLAG;
228 		break;
229 	    case 'D':
230 		mode |= D_FLAG;
231 		switch ((int)opt_info.num)
232 		{
233 		case 'p':
234 			sep = 1;
235 			break;
236 		case 's':
237 			sep = 0;
238 			break;
239 		default:
240 			sep = -1;
241 			break;
242 		}
243 		all = &sep;
244 		break;
245 	    case 'i':
246 		compare = (Compare_f)strncasecmp;
247 		break;
248 	    case 'u':
249 		mode |= U_FLAG;
250 		break;
251 	    case 'f':
252 		if(*opt_info.option=='-')
253 			fields = opt_info.num;
254 		else
255 			chars = opt_info.num;
256 		break;
257 	    case 's':
258 		chars = opt_info.num;
259 		break;
260 	    case 'w':
261 		width = opt_info.num;
262 		break;
263 	    case ':':
264 		error(2, "%s", opt_info.arg);
265 		break;
266 	    case '?':
267 		error(ERROR_usage(2), "%s", opt_info.arg);
268 		break;
269 	}
270 	argv += opt_info.index;
271 	if(all && (mode&C_FLAG))
272 		error(2, "-c and -D are mutually exclusive");
273 	if(error_info.errors)
274 		error(ERROR_usage(2), "%s", optusage(NiL));
275 	if((cp = *argv) && (argv++,!streq(cp,"-")))
276 	{
277 		if(!(fpin = sfopen(NiL,cp,"r")))
278 			error(ERROR_system(1),"%s: cannot open",cp);
279 	}
280 	else
281 		fpin = sfstdin;
282 	if(cp = *argv)
283 	{
284 		argv++;
285 		if(!(fpout = sfopen(NiL,cp,"w")))
286 			error(ERROR_system(1),"%s: cannot create",cp);
287 	}
288 	else
289 		fpout = sfstdout;
290 	if(*argv)
291 	{
292 		error(2, "too many arguments");
293 		error(ERROR_usage(2), "%s", optusage(NiL));
294 	}
295 	error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare);
296 	if(fpin!=sfstdin)
297 		sfclose(fpin);
298 	if(fpout!=sfstdout)
299 		sfclose(fpout);
300 	return(error_info.errors);
301 }
302 
303