xref: /titanic_50/usr/src/lib/libcmd/common/cut.c (revision ad593b7f6950a2f44f8c46948367d37431f74fd0)
1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 1992-2008 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                  Common Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *            http://www.opensource.org/licenses/cpl1.0.txt             *
11 *         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                  David Korn <dgk@research.att.com>                   *
19 *                                                                      *
20 ***********************************************************************/
21 #pragma prototyped
22 /*
23  * David Korn
24  * AT&T Bell Laboratories
25  *
26  * cut [-sN] [-b blist] [-c clist] [-f flist] [-d delim] [-D delim] [-r reclen] [file] ...
27  *
28  * cut selected bytes, characters or fields from each line of a file
29  */
30 
31 static const char usage[] =
32 "[-?\n@(#)$Id: cut (AT&T Research) 2008-04-01 $\n]"
33 USAGE_LICENSE
34 "[+NAME?cut - cut out selected columns or fields of each line of a file]"
35 "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
36 	"from one or more files, contatenating them on standard output.]"
37 "[+?The option argument \alist\a is a comma-separated or blank-separated "
38 	"list of positive numbers and ranges.  Ranges can be of three "
39 	"forms.  The first is two positive integers separated by a hyphen "
40 	"(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
41 	"\ahigh\a.  The second is a positive number preceded by a hyphen "
42 	"(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
43 	"\ahigh\a.  The last is a positive number followed by a hyphen "
44 	"(\alow\a\b-\b), which represents all fields from \alow\a to the "
45 	"last field, inclusive.  Elements in the \alist\a can be repeated, "
46 	"can overlap, and can appear in any order.  The order of the "
47 	"output is that of the input.]"
48 "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
49 "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
50         "cuts from standard input.   The start of the file is defined "
51         "as the current offset.]"
52 "[b:bytes]:[list?\bcut\b based on a list of bytes.]"
53 "[c:characters]:[list?\bcut\b based on a list of characters.]"
54 "[d:delimiter]:[delim?The field character for the \b-f\b option is set "
55 	"to \adelim\a.  The default is the \btab\b character.]"
56 "[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
57 	"character specified with the \b-d\b optiion.]"
58 "[n:nosplit?Do not split characters.  Currently ignored.]"
59 "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
60 	"records of length \areclen\a when used with the \b-b\b or \b-c\b "
61 	"option.]"
62 "[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
63 	"when used with the \b-f\b option.  By default, lines with no "
64 	"delimiters will be passsed in untouched.]"
65 "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
66 	"the \b-f\b option is set to \aldelim\a.  The default is the "
67 	"\bnewline\b character.]"
68 "[N:nonewline?Do not output new-lines at end of each record when used "
69 	"with the \b-b\b or \b-c\b option.]"
70 "\n"
71 "\n[file ...]\n"
72 "\n"
73 "[+EXIT STATUS?]{"
74 	"[+0?All files processed successfully.]"
75 	"[+>0?One or more files failed to open or could not be read.]"
76 "}"
77 "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
78 ;
79 
80 #include <cmd.h>
81 #include <ctype.h>
82 
/*
 * delimiters currently marked in Cut_t.space[], so that cutfields()
 * can clear the old marks before installing new ones
 */
typedef struct Last_s
{
	int		seqno;		/* counter used by cutinit() to number a list */
	int		seq;		/* seqno for which space[] was last set up */
	int		wdelim;		/* field delimiter marked in space[] */
	int		ldelim;		/* line delimiter marked in space[] */
} Last_t;

/*
 * cut descriptor built by cutinit() and used by cutcols()/cutfields()
 */
typedef struct Cut_s
{
	int		cflag;		/* -c given and mbwide() is true */
	int		sflag;		/* -s: suppress lines without delimiter */
	int		nlflag;		/* -N: no line delimiter after records */
	int		wdelim;		/* field delimiter (-d) */
	int		ldelim;		/* line delimiter (-D) */
	int		seqno;		/* compared with last.seq by cutfields() */
	int		reclen;		/* fixed record length (-r), 0 for lines */
	/*
	 * per byte class: 1 field delimiter, -1 line delimiter, else 0;
	 * indexed by an unsigned char, so it needs UCHAR_MAX+1 slots
	 */
	signed char	space[UCHAR_MAX+1];
	Last_t		last;
	int		list[2];	/* NOTE: must be last member */
} Cut_t;
104 
#define HUGE		(1<<14)		/* list terminator and open-ended range length */
#define BLOCK		(8*1024)	/* size handed to sftmp() */
/* mode bits collected from the command line by b_cut() */
#define C_BYTES		1		/* -b */
#define C_CHARS		2		/* -c */
#define C_FIELDS	4		/* -f */
#define C_SUPRESS	8		/* -s */
#define C_NOCHOP	16		/* -n */
#define C_NONEWLINE	32		/* -N */
113 
/*
 * qsort() comparison function: order two arrays of integers by
 * their first element only
 */

static int mycomp(register const void *a,register const void *b)
{
	const int x = *(const int*)a;
	const int y = *(const int*)b;

	/* compare instead of subtracting: x-y can overflow */
	return((x > y) - (x < y));
}
122 
123 static Cut_t *cutinit(int mode,char *str,int wdelim,int ldelim,size_t reclen)
124 {
125 	register int *lp, c, n=0;
126 	register int range = 0;
127 	register char *cp = str;
128 	Cut_t *cuthdr;
129 	if (!(cuthdr = (Cut_t*)stakalloc(sizeof(Cut_t)+strlen(cp)*sizeof(int))))
130 		error(ERROR_exit(1), "out of space");
131 	memset(cuthdr->space, 0, sizeof(cuthdr->space));
132 	cuthdr->last.seqno = 0;
133 	cuthdr->last.seq = 0;
134 	cuthdr->last.wdelim = 0;
135 	cuthdr->last.ldelim = '\n';
136 	cuthdr->cflag = ((mode&C_CHARS)!=0 && mbwide());
137 	cuthdr->sflag = ((mode&C_SUPRESS)!=0);
138 	cuthdr->nlflag = ((mode&C_NONEWLINE)!=0);
139 	cuthdr->wdelim = wdelim;
140 	cuthdr->ldelim = ldelim;
141 	cuthdr->reclen = reclen;
142 	cuthdr->seqno = ++cuthdr->last.seqno;
143 	lp = cuthdr->list;
144 	while(1) switch(c= *cp++)
145 	{
146 		case ' ':
147 		case '\t':
148 			while(*cp==' ' || *cp=='\t')
149 				cp++;
150 		case 0:
151 		case ',':
152 			if(range)
153 			{
154 				--range;
155 				if((n = (n==0?HUGE:n-range)) < 0)
156 					error(ERROR_exit(1),"invalid range for c/f option");
157 				*lp++ = range;
158 				*lp++ = n;
159 			}
160 			else
161 			{
162 				*lp++ = --n;
163 				*lp++ = 1;
164 			}
165 			if(c==0)
166 			{
167 				register int *dp;
168 				*lp = HUGE;
169 				n = 1 + (lp-cuthdr->list)/2;
170 				qsort(lp=cuthdr->list,n,2*sizeof(*lp),mycomp);
171 				/* eliminate overlapping regions */
172 				for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2)
173 				{
174 					if(lp[0] <= range)
175 					{
176 						if(lp[1]==HUGE)
177 						{
178 							dp[-1] = HUGE;
179 							break;
180 						}
181 						if((c = lp[0]+lp[1]-range)>0)
182 						{
183 							range += c;
184 							dp[-1] += c;
185 						}
186 					}
187 					else
188 					{
189 						range = *dp++ = lp[0];
190 						if(lp[1]==HUGE)
191 						{
192 							*dp++ = HUGE;
193 							break;
194 						}
195 						range += (*dp++ = lp[1]);
196 					}
197 				}
198 				*dp = HUGE;
199 				lp = cuthdr->list;
200 				/* convert ranges into gaps */
201 				for(n=0; *lp!=HUGE; lp+=2)
202 				{
203 					c = *lp;
204 					*lp -= n;
205 					n = c+lp[1];
206 				}
207 				return(cuthdr);
208 			}
209 			n = range = 0;
210 			break;
211 
212 		case '-':
213 			if(range)
214 				error(ERROR_exit(1),"bad list for c/f option");
215 			range = n?n:1;
216 			n = 0;
217 			break;
218 
219 		default:
220 			if(!isdigit(c))
221 				error(ERROR_exit(1),"bad list for c/f option");
222 			n = 10*n + (c-'0');
223 	}
224 	/* NOTREACHED */
225 }
226 
/*
 * return the byte offset reached after advancing <n> multi-byte
 * characters from <str>, examining at most <inlen> bytes;
 * <inlen>+1 is returned when the bytes run out before <n>
 * characters were counted
 */
static int advance(const char *str, register int n, register int inlen)
{
	register int size, len=inlen;
	register const char *cp=str;
	while(len>0 && n-->0)
	{
		size = mblen(cp, len);
		/*
		 * mblen() gives -1 for an invalid sequence and 0 for a NUL
		 * byte; treat both as one single-byte character so that
		 * every counted character consumes input
		 */
		if(size<1)
			size = 1;
		cp += size;
		len -= size;

	}
	if(n>0)
		return(inlen+1);
	return(cp-str);
}
247 
248 /*
249  * cut each line of file <fdin> and put results to <fdout> using list <list>
250  */
251 
252 static void cutcols(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout)
253 {
254 	register int		c, ncol=0,len;
255 	register const int	*lp = cuthdr->list;
256 	register char		*inp;
257 	register int		skip; /* non-zero for don't copy */
258 	while(1)
259 	{
260 		if(len = cuthdr->reclen)
261 			inp = sfreserve(fdin, len, -1);
262 		else
263 			inp = sfgetr(fdin, '\n', 0);
264 		if(!inp && !(inp = sfgetr(fdin, 0, SF_LASTR)))
265 			break;
266 		len = sfvalue(fdin);
267 		if((ncol = skip  = *(lp = cuthdr->list)) == 0)
268 			ncol = *++lp;
269 		while(1)
270 		{
271 			if((c=(cuthdr->cflag?advance(inp,ncol,len):ncol)) > len)
272 				c = len;
273 			else if(c==len && !skip)
274 				ncol++;
275 			ncol -= c;
276 			if(!skip && sfwrite(fdout,(char*)inp,c)<0)
277 				return;
278 			inp += c;
279 			if(ncol)
280 				break;
281 			len -= c;
282 			ncol = *++lp;
283 			skip = !skip;
284 		}
285 		if(!cuthdr->nlflag && (skip || cuthdr->reclen))
286 			sfputc(fdout,cuthdr->ldelim);
287 	}
288 }
289 
290 /*
291  * cut each line of file <fdin> and put results to <fdout> using list <list>
292  * stream <fdin> must be line buffered
293  */
294 
295 #define endline(c)	(((signed char)-1)<0?(c)<0:(c)==((char)-1))
296 
297 static void cutfields(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout)
298 {
299 	register unsigned char *cp;
300 	register int c, nfields;
301 	register const int *lp = cuthdr->list;
302 	register unsigned char *copy;
303 	register int nodelim, empty, inword=0;
304 	register unsigned char *endbuff;
305 	unsigned char *inbuff, *first;
306 	int lastchar;
307 	Sfio_t *fdtmp = 0;
308 	long offset = 0;
309 	if(cuthdr->seqno != cuthdr->last.seq)
310 	{
311 		cuthdr->space[cuthdr->last.ldelim] = 0;
312 		cuthdr->space[cuthdr->last.wdelim] = 0;
313 		cuthdr->space[cuthdr->last.wdelim=cuthdr->wdelim] = 1;
314 		cuthdr->space[cuthdr->last.ldelim=cuthdr->ldelim] = -1;
315 		cuthdr->last.seq = cuthdr->seqno;
316 	}
317 	/* process each buffer */
318 	while ((inbuff = (unsigned char*)sfreserve(fdin, SF_UNBOUND, 0)) && (c = sfvalue(fdin)) > 0)
319 	{
320 		cp = inbuff;
321 		endbuff = cp + --c;
322 		if((lastchar = cp[c]) != cuthdr->ldelim)
323 			*endbuff = cuthdr->ldelim;
324 		/* process each line in the buffer */
325 		while(cp <= endbuff)
326 		{
327 			first = cp;
328 			if(!inword)
329 			{
330 				nodelim = empty = 1;
331 				copy = cp;
332 				if(nfields = *(lp = cuthdr->list))
333 					copy = 0;
334 				else
335 					nfields = *++lp;
336 			}
337 			else if(copy)
338 				copy = cp;
339 			inword = 0;
340 			while(!inword)
341 			{
342 				/* skip over non-delimiter characters */
343 				while(!(c=cuthdr->space[*cp++]));
344 				/* check for end-of-line */
345 				if(endline(c))
346 				{
347 					if(cp<=endbuff)
348 						break;
349 					if((c=cuthdr->space[lastchar]),endline(c))
350 						break;
351 					/* restore cuthdr->last. character */
352 					if(lastchar != cuthdr->ldelim)
353 						*endbuff = lastchar;
354 					inword++;
355 					if(!c)
356 						break;
357 				}
358 				nodelim = 0;
359 				if(--nfields >0)
360 					continue;
361 				nfields = *++lp;
362 				if(copy)
363 				{
364 					empty = 0;
365 					if((c=(cp-1)-copy)>0 && sfwrite(fdout,(char*)copy,c)< 0)
366 						goto failed;
367 					copy = 0;
368 				}
369 				else
370 					/* set to delimiter unless the first field */
371 					copy = cp -!empty;
372 			}
373 			if(!inword)
374 			{
375 				if(!copy)
376 				{
377 					if(nodelim)
378 					{
379 						if(!cuthdr->sflag)
380 						{
381 							if(offset)
382 							{
383 								sfseek(fdtmp,(Sfoff_t)0,SEEK_SET);
384 								sfmove(fdtmp,fdout,offset,-1);
385 							}
386 							copy = first;
387 						}
388 					}
389 					else
390 						sfputc(fdout,'\n');
391 				}
392 				if(offset)
393 					sfseek(fdtmp,offset=0,SEEK_SET);
394 			}
395 			if(copy && (c=cp-copy)>0 && (!nodelim || !cuthdr->sflag) && sfwrite(fdout,(char*)copy,c)< 0)
396 				goto failed;
397 		}
398 		/* see whether to save in tmp file */
399 		if(inword && nodelim && !cuthdr->sflag && (c=cp-first)>0)
400 		{
401 			/* copy line to tmpfile in case no fields */
402 			if(!fdtmp)
403 				fdtmp = sftmp(BLOCK);
404 			sfwrite(fdtmp,(char*)first,c);
405 			offset +=c;
406 		}
407 	}
408 failed:
409 	if(fdtmp)
410 		sfclose(fdtmp);
411 }
412 
413 int
414 b_cut(int argc,char *argv[], void* context)
415 {
416 	register char *cp = 0;
417 	register Sfio_t *fp;
418 	int	n;
419 	Cut_t	*cuthdr;
420 	int	mode = 0;
421 	int	wdelim = '\t';
422 	int	ldelim = '\n';
423 	size_t	reclen = 0;
424 
425 	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
426 	while (n = optget(argv, usage)) switch (n)
427 	{
428 	  case 'b':
429 	  case 'c':
430 		if(mode&C_FIELDS)
431 		{
432 			error(2, "f option already specified");
433 			break;
434 		}
435 		cp = opt_info.arg;
436 		if(n=='b')
437 			mode |= C_BYTES;
438 		else
439 			mode |= C_CHARS;
440 		break;
441 	  case 'D':
442 		ldelim = *(unsigned char*)opt_info.arg;
443 		break;
444 	  case 'd':
445 		wdelim = *(unsigned char*)opt_info.arg;
446 		break;
447 	  case 'f':
448 		if(mode&(C_CHARS|C_BYTES))
449 		{
450 			error(2, "c option already specified");
451 			break;
452 		}
453 		cp = opt_info.arg;
454 		mode |= C_FIELDS;
455 		break;
456 	  case 'n':
457 		mode |= C_NOCHOP;
458 		break;
459 	  case 'N':
460 		mode |= C_NONEWLINE;
461 		break;
462 	  case 'R':
463 	  case 'r':
464 		if(opt_info.num>0)
465 			reclen = opt_info.num;
466 		break;
467 	  case 's':
468 		mode |= C_SUPRESS;
469 		break;
470 	  case ':':
471 		error(2, "%s", opt_info.arg);
472 		break;
473 	  case '?':
474 		error(ERROR_usage(2), "%s", opt_info.arg);
475 		break;
476 	}
477 	argv += opt_info.index;
478 	if (error_info.errors)
479 		error(ERROR_usage(2), "%s",optusage(NiL));
480 	if(!cp)
481 	{
482 		error(2, "b, c or f option must be specified");
483 		error(ERROR_usage(2), "%s", optusage(NiL));
484 	}
485 	if(!*cp)
486 		error(3, "non-empty b, c or f option must be specified");
487 	if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS)
488 		error(3, "s option requires f option");
489 	cuthdr = cutinit(mode,cp,wdelim,ldelim,reclen);
490 	if(cp = *argv)
491 		argv++;
492 	do
493 	{
494 		if(!cp || streq(cp,"-"))
495 			fp = sfstdin;
496 		else if(!(fp = sfopen(NiL,cp,"r")))
497 		{
498 			error(ERROR_system(0),"%s: cannot open",cp);
499 			continue;
500 		}
501 		if(mode&C_FIELDS)
502 			cutfields(cuthdr,fp,sfstdout);
503 		else
504 			cutcols(cuthdr,fp,sfstdout);
505 		if(fp!=sfstdin)
506 			sfclose(fp);
507 	} while(cp = *argv++);
508 	if (sfsync(sfstdout))
509 		error(ERROR_system(0), "write error");
510 	return(error_info.errors?1:0);
511 }
512