xref: /titanic_51/usr/src/lib/libcmd/common/cut.c (revision 3e14f97f673e8a630f076077de35afdd43dc1587)
/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*          Copyright (c) 1992-2010 AT&T Intellectual Property          *
*                      and is licensed under the                       *
*                  Common Public License, Version 1.0                  *
*                    by AT&T Intellectual Property                     *
*                                                                      *
*                A copy of the License is available at                 *
*            http://www.opensource.org/licenses/cpl1.0.txt             *
*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                  David Korn <dgk@research.att.com>                   *
*                                                                      *
***********************************************************************/
21da2e3ebdSchin #pragma prototyped
/*
 * David Korn
 * AT&T Bell Laboratories
 *
 * cut selected bytes, characters or fields from each line of a file
 */

28da2e3ebdSchin 
29da2e3ebdSchin static const char usage[] =
30*3e14f97fSRoger A. Faulkner "[-?\n@(#)$Id: cut (AT&T Research) 2009-12-04 $\n]"
31da2e3ebdSchin USAGE_LICENSE
32da2e3ebdSchin "[+NAME?cut - cut out selected columns or fields of each line of a file]"
33da2e3ebdSchin "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
34da2e3ebdSchin 	"from one or more files, contatenating them on standard output.]"
35da2e3ebdSchin "[+?The option argument \alist\a is a comma-separated or blank-separated "
36da2e3ebdSchin 	"list of positive numbers and ranges.  Ranges can be of three "
37da2e3ebdSchin 	"forms.  The first is two positive integers separated by a hyphen "
38da2e3ebdSchin 	"(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
39da2e3ebdSchin 	"\ahigh\a.  The second is a positive number preceded by a hyphen "
40da2e3ebdSchin 	"(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
41da2e3ebdSchin 	"\ahigh\a.  The last is a positive number followed by a hyphen "
42da2e3ebdSchin 	"(\alow\a\b-\b), which represents all fields from \alow\a to the "
43da2e3ebdSchin 	"last field, inclusive.  Elements in the \alist\a can be repeated, "
44da2e3ebdSchin 	"can overlap, and can appear in any order.  The order of the "
45da2e3ebdSchin 	"output is that of the input.]"
46da2e3ebdSchin "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
47da2e3ebdSchin "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
48da2e3ebdSchin         "cuts from standard input.   The start of the file is defined "
49da2e3ebdSchin         "as the current offset.]"
50*3e14f97fSRoger A. Faulkner "[b:bytes]:[list?\bcut\b based on a list of byte counts.]"
51*3e14f97fSRoger A. Faulkner "[c:characters]:[list?\bcut\b based on a list of character counts.]"
52da2e3ebdSchin "[d:delimiter]:[delim?The field character for the \b-f\b option is set "
53da2e3ebdSchin 	"to \adelim\a.  The default is the \btab\b character.]"
54da2e3ebdSchin "[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
55da2e3ebdSchin 	"character specified with the \b-d\b optiion.]"
56*3e14f97fSRoger A. Faulkner "[n!:split?Split multibyte characters selected by the \b-b\b option.]"
57da2e3ebdSchin "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
58da2e3ebdSchin 	"records of length \areclen\a when used with the \b-b\b or \b-c\b "
59da2e3ebdSchin 	"option.]"
60da2e3ebdSchin "[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
61da2e3ebdSchin 	"when used with the \b-f\b option.  By default, lines with no "
62da2e3ebdSchin 	"delimiters will be passsed in untouched.]"
63da2e3ebdSchin "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
64da2e3ebdSchin 	"the \b-f\b option is set to \aldelim\a.  The default is the "
65da2e3ebdSchin 	"\bnewline\b character.]"
66*3e14f97fSRoger A. Faulkner "[N!:newline?Output new-lines at end of each record when used "
67da2e3ebdSchin 	"with the \b-b\b or \b-c\b option.]"
68da2e3ebdSchin "\n"
69da2e3ebdSchin "\n[file ...]\n"
70da2e3ebdSchin "\n"
71da2e3ebdSchin "[+EXIT STATUS?]{"
72da2e3ebdSchin 	"[+0?All files processed successfully.]"
73da2e3ebdSchin 	"[+>0?One or more files failed to open or could not be read.]"
74da2e3ebdSchin "}"
75da2e3ebdSchin "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
76da2e3ebdSchin ;
77da2e3ebdSchin 
78da2e3ebdSchin #include <cmd.h>
79da2e3ebdSchin #include <ctype.h>
80da2e3ebdSchin 
/*
 * A field or line delimiter as taken from the command line.
 */
typedef struct Delim_s
{
	char*		str;	/* delimiter text; written verbatim by cutcols() when len > 1 */
	int		len;	/* length of the delimiter in bytes (1 for a single byte) */
	int		chr;	/* delimiter character code */
} Delim_t;
87da2e3ebdSchin 
88da2e3ebdSchin typedef struct Cut_s
89da2e3ebdSchin {
90*3e14f97fSRoger A. Faulkner 	int		mb;
91*3e14f97fSRoger A. Faulkner 	int		eob;
92da2e3ebdSchin 	int		cflag;
93*3e14f97fSRoger A. Faulkner 	int		nosplit;
94da2e3ebdSchin 	int		sflag;
95da2e3ebdSchin 	int		nlflag;
96da2e3ebdSchin 	int		reclen;
97*3e14f97fSRoger A. Faulkner 	Delim_t		wdelim;
98*3e14f97fSRoger A. Faulkner 	Delim_t		ldelim;
99*3e14f97fSRoger A. Faulkner 	unsigned char	space[UCHAR_MAX+1];
100da2e3ebdSchin 	int		list[2];	/* NOTE: must be last member */
101da2e3ebdSchin } Cut_t;
102da2e3ebdSchin 
#define HUGE		INT_MAX		/* terminator of Cut_t.list */
#define BLOCK		(8*1024)	/* size argument passed to sftmp() */

/* mode bits collected by b_cut() and decoded by cutinit() */
#define C_BYTES		1
#define C_CHARS		2
#define C_FIELDS	4
#define C_SUPRESS	8
#define C_NOSPLIT	16
#define C_NONEWLINE	32

/* byte classes stored in Cut_t.space[] */
#define SP_LINE		1
#define SP_WORD		2
#define SP_WIDE		3

/* convert the multibyte sequence at p (at most n bytes) into wide char w */
#define mb2wc(w,p,n)	((*ast.mb_towc)(&(w),(char*)(p),(n)))

117*3e14f97fSRoger A. Faulkner 
/*
 * qsort() comparison callback: order two int arrays by their first
 * element only.  Returns -1, 0 or 1.
 */

static int
mycomp(const void* a, const void* b)
{
	const int	x = *(const int*)a;
	const int	y = *(const int*)b;

	if (x < y)
		return -1;
	if (x > y)
		return 1;
	return 0;
}
131da2e3ebdSchin 
132*3e14f97fSRoger A. Faulkner static Cut_t*
cutinit(int mode,char * str,Delim_t * wdelim,Delim_t * ldelim,size_t reclen)133*3e14f97fSRoger A. Faulkner cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen)
134da2e3ebdSchin {
135*3e14f97fSRoger A. Faulkner 	register int*	lp;
136*3e14f97fSRoger A. Faulkner 	register int	c;
137*3e14f97fSRoger A. Faulkner 	register int	n = 0;
138da2e3ebdSchin 	register int	range = 0;
139da2e3ebdSchin 	register char*	cp = str;
140*3e14f97fSRoger A. Faulkner 	Cut_t*		cut;
141*3e14f97fSRoger A. Faulkner 
142*3e14f97fSRoger A. Faulkner 	if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int))))
143da2e3ebdSchin 		error(ERROR_exit(1), "out of space");
144*3e14f97fSRoger A. Faulkner 	if (cut->mb = mbwide())
145*3e14f97fSRoger A. Faulkner 	{
146*3e14f97fSRoger A. Faulkner 		memset(cut->space, 0, sizeof(cut->space) / 2);
147*3e14f97fSRoger A. Faulkner 		memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2);
148*3e14f97fSRoger A. Faulkner 	}
149*3e14f97fSRoger A. Faulkner 	else
150*3e14f97fSRoger A. Faulkner 		memset(cut->space, 0, sizeof(cut->space));
151*3e14f97fSRoger A. Faulkner 	cut->wdelim = *wdelim;
152*3e14f97fSRoger A. Faulkner 	if (wdelim->len == 1)
153*3e14f97fSRoger A. Faulkner 		cut->space[wdelim->chr] = SP_WORD;
154*3e14f97fSRoger A. Faulkner 	cut->ldelim = *ldelim;
155*3e14f97fSRoger A. Faulkner 	cut->eob = (ldelim->len == 1) ? ldelim->chr : 0;
156*3e14f97fSRoger A. Faulkner 	cut->space[cut->eob] = SP_LINE;
157*3e14f97fSRoger A. Faulkner 	cut->cflag = (mode&C_CHARS) && cut->mb;
158*3e14f97fSRoger A. Faulkner 	cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb;
159*3e14f97fSRoger A. Faulkner 	cut->sflag = (mode&C_SUPRESS) != 0;
160*3e14f97fSRoger A. Faulkner 	cut->nlflag = (mode&C_NONEWLINE) != 0;
161*3e14f97fSRoger A. Faulkner 	cut->reclen = reclen;
162*3e14f97fSRoger A. Faulkner 	lp = cut->list;
163*3e14f97fSRoger A. Faulkner 	for (;;)
164*3e14f97fSRoger A. Faulkner 		switch(c = *cp++)
165da2e3ebdSchin 		{
166da2e3ebdSchin 		case ' ':
167da2e3ebdSchin 		case '\t':
168da2e3ebdSchin 			while(*cp==' ' || *cp=='\t')
169da2e3ebdSchin 				cp++;
170*3e14f97fSRoger A. Faulkner 			/*FALLTHROUGH*/
171da2e3ebdSchin 		case 0:
172da2e3ebdSchin 		case ',':
173da2e3ebdSchin 			if(range)
174da2e3ebdSchin 			{
175da2e3ebdSchin 				--range;
176*3e14f97fSRoger A. Faulkner 				if((n = (n ? (n-range) : (HUGE-1))) < 0)
177da2e3ebdSchin 					error(ERROR_exit(1),"invalid range for c/f option");
178da2e3ebdSchin 				*lp++ = range;
179da2e3ebdSchin 				*lp++ = n;
180da2e3ebdSchin 			}
181da2e3ebdSchin 			else
182da2e3ebdSchin 			{
183da2e3ebdSchin 				*lp++ = --n;
184da2e3ebdSchin 				*lp++ = 1;
185da2e3ebdSchin 			}
186da2e3ebdSchin 			if(c==0)
187da2e3ebdSchin 			{
188da2e3ebdSchin 				register int *dp;
189da2e3ebdSchin 				*lp = HUGE;
190*3e14f97fSRoger A. Faulkner 				n = 1 + (lp-cut->list)/2;
191*3e14f97fSRoger A. Faulkner 				qsort(lp=cut->list,n,2*sizeof(*lp),mycomp);
192da2e3ebdSchin 				/* eliminate overlapping regions */
193da2e3ebdSchin 				for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2)
194da2e3ebdSchin 				{
195da2e3ebdSchin 					if(lp[0] <= range)
196da2e3ebdSchin 					{
197da2e3ebdSchin 						if(lp[1]==HUGE)
198da2e3ebdSchin 						{
199da2e3ebdSchin 							dp[-1] = HUGE;
200da2e3ebdSchin 							break;
201da2e3ebdSchin 						}
202da2e3ebdSchin 						if((c = lp[0]+lp[1]-range)>0)
203da2e3ebdSchin 						{
204da2e3ebdSchin 							range += c;
205da2e3ebdSchin 							dp[-1] += c;
206da2e3ebdSchin 						}
207da2e3ebdSchin 					}
208da2e3ebdSchin 					else
209da2e3ebdSchin 					{
210da2e3ebdSchin 						range = *dp++ = lp[0];
211da2e3ebdSchin 						if(lp[1]==HUGE)
212da2e3ebdSchin 						{
213da2e3ebdSchin 							*dp++ = HUGE;
214da2e3ebdSchin 							break;
215da2e3ebdSchin 						}
216da2e3ebdSchin 						range += (*dp++ = lp[1]);
217da2e3ebdSchin 					}
218da2e3ebdSchin 				}
219da2e3ebdSchin 				*dp = HUGE;
220*3e14f97fSRoger A. Faulkner 				lp = cut->list;
221da2e3ebdSchin 				/* convert ranges into gaps */
222da2e3ebdSchin 				for(n=0; *lp!=HUGE; lp+=2)
223da2e3ebdSchin 				{
224da2e3ebdSchin 					c = *lp;
225da2e3ebdSchin 					*lp -= n;
226da2e3ebdSchin 					n = c+lp[1];
227da2e3ebdSchin 				}
228*3e14f97fSRoger A. Faulkner 				return cut;
229da2e3ebdSchin 			}
230da2e3ebdSchin 			n = range = 0;
231da2e3ebdSchin 			break;
232da2e3ebdSchin 
233da2e3ebdSchin 		case '-':
234da2e3ebdSchin 			if(range)
235da2e3ebdSchin 				error(ERROR_exit(1),"bad list for c/f option");
236da2e3ebdSchin 			range = n?n:1;
237da2e3ebdSchin 			n = 0;
238da2e3ebdSchin 			break;
239da2e3ebdSchin 
240da2e3ebdSchin 		default:
241da2e3ebdSchin 			if(!isdigit(c))
242da2e3ebdSchin 				error(ERROR_exit(1),"bad list for c/f option");
243da2e3ebdSchin 			n = 10*n + (c-'0');
244*3e14f97fSRoger A. Faulkner 			break;
245da2e3ebdSchin 		}
246da2e3ebdSchin 	/* NOTREACHED */
247da2e3ebdSchin }
248da2e3ebdSchin 
249da2e3ebdSchin /*
250da2e3ebdSchin  * cut each line of file <fdin> and put results to <fdout> using list <list>
251da2e3ebdSchin  */
252da2e3ebdSchin 
253*3e14f97fSRoger A. Faulkner static void
cutcols(Cut_t * cut,Sfio_t * fdin,Sfio_t * fdout)254*3e14f97fSRoger A. Faulkner cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
255da2e3ebdSchin {
256*3e14f97fSRoger A. Faulkner 	register int		c;
257*3e14f97fSRoger A. Faulkner 	register int		len;
258*3e14f97fSRoger A. Faulkner 	register int		ncol = 0;
259*3e14f97fSRoger A. Faulkner 	register const int*	lp = cut->list;
260*3e14f97fSRoger A. Faulkner 	register char*		bp;
261da2e3ebdSchin 	register int		skip; /* non-zero for don't copy */
262*3e14f97fSRoger A. Faulkner 	int			must;
263*3e14f97fSRoger A. Faulkner 	char*			ep;
264*3e14f97fSRoger A. Faulkner 	const char*		xx;
265*3e14f97fSRoger A. Faulkner 
266*3e14f97fSRoger A. Faulkner 	for (;;)
267da2e3ebdSchin 	{
268*3e14f97fSRoger A. Faulkner 		if (len = cut->reclen)
269*3e14f97fSRoger A. Faulkner 			bp = sfreserve(fdin, len, -1);
270da2e3ebdSchin 		else
271*3e14f97fSRoger A. Faulkner 			bp = sfgetr(fdin, '\n', 0);
272*3e14f97fSRoger A. Faulkner 		if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR)))
273da2e3ebdSchin 			break;
274da2e3ebdSchin 		len = sfvalue(fdin);
275*3e14f97fSRoger A. Faulkner 		ep = bp + len;
276*3e14f97fSRoger A. Faulkner 		xx = 0;
277*3e14f97fSRoger A. Faulkner 		if (!(ncol = skip  = *(lp = cut->list)))
278da2e3ebdSchin 			ncol = *++lp;
279*3e14f97fSRoger A. Faulkner 		must = 1;
280*3e14f97fSRoger A. Faulkner 		do
281da2e3ebdSchin 		{
282*3e14f97fSRoger A. Faulkner 			if (cut->nosplit)
283*3e14f97fSRoger A. Faulkner 			{
284*3e14f97fSRoger A. Faulkner 				register const char*	s = bp;
285*3e14f97fSRoger A. Faulkner 				register int		w = len < ncol ? len : ncol;
286*3e14f97fSRoger A. Faulkner 				register int		z;
287*3e14f97fSRoger A. Faulkner 
288*3e14f97fSRoger A. Faulkner 				while (w > 0)
289*3e14f97fSRoger A. Faulkner 				{
290*3e14f97fSRoger A. Faulkner 					if (!(*s & 0x80))
291*3e14f97fSRoger A. Faulkner 						z = 1;
292*3e14f97fSRoger A. Faulkner 					else if ((z = mblen(s, w)) <= 0)
293*3e14f97fSRoger A. Faulkner 					{
294*3e14f97fSRoger A. Faulkner 						if (s == bp && xx)
295*3e14f97fSRoger A. Faulkner 						{
296*3e14f97fSRoger A. Faulkner 							w += s - xx;
297*3e14f97fSRoger A. Faulkner 							bp = (char*)(s = xx);
298*3e14f97fSRoger A. Faulkner 							xx = 0;
299*3e14f97fSRoger A. Faulkner 							continue;
300*3e14f97fSRoger A. Faulkner 						}
301*3e14f97fSRoger A. Faulkner 						xx = s;
302*3e14f97fSRoger A. Faulkner 						if (skip)
303*3e14f97fSRoger A. Faulkner 							s += w;
304*3e14f97fSRoger A. Faulkner 						w = 0;
305*3e14f97fSRoger A. Faulkner 						break;
306*3e14f97fSRoger A. Faulkner 					}
307*3e14f97fSRoger A. Faulkner 					s += z;
308*3e14f97fSRoger A. Faulkner 					w -= z;
309*3e14f97fSRoger A. Faulkner 				}
310*3e14f97fSRoger A. Faulkner 				c = s - bp;
311*3e14f97fSRoger A. Faulkner 				ncol = !w && ncol >= len;
312*3e14f97fSRoger A. Faulkner 			}
313*3e14f97fSRoger A. Faulkner 			else if (cut->cflag)
314*3e14f97fSRoger A. Faulkner 			{
315*3e14f97fSRoger A. Faulkner 				register const char*	s = bp;
316*3e14f97fSRoger A. Faulkner 				register int		w = len;
317*3e14f97fSRoger A. Faulkner 				register int		z;
318*3e14f97fSRoger A. Faulkner 
319*3e14f97fSRoger A. Faulkner 				while (w > 0 && ncol > 0)
320*3e14f97fSRoger A. Faulkner 				{
321*3e14f97fSRoger A. Faulkner 					ncol--;
322*3e14f97fSRoger A. Faulkner 					if (!(*s & 0x80) || (z = mblen(s, w)) <= 0)
323*3e14f97fSRoger A. Faulkner 						z = 1;
324*3e14f97fSRoger A. Faulkner 					s += z;
325*3e14f97fSRoger A. Faulkner 					w -= z;
326*3e14f97fSRoger A. Faulkner 
327*3e14f97fSRoger A. Faulkner 				}
328*3e14f97fSRoger A. Faulkner 				c = s - bp;
329*3e14f97fSRoger A. Faulkner 				ncol = !w && (ncol || !skip);
330*3e14f97fSRoger A. Faulkner 			}
331*3e14f97fSRoger A. Faulkner 			else
332*3e14f97fSRoger A. Faulkner 			{
333*3e14f97fSRoger A. Faulkner 				if ((c = ncol) > len)
334da2e3ebdSchin 					c = len;
335da2e3ebdSchin 				else if (c == len && !skip)
336da2e3ebdSchin 					ncol++;
337da2e3ebdSchin 				ncol -= c;
338*3e14f97fSRoger A. Faulkner 			}
339*3e14f97fSRoger A. Faulkner 			if (!skip && c)
340*3e14f97fSRoger A. Faulkner 			{
341*3e14f97fSRoger A. Faulkner 				if (sfwrite(fdout, (char*)bp, c) < 0)
3427c2fbfb3SApril Chin 					return;
343*3e14f97fSRoger A. Faulkner 				must = 0;
344*3e14f97fSRoger A. Faulkner 			}
345*3e14f97fSRoger A. Faulkner 			bp += c;
346da2e3ebdSchin 			if (ncol)
347da2e3ebdSchin 				break;
348da2e3ebdSchin 			len -= c;
349da2e3ebdSchin 			ncol = *++lp;
350da2e3ebdSchin 			skip = !skip;
351*3e14f97fSRoger A. Faulkner 		} while (ncol != HUGE);
352*3e14f97fSRoger A. Faulkner 		if (!cut->nlflag && (skip || must || cut->reclen))
353*3e14f97fSRoger A. Faulkner 		{
354*3e14f97fSRoger A. Faulkner 			if (cut->ldelim.len > 1)
355*3e14f97fSRoger A. Faulkner 				sfwrite(fdout, cut->ldelim.str, cut->ldelim.len);
356*3e14f97fSRoger A. Faulkner 			else
357*3e14f97fSRoger A. Faulkner 				sfputc(fdout, cut->ldelim.chr);
358da2e3ebdSchin 		}
359da2e3ebdSchin 	}
360da2e3ebdSchin }
361da2e3ebdSchin 
362da2e3ebdSchin /*
363da2e3ebdSchin  * cut each line of file <fdin> and put results to <fdout> using list <list>
364da2e3ebdSchin  * stream <fdin> must be line buffered
365da2e3ebdSchin  */
366da2e3ebdSchin 
367*3e14f97fSRoger A. Faulkner static void
cutfields(Cut_t * cut,Sfio_t * fdin,Sfio_t * fdout)368*3e14f97fSRoger A. Faulkner cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
369da2e3ebdSchin {
370*3e14f97fSRoger A. Faulkner 	register unsigned char *sp = cut->space;
371da2e3ebdSchin 	register unsigned char *cp;
372*3e14f97fSRoger A. Faulkner 	register unsigned char *wp;
373da2e3ebdSchin 	register int c, nfields;
374*3e14f97fSRoger A. Faulkner 	register const int *lp = cut->list;
375da2e3ebdSchin 	register unsigned char *copy;
376da2e3ebdSchin 	register int nodelim, empty, inword=0;
377*3e14f97fSRoger A. Faulkner 	register unsigned char *ep;
378*3e14f97fSRoger A. Faulkner 	unsigned char *bp, *first;
379da2e3ebdSchin 	int lastchar;
380*3e14f97fSRoger A. Faulkner 	wchar_t w;
381da2e3ebdSchin 	Sfio_t *fdtmp = 0;
382da2e3ebdSchin 	long offset = 0;
383*3e14f97fSRoger A. Faulkner 	unsigned char mb[8];
384da2e3ebdSchin 	/* process each buffer */
385*3e14f97fSRoger A. Faulkner 	while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0)
386da2e3ebdSchin 	{
387*3e14f97fSRoger A. Faulkner 		cp = bp;
388*3e14f97fSRoger A. Faulkner 		ep = cp + --c;
389*3e14f97fSRoger A. Faulkner 		if((lastchar = cp[c]) != cut->eob)
390*3e14f97fSRoger A. Faulkner 			*ep = cut->eob;
391da2e3ebdSchin 		/* process each line in the buffer */
392*3e14f97fSRoger A. Faulkner 		while (cp <= ep)
393da2e3ebdSchin 		{
394da2e3ebdSchin 			first = cp;
395da2e3ebdSchin 			if (!inword)
396da2e3ebdSchin 			{
397da2e3ebdSchin 				nodelim = empty = 1;
398da2e3ebdSchin 				copy = cp;
399*3e14f97fSRoger A. Faulkner 				if (nfields = *(lp = cut->list))
400da2e3ebdSchin 					copy = 0;
401da2e3ebdSchin 				else
402da2e3ebdSchin 					nfields = *++lp;
403da2e3ebdSchin 			}
404da2e3ebdSchin 			else if (copy)
405da2e3ebdSchin 				copy = cp;
406da2e3ebdSchin 			inword = 0;
407*3e14f97fSRoger A. Faulkner 			do
408da2e3ebdSchin 			{
409da2e3ebdSchin 				/* skip over non-delimiter characters */
410*3e14f97fSRoger A. Faulkner 				if (cut->mb)
411*3e14f97fSRoger A. Faulkner 					for (;;)
412da2e3ebdSchin 					{
413*3e14f97fSRoger A. Faulkner 						switch (c = sp[*(unsigned char*)cp++])
414*3e14f97fSRoger A. Faulkner 						{
415*3e14f97fSRoger A. Faulkner 						case 0:
416*3e14f97fSRoger A. Faulkner 							continue;
417*3e14f97fSRoger A. Faulkner 						case SP_WIDE:
418*3e14f97fSRoger A. Faulkner 							wp = --cp;
419*3e14f97fSRoger A. Faulkner 							while ((c = mb2wc(w, cp, ep - cp)) <= 0)
420*3e14f97fSRoger A. Faulkner 							{
421*3e14f97fSRoger A. Faulkner 								/* mb char possibly spanning buffer boundary -- fun stuff */
422*3e14f97fSRoger A. Faulkner 								if ((ep - cp) < mbmax())
423*3e14f97fSRoger A. Faulkner 								{
424*3e14f97fSRoger A. Faulkner 									int	i;
425*3e14f97fSRoger A. Faulkner 									int	j;
426*3e14f97fSRoger A. Faulkner 									int	k;
427*3e14f97fSRoger A. Faulkner 
428*3e14f97fSRoger A. Faulkner 									if (lastchar != cut->eob)
429*3e14f97fSRoger A. Faulkner 									{
430*3e14f97fSRoger A. Faulkner 										*ep = lastchar;
431*3e14f97fSRoger A. Faulkner 										if ((c = mb2wc(w, cp, ep - cp)) > 0)
432da2e3ebdSchin 											break;
433*3e14f97fSRoger A. Faulkner 									}
434*3e14f97fSRoger A. Faulkner 									if (copy)
435*3e14f97fSRoger A. Faulkner 									{
436*3e14f97fSRoger A. Faulkner 										empty = 0;
437*3e14f97fSRoger A. Faulkner 										if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
438*3e14f97fSRoger A. Faulkner 											goto failed;
439*3e14f97fSRoger A. Faulkner 									}
440*3e14f97fSRoger A. Faulkner 									for (i = 0; i <= (ep - cp); i++)
441*3e14f97fSRoger A. Faulkner 										mb[i] = cp[i];
442*3e14f97fSRoger A. Faulkner 									if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0)
443*3e14f97fSRoger A. Faulkner 										goto failed;
444*3e14f97fSRoger A. Faulkner 									cp = bp;
445*3e14f97fSRoger A. Faulkner 									ep = cp + --c;
446*3e14f97fSRoger A. Faulkner 									if ((lastchar = cp[c]) != cut->eob)
447*3e14f97fSRoger A. Faulkner 										*ep = cut->eob;
448*3e14f97fSRoger A. Faulkner 									j = i;
449*3e14f97fSRoger A. Faulkner 									k = 0;
450*3e14f97fSRoger A. Faulkner 									while (j < mbmax())
451*3e14f97fSRoger A. Faulkner 										mb[j++] = cp[k++];
452*3e14f97fSRoger A. Faulkner 									if ((c = mb2wc(w, (char*)mb, j)) <= 0)
453*3e14f97fSRoger A. Faulkner 									{
454*3e14f97fSRoger A. Faulkner 										c = i;
455*3e14f97fSRoger A. Faulkner 										w = 0;
456*3e14f97fSRoger A. Faulkner 									}
457*3e14f97fSRoger A. Faulkner 									first = bp = cp += c - i;
458*3e14f97fSRoger A. Faulkner 									if (copy)
459*3e14f97fSRoger A. Faulkner 									{
460*3e14f97fSRoger A. Faulkner 										copy = bp;
461*3e14f97fSRoger A. Faulkner 										if (w == cut->ldelim.chr)
462*3e14f97fSRoger A. Faulkner 											lastchar = cut->ldelim.chr;
463*3e14f97fSRoger A. Faulkner 										else if (w != cut->wdelim.chr)
464*3e14f97fSRoger A. Faulkner 										{
465*3e14f97fSRoger A. Faulkner 											empty = 0;
466*3e14f97fSRoger A. Faulkner 											if (sfwrite(fdout, (char*)mb, c) < 0)
467*3e14f97fSRoger A. Faulkner 												goto failed;
468*3e14f97fSRoger A. Faulkner 										}
469*3e14f97fSRoger A. Faulkner 									}
470*3e14f97fSRoger A. Faulkner 									c = 0;
471*3e14f97fSRoger A. Faulkner 								}
472*3e14f97fSRoger A. Faulkner 								else
473*3e14f97fSRoger A. Faulkner 								{
474*3e14f97fSRoger A. Faulkner 									w = *cp;
475*3e14f97fSRoger A. Faulkner 									c = 1;
476*3e14f97fSRoger A. Faulkner 								}
477da2e3ebdSchin 								break;
478*3e14f97fSRoger A. Faulkner 							}
479*3e14f97fSRoger A. Faulkner 							cp += c;
480*3e14f97fSRoger A. Faulkner 							c = w;
481*3e14f97fSRoger A. Faulkner 							if (c == cut->wdelim.chr)
482*3e14f97fSRoger A. Faulkner 							{
483*3e14f97fSRoger A. Faulkner 								c = SP_WORD;
484*3e14f97fSRoger A. Faulkner 								break;
485*3e14f97fSRoger A. Faulkner 							}
486*3e14f97fSRoger A. Faulkner 							if (c == cut->ldelim.chr)
487*3e14f97fSRoger A. Faulkner 							{
488*3e14f97fSRoger A. Faulkner 								c = SP_LINE;
489*3e14f97fSRoger A. Faulkner 								break;
490*3e14f97fSRoger A. Faulkner 							}
491*3e14f97fSRoger A. Faulkner 							continue;
492*3e14f97fSRoger A. Faulkner 						default:
493*3e14f97fSRoger A. Faulkner 							wp = cp - 1;
494*3e14f97fSRoger A. Faulkner 							break;
495*3e14f97fSRoger A. Faulkner 						}
496*3e14f97fSRoger A. Faulkner 						break;
497*3e14f97fSRoger A. Faulkner 					}
498*3e14f97fSRoger A. Faulkner 				else
499*3e14f97fSRoger A. Faulkner 				{
500*3e14f97fSRoger A. Faulkner 					while (!(c = sp[*cp++]));
501*3e14f97fSRoger A. Faulkner 					wp = cp - 1;
502*3e14f97fSRoger A. Faulkner 				}
503*3e14f97fSRoger A. Faulkner 				/* check for end-of-line */
504*3e14f97fSRoger A. Faulkner 				if (c == SP_LINE)
505*3e14f97fSRoger A. Faulkner 				{
506*3e14f97fSRoger A. Faulkner 					if (cp <= ep)
507*3e14f97fSRoger A. Faulkner 						break;
508*3e14f97fSRoger A. Faulkner 					if (lastchar == cut->ldelim.chr)
509*3e14f97fSRoger A. Faulkner 						break;
510*3e14f97fSRoger A. Faulkner 					/* restore cut->last character */
511*3e14f97fSRoger A. Faulkner 					if (lastchar != cut->eob)
512*3e14f97fSRoger A. Faulkner 						*ep = lastchar;
513da2e3ebdSchin 					inword++;
514*3e14f97fSRoger A. Faulkner 					if (!sp[lastchar])
515da2e3ebdSchin 						break;
516da2e3ebdSchin 				}
517da2e3ebdSchin 				nodelim = 0;
518da2e3ebdSchin 				if (--nfields > 0)
519da2e3ebdSchin 					continue;
520da2e3ebdSchin 				nfields = *++lp;
521da2e3ebdSchin 				if (copy)
522da2e3ebdSchin 				{
523da2e3ebdSchin 					empty = 0;
524*3e14f97fSRoger A. Faulkner 					if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
525da2e3ebdSchin 						goto failed;
526da2e3ebdSchin 					copy = 0;
527da2e3ebdSchin 				}
528da2e3ebdSchin 				else
529da2e3ebdSchin 					/* set to delimiter unless the first field */
530*3e14f97fSRoger A. Faulkner 					copy = empty ? cp : wp;
531*3e14f97fSRoger A. Faulkner 			} while (!inword);
532da2e3ebdSchin 			if (!inword)
533da2e3ebdSchin 			{
534da2e3ebdSchin 				if (!copy)
535da2e3ebdSchin 				{
536da2e3ebdSchin 					if (nodelim)
537da2e3ebdSchin 					{
538*3e14f97fSRoger A. Faulkner 						if (!cut->sflag)
539da2e3ebdSchin 						{
540da2e3ebdSchin 							if (offset)
541da2e3ebdSchin 							{
542da2e3ebdSchin 								sfseek(fdtmp,(Sfoff_t)0,SEEK_SET);
543da2e3ebdSchin 								sfmove(fdtmp,fdout,offset,-1);
544da2e3ebdSchin 							}
545da2e3ebdSchin 							copy = first;
546da2e3ebdSchin 						}
547da2e3ebdSchin 					}
548da2e3ebdSchin 					else
549da2e3ebdSchin 						sfputc(fdout,'\n');
550da2e3ebdSchin 				}
551da2e3ebdSchin 				if (offset)
552da2e3ebdSchin 					sfseek(fdtmp,offset=0,SEEK_SET);
553da2e3ebdSchin 			}
554*3e14f97fSRoger A. Faulkner 			if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0)
555da2e3ebdSchin 				goto failed;
556da2e3ebdSchin 		}
557da2e3ebdSchin 		/* see whether to save in tmp file */
558*3e14f97fSRoger A. Faulkner 		if(inword && nodelim && !cut->sflag && (c=cp-first)>0)
559da2e3ebdSchin 		{
560da2e3ebdSchin 			/* copy line to tmpfile in case no fields */
561da2e3ebdSchin 			if(!fdtmp)
562da2e3ebdSchin 				fdtmp = sftmp(BLOCK);
563da2e3ebdSchin 			sfwrite(fdtmp,(char*)first,c);
564da2e3ebdSchin 			offset +=c;
565da2e3ebdSchin 		}
566da2e3ebdSchin 	}
567da2e3ebdSchin  failed:
568da2e3ebdSchin 	if(fdtmp)
569da2e3ebdSchin 		sfclose(fdtmp);
570da2e3ebdSchin }
571da2e3ebdSchin 
572da2e3ebdSchin int
b_cut(int argc,char ** argv,void * context)573*3e14f97fSRoger A. Faulkner b_cut(int argc, char** argv, void* context)
574da2e3ebdSchin {
575da2e3ebdSchin 	register char*		cp = 0;
576da2e3ebdSchin 	register Sfio_t*	fp;
577*3e14f97fSRoger A. Faulkner 	char*			s;
578da2e3ebdSchin 	int			n;
579*3e14f97fSRoger A. Faulkner 	Cut_t*			cut;
580da2e3ebdSchin 	int			mode = 0;
581*3e14f97fSRoger A. Faulkner 	Delim_t			wdelim;
582*3e14f97fSRoger A. Faulkner 	Delim_t			ldelim;
583da2e3ebdSchin 	size_t			reclen = 0;
584da2e3ebdSchin 
585da2e3ebdSchin 	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
586*3e14f97fSRoger A. Faulkner 	wdelim.chr = '\t';
587*3e14f97fSRoger A. Faulkner 	ldelim.chr = '\n';
588*3e14f97fSRoger A. Faulkner 	wdelim.len = ldelim.len = 1;
589*3e14f97fSRoger A. Faulkner 	for (;;)
590da2e3ebdSchin 	{
591*3e14f97fSRoger A. Faulkner 		switch (n = optget(argv, usage))
592*3e14f97fSRoger A. Faulkner 		{
593*3e14f97fSRoger A. Faulkner 		case 0:
594*3e14f97fSRoger A. Faulkner 			break;
595da2e3ebdSchin 		case 'b':
596da2e3ebdSchin 		case 'c':
597da2e3ebdSchin 			if(mode&C_FIELDS)
598da2e3ebdSchin 			{
599da2e3ebdSchin 				error(2, "f option already specified");
600*3e14f97fSRoger A. Faulkner 				continue;
601da2e3ebdSchin 			}
602da2e3ebdSchin 			cp = opt_info.arg;
603da2e3ebdSchin 			if(n=='b')
604da2e3ebdSchin 				mode |= C_BYTES;
605da2e3ebdSchin 			else
606da2e3ebdSchin 				mode |= C_CHARS;
607*3e14f97fSRoger A. Faulkner 			continue;
608da2e3ebdSchin 		case 'D':
609*3e14f97fSRoger A. Faulkner 			ldelim.str = opt_info.arg;
610*3e14f97fSRoger A. Faulkner 			if (mbwide())
611*3e14f97fSRoger A. Faulkner 			{
612*3e14f97fSRoger A. Faulkner 				s = opt_info.arg;
613*3e14f97fSRoger A. Faulkner 				ldelim.chr = mbchar(s);
614*3e14f97fSRoger A. Faulkner 				if ((n = s - opt_info.arg) > 1)
615*3e14f97fSRoger A. Faulkner 				{
616*3e14f97fSRoger A. Faulkner 					ldelim.len = n;
617*3e14f97fSRoger A. Faulkner 					continue;
618*3e14f97fSRoger A. Faulkner 				}
619*3e14f97fSRoger A. Faulkner 			}
620*3e14f97fSRoger A. Faulkner 			ldelim.chr = *(unsigned char*)opt_info.arg;
621*3e14f97fSRoger A. Faulkner 			ldelim.len = 1;
622*3e14f97fSRoger A. Faulkner 			continue;
623da2e3ebdSchin 		case 'd':
624*3e14f97fSRoger A. Faulkner 			wdelim.str = opt_info.arg;
625*3e14f97fSRoger A. Faulkner 			if (mbwide())
626*3e14f97fSRoger A. Faulkner 			{
627*3e14f97fSRoger A. Faulkner 				s = opt_info.arg;
628*3e14f97fSRoger A. Faulkner 				wdelim.chr = mbchar(s);
629*3e14f97fSRoger A. Faulkner 				if ((n = s - opt_info.arg) > 1)
630*3e14f97fSRoger A. Faulkner 				{
631*3e14f97fSRoger A. Faulkner 					wdelim.len = n;
632*3e14f97fSRoger A. Faulkner 					continue;
633*3e14f97fSRoger A. Faulkner 				}
634*3e14f97fSRoger A. Faulkner 			}
635*3e14f97fSRoger A. Faulkner 			wdelim.chr = *(unsigned char*)opt_info.arg;
636*3e14f97fSRoger A. Faulkner 			wdelim.len = 1;
637*3e14f97fSRoger A. Faulkner 			continue;
638da2e3ebdSchin 		case 'f':
639da2e3ebdSchin 			if(mode&(C_CHARS|C_BYTES))
640da2e3ebdSchin 			{
641da2e3ebdSchin 				error(2, "c option already specified");
642*3e14f97fSRoger A. Faulkner 				continue;
643da2e3ebdSchin 			}
644da2e3ebdSchin 			cp = opt_info.arg;
645da2e3ebdSchin 			mode |= C_FIELDS;
646*3e14f97fSRoger A. Faulkner 			continue;
647da2e3ebdSchin 		case 'n':
648*3e14f97fSRoger A. Faulkner 			mode |= C_NOSPLIT;
649*3e14f97fSRoger A. Faulkner 			continue;
650da2e3ebdSchin 		case 'N':
651da2e3ebdSchin 			mode |= C_NONEWLINE;
652*3e14f97fSRoger A. Faulkner 			continue;
653da2e3ebdSchin 		case 'R':
654da2e3ebdSchin 		case 'r':
655da2e3ebdSchin 			if(opt_info.num>0)
656da2e3ebdSchin 				reclen = opt_info.num;
657*3e14f97fSRoger A. Faulkner 			continue;
658da2e3ebdSchin 		case 's':
659da2e3ebdSchin 			mode |= C_SUPRESS;
660*3e14f97fSRoger A. Faulkner 			continue;
661da2e3ebdSchin 		case ':':
662da2e3ebdSchin 			error(2, "%s", opt_info.arg);
663da2e3ebdSchin 			break;
664da2e3ebdSchin 		case '?':
665da2e3ebdSchin 			error(ERROR_usage(2), "%s", opt_info.arg);
666da2e3ebdSchin 			break;
667da2e3ebdSchin 		}
668*3e14f97fSRoger A. Faulkner 		break;
669*3e14f97fSRoger A. Faulkner 	}
670da2e3ebdSchin 	argv += opt_info.index;
671da2e3ebdSchin 	if (error_info.errors)
672da2e3ebdSchin 		error(ERROR_usage(2), "%s",optusage(NiL));
673da2e3ebdSchin 	if(!cp)
674da2e3ebdSchin 	{
675da2e3ebdSchin 		error(2, "b, c or f option must be specified");
676da2e3ebdSchin 		error(ERROR_usage(2), "%s", optusage(NiL));
677da2e3ebdSchin 	}
678da2e3ebdSchin 	if(!*cp)
679da2e3ebdSchin 		error(3, "non-empty b, c or f option must be specified");
680da2e3ebdSchin 	if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS)
681da2e3ebdSchin 		error(3, "s option requires f option");
682*3e14f97fSRoger A. Faulkner 	cut = cutinit(mode, cp, &wdelim, &ldelim, reclen);
683da2e3ebdSchin 	if(cp = *argv)
684da2e3ebdSchin 		argv++;
685da2e3ebdSchin 	do
686da2e3ebdSchin 	{
687da2e3ebdSchin 		if(!cp || streq(cp,"-"))
688da2e3ebdSchin 			fp = sfstdin;
689da2e3ebdSchin 		else if(!(fp = sfopen(NiL,cp,"r")))
690da2e3ebdSchin 		{
691da2e3ebdSchin 			error(ERROR_system(0),"%s: cannot open",cp);
692da2e3ebdSchin 			continue;
693da2e3ebdSchin 		}
694da2e3ebdSchin 		if(mode&C_FIELDS)
695*3e14f97fSRoger A. Faulkner 			cutfields(cut,fp,sfstdout);
696da2e3ebdSchin 		else
697*3e14f97fSRoger A. Faulkner 			cutcols(cut,fp,sfstdout);
698da2e3ebdSchin 		if(fp!=sfstdin)
699da2e3ebdSchin 			sfclose(fp);
7007c2fbfb3SApril Chin 	} while(cp = *argv++);
7017c2fbfb3SApril Chin 	if (sfsync(sfstdout))
7027c2fbfb3SApril Chin 		error(ERROR_system(0), "write error");
703*3e14f97fSRoger A. Faulkner 	return error_info.errors != 0;
704da2e3ebdSchin }
705