xref: /freebsd/usr.bin/cut/cut.c (revision e4e9813eb92cd7c4d4b819a8fbed5cbd3d92f5d8)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Adam S. Moskowitz of Menlo Consulting and Marciano Pitargue.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 #ifndef lint
38 static const char copyright[] =
39 "@(#) Copyright (c) 1989, 1993\n\
40 	The Regents of the University of California.  All rights reserved.\n";
41 static const char sccsid[] = "@(#)cut.c	8.3 (Berkeley) 5/4/95";
42 #endif /* not lint */
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 #include <ctype.h>
47 #include <err.h>
48 #include <errno.h>
49 #include <limits.h>
50 #include <locale.h>
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <unistd.h>
55 #include <wchar.h>
56 
/* Option flags; each is set to 1 by main() when the option is given. */
int	bflag;
int	cflag;
int	dflag;
int	fflag;
int	nflag;
int	sflag;

/*
 * Field delimiter used by f_cut(): dchar is the wide-character form that
 * input is compared against, dcharmb is the multibyte spelling written
 * between output fields.
 */
wchar_t	dchar;
char	dcharmb[MB_LEN_MAX + 1];

/*
 * Selection state filled in by get_list():
 *   autostart	largest N seen in a "-N" list item (0 if none)
 *   autostop	smallest N seen in an "N-" list item (0 if none)
 *   maxval	highest position the cutters look up in positions[]
 *   positions	byte map, indexed from 1; non-zero means "selected"
 */
size_t	autostart;
size_t	autostop;
size_t	maxval;
char	*positions;

int	b_cut(FILE *, const char *);
int	b_n_cut(FILE *, const char *);
int	c_cut(FILE *, const char *);
int	f_cut(FILE *, const char *);
void	get_list(char *);
void	needpos(size_t);
static void	usage(void);
76 
77 int
78 main(int argc, char *argv[])
79 {
80 	FILE *fp;
81 	int (*fcn)(FILE *, const char *);
82 	int ch, rval;
83 	size_t n;
84 
85 	setlocale(LC_ALL, "");
86 
87 	fcn = NULL;
88 	dchar = '\t';			/* default delimiter is \t */
89 	strcpy(dcharmb, "\t");
90 
91 	while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1)
92 		switch(ch) {
93 		case 'b':
94 			get_list(optarg);
95 			bflag = 1;
96 			break;
97 		case 'c':
98 			get_list(optarg);
99 			cflag = 1;
100 			break;
101 		case 'd':
102 			n = mbrtowc(&dchar, optarg, MB_LEN_MAX, NULL);
103 			if (dchar == '\0' || n != strlen(optarg))
104 				errx(1, "bad delimiter");
105 			strcpy(dcharmb, optarg);
106 			dflag = 1;
107 			break;
108 		case 'f':
109 			get_list(optarg);
110 			fflag = 1;
111 			break;
112 		case 's':
113 			sflag = 1;
114 			break;
115 		case 'n':
116 			nflag = 1;
117 			break;
118 		case '?':
119 		default:
120 			usage();
121 		}
122 	argc -= optind;
123 	argv += optind;
124 
125 	if (fflag) {
126 		if (bflag || cflag || nflag)
127 			usage();
128 	} else if (!(bflag || cflag) || dflag || sflag)
129 		usage();
130 	else if (!bflag && nflag)
131 		usage();
132 
133 	if (fflag)
134 		fcn = f_cut;
135 	else if (cflag)
136 		fcn = MB_CUR_MAX > 1 ? c_cut : b_cut;
137 	else if (bflag)
138 		fcn = nflag && MB_CUR_MAX > 1 ? b_n_cut : b_cut;
139 
140 	rval = 0;
141 	if (*argv)
142 		for (; *argv; ++argv) {
143 			if (strcmp(*argv, "-") == 0)
144 				rval |= fcn(stdin, "stdin");
145 			else {
146 				if (!(fp = fopen(*argv, "r"))) {
147 					warn("%s", *argv);
148 					rval = 1;
149 					continue;
150 				}
151 				fcn(fp, *argv);
152 				(void)fclose(fp);
153 			}
154 		}
155 	else
156 		rval = fcn(stdin, "stdin");
157 	exit(rval);
158 }
159 
160 void
161 get_list(char *list)
162 {
163 	size_t setautostart, start, stop;
164 	char *pos;
165 	char *p;
166 
167 	/*
168 	 * set a byte in the positions array to indicate if a field or
169 	 * column is to be selected; use +1, it's 1-based, not 0-based.
170 	 * Numbers and number ranges may be overlapping, repeated, and in
171 	 * any order. We handle "-3-5" although there's no real reason too.
172 	 */
173 	for (; (p = strsep(&list, ", \t")) != NULL;) {
174 		setautostart = start = stop = 0;
175 		if (*p == '-') {
176 			++p;
177 			setautostart = 1;
178 		}
179 		if (isdigit((unsigned char)*p)) {
180 			start = stop = strtol(p, &p, 10);
181 			if (setautostart && start > autostart)
182 				autostart = start;
183 		}
184 		if (*p == '-') {
185 			if (isdigit((unsigned char)p[1]))
186 				stop = strtol(p + 1, &p, 10);
187 			if (*p == '-') {
188 				++p;
189 				if (!autostop || autostop > stop)
190 					autostop = stop;
191 			}
192 		}
193 		if (*p)
194 			errx(1, "[-cf] list: illegal list value");
195 		if (!stop || !start)
196 			errx(1, "[-cf] list: values may not include zero");
197 		if (maxval < stop) {
198 			maxval = stop;
199 			needpos(maxval + 1);
200 		}
201 		for (pos = positions + start; start++ <= stop; *pos++ = 1);
202 	}
203 
204 	/* overlapping ranges */
205 	if (autostop && maxval > autostop) {
206 		maxval = autostop;
207 		needpos(maxval + 1);
208 	}
209 
210 	/* set autostart */
211 	if (autostart)
212 		memset(positions + 1, '1', autostart);
213 }
214 
215 void
216 needpos(size_t n)
217 {
218 	static size_t npos;
219 	size_t oldnpos;
220 
221 	/* Grow the positions array to at least the specified size. */
222 	if (n > npos) {
223 		oldnpos = npos;
224 		if (npos == 0)
225 			npos = n;
226 		while (n > npos)
227 			npos *= 2;
228 		if ((positions = realloc(positions, npos)) == NULL)
229 			err(1, "realloc");
230 		memset((char *)positions + oldnpos, 0, npos - oldnpos);
231 	}
232 }
233 
234 int
235 b_cut(FILE *fp, const char *fname __unused)
236 {
237 	int ch, col;
238 	char *pos;
239 
240 	ch = 0;
241 	for (;;) {
242 		pos = positions + 1;
243 		for (col = maxval; col; --col) {
244 			if ((ch = getc(fp)) == EOF)
245 				return (0);
246 			if (ch == '\n')
247 				break;
248 			if (*pos++)
249 				(void)putchar(ch);
250 		}
251 		if (ch != '\n') {
252 			if (autostop)
253 				while ((ch = getc(fp)) != EOF && ch != '\n')
254 					(void)putchar(ch);
255 			else
256 				while ((ch = getc(fp)) != EOF && ch != '\n');
257 		}
258 		(void)putchar('\n');
259 	}
260 	return (0);
261 }
262 
263 /*
264  * Cut based on byte positions, taking care not to split multibyte characters.
265  * Although this function also handles the case where -n is not specified,
266  * b_cut() ought to be much faster.
267  */
268 int
269 b_n_cut(FILE *fp, const char *fname)
270 {
271 	size_t col, i, lbuflen;
272 	char *lbuf;
273 	int canwrite, clen, warned;
274 	mbstate_t mbs;
275 
276 	memset(&mbs, 0, sizeof(mbs));
277 	warned = 0;
278 	while ((lbuf = fgetln(fp, &lbuflen)) != NULL) {
279 		for (col = 0; lbuflen > 0; col += clen) {
280 			if ((clen = mbrlen(lbuf, lbuflen, &mbs)) < 0) {
281 				if (!warned) {
282 					warn("%s", fname);
283 					warned = 1;
284 				}
285 				memset(&mbs, 0, sizeof(mbs));
286 				clen = 1;
287 			}
288 			if (clen == 0 || *lbuf == '\n')
289 				break;
290 			if (col < maxval && !positions[1 + col]) {
291 				/*
292 				 * Print the character if (1) after an initial
293 				 * segment of un-selected bytes, the rest of
294 				 * it is selected, and (2) the last byte is
295 				 * selected.
296 				 */
297 				i = col;
298 				while (i < col + clen && i < maxval &&
299 				    !positions[1 + i])
300 					i++;
301 				canwrite = i < col + clen;
302 				for (; i < col + clen && i < maxval; i++)
303 					canwrite &= positions[1 + i];
304 				if (canwrite)
305 					fwrite(lbuf, 1, clen, stdout);
306 			} else {
307 				/*
308 				 * Print the character if all of it has
309 				 * been selected.
310 				 */
311 				canwrite = 1;
312 				for (i = col; i < col + clen; i++)
313 					if ((i >= maxval && !autostop) ||
314 					    (i < maxval && !positions[1 + i])) {
315 						canwrite = 0;
316 						break;
317 					}
318 				if (canwrite)
319 					fwrite(lbuf, 1, clen, stdout);
320 			}
321 			lbuf += clen;
322 			lbuflen -= clen;
323 		}
324 		if (lbuflen > 0)
325 			putchar('\n');
326 	}
327 	return (warned);
328 }
329 
330 int
331 c_cut(FILE *fp, const char *fname)
332 {
333 	wint_t ch;
334 	int col;
335 	char *pos;
336 
337 	ch = 0;
338 	for (;;) {
339 		pos = positions + 1;
340 		for (col = maxval; col; --col) {
341 			if ((ch = getwc(fp)) == WEOF)
342 				goto out;
343 			if (ch == '\n')
344 				break;
345 			if (*pos++)
346 				(void)putwchar(ch);
347 		}
348 		if (ch != '\n') {
349 			if (autostop)
350 				while ((ch = getwc(fp)) != WEOF && ch != '\n')
351 					(void)putwchar(ch);
352 			else
353 				while ((ch = getwc(fp)) != WEOF && ch != '\n');
354 		}
355 		(void)putwchar('\n');
356 	}
357 out:
358 	if (ferror(fp)) {
359 		warn("%s", fname);
360 		return (1);
361 	}
362 	return (0);
363 }
364 
365 int
366 f_cut(FILE *fp, const char *fname)
367 {
368 	wchar_t ch;
369 	int field, i, isdelim;
370 	char *pos, *p;
371 	wchar_t sep;
372 	int output;
373 	char *lbuf, *mlbuf;
374 	size_t clen, lbuflen, reallen;
375 
376 	mlbuf = NULL;
377 	for (sep = dchar; (lbuf = fgetln(fp, &lbuflen)) != NULL;) {
378 		reallen = lbuflen;
379 		/* Assert EOL has a newline. */
380 		if (*(lbuf + lbuflen - 1) != '\n') {
381 			/* Can't have > 1 line with no trailing newline. */
382 			mlbuf = malloc(lbuflen + 1);
383 			if (mlbuf == NULL)
384 				err(1, "malloc");
385 			memcpy(mlbuf, lbuf, lbuflen);
386 			*(mlbuf + lbuflen) = '\n';
387 			lbuf = mlbuf;
388 			reallen++;
389 		}
390 		output = 0;
391 		for (isdelim = 0, p = lbuf;; p += clen) {
392 			clen = mbrtowc(&ch, p, lbuf + reallen - p, NULL);
393 			if (clen == (size_t)-1 || clen == (size_t)-2) {
394 				warnc(EILSEQ, "%s", fname);
395 				free(mlbuf);
396 				return (1);
397 			}
398 			if (clen == 0)
399 				clen = 1;
400 			/* this should work if newline is delimiter */
401 			if (ch == sep)
402 				isdelim = 1;
403 			if (ch == '\n') {
404 				if (!isdelim && !sflag)
405 					(void)fwrite(lbuf, lbuflen, 1, stdout);
406 				break;
407 			}
408 		}
409 		if (!isdelim)
410 			continue;
411 
412 		pos = positions + 1;
413 		for (field = maxval, p = lbuf; field; --field, ++pos) {
414 			if (*pos && output++)
415 				for (i = 0; dcharmb[i] != '\0'; i++)
416 					putchar(dcharmb[i]);
417 			for (;;) {
418 				clen = mbrtowc(&ch, p, lbuf + reallen - p,
419 				    NULL);
420 				if (clen == (size_t)-1 || clen == (size_t)-2) {
421 					warnc(EILSEQ, "%s", fname);
422 					free(mlbuf);
423 					return (1);
424 				}
425 				if (clen == 0)
426 					clen = 1;
427 				p += clen;
428 				if (ch == '\n' || ch == sep)
429 					break;
430 				if (*pos)
431 					for (i = 0; i < (int)clen; i++)
432 						putchar(p[i - clen]);
433 			}
434 			if (ch == '\n')
435 				break;
436 		}
437 		if (ch != '\n') {
438 			if (autostop) {
439 				if (output)
440 					for (i = 0; dcharmb[i] != '\0'; i++)
441 						putchar(dcharmb[i]);
442 				for (; (ch = *p) != '\n'; ++p)
443 					(void)putchar(ch);
444 			} else
445 				for (; (ch = *p) != '\n'; ++p);
446 		}
447 		(void)putchar('\n');
448 	}
449 	free(mlbuf);
450 	return (0);
451 }
452 
/*
 * Write the three command synopses to stderr and exit with status 1.
 * This function does not return.
 */
static void
usage(void)
{
	static const char byte_form[] =
	    "usage: cut -b list [-n] [file ...]";
	static const char char_form[] =
	    "       cut -c list [file ...]";
	static const char field_form[] =
	    "       cut -f list [-s] [-d delim] [file ...]";

	(void)fprintf(stderr, "%s\n%s\n%s\n",
	    byte_form, char_form, field_form);
	exit(1);
}
462