xref: /freebsd/usr.bin/split/split.c (revision 2a3e3873a1e4cd958f2b0f85d3b10cfa40575d30)
1 /*
2  * Copyright (c) 1987, 1993, 1994
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #ifndef lint
34 static const char copyright[] =
35 "@(#) Copyright (c) 1987, 1993, 1994\n\
36 	The Regents of the University of California.  All rights reserved.\n";
37 #endif
38 
39 #ifndef lint
40 static const char sccsid[] = "@(#)split.c	8.2 (Berkeley) 4/16/94";
41 #endif
42 
43 #include <sys/param.h>
44 #include <sys/types.h>
45 #include <sys/stat.h>
46 
47 #include <ctype.h>
48 #include <err.h>
49 #include <errno.h>
50 #include <fcntl.h>
51 #include <inttypes.h>
52 #include <limits.h>
53 #include <locale.h>
54 #include <stdbool.h>
55 #include <stdint.h>
56 #include <stdio.h>
57 #include <stdlib.h>
58 #include <string.h>
59 #include <unistd.h>
60 #include <regex.h>
61 #include <sysexits.h>
62 
#define DEFLINE	1000			/* Default num lines per file. */

/*
 * Program state shared by main() and the split routines.  The option
 * variables are filled in by main() while parsing the command line.
 */
static off_t	 bytecnt;		/* Byte count to split on. */
static off_t	 chunks = 0;		/* Chunks count to split into. */
static long	 numlines;		/* Line count to split on. */
static int	 file_open;		/* Nonzero while an output file is open. */
static int	 ifd = -1, ofd = -1;	/* Input/output file descriptors. */
static char	 bfr[MAXBSIZE];		/* I/O buffer. */
static char	 fname[MAXPATHLEN];	/* File name prefix; newfile() appends the suffix. */
static regex_t	 rgx;			/* Compiled -p pattern. */
static int	 pflag;			/* Nonzero when -p was given. */
static bool	 dflag;			/* True when -d was given (decimal suffixes). */
static long	 sufflen = 2;		/* File name suffix length. */

/* Forward declarations of the helpers defined below main(). */
static void newfile(void);
static void split1(void);
static void split2(void);
static void split3(void);
static void usage(void);
82 
83 int
84 main(int argc, char **argv)
85 {
86 	intmax_t bytecnti;
87 	long scale;
88 	int ch;
89 	char *ep, *p;
90 
91 	setlocale(LC_ALL, "");
92 
93 	dflag = false;
94 	while ((ch = getopt(argc, argv, "0123456789a:b:dl:n:p:")) != -1)
95 		switch (ch) {
96 		case '0': case '1': case '2': case '3': case '4':
97 		case '5': case '6': case '7': case '8': case '9':
98 			/*
99 			 * Undocumented kludge: split was originally designed
100 			 * to take a number after a dash.
101 			 */
102 			if (numlines == 0) {
103 				p = argv[optind - 1];
104 				if (p[0] == '-' && p[1] == ch && !p[2])
105 					numlines = strtol(++p, &ep, 10);
106 				else
107 					numlines =
108 					    strtol(argv[optind] + 1, &ep, 10);
109 				if (numlines <= 0 || *ep)
110 					errx(EX_USAGE,
111 					    "%s: illegal line count", optarg);
112 			}
113 			break;
114 		case 'a':		/* Suffix length */
115 			if ((sufflen = strtol(optarg, &ep, 10)) <= 0 || *ep)
116 				errx(EX_USAGE,
117 				    "%s: illegal suffix length", optarg);
118 			break;
119 		case 'b':		/* Byte count. */
120 			errno = 0;
121 			if ((bytecnti = strtoimax(optarg, &ep, 10)) <= 0 ||
122 			    strchr("kKmMgG", *ep) == NULL || errno != 0)
123 				errx(EX_USAGE,
124 				    "%s: illegal byte count", optarg);
125 			if (*ep == 'k' || *ep == 'K')
126 				scale = 1024;
127 			else if (*ep == 'm' || *ep == 'M')
128 				scale = 1024 * 1024;
129 			else if (*ep == 'g' || *ep == 'G')
130 				scale = 1024 * 1024 * 1024;
131 			else
132 				scale = 1;
133 			if (bytecnti > OFF_MAX / scale)
134 				errx(EX_USAGE, "%s: offset too large", optarg);
135 			bytecnt = (off_t)(bytecnti * scale);
136 			break;
137 		case 'd':		/* Decimal suffix */
138 			dflag = true;
139 			break;
140 		case 'l':		/* Line count. */
141 			if (numlines != 0)
142 				usage();
143 			if ((numlines = strtol(optarg, &ep, 10)) <= 0 || *ep)
144 				errx(EX_USAGE,
145 				    "%s: illegal line count", optarg);
146 			break;
147 		case 'n':		/* Chunks. */
148 			if (!isdigit((unsigned char)optarg[0]) ||
149 			    (chunks = (size_t)strtoul(optarg, &ep, 10)) == 0 ||
150 			    *ep != '\0') {
151 				errx(EX_USAGE, "%s: illegal number of chunks",
152 				     optarg);
153 			}
154 			break;
155 
156 		case 'p':		/* pattern matching. */
157 			if (regcomp(&rgx, optarg, REG_EXTENDED|REG_NOSUB) != 0)
158 				errx(EX_USAGE, "%s: illegal regexp", optarg);
159 			pflag = 1;
160 			break;
161 		default:
162 			usage();
163 		}
164 	argv += optind;
165 	argc -= optind;
166 
167 	if (*argv != NULL) {			/* Input file. */
168 		if (strcmp(*argv, "-") == 0)
169 			ifd = STDIN_FILENO;
170 		else if ((ifd = open(*argv, O_RDONLY, 0)) < 0)
171 			err(EX_NOINPUT, "%s", *argv);
172 		++argv;
173 	}
174 	if (*argv != NULL)			/* File name prefix. */
175 		if (strlcpy(fname, *argv++, sizeof(fname)) >= sizeof(fname))
176 			errx(EX_USAGE, "file name prefix is too long");
177 	if (*argv != NULL)
178 		usage();
179 
180 	if (strlen(fname) + (unsigned long)sufflen >= sizeof(fname))
181 		errx(EX_USAGE, "suffix is too long");
182 	if (pflag && (numlines != 0 || bytecnt != 0 || chunks != 0))
183 		usage();
184 
185 	if (numlines == 0)
186 		numlines = DEFLINE;
187 	else if (bytecnt != 0 || chunks != 0)
188 		usage();
189 
190 	if (bytecnt && chunks)
191 		usage();
192 
193 	if (ifd == -1)				/* Stdin by default. */
194 		ifd = 0;
195 
196 	if (bytecnt) {
197 		split1();
198 		exit (0);
199 	} else if (chunks) {
200 		split3();
201 		exit (0);
202 	}
203 	split2();
204 	if (pflag)
205 		regfree(&rgx);
206 	exit(0);
207 }
208 
209 /*
210  * split1 --
211  *	Split the input by bytes.
212  */
213 static void
214 split1(void)
215 {
216 	off_t bcnt;
217 	char *C;
218 	ssize_t dist, len;
219 	int nfiles;
220 
221 	nfiles = 0;
222 
223 	for (bcnt = 0;;)
224 		switch ((len = read(ifd, bfr, MAXBSIZE))) {
225 		case 0:
226 			exit(0);
227 		case -1:
228 			err(EX_IOERR, "read");
229 			/* NOTREACHED */
230 		default:
231 			if (!file_open) {
232 				if (!chunks || (nfiles < chunks)) {
233 					newfile();
234 					nfiles++;
235 				}
236 			}
237 			if (bcnt + len >= bytecnt) {
238 				dist = bytecnt - bcnt;
239 				if (write(ofd, bfr, dist) != dist)
240 					err(EX_IOERR, "write");
241 				len -= dist;
242 				for (C = bfr + dist; len >= bytecnt;
243 				    len -= bytecnt, C += bytecnt) {
244 					if (!chunks || (nfiles < chunks)) {
245 					newfile();
246 						nfiles++;
247 					}
248 					if (write(ofd,
249 					    C, bytecnt) != bytecnt)
250 						err(EX_IOERR, "write");
251 				}
252 				if (len != 0) {
253 					if (!chunks || (nfiles < chunks)) {
254 					newfile();
255 						nfiles++;
256 					}
257 					if (write(ofd, C, len) != len)
258 						err(EX_IOERR, "write");
259 				} else
260 					file_open = 0;
261 				bcnt = len;
262 			} else {
263 				bcnt += len;
264 				if (write(ofd, bfr, len) != len)
265 					err(EX_IOERR, "write");
266 			}
267 		}
268 }
269 
270 /*
271  * split2 --
272  *	Split the input by lines.
273  */
274 static void
275 split2(void)
276 {
277 	long lcnt = 0;
278 	FILE *infp;
279 
280 	/* Stick a stream on top of input file descriptor */
281 	if ((infp = fdopen(ifd, "r")) == NULL)
282 		err(EX_NOINPUT, "fdopen");
283 
284 	/* Process input one line at a time */
285 	while (fgets(bfr, sizeof(bfr), infp) != NULL) {
286 		const int len = strlen(bfr);
287 
288 		/* If line is too long to deal with, just write it out */
289 		if (bfr[len - 1] != '\n')
290 			goto writeit;
291 
292 		/* Check if we need to start a new file */
293 		if (pflag) {
294 			regmatch_t pmatch;
295 
296 			pmatch.rm_so = 0;
297 			pmatch.rm_eo = len - 1;
298 			if (regexec(&rgx, bfr, 0, &pmatch, REG_STARTEND) == 0)
299 				newfile();
300 		} else if (lcnt++ == numlines) {
301 			newfile();
302 			lcnt = 1;
303 		}
304 
305 writeit:
306 		/* Open output file if needed */
307 		if (!file_open)
308 			newfile();
309 
310 		/* Write out line */
311 		if (write(ofd, bfr, len) != len)
312 			err(EX_IOERR, "write");
313 	}
314 
315 	/* EOF or error? */
316 	if (ferror(infp))
317 		err(EX_IOERR, "read");
318 	else
319 		exit(0);
320 }
321 
322 /*
323  * split3 --
324  *	Split the input into specified number of chunks
325  */
326 static void
327 split3(void)
328 {
329 	struct stat sb;
330 
331 	if (fstat(ifd, &sb) == -1) {
332 		err(1, "stat");
333 		/* NOTREACHED */
334 	}
335 
336 	if (chunks > sb.st_size) {
337 		errx(1, "can't split into more than %d files",
338 		    (int)sb.st_size);
339 		/* NOTREACHED */
340 	}
341 
342 	bytecnt = sb.st_size / chunks;
343 	split1();
344 }
345 
346 
347 /*
348  * newfile --
349  *	Open a new output file.
350  */
351 static void
352 newfile(void)
353 {
354 	long i, maxfiles, tfnum;
355 	static long fnum;
356 	static char *fpnt;
357 	char beg, end;
358 	int pattlen;
359 
360 	if (ofd == -1) {
361 		if (fname[0] == '\0') {
362 			fname[0] = 'x';
363 			fpnt = fname + 1;
364 		} else {
365 			fpnt = fname + strlen(fname);
366 		}
367 		ofd = fileno(stdout);
368 	}
369 
370 	if (dflag) {
371 		beg = '0';
372 		end = '9';
373 	}
374 	else {
375 		beg = 'a';
376 		end = 'z';
377 	}
378 	pattlen = end - beg + 1;
379 
380 	/* maxfiles = pattlen^sufflen, but don't use libm. */
381 	for (maxfiles = 1, i = 0; i < sufflen; i++)
382 		if (LONG_MAX / pattlen < maxfiles)
383 			errx(EX_USAGE, "suffix is too long (max %ld)", i);
384 		else
385 			maxfiles *= pattlen;
386 
387 	if (fnum == maxfiles)
388 		errx(EX_DATAERR, "too many files");
389 
390 	/* Generate suffix of sufflen letters */
391 	tfnum = fnum;
392 	i = sufflen - 1;
393 	do {
394 		fpnt[i] = tfnum % pattlen + beg;
395 		tfnum /= pattlen;
396 	} while (i-- > 0);
397 	fpnt[sufflen] = '\0';
398 
399 	++fnum;
400 	if (!freopen(fname, "w", stdout))
401 		err(EX_IOERR, "%s", fname);
402 	file_open = 1;
403 }
404 
/*
 * usage --
 *	Print the command synopsis on stderr and exit with EX_USAGE.
 */
static void
usage(void)
{
	static const char synopsis[] =
"usage: split [-l line_count] [-a suffix_length] [file [prefix]]\n"
"       split -b byte_count[K|k|M|m|G|g] [-a suffix_length] [file [prefix]]\n"
"       split -n chunk_count [-a suffix_length] [file [prefix]]\n"
"       split -p pattern [-a suffix_length] [file [prefix]]\n";

	(void)fputs(synopsis, stderr);
	exit(EX_USAGE);
}
415