xref: /freebsd/usr.bin/split/split.c (revision 5b31cc94b10d4bb7109c6b27940a0fc76a44a331)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1987, 1993, 1994
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 #ifndef lint
34 static const char copyright[] =
35 "@(#) Copyright (c) 1987, 1993, 1994\n\
36 	The Regents of the University of California.  All rights reserved.\n";
37 #endif
38 
39 
40 #include <sys/param.h>
41 #include <sys/stat.h>
42 
43 #include <ctype.h>
44 #include <err.h>
45 #include <errno.h>
46 #include <fcntl.h>
47 #include <inttypes.h>
48 #include <libutil.h>
49 #include <limits.h>
50 #include <locale.h>
51 #include <stdbool.h>
52 #include <stdint.h>
53 #include <stdio.h>
54 #include <stdlib.h>
55 #include <string.h>
56 #include <unistd.h>
57 #include <regex.h>
58 #include <sysexits.h>
59 
#define DEFLINE	1000			/* Default num lines per file. */

/*
 * Program state shared by main() and the split routines.  All of it has
 * static storage duration, so everything without an explicit initializer
 * starts out as zero.
 */
static off_t	 bytecnt;		/* Byte count to split on. */
static long	 chunks;		/* Chunks count to split into. */
static bool      clobber = true;        /* Whether to overwrite existing output files. */
static long	 numlines;		/* Line count to split on. */
static int	 file_open;		/* Nonzero while an output file is open for the current piece. */
static int	 ifd = -1, ofd = -1;	/* Input/output file descriptors; -1 until opened. */
static char	 fname[MAXPATHLEN];	/* File name prefix; newfile() appends the suffix in place. */
static regex_t	 rgx;			/* Compiled -p pattern; valid only when pflag is set. */
static int	 pflag;			/* Set when -p was given: split on pattern matches. */
static bool	 dflag;			/* Set by -d: use decimal digits in suffixes. */
static long	 sufflen = 2;		/* File name suffix length. */
static bool	 autosfx = true;	/* Whether to auto-extend the suffix length. */

/* Output file management and the three splitting strategies. */
static void newfile(void);
static void split1(void);
static void split2(void);
static void split3(void);
static void usage(void) __dead2;
80 
81 int
82 main(int argc, char **argv)
83 {
84 	char errbuf[64];
85 	const char *p, *errstr;
86 	int ch, error;
87 
88 	setlocale(LC_ALL, "");
89 
90 	dflag = false;
91 	while ((ch = getopt(argc, argv, "0::1::2::3::4::5::6::7::8::9::a:b:cdl:n:p:")) != -1)
92 		switch (ch) {
93 		case '0': case '1': case '2': case '3': case '4':
94 		case '5': case '6': case '7': case '8': case '9':
95 			/*
96 			 * Undocumented kludge: split was originally designed
97 			 * to take a number after a dash.
98 			 */
99 			if (numlines != 0)
100 				usage();
101 			numlines = ch - '0';
102 			p = optarg ? optarg : "";
103 			while (numlines >= 0 && *p >= '0' && *p <= '9')
104 				numlines = numlines * 10 + *p++ - '0';
105 			if (numlines <= 0 || *p != '\0')
106 				errx(EX_USAGE, "%c%s: line count is invalid",
107 				    ch, optarg ? optarg : "");
108 			break;
109 		case 'a':		/* Suffix length */
110 			sufflen = strtonum(optarg, 0, INT_MAX, &errstr);
111 			if (errstr != NULL) {
112 				errx(EX_USAGE, "%s: suffix length is %s",
113 				    optarg, errstr);
114 			}
115 			if (sufflen == 0) {
116 				sufflen = 2;
117 				autosfx = true;
118 			} else {
119 				autosfx = false;
120 			}
121 			break;
122 		case 'b':		/* Byte count. */
123 			if (expand_number(optarg, &bytecnt) != 0) {
124 				errx(EX_USAGE, "%s: byte count is invalid",
125 				    optarg);
126 			}
127 			break;
128 		case 'c':               /* Continue, don't overwrite output files. */
129 			clobber = false;
130 			break;
131 		case 'd':		/* Decimal suffix */
132 			dflag = true;
133 			break;
134 		case 'l':		/* Line count. */
135 			if (numlines != 0)
136 				usage();
137 			numlines = strtonum(optarg, 1, LONG_MAX, &errstr);
138 			if (errstr != NULL) {
139 				errx(EX_USAGE, "%s: line count is %s",
140 				    optarg, errstr);
141 			}
142 			break;
143 		case 'n':		/* Chunks. */
144 			chunks = strtonum(optarg, 1, LONG_MAX, &errstr);
145 			if (errstr != NULL) {
146 				errx(EX_USAGE, "%s: number of chunks is %s",
147 				    optarg, errstr);
148 			}
149 			break;
150 
151 		case 'p':		/* pattern matching. */
152 			error = regcomp(&rgx, optarg, REG_EXTENDED|REG_NOSUB);
153 			if (error != 0) {
154 				regerror(error, &rgx, errbuf, sizeof(errbuf));
155 				errx(EX_USAGE, "%s: regex is invalid: %s",
156 				    optarg, errbuf);
157 			}
158 			pflag = 1;
159 			break;
160 		default:
161 			usage();
162 		}
163 	argv += optind;
164 	argc -= optind;
165 
166 	if (argc > 0) {			/* Input file. */
167 		if (strcmp(*argv, "-") == 0)
168 			ifd = STDIN_FILENO;
169 		else if ((ifd = open(*argv, O_RDONLY, 0)) < 0)
170 			err(EX_NOINPUT, "%s", *argv);
171 		++argv;
172 		--argc;
173 	}
174 	if (argc > 0) {			/* File name prefix. */
175 		if (strlcpy(fname, *argv, sizeof(fname)) >= sizeof(fname)) {
176 			errx(EX_USAGE, "%s: file name prefix is too long",
177 			    *argv);
178 		}
179 		++argv;
180 		--argc;
181 	}
182 	if (argc > 0)
183 		usage();
184 
185 	if (strlen(fname) + (unsigned long)sufflen >= sizeof(fname))
186 		errx(EX_USAGE, "suffix is too long");
187 	if (pflag && (numlines != 0 || bytecnt != 0 || chunks != 0))
188 		usage();
189 
190 	if (numlines == 0)
191 		numlines = DEFLINE;
192 	else if (bytecnt != 0 || chunks != 0)
193 		usage();
194 
195 	if (bytecnt != 0 && chunks != 0)
196 		usage();
197 
198 	if (ifd == -1)				/* Stdin by default. */
199 		ifd = 0;
200 
201 	if (bytecnt != 0) {
202 		split1();
203 		exit (0);
204 	} else if (chunks != 0) {
205 		split3();
206 		exit (0);
207 	}
208 	split2();
209 	if (pflag)
210 		regfree(&rgx);
211 	exit(0);
212 }
213 
214 /*
215  * split1 --
216  *	Split the input by bytes.
217  */
218 static void
219 split1(void)
220 {
221 	static char bfr[MAXBSIZE];
222 	off_t bcnt;
223 	char *C;
224 	ssize_t dist, len;
225 	int nfiles;
226 
227 	nfiles = 0;
228 
229 	for (bcnt = 0;;)
230 		switch ((len = read(ifd, bfr, sizeof(bfr)))) {
231 		case 0:
232 			exit(0);
233 		case -1:
234 			err(EX_IOERR, "read");
235 			/* NOTREACHED */
236 		default:
237 			if (!file_open) {
238 				if (chunks == 0 || nfiles < chunks) {
239 					newfile();
240 					nfiles++;
241 				}
242 			}
243 			if (bcnt + len >= bytecnt) {
244 				dist = bytecnt - bcnt;
245 				if (write(ofd, bfr, dist) != dist)
246 					err(EX_IOERR, "write");
247 				len -= dist;
248 				for (C = bfr + dist; len >= bytecnt;
249 				     len -= bytecnt, C += bytecnt) {
250 					if (chunks == 0 || nfiles < chunks) {
251 						newfile();
252 						nfiles++;
253 					}
254 					if (write(ofd, C, bytecnt) != bytecnt)
255 						err(EX_IOERR, "write");
256 				}
257 				if (len != 0) {
258 					if (chunks == 0 || nfiles < chunks) {
259 						newfile();
260 						nfiles++;
261 					}
262 					if (write(ofd, C, len) != len)
263 						err(EX_IOERR, "write");
264 				} else {
265 					file_open = 0;
266 				}
267 				bcnt = len;
268 			} else {
269 				bcnt += len;
270 				if (write(ofd, bfr, len) != len)
271 					err(EX_IOERR, "write");
272 			}
273 		}
274 }
275 
276 /*
277  * split2 --
278  *	Split the input by lines.
279  */
280 static void
281 split2(void)
282 {
283 	char *buf;
284 	size_t bufsize;
285 	ssize_t len;
286 	long lcnt = 0;
287 	FILE *infp;
288 
289 	buf = NULL;
290 	bufsize = 0;
291 
292 	/* Stick a stream on top of input file descriptor */
293 	if ((infp = fdopen(ifd, "r")) == NULL)
294 		err(EX_NOINPUT, "fdopen");
295 
296 	/* Process input one line at a time */
297 	while ((errno = 0, len = getline(&buf, &bufsize, infp)) > 0) {
298 		/* Check if we need to start a new file */
299 		if (pflag) {
300 			regmatch_t pmatch;
301 
302 			pmatch.rm_so = 0;
303 			pmatch.rm_eo = len - 1;
304 			if (regexec(&rgx, buf, 0, &pmatch, REG_STARTEND) == 0)
305 				newfile();
306 		} else if (lcnt++ == numlines) {
307 			newfile();
308 			lcnt = 1;
309 		}
310 
311 		/* Open output file if needed */
312 		if (!file_open)
313 			newfile();
314 
315 		/* Write out line */
316 		if (write(ofd, buf, len) != len)
317 			err(EX_IOERR, "write");
318 	}
319 
320 	/* EOF or error? */
321 	if ((len == -1 && errno != 0) || ferror(infp))
322 		err(EX_IOERR, "read");
323 	else
324 		exit(0);
325 }
326 
327 /*
328  * split3 --
329  *	Split the input into specified number of chunks
330  */
331 static void
332 split3(void)
333 {
334 	struct stat sb;
335 
336 	if (fstat(ifd, &sb) == -1) {
337 		err(1, "stat");
338 		/* NOTREACHED */
339 	}
340 
341 	if (chunks > sb.st_size) {
342 		errx(1, "can't split into more than %d files",
343 		    (int)sb.st_size);
344 		/* NOTREACHED */
345 	}
346 
347 	bytecnt = sb.st_size / chunks;
348 	split1();
349 }
350 
351 
352 /*
353  * newfile --
354  *	Open a new output file.
355  */
356 static void
357 newfile(void)
358 {
359 	long i, maxfiles, tfnum;
360 	static long fnum;
361 	static char *fpnt;
362 	char beg, end;
363 	int pattlen;
364 	int flags = O_WRONLY | O_CREAT | O_TRUNC;
365 
366 	if (!clobber)
367 		flags |= O_EXCL;
368 
369 	if (ofd == -1) {
370 		if (fname[0] == '\0') {
371 			fname[0] = 'x';
372 			fpnt = fname + 1;
373 		} else {
374 			fpnt = fname + strlen(fname);
375 		}
376 	} else if (close(ofd) != 0)
377 		err(1, "%s", fname);
378 
379 	again:
380 	if (dflag) {
381 		beg = '0';
382 		end = '9';
383 	}
384 	else {
385 		beg = 'a';
386 		end = 'z';
387 	}
388 	pattlen = end - beg + 1;
389 
390 	/*
391 	 * If '-a' is not specified, then we automatically expand the
392 	 * suffix length to accomodate splitting all input.  We do this
393 	 * by moving the suffix pointer (fpnt) forward and incrementing
394 	 * sufflen by one, thereby yielding an additional two characters
395 	 * and allowing all output files to sort such that 'cat *' yields
396 	 * the input in order.  I.e., the order is '... xyy xyz xzaaa
397 	 * xzaab ... xzyzy, xzyzz, xzzaaaa, xzzaaab' and so on.
398 	 */
399 	if (!dflag && autosfx && (fpnt[0] == 'y') &&
400 			strspn(fpnt+1, "z") == strlen(fpnt+1)) {
401 		fpnt = fname + strlen(fname) - sufflen;
402 		fpnt[sufflen + 2] = '\0';
403 		fpnt[0] = end;
404 		fpnt[1] = beg;
405 
406 		/*  Basename | Suffix
407 		 *  before:
408 		 *  x        | yz
409 		 *  after:
410 		 *  xz       | a.. */
411 		fpnt++;
412 		sufflen++;
413 
414 		/* Reset so we start back at all 'a's in our extended suffix. */
415 		fnum = 0;
416 	}
417 
418 	/* maxfiles = pattlen^sufflen, but don't use libm. */
419 	for (maxfiles = 1, i = 0; i < sufflen; i++)
420 		if (LONG_MAX / pattlen < maxfiles)
421 			errx(EX_USAGE, "suffix is too long (max %ld)", i);
422 		else
423 			maxfiles *= pattlen;
424 
425 	if (fnum == maxfiles)
426 		errx(EX_DATAERR, "too many files");
427 
428 	/* Generate suffix of sufflen letters */
429 	tfnum = fnum;
430 	i = sufflen - 1;
431 	do {
432 		fpnt[i] = tfnum % pattlen + beg;
433 		tfnum /= pattlen;
434 	} while (i-- > 0);
435 	fpnt[sufflen] = '\0';
436 
437 	++fnum;
438 	if ((ofd = open(fname, flags, DEFFILEMODE)) < 0) {
439 		if (!clobber && errno == EEXIST)
440 			goto again;
441 		err(EX_IOERR, "%s", fname);
442 	}
443 	file_open = 1;
444 }
445 
/*
 * usage --
 *	Write the command synopsis to standard error and terminate with
 *	EX_USAGE.
 */
static void
usage(void)
{
	static const char synopsis[] =
"usage: split [-cd] [-l line_count] [-a suffix_length] [file [prefix]]\n"
"       split [-cd] -b byte_count[K|k|M|m|G|g] [-a suffix_length] [file [prefix]]\n"
"       split [-cd] -n chunk_count [-a suffix_length] [file [prefix]]\n"
"       split [-cd] -p pattern [-a suffix_length] [file [prefix]]\n";

	(void)fputs(synopsis, stderr);
	exit(EX_USAGE);
}
456