1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1987, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 #ifndef lint 34 static const char copyright[] = 35 "@(#) Copyright (c) 1987, 1993, 1994\n\ 36 The Regents of the University of California. All rights reserved.\n"; 37 #endif 38 39 #ifndef lint 40 static const char sccsid[] = "@(#)split.c 8.2 (Berkeley) 4/16/94"; 41 #endif 42 43 #include <sys/param.h> 44 #include <sys/stat.h> 45 46 #include <ctype.h> 47 #include <err.h> 48 #include <errno.h> 49 #include <fcntl.h> 50 #include <inttypes.h> 51 #include <libutil.h> 52 #include <limits.h> 53 #include <locale.h> 54 #include <stdbool.h> 55 #include <stdint.h> 56 #include <stdio.h> 57 #include <stdlib.h> 58 #include <string.h> 59 #include <unistd.h> 60 #include <regex.h> 61 #include <sysexits.h> 62 63 #define DEFLINE 1000 /* Default num lines per file. */ 64 65 static off_t bytecnt; /* Byte count to split on. */ 66 static long chunks; /* Chunks count to split into. */ 67 static bool clobber = true; /* Whether to overwrite existing output files. */ 68 static long numlines; /* Line count to split on. */ 69 static int file_open; /* If a file open. */ 70 static int ifd = -1, ofd = -1; /* Input/output file descriptors. */ 71 static char fname[MAXPATHLEN]; /* File name prefix. */ 72 static regex_t rgx; 73 static int pflag; 74 static bool dflag; 75 static long sufflen = 2; /* File name suffix length. */ 76 static bool autosfx = true; /* Whether to auto-extend the suffix length. */ 77 78 static void newfile(void); 79 static void split1(void); 80 static void split2(void); 81 static void split3(void); 82 static void usage(void) __dead2; 83 84 int 85 main(int argc, char **argv) 86 { 87 char errbuf[64]; 88 const char *p, *errstr; 89 int ch, error; 90 91 setlocale(LC_ALL, ""); 92 93 dflag = false; 94 while ((ch = getopt(argc, argv, "0::1::2::3::4::5::6::7::8::9::a:b:cdl:n:p:")) != -1) 95 switch (ch) { 96 case '0': case '1': case '2': case '3': case '4': 97 case '5': case '6': case '7': case '8': case '9': 98 /* 99 * Undocumented kludge: split was originally designed 100 * to take a number after a dash. 101 */ 102 if (numlines != 0) 103 usage(); 104 numlines = ch - '0'; 105 p = optarg ? optarg : ""; 106 while (numlines >= 0 && *p >= '0' && *p <= '9') 107 numlines = numlines * 10 + *p++ - '0'; 108 if (numlines <= 0 || *p != '\0') 109 errx(EX_USAGE, "%c%s: line count is invalid", 110 ch, optarg ? optarg : ""); 111 break; 112 case 'a': /* Suffix length */ 113 sufflen = strtonum(optarg, 0, INT_MAX, &errstr); 114 if (errstr != NULL) { 115 errx(EX_USAGE, "%s: suffix length is %s", 116 optarg, errstr); 117 } 118 if (sufflen == 0) { 119 sufflen = 2; 120 autosfx = true; 121 } else { 122 autosfx = false; 123 } 124 break; 125 case 'b': /* Byte count. */ 126 if (expand_number(optarg, &bytecnt) != 0) { 127 errx(EX_USAGE, "%s: byte count is invalid", 128 optarg); 129 } 130 break; 131 case 'c': /* Continue, don't overwrite output files. */ 132 clobber = false; 133 break; 134 case 'd': /* Decimal suffix */ 135 dflag = true; 136 break; 137 case 'l': /* Line count. */ 138 if (numlines != 0) 139 usage(); 140 numlines = strtonum(optarg, 1, LONG_MAX, &errstr); 141 if (errstr != NULL) { 142 errx(EX_USAGE, "%s: line count is %s", 143 optarg, errstr); 144 } 145 break; 146 case 'n': /* Chunks. */ 147 chunks = strtonum(optarg, 1, LONG_MAX, &errstr); 148 if (errstr != NULL) { 149 errx(EX_USAGE, "%s: number of chunks is %s", 150 optarg, errstr); 151 } 152 break; 153 154 case 'p': /* pattern matching. */ 155 error = regcomp(&rgx, optarg, REG_EXTENDED|REG_NOSUB); 156 if (error != 0) { 157 regerror(error, &rgx, errbuf, sizeof(errbuf)); 158 errx(EX_USAGE, "%s: regex is invalid: %s", 159 optarg, errbuf); 160 } 161 pflag = 1; 162 break; 163 default: 164 usage(); 165 } 166 argv += optind; 167 argc -= optind; 168 169 if (argc > 0) { /* Input file. */ 170 if (strcmp(*argv, "-") == 0) 171 ifd = STDIN_FILENO; 172 else if ((ifd = open(*argv, O_RDONLY, 0)) < 0) 173 err(EX_NOINPUT, "%s", *argv); 174 ++argv; 175 --argc; 176 } 177 if (argc > 0) { /* File name prefix. */ 178 if (strlcpy(fname, *argv, sizeof(fname)) >= sizeof(fname)) { 179 errx(EX_USAGE, "%s: file name prefix is too long", 180 *argv); 181 } 182 ++argv; 183 --argc; 184 } 185 if (argc > 0) 186 usage(); 187 188 if (strlen(fname) + (unsigned long)sufflen >= sizeof(fname)) 189 errx(EX_USAGE, "suffix is too long"); 190 if (pflag && (numlines != 0 || bytecnt != 0 || chunks != 0)) 191 usage(); 192 193 if (numlines == 0) 194 numlines = DEFLINE; 195 else if (bytecnt != 0 || chunks != 0) 196 usage(); 197 198 if (bytecnt != 0 && chunks != 0) 199 usage(); 200 201 if (ifd == -1) /* Stdin by default. */ 202 ifd = 0; 203 204 if (bytecnt != 0) { 205 split1(); 206 exit (0); 207 } else if (chunks != 0) { 208 split3(); 209 exit (0); 210 } 211 split2(); 212 if (pflag) 213 regfree(&rgx); 214 exit(0); 215 } 216 217 /* 218 * split1 -- 219 * Split the input by bytes. 220 */ 221 static void 222 split1(void) 223 { 224 static char bfr[MAXBSIZE]; 225 off_t bcnt; 226 char *C; 227 ssize_t dist, len; 228 int nfiles; 229 230 nfiles = 0; 231 232 for (bcnt = 0;;) 233 switch ((len = read(ifd, bfr, sizeof(bfr)))) { 234 case 0: 235 exit(0); 236 case -1: 237 err(EX_IOERR, "read"); 238 /* NOTREACHED */ 239 default: 240 if (!file_open) { 241 if (chunks == 0 || nfiles < chunks) { 242 newfile(); 243 nfiles++; 244 } 245 } 246 if (bcnt + len >= bytecnt) { 247 dist = bytecnt - bcnt; 248 if (write(ofd, bfr, dist) != dist) 249 err(EX_IOERR, "write"); 250 len -= dist; 251 for (C = bfr + dist; len >= bytecnt; 252 len -= bytecnt, C += bytecnt) { 253 if (chunks == 0 || nfiles < chunks) { 254 newfile(); 255 nfiles++; 256 } 257 if (write(ofd, C, bytecnt) != bytecnt) 258 err(EX_IOERR, "write"); 259 } 260 if (len != 0) { 261 if (chunks == 0 || nfiles < chunks) { 262 newfile(); 263 nfiles++; 264 } 265 if (write(ofd, C, len) != len) 266 err(EX_IOERR, "write"); 267 } else { 268 file_open = 0; 269 } 270 bcnt = len; 271 } else { 272 bcnt += len; 273 if (write(ofd, bfr, len) != len) 274 err(EX_IOERR, "write"); 275 } 276 } 277 } 278 279 /* 280 * split2 -- 281 * Split the input by lines. 282 */ 283 static void 284 split2(void) 285 { 286 char *buf; 287 size_t bufsize; 288 ssize_t len; 289 long lcnt = 0; 290 FILE *infp; 291 292 buf = NULL; 293 bufsize = 0; 294 295 /* Stick a stream on top of input file descriptor */ 296 if ((infp = fdopen(ifd, "r")) == NULL) 297 err(EX_NOINPUT, "fdopen"); 298 299 /* Process input one line at a time */ 300 while ((errno = 0, len = getline(&buf, &bufsize, infp)) > 0) { 301 /* Check if we need to start a new file */ 302 if (pflag) { 303 regmatch_t pmatch; 304 305 pmatch.rm_so = 0; 306 pmatch.rm_eo = len - 1; 307 if (regexec(&rgx, buf, 0, &pmatch, REG_STARTEND) == 0) 308 newfile(); 309 } else if (lcnt++ == numlines) { 310 newfile(); 311 lcnt = 1; 312 } 313 314 /* Open output file if needed */ 315 if (!file_open) 316 newfile(); 317 318 /* Write out line */ 319 if (write(ofd, buf, len) != len) 320 err(EX_IOERR, "write"); 321 } 322 323 /* EOF or error? */ 324 if ((len == -1 && errno != 0) || ferror(infp)) 325 err(EX_IOERR, "read"); 326 else 327 exit(0); 328 } 329 330 /* 331 * split3 -- 332 * Split the input into specified number of chunks 333 */ 334 static void 335 split3(void) 336 { 337 struct stat sb; 338 339 if (fstat(ifd, &sb) == -1) { 340 err(1, "stat"); 341 /* NOTREACHED */ 342 } 343 344 if (chunks > sb.st_size) { 345 errx(1, "can't split into more than %d files", 346 (int)sb.st_size); 347 /* NOTREACHED */ 348 } 349 350 bytecnt = sb.st_size / chunks; 351 split1(); 352 } 353 354 355 /* 356 * newfile -- 357 * Open a new output file. 358 */ 359 static void 360 newfile(void) 361 { 362 long i, maxfiles, tfnum; 363 static long fnum; 364 static char *fpnt; 365 char beg, end; 366 int pattlen; 367 int flags = O_WRONLY | O_CREAT | O_TRUNC; 368 369 if (!clobber) 370 flags |= O_EXCL; 371 372 if (ofd == -1) { 373 if (fname[0] == '\0') { 374 fname[0] = 'x'; 375 fpnt = fname + 1; 376 } else { 377 fpnt = fname + strlen(fname); 378 } 379 } else if (close(ofd) != 0) 380 err(1, "%s", fname); 381 382 again: 383 if (dflag) { 384 beg = '0'; 385 end = '9'; 386 } 387 else { 388 beg = 'a'; 389 end = 'z'; 390 } 391 pattlen = end - beg + 1; 392 393 /* 394 * If '-a' is not specified, then we automatically expand the 395 * suffix length to accomodate splitting all input. We do this 396 * by moving the suffix pointer (fpnt) forward and incrementing 397 * sufflen by one, thereby yielding an additional two characters 398 * and allowing all output files to sort such that 'cat *' yields 399 * the input in order. I.e., the order is '... xyy xyz xzaaa 400 * xzaab ... xzyzy, xzyzz, xzzaaaa, xzzaaab' and so on. 401 */ 402 if (!dflag && autosfx && (fpnt[0] == 'y') && 403 strspn(fpnt+1, "z") == strlen(fpnt+1)) { 404 fpnt = fname + strlen(fname) - sufflen; 405 fpnt[sufflen + 2] = '\0'; 406 fpnt[0] = end; 407 fpnt[1] = beg; 408 409 /* Basename | Suffix 410 * before: 411 * x | yz 412 * after: 413 * xz | a.. */ 414 fpnt++; 415 sufflen++; 416 417 /* Reset so we start back at all 'a's in our extended suffix. */ 418 fnum = 0; 419 } 420 421 /* maxfiles = pattlen^sufflen, but don't use libm. */ 422 for (maxfiles = 1, i = 0; i < sufflen; i++) 423 if (LONG_MAX / pattlen < maxfiles) 424 errx(EX_USAGE, "suffix is too long (max %ld)", i); 425 else 426 maxfiles *= pattlen; 427 428 if (fnum == maxfiles) 429 errx(EX_DATAERR, "too many files"); 430 431 /* Generate suffix of sufflen letters */ 432 tfnum = fnum; 433 i = sufflen - 1; 434 do { 435 fpnt[i] = tfnum % pattlen + beg; 436 tfnum /= pattlen; 437 } while (i-- > 0); 438 fpnt[sufflen] = '\0'; 439 440 ++fnum; 441 if ((ofd = open(fname, flags, DEFFILEMODE)) < 0) { 442 if (!clobber && errno == EEXIST) 443 goto again; 444 err(EX_IOERR, "%s", fname); 445 } 446 file_open = 1; 447 } 448 449 static void 450 usage(void) 451 { 452 (void)fprintf(stderr, 453 "usage: split [-cd] [-l line_count] [-a suffix_length] [file [prefix]]\n" 454 " split [-cd] -b byte_count[K|k|M|m|G|g] [-a suffix_length] [file [prefix]]\n" 455 " split [-cd] -n chunk_count [-a suffix_length] [file [prefix]]\n" 456 " split [-cd] -p pattern [-a suffix_length] [file [prefix]]\n"); 457 exit(EX_USAGE); 458 } 459