1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1987, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 #ifndef lint 34 static const char copyright[] = 35 "@(#) Copyright (c) 1987, 1993, 1994\n\ 36 The Regents of the University of California. All rights reserved.\n"; 37 #endif 38 39 40 #include <sys/param.h> 41 #include <sys/stat.h> 42 43 #include <ctype.h> 44 #include <err.h> 45 #include <errno.h> 46 #include <fcntl.h> 47 #include <inttypes.h> 48 #include <libutil.h> 49 #include <limits.h> 50 #include <locale.h> 51 #include <stdbool.h> 52 #include <stdint.h> 53 #include <stdio.h> 54 #include <stdlib.h> 55 #include <string.h> 56 #include <unistd.h> 57 #include <regex.h> 58 #include <sysexits.h> 59 60 #define DEFLINE 1000 /* Default num lines per file. */ 61 62 static off_t bytecnt; /* Byte count to split on. */ 63 static long chunks; /* Chunks count to split into. */ 64 static bool clobber = true; /* Whether to overwrite existing output files. */ 65 static long numlines; /* Line count to split on. */ 66 static int file_open; /* If a file open. */ 67 static int ifd = -1, ofd = -1; /* Input/output file descriptors. */ 68 static char fname[MAXPATHLEN]; /* File name prefix. */ 69 static regex_t rgx; 70 static int pflag; 71 static bool dflag; 72 static long sufflen = 2; /* File name suffix length. */ 73 static bool autosfx = true; /* Whether to auto-extend the suffix length. */ 74 75 static void newfile(void); 76 static void split1(void); 77 static void split2(void); 78 static void split3(void); 79 static void usage(void) __dead2; 80 81 int 82 main(int argc, char **argv) 83 { 84 char errbuf[64]; 85 const char *p, *errstr; 86 int ch, error; 87 88 setlocale(LC_ALL, ""); 89 90 dflag = false; 91 while ((ch = getopt(argc, argv, "0::1::2::3::4::5::6::7::8::9::a:b:cdl:n:p:")) != -1) 92 switch (ch) { 93 case '0': case '1': case '2': case '3': case '4': 94 case '5': case '6': case '7': case '8': case '9': 95 /* 96 * Undocumented kludge: split was originally designed 97 * to take a number after a dash. 98 */ 99 if (numlines != 0) 100 usage(); 101 numlines = ch - '0'; 102 p = optarg ? optarg : ""; 103 while (numlines >= 0 && *p >= '0' && *p <= '9') 104 numlines = numlines * 10 + *p++ - '0'; 105 if (numlines <= 0 || *p != '\0') 106 errx(EX_USAGE, "%c%s: line count is invalid", 107 ch, optarg ? optarg : ""); 108 break; 109 case 'a': /* Suffix length */ 110 sufflen = strtonum(optarg, 0, INT_MAX, &errstr); 111 if (errstr != NULL) { 112 errx(EX_USAGE, "%s: suffix length is %s", 113 optarg, errstr); 114 } 115 if (sufflen == 0) { 116 sufflen = 2; 117 autosfx = true; 118 } else { 119 autosfx = false; 120 } 121 break; 122 case 'b': /* Byte count. */ 123 if (expand_number(optarg, &bytecnt) != 0) { 124 errx(EX_USAGE, "%s: byte count is invalid", 125 optarg); 126 } 127 break; 128 case 'c': /* Continue, don't overwrite output files. */ 129 clobber = false; 130 break; 131 case 'd': /* Decimal suffix */ 132 dflag = true; 133 break; 134 case 'l': /* Line count. */ 135 if (numlines != 0) 136 usage(); 137 numlines = strtonum(optarg, 1, LONG_MAX, &errstr); 138 if (errstr != NULL) { 139 errx(EX_USAGE, "%s: line count is %s", 140 optarg, errstr); 141 } 142 break; 143 case 'n': /* Chunks. */ 144 chunks = strtonum(optarg, 1, LONG_MAX, &errstr); 145 if (errstr != NULL) { 146 errx(EX_USAGE, "%s: number of chunks is %s", 147 optarg, errstr); 148 } 149 break; 150 151 case 'p': /* pattern matching. */ 152 error = regcomp(&rgx, optarg, REG_EXTENDED|REG_NOSUB); 153 if (error != 0) { 154 regerror(error, &rgx, errbuf, sizeof(errbuf)); 155 errx(EX_USAGE, "%s: regex is invalid: %s", 156 optarg, errbuf); 157 } 158 pflag = 1; 159 break; 160 default: 161 usage(); 162 } 163 argv += optind; 164 argc -= optind; 165 166 if (argc > 0) { /* Input file. */ 167 if (strcmp(*argv, "-") == 0) 168 ifd = STDIN_FILENO; 169 else if ((ifd = open(*argv, O_RDONLY, 0)) < 0) 170 err(EX_NOINPUT, "%s", *argv); 171 ++argv; 172 --argc; 173 } 174 if (argc > 0) { /* File name prefix. */ 175 if (strlcpy(fname, *argv, sizeof(fname)) >= sizeof(fname)) { 176 errx(EX_USAGE, "%s: file name prefix is too long", 177 *argv); 178 } 179 ++argv; 180 --argc; 181 } 182 if (argc > 0) 183 usage(); 184 185 if (strlen(fname) + (unsigned long)sufflen >= sizeof(fname)) 186 errx(EX_USAGE, "suffix is too long"); 187 if (pflag && (numlines != 0 || bytecnt != 0 || chunks != 0)) 188 usage(); 189 190 if (numlines == 0) 191 numlines = DEFLINE; 192 else if (bytecnt != 0 || chunks != 0) 193 usage(); 194 195 if (bytecnt != 0 && chunks != 0) 196 usage(); 197 198 if (ifd == -1) /* Stdin by default. */ 199 ifd = 0; 200 201 if (bytecnt != 0) { 202 split1(); 203 exit (0); 204 } else if (chunks != 0) { 205 split3(); 206 exit (0); 207 } 208 split2(); 209 if (pflag) 210 regfree(&rgx); 211 exit(0); 212 } 213 214 /* 215 * split1 -- 216 * Split the input by bytes. 217 */ 218 static void 219 split1(void) 220 { 221 static char bfr[MAXBSIZE]; 222 off_t bcnt; 223 char *C; 224 ssize_t dist, len; 225 int nfiles; 226 227 nfiles = 0; 228 229 for (bcnt = 0;;) 230 switch ((len = read(ifd, bfr, sizeof(bfr)))) { 231 case 0: 232 exit(0); 233 case -1: 234 err(EX_IOERR, "read"); 235 /* NOTREACHED */ 236 default: 237 if (!file_open) { 238 if (chunks == 0 || nfiles < chunks) { 239 newfile(); 240 nfiles++; 241 } 242 } 243 if (bcnt + len >= bytecnt) { 244 dist = bytecnt - bcnt; 245 if (write(ofd, bfr, dist) != dist) 246 err(EX_IOERR, "write"); 247 len -= dist; 248 for (C = bfr + dist; len >= bytecnt; 249 len -= bytecnt, C += bytecnt) { 250 if (chunks == 0 || nfiles < chunks) { 251 newfile(); 252 nfiles++; 253 } 254 if (write(ofd, C, bytecnt) != bytecnt) 255 err(EX_IOERR, "write"); 256 } 257 if (len != 0) { 258 if (chunks == 0 || nfiles < chunks) { 259 newfile(); 260 nfiles++; 261 } 262 if (write(ofd, C, len) != len) 263 err(EX_IOERR, "write"); 264 } else { 265 file_open = 0; 266 } 267 bcnt = len; 268 } else { 269 bcnt += len; 270 if (write(ofd, bfr, len) != len) 271 err(EX_IOERR, "write"); 272 } 273 } 274 } 275 276 /* 277 * split2 -- 278 * Split the input by lines. 279 */ 280 static void 281 split2(void) 282 { 283 char *buf; 284 size_t bufsize; 285 ssize_t len; 286 long lcnt = 0; 287 FILE *infp; 288 289 buf = NULL; 290 bufsize = 0; 291 292 /* Stick a stream on top of input file descriptor */ 293 if ((infp = fdopen(ifd, "r")) == NULL) 294 err(EX_NOINPUT, "fdopen"); 295 296 /* Process input one line at a time */ 297 while ((errno = 0, len = getline(&buf, &bufsize, infp)) > 0) { 298 /* Check if we need to start a new file */ 299 if (pflag) { 300 regmatch_t pmatch; 301 302 pmatch.rm_so = 0; 303 pmatch.rm_eo = len - 1; 304 if (regexec(&rgx, buf, 0, &pmatch, REG_STARTEND) == 0) 305 newfile(); 306 } else if (lcnt++ == numlines) { 307 newfile(); 308 lcnt = 1; 309 } 310 311 /* Open output file if needed */ 312 if (!file_open) 313 newfile(); 314 315 /* Write out line */ 316 if (write(ofd, buf, len) != len) 317 err(EX_IOERR, "write"); 318 } 319 320 /* EOF or error? */ 321 if ((len == -1 && errno != 0) || ferror(infp)) 322 err(EX_IOERR, "read"); 323 else 324 exit(0); 325 } 326 327 /* 328 * split3 -- 329 * Split the input into specified number of chunks 330 */ 331 static void 332 split3(void) 333 { 334 struct stat sb; 335 336 if (fstat(ifd, &sb) == -1) { 337 err(1, "stat"); 338 /* NOTREACHED */ 339 } 340 341 if (chunks > sb.st_size) { 342 errx(1, "can't split into more than %d files", 343 (int)sb.st_size); 344 /* NOTREACHED */ 345 } 346 347 bytecnt = sb.st_size / chunks; 348 split1(); 349 } 350 351 352 /* 353 * newfile -- 354 * Open a new output file. 355 */ 356 static void 357 newfile(void) 358 { 359 long i, maxfiles, tfnum; 360 static long fnum; 361 static char *fpnt; 362 char beg, end; 363 int pattlen; 364 int flags = O_WRONLY | O_CREAT | O_TRUNC; 365 366 if (!clobber) 367 flags |= O_EXCL; 368 369 if (ofd == -1) { 370 if (fname[0] == '\0') { 371 fname[0] = 'x'; 372 fpnt = fname + 1; 373 } else { 374 fpnt = fname + strlen(fname); 375 } 376 } else if (close(ofd) != 0) 377 err(1, "%s", fname); 378 379 again: 380 if (dflag) { 381 beg = '0'; 382 end = '9'; 383 } 384 else { 385 beg = 'a'; 386 end = 'z'; 387 } 388 pattlen = end - beg + 1; 389 390 /* 391 * If '-a' is not specified, then we automatically expand the 392 * suffix length to accomodate splitting all input. We do this 393 * by moving the suffix pointer (fpnt) forward and incrementing 394 * sufflen by one, thereby yielding an additional two characters 395 * and allowing all output files to sort such that 'cat *' yields 396 * the input in order. I.e., the order is '... xyy xyz xzaaa 397 * xzaab ... xzyzy, xzyzz, xzzaaaa, xzzaaab' and so on. 398 */ 399 if (!dflag && autosfx && (fpnt[0] == 'y') && 400 strspn(fpnt+1, "z") == strlen(fpnt+1)) { 401 fpnt = fname + strlen(fname) - sufflen; 402 fpnt[sufflen + 2] = '\0'; 403 fpnt[0] = end; 404 fpnt[1] = beg; 405 406 /* Basename | Suffix 407 * before: 408 * x | yz 409 * after: 410 * xz | a.. */ 411 fpnt++; 412 sufflen++; 413 414 /* Reset so we start back at all 'a's in our extended suffix. */ 415 fnum = 0; 416 } 417 418 /* maxfiles = pattlen^sufflen, but don't use libm. */ 419 for (maxfiles = 1, i = 0; i < sufflen; i++) 420 if (LONG_MAX / pattlen < maxfiles) 421 errx(EX_USAGE, "suffix is too long (max %ld)", i); 422 else 423 maxfiles *= pattlen; 424 425 if (fnum == maxfiles) 426 errx(EX_DATAERR, "too many files"); 427 428 /* Generate suffix of sufflen letters */ 429 tfnum = fnum; 430 i = sufflen - 1; 431 do { 432 fpnt[i] = tfnum % pattlen + beg; 433 tfnum /= pattlen; 434 } while (i-- > 0); 435 fpnt[sufflen] = '\0'; 436 437 ++fnum; 438 if ((ofd = open(fname, flags, DEFFILEMODE)) < 0) { 439 if (!clobber && errno == EEXIST) 440 goto again; 441 err(EX_IOERR, "%s", fname); 442 } 443 file_open = 1; 444 } 445 446 static void 447 usage(void) 448 { 449 (void)fprintf(stderr, 450 "usage: split [-cd] [-l line_count] [-a suffix_length] [file [prefix]]\n" 451 " split [-cd] -b byte_count[K|k|M|m|G|g] [-a suffix_length] [file [prefix]]\n" 452 " split [-cd] -n chunk_count [-a suffix_length] [file [prefix]]\n" 453 " split [-cd] -p pattern [-a suffix_length] [file [prefix]]\n"); 454 exit(EX_USAGE); 455 } 456