1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1987, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 #ifndef lint 34 static const char copyright[] = 35 "@(#) Copyright (c) 1987, 1993, 1994\n\ 36 The Regents of the University of California. All rights reserved.\n"; 37 #endif 38 39 #ifndef lint 40 static const char sccsid[] = "@(#)split.c 8.2 (Berkeley) 4/16/94"; 41 #endif 42 43 #include <sys/param.h> 44 #include <sys/types.h> 45 #include <sys/stat.h> 46 47 #include <ctype.h> 48 #include <err.h> 49 #include <errno.h> 50 #include <fcntl.h> 51 #include <inttypes.h> 52 #include <libutil.h> 53 #include <limits.h> 54 #include <locale.h> 55 #include <stdbool.h> 56 #include <stdint.h> 57 #include <stdio.h> 58 #include <stdlib.h> 59 #include <string.h> 60 #include <unistd.h> 61 #include <regex.h> 62 #include <sysexits.h> 63 64 #define DEFLINE 1000 /* Default num lines per file. */ 65 66 static off_t bytecnt; /* Byte count to split on. */ 67 static off_t chunks = 0; /* Chunks count to split into. */ 68 static bool clobber = true; /* Whether to overwrite existing output files. */ 69 static long numlines; /* Line count to split on. */ 70 static int file_open; /* If a file open. */ 71 static int ifd = -1, ofd = -1; /* Input/output file descriptors. */ 72 static char fname[MAXPATHLEN]; /* File name prefix. */ 73 static regex_t rgx; 74 static int pflag; 75 static bool dflag; 76 static long sufflen = 2; /* File name suffix length. */ 77 static int autosfx = 1; /* Whether to auto-extend the suffix length. */ 78 79 static void newfile(void); 80 static void split1(void); 81 static void split2(void); 82 static void split3(void); 83 static void usage(void) __dead2; 84 85 int 86 main(int argc, char **argv) 87 { 88 int ch; 89 int error; 90 char *ep, *p; 91 92 setlocale(LC_ALL, ""); 93 94 dflag = false; 95 while ((ch = getopt(argc, argv, "0123456789a:b:cdl:n:p:")) != -1) 96 switch (ch) { 97 case '0': case '1': case '2': case '3': case '4': 98 case '5': case '6': case '7': case '8': case '9': 99 /* 100 * Undocumented kludge: split was originally designed 101 * to take a number after a dash. 102 */ 103 if (numlines == 0) { 104 p = argv[optind - 1]; 105 if (p[0] == '-' && p[1] == ch && !p[2]) 106 numlines = strtol(++p, &ep, 10); 107 else 108 numlines = 109 strtol(argv[optind] + 1, &ep, 10); 110 if (numlines <= 0 || *ep) 111 errx(EX_USAGE, 112 "%s: illegal line count", optarg); 113 } 114 break; 115 case 'a': /* Suffix length */ 116 if ((sufflen = strtol(optarg, &ep, 10)) <= 0 || *ep) 117 errx(EX_USAGE, 118 "%s: illegal suffix length", optarg); 119 autosfx = 0; 120 break; 121 case 'b': /* Byte count. */ 122 errno = 0; 123 error = expand_number(optarg, &bytecnt); 124 if (error == -1) 125 errx(EX_USAGE, "%s: offset too large", optarg); 126 break; 127 case 'c': /* Continue, don't overwrite output files. */ 128 clobber = false; 129 break; 130 case 'd': /* Decimal suffix */ 131 dflag = true; 132 break; 133 case 'l': /* Line count. */ 134 if (numlines != 0) 135 usage(); 136 if ((numlines = strtol(optarg, &ep, 10)) <= 0 || *ep) 137 errx(EX_USAGE, 138 "%s: illegal line count", optarg); 139 break; 140 case 'n': /* Chunks. */ 141 if (!isdigit((unsigned char)optarg[0]) || 142 (chunks = (size_t)strtoul(optarg, &ep, 10)) == 0 || 143 *ep != '\0') { 144 errx(EX_USAGE, "%s: illegal number of chunks", 145 optarg); 146 } 147 break; 148 149 case 'p': /* pattern matching. */ 150 if (regcomp(&rgx, optarg, REG_EXTENDED|REG_NOSUB) != 0) 151 errx(EX_USAGE, "%s: illegal regexp", optarg); 152 pflag = 1; 153 break; 154 default: 155 usage(); 156 } 157 argv += optind; 158 argc -= optind; 159 160 if (*argv != NULL) { /* Input file. */ 161 if (strcmp(*argv, "-") == 0) 162 ifd = STDIN_FILENO; 163 else if ((ifd = open(*argv, O_RDONLY, 0)) < 0) 164 err(EX_NOINPUT, "%s", *argv); 165 ++argv; 166 } 167 if (*argv != NULL) /* File name prefix. */ 168 if (strlcpy(fname, *argv++, sizeof(fname)) >= sizeof(fname)) 169 errx(EX_USAGE, "file name prefix is too long"); 170 if (*argv != NULL) 171 usage(); 172 173 if (strlen(fname) + (unsigned long)sufflen >= sizeof(fname)) 174 errx(EX_USAGE, "suffix is too long"); 175 if (pflag && (numlines != 0 || bytecnt != 0 || chunks != 0)) 176 usage(); 177 178 if (numlines == 0) 179 numlines = DEFLINE; 180 else if (bytecnt != 0 || chunks != 0) 181 usage(); 182 183 if (bytecnt && chunks) 184 usage(); 185 186 if (ifd == -1) /* Stdin by default. */ 187 ifd = 0; 188 189 if (bytecnt) { 190 split1(); 191 exit (0); 192 } else if (chunks) { 193 split3(); 194 exit (0); 195 } 196 split2(); 197 if (pflag) 198 regfree(&rgx); 199 exit(0); 200 } 201 202 /* 203 * split1 -- 204 * Split the input by bytes. 205 */ 206 static void 207 split1(void) 208 { 209 static char bfr[MAXBSIZE]; 210 off_t bcnt; 211 char *C; 212 ssize_t dist, len; 213 int nfiles; 214 215 nfiles = 0; 216 217 for (bcnt = 0;;) 218 switch ((len = read(ifd, bfr, sizeof(bfr)))) { 219 case 0: 220 exit(0); 221 case -1: 222 err(EX_IOERR, "read"); 223 /* NOTREACHED */ 224 default: 225 if (!file_open) { 226 if (!chunks || (nfiles < chunks)) { 227 newfile(); 228 nfiles++; 229 } 230 } 231 if (bcnt + len >= bytecnt) { 232 dist = bytecnt - bcnt; 233 if (write(ofd, bfr, dist) != dist) 234 err(EX_IOERR, "write"); 235 len -= dist; 236 for (C = bfr + dist; len >= bytecnt; 237 len -= bytecnt, C += bytecnt) { 238 if (!chunks || (nfiles < chunks)) { 239 newfile(); 240 nfiles++; 241 } 242 if (write(ofd, 243 C, bytecnt) != bytecnt) 244 err(EX_IOERR, "write"); 245 } 246 if (len != 0) { 247 if (!chunks || (nfiles < chunks)) { 248 newfile(); 249 nfiles++; 250 } 251 if (write(ofd, C, len) != len) 252 err(EX_IOERR, "write"); 253 } else 254 file_open = 0; 255 bcnt = len; 256 } else { 257 bcnt += len; 258 if (write(ofd, bfr, len) != len) 259 err(EX_IOERR, "write"); 260 } 261 } 262 } 263 264 /* 265 * split2 -- 266 * Split the input by lines. 267 */ 268 static void 269 split2(void) 270 { 271 char *buf; 272 size_t bufsize; 273 ssize_t len; 274 long lcnt = 0; 275 FILE *infp; 276 277 buf = NULL; 278 bufsize = 0; 279 280 /* Stick a stream on top of input file descriptor */ 281 if ((infp = fdopen(ifd, "r")) == NULL) 282 err(EX_NOINPUT, "fdopen"); 283 284 /* Process input one line at a time */ 285 while ((errno = 0, len = getline(&buf, &bufsize, infp)) > 0) { 286 /* Check if we need to start a new file */ 287 if (pflag) { 288 regmatch_t pmatch; 289 290 pmatch.rm_so = 0; 291 pmatch.rm_eo = len - 1; 292 if (regexec(&rgx, buf, 0, &pmatch, REG_STARTEND) == 0) 293 newfile(); 294 } else if (lcnt++ == numlines) { 295 newfile(); 296 lcnt = 1; 297 } 298 299 /* Open output file if needed */ 300 if (!file_open) 301 newfile(); 302 303 /* Write out line */ 304 if (write(ofd, buf, len) != len) 305 err(EX_IOERR, "write"); 306 } 307 308 /* EOF or error? */ 309 if ((len == -1 && errno != 0) || ferror(infp)) 310 err(EX_IOERR, "read"); 311 else 312 exit(0); 313 } 314 315 /* 316 * split3 -- 317 * Split the input into specified number of chunks 318 */ 319 static void 320 split3(void) 321 { 322 struct stat sb; 323 324 if (fstat(ifd, &sb) == -1) { 325 err(1, "stat"); 326 /* NOTREACHED */ 327 } 328 329 if (chunks > sb.st_size) { 330 errx(1, "can't split into more than %d files", 331 (int)sb.st_size); 332 /* NOTREACHED */ 333 } 334 335 bytecnt = sb.st_size / chunks; 336 split1(); 337 } 338 339 340 /* 341 * newfile -- 342 * Open a new output file. 343 */ 344 static void 345 newfile(void) 346 { 347 long i, maxfiles, tfnum; 348 static long fnum; 349 static char *fpnt; 350 char beg, end; 351 int pattlen; 352 int flags = O_WRONLY | O_CREAT | O_TRUNC; 353 354 if (!clobber) 355 flags |= O_EXCL; 356 357 if (ofd == -1) { 358 if (fname[0] == '\0') { 359 fname[0] = 'x'; 360 fpnt = fname + 1; 361 } else { 362 fpnt = fname + strlen(fname); 363 } 364 } else if (close(ofd) != 0) 365 err(1, "%s", fname); 366 367 again: 368 if (dflag) { 369 beg = '0'; 370 end = '9'; 371 } 372 else { 373 beg = 'a'; 374 end = 'z'; 375 } 376 pattlen = end - beg + 1; 377 378 /* 379 * If '-a' is not specified, then we automatically expand the 380 * suffix length to accomodate splitting all input. We do this 381 * by moving the suffix pointer (fpnt) forward and incrementing 382 * sufflen by one, thereby yielding an additional two characters 383 * and allowing all output files to sort such that 'cat *' yields 384 * the input in order. I.e., the order is '... xyy xyz xzaaa 385 * xzaab ... xzyzy, xzyzz, xzzaaaa, xzzaaab' and so on. 386 */ 387 if (!dflag && autosfx && (fpnt[0] == 'y') && 388 strspn(fpnt+1, "z") == strlen(fpnt+1)) { 389 fpnt = fname + strlen(fname) - sufflen; 390 fpnt[sufflen + 2] = '\0'; 391 fpnt[0] = end; 392 fpnt[1] = beg; 393 394 /* Basename | Suffix 395 * before: 396 * x | yz 397 * after: 398 * xz | a.. */ 399 fpnt++; 400 sufflen++; 401 402 /* Reset so we start back at all 'a's in our extended suffix. */ 403 tfnum = 0; 404 fnum = 0; 405 } 406 407 /* maxfiles = pattlen^sufflen, but don't use libm. */ 408 for (maxfiles = 1, i = 0; i < sufflen; i++) 409 if (LONG_MAX / pattlen < maxfiles) 410 errx(EX_USAGE, "suffix is too long (max %ld)", i); 411 else 412 maxfiles *= pattlen; 413 414 if (fnum == maxfiles) 415 errx(EX_DATAERR, "too many files"); 416 417 /* Generate suffix of sufflen letters */ 418 tfnum = fnum; 419 i = sufflen - 1; 420 do { 421 fpnt[i] = tfnum % pattlen + beg; 422 tfnum /= pattlen; 423 } while (i-- > 0); 424 fpnt[sufflen] = '\0'; 425 426 ++fnum; 427 if ((ofd = open(fname, flags, DEFFILEMODE)) < 0) { 428 if (!clobber && errno == EEXIST) 429 goto again; 430 err(EX_IOERR, "%s", fname); 431 } 432 file_open = 1; 433 } 434 435 static void 436 usage(void) 437 { 438 (void)fprintf(stderr, 439 "usage: split [-cd] [-l line_count] [-a suffix_length] [file [prefix]]\n" 440 " split [-cd] -b byte_count[K|k|M|m|G|g] [-a suffix_length] [file [prefix]]\n" 441 " split [-cd] -n chunk_count [-a suffix_length] [file [prefix]]\n" 442 " split [-cd] -p pattern [-a suffix_length] [file [prefix]]\n"); 443 exit(EX_USAGE); 444 } 445