1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1992-2010 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * * 20 ***********************************************************************/ 21 #pragma prototyped 22 /* 23 * David Korn 24 * AT&T Bell Laboratories 25 * 26 * cut fields or columns from fields from a file 27 */ 28 29 static const char usage[] = 30 "[-?\n@(#)$Id: cut (AT&T Research) 2009-12-04 $\n]" 31 USAGE_LICENSE 32 "[+NAME?cut - cut out selected columns or fields of each line of a file]" 33 "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields " 34 "from one or more files, contatenating them on standard output.]" 35 "[+?The option argument \alist\a is a comma-separated or blank-separated " 36 "list of positive numbers and ranges. Ranges can be of three " 37 "forms. The first is two positive integers separated by a hyphen " 38 "(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to " 39 "\ahigh\a. The second is a positive number preceded by a hyphen " 40 "(\b-\b\ahigh\a), which represents all fields from field \b1\b to " 41 "\ahigh\a. The last is a positive number followed by a hyphen " 42 "(\alow\a\b-\b), which represents all fields from \alow\a to the " 43 "last field, inclusive. Elements in the \alist\a can be repeated, " 44 "can overlap, and can appear in any order. The order of the " 45 "output is that of the input.]" 46 "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]" 47 "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b " 48 "cuts from standard input. The start of the file is defined " 49 "as the current offset.]" 50 "[b:bytes]:[list?\bcut\b based on a list of byte counts.]" 51 "[c:characters]:[list?\bcut\b based on a list of character counts.]" 52 "[d:delimiter]:[delim?The field character for the \b-f\b option is set " 53 "to \adelim\a. The default is the \btab\b character.]" 54 "[f:fields]:[list?\bcut\b based on fields separated by the delimiter " 55 "character specified with the \b-d\b optiion.]" 56 "[n!:split?Split multibyte characters selected by the \b-b\b option.]" 57 "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length " 58 "records of length \areclen\a when used with the \b-b\b or \b-c\b " 59 "option.]" 60 "[s:suppress|only-delimited?Suppress lines with no delimiter characters, " 61 "when used with the \b-f\b option. By default, lines with no " 62 "delimiters will be passsed in untouched.]" 63 "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for " 64 "the \b-f\b option is set to \aldelim\a. The default is the " 65 "\bnewline\b character.]" 66 "[N!:newline?Output new-lines at end of each record when used " 67 "with the \b-b\b or \b-c\b option.]" 68 "\n" 69 "\n[file ...]\n" 70 "\n" 71 "[+EXIT STATUS?]{" 72 "[+0?All files processed successfully.]" 73 "[+>0?One or more files failed to open or could not be read.]" 74 "}" 75 "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]" 76 ; 77 78 #include <cmd.h> 79 #include <ctype.h> 80 81 typedef struct Delim_s 82 { 83 char* str; 84 int len; 85 int chr; 86 } Delim_t; 87 88 typedef struct Cut_s 89 { 90 int mb; 91 int eob; 92 int cflag; 93 int nosplit; 94 int sflag; 95 int nlflag; 96 int reclen; 97 Delim_t wdelim; 98 Delim_t ldelim; 99 unsigned char space[UCHAR_MAX+1]; 100 int list[2]; /* NOTE: must be last member */ 101 } Cut_t; 102 103 #define HUGE INT_MAX 104 #define BLOCK 8*1024 105 #define C_BYTES 1 106 #define C_CHARS 2 107 #define C_FIELDS 4 108 #define C_SUPRESS 8 109 #define C_NOSPLIT 16 110 #define C_NONEWLINE 32 111 112 #define SP_LINE 1 113 #define SP_WORD 2 114 #define SP_WIDE 3 115 116 #define mb2wc(w,p,n) (*ast.mb_towc)(&w,(char*)p,n) 117 118 /* 119 * compare the first of an array of integers 120 */ 121 122 static int 123 mycomp(register const void* a, register const void* b) 124 { 125 if (*((int*)a) < *((int*)b)) 126 return -1; 127 if (*((int*)a) > *((int*)b)) 128 return 1; 129 return 0; 130 } 131 132 static Cut_t* 133 cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen) 134 { 135 register int* lp; 136 register int c; 137 register int n = 0; 138 register int range = 0; 139 register char* cp = str; 140 Cut_t* cut; 141 142 if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int)))) 143 error(ERROR_exit(1), "out of space"); 144 if (cut->mb = mbwide()) 145 { 146 memset(cut->space, 0, sizeof(cut->space) / 2); 147 memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2); 148 } 149 else 150 memset(cut->space, 0, sizeof(cut->space)); 151 cut->wdelim = *wdelim; 152 if (wdelim->len == 1) 153 cut->space[wdelim->chr] = SP_WORD; 154 cut->ldelim = *ldelim; 155 cut->eob = (ldelim->len == 1) ? ldelim->chr : 0; 156 cut->space[cut->eob] = SP_LINE; 157 cut->cflag = (mode&C_CHARS) && cut->mb; 158 cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb; 159 cut->sflag = (mode&C_SUPRESS) != 0; 160 cut->nlflag = (mode&C_NONEWLINE) != 0; 161 cut->reclen = reclen; 162 lp = cut->list; 163 for (;;) 164 switch(c = *cp++) 165 { 166 case ' ': 167 case '\t': 168 while(*cp==' ' || *cp=='\t') 169 cp++; 170 /*FALLTHROUGH*/ 171 case 0: 172 case ',': 173 if(range) 174 { 175 --range; 176 if((n = (n ? (n-range) : (HUGE-1))) < 0) 177 error(ERROR_exit(1),"invalid range for c/f option"); 178 *lp++ = range; 179 *lp++ = n; 180 } 181 else 182 { 183 *lp++ = --n; 184 *lp++ = 1; 185 } 186 if(c==0) 187 { 188 register int *dp; 189 *lp = HUGE; 190 n = 1 + (lp-cut->list)/2; 191 qsort(lp=cut->list,n,2*sizeof(*lp),mycomp); 192 /* eliminate overlapping regions */ 193 for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2) 194 { 195 if(lp[0] <= range) 196 { 197 if(lp[1]==HUGE) 198 { 199 dp[-1] = HUGE; 200 break; 201 } 202 if((c = lp[0]+lp[1]-range)>0) 203 { 204 range += c; 205 dp[-1] += c; 206 } 207 } 208 else 209 { 210 range = *dp++ = lp[0]; 211 if(lp[1]==HUGE) 212 { 213 *dp++ = HUGE; 214 break; 215 } 216 range += (*dp++ = lp[1]); 217 } 218 } 219 *dp = HUGE; 220 lp = cut->list; 221 /* convert ranges into gaps */ 222 for(n=0; *lp!=HUGE; lp+=2) 223 { 224 c = *lp; 225 *lp -= n; 226 n = c+lp[1]; 227 } 228 return cut; 229 } 230 n = range = 0; 231 break; 232 233 case '-': 234 if(range) 235 error(ERROR_exit(1),"bad list for c/f option"); 236 range = n?n:1; 237 n = 0; 238 break; 239 240 default: 241 if(!isdigit(c)) 242 error(ERROR_exit(1),"bad list for c/f option"); 243 n = 10*n + (c-'0'); 244 break; 245 } 246 /* NOTREACHED */ 247 } 248 249 /* 250 * cut each line of file <fdin> and put results to <fdout> using list <list> 251 */ 252 253 static void 254 cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout) 255 { 256 register int c; 257 register int len; 258 register int ncol = 0; 259 register const int* lp = cut->list; 260 register char* bp; 261 register int skip; /* non-zero for don't copy */ 262 int must; 263 char* ep; 264 const char* xx; 265 266 for (;;) 267 { 268 if (len = cut->reclen) 269 bp = sfreserve(fdin, len, -1); 270 else 271 bp = sfgetr(fdin, '\n', 0); 272 if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR))) 273 break; 274 len = sfvalue(fdin); 275 ep = bp + len; 276 xx = 0; 277 if (!(ncol = skip = *(lp = cut->list))) 278 ncol = *++lp; 279 must = 1; 280 do 281 { 282 if (cut->nosplit) 283 { 284 register const char* s = bp; 285 register int w = len < ncol ? len : ncol; 286 register int z; 287 288 while (w > 0) 289 { 290 if (!(*s & 0x80)) 291 z = 1; 292 else if ((z = mblen(s, w)) <= 0) 293 { 294 if (s == bp && xx) 295 { 296 w += s - xx; 297 bp = (char*)(s = xx); 298 xx = 0; 299 continue; 300 } 301 xx = s; 302 if (skip) 303 s += w; 304 w = 0; 305 break; 306 } 307 s += z; 308 w -= z; 309 } 310 c = s - bp; 311 ncol = !w && ncol >= len; 312 } 313 else if (cut->cflag) 314 { 315 register const char* s = bp; 316 register int w = len; 317 register int z; 318 319 while (w > 0 && ncol > 0) 320 { 321 ncol--; 322 if (!(*s & 0x80) || (z = mblen(s, w)) <= 0) 323 z = 1; 324 s += z; 325 w -= z; 326 327 } 328 c = s - bp; 329 ncol = !w && (ncol || !skip); 330 } 331 else 332 { 333 if ((c = ncol) > len) 334 c = len; 335 else if (c == len && !skip) 336 ncol++; 337 ncol -= c; 338 } 339 if (!skip && c) 340 { 341 if (sfwrite(fdout, (char*)bp, c) < 0) 342 return; 343 must = 0; 344 } 345 bp += c; 346 if (ncol) 347 break; 348 len -= c; 349 ncol = *++lp; 350 skip = !skip; 351 } while (ncol != HUGE); 352 if (!cut->nlflag && (skip || must || cut->reclen)) 353 { 354 if (cut->ldelim.len > 1) 355 sfwrite(fdout, cut->ldelim.str, cut->ldelim.len); 356 else 357 sfputc(fdout, cut->ldelim.chr); 358 } 359 } 360 } 361 362 /* 363 * cut each line of file <fdin> and put results to <fdout> using list <list> 364 * stream <fdin> must be line buffered 365 */ 366 367 static void 368 cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout) 369 { 370 register unsigned char *sp = cut->space; 371 register unsigned char *cp; 372 register unsigned char *wp; 373 register int c, nfields; 374 register const int *lp = cut->list; 375 register unsigned char *copy; 376 register int nodelim, empty, inword=0; 377 register unsigned char *ep; 378 unsigned char *bp, *first; 379 int lastchar; 380 wchar_t w; 381 Sfio_t *fdtmp = 0; 382 long offset = 0; 383 unsigned char mb[8]; 384 /* process each buffer */ 385 while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0) 386 { 387 cp = bp; 388 ep = cp + --c; 389 if((lastchar = cp[c]) != cut->eob) 390 *ep = cut->eob; 391 /* process each line in the buffer */ 392 while (cp <= ep) 393 { 394 first = cp; 395 if (!inword) 396 { 397 nodelim = empty = 1; 398 copy = cp; 399 if (nfields = *(lp = cut->list)) 400 copy = 0; 401 else 402 nfields = *++lp; 403 } 404 else if (copy) 405 copy = cp; 406 inword = 0; 407 do 408 { 409 /* skip over non-delimiter characters */ 410 if (cut->mb) 411 for (;;) 412 { 413 switch (c = sp[*(unsigned char*)cp++]) 414 { 415 case 0: 416 continue; 417 case SP_WIDE: 418 wp = --cp; 419 while ((c = mb2wc(w, cp, ep - cp)) <= 0) 420 { 421 /* mb char possibly spanning buffer boundary -- fun stuff */ 422 if ((ep - cp) < mbmax()) 423 { 424 int i; 425 int j; 426 int k; 427 428 if (lastchar != cut->eob) 429 { 430 *ep = lastchar; 431 if ((c = mb2wc(w, cp, ep - cp)) > 0) 432 break; 433 } 434 if (copy) 435 { 436 empty = 0; 437 if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0) 438 goto failed; 439 } 440 for (i = 0; i <= (ep - cp); i++) 441 mb[i] = cp[i]; 442 if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0) 443 goto failed; 444 cp = bp; 445 ep = cp + --c; 446 if ((lastchar = cp[c]) != cut->eob) 447 *ep = cut->eob; 448 j = i; 449 k = 0; 450 while (j < mbmax()) 451 mb[j++] = cp[k++]; 452 if ((c = mb2wc(w, (char*)mb, j)) <= 0) 453 { 454 c = i; 455 w = 0; 456 } 457 first = bp = cp += c - i; 458 if (copy) 459 { 460 copy = bp; 461 if (w == cut->ldelim.chr) 462 lastchar = cut->ldelim.chr; 463 else if (w != cut->wdelim.chr) 464 { 465 empty = 0; 466 if (sfwrite(fdout, (char*)mb, c) < 0) 467 goto failed; 468 } 469 } 470 c = 0; 471 } 472 else 473 { 474 w = *cp; 475 c = 1; 476 } 477 break; 478 } 479 cp += c; 480 c = w; 481 if (c == cut->wdelim.chr) 482 { 483 c = SP_WORD; 484 break; 485 } 486 if (c == cut->ldelim.chr) 487 { 488 c = SP_LINE; 489 break; 490 } 491 continue; 492 default: 493 wp = cp - 1; 494 break; 495 } 496 break; 497 } 498 else 499 { 500 while (!(c = sp[*cp++])); 501 wp = cp - 1; 502 } 503 /* check for end-of-line */ 504 if (c == SP_LINE) 505 { 506 if (cp <= ep) 507 break; 508 if (lastchar == cut->ldelim.chr) 509 break; 510 /* restore cut->last character */ 511 if (lastchar != cut->eob) 512 *ep = lastchar; 513 inword++; 514 if (!sp[lastchar]) 515 break; 516 } 517 nodelim = 0; 518 if (--nfields > 0) 519 continue; 520 nfields = *++lp; 521 if (copy) 522 { 523 empty = 0; 524 if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0) 525 goto failed; 526 copy = 0; 527 } 528 else 529 /* set to delimiter unless the first field */ 530 copy = empty ? cp : wp; 531 } while (!inword); 532 if (!inword) 533 { 534 if (!copy) 535 { 536 if (nodelim) 537 { 538 if (!cut->sflag) 539 { 540 if (offset) 541 { 542 sfseek(fdtmp,(Sfoff_t)0,SEEK_SET); 543 sfmove(fdtmp,fdout,offset,-1); 544 } 545 copy = first; 546 } 547 } 548 else 549 sfputc(fdout,'\n'); 550 } 551 if (offset) 552 sfseek(fdtmp,offset=0,SEEK_SET); 553 } 554 if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0) 555 goto failed; 556 } 557 /* see whether to save in tmp file */ 558 if(inword && nodelim && !cut->sflag && (c=cp-first)>0) 559 { 560 /* copy line to tmpfile in case no fields */ 561 if(!fdtmp) 562 fdtmp = sftmp(BLOCK); 563 sfwrite(fdtmp,(char*)first,c); 564 offset +=c; 565 } 566 } 567 failed: 568 if(fdtmp) 569 sfclose(fdtmp); 570 } 571 572 int 573 b_cut(int argc, char** argv, void* context) 574 { 575 register char* cp = 0; 576 register Sfio_t* fp; 577 char* s; 578 int n; 579 Cut_t* cut; 580 int mode = 0; 581 Delim_t wdelim; 582 Delim_t ldelim; 583 size_t reclen = 0; 584 585 cmdinit(argc, argv, context, ERROR_CATALOG, 0); 586 wdelim.chr = '\t'; 587 ldelim.chr = '\n'; 588 wdelim.len = ldelim.len = 1; 589 for (;;) 590 { 591 switch (n = optget(argv, usage)) 592 { 593 case 0: 594 break; 595 case 'b': 596 case 'c': 597 if(mode&C_FIELDS) 598 { 599 error(2, "f option already specified"); 600 continue; 601 } 602 cp = opt_info.arg; 603 if(n=='b') 604 mode |= C_BYTES; 605 else 606 mode |= C_CHARS; 607 continue; 608 case 'D': 609 ldelim.str = opt_info.arg; 610 if (mbwide()) 611 { 612 s = opt_info.arg; 613 ldelim.chr = mbchar(s); 614 if ((n = s - opt_info.arg) > 1) 615 { 616 ldelim.len = n; 617 continue; 618 } 619 } 620 ldelim.chr = *(unsigned char*)opt_info.arg; 621 ldelim.len = 1; 622 continue; 623 case 'd': 624 wdelim.str = opt_info.arg; 625 if (mbwide()) 626 { 627 s = opt_info.arg; 628 wdelim.chr = mbchar(s); 629 if ((n = s - opt_info.arg) > 1) 630 { 631 wdelim.len = n; 632 continue; 633 } 634 } 635 wdelim.chr = *(unsigned char*)opt_info.arg; 636 wdelim.len = 1; 637 continue; 638 case 'f': 639 if(mode&(C_CHARS|C_BYTES)) 640 { 641 error(2, "c option already specified"); 642 continue; 643 } 644 cp = opt_info.arg; 645 mode |= C_FIELDS; 646 continue; 647 case 'n': 648 mode |= C_NOSPLIT; 649 continue; 650 case 'N': 651 mode |= C_NONEWLINE; 652 continue; 653 case 'R': 654 case 'r': 655 if(opt_info.num>0) 656 reclen = opt_info.num; 657 continue; 658 case 's': 659 mode |= C_SUPRESS; 660 continue; 661 case ':': 662 error(2, "%s", opt_info.arg); 663 break; 664 case '?': 665 error(ERROR_usage(2), "%s", opt_info.arg); 666 break; 667 } 668 break; 669 } 670 argv += opt_info.index; 671 if (error_info.errors) 672 error(ERROR_usage(2), "%s",optusage(NiL)); 673 if(!cp) 674 { 675 error(2, "b, c or f option must be specified"); 676 error(ERROR_usage(2), "%s", optusage(NiL)); 677 } 678 if(!*cp) 679 error(3, "non-empty b, c or f option must be specified"); 680 if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS) 681 error(3, "s option requires f option"); 682 cut = cutinit(mode, cp, &wdelim, &ldelim, reclen); 683 if(cp = *argv) 684 argv++; 685 do 686 { 687 if(!cp || streq(cp,"-")) 688 fp = sfstdin; 689 else if(!(fp = sfopen(NiL,cp,"r"))) 690 { 691 error(ERROR_system(0),"%s: cannot open",cp); 692 continue; 693 } 694 if(mode&C_FIELDS) 695 cutfields(cut,fp,sfstdout); 696 else 697 cutcols(cut,fp,sfstdout); 698 if(fp!=sfstdin) 699 sfclose(fp); 700 } while(cp = *argv++); 701 if (sfsync(sfstdout)) 702 error(ERROR_system(0), "write error"); 703 return error_info.errors != 0; 704 } 705