1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1992-2012 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Eclipse Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.eclipse.org/org/documents/epl-v10.html * 11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * * 20 ***********************************************************************/ 21 #pragma prototyped 22 /* 23 * David Korn 24 * AT&T Bell Laboratories 25 * 26 * cut fields or columns from fields from a file 27 */ 28 29 static const char usage[] = 30 "[-?\n@(#)$Id: cut (AT&T Research) 2010-08-11 $\n]" 31 USAGE_LICENSE 32 "[+NAME?cut - cut out selected columns or fields of each line of a file]" 33 "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields " 34 "from one or more files, contatenating them on standard output.]" 35 "[+?The option argument \alist\a is a comma-separated or blank-separated " 36 "list of positive numbers and ranges. Ranges can be of three " 37 "forms. The first is two positive integers separated by a hyphen " 38 "(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to " 39 "\ahigh\a. The second is a positive number preceded by a hyphen " 40 "(\b-\b\ahigh\a), which represents all fields from field \b1\b to " 41 "\ahigh\a. The last is a positive number followed by a hyphen " 42 "(\alow\a\b-\b), which represents all fields from \alow\a to the " 43 "last field, inclusive. Elements in the \alist\a can be repeated, " 44 "can overlap, and can appear in any order. The order of the " 45 "output is that of the input.]" 46 "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]" 47 "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b " 48 "cuts from standard input. The start of the file is defined " 49 "as the current offset.]" 50 "[b:bytes]:[list?\bcut\b based on a list of byte counts.]" 51 "[c:characters]:[list?\bcut\b based on a list of character counts.]" 52 "[d:delimiter]:[delim?The field character for the \b-f\b option is set " 53 "to \adelim\a. The default is the \btab\b character.]" 54 "[f:fields]:[list?\bcut\b based on fields separated by the delimiter " 55 "character specified with the \b-d\b optiion.]" 56 "[n!:split?Split multibyte characters selected by the \b-b\b option.]" 57 "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length " 58 "records of length \areclen\a when used with the \b-b\b or \b-c\b " 59 "option.]" 60 "[s:suppress|only-delimited?Suppress lines with no delimiter characters, " 61 "when used with the \b-f\b option. By default, lines with no " 62 "delimiters will be passsed in untouched.]" 63 "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for " 64 "the \b-f\b option is set to \aldelim\a. The default is the " 65 "\bnewline\b character.]" 66 "[N!:newline?Output new-lines at end of each record when used " 67 "with the \b-b\b or \b-c\b option.]" 68 "\n" 69 "\n[file ...]\n" 70 "\n" 71 "[+EXIT STATUS?]{" 72 "[+0?All files processed successfully.]" 73 "[+>0?One or more files failed to open or could not be read.]" 74 "}" 75 "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]" 76 ; 77 78 #include <cmd.h> 79 #include <ctype.h> 80 81 typedef struct Delim_s 82 { 83 char* str; 84 int len; 85 int chr; 86 } Delim_t; 87 88 typedef struct Cut_s 89 { 90 int mb; 91 int eob; 92 int cflag; 93 int nosplit; 94 int sflag; 95 int nlflag; 96 int reclen; 97 Delim_t wdelim; 98 Delim_t ldelim; 99 unsigned char space[UCHAR_MAX+1]; 100 int list[2]; /* NOTE: must be last member */ 101 } Cut_t; 102 103 #define HUGE INT_MAX 104 #define BLOCK 8*1024 105 #define C_BYTES 1 106 #define C_CHARS 2 107 #define C_FIELDS 4 108 #define C_SUPRESS 8 109 #define C_NOSPLIT 16 110 #define C_NONEWLINE 32 111 112 #define SP_LINE 1 113 #define SP_WORD 2 114 #define SP_WIDE 3 115 116 /* 117 * compare the first of an array of integers 118 */ 119 120 static int 121 mycomp(register const void* a, register const void* b) 122 { 123 if (*((int*)a) < *((int*)b)) 124 return -1; 125 if (*((int*)a) > *((int*)b)) 126 return 1; 127 return 0; 128 } 129 130 static Cut_t* 131 cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen) 132 { 133 register int* lp; 134 register int c; 135 register int n = 0; 136 register int range = 0; 137 register char* cp = str; 138 Cut_t* cut; 139 140 if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int)))) 141 error(ERROR_exit(1), "out of space"); 142 if (cut->mb = mbwide()) 143 { 144 memset(cut->space, 0, sizeof(cut->space) / 2); 145 memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2); 146 } 147 else 148 memset(cut->space, 0, sizeof(cut->space)); 149 cut->wdelim = *wdelim; 150 if (wdelim->len == 1) 151 cut->space[wdelim->chr] = SP_WORD; 152 cut->ldelim = *ldelim; 153 cut->eob = (ldelim->len == 1) ? ldelim->chr : 0; 154 cut->space[cut->eob] = SP_LINE; 155 cut->cflag = (mode&C_CHARS) && cut->mb; 156 cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb; 157 cut->sflag = (mode&C_SUPRESS) != 0; 158 cut->nlflag = (mode&C_NONEWLINE) != 0; 159 cut->reclen = reclen; 160 lp = cut->list; 161 for (;;) 162 switch(c = *cp++) 163 { 164 case ' ': 165 case '\t': 166 while(*cp==' ' || *cp=='\t') 167 cp++; 168 /*FALLTHROUGH*/ 169 case 0: 170 case ',': 171 if(range) 172 { 173 --range; 174 if((n = (n ? (n-range) : (HUGE-1))) < 0) 175 error(ERROR_exit(1),"invalid range for c/f option"); 176 *lp++ = range; 177 *lp++ = n; 178 } 179 else 180 { 181 *lp++ = --n; 182 *lp++ = 1; 183 } 184 if(c==0) 185 { 186 register int *dp; 187 *lp = HUGE; 188 n = 1 + (lp-cut->list)/2; 189 qsort(lp=cut->list,n,2*sizeof(*lp),mycomp); 190 /* eliminate overlapping regions */ 191 for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2) 192 { 193 if(lp[0] <= range) 194 { 195 if(lp[1]==HUGE) 196 { 197 dp[-1] = HUGE; 198 break; 199 } 200 if((c = lp[0]+lp[1]-range)>0) 201 { 202 range += c; 203 dp[-1] += c; 204 } 205 } 206 else 207 { 208 range = *dp++ = lp[0]; 209 if(lp[1]==HUGE) 210 { 211 *dp++ = HUGE; 212 break; 213 } 214 range += (*dp++ = lp[1]); 215 } 216 } 217 *dp = HUGE; 218 lp = cut->list; 219 /* convert ranges into gaps */ 220 for(n=0; *lp!=HUGE; lp+=2) 221 { 222 c = *lp; 223 *lp -= n; 224 n = c+lp[1]; 225 } 226 return cut; 227 } 228 n = range = 0; 229 break; 230 231 case '-': 232 if(range) 233 error(ERROR_exit(1),"bad list for c/f option"); 234 range = n?n:1; 235 n = 0; 236 break; 237 238 default: 239 if(!isdigit(c)) 240 error(ERROR_exit(1),"bad list for c/f option"); 241 n = 10*n + (c-'0'); 242 break; 243 } 244 /* NOTREACHED */ 245 } 246 247 /* 248 * cut each line of file <fdin> and put results to <fdout> using list <list> 249 */ 250 251 static void 252 cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout) 253 { 254 register int c; 255 register int len; 256 register int ncol = 0; 257 register const int* lp = cut->list; 258 register char* bp; 259 register int skip; /* non-zero for don't copy */ 260 int must; 261 char* ep; 262 const char* xx; 263 264 for (;;) 265 { 266 if (len = cut->reclen) 267 bp = sfreserve(fdin, len, -1); 268 else 269 bp = sfgetr(fdin, '\n', 0); 270 if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR))) 271 break; 272 len = sfvalue(fdin); 273 ep = bp + len; 274 xx = 0; 275 if (!(ncol = skip = *(lp = cut->list))) 276 ncol = *++lp; 277 must = 1; 278 do 279 { 280 if (cut->nosplit) 281 { 282 register const char* s = bp; 283 register int w = len < ncol ? len : ncol; 284 register int z; 285 286 while (w > 0) 287 { 288 if (!(*s & 0x80)) 289 z = 1; 290 else if ((z = mbnsize(s, w)) <= 0) 291 { 292 if (s == bp && xx) 293 { 294 w += s - xx; 295 bp = (char*)(s = xx); 296 xx = 0; 297 continue; 298 } 299 xx = s; 300 if (skip) 301 s += w; 302 w = 0; 303 break; 304 } 305 s += z; 306 w -= z; 307 } 308 c = s - bp; 309 ncol = !w && ncol >= len; 310 } 311 else if (cut->cflag) 312 { 313 register const char* s = bp; 314 register int w = len; 315 register int z; 316 317 while (w > 0 && ncol > 0) 318 { 319 ncol--; 320 if (!(*s & 0x80) || (z = mbnsize(s, w)) <= 0) 321 z = 1; 322 s += z; 323 w -= z; 324 325 } 326 c = s - bp; 327 ncol = !w && (ncol || !skip); 328 } 329 else 330 { 331 if ((c = ncol) > len) 332 c = len; 333 else if (c == len && !skip) 334 ncol++; 335 ncol -= c; 336 } 337 if (!skip && c) 338 { 339 if (sfwrite(fdout, (char*)bp, c) < 0) 340 return; 341 must = 0; 342 } 343 bp += c; 344 if (ncol) 345 break; 346 len -= c; 347 ncol = *++lp; 348 skip = !skip; 349 } while (ncol != HUGE); 350 if (!cut->nlflag && (skip || must || cut->reclen)) 351 { 352 if (cut->ldelim.len > 1) 353 sfwrite(fdout, cut->ldelim.str, cut->ldelim.len); 354 else 355 sfputc(fdout, cut->ldelim.chr); 356 } 357 } 358 } 359 360 /* 361 * cut each line of file <fdin> and put results to <fdout> using list <list> 362 * stream <fdin> must be line buffered 363 */ 364 365 static void 366 cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout) 367 { 368 register unsigned char *sp = cut->space; 369 register unsigned char *cp; 370 register unsigned char *wp; 371 register int c, nfields; 372 register const int *lp = cut->list; 373 register unsigned char *copy; 374 register int nodelim, empty, inword=0; 375 register unsigned char *ep; 376 unsigned char *bp, *first; 377 int lastchar; 378 wchar_t w; 379 Sfio_t *fdtmp = 0; 380 long offset = 0; 381 unsigned char mb[8]; 382 /* process each buffer */ 383 while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0) 384 { 385 cp = bp; 386 ep = cp + --c; 387 if((lastchar = cp[c]) != cut->eob) 388 *ep = cut->eob; 389 /* process each line in the buffer */ 390 while (cp <= ep) 391 { 392 first = cp; 393 if (!inword) 394 { 395 nodelim = empty = 1; 396 copy = cp; 397 if (nfields = *(lp = cut->list)) 398 copy = 0; 399 else 400 nfields = *++lp; 401 } 402 else if (copy) 403 copy = cp; 404 inword = 0; 405 do 406 { 407 /* skip over non-delimiter characters */ 408 if (cut->mb) 409 for (;;) 410 { 411 switch (c = sp[*(unsigned char*)cp++]) 412 { 413 case 0: 414 continue; 415 case SP_WIDE: 416 wp = --cp; 417 while ((c = mb2wc(w, cp, ep - cp)) <= 0) 418 { 419 /* mb char possibly spanning buffer boundary -- fun stuff */ 420 if ((ep - cp) < mbmax()) 421 { 422 int i; 423 int j; 424 int k; 425 426 if (lastchar != cut->eob) 427 { 428 *ep = lastchar; 429 if ((c = mb2wc(w, cp, ep - cp)) > 0) 430 break; 431 } 432 if (copy) 433 { 434 empty = 0; 435 if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0) 436 goto failed; 437 } 438 for (i = 0; i <= (ep - cp); i++) 439 mb[i] = cp[i]; 440 if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0) 441 goto failed; 442 cp = bp; 443 ep = cp + --c; 444 if ((lastchar = cp[c]) != cut->eob) 445 *ep = cut->eob; 446 j = i; 447 k = 0; 448 while (j < mbmax()) 449 mb[j++] = cp[k++]; 450 if ((c = mb2wc(w, (char*)mb, j)) <= 0) 451 { 452 c = i; 453 w = 0; 454 } 455 first = bp = cp += c - i; 456 if (copy) 457 { 458 copy = bp; 459 if (w == cut->ldelim.chr) 460 lastchar = cut->ldelim.chr; 461 else if (w != cut->wdelim.chr) 462 { 463 empty = 0; 464 if (sfwrite(fdout, (char*)mb, c) < 0) 465 goto failed; 466 } 467 } 468 c = 0; 469 } 470 else 471 { 472 w = *cp; 473 c = 1; 474 } 475 break; 476 } 477 cp += c; 478 c = w; 479 if (c == cut->wdelim.chr) 480 { 481 c = SP_WORD; 482 break; 483 } 484 if (c == cut->ldelim.chr) 485 { 486 c = SP_LINE; 487 break; 488 } 489 continue; 490 default: 491 wp = cp - 1; 492 break; 493 } 494 break; 495 } 496 else 497 { 498 while (!(c = sp[*cp++])); 499 wp = cp - 1; 500 } 501 /* check for end-of-line */ 502 if (c == SP_LINE) 503 { 504 if (cp <= ep) 505 break; 506 if (lastchar == cut->ldelim.chr) 507 break; 508 /* restore cut->last character */ 509 if (lastchar != cut->eob) 510 *ep = lastchar; 511 inword++; 512 if (!sp[lastchar]) 513 break; 514 } 515 nodelim = 0; 516 if (--nfields > 0) 517 continue; 518 nfields = *++lp; 519 if (copy) 520 { 521 empty = 0; 522 if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0) 523 goto failed; 524 copy = 0; 525 } 526 else 527 /* set to delimiter unless the first field */ 528 copy = empty ? cp : wp; 529 } while (!inword); 530 if (!inword) 531 { 532 if (!copy) 533 { 534 if (nodelim) 535 { 536 if (!cut->sflag) 537 { 538 if (offset) 539 { 540 sfseek(fdtmp,(Sfoff_t)0,SEEK_SET); 541 sfmove(fdtmp,fdout,offset,-1); 542 } 543 copy = first; 544 } 545 } 546 else 547 sfputc(fdout,'\n'); 548 } 549 if (offset) 550 sfseek(fdtmp,offset=0,SEEK_SET); 551 } 552 if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0) 553 goto failed; 554 } 555 /* see whether to save in tmp file */ 556 if(inword && nodelim && !cut->sflag && (c=cp-first)>0) 557 { 558 /* copy line to tmpfile in case no fields */ 559 if(!fdtmp) 560 fdtmp = sftmp(BLOCK); 561 sfwrite(fdtmp,(char*)first,c); 562 offset +=c; 563 } 564 } 565 failed: 566 if(fdtmp) 567 sfclose(fdtmp); 568 } 569 570 int 571 b_cut(int argc, char** argv, Shbltin_t* context) 572 { 573 register char* cp = 0; 574 register Sfio_t* fp; 575 char* s; 576 int n; 577 Cut_t* cut; 578 int mode = 0; 579 Delim_t wdelim; 580 Delim_t ldelim; 581 size_t reclen = 0; 582 583 cmdinit(argc, argv, context, ERROR_CATALOG, 0); 584 wdelim.chr = '\t'; 585 ldelim.chr = '\n'; 586 wdelim.len = ldelim.len = 1; 587 for (;;) 588 { 589 switch (optget(argv, usage)) 590 { 591 case 0: 592 break; 593 case 'b': 594 case 'c': 595 if(mode&C_FIELDS) 596 { 597 error(2, "f option already specified"); 598 continue; 599 } 600 cp = opt_info.arg; 601 if(opt_info.option[1]=='b') 602 mode |= C_BYTES; 603 else 604 mode |= C_CHARS; 605 continue; 606 case 'D': 607 ldelim.str = opt_info.arg; 608 if (mbwide()) 609 { 610 s = opt_info.arg; 611 ldelim.chr = mbchar(s); 612 if ((n = s - opt_info.arg) > 1) 613 { 614 ldelim.len = n; 615 continue; 616 } 617 } 618 ldelim.chr = *(unsigned char*)opt_info.arg; 619 ldelim.len = 1; 620 continue; 621 case 'd': 622 wdelim.str = opt_info.arg; 623 if (mbwide()) 624 { 625 s = opt_info.arg; 626 wdelim.chr = mbchar(s); 627 if ((n = s - opt_info.arg) > 1) 628 { 629 wdelim.len = n; 630 continue; 631 } 632 } 633 wdelim.chr = *(unsigned char*)opt_info.arg; 634 wdelim.len = 1; 635 continue; 636 case 'f': 637 if(mode&(C_CHARS|C_BYTES)) 638 { 639 error(2, "c option already specified"); 640 continue; 641 } 642 cp = opt_info.arg; 643 mode |= C_FIELDS; 644 continue; 645 case 'n': 646 mode |= C_NOSPLIT; 647 continue; 648 case 'N': 649 mode |= C_NONEWLINE; 650 continue; 651 case 'R': 652 case 'r': 653 if(opt_info.num>0) 654 reclen = opt_info.num; 655 continue; 656 case 's': 657 mode |= C_SUPRESS; 658 continue; 659 case ':': 660 error(2, "%s", opt_info.arg); 661 break; 662 case '?': 663 error(ERROR_usage(2), "%s", opt_info.arg); 664 break; 665 } 666 break; 667 } 668 argv += opt_info.index; 669 if (error_info.errors) 670 error(ERROR_usage(2), "%s",optusage(NiL)); 671 if(!cp) 672 { 673 error(2, "b, c or f option must be specified"); 674 error(ERROR_usage(2), "%s", optusage(NiL)); 675 } 676 if(!*cp) 677 error(3, "non-empty b, c or f option must be specified"); 678 if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS) 679 error(3, "s option requires f option"); 680 cut = cutinit(mode, cp, &wdelim, &ldelim, reclen); 681 if(cp = *argv) 682 argv++; 683 do 684 { 685 if(!cp || streq(cp,"-")) 686 fp = sfstdin; 687 else if(!(fp = sfopen(NiL,cp,"r"))) 688 { 689 error(ERROR_system(0),"%s: cannot open",cp); 690 continue; 691 } 692 if(mode&C_FIELDS) 693 cutfields(cut,fp,sfstdout); 694 else 695 cutcols(cut,fp,sfstdout); 696 if(fp!=sfstdin) 697 sfclose(fp); 698 } while(cp = *argv++); 699 if (sfsync(sfstdout)) 700 error(ERROR_system(0), "write error"); 701 return error_info.errors != 0; 702 } 703