1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1992-2012 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Eclipse Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.eclipse.org/org/documents/epl-v10.html * 11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * * 20 ***********************************************************************/ 21 #pragma prototyped 22 /* 23 * David Korn 24 * AT&T Bell Laboratories 25 * 26 * cut fields or columns from fields from a file 27 */ 28 29 static const char usage[] = 30 "[-?\n@(#)$Id: cut (AT&T Research) 2010-08-11 $\n]" 31 USAGE_LICENSE 32 "[+NAME?cut - cut out selected columns or fields of each line of a file]" 33 "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields " 34 "from one or more files, contatenating them on standard output.]" 35 "[+?The option argument \alist\a is a comma-separated or blank-separated " 36 "list of positive numbers and ranges. Ranges can be of three " 37 "forms. The first is two positive integers separated by a hyphen " 38 "(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to " 39 "\ahigh\a. The second is a positive number preceded by a hyphen " 40 "(\b-\b\ahigh\a), which represents all fields from field \b1\b to " 41 "\ahigh\a. The last is a positive number followed by a hyphen " 42 "(\alow\a\b-\b), which represents all fields from \alow\a to the " 43 "last field, inclusive. Elements in the \alist\a can be repeated, " 44 "can overlap, and can appear in any order. The order of the " 45 "output is that of the input.]" 46 "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]" 47 "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b " 48 "cuts from standard input. The start of the file is defined " 49 "as the current offset.]" 50 "[b:bytes]:[list?\bcut\b based on a list of byte counts.]" 51 "[c:characters]:[list?\bcut\b based on a list of character counts.]" 52 "[d:delimiter]:[delim?The field character for the \b-f\b option is set " 53 "to \adelim\a. The default is the \btab\b character.]" 54 "[f:fields]:[list?\bcut\b based on fields separated by the delimiter " 55 "character specified with the \b-d\b optiion.]" 56 "[n!:split?Split multibyte characters selected by the \b-b\b option.]" 57 "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length " 58 "records of length \areclen\a when used with the \b-b\b or \b-c\b " 59 "option.]" 60 "[s:suppress|only-delimited?Suppress lines with no delimiter characters, " 61 "when used with the \b-f\b option. By default, lines with no " 62 "delimiters will be passsed in untouched.]" 63 "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for " 64 "the \b-f\b option is set to \aldelim\a. The default is the " 65 "\bnewline\b character.]" 66 "[N!:newline?Output new-lines at end of each record when used " 67 "with the \b-b\b or \b-c\b option.]" 68 "\n" 69 "\n[file ...]\n" 70 "\n" 71 "[+EXIT STATUS?]{" 72 "[+0?All files processed successfully.]" 73 "[+>0?One or more files failed to open or could not be read.]" 74 "}" 75 "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]" 76 ; 77 78 #include <cmd.h> 79 #include <ctype.h> 80 81 typedef struct Delim_s 82 { 83 char* str; 84 int len; 85 int chr; 86 } Delim_t; 87 88 typedef struct Cut_s 89 { 90 int mb; 91 int eob; 92 int cflag; 93 int nosplit; 94 int sflag; 95 int nlflag; 96 int reclen; 97 Delim_t wdelim; 98 Delim_t ldelim; 99 unsigned char space[UCHAR_MAX+1]; 100 int list[2]; /* NOTE: must be last member */ 101 } Cut_t; 102 103 #define HUGE INT_MAX 104 #define BLOCK 8*1024 105 #define C_BYTES 1 106 #define C_CHARS 2 107 #define C_FIELDS 4 108 #define C_SUPRESS 8 109 #define C_NOSPLIT 16 110 #define C_NONEWLINE 32 111 112 #define SP_LINE 1 113 #define SP_WORD 2 114 #define SP_WIDE 3 115 116 /* 117 * compare the first of an array of integers 118 */ 119 120 static int 121 mycomp(register const void* a, register const void* b) 122 { 123 if (*((int*)a) < *((int*)b)) 124 return -1; 125 if (*((int*)a) > *((int*)b)) 126 return 1; 127 return 0; 128 } 129 130 static Cut_t* 131 cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen) 132 { 133 register int* lp; 134 register int c; 135 register int n = 0; 136 register int range = 0; 137 register char* cp = str; 138 Cut_t* cut; 139 140 if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int)))) 141 error(ERROR_exit(1), "out of space"); 142 if (cut->mb = mbwide()) 143 { 144 memset(cut->space, 0, sizeof(cut->space) / 2); 145 memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2); 146 } 147 else 148 memset(cut->space, 0, sizeof(cut->space)); 149 cut->wdelim = *wdelim; 150 if (wdelim->len == 1) 151 cut->space[wdelim->chr] = SP_WORD; 152 cut->ldelim = *ldelim; 153 cut->eob = (ldelim->len == 1) ? ldelim->chr : 0; 154 cut->space[cut->eob] = SP_LINE; 155 cut->cflag = (mode&C_CHARS) && cut->mb; 156 cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb; 157 cut->sflag = (mode&C_SUPRESS) != 0; 158 cut->nlflag = (mode&C_NONEWLINE) != 0; 159 cut->reclen = reclen; 160 lp = cut->list; 161 for (;;) 162 switch(c = *cp++) 163 { 164 case ' ': 165 case '\t': 166 while(*cp==' ' || *cp=='\t') 167 cp++; 168 /*FALLTHROUGH*/ 169 case 0: 170 case ',': 171 if(range) 172 { 173 --range; 174 if((n = (n ? (n-range) : (HUGE-1))) < 0) 175 error(ERROR_exit(1),"invalid range for c/f option"); 176 *lp++ = range; 177 *lp++ = n; 178 } 179 else 180 { 181 *lp++ = --n; 182 *lp++ = 1; 183 } 184 if(c==0) 185 { 186 register int *dp; 187 *lp = HUGE; 188 n = 1 + (lp-cut->list)/2; 189 qsort(lp=cut->list,n,2*sizeof(*lp),mycomp); 190 /* eliminate overlapping regions */ 191 for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2) 192 { 193 if(lp[0] <= range) 194 { 195 if(lp[1]==HUGE) 196 { 197 dp[-1] = HUGE; 198 break; 199 } 200 if((c = lp[0]+lp[1]-range)>0) 201 { 202 range += c; 203 dp[-1] += c; 204 } 205 } 206 else 207 { 208 range = *dp++ = lp[0]; 209 if(lp[1]==HUGE) 210 { 211 *dp++ = HUGE; 212 break; 213 } 214 range += (*dp++ = lp[1]); 215 } 216 } 217 *dp = HUGE; 218 lp = cut->list; 219 /* convert ranges into gaps */ 220 for(n=0; *lp!=HUGE; lp+=2) 221 { 222 c = *lp; 223 *lp -= n; 224 n = c+lp[1]; 225 } 226 return cut; 227 } 228 n = range = 0; 229 break; 230 231 case '-': 232 if(range) 233 error(ERROR_exit(1),"bad list for c/f option"); 234 range = n?n:1; 235 n = 0; 236 break; 237 238 default: 239 if(!isdigit(c)) 240 error(ERROR_exit(1),"bad list for c/f option"); 241 n = 10*n + (c-'0'); 242 break; 243 } 244 /* NOTREACHED */ 245 } 246 247 /* 248 * cut each line of file <fdin> and put results to <fdout> using list <list> 249 */ 250 251 static void 252 cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout) 253 { 254 register int c; 255 register int len; 256 register int ncol = 0; 257 register const int* lp = cut->list; 258 register char* bp; 259 register int skip; /* non-zero for don't copy */ 260 int must; 261 const char* xx; 262 263 for (;;) 264 { 265 if (len = cut->reclen) 266 bp = sfreserve(fdin, len, -1); 267 else 268 bp = sfgetr(fdin, '\n', 0); 269 if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR))) 270 break; 271 len = sfvalue(fdin); 272 xx = 0; 273 if (!(ncol = skip = *(lp = cut->list))) 274 ncol = *++lp; 275 must = 1; 276 do 277 { 278 if (cut->nosplit) 279 { 280 register const char* s = bp; 281 register int w = len < ncol ? len : ncol; 282 register int z; 283 284 while (w > 0) 285 { 286 if (!(*s & 0x80)) 287 z = 1; 288 else if ((z = mbnsize(s, w)) <= 0) 289 { 290 if (s == bp && xx) 291 { 292 w += s - xx; 293 bp = (char*)(s = xx); 294 xx = 0; 295 continue; 296 } 297 xx = s; 298 if (skip) 299 s += w; 300 w = 0; 301 break; 302 } 303 s += z; 304 w -= z; 305 } 306 c = s - bp; 307 ncol = !w && ncol >= len; 308 } 309 else if (cut->cflag) 310 { 311 register const char* s = bp; 312 register int w = len; 313 register int z; 314 315 while (w > 0 && ncol > 0) 316 { 317 ncol--; 318 if (!(*s & 0x80) || (z = mbnsize(s, w)) <= 0) 319 z = 1; 320 s += z; 321 w -= z; 322 323 } 324 c = s - bp; 325 ncol = !w && (ncol || !skip); 326 } 327 else 328 { 329 if ((c = ncol) > len) 330 c = len; 331 else if (c == len && !skip) 332 ncol++; 333 ncol -= c; 334 } 335 if (!skip && c) 336 { 337 if (sfwrite(fdout, (char*)bp, c) < 0) 338 return; 339 must = 0; 340 } 341 bp += c; 342 if (ncol) 343 break; 344 len -= c; 345 ncol = *++lp; 346 skip = !skip; 347 } while (ncol != HUGE); 348 if (!cut->nlflag && (skip || must || cut->reclen)) 349 { 350 if (cut->ldelim.len > 1) 351 sfwrite(fdout, cut->ldelim.str, cut->ldelim.len); 352 else 353 sfputc(fdout, cut->ldelim.chr); 354 } 355 } 356 } 357 358 /* 359 * cut each line of file <fdin> and put results to <fdout> using list <list> 360 * stream <fdin> must be line buffered 361 */ 362 363 static void 364 cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout) 365 { 366 register unsigned char *sp = cut->space; 367 register unsigned char *cp; 368 register unsigned char *wp; 369 register int c, nfields; 370 register const int *lp = cut->list; 371 register unsigned char *copy; 372 register int nodelim, empty, inword=0; 373 register unsigned char *ep; 374 unsigned char *bp, *first; 375 int lastchar; 376 wchar_t w; 377 Sfio_t *fdtmp = 0; 378 long offset = 0; 379 unsigned char mb[8]; 380 /* process each buffer */ 381 while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0) 382 { 383 cp = bp; 384 ep = cp + --c; 385 if((lastchar = cp[c]) != cut->eob) 386 *ep = cut->eob; 387 /* process each line in the buffer */ 388 while (cp <= ep) 389 { 390 first = cp; 391 if (!inword) 392 { 393 nodelim = empty = 1; 394 copy = cp; 395 if (nfields = *(lp = cut->list)) 396 copy = 0; 397 else 398 nfields = *++lp; 399 } 400 else if (copy) 401 copy = cp; 402 inword = 0; 403 do 404 { 405 /* skip over non-delimiter characters */ 406 if (cut->mb) 407 for (;;) 408 { 409 switch (c = sp[*(unsigned char*)cp++]) 410 { 411 case 0: 412 continue; 413 case SP_WIDE: 414 wp = --cp; 415 while ((c = mb2wc(w, cp, ep - cp)) <= 0) 416 { 417 /* mb char possibly spanning buffer boundary -- fun stuff */ 418 if ((ep - cp) < mbmax()) 419 { 420 int i; 421 int j; 422 int k; 423 424 if (lastchar != cut->eob) 425 { 426 *ep = lastchar; 427 if ((c = mb2wc(w, cp, ep - cp)) > 0) 428 break; 429 } 430 if (copy) 431 { 432 empty = 0; 433 if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0) 434 goto failed; 435 } 436 for (i = 0; i <= (ep - cp); i++) 437 mb[i] = cp[i]; 438 if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0) 439 goto failed; 440 cp = bp; 441 ep = cp + --c; 442 if ((lastchar = cp[c]) != cut->eob) 443 *ep = cut->eob; 444 j = i; 445 k = 0; 446 while (j < mbmax()) 447 mb[j++] = cp[k++]; 448 if ((c = mb2wc(w, (char*)mb, j)) <= 0) 449 { 450 c = i; 451 w = 0; 452 } 453 first = bp = cp += c - i; 454 if (copy) 455 { 456 copy = bp; 457 if (w == cut->ldelim.chr) 458 lastchar = cut->ldelim.chr; 459 else if (w != cut->wdelim.chr) 460 { 461 empty = 0; 462 if (sfwrite(fdout, (char*)mb, c) < 0) 463 goto failed; 464 } 465 } 466 c = 0; 467 } 468 else 469 { 470 w = *cp; 471 c = 1; 472 } 473 break; 474 } 475 cp += c; 476 c = w; 477 if (c == cut->wdelim.chr) 478 { 479 c = SP_WORD; 480 break; 481 } 482 if (c == cut->ldelim.chr) 483 { 484 c = SP_LINE; 485 break; 486 } 487 continue; 488 default: 489 wp = cp - 1; 490 break; 491 } 492 break; 493 } 494 else 495 { 496 while (!(c = sp[*cp++])); 497 wp = cp - 1; 498 } 499 /* check for end-of-line */ 500 if (c == SP_LINE) 501 { 502 if (cp <= ep) 503 break; 504 if (lastchar == cut->ldelim.chr) 505 break; 506 /* restore cut->last character */ 507 if (lastchar != cut->eob) 508 *ep = lastchar; 509 inword++; 510 if (!sp[lastchar]) 511 break; 512 } 513 nodelim = 0; 514 if (--nfields > 0) 515 continue; 516 nfields = *++lp; 517 if (copy) 518 { 519 empty = 0; 520 if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0) 521 goto failed; 522 copy = 0; 523 } 524 else 525 /* set to delimiter unless the first field */ 526 copy = empty ? cp : wp; 527 } while (!inword); 528 if (!inword) 529 { 530 if (!copy) 531 { 532 if (nodelim) 533 { 534 if (!cut->sflag) 535 { 536 if (offset) 537 { 538 sfseek(fdtmp,(Sfoff_t)0,SEEK_SET); 539 sfmove(fdtmp,fdout,offset,-1); 540 } 541 copy = first; 542 } 543 } 544 else 545 sfputc(fdout,'\n'); 546 } 547 if (offset) 548 sfseek(fdtmp,offset=0,SEEK_SET); 549 } 550 if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0) 551 goto failed; 552 } 553 /* see whether to save in tmp file */ 554 if(inword && nodelim && !cut->sflag && (c=cp-first)>0) 555 { 556 /* copy line to tmpfile in case no fields */ 557 if(!fdtmp) 558 fdtmp = sftmp(BLOCK); 559 sfwrite(fdtmp,(char*)first,c); 560 offset +=c; 561 } 562 } 563 failed: 564 if(fdtmp) 565 sfclose(fdtmp); 566 } 567 568 int 569 b_cut(int argc, char** argv, Shbltin_t* context) 570 { 571 register char* cp = 0; 572 register Sfio_t* fp; 573 char* s; 574 int n; 575 Cut_t* cut; 576 int mode = 0; 577 Delim_t wdelim; 578 Delim_t ldelim; 579 size_t reclen = 0; 580 581 cmdinit(argc, argv, context, ERROR_CATALOG, 0); 582 wdelim.chr = '\t'; 583 ldelim.chr = '\n'; 584 wdelim.len = ldelim.len = 1; 585 for (;;) 586 { 587 switch (optget(argv, usage)) 588 { 589 case 0: 590 break; 591 case 'b': 592 case 'c': 593 if(mode&C_FIELDS) 594 { 595 error(2, "f option already specified"); 596 continue; 597 } 598 cp = opt_info.arg; 599 if(opt_info.option[1]=='b') 600 mode |= C_BYTES; 601 else 602 mode |= C_CHARS; 603 continue; 604 case 'D': 605 ldelim.str = opt_info.arg; 606 if (mbwide()) 607 { 608 s = opt_info.arg; 609 ldelim.chr = mbchar(s); 610 if ((n = s - opt_info.arg) > 1) 611 { 612 ldelim.len = n; 613 continue; 614 } 615 } 616 ldelim.chr = *(unsigned char*)opt_info.arg; 617 ldelim.len = 1; 618 continue; 619 case 'd': 620 wdelim.str = opt_info.arg; 621 if (mbwide()) 622 { 623 s = opt_info.arg; 624 wdelim.chr = mbchar(s); 625 if ((n = s - opt_info.arg) > 1) 626 { 627 wdelim.len = n; 628 continue; 629 } 630 } 631 wdelim.chr = *(unsigned char*)opt_info.arg; 632 wdelim.len = 1; 633 continue; 634 case 'f': 635 if(mode&(C_CHARS|C_BYTES)) 636 { 637 error(2, "c option already specified"); 638 continue; 639 } 640 cp = opt_info.arg; 641 mode |= C_FIELDS; 642 continue; 643 case 'n': 644 mode |= C_NOSPLIT; 645 continue; 646 case 'N': 647 mode |= C_NONEWLINE; 648 continue; 649 case 'R': 650 case 'r': 651 if(opt_info.num>0) 652 reclen = opt_info.num; 653 continue; 654 case 's': 655 mode |= C_SUPRESS; 656 continue; 657 case ':': 658 error(2, "%s", opt_info.arg); 659 break; 660 case '?': 661 error(ERROR_usage(2), "%s", opt_info.arg); 662 break; 663 } 664 break; 665 } 666 argv += opt_info.index; 667 if (error_info.errors) 668 error(ERROR_usage(2), "%s",optusage(NiL)); 669 if(!cp) 670 { 671 error(2, "b, c or f option must be specified"); 672 error(ERROR_usage(2), "%s", optusage(NiL)); 673 } 674 if(!*cp) 675 error(3, "non-empty b, c or f option must be specified"); 676 if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS) 677 error(3, "s option requires f option"); 678 cut = cutinit(mode, cp, &wdelim, &ldelim, reclen); 679 if(cp = *argv) 680 argv++; 681 do 682 { 683 if(!cp || streq(cp,"-")) 684 fp = sfstdin; 685 else if(!(fp = sfopen(NiL,cp,"r"))) 686 { 687 error(ERROR_system(0),"%s: cannot open",cp); 688 continue; 689 } 690 if(mode&C_FIELDS) 691 cutfields(cut,fp,sfstdout); 692 else 693 cutcols(cut,fp,sfstdout); 694 if(fp!=sfstdin) 695 sfclose(fp); 696 } while(cp = *argv++); 697 if (sfsync(sfstdout)) 698 error(ERROR_system(0), "write error"); 699 return error_info.errors != 0; 700 } 701