1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1992-2007 AT&T Knowledge Ventures * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Knowledge Ventures * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * * 20 ***********************************************************************/ 21 #pragma prototyped 22 /* 23 * David Korn 24 * AT&T Bell Laboratories 25 * 26 * cut [-sN] [-f flist] [-c clist] [-d delim] [-D delim] [-r reclen] [file] ... 27 * 28 * cut fields or columns from fields from a file 29 */ 30 31 static const char usage[] = 32 "[-?\n@(#)$Id: cut (AT&T Research) 2007-01-23 $\n]" 33 USAGE_LICENSE 34 "[+NAME?cut - cut out selected columns or fields of each line of a file]" 35 "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields " 36 "from one or more files, contatenating them on standard output.]" 37 "[+?The option argument \alist\a is a comma-separated or blank-separated " 38 "list of positive numbers and ranges. Ranges can be of three " 39 "forms. The first is two positive integers separated by a hyphen " 40 "(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to " 41 "\ahigh\a. The second is a positive number preceded by a hyphen " 42 "(\b-\b\ahigh\a), which represents all fields from field \b1\b to " 43 "\ahigh\a. The last is a positive number followed by a hyphen " 44 "(\alow\a\b-\b), which represents all fields from \alow\a to the " 45 "last field, inclusive. Elements in the \alist\a can be repeated, " 46 "can overlap, and can appear in any order. The order of the " 47 "output is that of the input.]" 48 "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]" 49 "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b " 50 "cuts from standard input. The start of the file is defined " 51 "as the current offset.]" 52 "[b:bytes]:[list?\bcut\b based on a list of bytes.]" 53 "[c:characters]:[list?\bcut\b based on a list of characters.]" 54 "[d:delimiter]:[delim?The field character for the \b-f\b option is set " 55 "to \adelim\a. The default is the \btab\b character.]" 56 "[f:fields]:[list?\bcut\b based on fields separated by the delimiter " 57 "character specified with the \b-d\b optiion.]" 58 "[n:nosplit?Do not split characters. Currently ignored.]" 59 "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length " 60 "records of length \areclen\a when used with the \b-b\b or \b-c\b " 61 "option.]" 62 "[s:suppress|only-delimited?Suppress lines with no delimiter characters, " 63 "when used with the \b-f\b option. By default, lines with no " 64 "delimiters will be passsed in untouched.]" 65 "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for " 66 "the \b-f\b option is set to \aldelim\a. The default is the " 67 "\bnewline\b character.]" 68 "[N:nonewline?Do not output new-lines at end of each record when used " 69 "with the \b-b\b or \b-c\b option.]" 70 "\n" 71 "\n[file ...]\n" 72 "\n" 73 "[+EXIT STATUS?]{" 74 "[+0?All files processed successfully.]" 75 "[+>0?One or more files failed to open or could not be read.]" 76 "}" 77 "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]" 78 ; 79 80 #include <cmd.h> 81 #include <ctype.h> 82 83 typedef struct Last_s 84 { 85 int seqno; 86 int seq; 87 int wdelim; 88 int ldelim; 89 } Last_t; 90 91 typedef struct Cut_s 92 { 93 int cflag; 94 int sflag; 95 int nlflag; 96 int wdelim; 97 int ldelim; 98 int seqno; 99 int reclen; 100 signed char space[UCHAR_MAX]; 101 Last_t last; 102 int list[2]; /* NOTE: must be last member */ 103 } Cut_t; 104 105 #define HUGE (1<<14) 106 #define BLOCK 8*1024 107 #define C_BYTES 1 108 #define C_CHARS 2 109 #define C_FIELDS 4 110 #define C_SUPRESS 8 111 #define C_NOCHOP 16 112 #define C_NONEWLINE 32 113 114 /* 115 * compare the first of an array of integers 116 */ 117 118 static int mycomp(register const void *a,register const void *b) 119 { 120 return(*((int*)a) - *((int*)b)); 121 } 122 123 static Cut_t *cutinit(int mode,char *str,int wdelim,int ldelim,size_t reclen) 124 { 125 register int *lp, c, n=0; 126 register int range = 0; 127 register char *cp = str; 128 Cut_t *cuthdr; 129 if (!(cuthdr = (Cut_t*)stakalloc(sizeof(Cut_t)+strlen(cp)*sizeof(int)))) 130 error(ERROR_exit(1), "out of space"); 131 memset(cuthdr->space, 0, sizeof(cuthdr->space)); 132 cuthdr->last.seqno = 0; 133 cuthdr->last.seq = 0; 134 cuthdr->last.wdelim = 0; 135 cuthdr->last.ldelim = '\n'; 136 cuthdr->cflag = ((mode&C_CHARS)!=0 && mbwide()); 137 cuthdr->sflag = ((mode&C_SUPRESS)!=0); 138 cuthdr->nlflag = ((mode&C_NONEWLINE)!=0); 139 cuthdr->wdelim = wdelim; 140 cuthdr->ldelim = ldelim; 141 cuthdr->reclen = reclen; 142 cuthdr->seqno = ++cuthdr->last.seqno; 143 lp = cuthdr->list; 144 while(1) switch(c= *cp++) 145 { 146 case ' ': 147 case '\t': 148 while(*cp==' ' || *cp=='\t') 149 cp++; 150 case 0: 151 case ',': 152 if(range) 153 { 154 --range; 155 if((n = (n==0?HUGE:n-range)) < 0) 156 error(ERROR_exit(1),"invalid range for c/f option"); 157 *lp++ = range; 158 *lp++ = n; 159 } 160 else 161 { 162 *lp++ = --n; 163 *lp++ = 1; 164 } 165 if(c==0) 166 { 167 register int *dp; 168 *lp = HUGE; 169 n = 1 + (lp-cuthdr->list)/2; 170 qsort(lp=cuthdr->list,n,2*sizeof(*lp),mycomp); 171 /* eliminate overlapping regions */ 172 for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2) 173 { 174 if(lp[0] <= range) 175 { 176 if(lp[1]==HUGE) 177 { 178 dp[-1] = HUGE; 179 break; 180 } 181 if((c = lp[0]+lp[1]-range)>0) 182 { 183 range += c; 184 dp[-1] += c; 185 } 186 } 187 else 188 { 189 range = *dp++ = lp[0]; 190 if(lp[1]==HUGE) 191 { 192 *dp++ = HUGE; 193 break; 194 } 195 range += (*dp++ = lp[1]); 196 } 197 } 198 *dp = HUGE; 199 lp = cuthdr->list; 200 /* convert ranges into gaps */ 201 for(n=0; *lp!=HUGE; lp+=2) 202 { 203 c = *lp; 204 *lp -= n; 205 n = c+lp[1]; 206 } 207 return(cuthdr); 208 } 209 n = range = 0; 210 break; 211 212 case '-': 213 if(range) 214 error(ERROR_exit(1),"bad list for c/f option"); 215 range = n?n:1; 216 n = 0; 217 break; 218 219 default: 220 if(!isdigit(c)) 221 error(ERROR_exit(1),"bad list for c/f option"); 222 n = 10*n + (c-'0'); 223 } 224 /* NOTREACHED */ 225 } 226 227 /* 228 * advance <cp> by <n> multi-byte characters 229 */ 230 static int advance(const char *str, register int n, register int inlen) 231 { 232 register int size, len=inlen; 233 register const char *cp=str; 234 while(len>0 && n-->0) 235 { 236 size = mblen(cp, len); 237 if(size<0) 238 size = 1; 239 cp += size; 240 len -= size; 241 242 } 243 if(n>0) 244 return(inlen+1); 245 return(cp-str); 246 } 247 248 /* 249 * cut each line of file <fdin> and put results to <fdout> using list <list> 250 */ 251 252 static int cutcols(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout) 253 { 254 register int c, ncol=0,len; 255 register const int *lp = cuthdr->list; 256 register char *inp; 257 register int skip; /* non-zero for don't copy */ 258 while(1) 259 { 260 if(len = cuthdr->reclen) 261 inp = sfreserve(fdin, len, -1); 262 else 263 inp = sfgetr(fdin, '\n', 0); 264 if(!inp && !(inp = sfgetr(fdin, 0, SF_LASTR))) 265 break; 266 len = sfvalue(fdin); 267 if((ncol = skip = *(lp = cuthdr->list)) == 0) 268 ncol = *++lp; 269 while(1) 270 { 271 if((c=(cuthdr->cflag?advance(inp,ncol,len):ncol)) > len) 272 c = len; 273 else if(c==len && !skip) 274 ncol++; 275 ncol -= c; 276 if(!skip && sfwrite(fdout,(char*)inp,c)<0) 277 return(-1); 278 inp += c; 279 if(ncol) 280 break; 281 len -= c; 282 ncol = *++lp; 283 skip = !skip; 284 } 285 if(!cuthdr->nlflag && (skip || cuthdr->reclen)) 286 sfputc(fdout,cuthdr->ldelim); 287 } 288 return(c); 289 } 290 291 /* 292 * cut each line of file <fdin> and put results to <fdout> using list <list> 293 * stream <fdin> must be line buffered 294 */ 295 296 #define endline(c) (((signed char)-1)<0?(c)<0:(c)==((char)-1)) 297 298 static int cutfields(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout) 299 { 300 register unsigned char *cp; 301 register int c, nfields; 302 register const int *lp = cuthdr->list; 303 register unsigned char *copy; 304 register int nodelim, empty, inword=0; 305 register unsigned char *endbuff; 306 unsigned char *inbuff, *first; 307 int lastchar; 308 Sfio_t *fdtmp = 0; 309 long offset = 0; 310 if(cuthdr->seqno != cuthdr->last.seq) 311 { 312 cuthdr->space[cuthdr->last.ldelim] = 0; 313 cuthdr->space[cuthdr->last.wdelim] = 0; 314 cuthdr->space[cuthdr->last.wdelim=cuthdr->wdelim] = 1; 315 cuthdr->space[cuthdr->last.ldelim=cuthdr->ldelim] = -1; 316 cuthdr->last.seq = cuthdr->seqno; 317 } 318 /* process each buffer */ 319 while ((inbuff = (unsigned char*)sfreserve(fdin, SF_UNBOUND, 0)) && (c = sfvalue(fdin)) > 0) 320 { 321 cp = inbuff; 322 endbuff = cp + --c; 323 if((lastchar = cp[c]) != cuthdr->ldelim) 324 *endbuff = cuthdr->ldelim; 325 /* process each line in the buffer */ 326 while(cp <= endbuff) 327 { 328 first = cp; 329 if(!inword) 330 { 331 nodelim = empty = 1; 332 copy = cp; 333 if(nfields = *(lp = cuthdr->list)) 334 copy = 0; 335 else 336 nfields = *++lp; 337 } 338 else if(copy) 339 copy = cp; 340 inword = 0; 341 while(!inword) 342 { 343 /* skip over non-delimiter characters */ 344 while(!(c=cuthdr->space[*cp++])); 345 /* check for end-of-line */ 346 if(endline(c)) 347 { 348 if(cp<=endbuff) 349 break; 350 if((c=cuthdr->space[lastchar]),endline(c)) 351 break; 352 /* restore cuthdr->last. character */ 353 if(lastchar != cuthdr->ldelim) 354 *endbuff = lastchar; 355 inword++; 356 if(!c) 357 break; 358 } 359 nodelim = 0; 360 if(--nfields >0) 361 continue; 362 nfields = *++lp; 363 if(copy) 364 { 365 empty = 0; 366 if((c=(cp-1)-copy)>0 && sfwrite(fdout,(char*)copy,c)< 0) 367 goto failed; 368 copy = 0; 369 } 370 else 371 /* set to delimiter unless the first field */ 372 copy = cp -!empty; 373 } 374 if(!inword) 375 { 376 if(!copy) 377 { 378 if(nodelim) 379 { 380 if(!cuthdr->sflag) 381 { 382 if(offset) 383 { 384 sfseek(fdtmp,(Sfoff_t)0,SEEK_SET); 385 sfmove(fdtmp,fdout,offset,-1); 386 } 387 copy = first; 388 } 389 } 390 else 391 sfputc(fdout,'\n'); 392 } 393 if(offset) 394 sfseek(fdtmp,offset=0,SEEK_SET); 395 } 396 if(copy && (c=cp-copy)>0 && (!nodelim || !cuthdr->sflag) && sfwrite(fdout,(char*)copy,c)< 0) 397 goto failed; 398 } 399 /* see whether to save in tmp file */ 400 if(nodelim && inword && !cuthdr->sflag && (c=cp-first)>0) 401 { 402 /* copy line to tmpfile in case no fields */ 403 if(!fdtmp) 404 fdtmp = sftmp(BLOCK); 405 sfwrite(fdtmp,(char*)first,c); 406 offset +=c; 407 } 408 } 409 failed: 410 if(fdtmp) 411 sfclose(fdtmp); 412 return(0); 413 } 414 415 int 416 b_cut(int argc,char *argv[], void* context) 417 { 418 register char *cp = 0; 419 register Sfio_t *fp; 420 int n; 421 Cut_t *cuthdr; 422 int mode = 0; 423 int wdelim = '\t'; 424 int ldelim = '\n'; 425 size_t reclen = 0; 426 427 cmdinit(argc, argv, context, ERROR_CATALOG, 0); 428 while (n = optget(argv, usage)) switch (n) 429 { 430 case 'b': 431 case 'c': 432 if(mode&C_FIELDS) 433 { 434 error(2, "f option already specified"); 435 break; 436 } 437 cp = opt_info.arg; 438 if(n=='b') 439 mode |= C_BYTES; 440 else 441 mode |= C_CHARS; 442 break; 443 case 'D': 444 ldelim = *(unsigned char*)opt_info.arg; 445 break; 446 case 'd': 447 wdelim = *(unsigned char*)opt_info.arg; 448 break; 449 case 'f': 450 if(mode&(C_CHARS|C_BYTES)) 451 { 452 error(2, "c option already specified"); 453 break; 454 } 455 cp = opt_info.arg; 456 mode |= C_FIELDS; 457 break; 458 case 'n': 459 mode |= C_NOCHOP; 460 break; 461 case 'N': 462 mode |= C_NONEWLINE; 463 break; 464 case 'R': 465 case 'r': 466 if(opt_info.num>0) 467 reclen = opt_info.num; 468 break; 469 case 's': 470 mode |= C_SUPRESS; 471 break; 472 case ':': 473 error(2, "%s", opt_info.arg); 474 break; 475 case '?': 476 error(ERROR_usage(2), "%s", opt_info.arg); 477 break; 478 } 479 argv += opt_info.index; 480 if (error_info.errors) 481 error(ERROR_usage(2), "%s",optusage(NiL)); 482 if(!cp) 483 { 484 error(2, "b, c or f option must be specified"); 485 error(ERROR_usage(2), "%s", optusage(NiL)); 486 } 487 if(!*cp) 488 error(3, "non-empty b, c or f option must be specified"); 489 if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS) 490 error(3, "s option requires f option"); 491 cuthdr = cutinit(mode,cp,wdelim,ldelim,reclen); 492 if(cp = *argv) 493 argv++; 494 do 495 { 496 if(!cp || streq(cp,"-")) 497 fp = sfstdin; 498 else if(!(fp = sfopen(NiL,cp,"r"))) 499 { 500 error(ERROR_system(0),"%s: cannot open",cp); 501 continue; 502 } 503 if(mode&C_FIELDS) 504 cutfields(cuthdr,fp,sfstdout); 505 else 506 cutcols(cuthdr,fp,sfstdout); 507 if(fp!=sfstdin) 508 sfclose(fp); 509 } 510 while(cp= *argv++); 511 return(error_info.errors?1:0); 512 } 513