1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1992-2008 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * * 20 ***********************************************************************/ 21 #pragma prototyped 22 /* 23 * David Korn 24 * AT&T Bell Laboratories 25 * 26 * cut [-sN] [-f flist] [-c clist] [-d delim] [-D delim] [-r reclen] [file] ... 27 * 28 * cut fields or columns from fields from a file 29 */ 30 31 static const char usage[] = 32 "[-?\n@(#)$Id: cut (AT&T Research) 2008-04-01 $\n]" 33 USAGE_LICENSE 34 "[+NAME?cut - cut out selected columns or fields of each line of a file]" 35 "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields " 36 "from one or more files, contatenating them on standard output.]" 37 "[+?The option argument \alist\a is a comma-separated or blank-separated " 38 "list of positive numbers and ranges. Ranges can be of three " 39 "forms. The first is two positive integers separated by a hyphen " 40 "(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to " 41 "\ahigh\a. The second is a positive number preceded by a hyphen " 42 "(\b-\b\ahigh\a), which represents all fields from field \b1\b to " 43 "\ahigh\a. The last is a positive number followed by a hyphen " 44 "(\alow\a\b-\b), which represents all fields from \alow\a to the " 45 "last field, inclusive. Elements in the \alist\a can be repeated, " 46 "can overlap, and can appear in any order. The order of the " 47 "output is that of the input.]" 48 "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]" 49 "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b " 50 "cuts from standard input. The start of the file is defined " 51 "as the current offset.]" 52 "[b:bytes]:[list?\bcut\b based on a list of bytes.]" 53 "[c:characters]:[list?\bcut\b based on a list of characters.]" 54 "[d:delimiter]:[delim?The field character for the \b-f\b option is set " 55 "to \adelim\a. The default is the \btab\b character.]" 56 "[f:fields]:[list?\bcut\b based on fields separated by the delimiter " 57 "character specified with the \b-d\b optiion.]" 58 "[n:nosplit?Do not split characters. Currently ignored.]" 59 "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length " 60 "records of length \areclen\a when used with the \b-b\b or \b-c\b " 61 "option.]" 62 "[s:suppress|only-delimited?Suppress lines with no delimiter characters, " 63 "when used with the \b-f\b option. By default, lines with no " 64 "delimiters will be passsed in untouched.]" 65 "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for " 66 "the \b-f\b option is set to \aldelim\a. The default is the " 67 "\bnewline\b character.]" 68 "[N:nonewline?Do not output new-lines at end of each record when used " 69 "with the \b-b\b or \b-c\b option.]" 70 "\n" 71 "\n[file ...]\n" 72 "\n" 73 "[+EXIT STATUS?]{" 74 "[+0?All files processed successfully.]" 75 "[+>0?One or more files failed to open or could not be read.]" 76 "}" 77 "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]" 78 ; 79 80 #include <cmd.h> 81 #include <ctype.h> 82 83 typedef struct Last_s 84 { 85 int seqno; 86 int seq; 87 int wdelim; 88 int ldelim; 89 } Last_t; 90 91 typedef struct Cut_s 92 { 93 int cflag; 94 int sflag; 95 int nlflag; 96 int wdelim; 97 int ldelim; 98 int seqno; 99 int reclen; 100 signed char space[UCHAR_MAX]; 101 Last_t last; 102 int list[2]; /* NOTE: must be last member */ 103 } Cut_t; 104 105 #define HUGE (1<<14) 106 #define BLOCK 8*1024 107 #define C_BYTES 1 108 #define C_CHARS 2 109 #define C_FIELDS 4 110 #define C_SUPRESS 8 111 #define C_NOCHOP 16 112 #define C_NONEWLINE 32 113 114 /* 115 * compare the first of an array of integers 116 */ 117 118 static int mycomp(register const void *a,register const void *b) 119 { 120 return(*((int*)a) - *((int*)b)); 121 } 122 123 static Cut_t *cutinit(int mode,char *str,int wdelim,int ldelim,size_t reclen) 124 { 125 register int *lp, c, n=0; 126 register int range = 0; 127 register char *cp = str; 128 Cut_t *cuthdr; 129 if (!(cuthdr = (Cut_t*)stakalloc(sizeof(Cut_t)+strlen(cp)*sizeof(int)))) 130 error(ERROR_exit(1), "out of space"); 131 memset(cuthdr->space, 0, sizeof(cuthdr->space)); 132 cuthdr->last.seqno = 0; 133 cuthdr->last.seq = 0; 134 cuthdr->last.wdelim = 0; 135 cuthdr->last.ldelim = '\n'; 136 cuthdr->cflag = ((mode&C_CHARS)!=0 && mbwide()); 137 cuthdr->sflag = ((mode&C_SUPRESS)!=0); 138 cuthdr->nlflag = ((mode&C_NONEWLINE)!=0); 139 cuthdr->wdelim = wdelim; 140 cuthdr->ldelim = ldelim; 141 cuthdr->reclen = reclen; 142 cuthdr->seqno = ++cuthdr->last.seqno; 143 lp = cuthdr->list; 144 while(1) switch(c= *cp++) 145 { 146 case ' ': 147 case '\t': 148 while(*cp==' ' || *cp=='\t') 149 cp++; 150 case 0: 151 case ',': 152 if(range) 153 { 154 --range; 155 if((n = (n==0?HUGE:n-range)) < 0) 156 error(ERROR_exit(1),"invalid range for c/f option"); 157 *lp++ = range; 158 *lp++ = n; 159 } 160 else 161 { 162 *lp++ = --n; 163 *lp++ = 1; 164 } 165 if(c==0) 166 { 167 register int *dp; 168 *lp = HUGE; 169 n = 1 + (lp-cuthdr->list)/2; 170 qsort(lp=cuthdr->list,n,2*sizeof(*lp),mycomp); 171 /* eliminate overlapping regions */ 172 for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2) 173 { 174 if(lp[0] <= range) 175 { 176 if(lp[1]==HUGE) 177 { 178 dp[-1] = HUGE; 179 break; 180 } 181 if((c = lp[0]+lp[1]-range)>0) 182 { 183 range += c; 184 dp[-1] += c; 185 } 186 } 187 else 188 { 189 range = *dp++ = lp[0]; 190 if(lp[1]==HUGE) 191 { 192 *dp++ = HUGE; 193 break; 194 } 195 range += (*dp++ = lp[1]); 196 } 197 } 198 *dp = HUGE; 199 lp = cuthdr->list; 200 /* convert ranges into gaps */ 201 for(n=0; *lp!=HUGE; lp+=2) 202 { 203 c = *lp; 204 *lp -= n; 205 n = c+lp[1]; 206 } 207 return(cuthdr); 208 } 209 n = range = 0; 210 break; 211 212 case '-': 213 if(range) 214 error(ERROR_exit(1),"bad list for c/f option"); 215 range = n?n:1; 216 n = 0; 217 break; 218 219 default: 220 if(!isdigit(c)) 221 error(ERROR_exit(1),"bad list for c/f option"); 222 n = 10*n + (c-'0'); 223 } 224 /* NOTREACHED */ 225 } 226 227 /* 228 * advance <cp> by <n> multi-byte characters 229 */ 230 static int advance(const char *str, register int n, register int inlen) 231 { 232 register int size, len=inlen; 233 register const char *cp=str; 234 while(len>0 && n-->0) 235 { 236 size = mblen(cp, len); 237 if(size<0) 238 size = 1; 239 cp += size; 240 len -= size; 241 242 } 243 if(n>0) 244 return(inlen+1); 245 return(cp-str); 246 } 247 248 /* 249 * cut each line of file <fdin> and put results to <fdout> using list <list> 250 */ 251 252 static void cutcols(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout) 253 { 254 register int c, ncol=0,len; 255 register const int *lp = cuthdr->list; 256 register char *inp; 257 register int skip; /* non-zero for don't copy */ 258 while(1) 259 { 260 if(len = cuthdr->reclen) 261 inp = sfreserve(fdin, len, -1); 262 else 263 inp = sfgetr(fdin, '\n', 0); 264 if(!inp && !(inp = sfgetr(fdin, 0, SF_LASTR))) 265 break; 266 len = sfvalue(fdin); 267 if((ncol = skip = *(lp = cuthdr->list)) == 0) 268 ncol = *++lp; 269 while(1) 270 { 271 if((c=(cuthdr->cflag?advance(inp,ncol,len):ncol)) > len) 272 c = len; 273 else if(c==len && !skip) 274 ncol++; 275 ncol -= c; 276 if(!skip && sfwrite(fdout,(char*)inp,c)<0) 277 return; 278 inp += c; 279 if(ncol) 280 break; 281 len -= c; 282 ncol = *++lp; 283 skip = !skip; 284 } 285 if(!cuthdr->nlflag && (skip || cuthdr->reclen)) 286 sfputc(fdout,cuthdr->ldelim); 287 } 288 } 289 290 /* 291 * cut each line of file <fdin> and put results to <fdout> using list <list> 292 * stream <fdin> must be line buffered 293 */ 294 295 #define endline(c) (((signed char)-1)<0?(c)<0:(c)==((char)-1)) 296 297 static void cutfields(Cut_t *cuthdr,Sfio_t *fdin,Sfio_t *fdout) 298 { 299 register unsigned char *cp; 300 register int c, nfields; 301 register const int *lp = cuthdr->list; 302 register unsigned char *copy; 303 register int nodelim, empty, inword=0; 304 register unsigned char *endbuff; 305 unsigned char *inbuff, *first; 306 int lastchar; 307 Sfio_t *fdtmp = 0; 308 long offset = 0; 309 if(cuthdr->seqno != cuthdr->last.seq) 310 { 311 cuthdr->space[cuthdr->last.ldelim] = 0; 312 cuthdr->space[cuthdr->last.wdelim] = 0; 313 cuthdr->space[cuthdr->last.wdelim=cuthdr->wdelim] = 1; 314 cuthdr->space[cuthdr->last.ldelim=cuthdr->ldelim] = -1; 315 cuthdr->last.seq = cuthdr->seqno; 316 } 317 /* process each buffer */ 318 while ((inbuff = (unsigned char*)sfreserve(fdin, SF_UNBOUND, 0)) && (c = sfvalue(fdin)) > 0) 319 { 320 cp = inbuff; 321 endbuff = cp + --c; 322 if((lastchar = cp[c]) != cuthdr->ldelim) 323 *endbuff = cuthdr->ldelim; 324 /* process each line in the buffer */ 325 while(cp <= endbuff) 326 { 327 first = cp; 328 if(!inword) 329 { 330 nodelim = empty = 1; 331 copy = cp; 332 if(nfields = *(lp = cuthdr->list)) 333 copy = 0; 334 else 335 nfields = *++lp; 336 } 337 else if(copy) 338 copy = cp; 339 inword = 0; 340 while(!inword) 341 { 342 /* skip over non-delimiter characters */ 343 while(!(c=cuthdr->space[*cp++])); 344 /* check for end-of-line */ 345 if(endline(c)) 346 { 347 if(cp<=endbuff) 348 break; 349 if((c=cuthdr->space[lastchar]),endline(c)) 350 break; 351 /* restore cuthdr->last. character */ 352 if(lastchar != cuthdr->ldelim) 353 *endbuff = lastchar; 354 inword++; 355 if(!c) 356 break; 357 } 358 nodelim = 0; 359 if(--nfields >0) 360 continue; 361 nfields = *++lp; 362 if(copy) 363 { 364 empty = 0; 365 if((c=(cp-1)-copy)>0 && sfwrite(fdout,(char*)copy,c)< 0) 366 goto failed; 367 copy = 0; 368 } 369 else 370 /* set to delimiter unless the first field */ 371 copy = cp -!empty; 372 } 373 if(!inword) 374 { 375 if(!copy) 376 { 377 if(nodelim) 378 { 379 if(!cuthdr->sflag) 380 { 381 if(offset) 382 { 383 sfseek(fdtmp,(Sfoff_t)0,SEEK_SET); 384 sfmove(fdtmp,fdout,offset,-1); 385 } 386 copy = first; 387 } 388 } 389 else 390 sfputc(fdout,'\n'); 391 } 392 if(offset) 393 sfseek(fdtmp,offset=0,SEEK_SET); 394 } 395 if(copy && (c=cp-copy)>0 && (!nodelim || !cuthdr->sflag) && sfwrite(fdout,(char*)copy,c)< 0) 396 goto failed; 397 } 398 /* see whether to save in tmp file */ 399 if(inword && nodelim && !cuthdr->sflag && (c=cp-first)>0) 400 { 401 /* copy line to tmpfile in case no fields */ 402 if(!fdtmp) 403 fdtmp = sftmp(BLOCK); 404 sfwrite(fdtmp,(char*)first,c); 405 offset +=c; 406 } 407 } 408 failed: 409 if(fdtmp) 410 sfclose(fdtmp); 411 } 412 413 int 414 b_cut(int argc,char *argv[], void* context) 415 { 416 register char *cp = 0; 417 register Sfio_t *fp; 418 int n; 419 Cut_t *cuthdr; 420 int mode = 0; 421 int wdelim = '\t'; 422 int ldelim = '\n'; 423 size_t reclen = 0; 424 425 cmdinit(argc, argv, context, ERROR_CATALOG, 0); 426 while (n = optget(argv, usage)) switch (n) 427 { 428 case 'b': 429 case 'c': 430 if(mode&C_FIELDS) 431 { 432 error(2, "f option already specified"); 433 break; 434 } 435 cp = opt_info.arg; 436 if(n=='b') 437 mode |= C_BYTES; 438 else 439 mode |= C_CHARS; 440 break; 441 case 'D': 442 ldelim = *(unsigned char*)opt_info.arg; 443 break; 444 case 'd': 445 wdelim = *(unsigned char*)opt_info.arg; 446 break; 447 case 'f': 448 if(mode&(C_CHARS|C_BYTES)) 449 { 450 error(2, "c option already specified"); 451 break; 452 } 453 cp = opt_info.arg; 454 mode |= C_FIELDS; 455 break; 456 case 'n': 457 mode |= C_NOCHOP; 458 break; 459 case 'N': 460 mode |= C_NONEWLINE; 461 break; 462 case 'R': 463 case 'r': 464 if(opt_info.num>0) 465 reclen = opt_info.num; 466 break; 467 case 's': 468 mode |= C_SUPRESS; 469 break; 470 case ':': 471 error(2, "%s", opt_info.arg); 472 break; 473 case '?': 474 error(ERROR_usage(2), "%s", opt_info.arg); 475 break; 476 } 477 argv += opt_info.index; 478 if (error_info.errors) 479 error(ERROR_usage(2), "%s",optusage(NiL)); 480 if(!cp) 481 { 482 error(2, "b, c or f option must be specified"); 483 error(ERROR_usage(2), "%s", optusage(NiL)); 484 } 485 if(!*cp) 486 error(3, "non-empty b, c or f option must be specified"); 487 if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS) 488 error(3, "s option requires f option"); 489 cuthdr = cutinit(mode,cp,wdelim,ldelim,reclen); 490 if(cp = *argv) 491 argv++; 492 do 493 { 494 if(!cp || streq(cp,"-")) 495 fp = sfstdin; 496 else if(!(fp = sfopen(NiL,cp,"r"))) 497 { 498 error(ERROR_system(0),"%s: cannot open",cp); 499 continue; 500 } 501 if(mode&C_FIELDS) 502 cutfields(cuthdr,fp,sfstdout); 503 else 504 cutcols(cuthdr,fp,sfstdout); 505 if(fp!=sfstdin) 506 sfclose(fp); 507 } while(cp = *argv++); 508 if (sfsync(sfstdout)) 509 error(ERROR_system(0), "write error"); 510 return(error_info.errors?1:0); 511 } 512