1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1992-2010 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * * 20 ***********************************************************************/ 21 #pragma prototyped 22 /* 23 * David Korn 24 * AT&T Bell Laboratories 25 * 26 * library interface for word count 27 */ 28 29 #include <cmd.h> 30 #include <wc.h> 31 #include <ctype.h> 32 33 #if _hdr_wchar && _hdr_wctype && _lib_iswctype 34 35 #include <wchar.h> 36 #include <wctype.h> 37 #include <lc.h> 38 39 #else 40 41 #ifndef iswspace 42 #define iswspace(x) isspace(x) 43 #endif 44 45 #endif 46 47 #define WC_SP 0x08 48 #define WC_NL 0x10 49 #define WC_MB 0x20 50 #define WC_ERR 0x40 51 52 #define eol(c) ((c)&WC_NL) 53 #define mbc(c) ((c)&WC_MB) 54 #define spc(c) ((c)&WC_SP) 55 #define mb2wc(w,p,n) (*ast.mb_towc)(&w,(char*)p,n) 56 57 Wc_t* wc_init(int mode) 58 { 59 register int n; 60 register int w; 61 Wc_t* wp; 62 63 if (!(wp = (Wc_t*)stakalloc(sizeof(Wc_t)))) 64 return 0; 65 if (!mbwide()) 66 wp->mb = 0; 67 #if _hdr_wchar && _hdr_wctype && _lib_iswctype 68 else if (!(mode & WC_NOUTF8) && (lcinfo(LC_CTYPE)->lc->flags & LC_utf8)) 69 wp->mb = 1; 70 #endif 71 else 72 wp->mb = -1; 73 w = mode & WC_WORDS; 74 for (n = (1<<CHAR_BIT); --n >= 0;) 75 wp->type[n] = (w && isspace(n)) ? WC_SP : 0; 76 wp->type['\n'] = WC_SP|WC_NL; 77 if ((mode & (WC_MBYTE|WC_WORDS)) && wp->mb > 0) 78 { 79 for (n = 0; n < 64; n++) 80 { 81 wp->type[0x80+n] |= WC_MB; 82 if (n<32) 83 wp->type[0xc0+n] |= WC_MB+1; 84 else if (n<48) 85 wp->type[0xc0+n] |= WC_MB+2; 86 else if (n<56) 87 wp->type[0xc0+n] |= WC_MB+3; 88 else if (n<60) 89 wp->type[0xc0+n] |= WC_MB+4; 90 else if (n<62) 91 wp->type[0xc0+n] |= WC_MB+5; 92 } 93 wp->type[0xc0] = WC_MB|WC_ERR; 94 wp->type[0xc1] = WC_MB|WC_ERR; 95 wp->type[0xfe] = WC_MB|WC_ERR; 96 wp->type[0xff] = WC_MB|WC_ERR; 97 } 98 wp->mode = mode; 99 return wp; 100 } 101 102 static int invalid(const char *file, int nlines) 103 { 104 error_info.file = (char*)file; 105 error_info.line = nlines; 106 error(ERROR_SYSTEM|1, "invalid multibyte character"); 107 error_info.file = 0; 108 error_info.line = 0; 109 return nlines; 110 } 111 112 /* 113 * handle utf space characters 114 */ 115 116 static int chkstate(int state, register unsigned int c) 117 { 118 switch(state) 119 { 120 case 1: 121 state = (c==0x9a?4:0); 122 break; 123 case 2: 124 state = ((c==0x80||c==0x81)?6+(c&1):0); 125 break; 126 case 3: 127 state = (c==0x80?5:0); 128 break; 129 case 4: 130 state = (c==0x80?10:0); 131 break; 132 case 5: 133 state = (c==0x80?10:0); 134 break; 135 case 6: 136 state = 0; 137 if(c==0xa0 || c==0xa1) 138 return(10); 139 else if((c&0xf0)== 0x80) 140 { 141 if((c&=0xf)==7) 142 return(iswspace(0x2007)?10:0); 143 if(c<=0xb) 144 return(10); 145 } 146 else if(c==0xaf && iswspace(0x202f)) 147 return(10); 148 break; 149 case 7: 150 state = (c==0x9f?10:0); 151 break; 152 case 8: 153 return (iswspace(c)?10:0); 154 } 155 return state; 156 } 157 158 /* 159 * compute the line, word, and character count for file <fd> 160 */ 161 162 int wc_count(Wc_t *wp, Sfio_t *fd, const char* file) 163 { 164 register char* type = wp->type; 165 register unsigned char* cp; 166 register Sfoff_t nbytes; 167 register Sfoff_t nchars; 168 register Sfoff_t nwords; 169 register Sfoff_t nlines; 170 register Sfoff_t eline = -1; 171 register Sfoff_t longest = 0; 172 register ssize_t c; 173 register unsigned char* endbuff; 174 register int lasttype = WC_SP; 175 unsigned int lastchar; 176 ssize_t n; 177 ssize_t o; 178 unsigned char* buff; 179 wchar_t x; 180 unsigned char side[32]; 181 182 sfset(fd,SF_WRITE,1); 183 nlines = nwords = nchars = nbytes = 0; 184 wp->longest = 0; 185 if (wp->mb < 0 && (wp->mode & (WC_MBYTE|WC_WORDS))) 186 { 187 cp = buff = endbuff = 0; 188 for (;;) 189 { 190 if (cp >= endbuff || (n = mb2wc(x, cp, endbuff-cp)) < 0) 191 { 192 if ((o = endbuff-cp) < sizeof(side)) 193 { 194 if (buff) 195 { 196 if (o) 197 memcpy(side, cp, o); 198 mbinit(); 199 } 200 else 201 o = 0; 202 cp = side + o; 203 if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) || (n = sfvalue(fd)) <= 0) 204 { 205 if ((nchars - longest) > wp->longest) 206 wp->longest = nchars - longest; 207 break; 208 } 209 nbytes += n; 210 if ((c = sizeof(side) - o) > n) 211 c = n; 212 if (c) 213 memcpy(cp, buff, c); 214 endbuff = buff + n; 215 cp = side; 216 x = mbchar(cp); 217 if ((cp-side) < o) 218 { 219 cp = buff; 220 nchars += (cp-side) - 1; 221 } 222 else 223 cp = buff + (cp-side) - o; 224 } 225 else 226 { 227 cp++; 228 x = -1; 229 } 230 if (x == -1 && eline != nlines && !(wp->mode & WC_QUIET)) 231 eline = invalid(file, nlines); 232 } 233 else 234 cp += n ? n : 1; 235 if (x == '\n') 236 { 237 if ((nchars - longest) > wp->longest) 238 wp->longest = nchars - longest; 239 longest = nchars + 1; 240 nlines++; 241 lasttype = 1; 242 } 243 else if (iswspace(x)) 244 lasttype = 1; 245 else if (lasttype) 246 { 247 lasttype = 0; 248 nwords++; 249 } 250 nchars++; 251 } 252 if (!(wp->mode & WC_MBYTE)) 253 nchars = nbytes; 254 } 255 else if (!wp->mb && !(wp->mode & WC_LONGEST) || wp->mb > 0 && !(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST))) 256 { 257 if (!(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST))) 258 { 259 while ((cp = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0) 260 { 261 nchars += c; 262 endbuff = cp + c; 263 if (*--endbuff == '\n') 264 nlines++; 265 else 266 *endbuff = '\n'; 267 for (;;) 268 if (*cp++ == '\n') 269 { 270 if (cp > endbuff) 271 break; 272 nlines++; 273 } 274 } 275 } 276 else 277 { 278 while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0) 279 { 280 nchars += c; 281 /* check to see whether first character terminates word */ 282 if (c==1) 283 { 284 if (eol(lasttype)) 285 nlines++; 286 if ((c = type[*cp]) && !lasttype) 287 nwords++; 288 lasttype = c; 289 continue; 290 } 291 if (!lasttype && type[*cp]) 292 nwords++; 293 lastchar = cp[--c]; 294 *(endbuff = cp+c) = '\n'; 295 c = lasttype; 296 /* process each buffer */ 297 for (;;) 298 { 299 /* process spaces and new-lines */ 300 do 301 { 302 if (eol(c)) 303 for (;;) 304 { 305 /* check for end of buffer */ 306 if (cp > endbuff) 307 goto beob; 308 nlines++; 309 if (*cp != '\n') 310 break; 311 cp++; 312 } 313 } while (c = type[*cp++]); 314 /* skip over word characters */ 315 while (!(c = type[*cp++])); 316 nwords++; 317 } 318 beob: 319 if ((cp -= 2) >= buff) 320 c = type[*cp]; 321 else 322 c = lasttype; 323 lasttype = type[lastchar]; 324 /* see if was in word */ 325 if (!c && !lasttype) 326 nwords--; 327 } 328 if (eol(lasttype)) 329 nlines++; 330 else if (!lasttype) 331 nwords++; 332 } 333 } 334 else 335 { 336 int lineoff=0; 337 int skip=0; 338 int adjust=0; 339 int state=0; 340 int oldc; 341 int xspace; 342 int wasspace = 1; 343 unsigned char* start; 344 345 lastchar = 0; 346 start = (endbuff = side) + 1; 347 xspace = iswspace(0xa0) || iswspace(0x85); 348 while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0) 349 { 350 nbytes += c; 351 nchars += c; 352 start = cp-lineoff; 353 /* check to see whether first character terminates word */ 354 if(c==1) 355 { 356 if(eol(lasttype)) 357 nlines++; 358 if((c = type[*cp]) && !lasttype) 359 nwords++; 360 lasttype = c; 361 endbuff = start; 362 continue; 363 } 364 lastchar = cp[--c]; 365 endbuff = cp+c; 366 cp[c] = '\n'; 367 if(mbc(lasttype)) 368 { 369 c = lasttype; 370 goto mbyte; 371 } 372 if(!lasttype && spc(type[*cp])) 373 nwords++; 374 c = lasttype; 375 /* process each buffer */ 376 for (;;) 377 { 378 /* process spaces and new-lines */ 379 spaces: 380 do 381 { 382 if (eol(c)) 383 { 384 /* check for end of buffer */ 385 if (cp > endbuff) 386 goto eob; 387 if(wp->mode&WC_LONGEST) 388 { 389 if((cp-start)-adjust > longest) 390 longest = (cp-start)-adjust-1; 391 start = cp; 392 } 393 nlines++; 394 nchars -= adjust; 395 adjust = 0; 396 } 397 } while (spc(c = type[*cp++])); 398 wasspace=1; 399 if(mbc(c)) 400 { 401 mbyte: 402 do 403 { 404 if(c&WC_ERR) 405 goto err; 406 if(skip && (c&7)) 407 break; 408 if(!skip) 409 { 410 if(!(c&7)) 411 { 412 skip=1; 413 break; 414 } 415 skip = (c&7); 416 adjust += skip; 417 state = 0; 418 if(skip==2 && (cp[-1]&0xc)==0 && (state=(cp[-1]&0x3))) 419 oldc = *cp; 420 else if(xspace && cp[-1]==0xc2) 421 { 422 state = 8; 423 oldc = *cp; 424 } 425 } 426 else 427 { 428 skip--; 429 if(state && (state=chkstate(state,oldc))) 430 { 431 if(state==10) 432 { 433 if(!wasspace) 434 nwords++; 435 wasspace = 1; 436 state=0; 437 goto spaces; 438 } 439 oldc = *cp; 440 } 441 } 442 } while (mbc(c = type[*cp++])); 443 wasspace = 0; 444 if(skip) 445 { 446 if(eol(c) && (cp > endbuff)) 447 goto eob; 448 err: 449 skip = 0; 450 state = 0; 451 if(eline!=nlines && !(wp->mode & WC_QUIET)) 452 eline = invalid(file, nlines); 453 while(mbc(c) && ((c|WC_ERR) || (c&7)==0)) 454 c=type[*cp++]; 455 if(eol(c) && (cp > endbuff)) 456 { 457 c = WC_MB|WC_ERR; 458 goto eob; 459 } 460 if(mbc(c)) 461 goto mbyte; 462 else if(c&WC_SP) 463 goto spaces; 464 } 465 if(spc(c)) 466 { 467 nwords++; 468 continue; 469 } 470 } 471 /* skip over word characters */ 472 while(!(c = type[*cp++])); 473 if(mbc(c)) 474 goto mbyte; 475 nwords++; 476 } 477 eob: 478 lineoff = cp-start; 479 if((cp -= 2) >= buff) 480 c = type[*cp]; 481 else 482 c = lasttype; 483 lasttype = type[lastchar]; 484 /* see if was in word */ 485 if(!c && !lasttype) 486 nwords--; 487 } 488 if ((wp->mode&WC_LONGEST) && ((endbuff + 1 - start) - adjust - (lastchar == '\n')) > longest) 489 longest = (endbuff + 1 - start) - adjust - (lastchar == '\n'); 490 wp->longest = longest; 491 if (eol(lasttype)) 492 nlines++; 493 else if (!lasttype) 494 nwords++; 495 if (wp->mode & WC_MBYTE) 496 nchars -= adjust; 497 else 498 nchars = nbytes; 499 } 500 wp->chars = nchars; 501 wp->words = nwords; 502 wp->lines = nlines; 503 return 0; 504 } 505 506