1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1992-2011 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Eclipse Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.eclipse.org/org/documents/epl-v10.html * 11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * * 20 ***********************************************************************/ 21 /* 22 * Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved. 23 */ 24 #pragma prototyped 25 /* 26 * David Korn 27 * AT&T Bell Laboratories 28 * 29 * library interface for word count 30 */ 31 32 #include <cmd.h> 33 #include <wc.h> 34 #include <ctype.h> 35 36 #if _hdr_wchar && _hdr_wctype && _lib_iswctype 37 38 #include <wchar.h> 39 #include <wctype.h> 40 #include <lc.h> 41 42 #else 43 44 #ifndef iswspace 45 #define iswspace(x) isspace(x) 46 #endif 47 48 #endif 49 50 #define WC_SP 0x08 51 #define WC_NL 0x10 52 #define WC_MB 0x20 53 #define WC_ERR 0x40 54 55 #define eol(c) ((c)&WC_NL) 56 #define mbc(c) ((c)&WC_MB) 57 #define spc(c) ((c)&WC_SP) 58 #define mb2wc(w,p,n) (*ast.mb_towc)(&w,(char*)p,n) 59 60 Wc_t* wc_init(int mode) 61 { 62 register int n; 63 register int w; 64 Wc_t* wp; 65 66 if (!(wp = (Wc_t*)stakalloc(sizeof(Wc_t)))) 67 return 0; 68 if (!mbwide()) 69 wp->mb = 0; 70 #if _hdr_wchar && _hdr_wctype && _lib_iswctype 71 else if (!(mode & WC_NOUTF8) && (lcinfo(LC_CTYPE)->lc->flags & LC_utf8)) 72 wp->mb = 1; 73 #endif 74 else 75 wp->mb = -1; 76 w = mode & WC_WORDS; 77 for (n = (1<<CHAR_BIT); --n >= 0;) 78 wp->type[n] = (w && isspace(n)) ? WC_SP : 0; 79 wp->type['\n'] = WC_SP|WC_NL; 80 if ((mode & (WC_MBYTE|WC_WORDS)) && wp->mb > 0) 81 { 82 for (n = 0; n < 64; n++) 83 { 84 wp->type[0x80+n] |= WC_MB; 85 if (n<32) 86 wp->type[0xc0+n] |= WC_MB+1; 87 else if (n<48) 88 wp->type[0xc0+n] |= WC_MB+2; 89 else if (n<56) 90 wp->type[0xc0+n] |= WC_MB+3; 91 else if (n<60) 92 wp->type[0xc0+n] |= WC_MB+4; 93 else if (n<62) 94 wp->type[0xc0+n] |= WC_MB+5; 95 } 96 wp->type[0xc0] = WC_MB|WC_ERR; 97 wp->type[0xc1] = WC_MB|WC_ERR; 98 wp->type[0xfe] = WC_MB|WC_ERR; 99 wp->type[0xff] = WC_MB|WC_ERR; 100 } 101 wp->mode = mode; 102 return wp; 103 } 104 105 static int invalid(const char *file, int nlines) 106 { 107 error_info.file = (char*)file; 108 error_info.line = nlines; 109 error(ERROR_SYSTEM|1, "invalid multibyte character"); 110 error_info.file = 0; 111 error_info.line = 0; 112 return nlines; 113 } 114 115 /* 116 * handle utf space characters 117 */ 118 119 static int chkstate(int state, register unsigned int c) 120 { 121 switch(state) 122 { 123 case 1: 124 state = (c==0x9a?4:0); 125 break; 126 case 2: 127 state = ((c==0x80||c==0x81)?6+(c&1):0); 128 break; 129 case 3: 130 state = (c==0x80?5:0); 131 break; 132 case 4: 133 state = (c==0x80?10:0); 134 break; 135 case 5: 136 state = (c==0x80?10:0); 137 break; 138 case 6: 139 state = 0; 140 if(c==0xa0 || c==0xa1) 141 return(10); 142 else if((c&0xf0)== 0x80) 143 { 144 if((c&=0xf)==7) 145 return(iswspace(0x2007)?10:0); 146 if(c<=0xb) 147 return(10); 148 } 149 else if(c==0xaf && iswspace(0x202f)) 150 return(10); 151 break; 152 case 7: 153 state = (c==0x9f?10:0); 154 break; 155 case 8: 156 return (iswspace(c)?10:0); 157 } 158 return state; 159 } 160 161 /* 162 * compute the line, word, and character count for file <fd> 163 */ 164 165 int wc_count(Wc_t *wp, Sfio_t *fd, const char* file) 166 { 167 register char* type = wp->type; 168 register unsigned char* cp; 169 register Sfoff_t nbytes; 170 register Sfoff_t nchars; 171 register Sfoff_t nwords; 172 register Sfoff_t nlines; 173 register Sfoff_t eline = -1; 174 register Sfoff_t longest = 0; 175 register ssize_t c; 176 register unsigned char* endbuff; 177 register int lasttype = WC_SP; 178 unsigned int lastchar; 179 ssize_t n; 180 ssize_t o; 181 unsigned char* buff; 182 wchar_t x; 183 unsigned char side[32]; 184 185 sfset(fd,SF_WRITE,1); 186 nlines = nwords = nchars = nbytes = 0; 187 wp->longest = 0; 188 if (wp->mb < 0 && (wp->mode & (WC_MBYTE|WC_WORDS))) 189 { 190 cp = buff = endbuff = 0; 191 for (;;) 192 { 193 if (cp >= endbuff || (n = mb2wc(x, cp, endbuff-cp)) < 0) 194 { 195 if ((o = endbuff-cp) < sizeof(side)) 196 { 197 if (buff) 198 { 199 if (o) 200 memcpy(side, cp, o); 201 mbinit(); 202 } 203 else 204 o = 0; 205 cp = side + o; 206 if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) || (n = sfvalue(fd)) <= 0) 207 { 208 if ((nchars - longest) > wp->longest) 209 wp->longest = nchars - longest; 210 break; 211 } 212 nbytes += n; 213 if ((c = sizeof(side) - o) > n) 214 c = n; 215 if (c) 216 memcpy(cp, buff, c); 217 endbuff = buff + n; 218 cp = side; 219 x = mbchar(cp); 220 if ((cp-side) < o) 221 { 222 cp = buff; 223 nchars += (cp-side) - 1; 224 } 225 else 226 cp = buff + (cp-side) - o; 227 } 228 else 229 { 230 cp++; 231 x = -1; 232 } 233 if (x == -1 && eline != nlines && !(wp->mode & WC_QUIET)) 234 eline = invalid(file, nlines); 235 } 236 else 237 cp += n ? n : 1; 238 if (x == '\n') 239 { 240 if ((nchars - longest) > wp->longest) 241 wp->longest = nchars - longest; 242 longest = nchars + 1; 243 nlines++; 244 lasttype = 1; 245 } 246 else if (iswspace(x)) 247 lasttype = 1; 248 else if (lasttype) 249 { 250 lasttype = 0; 251 nwords++; 252 } 253 nchars++; 254 } 255 if (!(wp->mode & WC_MBYTE)) 256 nchars = nbytes; 257 } 258 else if (!wp->mb && !(wp->mode & WC_LONGEST) || wp->mb > 0 && !(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST))) 259 { 260 if (!(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST))) 261 { 262 while ((cp = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0) 263 { 264 nchars += c; 265 endbuff = cp + c; 266 if (*--endbuff == '\n') 267 nlines++; 268 else 269 *endbuff = '\n'; 270 for (;;) 271 if (*cp++ == '\n') 272 { 273 if (cp > endbuff) 274 break; 275 nlines++; 276 } 277 } 278 } 279 else 280 { 281 while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0) 282 { 283 nchars += c; 284 /* check to see whether first character terminates word */ 285 if (c==1) 286 { 287 if (eol(lasttype)) 288 nlines++; 289 if ((c = type[*cp]) && !lasttype) 290 nwords++; 291 lasttype = c; 292 continue; 293 } 294 if (!lasttype && type[*cp]) 295 nwords++; 296 lastchar = cp[--c]; 297 *(endbuff = cp+c) = '\n'; 298 c = lasttype; 299 /* process each buffer */ 300 for (;;) 301 { 302 /* process spaces and new-lines */ 303 do 304 { 305 if (eol(c)) 306 for (;;) 307 { 308 /* check for end of buffer */ 309 if (cp > endbuff) 310 goto beob; 311 nlines++; 312 if (*cp != '\n') 313 break; 314 cp++; 315 } 316 } while (c = type[*cp++]); 317 /* skip over word characters */ 318 while (!(c = type[*cp++])); 319 nwords++; 320 } 321 beob: 322 if ((cp -= 2) >= buff) 323 c = type[*cp]; 324 else 325 c = lasttype; 326 lasttype = type[lastchar]; 327 /* see if was in word */ 328 if (!c && !lasttype) 329 nwords--; 330 } 331 if (eol(lasttype)) 332 nlines++; 333 else if (!lasttype) 334 nwords++; 335 } 336 } 337 else 338 { 339 int lineoff=0; 340 int skip=0; 341 int adjust=0; 342 int state=0; 343 int oldc; 344 int xspace; 345 int wasspace = 1; 346 unsigned char* start; 347 int flagm = 0; 348 349 350 lastchar = 0; 351 start = (endbuff = side) + 1; 352 xspace = iswspace(0xa0) || iswspace(0x85); 353 while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0) 354 { 355 nbytes += c; 356 nchars += c; 357 start = cp-lineoff; 358 /* check to see whether first character terminates word */ 359 if(c==1) 360 { 361 if(eol(lasttype)) 362 nlines++; 363 if((c = type[*cp]) && !lasttype) 364 nwords++; 365 lasttype = c; 366 endbuff = start; 367 continue; 368 } 369 lastchar = cp[--c]; 370 endbuff = cp+c; 371 cp[c] = '\n'; 372 if(mbc(lasttype)) 373 { 374 c = lasttype; 375 flagm = 1; 376 goto mbyte; 377 } 378 if(!lasttype && spc(type[*cp])) 379 nwords++; 380 c = lasttype; 381 /* process each buffer */ 382 for (;;) 383 { 384 /* process spaces and new-lines */ 385 spaces: 386 do 387 { 388 if (eol(c)) 389 { 390 /* check for end of buffer */ 391 if (cp > endbuff) 392 goto eob; 393 if(wp->mode&WC_LONGEST) 394 { 395 if((cp-start)-adjust > longest) 396 longest = (cp-start)-adjust-1; 397 start = cp; 398 } 399 nlines++; 400 nchars -= adjust; 401 adjust = 0; 402 } 403 } while (spc(c = type[*cp++])); 404 wasspace=1; 405 if(mbc(c)) 406 { 407 mbyte: 408 do 409 { 410 if(c&WC_ERR) 411 goto err; 412 if(skip && (c&7)) 413 break; 414 if(!skip) 415 { 416 if(!(c&7)) 417 { 418 skip=1; 419 break; 420 } 421 skip = (c&7); 422 adjust += skip; 423 state = 0; 424 if (flagm == 1) { 425 flagm = 0; 426 oldc = *cp; 427 if (xspace && ( 428 iswspace 429 (*cp) 430 == 1)) { 431 state 432 = 8; 433 } 434 continue; 435 } 436 if(skip==2 && (cp[-1]&0xc)==0 && (state=(cp[-1]&0x3))) 437 oldc = *cp; 438 else if(xspace && cp[-1]==0xc2) 439 { 440 state = 8; 441 oldc = *cp; 442 } 443 } 444 else 445 { 446 skip--; 447 if(state && (state=chkstate(state,oldc))) 448 { 449 if(state==10) 450 { 451 if(!wasspace) 452 nwords++; 453 wasspace = 1; 454 state=0; 455 goto spaces; 456 } 457 oldc = *cp; 458 } 459 } 460 } while (mbc(c = type[*cp++])); 461 wasspace = 0; 462 if(skip) 463 { 464 if(eol(c) && (cp > endbuff)) 465 goto eob; 466 err: 467 skip = 0; 468 state = 0; 469 if(eline!=nlines && !(wp->mode & WC_QUIET)) 470 eline = invalid(file, nlines); 471 while(mbc(c) && ((c|WC_ERR) || (c&7)==0)) 472 c=type[*cp++]; 473 if(eol(c) && (cp > endbuff)) 474 { 475 c = WC_MB|WC_ERR; 476 goto eob; 477 } 478 if(mbc(c)) 479 goto mbyte; 480 else if(c&WC_SP) 481 goto spaces; 482 } 483 if(spc(c)) 484 { 485 nwords++; 486 continue; 487 } 488 } 489 /* skip over word characters */ 490 while(!(c = type[*cp++])); 491 if(mbc(c)) 492 goto mbyte; 493 nwords++; 494 } 495 eob: 496 lineoff = cp-start; 497 if((cp -= 2) >= buff) 498 c = type[*cp]; 499 else 500 c = lasttype; 501 lasttype = type[lastchar]; 502 /* see if was in word */ 503 if(!c && !lasttype) 504 nwords--; 505 } 506 if ((wp->mode&WC_LONGEST) && ((endbuff + 1 - start) - adjust - (lastchar == '\n')) > longest) 507 longest = (endbuff + 1 - start) - adjust - (lastchar == '\n'); 508 wp->longest = longest; 509 if (eol(lasttype)) 510 nlines++; 511 else if (!lasttype) 512 nwords++; 513 if (wp->mode & WC_MBYTE) 514 nchars -= adjust; 515 else 516 nchars = nbytes; 517 } 518 wp->chars = nchars; 519 wp->words = nwords; 520 wp->lines = nlines; 521 return 0; 522 } 523 524