1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1992-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * *
20 ***********************************************************************/
21 #pragma prototyped
22 /*
23 * David Korn
24 * AT&T Bell Laboratories
25 *
26 * library interface for word count
27 */
28
29 #include <cmd.h>
30 #include <wc.h>
31 #include <ctype.h>
32
33 #if _hdr_wchar && _hdr_wctype && _lib_iswctype
34
35 #include <wchar.h>
36 #include <wctype.h>
37 #include <lc.h>
38
39 #else
40
41 #ifndef iswspace
42 #define iswspace(x) isspace(x)
43 #endif
44
45 #endif
46
/*
 * per-byte classification flags kept in Wc_t.type[]
 *
 * the low three bits of an entry (the bits below WC_SP) are used by
 * wc_init() to record how many continuation bytes follow a UTF-8
 * lead byte; wc_count() reads them back with (c&7)
 */

#define WC_SP		0x08	/* byte counts as white space */
#define WC_NL		0x10	/* byte is the new-line character */
#define WC_MB		0x20	/* byte belongs to a multibyte sequence */
#define WC_ERR		0x40	/* byte is flagged invalid by wc_init() (0xc0, 0xc1, 0xfe, 0xff) */

#define eol(c)		((c)&WC_NL)
#define mbc(c)		((c)&WC_MB)
#define spc(c)		((c)&WC_SP)

/*
 * convert at most <n> bytes at <p> into the wide character <w>
 * via the ast multibyte hook; every argument is parenthesized so
 * that expression arguments bind as the caller wrote them
 */

#define mb2wc(w,p,n)	((*ast.mb_towc)(&(w),(char*)(p),(n)))
56
wc_init(int mode)57 Wc_t* wc_init(int mode)
58 {
59 register int n;
60 register int w;
61 Wc_t* wp;
62
63 if (!(wp = (Wc_t*)stakalloc(sizeof(Wc_t))))
64 return 0;
65 if (!mbwide())
66 wp->mb = 0;
67 #if _hdr_wchar && _hdr_wctype && _lib_iswctype
68 else if (!(mode & WC_NOUTF8) && (lcinfo(LC_CTYPE)->lc->flags & LC_utf8))
69 wp->mb = 1;
70 #endif
71 else
72 wp->mb = -1;
73 w = mode & WC_WORDS;
74 for (n = (1<<CHAR_BIT); --n >= 0;)
75 wp->type[n] = (w && isspace(n)) ? WC_SP : 0;
76 wp->type['\n'] = WC_SP|WC_NL;
77 if ((mode & (WC_MBYTE|WC_WORDS)) && wp->mb > 0)
78 {
79 for (n = 0; n < 64; n++)
80 {
81 wp->type[0x80+n] |= WC_MB;
82 if (n<32)
83 wp->type[0xc0+n] |= WC_MB+1;
84 else if (n<48)
85 wp->type[0xc0+n] |= WC_MB+2;
86 else if (n<56)
87 wp->type[0xc0+n] |= WC_MB+3;
88 else if (n<60)
89 wp->type[0xc0+n] |= WC_MB+4;
90 else if (n<62)
91 wp->type[0xc0+n] |= WC_MB+5;
92 }
93 wp->type[0xc0] = WC_MB|WC_ERR;
94 wp->type[0xc1] = WC_MB|WC_ERR;
95 wp->type[0xfe] = WC_MB|WC_ERR;
96 wp->type[0xff] = WC_MB|WC_ERR;
97 }
98 wp->mode = mode;
99 return wp;
100 }
101
invalid(const char * file,int nlines)102 static int invalid(const char *file, int nlines)
103 {
104 error_info.file = (char*)file;
105 error_info.line = nlines;
106 error(ERROR_SYSTEM|1, "invalid multibyte character");
107 error_info.file = 0;
108 error_info.line = 0;
109 return nlines;
110 }
111
/*
 * handle UTF-8 encoded space characters
 *
 * wc_count() drives this small state machine one byte at a time while
 * it steps through a multibyte sequence; <state> is the current state
 * and <c> is the sequence byte being examined
 *
 *	1	lead byte was 0xe1: 0x9a moves to state 4
 *	2	lead byte was 0xe2: 0x80 moves to state 6, 0x81 to state 7
 *	3	lead byte was 0xe3: 0x80 moves to state 5
 *	4	after e1 9a: 0x80 completes U+1680
 *	5	after e3 80: 0x80 completes U+3000
 *	6	after e2 80: third byte selects U+2000..U+200B,
 *		U+2028, U+2029 or U+202F
 *	7	after e2 81: 0x9f completes U+205F
 *	8	lead byte was 0xc2: <c> equals the code point, so the
 *		locale's iswspace() decides
 *
 * returns 10 when a space character has been recognized, 0 when the
 * sequence cannot be a space, otherwise the next state; a <state>
 * not listed above is returned unchanged
 */

static int chkstate(int state, register unsigned int c)
{
	switch(state)
	{
	case 1:
		state = (c==0x9a?4:0);
		break;
	case 2:
		state = ((c==0x80||c==0x81)?6+(c&1):0);
		break;
	case 3:
		state = (c==0x80?5:0);
		break;
	case 4:
		state = (c==0x80?10:0);
		break;
	case 5:
		state = (c==0x80?10:0);
		break;
	case 6:
		state = 0;
		/*
		 * e2 80 a8 and e2 80 a9 are U+2028 LINE SEPARATOR and
		 * U+2029 PARAGRAPH SEPARATOR; the former test for 0xa0
		 * and 0xa1 matched U+2020 and U+2021 (the dagger signs)
		 */
		if(c==0xa8 || c==0xa9)
			return(10);
		else if((c&0xf0)== 0x80)
		{
			/* U+2007 is only a space if the locale says so */
			if((c&=0xf)==7)
				return(iswspace(0x2007)?10:0);
			if(c<=0xb)
				return(10);
		}
		/* likewise for U+202F */
		else if(c==0xaf && iswspace(0x202f))
			return(10);
		break;
	case 7:
		state = (c==0x9f?10:0);
		break;
	case 8:
		return (iswspace(c)?10:0);
	default:
		break;
	}
	return state;
}
157
158 /*
159 * compute the line, word, and character count for file <fd>
160 */
161
wc_count(Wc_t * wp,Sfio_t * fd,const char * file)162 int wc_count(Wc_t *wp, Sfio_t *fd, const char* file)
163 {
164 register char* type = wp->type;
165 register unsigned char* cp;
166 register Sfoff_t nbytes;
167 register Sfoff_t nchars;
168 register Sfoff_t nwords;
169 register Sfoff_t nlines;
170 register Sfoff_t eline = -1;
171 register Sfoff_t longest = 0;
172 register ssize_t c;
173 register unsigned char* endbuff;
174 register int lasttype = WC_SP;
175 unsigned int lastchar;
176 ssize_t n;
177 ssize_t o;
178 unsigned char* buff;
179 wchar_t x;
180 unsigned char side[32];
181
182 sfset(fd,SF_WRITE,1);
183 nlines = nwords = nchars = nbytes = 0;
184 wp->longest = 0;
185 if (wp->mb < 0 && (wp->mode & (WC_MBYTE|WC_WORDS)))
186 {
187 cp = buff = endbuff = 0;
188 for (;;)
189 {
190 if (cp >= endbuff || (n = mb2wc(x, cp, endbuff-cp)) < 0)
191 {
192 if ((o = endbuff-cp) < sizeof(side))
193 {
194 if (buff)
195 {
196 if (o)
197 memcpy(side, cp, o);
198 mbinit();
199 }
200 else
201 o = 0;
202 cp = side + o;
203 if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) || (n = sfvalue(fd)) <= 0)
204 {
205 if ((nchars - longest) > wp->longest)
206 wp->longest = nchars - longest;
207 break;
208 }
209 nbytes += n;
210 if ((c = sizeof(side) - o) > n)
211 c = n;
212 if (c)
213 memcpy(cp, buff, c);
214 endbuff = buff + n;
215 cp = side;
216 x = mbchar(cp);
217 if ((cp-side) < o)
218 {
219 cp = buff;
220 nchars += (cp-side) - 1;
221 }
222 else
223 cp = buff + (cp-side) - o;
224 }
225 else
226 {
227 cp++;
228 x = -1;
229 }
230 if (x == -1 && eline != nlines && !(wp->mode & WC_QUIET))
231 eline = invalid(file, nlines);
232 }
233 else
234 cp += n ? n : 1;
235 if (x == '\n')
236 {
237 if ((nchars - longest) > wp->longest)
238 wp->longest = nchars - longest;
239 longest = nchars + 1;
240 nlines++;
241 lasttype = 1;
242 }
243 else if (iswspace(x))
244 lasttype = 1;
245 else if (lasttype)
246 {
247 lasttype = 0;
248 nwords++;
249 }
250 nchars++;
251 }
252 if (!(wp->mode & WC_MBYTE))
253 nchars = nbytes;
254 }
255 else if (!wp->mb && !(wp->mode & WC_LONGEST) || wp->mb > 0 && !(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
256 {
257 if (!(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
258 {
259 while ((cp = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
260 {
261 nchars += c;
262 endbuff = cp + c;
263 if (*--endbuff == '\n')
264 nlines++;
265 else
266 *endbuff = '\n';
267 for (;;)
268 if (*cp++ == '\n')
269 {
270 if (cp > endbuff)
271 break;
272 nlines++;
273 }
274 }
275 }
276 else
277 {
278 while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
279 {
280 nchars += c;
281 /* check to see whether first character terminates word */
282 if (c==1)
283 {
284 if (eol(lasttype))
285 nlines++;
286 if ((c = type[*cp]) && !lasttype)
287 nwords++;
288 lasttype = c;
289 continue;
290 }
291 if (!lasttype && type[*cp])
292 nwords++;
293 lastchar = cp[--c];
294 *(endbuff = cp+c) = '\n';
295 c = lasttype;
296 /* process each buffer */
297 for (;;)
298 {
299 /* process spaces and new-lines */
300 do
301 {
302 if (eol(c))
303 for (;;)
304 {
305 /* check for end of buffer */
306 if (cp > endbuff)
307 goto beob;
308 nlines++;
309 if (*cp != '\n')
310 break;
311 cp++;
312 }
313 } while (c = type[*cp++]);
314 /* skip over word characters */
315 while (!(c = type[*cp++]));
316 nwords++;
317 }
318 beob:
319 if ((cp -= 2) >= buff)
320 c = type[*cp];
321 else
322 c = lasttype;
323 lasttype = type[lastchar];
324 /* see if was in word */
325 if (!c && !lasttype)
326 nwords--;
327 }
328 if (eol(lasttype))
329 nlines++;
330 else if (!lasttype)
331 nwords++;
332 }
333 }
334 else
335 {
336 int lineoff=0;
337 int skip=0;
338 int adjust=0;
339 int state=0;
340 int oldc;
341 int xspace;
342 int wasspace = 1;
343 unsigned char* start;
344
345 lastchar = 0;
346 start = (endbuff = side) + 1;
347 xspace = iswspace(0xa0) || iswspace(0x85);
348 while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
349 {
350 nbytes += c;
351 nchars += c;
352 start = cp-lineoff;
353 /* check to see whether first character terminates word */
354 if(c==1)
355 {
356 if(eol(lasttype))
357 nlines++;
358 if((c = type[*cp]) && !lasttype)
359 nwords++;
360 lasttype = c;
361 endbuff = start;
362 continue;
363 }
364 lastchar = cp[--c];
365 endbuff = cp+c;
366 cp[c] = '\n';
367 if(mbc(lasttype))
368 {
369 c = lasttype;
370 goto mbyte;
371 }
372 if(!lasttype && spc(type[*cp]))
373 nwords++;
374 c = lasttype;
375 /* process each buffer */
376 for (;;)
377 {
378 /* process spaces and new-lines */
379 spaces:
380 do
381 {
382 if (eol(c))
383 {
384 /* check for end of buffer */
385 if (cp > endbuff)
386 goto eob;
387 if(wp->mode&WC_LONGEST)
388 {
389 if((cp-start)-adjust > longest)
390 longest = (cp-start)-adjust-1;
391 start = cp;
392 }
393 nlines++;
394 nchars -= adjust;
395 adjust = 0;
396 }
397 } while (spc(c = type[*cp++]));
398 wasspace=1;
399 if(mbc(c))
400 {
401 mbyte:
402 do
403 {
404 if(c&WC_ERR)
405 goto err;
406 if(skip && (c&7))
407 break;
408 if(!skip)
409 {
410 if(!(c&7))
411 {
412 skip=1;
413 break;
414 }
415 skip = (c&7);
416 adjust += skip;
417 state = 0;
418 if(skip==2 && (cp[-1]&0xc)==0 && (state=(cp[-1]&0x3)))
419 oldc = *cp;
420 else if(xspace && cp[-1]==0xc2)
421 {
422 state = 8;
423 oldc = *cp;
424 }
425 }
426 else
427 {
428 skip--;
429 if(state && (state=chkstate(state,oldc)))
430 {
431 if(state==10)
432 {
433 if(!wasspace)
434 nwords++;
435 wasspace = 1;
436 state=0;
437 goto spaces;
438 }
439 oldc = *cp;
440 }
441 }
442 } while (mbc(c = type[*cp++]));
443 wasspace = 0;
444 if(skip)
445 {
446 if(eol(c) && (cp > endbuff))
447 goto eob;
448 err:
449 skip = 0;
450 state = 0;
451 if(eline!=nlines && !(wp->mode & WC_QUIET))
452 eline = invalid(file, nlines);
453 while(mbc(c) && ((c|WC_ERR) || (c&7)==0))
454 c=type[*cp++];
455 if(eol(c) && (cp > endbuff))
456 {
457 c = WC_MB|WC_ERR;
458 goto eob;
459 }
460 if(mbc(c))
461 goto mbyte;
462 else if(c&WC_SP)
463 goto spaces;
464 }
465 if(spc(c))
466 {
467 nwords++;
468 continue;
469 }
470 }
471 /* skip over word characters */
472 while(!(c = type[*cp++]));
473 if(mbc(c))
474 goto mbyte;
475 nwords++;
476 }
477 eob:
478 lineoff = cp-start;
479 if((cp -= 2) >= buff)
480 c = type[*cp];
481 else
482 c = lasttype;
483 lasttype = type[lastchar];
484 /* see if was in word */
485 if(!c && !lasttype)
486 nwords--;
487 }
488 if ((wp->mode&WC_LONGEST) && ((endbuff + 1 - start) - adjust - (lastchar == '\n')) > longest)
489 longest = (endbuff + 1 - start) - adjust - (lastchar == '\n');
490 wp->longest = longest;
491 if (eol(lasttype))
492 nlines++;
493 else if (!lasttype)
494 nwords++;
495 if (wp->mode & WC_MBYTE)
496 nchars -= adjust;
497 else
498 nchars = nbytes;
499 }
500 wp->chars = nchars;
501 wp->words = nwords;
502 wp->lines = nlines;
503 return 0;
504 }
505
506