1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1992-2011 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Eclipse Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.eclipse.org/org/documents/epl-v10.html *
11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * *
20 ***********************************************************************/
21 /*
22 * Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved.
23 */
24 #pragma prototyped
25 /*
26 * David Korn
27 * AT&T Bell Laboratories
28 *
29 * library interface for word count
30 */
31
32 #include <cmd.h>
33 #include <wc.h>
34 #include <ctype.h>
35
36 #if _hdr_wchar && _hdr_wctype && _lib_iswctype
37
38 #include <wchar.h>
39 #include <wctype.h>
40 #include <lc.h>
41
42 #else
43
44 #ifndef iswspace
45 #define iswspace(x) isspace(x)
46 #endif
47
48 #endif
49
/*
 * per-byte classification flags stored in the Wc_t type[] table;
 * for bytes flagged WC_MB the low three bits (value & 7) hold the
 * number of bytes that still follow a lead byte
 */
#define WC_SP		0x08	/* byte is white space */
#define WC_NL		0x10	/* byte is a new-line */
#define WC_MB		0x20	/* byte is part of a multibyte sequence */
#define WC_ERR		0x40	/* byte can never appear in a valid sequence */

/* flag tests on a type[] entry; each yields the flag bit or 0 */
#define eol(c)		((c)&WC_NL)
#define mbc(c)		((c)&WC_MB)
#define spc(c)		((c)&WC_SP)

/*
 * convert the multibyte sequence of at most n bytes at p into the
 * wide character w using the locale's conversion function;
 * the arguments are parenthesized so expression arguments expand safely
 */
#define mb2wc(w,p,n)	(*ast.mb_towc)(&(w),(char*)(p),(n))
59
wc_init(int mode)60 Wc_t* wc_init(int mode)
61 {
62 register int n;
63 register int w;
64 Wc_t* wp;
65
66 if (!(wp = (Wc_t*)stakalloc(sizeof(Wc_t))))
67 return 0;
68 if (!mbwide())
69 wp->mb = 0;
70 #if _hdr_wchar && _hdr_wctype && _lib_iswctype
71 else if (!(mode & WC_NOUTF8) && (lcinfo(LC_CTYPE)->lc->flags & LC_utf8))
72 wp->mb = 1;
73 #endif
74 else
75 wp->mb = -1;
76 w = mode & WC_WORDS;
77 for (n = (1<<CHAR_BIT); --n >= 0;)
78 wp->type[n] = (w && isspace(n)) ? WC_SP : 0;
79 wp->type['\n'] = WC_SP|WC_NL;
80 if ((mode & (WC_MBYTE|WC_WORDS)) && wp->mb > 0)
81 {
82 for (n = 0; n < 64; n++)
83 {
84 wp->type[0x80+n] |= WC_MB;
85 if (n<32)
86 wp->type[0xc0+n] |= WC_MB+1;
87 else if (n<48)
88 wp->type[0xc0+n] |= WC_MB+2;
89 else if (n<56)
90 wp->type[0xc0+n] |= WC_MB+3;
91 else if (n<60)
92 wp->type[0xc0+n] |= WC_MB+4;
93 else if (n<62)
94 wp->type[0xc0+n] |= WC_MB+5;
95 }
96 wp->type[0xc0] = WC_MB|WC_ERR;
97 wp->type[0xc1] = WC_MB|WC_ERR;
98 wp->type[0xfe] = WC_MB|WC_ERR;
99 wp->type[0xff] = WC_MB|WC_ERR;
100 }
101 wp->mode = mode;
102 return wp;
103 }
104
invalid(const char * file,int nlines)105 static int invalid(const char *file, int nlines)
106 {
107 error_info.file = (char*)file;
108 error_info.line = nlines;
109 error(ERROR_SYSTEM|1, "invalid multibyte character");
110 error_info.file = 0;
111 error_info.line = 0;
112 return nlines;
113 }
114
/*
 * handle utf space characters
 *
 * advance the multibyte space recognizer by one byte
 *
 * state	current recognizer state; wc_count() starts it at 1, 2 or 3
 *		for the lead bytes 0xe1, 0xe2, 0xe3 and at 8 for 0xc2
 * c		the next byte of the sequence
 *
 * returns 10 when the sequence is a space character, 0 when it cannot
 * be one, or the next intermediate state; a state with no case below
 * is returned unchanged
 *
 *	1 -> 4 -> 10	e1 9a 80	U+1680
 *	2 -> 6 -> 10	e2 80 80..8b	U+2000..U+200B (U+2007 only if iswspace)
 *			e2 80 af	U+202F (only if iswspace)
 *	2 -> 7 -> 10	e2 81 9f	U+205F
 *	3 -> 5 -> 10	e3 80 80	U+3000
 *	8 -> 10		c2 xx		when iswspace(xx)
 *
 * NOTE(review): in state 6 the bytes 0xa0/0xa1 complete e2 80 a0 and
 * e2 80 a1, which decode to U+2020/U+2021; the line and paragraph
 * separators U+2028/U+2029 would be 0xa8/0xa9 -- confirm which was meant
 */

static int chkstate(int state, register unsigned int c)
{
	unsigned int	low;

	switch (state)
	{
	case 1:
		return c == 0x9a ? 4 : 0;
	case 2:
		if (c == 0x80)
			return 6;
		if (c == 0x81)
			return 7;
		return 0;
	case 3:
		return c == 0x80 ? 5 : 0;
	case 4:
	case 5:
		return c == 0x80 ? 10 : 0;
	case 6:
		if (c == 0xa0 || c == 0xa1)
			return 10;
		if ((c & 0xf0) == 0x80)
		{
			low = c & 0xf;
			if (low == 7)
				return iswspace(0x2007) ? 10 : 0;
			return low <= 0xb ? 10 : 0;
		}
		if (c == 0xaf && iswspace(0x202f))
			return 10;
		return 0;
	case 7:
		return c == 0x9f ? 10 : 0;
	case 8:
		return iswspace(c) ? 10 : 0;
	}
	return state;
}
160
161 /*
162 * compute the line, word, and character count for file <fd>
163 */
164
wc_count(Wc_t * wp,Sfio_t * fd,const char * file)165 int wc_count(Wc_t *wp, Sfio_t *fd, const char* file)
166 {
167 register char* type = wp->type;
168 register unsigned char* cp;
169 register Sfoff_t nbytes;
170 register Sfoff_t nchars;
171 register Sfoff_t nwords;
172 register Sfoff_t nlines;
173 register Sfoff_t eline = -1;
174 register Sfoff_t longest = 0;
175 register ssize_t c;
176 register unsigned char* endbuff;
177 register int lasttype = WC_SP;
178 unsigned int lastchar;
179 ssize_t n;
180 ssize_t o;
181 unsigned char* buff;
182 wchar_t x;
183 unsigned char side[32];
184
185 sfset(fd,SF_WRITE,1);
186 nlines = nwords = nchars = nbytes = 0;
187 wp->longest = 0;
188 if (wp->mb < 0 && (wp->mode & (WC_MBYTE|WC_WORDS)))
189 {
190 cp = buff = endbuff = 0;
191 for (;;)
192 {
193 if (cp >= endbuff || (n = mb2wc(x, cp, endbuff-cp)) < 0)
194 {
195 if ((o = endbuff-cp) < sizeof(side))
196 {
197 if (buff)
198 {
199 if (o)
200 memcpy(side, cp, o);
201 mbinit();
202 }
203 else
204 o = 0;
205 cp = side + o;
206 if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) || (n = sfvalue(fd)) <= 0)
207 {
208 if ((nchars - longest) > wp->longest)
209 wp->longest = nchars - longest;
210 break;
211 }
212 nbytes += n;
213 if ((c = sizeof(side) - o) > n)
214 c = n;
215 if (c)
216 memcpy(cp, buff, c);
217 endbuff = buff + n;
218 cp = side;
219 x = mbchar(cp);
220 if ((cp-side) < o)
221 {
222 cp = buff;
223 nchars += (cp-side) - 1;
224 }
225 else
226 cp = buff + (cp-side) - o;
227 }
228 else
229 {
230 cp++;
231 x = -1;
232 }
233 if (x == -1 && eline != nlines && !(wp->mode & WC_QUIET))
234 eline = invalid(file, nlines);
235 }
236 else
237 cp += n ? n : 1;
238 if (x == '\n')
239 {
240 if ((nchars - longest) > wp->longest)
241 wp->longest = nchars - longest;
242 longest = nchars + 1;
243 nlines++;
244 lasttype = 1;
245 }
246 else if (iswspace(x))
247 lasttype = 1;
248 else if (lasttype)
249 {
250 lasttype = 0;
251 nwords++;
252 }
253 nchars++;
254 }
255 if (!(wp->mode & WC_MBYTE))
256 nchars = nbytes;
257 }
258 else if (!wp->mb && !(wp->mode & WC_LONGEST) || wp->mb > 0 && !(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
259 {
260 if (!(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
261 {
262 while ((cp = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
263 {
264 nchars += c;
265 endbuff = cp + c;
266 if (*--endbuff == '\n')
267 nlines++;
268 else
269 *endbuff = '\n';
270 for (;;)
271 if (*cp++ == '\n')
272 {
273 if (cp > endbuff)
274 break;
275 nlines++;
276 }
277 }
278 }
279 else
280 {
281 while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
282 {
283 nchars += c;
284 /* check to see whether first character terminates word */
285 if (c==1)
286 {
287 if (eol(lasttype))
288 nlines++;
289 if ((c = type[*cp]) && !lasttype)
290 nwords++;
291 lasttype = c;
292 continue;
293 }
294 if (!lasttype && type[*cp])
295 nwords++;
296 lastchar = cp[--c];
297 *(endbuff = cp+c) = '\n';
298 c = lasttype;
299 /* process each buffer */
300 for (;;)
301 {
302 /* process spaces and new-lines */
303 do
304 {
305 if (eol(c))
306 for (;;)
307 {
308 /* check for end of buffer */
309 if (cp > endbuff)
310 goto beob;
311 nlines++;
312 if (*cp != '\n')
313 break;
314 cp++;
315 }
316 } while (c = type[*cp++]);
317 /* skip over word characters */
318 while (!(c = type[*cp++]));
319 nwords++;
320 }
321 beob:
322 if ((cp -= 2) >= buff)
323 c = type[*cp];
324 else
325 c = lasttype;
326 lasttype = type[lastchar];
327 /* see if was in word */
328 if (!c && !lasttype)
329 nwords--;
330 }
331 if (eol(lasttype))
332 nlines++;
333 else if (!lasttype)
334 nwords++;
335 }
336 }
337 else
338 {
339 int lineoff=0;
340 int skip=0;
341 int adjust=0;
342 int state=0;
343 int oldc;
344 int xspace;
345 int wasspace = 1;
346 unsigned char* start;
347 int flagm = 0;
348
349
350 lastchar = 0;
351 start = (endbuff = side) + 1;
352 xspace = iswspace(0xa0) || iswspace(0x85);
353 while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
354 {
355 nbytes += c;
356 nchars += c;
357 start = cp-lineoff;
358 /* check to see whether first character terminates word */
359 if(c==1)
360 {
361 if(eol(lasttype))
362 nlines++;
363 if((c = type[*cp]) && !lasttype)
364 nwords++;
365 lasttype = c;
366 endbuff = start;
367 continue;
368 }
369 lastchar = cp[--c];
370 endbuff = cp+c;
371 cp[c] = '\n';
372 if(mbc(lasttype))
373 {
374 c = lasttype;
375 flagm = 1;
376 goto mbyte;
377 }
378 if(!lasttype && spc(type[*cp]))
379 nwords++;
380 c = lasttype;
381 /* process each buffer */
382 for (;;)
383 {
384 /* process spaces and new-lines */
385 spaces:
386 do
387 {
388 if (eol(c))
389 {
390 /* check for end of buffer */
391 if (cp > endbuff)
392 goto eob;
393 if(wp->mode&WC_LONGEST)
394 {
395 if((cp-start)-adjust > longest)
396 longest = (cp-start)-adjust-1;
397 start = cp;
398 }
399 nlines++;
400 nchars -= adjust;
401 adjust = 0;
402 }
403 } while (spc(c = type[*cp++]));
404 wasspace=1;
405 if(mbc(c))
406 {
407 mbyte:
408 do
409 {
410 if(c&WC_ERR)
411 goto err;
412 if(skip && (c&7))
413 break;
414 if(!skip)
415 {
416 if(!(c&7))
417 {
418 skip=1;
419 break;
420 }
421 skip = (c&7);
422 adjust += skip;
423 state = 0;
424 if (flagm == 1) {
425 flagm = 0;
426 oldc = *cp;
427 if (xspace && (
428 iswspace
429 (*cp)
430 == 1)) {
431 state
432 = 8;
433 }
434 continue;
435 }
436 if(skip==2 && (cp[-1]&0xc)==0 && (state=(cp[-1]&0x3)))
437 oldc = *cp;
438 else if(xspace && cp[-1]==0xc2)
439 {
440 state = 8;
441 oldc = *cp;
442 }
443 }
444 else
445 {
446 skip--;
447 if(state && (state=chkstate(state,oldc)))
448 {
449 if(state==10)
450 {
451 if(!wasspace)
452 nwords++;
453 wasspace = 1;
454 state=0;
455 goto spaces;
456 }
457 oldc = *cp;
458 }
459 }
460 } while (mbc(c = type[*cp++]));
461 wasspace = 0;
462 if(skip)
463 {
464 if(eol(c) && (cp > endbuff))
465 goto eob;
466 err:
467 skip = 0;
468 state = 0;
469 if(eline!=nlines && !(wp->mode & WC_QUIET))
470 eline = invalid(file, nlines);
471 while(mbc(c) && ((c|WC_ERR) || (c&7)==0))
472 c=type[*cp++];
473 if(eol(c) && (cp > endbuff))
474 {
475 c = WC_MB|WC_ERR;
476 goto eob;
477 }
478 if(mbc(c))
479 goto mbyte;
480 else if(c&WC_SP)
481 goto spaces;
482 }
483 if(spc(c))
484 {
485 nwords++;
486 continue;
487 }
488 }
489 /* skip over word characters */
490 while(!(c = type[*cp++]));
491 if(mbc(c))
492 goto mbyte;
493 nwords++;
494 }
495 eob:
496 lineoff = cp-start;
497 if((cp -= 2) >= buff)
498 c = type[*cp];
499 else
500 c = lasttype;
501 lasttype = type[lastchar];
502 /* see if was in word */
503 if(!c && !lasttype)
504 nwords--;
505 }
506 if ((wp->mode&WC_LONGEST) && ((endbuff + 1 - start) - adjust - (lastchar == '\n')) > longest)
507 longest = (endbuff + 1 - start) - adjust - (lastchar == '\n');
508 wp->longest = longest;
509 if (eol(lasttype))
510 nlines++;
511 else if (!lasttype)
512 nwords++;
513 if (wp->mode & WC_MBYTE)
514 nchars -= adjust;
515 else
516 nchars = nbytes;
517 }
518 wp->chars = nchars;
519 wp->words = nwords;
520 wp->lines = nlines;
521 return 0;
522 }
523
524