xref: /titanic_41/usr/src/lib/libcmd/common/wclib.c (revision d2b5b2d357ee3172eacb6860be1891259902203d)
1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 1992-2010 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                  Common Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *            http://www.opensource.org/licenses/cpl1.0.txt             *
11 *         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                  David Korn <dgk@research.att.com>                   *
19 *                                                                      *
20 ***********************************************************************/
21 #pragma prototyped
22 /*
23  * David Korn
24  * AT&T Bell Laboratories
25  *
26  * library interface for word count
27  */
28 
29 #include <cmd.h>
30 #include <wc.h>
31 #include <ctype.h>
32 
33 #if _hdr_wchar && _hdr_wctype && _lib_iswctype
34 
35 #include <wchar.h>
36 #include <wctype.h>
37 #include <lc.h>
38 
39 #else
40 
41 #ifndef iswspace
42 #define iswspace(x)	isspace(x)
43 #endif
44 
45 #endif
46 
/*
 * classification flags kept in the Wc_t type[] table, one entry per
 * byte value; for the utf8 lead bytes wc_init() adds a small count to
 * WC_MB, which wc_count() reads back with (c&7)
 */
#define	WC_SP		0x08	/* byte is white space */
#define	WC_NL		0x10	/* byte is new-line */
#define	WC_MB		0x20	/* byte belongs to a multibyte sequence */
#define	WC_ERR		0x40	/* byte that wc_init() marks as invalid */

/* flag tests on a type[] entry <c> */
#define eol(c)		((c)&WC_NL)
#define mbc(c)		((c)&WC_MB)
#define spc(c)		((c)&WC_SP)

/*
 * call the ast multibyte converter on at most <n> bytes at <p>, storing
 * the result in <w>; arguments are parenthesized so that expressions
 * can be passed safely
 */
#define mb2wc(w,p,n)	((*ast.mb_towc)(&(w),(char*)(p),(n)))
56 
57 Wc_t* wc_init(int mode)
58 {
59 	register int	n;
60 	register int	w;
61 	Wc_t*		wp;
62 
63 	if (!(wp = (Wc_t*)stakalloc(sizeof(Wc_t))))
64 		return 0;
65 	if (!mbwide())
66 		wp->mb = 0;
67 #if _hdr_wchar && _hdr_wctype && _lib_iswctype
68 	else if (!(mode & WC_NOUTF8) && (lcinfo(LC_CTYPE)->lc->flags & LC_utf8))
69 		wp->mb = 1;
70 #endif
71 	else
72 		wp->mb = -1;
73 	w = mode & WC_WORDS;
74 	for (n = (1<<CHAR_BIT); --n >= 0;)
75 		wp->type[n] = (w && isspace(n)) ? WC_SP : 0;
76 	wp->type['\n'] = WC_SP|WC_NL;
77 	if ((mode & (WC_MBYTE|WC_WORDS)) && wp->mb > 0)
78 	{
79 		for (n = 0; n < 64; n++)
80 		{
81 			wp->type[0x80+n] |= WC_MB;
82 			if (n<32)
83 				wp->type[0xc0+n] |= WC_MB+1;
84 			else if (n<48)
85 				wp->type[0xc0+n] |= WC_MB+2;
86 			else if (n<56)
87 				wp->type[0xc0+n] |= WC_MB+3;
88 			else if (n<60)
89 				wp->type[0xc0+n] |= WC_MB+4;
90 			else if (n<62)
91 				wp->type[0xc0+n] |= WC_MB+5;
92 		}
93 		wp->type[0xc0] = WC_MB|WC_ERR;
94 		wp->type[0xc1] = WC_MB|WC_ERR;
95 		wp->type[0xfe] = WC_MB|WC_ERR;
96 		wp->type[0xff] = WC_MB|WC_ERR;
97 	}
98 	wp->mode = mode;
99 	return wp;
100 }
101 
102 static int invalid(const char *file, int nlines)
103 {
104 	error_info.file = (char*)file;
105 	error_info.line = nlines;
106 	error(ERROR_SYSTEM|1, "invalid multibyte character");
107 	error_info.file = 0;
108 	error_info.line = 0;
109 	return nlines;
110 }
111 
/*
 * handle utf space characters
 *
 * advance the recognizer for multibyte utf8 white space by one
 * continuation byte <c>, starting from <state>
 *
 * wc_count() enters with the state chosen from the lead byte:
 *	1	lead byte 0xe1
 *	2	lead byte 0xe2
 *	3	lead byte 0xe3
 *	8	lead byte 0xc2; <c> then equals the code point
 * the states produced here are:
 *	4	0xe1 0x9a seen, 0x80 completes U+1680
 *	5	0xe3 0x80 seen, 0x80 completes U+3000
 *	6	0xe2 0x80 seen, last byte selects within U+2000-U+203F
 *	7	0xe2 0x81 seen, 0x9f completes U+205F
 *
 * returns 10 when the sequence is a space character, 0 when it cannot
 * be one, and otherwise the next state; a <state> not listed above is
 * returned unchanged
 */

static int chkstate(int state, register unsigned int c)
{
	switch(state)
	{
	case 1:
		state = (c==0x9a?4:0);
		break;
	case 2:
		/* 0x80 -> state 6, 0x81 -> state 7 */
		state = ((c==0x80||c==0x81)?6+(c&1):0);
		break;
	case 3:
		state = (c==0x80?5:0);
		break;
	case 4:
		state = (c==0x80?10:0);
		break;
	case 5:
		state = (c==0x80?10:0);
		break;
	case 6:
		state = 0;
		/*
		 * U+2028 and U+2029 (line and paragraph separator) end in
		 * 0xa8 and 0xa9; 0xa0 and 0xa1 would be U+2020 and U+2021,
		 * the dagger signs, which are not white space
		 */
		if(c==0xa8 || c==0xa9)
			return(10);
		else if((c&0xf0)== 0x80)
		{
			/* U+2000-U+200F; U+2007 is left to the locale */
			if((c&=0xf)==7)
				return(iswspace(0x2007)?10:0);
			if(c<=0xb)
				return(10);
		}
		else if(c==0xaf && iswspace(0x202f))
			return(10);
		break;
	case 7:
		state = (c==0x9f?10:0);
		break;
	case 8:
		return (iswspace(c)?10:0);
	}
	return state;
}
157 
158 /*
159  * compute the line, word, and character count for file <fd>
160  */
161 
162 int wc_count(Wc_t *wp, Sfio_t *fd, const char* file)
163 {
164 	register char*		type = wp->type;
165 	register unsigned char*	cp;
166 	register Sfoff_t	nbytes;
167 	register Sfoff_t	nchars;
168 	register Sfoff_t	nwords;
169 	register Sfoff_t	nlines;
170 	register Sfoff_t	eline = -1;
171 	register Sfoff_t	longest = 0;
172 	register ssize_t	c;
173 	register unsigned char*	endbuff;
174 	register int		lasttype = WC_SP;
175 	unsigned int		lastchar;
176 	ssize_t			n;
177 	ssize_t			o;
178 	unsigned char*		buff;
179 	wchar_t			x;
180 	unsigned char		side[32];
181 
182 	sfset(fd,SF_WRITE,1);
183 	nlines = nwords = nchars = nbytes = 0;
184 	wp->longest = 0;
185 	if (wp->mb < 0 && (wp->mode & (WC_MBYTE|WC_WORDS)))
186 	{
187 		cp = buff = endbuff = 0;
188 		for (;;)
189 		{
190 			if (cp >= endbuff || (n = mb2wc(x, cp, endbuff-cp)) < 0)
191 			{
192 				if ((o = endbuff-cp) < sizeof(side))
193 				{
194 					if (buff)
195 					{
196 						if (o)
197 							memcpy(side, cp, o);
198 						mbinit();
199 					}
200 					else
201 						o = 0;
202 					cp = side + o;
203 					if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) || (n = sfvalue(fd)) <= 0)
204 					{
205 						if ((nchars - longest) > wp->longest)
206 							wp->longest = nchars - longest;
207 						break;
208 					}
209 					nbytes += n;
210 					if ((c = sizeof(side) - o) > n)
211 						c = n;
212 					if (c)
213 						memcpy(cp, buff, c);
214 					endbuff = buff + n;
215 					cp = side;
216 					x = mbchar(cp);
217 					if ((cp-side) < o)
218 					{
219 						cp = buff;
220 						nchars += (cp-side) - 1;
221 					}
222 					else
223 						cp = buff + (cp-side) - o;
224 				}
225 				else
226 				{
227 					cp++;
228 					x = -1;
229 				}
230 				if (x == -1 && eline != nlines && !(wp->mode & WC_QUIET))
231 					eline = invalid(file, nlines);
232 			}
233 			else
234 				cp += n ? n : 1;
235 			if (x == '\n')
236 			{
237 				if ((nchars - longest) > wp->longest)
238 					wp->longest = nchars - longest;
239 				longest = nchars + 1;
240 				nlines++;
241 				lasttype = 1;
242 			}
243 			else if (iswspace(x))
244 				lasttype = 1;
245 			else if (lasttype)
246 			{
247 				lasttype = 0;
248 				nwords++;
249 			}
250 			nchars++;
251 		}
252 		if (!(wp->mode & WC_MBYTE))
253 			nchars = nbytes;
254 	}
255 	else if (!wp->mb && !(wp->mode & WC_LONGEST) || wp->mb > 0 && !(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
256 	{
257 		if (!(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
258 		{
259 			while ((cp = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
260 			{
261 				nchars += c;
262 				endbuff = cp + c;
263 				if (*--endbuff == '\n')
264 					nlines++;
265 				else
266 					*endbuff = '\n';
267 				for (;;)
268 					if (*cp++ == '\n')
269 					{
270 						if (cp > endbuff)
271 							break;
272 						nlines++;
273 					}
274 			}
275 		}
276 		else
277 		{
278 			while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
279 			{
280 				nchars += c;
281 				/* check to see whether first character terminates word */
282 				if (c==1)
283 				{
284 					if (eol(lasttype))
285 						nlines++;
286 					if ((c = type[*cp]) && !lasttype)
287 						nwords++;
288 					lasttype = c;
289 					continue;
290 				}
291 				if (!lasttype && type[*cp])
292 					nwords++;
293 				lastchar = cp[--c];
294 				*(endbuff = cp+c) = '\n';
295 				c = lasttype;
296 				/* process each buffer */
297 				for (;;)
298 				{
299 					/* process spaces and new-lines */
300 					do
301 					{
302 						if (eol(c))
303 							for (;;)
304 							{
305 								/* check for end of buffer */
306 								if (cp > endbuff)
307 									goto beob;
308 								nlines++;
309 								if (*cp != '\n')
310 									break;
311 								cp++;
312 							}
313 					} while (c = type[*cp++]);
314 					/* skip over word characters */
315 					while (!(c = type[*cp++]));
316 					nwords++;
317 				}
318 			beob:
319 				if ((cp -= 2) >= buff)
320 					c = type[*cp];
321 				else
322 					c = lasttype;
323 				lasttype = type[lastchar];
324 				/* see if was in word */
325 				if (!c && !lasttype)
326 					nwords--;
327 			}
328 			if (eol(lasttype))
329 				nlines++;
330 			else if (!lasttype)
331 				nwords++;
332 		}
333 	}
334 	else
335 	{
336 		int		lineoff=0;
337 		int		skip=0;
338 		int		adjust=0;
339 		int		state=0;
340 		int		oldc;
341 		int		xspace;
342 		int		wasspace = 1;
343 		unsigned char*	start;
344 
345 		lastchar = 0;
346 		start = (endbuff = side) + 1;
347 		xspace = iswspace(0xa0) || iswspace(0x85);
348 		while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
349 		{
350 			nbytes += c;
351 			nchars += c;
352 			start = cp-lineoff;
353 			/* check to see whether first character terminates word */
354 			if(c==1)
355 			{
356 				if(eol(lasttype))
357 					nlines++;
358 				if((c = type[*cp]) && !lasttype)
359 					nwords++;
360 				lasttype = c;
361 				endbuff = start;
362 				continue;
363 			}
364 			lastchar = cp[--c];
365 			endbuff = cp+c;
366 			cp[c] = '\n';
367 			if(mbc(lasttype))
368 			{
369 				c = lasttype;
370 				goto mbyte;
371 			}
372 			if(!lasttype && spc(type[*cp]))
373 				nwords++;
374 			c = lasttype;
375 			/* process each buffer */
376 			for (;;)
377 			{
378 				/* process spaces and new-lines */
379 			spaces:
380 				do
381 				{
382 					if (eol(c))
383 					{
384 						/* check for end of buffer */
385 						if (cp > endbuff)
386 							goto eob;
387 						if(wp->mode&WC_LONGEST)
388 						{
389 							if((cp-start)-adjust > longest)
390 								longest = (cp-start)-adjust-1;
391 							start = cp;
392 						}
393 						nlines++;
394 						nchars -= adjust;
395 						adjust = 0;
396 					}
397 				} while (spc(c = type[*cp++]));
398 				wasspace=1;
399 				if(mbc(c))
400 				{
401 				mbyte:
402 					do
403 					{
404 						if(c&WC_ERR)
405 							goto err;
406 						if(skip && (c&7))
407 							break;
408 						if(!skip)
409 						{
410 							if(!(c&7))
411 							{
412 								skip=1;
413 								break;
414 							}
415 							skip = (c&7);
416 							adjust += skip;
417 							state = 0;
418 							if(skip==2 && (cp[-1]&0xc)==0 && (state=(cp[-1]&0x3)))
419 								oldc = *cp;
420 							else if(xspace && cp[-1]==0xc2)
421 							{
422 								state = 8;
423 								oldc = *cp;
424 							}
425 						}
426 						else
427 						{
428 							skip--;
429 							if(state && (state=chkstate(state,oldc)))
430 							{
431 								if(state==10)
432 								{
433 									if(!wasspace)
434 										nwords++;
435 									wasspace = 1;
436 									state=0;
437 									goto spaces;
438 								}
439 								oldc = *cp;
440 							}
441 						}
442 					} while (mbc(c = type[*cp++]));
443 					wasspace = 0;
444 					if(skip)
445 					{
446 						if(eol(c) && (cp > endbuff))
447 							goto eob;
448 				err:
449 						skip = 0;
450 						state = 0;
451 						if(eline!=nlines && !(wp->mode & WC_QUIET))
452 							eline = invalid(file, nlines);
453 						while(mbc(c) && ((c|WC_ERR) || (c&7)==0))
454 							c=type[*cp++];
455 						if(eol(c) && (cp > endbuff))
456 						{
457 							c = WC_MB|WC_ERR;
458 							goto eob;
459 						}
460 						if(mbc(c))
461 							goto mbyte;
462 						else if(c&WC_SP)
463 							goto spaces;
464 					}
465 					if(spc(c))
466 					{
467 						nwords++;
468 						continue;
469 					}
470 				}
471 				/* skip over word characters */
472 				while(!(c = type[*cp++]));
473 				if(mbc(c))
474 					goto mbyte;
475 				nwords++;
476 			}
477 		eob:
478 			lineoff = cp-start;
479 			if((cp -= 2) >= buff)
480 				c = type[*cp];
481 			else
482 				c = lasttype;
483 			lasttype = type[lastchar];
484 			/* see if was in word */
485 			if(!c && !lasttype)
486 				nwords--;
487 		}
488 		if ((wp->mode&WC_LONGEST) && ((endbuff + 1 - start) - adjust - (lastchar == '\n')) > longest)
489 			longest = (endbuff + 1 - start) - adjust - (lastchar == '\n');
490 		wp->longest = longest;
491 		if (eol(lasttype))
492 			nlines++;
493 		else if (!lasttype)
494 			nwords++;
495 		if (wp->mode & WC_MBYTE)
496 			nchars -= adjust;
497 		else
498 			nchars = nbytes;
499 	}
500 	wp->chars = nchars;
501 	wp->words = nwords;
502 	wp->lines = nlines;
503 	return 0;
504 }
505 
506