xref: /titanic_44/usr/src/lib/libcmd/common/wclib.c (revision 888e055994b8b0dc77b98c53dd97026237caec5d)
1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 1992-2009 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                  Common Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *            http://www.opensource.org/licenses/cpl1.0.txt             *
11 *         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                  David Korn <dgk@research.att.com>                   *
19 *                                                                      *
20 ***********************************************************************/
21 #pragma prototyped
22 /*
23  * David Korn
24  * AT&T Bell Laboratories
25  *
26  * library interface for word count
27  */
28 
29 #include <cmd.h>
30 #include <wc.h>
31 #include <ctype.h>
32 
33 #if _hdr_wchar && _hdr_wctype && _lib_iswctype
34 
35 #include <wchar.h>
36 #include <wctype.h>
37 #include <lc.h>
38 
39 #else
40 
41 #ifndef iswspace
42 #define iswspace(x)	isspace(x)
43 #endif
44 
45 #endif
46 
/*
 * byte classification flags stored in the Wc_t type[] table
 * for WC_MB lead bytes the low three bits additionally hold the
 * number of trailing bytes that are expected to follow
 */
#define	WC_SP		0x08	/* space byte (word separator)		*/
#define	WC_NL		0x10	/* newline byte				*/
#define	WC_MB		0x20	/* byte of a multibyte sequence		*/
#define	WC_ERR		0x40	/* byte that is never valid		*/

/* classification tests: each yields the flag bit or 0 */
#define eol(c)		((c)&WC_NL)
#define mbc(c)		((c)&WC_MB)
#define spc(c)		((c)&WC_SP)

/*
 * convert at most n bytes at p into the wide character w
 * the arguments are parenthesized so that expressions may be passed
 */
#define mbwc(w,p,n)	(*ast.mb_towc)(&(w),(char*)(p),(n))
56 
57 Wc_t* wc_init(int mode)
58 {
59 	register int	n;
60 	register int	w;
61 	Wc_t*		wp;
62 
63 	if (!(wp = (Wc_t*)stakalloc(sizeof(Wc_t))))
64 		return 0;
65 	if (!mbwide())
66 		wp->mb = 0;
67 #if _hdr_wchar && _hdr_wctype && _lib_iswctype
68 	else if (!(mode & WC_NOUTF8) && (lcinfo(LC_CTYPE)->lc->flags & LC_utf8))
69 		wp->mb = 1;
70 #endif
71 	else
72 		wp->mb = -1;
73 	w = mode & WC_WORDS;
74 	for (n = (1<<CHAR_BIT); --n >= 0;)
75 		wp->type[n] = (w && isspace(n)) ? WC_SP : 0;
76 	wp->type['\n'] = WC_SP|WC_NL;
77 	if ((mode & (WC_MBYTE|WC_WORDS)) && wp->mb > 0)
78 	{
79 		for (n = 0; n < 64; n++)
80 		{
81 			wp->type[0x80+n] |= WC_MB;
82 			if (n<32)
83 				wp->type[0xc0+n] |= WC_MB+1;
84 			else if (n<48)
85 				wp->type[0xc0+n] |= WC_MB+2;
86 			else if (n<56)
87 				wp->type[0xc0+n] |= WC_MB+3;
88 			else if (n<60)
89 				wp->type[0xc0+n] |= WC_MB+4;
90 			else if (n<62)
91 				wp->type[0xc0+n] |= WC_MB+5;
92 		}
93 		wp->type[0xc0] = WC_MB|WC_ERR;
94 		wp->type[0xc1] = WC_MB|WC_ERR;
95 		wp->type[0xfe] = WC_MB|WC_ERR;
96 		wp->type[0xff] = WC_MB|WC_ERR;
97 	}
98 	wp->mode = mode;
99 	return wp;
100 }
101 
102 static int invalid(const char *file, int nlines)
103 {
104 	error_info.file = (char*)file;
105 	error_info.line = nlines;
106 	error(ERROR_SYSTEM|1, "invalid multibyte character");
107 	error_info.file = 0;
108 	error_info.line = 0;
109 	return nlines;
110 }
111 
/*
 * handle utf space characters
 *
 * advance the recognizer for multibyte utf8 space characters by the
 * trailing byte <c>; <state> names the bytes consumed so far:
 *
 *	1	0xe1
 *	2	0xe2
 *	3	0xe3
 *	4	0xe1 0x9a
 *	5	0xe3 0x80
 *	6	0xe2 0x80
 *	7	0xe2 0x81
 *	8	0xc2
 *
 * returns the next state, 10 when the completed sequence is a space
 * character, or 0 when the sequence cannot be a space character
 * any other <state> is returned unchanged
 */

static int chkstate(int state, register unsigned int c)
{
	switch(state)
	{
	case 1:
		state = (c==0x9a?4:0);
		break;
	case 2:
		/* 0x80 leads to state 6 and 0x81 to state 7 */
		state = ((c==0x80||c==0x81)?6+(c&1):0);
		break;
	case 3:
		state = (c==0x80?5:0);
		break;
	case 4:
		/* 0xe1 0x9a 0x80 is U+1680 */
		state = (c==0x80?10:0);
		break;
	case 5:
		/* 0xe3 0x80 0x80 is U+3000 */
		state = (c==0x80?10:0);
		break;
	case 6:
		state = 0;
		/*
		 * U+2028 and U+2029 (line and paragraph separator) end in
		 * 0xa8 and 0xa9; 0xa0 and 0xa1 would be U+2020 and U+2021,
		 * the dagger characters, which are not spaces
		 */
		if(c==0xa8 || c==0xa9)
			return(10);
		else if((c&0xf0)== 0x80)
		{
			/* U+2000..U+200b, with U+2007 left to the locale */
			if((c&=0xf)==7)
				return(iswspace(0x2007)?10:0);
			if(c<=0xb)
				return(10);
		}
		else if(c==0xaf && iswspace(0x202f))
			return(10);
		break;
	case 7:
		/* 0xe2 0x81 0x9f is U+205f */
		state = (c==0x9f?10:0);
		break;
	case 8:
		/* after 0xc2 the trailing byte equals the code point */
		return (iswspace(c)?10:0);
	}
	return state;
}
157 
158 /*
159  * compute the line, word, and character count for file <fd>
160  */
161 
162 int wc_count(Wc_t *wp, Sfio_t *fd, const char* file)
163 {
164 	register char*		type = wp->type;
165 	register unsigned char*	cp;
166 	register Sfoff_t	nchars;
167 	register Sfoff_t	nwords;
168 	register Sfoff_t	nlines;
169 	register Sfoff_t	eline = -1;
170 	register Sfoff_t	longest = 0;
171 	register ssize_t	c;
172 	register unsigned char*	endbuff;
173 	register int		lasttype = WC_SP;
174 	unsigned int		lastchar;
175 	ssize_t			n;
176 	ssize_t			o;
177 	unsigned char*		buff;
178 	wchar_t			x;
179 	unsigned char		side[32];
180 
181 	sfset(fd,SF_WRITE,1);
182 	nlines = nwords = nchars = 0;
183 	wp->longest = 0;
184 	if (wp->mb < 0 && (wp->mode & (WC_MBYTE|WC_WORDS)))
185 	{
186 		cp = buff = endbuff = 0;
187 		for (;;)
188 		{
189 			if (cp >= endbuff || (n = mbwc(x, cp, endbuff-cp)) < 0)
190 			{
191 				if ((o = endbuff-cp) < sizeof(side))
192 				{
193 					if (buff)
194 					{
195 						if (o)
196 							memcpy(side, cp, o);
197 						mbinit();
198 					}
199 					else
200 						o = 0;
201 					cp = side + o;
202 					if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) || (n = sfvalue(fd)) <= 0)
203 					{
204 						if ((nchars - longest) > wp->longest)
205 							wp->longest = nchars - longest;
206 						break;
207 					}
208 					if ((c = sizeof(side) - o) > n)
209 						c = n;
210 					if (c)
211 						memcpy(cp, buff, c);
212 					endbuff = buff + n;
213 					cp = side;
214 					x = mbchar(cp);
215 					if ((cp-side) < o)
216 					{
217 						cp = buff;
218 						nchars += (cp-side) - 1;
219 					}
220 					else
221 						cp = buff + (cp-side) - o;
222 				}
223 				else
224 				{
225 					cp++;
226 					x = -1;
227 				}
228 				if (x == -1 && eline != nlines && !(wp->mode & WC_QUIET))
229 					eline = invalid(file, nlines);
230 			}
231 			else
232 				cp += n ? n : 1;
233 			if (x == '\n')
234 			{
235 				if ((nchars - longest) > wp->longest)
236 					wp->longest = nchars - longest;
237 				longest = nchars + 1;
238 				nlines++;
239 				lasttype = 1;
240 			}
241 			else if (iswspace(x))
242 				lasttype = 1;
243 			else if (lasttype)
244 			{
245 				lasttype = 0;
246 				nwords++;
247 			}
248 			nchars++;
249 		}
250 	}
251 	else if (!wp->mb && !(wp->mode & WC_LONGEST) || wp->mb > 0 && !(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
252 	{
253 		if (!(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
254 		{
255 			while ((cp = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
256 			{
257 				nchars += c;
258 				endbuff = cp + c;
259 				if (*--endbuff == '\n')
260 					nlines++;
261 				else
262 					*endbuff = '\n';
263 				for (;;)
264 					if (*cp++ == '\n')
265 					{
266 						if (cp > endbuff)
267 							break;
268 						nlines++;
269 					}
270 			}
271 		}
272 		else
273 		{
274 			while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
275 			{
276 				nchars += c;
277 				/* check to see whether first character terminates word */
278 				if (c==1)
279 				{
280 					if (eol(lasttype))
281 						nlines++;
282 					if ((c = type[*cp]) && !lasttype)
283 						nwords++;
284 					lasttype = c;
285 					continue;
286 				}
287 				if (!lasttype && type[*cp])
288 					nwords++;
289 				lastchar = cp[--c];
290 				*(endbuff = cp+c) = '\n';
291 				c = lasttype;
292 				/* process each buffer */
293 				for (;;)
294 				{
295 					/* process spaces and new-lines */
296 					do
297 					{
298 						if (eol(c))
299 							for (;;)
300 							{
301 								/* check for end of buffer */
302 								if (cp > endbuff)
303 									goto beob;
304 								nlines++;
305 								if (*cp != '\n')
306 									break;
307 								cp++;
308 							}
309 					} while (c = type[*cp++]);
310 					/* skip over word characters */
311 					while (!(c = type[*cp++]));
312 					nwords++;
313 				}
314 			beob:
315 				if ((cp -= 2) >= buff)
316 					c = type[*cp];
317 				else
318 					c = lasttype;
319 				lasttype = type[lastchar];
320 				/* see if was in word */
321 				if (!c && !lasttype)
322 					nwords--;
323 			}
324 			if (eol(lasttype))
325 				nlines++;
326 			else if (!lasttype)
327 				nwords++;
328 		}
329 	}
330 	else
331 	{
332 		int		lineoff=0;
333 		int		skip=0;
334 		int		adjust=0;
335 		int		state=0;
336 		int		oldc;
337 		int		xspace;
338 		int		wasspace = 1;
339 		unsigned char*	start;
340 
341 		lastchar = 0;
342 		start = (endbuff = side) + 1;
343 		xspace = iswspace(0xa0) || iswspace(0x85);
344 		while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
345 		{
346 			nchars += c;
347 			start = cp-lineoff;
348 			/* check to see whether first character terminates word */
349 			if(c==1)
350 			{
351 				if(eol(lasttype))
352 					nlines++;
353 				if((c = type[*cp]) && !lasttype)
354 					nwords++;
355 				lasttype = c;
356 				endbuff = start;
357 				continue;
358 			}
359 			lastchar = cp[--c];
360 			endbuff = cp+c;
361 			cp[c] = '\n';
362 			if(mbc(lasttype))
363 			{
364 				c = lasttype;
365 				goto mbyte;
366 			}
367 			if(!lasttype && spc(type[*cp]))
368 				nwords++;
369 			c = lasttype;
370 			/* process each buffer */
371 			for (;;)
372 			{
373 				/* process spaces and new-lines */
374 			spaces:
375 				do
376 				{
377 					if (eol(c))
378 					{
379 						/* check for end of buffer */
380 						if (cp > endbuff)
381 							goto eob;
382 						if(wp->mode&WC_LONGEST)
383 						{
384 							if((cp-start)-adjust > longest)
385 								longest = (cp-start)-adjust-1;
386 							start = cp;
387 						}
388 						nlines++;
389 						nchars -= adjust;
390 						adjust = 0;
391 					}
392 				} while (spc(c = type[*cp++]));
393 				wasspace=1;
394 				if(mbc(c))
395 				{
396 				mbyte:
397 					do
398 					{
399 						if(c&WC_ERR)
400 							goto err;
401 						if(skip && (c&7))
402 							break;
403 						if(!skip)
404 						{
405 							if(!(c&7))
406 							{
407 								skip=1;
408 								break;
409 							}
410 							skip = (c&7);
411 							adjust += skip;
412 							state = 0;
413 							if(skip==2 && (cp[-1]&0xc)==0 && (state=(cp[-1]&0x3)))
414 								oldc = *cp;
415 							else if(xspace && cp[-1]==0xc2)
416 							{
417 								state = 8;
418 								oldc = *cp;
419 							}
420 						}
421 						else
422 						{
423 							skip--;
424 							if(state && (state=chkstate(state,oldc)))
425 							{
426 								if(state==10)
427 								{
428 									if(!wasspace)
429 										nwords++;
430 									wasspace = 1;
431 									state=0;
432 									goto spaces;
433 								}
434 								oldc = *cp;
435 							}
436 						}
437 					} while (mbc(c = type[*cp++]));
438 					wasspace = 0;
439 					if(skip)
440 					{
441 						if(eol(c) && (cp > endbuff))
442 							goto eob;
443 				err:
444 						skip = 0;
445 						state = 0;
446 						if(eline!=nlines && !(wp->mode & WC_QUIET))
447 							eline = invalid(file, nlines);
448 						while(mbc(c) && ((c|WC_ERR) || (c&7)==0))
449 							c=type[*cp++];
450 						if(eol(c) && (cp > endbuff))
451 						{
452 							c = WC_MB|WC_ERR;
453 							goto eob;
454 						}
455 						if(mbc(c))
456 							goto mbyte;
457 						else if(c&WC_SP)
458 							goto spaces;
459 					}
460 					if(spc(c))
461 					{
462 						nwords++;
463 						continue;
464 					}
465 				}
466 				/* skip over word characters */
467 				while(!(c = type[*cp++]));
468 				if(mbc(c))
469 					goto mbyte;
470 				nwords++;
471 			}
472 		eob:
473 			lineoff = cp-start;
474 			if((cp -= 2) >= buff)
475 				c = type[*cp];
476 			else
477 				c = lasttype;
478 			lasttype = type[lastchar];
479 			/* see if was in word */
480 			if(!c && !lasttype)
481 				nwords--;
482 		}
483 		if ((wp->mode&WC_LONGEST) && ((endbuff + 1 - start) - adjust - (lastchar == '\n')) > longest)
484 			longest = (endbuff + 1 - start) - adjust - (lastchar == '\n');
485 		wp->longest = longest;
486 		if (eol(lasttype))
487 			nlines++;
488 		else if (!lasttype)
489 			nwords++;
490 		nchars -= adjust;
491 	}
492 	wp->chars = nchars;
493 	wp->words = nwords;
494 	wp->lines = nlines;
495 	return 0;
496 }
497 
498