xref: /titanic_52/usr/src/cmd/egrep/egrep.y (revision 1cb6af97c6f66f456d4f726ef056e1ebc0f73305)
1 %{
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License, Version 1.0 only
7  * (the "License").  You may not use this file except in compliance
8  * with the License.
9  *
10  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11  * or http://www.opensolaris.org/os/licensing.
12  * See the License for the specific language governing permissions
13  * and limitations under the License.
14  *
15  * When distributing Covered Code, include this CDDL HEADER in each
16  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17  * If applicable, add the following below this CDDL HEADER, with the
18  * fields enclosed by brackets "[]" replaced with your own identifying
19  * information: Portions Copyright [yyyy] [name of copyright owner]
20  *
21  * CDDL HEADER END
22  */
23 %}
24 /*
25  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
26  * Use is subject to license terms.
27  */
28 
29 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
30 /*	  All Rights Reserved  	*/
31 
32 /*	Copyright (c) 1987, 1988 Microsoft Corporation	*/
33 /*	  All Rights Reserved	*/
34 
35 %{
36 #pragma ident	"%Z%%M%	%I%	%E% SMI"
37 %}
38 
39 /*
40  * egrep -- print lines containing (or not containing) a regular expression
41  *
42  *	status returns:
43  *		0 - ok, and some matches
44  *		1 - ok, but no matches
45  *		2 - some error; matches irrelevant
46  */
47 %token CHAR MCHAR DOT MDOT CCL NCCL MCCL NMCCL OR CAT STAR PLUS QUEST
48 %left OR
49 %left CHAR MCHAR DOT CCL NCCL MCCL NMCCL '('
50 %left CAT
51 %left STAR PLUS QUEST
52 
53 %{
54 #include <stdio.h>
55 #include <ctype.h>
56 #include <memory.h>
57 #include <wchar.h>
58 #include <wctype.h>
59 #include <widec.h>
60 #include <stdlib.h>
61 #include <limits.h>
62 #include <locale.h>
63 
/*
 * Sizes and special symbols.
 */
#define BLKSIZE 512	/* size of reported disk blocks */
#define EBUFSIZ 8192	/* read unit and growth step of execute()'s buffer */
#define MAXLIN 350	/* initial size and growth step of the tree/class arrays */
#define NCHARS 256	/* number of columns in a gotofn row */
#define MAXPOS 4000	/* initial size and growth step of positions[] */
#define NSTATES 64	/* number of rows in gotofn[], state[] and out[] */
#define FINAL (-1)	/* node name used for the root: unary(FINAL, ...) */
#define RIGHT '\n'	/* serves as record separator and as $ */
#define LEFT '\n'	/* beginning of line */

/*
 * State-machine tables (indexed by state number).
 */
int gotofn[NSTATES][NCHARS];
int state[NSTATES];
int out[NSTATES];

/*
 * Syntax tree, stored as parallel arrays indexed by node number; "line" is
 * the next free node.  All of them are grown together by alloctree().
 */
int line  = 1;
int *name;
int *left;
int *right;
int *parent;
int *foll;
int *positions;
char *chars;		/* character class storage, grown by allocchars() */
wchar_t *lower;		/* range lower bounds built by genrange() */
wchar_t *upper;		/* range upper bounds built by genrange() */
int maxlin, maxclin, maxwclin, maxpos;	/* current capacities */
int nxtpos = 0;
int inxtpos;
int nxtchar = 0;
int *tmpstat;
int *initstat;
int istat;
int nstate = 1;
int xstate;
int count;
int icount;
char *input;		/* pattern text when it does not come from -f */


wchar_t lyylval;	/* value of the last MCHAR token */

/*
 * Prototypes for every routine that is called before its definition, so
 * that no call relies on an implicit declaration.
 */
wchar_t nextch(void);
wchar_t maxmin(wchar_t, int);
/* compare is handed to qsort() in genrange(); it is left unprototyped */
int compare();
void overflo(void);
int yyparse(void);
int enter(int);
int cclenter(int);
int node(int, int, int);
int unary(int, int);
int allocchars(void);
int alloctree(void);
void cfoll(int);
void cgotofn(void);
int nxtst(int, int);
int cstate(int);
int dot(int);
int mdot(int);
int member(int, int, int);
int notin(int);
int mdotenter(void);
int mchar(wchar_t);
int ccl(int);
int range(unsigned char *, unsigned char *, int);
int classenter(int, int);
int genrange(int);

char reinit = 0;	/* set by nxtst() when the state tables were reset */

/*
 * Option flags (counted in main()) and per-run counters.
 */
long long lnum;
int	bflag;
int	cflag;
int	eflag;
int	fflag;
int	hflag;
int	iflag;
int	lflag;
int	nflag;
int	sflag;
int	vflag;
int	nfile;
long long blkno;
long long tln;
int	nsucc;
int	badbotch;
extern 	char *optarg;
extern 	int optind;

int	f;
FILE	*expfile;	/* expression file opened for -f */
129 %}
130 
131 %%
/*
 * s: the complete expression.  The tree is wrapped in a FINAL node and
 * "line" is stepped back by one afterwards.
 */
s:	t
		{
		  unary(FINAL, $1);
		  line--;
		}
	;

/*
 * t: the implicit prefix b followed by the expression r, with an optional
 * OR token in front of and/or behind it.
 */
t:	b r
		{
		  $$ = node(CAT, $1, $2);
		}
	| OR b r OR
		{
		  $$ = node(CAT, $2, $3);
		}
	| OR b r
		{
		  $$ = node(CAT, $2, $3);
		}
	| b r OR
		{
		  $$ = node(CAT, $1, $2);
		}
	;

/*
 * b: empty production that builds DOT STAR.  Only the single-byte DOT is
 * used; a multibyte mdotenter() alternative is left disabled here.
 */
b:
		{
		  $$ = unary(STAR, enter(DOT));
		}
	;

/*
 * r: leaves.  With -i, alphabetic characters become an OR of both cases.
 */
r:	CHAR
		{
		  if (iflag && isalpha($1))
			$$ = node(OR, enter(tolower($1)), enter(toupper($1)));
		  else
			$$ = enter($1);
		}
	| MCHAR
		{
		  if (iflag && iswalpha(lyylval))
			$$ = node(OR, mchar(towlower(lyylval)), mchar(towupper(lyylval)));
		  else
			$$ = mchar(lyylval);
		}
	| DOT
		{
		  if (multibyte)
			$$ = mdotenter();
		  else
			$$ = enter(DOT);
		}
	| CCL
		{
		  $$ = cclenter(CCL);
		}
	| NCCL
		{
		  $$ = cclenter(NCCL);
		}
	| MCCL
		{
		  $$ = ccl(CCL);
		}
	| NMCCL
		{
		  $$ = ccl(NCCL);
		}
	;

/*
 * r: operators, grouping and error recovery.
 */
r:	r OR r
		{
		  $$ = node(OR, $1, $3);
		}
	| r r %prec CAT
		{
		  $$ = node(CAT, $1, $2);
		}
	| r STAR
		{
		  $$ = unary(STAR, $1);
		}
	| r PLUS
		{
		  $$ = unary(PLUS, $1);
		}
	| r QUEST
		{
		  $$ = unary(QUEST, $1);
		}
	| '(' r ')'
		{
		  $$ = $2;
		}
	| error
	;
192 
193 %%
/* Forward declarations of routines that are called before they are defined. */
void	synerror(void);
int	mgetc(void);
void	follow(int v);
void	add(int *array, int n);
void	execute(char *file);
void	clearg(void);
200 
201 
/*
 * Parser error hook: print the message on stderr with an "egrep: " prefix
 * and leave with exit status 2.
 */
void
yyerror(char *s)
{
	(void) fprintf(stderr, "egrep: %s\n", s);
	exit(2);
}
208 
209 int
210 yylex(void)
211 {
212 	extern int yylval;
213 	int cclcnt, x, ccount, oldccount;
214 	wchar_t c, lc;
215 
216 	c = nextch();
217 	switch(c) {
218 		case '^':
219 			yylval = LEFT;
220 			return(CHAR);
221 		case '$':
222 			c = RIGHT;
223 			goto defchar;
224 		case '|': return (OR);
225 		case '*': return (STAR);
226 		case '+': return (PLUS);
227 		case '?': return (QUEST);
228 		case '(': return (c);
229 		case ')': return (c);
230 		case '.': return(DOT);
231 		case '\0': return (0);
232 		case RIGHT: return (OR);
233 		case '[':
234 			x = (multibyte ? MCCL : CCL);
235 			cclcnt = 0;
236 			count = nxtchar++;
237 			if ((c = nextch()) == '^') {
238 				x = (multibyte ? NMCCL : NCCL);
239 				c = nextch();
240 			}
241 			lc = 0;
242 			do {
243 				if (iflag && iswalpha(c))
244 					c = towlower(c);
245 				if (c == '\0') synerror();
246 				if (c == '-' && cclcnt > 0 && lc != 0) {
247 					if ((c = nextch()) != 0) {
248 						if(c == ']') {
249 							chars[nxtchar++] = '-';
250 							cclcnt++;
251 							break;
252 						}
253 						if (iflag && iswalpha(c))
254 							c = towlower(c);
255 						if (!multibyte ||
256 						(c & WCHAR_CSMASK) == (lc & WCHAR_CSMASK) &&
257 						lc < c &&
258 						!iswcntrl(c) && !iswcntrl(lc)) {
259 							if (nxtchar >= maxclin)
260 								if (allocchars() == 0)
261 									overflo();
262 							chars[nxtchar++] = '-';
263 							cclcnt++;
264 						}
265 					}
266 				}
267 				ccount = oldccount = nxtchar;
268 				if(ccount + MB_LEN_MAX >= maxclin)
269 					if(allocchars() == 0)
270 						overflo();
271 				ccount += wctomb(&chars[ccount], c);
272 				cclcnt += ccount - oldccount;
273 				nxtchar += ccount - oldccount;
274 				lc = c;
275 			} while ((c = nextch()) != ']');
276 			chars[count] = cclcnt;
277 			return(x);
278 
279 		case '\\':
280 			if ((c = nextch()) == '\0') synerror();
281 		defchar:
282 		default:
283 			if (c <= 0177) {
284 				yylval = c;
285 				return (CHAR);
286 			} else {
287 				lyylval = c;
288 				return (MCHAR);
289 			}
290 	}
291 }
292 
293 wchar_t
294 nextch(void)
295 {
296 	wchar_t lc;
297 	char multic[MB_LEN_MAX];
298 	int length, d;
299 	if (fflag) {
300 		if ((length = _mbftowc(multic, &lc, mgetc, &d)) < 0)
301 			synerror();
302 		if(length == 0)
303 			lc = '\0';
304 	}
305 	else  {
306 		if((length = mbtowc(&lc, input, MB_LEN_MAX)) == -1)
307 			synerror();
308 		if(length == 0)
309 			return(0);
310 		input += length;
311 	}
312 	return(lc);
313 }
314 
315 int
316 mgetc(void)
317 {
318 	return(getc(expfile));
319 }
320 
/*
 * Report a syntax error in the expression on stderr and exit with
 * status 2.
 */
void
synerror(void)
{
	(void) fprintf(stderr, gettext("egrep: syntax error\n"));
	exit(2);
}
327 
328 int
329 enter(int x)
330 {
331 	if(line >= maxlin)
332 		if(alloctree() == 0)
333 			overflo();
334 	name[line] = x;
335 	left[line] = 0;
336 	right[line] = 0;
337 	return(line++);
338 }
339 
340 int
341 cclenter(int x)
342 {
343 	int linno;
344 	linno = enter(x);
345 	right[linno] = count;
346 	return (linno);
347 }
348 
349 int
350 node(int x, int l, int r)
351 {
352 	if(line >= maxlin)
353 		if(alloctree() == 0)
354 			overflo();
355 	name[line] = x;
356 	left[line] = l;
357 	right[line] = r;
358 	parent[l] = line;
359 	parent[r] = line;
360 	return(line++);
361 }
362 
363 int
364 unary(int x, int d)
365 {
366 	if(line >= maxlin)
367 		if(alloctree() == 0)
368 			overflo();
369 	name[line] = x;
370 	left[line] = d;
371 	right[line] = 0;
372 	parent[d] = line;
373 	return(line++);
374 }
375 
376 int
377 allocchars(void)
378 {
379 	maxclin += MAXLIN;
380 	if((chars = realloc(chars, maxclin)) == (char *)0)
381 		return 0;
382 	return 1;
383 }
384 
385 int
386 alloctree(void)
387 {
388 	maxlin += MAXLIN;
389 	if((name = (int *)realloc(name, maxlin*sizeof(int))) == (int *)0)
390 		return 0;
391 	if((left = (int *)realloc(left, maxlin*sizeof(int))) == (int *)0)
392 		return 0;
393 	if((right = (int *)realloc(right, maxlin*sizeof(int))) == (int *)0)
394 		return 0;
395 	if((parent = (int *)realloc(parent, maxlin*sizeof(int))) == (int *)0)
396 		return 0;
397 	if((foll = (int *)realloc(foll, maxlin*sizeof(int))) == (int *)0)
398 		return 0;
399 	if((tmpstat = (int *)realloc(tmpstat, maxlin*sizeof(int))) == (int *)0)
400 		return 0;
401 	if((initstat = (int *)realloc(initstat, maxlin*sizeof(int))) == (int *)0)
402 		return 0;
403 	return 1;
404 }
405 
/*
 * Report that the expression is too long on stderr and exit with
 * status 2.
 */
void
overflo(void)
{
	(void) fprintf(stderr, gettext("egrep: regular expression too long\n"));
	exit(2);
}
412 
413 void
414 cfoll(int v)
415 {
416 	int i;
417 	if (left[v] == 0) {
418 		count = 0;
419 		for (i=1; i<=line; i++) tmpstat[i] = 0;
420 		follow(v);
421 		add(foll, v);
422 	}
423 	else if (right[v] == 0) cfoll(left[v]);
424 	else {
425 		cfoll(left[v]);
426 		cfoll(right[v]);
427 	}
428 }
429 
430 void
431 cgotofn(void)
432 {
433 	int i;
434 	count = 0;
435 	inxtpos = nxtpos;
436 	for (i=3; i<=line; i++) tmpstat[i] = 0;
437 	if (cstate(line-1)==0) {
438 		tmpstat[line] = 1;
439 		count++;
440 		out[1] = 1;
441 	}
442 	for (i=3; i<=line; i++) initstat[i] = tmpstat[i];
443 	count--;		/*leave out position 1 */
444 	icount = count;
445 	tmpstat[1] = 0;
446 	add(state, 1);
447 	istat = nxtst(1, LEFT);
448 }
449 
450 int
451 nxtst(int s, int c)
452 {
453 	int i, num, k;
454 	int pos, curpos, number, newpos;
455 	num = positions[state[s]];
456 	count = icount;
457 	for (i=3; i<=line; i++) tmpstat[i] = initstat[i];
458 	pos = state[s] + 1;
459 	for (i=0; i<num; i++) {
460 		curpos = positions[pos];
461 		k = name[curpos];
462 		if (k >= 0)
463 			if (
464 				(k == c)
465 				|| (k == DOT && dot(c))
466 				|| (k == MDOT && mdot(c))
467 				|| (k == CCL && dot(c) && member(c, right[curpos], 1))
468 				|| (k == NCCL && dot(c) && member(c, right[curpos], 0))
469 				|| (k == MCCL && mdot(c) && member(c, right[curpos], 1))
470 			) {
471 				number = positions[foll[curpos]];
472 				newpos = foll[curpos] + 1;
473 				for (k=0; k<number; k++) {
474 					if (tmpstat[positions[newpos]] != 1) {
475 						tmpstat[positions[newpos]] = 1;
476 						count++;
477 					}
478 					newpos++;
479 				}
480 			}
481 		pos++;
482 	}
483 	if (notin(nstate)) {
484 		if (++nstate >= NSTATES) {
485 			for (i=1; i<NSTATES; i++)
486 				out[i] = 0;
487 			for (i=1; i<NSTATES; i++)
488 				for (k=0; k<NCHARS; k++)
489 					gotofn[i][k] = 0;
490 			nstate = 1;
491 			nxtpos = inxtpos;
492 			reinit = 1;
493 			add(state, nstate);
494 			if (tmpstat[line] == 1) out[nstate] = 1;
495 			return nstate;
496 		}
497 		add(state, nstate);
498 		if (tmpstat[line] == 1) out[nstate] = 1;
499 		gotofn[s][c] = nstate;
500 		return nstate;
501 	}
502 	else {
503 		gotofn[s][c] = xstate;
504 		return xstate;
505 	}
506 }
507 
508 
509 int
510 cstate(int v)
511 {
512 	int b;
513 	if (left[v] == 0) {
514 		if (tmpstat[v] != 1) {
515 			tmpstat[v] = 1;
516 			count++;
517 		}
518 		return(1);
519 	}
520 	else if (right[v] == 0) {
521 		if (cstate(left[v]) == 0) return (0);
522 		else if (name[v] == PLUS) return (1);
523 		else return (0);
524 	}
525 	else if (name[v] == CAT) {
526 		if (cstate(left[v]) == 0 && cstate(right[v]) == 0) return (0);
527 		else return (1);
528 	}
529 	else { /* name[v] == OR */
530 		b = cstate(right[v]);
531 		if (cstate(left[v]) == 0 || b == 0) return (0);
532 		else return (1);
533 	}
534 }
535 
536 
537 int
538 dot(int c)
539 {
540 	if(multibyte && c >= 0200 && (!iscntrl(c) || c == SS2 && eucw2 || c == SS3 && eucw3))
541 		return(0);
542 	if(c == RIGHT || c == LEFT)
543 		return(0);
544 	return(1);
545 }
546 
/*
 * Returns 1 when c is at least 0200 and not a control character,
 * 0 otherwise.
 */
int
mdot(int c)
{
	return ((c >= 0200 && !iscntrl(c)) ? 1 : 0);
}
554 
555 int
556 member(int symb, int set, int torf)
557 {
558 	int i, num, pos, c, lc;
559 	if(symb == RIGHT || symb == LEFT)
560 		return(0);
561 	num = chars[set];
562 	pos = set + 1;
563 	lc = 0;
564 	if(iflag)
565 		symb = tolower(symb);
566 	for (i=0; i<num; i++) {
567 		c = (unsigned char)chars[pos++];
568 		if(c == '-' && lc != 0 && ++i < num) {
569 			c = (unsigned char)chars[pos++];
570 			if(lc <= symb && symb <= c)
571 				return(torf);
572 		}
573 		if (symb == c)
574 			return (torf);
575 		lc = c;
576 	}
577 	return(!torf);
578 }
579 
580 int
581 notin(int n)
582 {
583 	int i, j, pos;
584 	for (i=1; i<=n; i++) {
585 		if (positions[state[i]] == count) {
586 			pos = state[i] + 1;
587 			for (j=0; j < count; j++)
588 				if (tmpstat[positions[pos++]] != 1) goto nxt;
589 			xstate = i;
590 			return (0);
591 		}
592 		nxt: ;
593 	}
594 	return (1);
595 }
596 
597 void
598 add(int *array, int n)
599 {
600 	int i;
601 	if (nxtpos + count >= maxpos) {
602 		maxpos += MAXPOS + count;
603 		if((positions = (int *)realloc(positions, maxpos *sizeof(int))) == (int *)0)
604 			overflo();
605 	}
606 	array[n] = nxtpos;
607 	positions[nxtpos++] = count;
608 	for (i=3; i <= line; i++) {
609 		if (tmpstat[i] == 1) {
610 			positions[nxtpos++] = i;
611 		}
612 	}
613 }
614 
615 void
616 follow(int v)
617 {
618 	int p;
619 	if (v == line) return;
620 	p = parent[v];
621 	switch(name[p]) {
622 		case STAR:
623 		case PLUS:	cstate(v);
624 				follow(p);
625 				return;
626 
627 		case OR:
628 		case QUEST:	follow(p);
629 				return;
630 
631 		case CAT:	if (v == left[p]) {
632 					if (cstate(right[p]) == 0) {
633 						follow(p);
634 						return;
635 					}
636 				}
637 				else follow(p);
638 				return;
639 		case FINAL:	if (tmpstat[line] != 1) {
640 					tmpstat[line] = 1;
641 					count++;
642 				}
643 				return;
644 	}
645 }
646 
647 #define USAGE "[ -bchilnsv ] [ -e exp ] [ -f file ] [ strings ] [ file ] ..."
648 
649 int
650 main(int argc, char **argv)
651 {
652 	char c;
653 	char nl = '\n';
654 	int errflag = 0;
655 
656 	(void)setlocale(LC_ALL, "");
657 
658 #if !defined(TEXT_DOMAIN)		/* Should be defined by cc -D */
659 	#define TEXT_DOMAIN "SYS_TEST"  /* Use this only if it weren't. */
660 #endif
661 	(void) textdomain(TEXT_DOMAIN);
662 
663 	while((c = getopt(argc, argv, "ybcie:f:hlnvs")) != -1)
664 		switch(c) {
665 
666 		case 'b':
667 			bflag++;
668 			continue;
669 
670 		case 'c':
671 			cflag++;
672 			continue;
673 
674 		case 'e':
675 			eflag++;
676 			input = optarg;
677 			continue;
678 
679 		case 'f':
680 			fflag++;
681 			expfile = fopen(optarg, "r");
682 			if(expfile == NULL) {
683 				fprintf(stderr,
684 				  gettext("egrep: can't open %s\n"), optarg);
685 				exit(2);
686 			}
687 			continue;
688 
689 		case 'h':
690 			hflag++;
691 			continue;
692 
693 		case 'y':
694 		case 'i':
695 			iflag++;
696 			continue;
697 
698 		case 'l':
699 			lflag++;
700 			continue;
701 
702 		case 'n':
703 			nflag++;
704 			continue;
705 
706 		case 's':
707 			sflag++;
708 			continue;
709 
710 		case 'v':
711 			vflag++;
712 			continue;
713 
714 		case '?':
715 			errflag++;
716 		}
717 	if (errflag || ((argc <= 0) && !fflag && !eflag)) {
718 		fprintf(stderr, gettext("usage: egrep %s\n"), gettext(USAGE));
719 		exit(2);
720 	}
721 	if(!eflag && !fflag) {
722 		input = argv[optind];
723 		optind++;
724 	}
725 
726 	argc -= optind;
727 	argv = &argv[optind];
728 
729 	/* allocate initial space for arrays */
730 	if((name = (int *)malloc(MAXLIN*sizeof(int))) == (int *)0)
731 		overflo();
732 	if((left = (int *)malloc(MAXLIN*sizeof(int))) == (int *)0)
733 		overflo();
734 	if((right = (int *)malloc(MAXLIN*sizeof(int))) == (int *)0)
735 		overflo();
736 	if((parent = (int *)malloc(MAXLIN*sizeof(int))) == (int *)0)
737 		overflo();
738 	if((foll = (int *)malloc(MAXLIN*sizeof(int))) == (int *)0)
739 		overflo();
740 	if((tmpstat = (int *)malloc(MAXLIN*sizeof(int))) == (int *)0)
741 		overflo();
742 	if((initstat = (int *)malloc(MAXLIN*sizeof(int))) == (int *)0)
743 		overflo();
744 	if((chars = (char *)malloc(MAXLIN)) == (char *)0)
745 		overflo();
746 	if((lower = (wchar_t *)malloc(MAXLIN*sizeof(wchar_t))) == (wchar_t *)0)
747 		overflo();
748 	if((upper = (wchar_t *)malloc(MAXLIN*sizeof(wchar_t))) == (wchar_t *)0)
749 		overflo();
750 	if((positions = (int *)malloc(MAXPOS*sizeof(int))) == (int *)0)
751 		overflo();
752 	maxlin = MAXLIN;
753 	maxclin = MAXLIN;
754 	maxwclin = MAXLIN;
755 	maxpos = MAXPOS;
756 
757 	yyparse();
758 
759 	cfoll(line-1);
760 	cgotofn();
761 	nfile = argc;
762 	if (argc<=0) {
763 		execute(0);
764 	}
765 	else while (--argc >= 0) {
766 		if (reinit == 1) clearg();
767 		execute(*argv++);
768 	}
769 	return (badbotch ? 2 : nsucc==0);
770 }
771 
772 void
773 execute(char *file)
774 {
775 	char *p;
776 	int cstat;
777 	wchar_t c;
778 	int t;
779 	long count;
780 	long count1, count2;
781 	long nchars;
782 	int succ;
783 	char *ptr, *ptrend, *lastptr;
784 	char *buf;
785 	long lBufSiz;
786 	FILE *f;
787 	int nlflag;
788 
789 	lBufSiz = EBUFSIZ;
790 	if ((buf = malloc (lBufSiz + EBUFSIZ)) == NULL) {
791 		exit (2); /* out of memory - BAIL */
792 	}
793 
794 	if (file) {
795 		if ((f = fopen(file, "r")) == NULL) {
796 			fprintf(stderr,
797 				gettext("egrep: can't open %s\n"), file);
798 			badbotch=1;
799 			return;
800 		}
801 	} else {
802 		file = "<stdin>";
803 		f = stdin;
804 	}
805 	lnum = 1;
806 	tln = 0;
807 	if((count = read(fileno(f), buf, EBUFSIZ)) <= 0) {
808 		fclose(f);
809 
810 		if (cflag) {
811 			if (nfile>1 && !hflag)
812 				fprintf(stdout, "%s:", file);
813 			fprintf(stdout, "%lld\n", tln);
814 		}
815 		return;
816 	}
817 
818 	blkno = count;
819 	ptr = buf;
820 	for(;;) {
821 		if((ptrend = memchr(ptr, '\n', buf + count - ptr)) == NULL) {
822 			/*
823 				move the unused partial record to the head of the buffer
824 			*/
825 			if (ptr > buf) {
826 				count = buf + count - ptr;
827 				memmove (buf, ptr, count);
828 				ptr = buf;
829 			}
830 
831 			/*
832 				Get a bigger buffer if this one is full
833 			*/
834 			if(count > lBufSiz) {
835 				/*
836 					expand the buffer
837 				*/
838 				lBufSiz += EBUFSIZ;
839 				if ((buf = realloc (buf, lBufSiz + EBUFSIZ)) == NULL) {
840 					exit (2); /* out of memory - BAIL */
841 				}
842 
843 				ptr = buf;
844 			}
845 
846 			p = buf + count;
847 			if((count1 = read(fileno(f), p, EBUFSIZ)) > 0) {
848 				count += count1;
849 				blkno += count1;
850 				continue;
851 			}
852 			ptrend = ptr + count;
853 			nlflag = 0;
854 		} else
855 			nlflag = 1;
856 		*ptrend = '\n';
857 		p = ptr;
858 		lastptr = ptr;
859 		cstat = istat;
860 		succ = 0;
861 		for(;;) {
862 			if(out[cstat]) {
863 				if(multibyte && p > ptr) {
864 					wchar_t wchar;
865 					int length;
866 					char *endptr = p;
867 					p = lastptr;
868 					while(p < endptr) {
869 						length = mbtowc(&wchar, p, MB_LEN_MAX);
870 						if(length <= 1)
871 							p++;
872 						else
873 							p += length;
874 					}
875 					if(p == endptr) {
876 						succ = !vflag;
877 						break;
878 					}
879 					cstat = 1;
880 					length = mbtowc(&wchar, lastptr, MB_LEN_MAX);
881 					if(length <= 1)
882 						lastptr++;
883 					else
884 						lastptr += length;
885 					p = lastptr;
886 					continue;
887 				}
888 				succ = !vflag;
889 				break;
890 			}
891 			c = (unsigned char)*p++;
892 			if ((t = gotofn[cstat][c]) == 0)
893 				cstat = nxtst(cstat, c);
894 			else
895 				cstat = t;
896 			if(c == RIGHT) {
897 				if(out[cstat]) {
898 					succ = !vflag;
899 					break;
900 				}
901 				succ = vflag;
902 				break;
903 			}
904 		}
905 		if(succ) {
906 			nsucc = 1;
907 			if (cflag) tln++;
908 			else if (sflag)
909 				;	/* ugh */
910 			else if (lflag) {
911 				printf("%s\n", file);
912 				fclose(f);
913 				return;
914 			}
915 			else {
916 				if (nfile > 1 && !hflag)
917 					printf(gettext("%s:"), file);
918 				if (bflag) {
919 					nchars = blkno - (buf + count - ptrend) - 2;
920 					if(nlflag)
921 						nchars++;
922 					printf("%lld:", nchars/BLKSIZE);
923 				}
924 				if (nflag)
925 					printf("%lld:", lnum);
926 				if(nlflag)
927 					nchars = ptrend - ptr + 1;
928 				else
929 					nchars = ptrend - ptr;
930 				fwrite(ptr, (size_t)1, (size_t)nchars, stdout);
931 			}
932 		}
933 		if(!nlflag)
934 			break;
935 		ptr = ptrend + 1;
936 		if(ptr >= buf + count) {
937 			ptr = buf;
938 			if((count = read(fileno(f), buf, EBUFSIZ)) <= 0)
939 				break;
940 			blkno += count;
941 		}
942 		lnum++;
943 		if (reinit == 1)
944 			clearg();
945 	}
946 	fclose(f);
947 	if (cflag) {
948 		if (nfile > 1 && !hflag)
949 			printf(gettext("%s:"), file);
950 		printf("%lld\n", tln);
951 	}
952 }
953 
954 void
955 clearg(void)
956 {
957 	int i, k;
958 	for (i=1; i<=nstate; i++)
959 		out[i] = 0;
960 	for (i=1; i<=nstate; i++)
961 		for (k=0; k<NCHARS; k++)
962 			gotofn[i][k] = 0;
963 	nstate = 1;
964 	nxtpos = inxtpos;
965 	reinit = 0;
966 	count = 0;
967 	for (i=3; i<=line; i++) tmpstat[i] = 0;
968 	if (cstate(line-1)==0) {
969 		tmpstat[line] = 1;
970 		count++;
971 		out[1] = 1;
972 	}
973 	for (i=3; i<=line; i++) initstat[i] = tmpstat[i];
974 	count--;		/*leave out position 1 */
975 	icount = count;
976 	tmpstat[1] = 0;
977 	add(state, 1);
978 	istat = nxtst(1, LEFT);
979 }
980 
981 int
982 mdotenter(void)
983 {
984 	int i, x1, x2;
985 	x1 = enter(DOT);
986 	x2 = enter(MDOT);
987 	for(i = 1; i < (int) eucw1; i++)
988 		x2 = node(CAT, x2, enter(MDOT));
989 	x1 = node(OR, x1, x2);
990 	if(eucw2) {
991 		x2 = enter('\216');
992 		for(i = 1; i <= (int) eucw2; i++)
993 			x2 = node(CAT, x2, enter(MDOT));
994 		x1 = node(OR, x1, x2);
995 	}
996 	if(eucw3) {
997 		x2 = enter('\217');
998 		for(i = 1; i <= (int) eucw3; i++)
999 			x2 = node(CAT, x2, enter(MDOT));
1000 		x1 = node(OR, x1, x2);
1001 	}
1002 	return(x1);
1003 }
1004 
1005 int
1006 mchar(wchar_t c)
1007 {
1008 	char multichar[MB_LEN_MAX+1];
1009 	char *p;
1010 	int x1, lc, length;
1011 
1012 	length = wctomb(multichar, c);
1013 	p = multichar;
1014 	*(p + length) = '\0';
1015 	x1 = enter((unsigned char)*p++);
1016 	while(lc = (unsigned char)*p++)
1017 		x1 = node(CAT, x1, enter(lc));
1018 	return(x1);
1019 }
1020 
1021 int
1022 ccl(int type)
1023 {
1024 	wchar_t c, lc;
1025 	char multic1[MB_LEN_MAX];
1026 	char multic2[MB_LEN_MAX];
1027 	int x1, x2, length, current, last, cclcnt;
1028 	x2 = 0;
1029 	current = 0;
1030 	last = genrange(type);
1031 	nxtchar = count + 1;
1032 	cclcnt = 0;
1033 	/* create usual character class for single byte characters */
1034 	while(current <= last && (isascii(c = lower[current]) || c <= 0377 && iscntrl(c))) {
1035 		cclcnt++;
1036 		chars[nxtchar++] = c;
1037 		if(lower[current] != upper[current]) {
1038 			chars[nxtchar++] = '-';
1039 			chars[nxtchar++] = upper[current];
1040 			cclcnt += 2;
1041 		}
1042 		current++;
1043 	}
1044 
1045 	if(cclcnt)
1046 		chars[count] = cclcnt;
1047 	else
1048 		nxtchar = count;
1049 	if(current > 0)
1050 		/* single byte part of character class */
1051 		x2 = cclenter(type);
1052 	else if(type == NCCL)
1053 		/* all single byte characters match */
1054 		x2 = enter(DOT);
1055 	while(current <= last) {
1056 		if(upper[current] == lower[current])
1057 			x1 = mchar(lower[current]);
1058 		else {
1059 			length = wctomb(multic1, lower[current]);
1060 			wctomb(multic2, upper[current]);
1061 			x1 = range((unsigned char *)multic1,
1062 			    (unsigned char *)multic2, length);
1063 		}
1064 		if(x2)
1065 			x2 = node(OR, x2, x1);
1066 		else
1067 			x2 = x1;
1068 		current++;
1069 	}
1070 	return x2;
1071 }
1072 
1073 int
1074 range(unsigned char *p1, unsigned char *p2, int length)
1075 {
1076 	char multic[MB_LEN_MAX+1];
1077 	char *p;
1078 	int i, x1, x2;
1079 	if(length == 1)
1080 		return(classenter(*p1, *p2));
1081 	if(p1[0] == p2[0])
1082 		return(node(CAT, enter(p1[0]), range(p1+1, p2+1, length - 1)));
1083 	p = multic;
1084 	for(i = 1; i < length; i++)
1085 		*p++ = 0377;
1086 	x1 = node(CAT, enter(p1[0]),
1087 	    range(p1+1, (unsigned char *)multic, length - 1));
1088 	if((unsigned char)(p1[0] + 1) < p2[0]) {
1089 		x2 = classenter(p1[0] + 1, p2[0] - 1);
1090 		for(i = 1; i < length; i++)
1091 			x2 = node(CAT, x2, enter(MDOT));
1092 		x1 = node(OR, x1, x2);
1093 	}
1094 	p = multic;
1095 	for(i = 1; i < length; i++)
1096 		*p++ = 0200;
1097 	x2 = node(CAT, enter(p2[0]),
1098 	    range((unsigned char *)multic, p2+1, length - 1));
1099 	return node(OR, x1, x2);
1100 }
1101 
1102 int
1103 classenter(int x1, int x2)
1104 {
1105 	static int max, min;
1106 	if(!max) {
1107 		int i;
1108 		for(i = 0200; i <= 0377; i++)
1109 			if(!iscntrl(i))
1110 				break;
1111 		min = i;
1112 		for(i = 0377; i >= 0200; i--)
1113 			if(!iscntrl(i))
1114 				break;
1115 		max = i;
1116 	}
1117 	if(x1 <= min && x2 >= max)
1118 		return enter(MDOT);
1119 	if(nxtchar + 4 >= maxclin)
1120 		if(allocchars() == 0)
1121 			overflo();
1122 	count = nxtchar++;
1123 	chars[nxtchar++] = x1;
1124 	chars[nxtchar++] = '-';
1125 	chars[nxtchar++] = x2;
1126 	chars[count] = 3;
1127 	return cclenter(MCCL);
1128 }
1129 
1130 int
1131 genrange(int type)
1132 {
1133 	char *p, *endp;
1134 	int current, nel, i, last, length;
1135 	wchar_t c, lc;
1136 
1137 	current = 0;
1138 	p = &chars[count+1];
1139 	endp = &chars[count+1] + chars[count];
1140 	lc = 0;
1141 
1142 	/* convert character class into union of ranges */
1143 	while(p < endp) {
1144 		length = mbtowc(&c, p, MB_LEN_MAX);
1145 		p += length;
1146 		if(c == '-' && lc != 0) {
1147 			length = mbtowc(&c, p, MB_LEN_MAX);
1148 			upper[current-1] = c;
1149 			p += length;
1150 		} else {
1151 			lower[current] = c;
1152 			upper[current++] = c;
1153 		}
1154 		lc = c;
1155 	}
1156 	nel = current;
1157 	/* sort lower and upper bounds of ranges */
1158 	qsort((char *)lower, nel, sizeof(wchar_t), compare);
1159 	qsort((char *)upper, nel, sizeof(wchar_t), compare);
1160 	last = current - 1;
1161 	current = 0;
1162 	/* combine overlapping or adjacent ranges */
1163 	for(i = 0; i < last; i++)
1164 		if(upper[i] >= lower[i+1] - 1)
1165 			upper[current] = upper[i+1];
1166 		else {
1167 			lower[++current] = lower[i+1];
1168 			upper[current] = upper[i+1];
1169 		}
1170 	if(type == NCCL) {
1171 		/* find complement of character class */
1172 		int j, next;
1173 		i = 0;
1174 		while(i <= current && isascii(c=lower[i]) || c <= 0377 && iscntrl(c))
1175 			i++;
1176 		if(i > current) {
1177 			/* match all multibyte characters */
1178 			if(eucw2) {
1179 				lower[i] = maxmin(WCHAR_CS2, 0);
1180 				upper[i++] = maxmin(WCHAR_CS2, 1);
1181 			}
1182 			if(eucw3) {
1183 				lower[i] = maxmin(WCHAR_CS3, 0);
1184 				upper[i++] = maxmin(WCHAR_CS3, 1);
1185 			}
1186 			lower[i] = maxmin(WCHAR_CS1, 0);
1187 			upper[i++] = maxmin(WCHAR_CS1, 1);
1188 			return i - 1;
1189 		}
1190 		next = current + 1;
1191 		if(next + current + 2 >= maxwclin) {
1192 			maxwclin += MAXLIN + next + current + 2;
1193 			if((lower = (wchar_t *)realloc(lower, maxwclin *sizeof(wchar_t))) == (wchar_t *)0 ||
1194 			   (upper = (wchar_t *)realloc(upper, maxwclin * sizeof(wchar_t))) == (wchar_t *)0)
1195 				overflo();
1196 		}
1197 		if(eucw2 && lower[i] > maxmin(WCHAR_CS2, 0)) {
1198 			lower[next] = maxmin(WCHAR_CS2, 0);
1199 			if((lower[i] & WCHAR_CSMASK) != WCHAR_CS2) {
1200 				upper[next++] = maxmin(WCHAR_CS2, 1);
1201 				if((lower[i] & WCHAR_CSMASK) == WCHAR_CS1 && eucw3) {
1202 					lower[next] = maxmin(WCHAR_CS3, 0);
1203 					upper[next++] = maxmin(WCHAR_CS3, 1);
1204 				}
1205 				if(lower[i] > maxmin(lower[i] & WCHAR_CSMASK, 0)) {
1206 					lower[next] = maxmin(lower[i] & WCHAR_CSMASK, 0);
1207 					upper[next++] = lower[i] - 1;
1208 				}
1209 			} else
1210 				upper[next++] = lower[i] - 1;
1211 		} else if(lower[i] > maxmin(lower[i] & WCHAR_CSMASK, 0)) {
1212 			lower[next] = maxmin(lower[i] & WCHAR_CSMASK, 0);
1213 			upper[next++] = lower[i] - 1;
1214 		}
1215 		for(j = i; j < current; j++) {
1216 			if(upper[j] < maxmin(upper[j] & WCHAR_CSMASK, 1)) {
1217 				lower[next] = upper[j] + 1;
1218 				if((upper[j] & WCHAR_CSMASK) != (lower[j+1] & WCHAR_CSMASK)) {
1219 					upper[next++] = maxmin(upper[j] & WCHAR_CSMASK, 1);
1220 					if(eucw3 && (upper[j] & WCHAR_CSMASK) == WCHAR_CS2 && (lower[j+1] & WCHAR_CSMASK) == WCHAR_CS1) {
1221 						lower[next] = maxmin(WCHAR_CS3, 0);
1222 						upper[next++] = maxmin(WCHAR_CS3, 1);
1223 					}
1224 					if(lower[j+1] > maxmin(lower[j+1] & WCHAR_CSMASK, 0)) {
1225 						lower[next] = maxmin(lower[j+1] & WCHAR_CSMASK, 0);
1226 						upper[next++] = lower[j+1] - 1;
1227 					}
1228 				} else
1229 					upper[next++] = lower[j+1] - 1;
1230 			} else if(lower[j+1] > maxmin(lower[j+1], 0)) {
1231 				lower[next] = maxmin(lower[j+1], 0);
1232 				upper[next++] = lower[j+1] - 1;
1233 			}
1234 		}
1235 		if(upper[current] < maxmin(upper[current] & WCHAR_CSMASK, 1)) {
1236 			lower[next] = upper[current] + 1;
1237 			upper[next++] = maxmin(upper[current] & WCHAR_CSMASK, 1);
1238 		}
1239 		if((upper[current] & WCHAR_CSMASK) != WCHAR_CS1) {
1240 			if((upper[current] & WCHAR_CSMASK) == WCHAR_CS2 && eucw3) {
1241 				lower[next] = maxmin(WCHAR_CS3, 0);
1242 				upper[next++] = maxmin(WCHAR_CS3, 1);
1243 			}
1244 			lower[next] = maxmin(WCHAR_CS1, 0);
1245 			upper[next++] = maxmin(WCHAR_CS1, 1);
1246 		}
1247 		for(j = current + 1; j < next; j++) {
1248 			lower[i] = lower[j];
1249 			upper[i++] = upper[j];
1250 		}
1251 		current = i - 1;
1252 	}
1253 	return(current);
1254 }
1255 
/*
 * Three-way comparison of two wide characters: returns -1, 0 or 1 when
 * *c is less than, equal to or greater than *d.
 */
int
compare(wchar_t *c, wchar_t *d)
{
	return ((*c > *d) - (*c < *d));
}
1265 
1266 wchar_t
1267 maxmin(wchar_t c, int flag)
1268 {
1269 	static wchar_t minmax1[2], minmax2[2], minmax3[2];
1270 
1271 	if(!minmax1[0]) {
1272 		/* compute min and max process codes for all code sets */
1273 		int length, i;
1274 		char multic[MB_LEN_MAX], minmax[2];
1275 		for(i = 0377; i >= 0200; i--)
1276 			if(!iscntrl(i))
1277 				break;
1278 		minmax[1] = i;
1279 		for(i = 0240; i <= 0377; i++)
1280 			if(!iscntrl(i))
1281 				break;
1282 		minmax[0] = i;
1283 		for(i = 0; i <= 1; i++) {
1284 			length = MB_LEN_MAX;
1285 			while(length--)
1286 				multic[length] = minmax[i];
1287 			mbtowc(&minmax1[i], multic, MB_LEN_MAX);
1288 			if(eucw2) {
1289 				multic[0] = SS2;
1290 				mbtowc(&minmax2[i], multic, MB_LEN_MAX);
1291 			}
1292 			if(eucw3) {
1293 				multic[0] = SS3;
1294 				mbtowc(&minmax3[i], multic, MB_LEN_MAX);
1295 			}
1296 		}
1297 	}
1298 	switch(c) {
1299 		case WCHAR_CS1: return minmax1[flag];
1300 		case WCHAR_CS2: return minmax2[flag];
1301 		case WCHAR_CS3: return minmax3[flag];
1302 	}
1303 
1304 	/* NOTREACHED */
1305 	return (0);
1306 }
1307