xref: /freebsd/contrib/one-true-awk/b.c (revision 1ca63a8219b88b752b064d19bd3428c61dbcf1f9)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #define	DEBUG
28 
29 #include <ctype.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include "awk.h"
35 #include "awkgram.tab.h"
36 
37 #define MAXLIN 22
38 
39 #define type(v)		(v)->nobj	/* badly overloaded here */
40 #define info(v)		(v)->ntype	/* badly overloaded here */
41 #define left(v)		(v)->narg[0]
42 #define right(v)	(v)->narg[1]
43 #define parent(v)	(v)->nnext
44 
45 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
47 #define UNARY	case STAR: case PLUS: case QUEST:
48 
49 /* encoding in tree Nodes:
50 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 		left is index, right contains value or pointer to value
52 	unary (STAR, PLUS, QUEST): left is child, right is null
53 	binary (CAT, OR): left and right are children
54 	parent contains pointer to parent
55 */
56 
57 
58 int	*setvec;
59 int	*tmpset;
60 int	maxsetvec = 0;
61 
62 int	rtok;		/* next token in current re */
63 int	rlxval;
64 static const uschar	*rlxstr;
65 static const uschar	*prestr;	/* current position in current re */
66 static const uschar	*lastre;	/* origin of last re */
67 static const uschar	*lastatom;	/* origin of last Atom */
68 static const uschar	*starttok;
69 static const uschar 	*basestr;	/* starts with original, replaced during
70 				   repetition processing */
71 static const uschar 	*firstbasestr;
72 
73 static	int setcnt;
74 static	int poscnt;
75 
76 const char	*patbeg;
77 int	patlen;
78 
79 #define	NFA	128	/* cache this many dynamic fa's */
80 fa	*fatab[NFA];
81 int	nfatab	= 0;	/* entries in fatab */
82 
83 extern int u8_nextlen(const char *s);
84 
85 
86 /* utf-8 mechanism:
87 
88    For most of Awk, utf-8 strings just "work", since they look like
89    null-terminated sequences of 8-bit bytes.
90 
91    Functions like length(), index(), and substr() have to operate
92    in units of utf-8 characters.  The u8_* functions in run.c
93    handle this.
94 
95    Regular expressions are more complicated, since the basic
96    mechanism of the goto table used 8-bit byte indices into the
97    gototab entries to compute the next state.  Unicode is a lot
98    bigger, so the gototab entries are now structs with a character
99    and a next state, and there is a linear search of the characters
100    to find the state.  (Yes, this is slower, by a significant
101    amount.  Tough.)
102 
103    Throughout the RE mechanism in b.c, utf-8 characters are
104    converted to their utf-32 value.  This mostly shows up in
105    cclenter, which expands character class ranges like a-z and now
106    alpha-omega.  The size of a gototab array is still about 256.
107    This should be dynamic, but for now things work ok for a single
108    code page of Unicode, which is the most likely case.
109 
110    The code changes are localized in run.c and b.c.  I have added a
111    handful of functions to somewhat better hide the implementation,
112    but a lot more could be done.
113 
114  */
115 
116 static int get_gototab(fa*, int, int);
117 static int set_gototab(fa*, int, int, int);
118 extern int u8_rune(int *, const uschar *);
119 
120 static int *
121 intalloc(size_t n, const char *f)
122 {
123 	int *p = (int *) calloc(n, sizeof(int));
124 	if (p == NULL)
125 		overflo(f);
126 	return p;
127 }
128 
129 static void
130 resizesetvec(const char *f)
131 {
132 	if (maxsetvec == 0)
133 		maxsetvec = MAXLIN;
134 	else
135 		maxsetvec *= 4;
136 	setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
137 	tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
138 	if (setvec == NULL || tmpset == NULL)
139 		overflo(f);
140 }
141 
142 static void
143 resize_state(fa *f, int state)
144 {
145 	gtt **p;
146 	uschar *p2;
147 	int **p3;
148 	int i, new_count;
149 
150 	if (++state < f->state_count)
151 		return;
152 
153 	new_count = state + 10; /* needs to be tuned */
154 
155 	p = (gtt **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
156 	if (p == NULL)
157 		goto out;
158 	f->gototab = p;
159 
160 	p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
161 	if (p2 == NULL)
162 		goto out;
163 	f->out = p2;
164 
165 	p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
166 	if (p3 == NULL)
167 		goto out;
168 	f->posns = p3;
169 
170 	for (i = f->state_count; i < new_count; ++i) {
171 		f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));
172 		if (f->gototab[i] == NULL)
173 			goto out;
174 		f->out[i]  = 0;
175 		f->posns[i] = NULL;
176 	}
177 	f->gototab_len = NCHARS; /* should be variable, growable */
178 	f->state_count = new_count;
179 	return;
180 out:
181 	overflo(__func__);
182 }
183 
184 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
185 {
186 	int i, use, nuse;
187 	fa *pfa;
188 	static int now = 1;
189 
190 	if (setvec == NULL) {	/* first time through any RE */
191 		resizesetvec(__func__);
192 	}
193 
194 	if (compile_time != RUNNING)	/* a constant for sure */
195 		return mkdfa(s, anchor);
196 	for (i = 0; i < nfatab; i++)	/* is it there already? */
197 		if (fatab[i]->anchor == anchor
198 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
199 			fatab[i]->use = now++;
200 			return fatab[i];
201 		}
202 	pfa = mkdfa(s, anchor);
203 	if (nfatab < NFA) {	/* room for another */
204 		fatab[nfatab] = pfa;
205 		fatab[nfatab]->use = now++;
206 		nfatab++;
207 		return pfa;
208 	}
209 	use = fatab[0]->use;	/* replace least-recently used */
210 	nuse = 0;
211 	for (i = 1; i < nfatab; i++)
212 		if (fatab[i]->use < use) {
213 			use = fatab[i]->use;
214 			nuse = i;
215 		}
216 	freefa(fatab[nuse]);
217 	fatab[nuse] = pfa;
218 	pfa->use = now++;
219 	return pfa;
220 }
221 
222 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
223 				/* anchor = true for anchored matches, else false */
224 {
225 	Node *p, *p1;
226 	fa *f;
227 
228 	firstbasestr = (const uschar *) s;
229 	basestr = firstbasestr;
230 	p = reparse(s);
231 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
232 		/* put ALL STAR in front of reg.  exp. */
233 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
234 		/* put FINAL after reg.  exp. */
235 
236 	poscnt = 0;
237 	penter(p1);	/* enter parent pointers and leaf indices */
238 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
239 		overflo(__func__);
240 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
241 	cfoll(f, p1);	/* set up follow sets */
242 	freetr(p1);
243 	resize_state(f, 1);
244 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
245 	f->posns[1] = intalloc(1, __func__);
246 	*f->posns[1] = 0;
247 	f->initstat = makeinit(f, anchor);
248 	f->anchor = anchor;
249 	f->restr = (uschar *) tostring(s);
250 	if (firstbasestr != basestr) {
251 		if (basestr)
252 			xfree(basestr);
253 	}
254 	return f;
255 }
256 
257 int makeinit(fa *f, bool anchor)
258 {
259 	int i, k;
260 
261 	f->curstat = 2;
262 	f->out[2] = 0;
263 	k = *(f->re[0].lfollow);
264 	xfree(f->posns[2]);
265 	f->posns[2] = intalloc(k + 1,  __func__);
266 	for (i = 0; i <= k; i++) {
267 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
268 	}
269 	if ((f->posns[2])[1] == f->accept)
270 		f->out[2] = 1;
271 	for (i = 0; i < NCHARS; i++)
272 		set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */
273 	f->curstat = cgoto(f, 2, HAT);
274 	if (anchor) {
275 		*f->posns[2] = k-1;	/* leave out position 0 */
276 		for (i = 0; i < k; i++) {
277 			(f->posns[0])[i] = (f->posns[2])[i];
278 		}
279 
280 		f->out[0] = f->out[2];
281 		if (f->curstat != 2)
282 			--(*f->posns[f->curstat]);
283 	}
284 	return f->curstat;
285 }
286 
287 void penter(Node *p)	/* set up parent pointers and leaf indices */
288 {
289 	switch (type(p)) {
290 	ELEAF
291 	LEAF
292 		info(p) = poscnt;
293 		poscnt++;
294 		break;
295 	UNARY
296 		penter(left(p));
297 		parent(left(p)) = p;
298 		break;
299 	case CAT:
300 	case OR:
301 		penter(left(p));
302 		penter(right(p));
303 		parent(left(p)) = p;
304 		parent(right(p)) = p;
305 		break;
306 	case ZERO:
307 		break;
308 	default:	/* can't happen */
309 		FATAL("can't happen: unknown type %d in penter", type(p));
310 		break;
311 	}
312 }
313 
314 void freetr(Node *p)	/* free parse tree */
315 {
316 	switch (type(p)) {
317 	ELEAF
318 	LEAF
319 		xfree(p);
320 		break;
321 	UNARY
322 	case ZERO:
323 		freetr(left(p));
324 		xfree(p);
325 		break;
326 	case CAT:
327 	case OR:
328 		freetr(left(p));
329 		freetr(right(p));
330 		xfree(p);
331 		break;
332 	default:	/* can't happen */
333 		FATAL("can't happen: unknown type %d in freetr", type(p));
334 		break;
335 	}
336 }
337 
338 /* in the parsing of regular expressions, metacharacters like . have */
339 /* to be seen literally;  \056 is not a metacharacter. */
340 
341 int hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
342 {			/* only pick up one 8-bit byte (2 chars) */
343 	const uschar *p;
344 	int n = 0;
345 	int i;
346 
347 	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
348 		if (isdigit(*p))
349 			n = 16 * n + *p - '0';
350 		else if (*p >= 'a' && *p <= 'f')
351 			n = 16 * n + *p - 'a' + 10;
352 		else if (*p >= 'A' && *p <= 'F')
353 			n = 16 * n + *p - 'A' + 10;
354 	}
355 	*pp = p;
356 	return n;
357 }
358 
359 
360 
361 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
362 
363 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
364 			/* and increment *pp */
365 {
366 	const uschar *p = *pp;
367 	int c;
368 
369 /* BUG: should advance by utf-8 char even if makes no sense */
370 
371 	if ((c = *p++) == 't') {
372 		c = '\t';
373 	} else if (c == 'n') {
374 		c = '\n';
375 	} else if (c == 'f') {
376 		c = '\f';
377 	} else if (c == 'r') {
378 		c = '\r';
379 	} else if (c == 'b') {
380 		c = '\b';
381 	} else if (c == 'v') {
382 		c = '\v';
383 	} else if (c == 'a') {
384 		c = '\a';
385 	} else if (c == '\\') {
386 		c = '\\';
387 	} else if (c == 'x') {	/* 2 hex digits follow */
388 		c = hexstr(&p, 2);	/* this adds a null if number is invalid */
389 	} else if (c == 'u') {	/* unicode char number up to 8 hex digits */
390 		c = hexstr(&p, 8);
391 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
392 		int n = c - '0';
393 		if (isoctdigit(*p)) {
394 			n = 8 * n + *p++ - '0';
395 			if (isoctdigit(*p))
396 				n = 8 * n + *p++ - '0';
397 		}
398 		c = n;
399 	} /* else */
400 		/* c = c; */
401 	*pp = p;
402 	return c;
403 }
404 
405 int *cclenter(const char *argp)	/* add a character class */
406 {
407 	int i, c, c2;
408 	int n;
409 	const uschar *p = (const uschar *) argp;
410 	int *bp, *retp;
411 	static int *buf = NULL;
412 	static int bufsz = 100;
413 
414 	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
415 		FATAL("out of space for character class [%.10s...] 1", p);
416 	bp = buf;
417 	for (i = 0; *p != 0; ) {
418 		n = u8_rune(&c, p);
419 		p += n;
420 		if (c == '\\') {
421 			c = quoted(&p);
422 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
423 			if (*p != 0) {
424 				c = bp[-1];
425 				/* c2 = *p++; */
426 				n = u8_rune(&c2, p);
427 				p += n;
428 				if (c2 == '\\')
429 					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
430 				if (c > c2) {	/* empty; ignore */
431 					bp--;
432 					i--;
433 					continue;
434 				}
435 				while (c < c2) {
436 					if (i >= bufsz) {
437 						bufsz *= 2;
438 						buf = (int *) realloc(buf, bufsz * sizeof(int));
439 						if (buf == NULL)
440 							FATAL("out of space for character class [%.10s...] 2", p);
441 						bp = buf + i;
442 					}
443 					*bp++ = ++c;
444 					i++;
445 				}
446 				continue;
447 			}
448 		}
449 		if (i >= bufsz) {
450 			bufsz *= 2;
451 			buf = (int *) realloc(buf, bufsz * sizeof(int));
452 			if (buf == NULL)
453 				FATAL("out of space for character class [%.10s...] 2", p);
454 			bp = buf + i;
455 		}
456 		*bp++ = c;
457 		i++;
458 	}
459 	*bp = 0;
460 	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
461 	/* xfree(op);  BUG: what are we freeing here? */
462 	retp = (int *) calloc(bp-buf+1, sizeof(int));
463 	for (i = 0; i < bp-buf+1; i++)
464 		retp[i] = buf[i];
465 	return retp;
466 }
467 
468 void overflo(const char *s)
469 {
470 	FATAL("regular expression too big: out of space in %.30s...", s);
471 }
472 
473 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
474 {
475 	int i;
476 	int *p;
477 
478 	switch (type(v)) {
479 	ELEAF
480 	LEAF
481 		f->re[info(v)].ltype = type(v);
482 		f->re[info(v)].lval.np = right(v);
483 		while (f->accept >= maxsetvec) {	/* guessing here! */
484 			resizesetvec(__func__);
485 		}
486 		for (i = 0; i <= f->accept; i++)
487 			setvec[i] = 0;
488 		setcnt = 0;
489 		follow(v);	/* computes setvec and setcnt */
490 		p = intalloc(setcnt + 1, __func__);
491 		f->re[info(v)].lfollow = p;
492 		*p = setcnt;
493 		for (i = f->accept; i >= 0; i--)
494 			if (setvec[i] == 1)
495 				*++p = i;
496 		break;
497 	UNARY
498 		cfoll(f,left(v));
499 		break;
500 	case CAT:
501 	case OR:
502 		cfoll(f,left(v));
503 		cfoll(f,right(v));
504 		break;
505 	case ZERO:
506 		break;
507 	default:	/* can't happen */
508 		FATAL("can't happen: unknown type %d in cfoll", type(v));
509 	}
510 }
511 
512 int first(Node *p)	/* collects initially active leaves of p into setvec */
513 			/* returns 0 if p matches empty string */
514 {
515 	int b, lp;
516 
517 	switch (type(p)) {
518 	ELEAF
519 	LEAF
520 		lp = info(p);	/* look for high-water mark of subscripts */
521 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
522 			resizesetvec(__func__);
523 		}
524 		if (type(p) == EMPTYRE) {
525 			setvec[lp] = 0;
526 			return(0);
527 		}
528 		if (setvec[lp] != 1) {
529 			setvec[lp] = 1;
530 			setcnt++;
531 		}
532 		if (type(p) == CCL && (*(int *) right(p)) == 0)
533 			return(0);		/* empty CCL */
534 		return(1);
535 	case PLUS:
536 		if (first(left(p)) == 0)
537 			return(0);
538 		return(1);
539 	case STAR:
540 	case QUEST:
541 		first(left(p));
542 		return(0);
543 	case CAT:
544 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
545 		return(1);
546 	case OR:
547 		b = first(right(p));
548 		if (first(left(p)) == 0 || b == 0) return(0);
549 		return(1);
550 	case ZERO:
551 		return 0;
552 	}
553 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
554 	return(-1);
555 }
556 
557 void follow(Node *v)	/* collects leaves that can follow v into setvec */
558 {
559 	Node *p;
560 
561 	if (type(v) == FINAL)
562 		return;
563 	p = parent(v);
564 	switch (type(p)) {
565 	case STAR:
566 	case PLUS:
567 		first(v);
568 		follow(p);
569 		return;
570 
571 	case OR:
572 	case QUEST:
573 		follow(p);
574 		return;
575 
576 	case CAT:
577 		if (v == left(p)) {	/* v is left child of p */
578 			if (first(right(p)) == 0) {
579 				follow(p);
580 				return;
581 			}
582 		} else		/* v is right child */
583 			follow(p);
584 		return;
585 	}
586 }
587 
588 int member(int c, int *sarg)	/* is c in s? */
589 {
590 	int *s = (int *) sarg;
591 
592 	while (*s)
593 		if (c == *s++)
594 			return(1);
595 	return(0);
596 }
597 
598 static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */
599 {
600 	int i;
601 	for (i = 0; i < f->gototab_len; i++) {
602 		if (f->gototab[state][i].ch == 0)
603 			break;
604 		if (f->gototab[state][i].ch == ch)
605 			return f->gototab[state][i].state;
606 	}
607 	return 0;
608 }
609 
610 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */
611 {
612 	int i;
613 	for (i = 0; i < f->gototab_len; i++) {
614 		if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) {
615 			f->gototab[state][i].ch = ch;
616 			f->gototab[state][i].state = val;
617 			return val;
618 		}
619 	}
620 	overflo(__func__);
621 	return val; /* not used anywhere at the moment */
622 }
623 
624 int match(fa *f, const char *p0)	/* shortest match ? */
625 {
626 	int s, ns;
627 	int n;
628 	int rune;
629 	const uschar *p = (const uschar *) p0;
630 
631 	/* return pmatch(f, p0); does it matter whether longest or shortest? */
632 
633 	s = f->initstat;
634 	assert (s < f->state_count);
635 
636 	if (f->out[s])
637 		return(1);
638 	do {
639 		/* assert(*p < NCHARS); */
640 		n = u8_rune(&rune, p);
641 		if ((ns = get_gototab(f, s, rune)) != 0)
642 			s = ns;
643 		else
644 			s = cgoto(f, s, rune);
645 		if (f->out[s])
646 			return(1);
647 		if (*p == 0)
648 			break;
649 		p += n;
650 	} while (1);  /* was *p++ != 0 */
651 	return(0);
652 }
653 
654 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
655 {
656 	int s, ns;
657 	int n;
658 	int rune;
659 	const uschar *p = (const uschar *) p0;
660 	const uschar *q;
661 
662 	s = f->initstat;
663 	assert(s < f->state_count);
664 
665 	patbeg = (const char *)p;
666 	patlen = -1;
667 	do {
668 		q = p;
669 		do {
670 			if (f->out[s])		/* final state */
671 				patlen = q-p;
672 			/* assert(*q < NCHARS); */
673 			n = u8_rune(&rune, q);
674 			if ((ns = get_gototab(f, s, rune)) != 0)
675 				s = ns;
676 			else
677 				s = cgoto(f, s, rune);
678 
679 			assert(s < f->state_count);
680 
681 			if (s == 1) {	/* no transition */
682 				if (patlen >= 0) {
683 					patbeg = (const char *) p;
684 					return(1);
685 				}
686 				else
687 					goto nextin;	/* no match */
688 			}
689 			if (*q == 0)
690 				break;
691 			q += n;
692 		} while (1);
693 		q++;  /* was *q++ */
694 		if (f->out[s])
695 			patlen = q-p-1;	/* don't count $ */
696 		if (patlen >= 0) {
697 			patbeg = (const char *) p;
698 			return(1);
699 		}
700 	nextin:
701 		s = 2;
702 		if (*p == 0)
703 			break;
704 		n = u8_rune(&rune, p);
705 		p += n;
706 	} while (1); /* was *p++ */
707 	return (0);
708 }
709 
710 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
711 {
712 	int s, ns;
713         int n;
714         int rune;
715 	const uschar *p = (const uschar *) p0;
716 	const uschar *q;
717 
718 	s = f->initstat;
719 	assert(s < f->state_count);
720 
721 	patbeg = (const char *)p;
722 	patlen = -1;
723 	while (*p) {
724 		q = p;
725 		do {
726 			if (f->out[s])		/* final state */
727 				patlen = q-p;
728 			/* assert(*q < NCHARS); */
729 			n = u8_rune(&rune, q);
730 			if ((ns = get_gototab(f, s, rune)) != 0)
731 				s = ns;
732 			else
733 				s = cgoto(f, s, rune);
734 			if (s == 1) {	/* no transition */
735 				if (patlen > 0) {
736 					patbeg = (const char *) p;
737 					return(1);
738 				} else
739 					goto nnextin;	/* no nonempty match */
740 			}
741 			if (*q == 0)
742 				break;
743 			q += n;
744 		} while (1);
745 		q++;
746 		if (f->out[s])
747 			patlen = q-p-1;	/* don't count $ */
748 		if (patlen > 0 ) {
749 			patbeg = (const char *) p;
750 			return(1);
751 		}
752 	nnextin:
753 		s = 2;
754 		p++;
755 	}
756 	return (0);
757 }
758 
759 
760 #define MAX_UTF_BYTES	4	// UTF-8 is up to 4 bytes long
761 
762 // Read one rune at a time from the given FILE*. Return both
763 // the bytes and the actual rune.
764 
765 struct runedata {
766 	int rune;
767 	size_t len;
768 	char bytes[6];
769 };
770 
771 struct runedata getrune(FILE *fp)
772 {
773 	struct runedata result;
774 	int c, next;
775 
776 	memset(&result, 0, sizeof(result));
777 
778 	c = getc(fp);
779 	if (c == EOF)
780 		return result;	// result.rune == 0 --> EOF
781 	else if (c < 128 || awk_mb_cur_max == 1) {
782 		result.bytes[0] = c;
783 		result.len = 1;
784 		result.rune = c;
785 
786 		return result;
787 	}
788 
789 	// need to get bytes and fill things in
790 	result.bytes[0] = c;
791 	result.len = 1;
792 
793 	next = 1;
794 	for (int i = 1; i < MAX_UTF_BYTES; i++) {
795 		c = getc(fp);
796 		if (c == EOF)
797 			break;
798 		result.bytes[next++] = c;
799 		result.len++;
800 	}
801 
802 	// put back any extra input bytes
803 	int actual_len = u8_nextlen(result.bytes);
804 	while (result.len > actual_len) {
805 		ungetc(result.bytes[--result.len], fp);
806 	}
807 
808 	result.bytes[result.len] = '\0';
809 	(void) u8_rune(& result.rune, (uschar *) result.bytes);
810 
811 	return result;
812 }
813 
814 
815 /*
816  * NAME
817  *     fnematch
818  *
819  * DESCRIPTION
820  *     A stream-fed version of nematch which transfers characters to a
821  *     null-terminated buffer. All characters up to and including the last
822  *     character of the matching text or EOF are placed in the buffer. If
823  *     a match is found, patbeg and patlen are set appropriately.
824  *
825  * RETURN VALUES
826  *     false    No match found.
827  *     true     Match found.
828  */
829 
830 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
831 {
832 	char *buf = *pbuf;
833 	int bufsize = *pbufsize;
834 	int i, j, k, ns, s;
835 	struct runedata r;
836 
837 	s = pfa->initstat;
838 	patlen = 0;
839 
840 	/*
841 	 * All indices relative to buf.
842 	 * i <= j <= k <= bufsize
843 	 *
844 	 * i: origin of active substring (first byte of first character)
845 	 * j: current character		(last byte of current character)
846 	 * k: destination of next getc()
847 	 */
848 	i = -1, k = 0;
849         do {
850 		j = i++;
851 		do {
852 			r = getrune(f);
853 			if ((++j + r.len) >= k) {
854 				if (k >= bufsize)
855 					if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
856 						FATAL("stream '%.30s...' too long", buf);
857 			}
858 			memcpy(buf + k, r.bytes, r.len);
859 			j += r.len - 1;	// incremented next time around the loop
860 			k += r.len;
861 
862 			if ((ns = get_gototab(pfa, s, r.rune)) != 0)
863 				s = ns;
864 			else
865 				s = cgoto(pfa, s, r.rune);
866 
867 			if (pfa->out[s]) {	/* final state */
868 				patlen = j - i + 1;
869 				if (r.rune == 0)	/* don't count $ */
870 					patlen--;
871 			}
872 		} while (buf[j] && s != 1);
873 		s = 2;
874 		if (r.len > 1)
875 			i += r.len - 1;	// i incremented around the loop
876 	} while (buf[i] && !patlen);
877 
878 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
879 	*pbuf = buf;
880 	*pbufsize = bufsize;
881 
882 	if (patlen) {
883 		patbeg = (char *) buf + i;
884 		/*
885 		 * Under no circumstances is the last character fed to
886 		 * the automaton part of the match. It is EOF's nullbyte,
887 		 * or it sent the automaton into a state with no further
888 		 * transitions available (s==1), or both. Room for a
889 		 * terminating nullbyte is guaranteed.
890 		 *
891 		 * ungetc any chars after the end of matching text
892 		 * (except for EOF's nullbyte, if present) and null
893 		 * terminate the buffer.
894 		 */
895 		do
896 			for (int ii = r.len; ii > 0; ii--)
897 				if (buf[--k] && ungetc(buf[k], f) == EOF)
898 					FATAL("unable to ungetc '%c'", buf[k]);
899 		while (k > i + patlen);
900 		buf[k] = '\0';
901 		return true;
902 	}
903 	else
904 		return false;
905 }
906 
907 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
908 {			/* uses relex() to scan regular expression */
909 	Node *np;
910 
911 	DPRINTF("reparse <%s>\n", p);
912 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
913 	rtok = relex();
914 	/* GNU compatibility: an empty regexp matches anything */
915 	if (rtok == '\0') {
916 		/* FATAL("empty regular expression"); previous */
917 		return(op2(EMPTYRE, NIL, NIL));
918 	}
919 	np = regexp();
920 	if (rtok != '\0')
921 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
922 	return(np);
923 }
924 
925 Node *regexp(void)	/* top-level parse of reg expr */
926 {
927 	return (alt(concat(primary())));
928 }
929 
930 Node *primary(void)
931 {
932 	Node *np;
933 	int savelastatom;
934 
935 	switch (rtok) {
936 	case CHAR:
937 		lastatom = starttok;
938 		np = op2(CHAR, NIL, itonp(rlxval));
939 		rtok = relex();
940 		return (unary(np));
941 	case ALL:
942 		rtok = relex();
943 		return (unary(op2(ALL, NIL, NIL)));
944 	case EMPTYRE:
945 		rtok = relex();
946 		return (unary(op2(EMPTYRE, NIL, NIL)));
947 	case DOT:
948 		lastatom = starttok;
949 		rtok = relex();
950 		return (unary(op2(DOT, NIL, NIL)));
951 	case CCL:
952 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
953 		lastatom = starttok;
954 		rtok = relex();
955 		return (unary(np));
956 	case NCCL:
957 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
958 		lastatom = starttok;
959 		rtok = relex();
960 		return (unary(np));
961 	case '^':
962 		rtok = relex();
963 		return (unary(op2(CHAR, NIL, itonp(HAT))));
964 	case '$':
965 		rtok = relex();
966 		return (unary(op2(CHAR, NIL, NIL)));
967 	case '(':
968 		lastatom = starttok;
969 		savelastatom = starttok - basestr; /* Retain over recursion */
970 		rtok = relex();
971 		if (rtok == ')') {	/* special pleading for () */
972 			rtok = relex();
973 			return unary(op2(CCL, NIL, (Node *) cclenter("")));
974 		}
975 		np = regexp();
976 		if (rtok == ')') {
977 			lastatom = basestr + savelastatom; /* Restore */
978 			rtok = relex();
979 			return (unary(np));
980 		}
981 		else
982 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
983 	default:
984 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
985 	}
986 	return 0;	/*NOTREACHED*/
987 }
988 
989 Node *concat(Node *np)
990 {
991 	switch (rtok) {
992 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
993 		return (concat(op2(CAT, np, primary())));
994 	case EMPTYRE:
995 		rtok = relex();
996 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
997 				primary())));
998 	}
999 	return (np);
1000 }
1001 
1002 Node *alt(Node *np)
1003 {
1004 	if (rtok == OR) {
1005 		rtok = relex();
1006 		return (alt(op2(OR, np, concat(primary()))));
1007 	}
1008 	return (np);
1009 }
1010 
1011 Node *unary(Node *np)
1012 {
1013 	switch (rtok) {
1014 	case STAR:
1015 		rtok = relex();
1016 		return (unary(op2(STAR, np, NIL)));
1017 	case PLUS:
1018 		rtok = relex();
1019 		return (unary(op2(PLUS, np, NIL)));
1020 	case QUEST:
1021 		rtok = relex();
1022 		return (unary(op2(QUEST, np, NIL)));
1023 	case ZERO:
1024 		rtok = relex();
1025 		return (unary(op2(ZERO, np, NIL)));
1026 	default:
1027 		return (np);
1028 	}
1029 }
1030 
1031 /*
1032  * Character class definitions conformant to the POSIX locale as
1033  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1034  * and operating character sets are both ASCII (ISO646) or supersets
1035  * thereof.
1036  *
1037  * Note that to avoid overflowing the temporary buffer used in
1038  * relex(), the expanded character class (prior to range expansion)
1039  * must be less than twice the size of their full name.
1040  */
1041 
1042 /* Because isblank doesn't show up in any of the header files on any
1043  * system i use, it's defined here.  if some other locale has a richer
1044  * definition of "blank", define HAS_ISBLANK and provide your own
1045  * version.
1046  * the parentheses here are an attempt to find a path through the maze
1047  * of macro definition and/or function and/or version provided.  thanks
1048  * to nelson beebe for the suggestion; let's see if it works everywhere.
1049  */
1050 
1051 /* #define HAS_ISBLANK */
1052 #ifndef HAS_ISBLANK
1053 
1054 int (xisblank)(int c)
1055 {
1056 	return c==' ' || c=='\t';
1057 }
1058 
1059 #endif
1060 
1061 static const struct charclass {
1062 	const char *cc_name;
1063 	int cc_namelen;
1064 	int (*cc_func)(int);
1065 } charclasses[] = {
1066 	{ "alnum",	5,	isalnum },
1067 	{ "alpha",	5,	isalpha },
1068 #ifndef HAS_ISBLANK
1069 	{ "blank",	5,	xisblank },
1070 #else
1071 	{ "blank",	5,	isblank },
1072 #endif
1073 	{ "cntrl",	5,	iscntrl },
1074 	{ "digit",	5,	isdigit },
1075 	{ "graph",	5,	isgraph },
1076 	{ "lower",	5,	islower },
1077 	{ "print",	5,	isprint },
1078 	{ "punct",	5,	ispunct },
1079 	{ "space",	5,	isspace },
1080 	{ "upper",	5,	isupper },
1081 	{ "xdigit",	6,	isxdigit },
1082 	{ NULL,		0,	NULL },
1083 };
1084 
1085 #define REPEAT_SIMPLE		0
1086 #define REPEAT_PLUS_APPENDED	1
1087 #define REPEAT_WITH_Q		2
1088 #define REPEAT_ZERO		3
1089 
1090 static int
1091 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1092 	       int atomlen, int firstnum, int secondnum, int special_case)
1093 {
1094 	int i, j;
1095 	uschar *buf = 0;
1096 	int ret = 1;
1097 	int init_q = (firstnum == 0);		/* first added char will be ? */
1098 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
1099 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
1100 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
1101 	int size = prefix_length +  suffix_length;
1102 
1103 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
1104 		size += atomlen*(firstnum-1);
1105 	}
1106 
1107 	/* Adjust size of buffer for special cases */
1108 	if (special_case == REPEAT_PLUS_APPENDED) {
1109 		size++;		/* for the final + */
1110 	} else if (special_case == REPEAT_WITH_Q) {
1111 		size += init_q + (atomlen+1)* (n_q_reps-init_q);
1112 	} else if (special_case == REPEAT_ZERO) {
1113 		size += 2;	/* just a null ERE: () */
1114 	}
1115 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
1116 		FATAL("out of space in reg expr %.10s..", lastre);
1117 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
1118 	j = prefix_length;
1119 	if (special_case == REPEAT_ZERO) {
1120 		j -= atomlen;
1121 		buf[j++] = '(';
1122 		buf[j++] = ')';
1123 	}
1124 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
1125 		memcpy(&buf[j], atom, atomlen);
1126 		j += atomlen;
1127 	}
1128 	if (special_case == REPEAT_PLUS_APPENDED) {
1129 		buf[j++] = '+';
1130 	} else if (special_case == REPEAT_WITH_Q) {
1131 		if (init_q)
1132 			buf[j++] = '?';
1133 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
1134 			memcpy(&buf[j], atom, atomlen);
1135 			j += atomlen;
1136 			buf[j++] = '?';
1137 		}
1138 	}
1139 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
1140 	j += suffix_length;
1141 	buf[j] = '\0';
1142 	/* free old basestr */
1143 	if (firstbasestr != basestr) {
1144 		if (basestr)
1145 			xfree(basestr);
1146 	}
1147 	basestr = buf;
1148 	prestr  = buf + prefix_length;
1149 	if (special_case == REPEAT_ZERO) {
1150 		prestr  -= atomlen;
1151 		ret++;
1152 	}
1153 	return ret;
1154 }
1155 
1156 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1157 		  int atomlen, int firstnum, int secondnum)
1158 {
1159 	/*
1160 	   In general, the repetition specifier or "bound" is replaced here
1161 	   by an equivalent ERE string, repeating the immediately previous atom
1162 	   and appending ? and + as needed. Note that the first copy of the
1163 	   atom is left in place, except in the special_case of a zero-repeat
1164 	   (i.e., {0}).
1165 	 */
1166 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1167 		if (firstnum < 2) {
1168 			/* 0 or 1: should be handled before you get here */
1169 			FATAL("internal error");
1170 		} else {
1171 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1172 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1173 		}
1174 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1175 		if (firstnum == 0) {	/* {0} or {0,0} */
1176 			/* This case is unusual because the resulting
1177 			   replacement string might actually be SMALLER than
1178 			   the original ERE */
1179 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1180 					firstnum, secondnum, REPEAT_ZERO);
1181 		} else {		/* (firstnum >= 1) */
1182 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1183 					firstnum, secondnum, REPEAT_SIMPLE);
1184 		}
1185 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1186 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1187 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1188 					firstnum, secondnum, REPEAT_WITH_Q);
1189 	} else {	/* Error - shouldn't be here (n>m) */
1190 		FATAL("internal error");
1191 	}
1192 	return 0;
1193 }
1194 
1195 extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */
1196 
1197 int relex(void)		/* lexical analyzer for reparse */
1198 {
1199 	int c, n;
1200 	int cflag;
1201 	static uschar *buf = NULL;
1202 	static int bufsz = 100;
1203 	uschar *bp;
1204 	const struct charclass *cc;
1205 	int i;
1206 	int num, m;
1207 	bool commafound, digitfound;
1208 	const uschar *startreptok;
1209 	static int parens = 0;
1210 
1211 rescan:
1212 	starttok = prestr;
1213 
1214 	if ((n = u8_rune(&rlxval, prestr)) > 1) {
1215 		prestr += n;
1216 		starttok = prestr;
1217 		return CHAR;
1218 	}
1219 
1220 	switch (c = *prestr++) {
1221 	case '|': return OR;
1222 	case '*': return STAR;
1223 	case '+': return PLUS;
1224 	case '?': return QUEST;
1225 	case '.': return DOT;
1226 	case '\0': prestr--; return '\0';
1227 	case '^':
1228 	case '$':
1229 		return c;
1230 	case '(':
1231 		parens++;
1232 		return c;
1233 	case ')':
1234 		if (parens) {
1235 			parens--;
1236 			return c;
1237 		}
1238 		/* unmatched close parenthesis; per POSIX, treat as literal */
1239 		rlxval = c;
1240 		return CHAR;
1241 	case '\\':
1242 		rlxval = quoted(&prestr);
1243 		return CHAR;
1244 	default:
1245 		rlxval = c;
1246 		return CHAR;
1247 	case '[':
1248 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1249 			FATAL("out of space in reg expr %.10s..", lastre);
1250 		bp = buf;
1251 		if (*prestr == '^') {
1252 			cflag = 1;
1253 			prestr++;
1254 		}
1255 		else
1256 			cflag = 0;
1257 		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1258 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1259 			FATAL("out of space for reg expr %.10s...", lastre);
1260 		for (; ; ) {
1261 			if ((n = u8_rune(&rlxval, prestr)) > 1) {
1262 				for (i = 0; i < n; i++)
1263 					*bp++ = *prestr++;
1264 				continue;
1265 			}
1266 			if ((c = *prestr++) == '\\') {
1267 				*bp++ = '\\';
1268 				if ((c = *prestr++) == '\0')
1269 					FATAL("nonterminated character class %.20s...", lastre);
1270 				*bp++ = c;
1271 			/* } else if (c == '\n') { */
1272 			/* 	FATAL("newline in character class %.20s...", lastre); */
1273 			} else if (c == '[' && *prestr == ':') {
1274 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1275 				for (cc = charclasses; cc->cc_name; cc++)
1276 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1277 						break;
1278 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1279 				    prestr[2 + cc->cc_namelen] == ']') {
1280 					prestr += cc->cc_namelen + 3;
1281 					/*
1282 					 * BUG: We begin at 1, instead of 0, since we
1283 					 * would otherwise prematurely terminate the
1284 					 * string for classes like [[:cntrl:]]. This
1285 					 * means that we can't match the NUL character,
1286 					 * not without first adapting the entire
1287 					 * program to track each string's length.
1288 					 */
1289 					for (i = 1; i <= UCHAR_MAX; i++) {
1290 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1291 						    FATAL("out of space for reg expr %.10s...", lastre);
1292 						if (cc->cc_func(i)) {
1293 							/* escape backslash */
1294 							if (i == '\\') {
1295 								*bp++ = '\\';
1296 								n++;
1297 							}
1298 
1299 							*bp++ = i;
1300 							n++;
1301 						}
1302 					}
1303 				} else
1304 					*bp++ = c;
1305 			} else if (c == '[' && *prestr == '.') {
1306 				char collate_char;
1307 				prestr++;
1308 				collate_char = *prestr++;
1309 				if (*prestr == '.' && prestr[1] == ']') {
1310 					prestr += 2;
1311 					/* Found it: map via locale TBD: for
1312 					   now, simply return this char.  This
1313 					   is sufficient to pass conformance
1314 					   test awk.ex 156
1315 					 */
1316 					if (*prestr == ']') {
1317 						prestr++;
1318 						rlxval = collate_char;
1319 						return CHAR;
1320 					}
1321 				}
1322 			} else if (c == '[' && *prestr == '=') {
1323 				char equiv_char;
1324 				prestr++;
1325 				equiv_char = *prestr++;
1326 				if (*prestr == '=' && prestr[1] == ']') {
1327 					prestr += 2;
1328 					/* Found it: map via locale TBD: for now
1329 					   simply return this char. This is
1330 					   sufficient to pass conformance test
1331 					   awk.ex 156
1332 					 */
1333 					if (*prestr == ']') {
1334 						prestr++;
1335 						rlxval = equiv_char;
1336 						return CHAR;
1337 					}
1338 				}
1339 			} else if (c == '\0') {
1340 				FATAL("nonterminated character class %.20s", lastre);
1341 			} else if (bp == buf) {	/* 1st char is special */
1342 				*bp++ = c;
1343 			} else if (c == ']') {
1344 				*bp++ = 0;
1345 				rlxstr = (uschar *) tostring((char *) buf);
1346 				if (cflag == 0)
1347 					return CCL;
1348 				else
1349 					return NCCL;
1350 			} else
1351 				*bp++ = c;
1352 		}
1353 		break;
1354 	case '{':
1355 		if (isdigit(*(prestr))) {
1356 			num = 0;	/* Process as a repetition */
1357 			n = -1; m = -1;
1358 			commafound = false;
1359 			digitfound = false;
1360 			startreptok = prestr-1;
1361 			/* Remember start of previous atom here ? */
1362 		} else {        	/* just a { char, not a repetition */
1363 			rlxval = c;
1364 			return CHAR;
1365                 }
1366 		for (; ; ) {
1367 			if ((c = *prestr++) == '}') {
1368 				if (commafound) {
1369 					if (digitfound) { /* {n,m} */
1370 						m = num;
1371 						if (m < n)
1372 							FATAL("illegal repetition expression: class %.20s",
1373 								lastre);
1374 						if (n == 0 && m == 1) {
1375 							return QUEST;
1376 						}
1377 					} else {	/* {n,} */
1378 						if (n == 0)
1379 							return STAR;
1380 						else if (n == 1)
1381 							return PLUS;
1382 					}
1383 				} else {
1384 					if (digitfound) { /* {n} same as {n,n} */
1385 						n = num;
1386 						m = num;
1387 					} else {	/* {} */
1388 						FATAL("illegal repetition expression: class %.20s",
1389 							lastre);
1390 					}
1391 				}
1392 				if (repeat(starttok, prestr-starttok, lastatom,
1393 					   startreptok - lastatom, n, m) > 0) {
1394 					if (n == 0 && m == 0) {
1395 						return ZERO;
1396 					}
1397 					/* must rescan input for next token */
1398 					goto rescan;
1399 				}
1400 				/* Failed to replace: eat up {...} characters
1401 				   and treat like just PLUS */
1402 				return PLUS;
1403 			} else if (c == '\0') {
1404 				FATAL("nonterminated character class %.20s",
1405 					lastre);
1406 			} else if (isdigit(c)) {
1407 				num = 10 * num + c - '0';
1408 				digitfound = true;
1409 			} else if (c == ',') {
1410 				if (commafound)
1411 					FATAL("illegal repetition expression: class %.20s",
1412 						lastre);
1413 				/* looking for {n,} or {n,m} */
1414 				commafound = true;
1415 				n = num;
1416 				digitfound = false; /* reset */
1417 				num = 0;
1418 			} else {
1419 				FATAL("illegal repetition expression: class %.20s",
1420 					lastre);
1421 			}
1422 		}
1423 		break;
1424 	}
1425 }
1426 
1427 int cgoto(fa *f, int s, int c)
1428 {
1429 	int *p, *q;
1430 	int i, j, k;
1431 
1432 	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
1433 	while (f->accept >= maxsetvec) {	/* guessing here! */
1434 		resizesetvec(__func__);
1435 	}
1436 	for (i = 0; i <= f->accept; i++)
1437 		setvec[i] = 0;
1438 	setcnt = 0;
1439 	resize_state(f, s);
1440 	/* compute positions of gototab[s,c] into setvec */
1441 	p = f->posns[s];
1442 	for (i = 1; i <= *p; i++) {
1443 		if ((k = f->re[p[i]].ltype) != FINAL) {
1444 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1445 			 || (k == DOT && c != 0 && c != HAT)
1446 			 || (k == ALL && c != 0)
1447 			 || (k == EMPTYRE && c != 0)
1448 			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1449 			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1450 				q = f->re[p[i]].lfollow;
1451 				for (j = 1; j <= *q; j++) {
1452 					if (q[j] >= maxsetvec) {
1453 						resizesetvec(__func__);
1454 					}
1455 					if (setvec[q[j]] == 0) {
1456 						setcnt++;
1457 						setvec[q[j]] = 1;
1458 					}
1459 				}
1460 			}
1461 		}
1462 	}
1463 	/* determine if setvec is a previous state */
1464 	tmpset[0] = setcnt;
1465 	j = 1;
1466 	for (i = f->accept; i >= 0; i--)
1467 		if (setvec[i]) {
1468 			tmpset[j++] = i;
1469 		}
1470 	resize_state(f, f->curstat > s ? f->curstat : s);
1471 	/* tmpset == previous state? */
1472 	for (i = 1; i <= f->curstat; i++) {
1473 		p = f->posns[i];
1474 		if ((k = tmpset[0]) != p[0])
1475 			goto different;
1476 		for (j = 1; j <= k; j++)
1477 			if (tmpset[j] != p[j])
1478 				goto different;
1479 		/* setvec is state i */
1480 		if (c != HAT)
1481 			set_gototab(f, s, c, i);
1482 		return i;
1483 	  different:;
1484 	}
1485 
1486 	/* add tmpset to current set of states */
1487 	++(f->curstat);
1488 	resize_state(f, f->curstat);
1489 	for (i = 0; i < NCHARS; i++)
1490 		set_gototab(f, f->curstat, 0, 0);
1491 	xfree(f->posns[f->curstat]);
1492 	p = intalloc(setcnt + 1, __func__);
1493 
1494 	f->posns[f->curstat] = p;
1495 	if (c != HAT)
1496 		set_gototab(f, s, c, f->curstat);
1497 	for (i = 0; i <= setcnt; i++)
1498 		p[i] = tmpset[i];
1499 	if (setvec[f->accept])
1500 		f->out[f->curstat] = 1;
1501 	else
1502 		f->out[f->curstat] = 0;
1503 	return f->curstat;
1504 }
1505 
1506 
1507 void freefa(fa *f)	/* free a finite automaton */
1508 {
1509 	int i;
1510 
1511 	if (f == NULL)
1512 		return;
1513 	for (i = 0; i < f->state_count; i++)
1514 		xfree(f->gototab[i])
1515 	for (i = 0; i <= f->curstat; i++)
1516 		xfree(f->posns[i]);
1517 	for (i = 0; i <= f->accept; i++) {
1518 		xfree(f->re[i].lfollow);
1519 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1520 			xfree(f->re[i].lval.np);
1521 	}
1522 	xfree(f->restr);
1523 	xfree(f->out);
1524 	xfree(f->posns);
1525 	xfree(f->gototab);
1526 	xfree(f);
1527 }
1528