xref: /freebsd/contrib/one-true-awk/b.c (revision 2db2566f55b06d80e28c932ec30b5730815d95a2)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #define	DEBUG
28 
29 #include <ctype.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include "awk.h"
35 #include "awkgram.tab.h"
36 
37 #define MAXLIN 22
38 
39 #define type(v)		(v)->nobj	/* badly overloaded here */
40 #define info(v)		(v)->ntype	/* badly overloaded here */
41 #define left(v)		(v)->narg[0]
42 #define right(v)	(v)->narg[1]
43 #define parent(v)	(v)->nnext
44 
45 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
47 #define UNARY	case STAR: case PLUS: case QUEST:
48 
49 /* encoding in tree Nodes:
50 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 		left is index, right contains value or pointer to value
52 	unary (STAR, PLUS, QUEST): left is child, right is null
53 	binary (CAT, OR): left and right are children
54 	parent contains pointer to parent
55 */
56 
57 
58 int	*setvec;
59 int	*tmpset;
60 int	maxsetvec = 0;
61 
62 int	rtok;		/* next token in current re */
63 int	rlxval;
64 static const uschar	*rlxstr;
65 static const uschar	*prestr;	/* current position in current re */
66 static const uschar	*lastre;	/* origin of last re */
67 static const uschar	*lastatom;	/* origin of last Atom */
68 static const uschar	*starttok;
69 static const uschar 	*basestr;	/* starts with original, replaced during
70 				   repetition processing */
71 static const uschar 	*firstbasestr;
72 
73 static	int setcnt;
74 static	int poscnt;
75 
76 const char	*patbeg;
77 int	patlen;
78 
79 #define	NFA	128	/* cache this many dynamic fa's */
80 fa	*fatab[NFA];
81 int	nfatab	= 0;	/* entries in fatab */
82 
83 extern int u8_nextlen(const char *s);
84 
85 
86 /* utf-8 mechanism:
87 
88    For most of Awk, utf-8 strings just "work", since they look like
89    null-terminated sequences of 8-bit bytes.
90 
91    Functions like length(), index(), and substr() have to operate
92    in units of utf-8 characters.  The u8_* functions in run.c
93    handle this.
94 
95    Regular expressions are more complicated, since the basic
96    mechanism of the goto table used 8-bit byte indices into the
97    gototab entries to compute the next state.  Unicode is a lot
98    bigger, so the gototab entries are now structs with a character
99    and a next state. These are sorted by code point and binary
100    searched.
101 
102    Throughout the RE mechanism in b.c, utf-8 characters are
103    converted to their utf-32 value.  This mostly shows up in
104    cclenter, which expands character class ranges like a-z and now
105    alpha-omega.  The size of a gototab array is still about 256.
106    This should be dynamic, but for now things work ok for a single
107    code page of Unicode, which is the most likely case.
108 
109    The code changes are localized in run.c and b.c.  I have added a
110    handful of functions to somewhat better hide the implementation,
111    but a lot more could be done.
112 
113  */
114 
115 static int entry_cmp(const void *l, const void *r);
116 static int get_gototab(fa*, int, int);
117 static int set_gototab(fa*, int, int, int);
118 static void clear_gototab(fa*, int);
119 extern int u8_rune(int *, const char *);
120 
121 static int *
122 intalloc(size_t n, const char *f)
123 {
124 	int *p = (int *) calloc(n, sizeof(int));
125 	if (p == NULL)
126 		overflo(f);
127 	return p;
128 }
129 
130 static void
131 resizesetvec(const char *f)
132 {
133 	if (maxsetvec == 0)
134 		maxsetvec = MAXLIN;
135 	else
136 		maxsetvec *= 4;
137 	setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
138 	tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
139 	if (setvec == NULL || tmpset == NULL)
140 		overflo(f);
141 }
142 
143 static void
144 resize_state(fa *f, int state)
145 {
146 	gtt *p;
147 	uschar *p2;
148 	int **p3;
149 	int i, new_count;
150 
151 	if (++state < f->state_count)
152 		return;
153 
154 	new_count = state + 10; /* needs to be tuned */
155 
156 	p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
157 	if (p == NULL)
158 		goto out;
159 	f->gototab = p;
160 
161 	p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
162 	if (p2 == NULL)
163 		goto out;
164 	f->out = p2;
165 
166 	p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
167 	if (p3 == NULL)
168 		goto out;
169 	f->posns = p3;
170 
171 	for (i = f->state_count; i < new_count; ++i) {
172 		f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
173 		if (f->gototab[i].entries == NULL)
174 			goto out;
175 		f->gototab[i].allocated = NCHARS;
176 		f->gototab[i].inuse = 0;
177 		f->out[i] = 0;
178 		f->posns[i] = NULL;
179 	}
180 	f->state_count = new_count;
181 	return;
182 out:
183 	overflo(__func__);
184 }
185 
186 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
187 {
188 	int i, use, nuse;
189 	fa *pfa;
190 	static int now = 1;
191 
192 	if (setvec == NULL) {	/* first time through any RE */
193 		resizesetvec(__func__);
194 	}
195 
196 	if (compile_time != RUNNING)	/* a constant for sure */
197 		return mkdfa(s, anchor);
198 	for (i = 0; i < nfatab; i++)	/* is it there already? */
199 		if (fatab[i]->anchor == anchor
200 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
201 			fatab[i]->use = now++;
202 			return fatab[i];
203 		}
204 	pfa = mkdfa(s, anchor);
205 	if (nfatab < NFA) {	/* room for another */
206 		fatab[nfatab] = pfa;
207 		fatab[nfatab]->use = now++;
208 		nfatab++;
209 		return pfa;
210 	}
211 	use = fatab[0]->use;	/* replace least-recently used */
212 	nuse = 0;
213 	for (i = 1; i < nfatab; i++)
214 		if (fatab[i]->use < use) {
215 			use = fatab[i]->use;
216 			nuse = i;
217 		}
218 	freefa(fatab[nuse]);
219 	fatab[nuse] = pfa;
220 	pfa->use = now++;
221 	return pfa;
222 }
223 
224 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
225 				/* anchor = true for anchored matches, else false */
226 {
227 	Node *p, *p1;
228 	fa *f;
229 
230 	firstbasestr = (const uschar *) s;
231 	basestr = firstbasestr;
232 	p = reparse(s);
233 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
234 		/* put ALL STAR in front of reg.  exp. */
235 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
236 		/* put FINAL after reg.  exp. */
237 
238 	poscnt = 0;
239 	penter(p1);	/* enter parent pointers and leaf indices */
240 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
241 		overflo(__func__);
242 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
243 	cfoll(f, p1);	/* set up follow sets */
244 	freetr(p1);
245 	resize_state(f, 1);
246 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
247 	f->posns[1] = intalloc(1, __func__);
248 	*f->posns[1] = 0;
249 	f->initstat = makeinit(f, anchor);
250 	f->anchor = anchor;
251 	f->restr = (uschar *) tostring(s);
252 	if (firstbasestr != basestr) {
253 		if (basestr)
254 			xfree(basestr);
255 	}
256 	return f;
257 }
258 
259 int makeinit(fa *f, bool anchor)
260 {
261 	int i, k;
262 
263 	f->curstat = 2;
264 	f->out[2] = 0;
265 	k = *(f->re[0].lfollow);
266 	xfree(f->posns[2]);
267 	f->posns[2] = intalloc(k + 1,  __func__);
268 	for (i = 0; i <= k; i++) {
269 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
270 	}
271 	if ((f->posns[2])[1] == f->accept)
272 		f->out[2] = 1;
273 	clear_gototab(f, 2);
274 	f->curstat = cgoto(f, 2, HAT);
275 	if (anchor) {
276 		*f->posns[2] = k-1;	/* leave out position 0 */
277 		for (i = 0; i < k; i++) {
278 			(f->posns[0])[i] = (f->posns[2])[i];
279 		}
280 
281 		f->out[0] = f->out[2];
282 		if (f->curstat != 2)
283 			--(*f->posns[f->curstat]);
284 	}
285 	return f->curstat;
286 }
287 
288 void penter(Node *p)	/* set up parent pointers and leaf indices */
289 {
290 	switch (type(p)) {
291 	ELEAF
292 	LEAF
293 		info(p) = poscnt;
294 		poscnt++;
295 		break;
296 	UNARY
297 		penter(left(p));
298 		parent(left(p)) = p;
299 		break;
300 	case CAT:
301 	case OR:
302 		penter(left(p));
303 		penter(right(p));
304 		parent(left(p)) = p;
305 		parent(right(p)) = p;
306 		break;
307 	case ZERO:
308 		break;
309 	default:	/* can't happen */
310 		FATAL("can't happen: unknown type %d in penter", type(p));
311 		break;
312 	}
313 }
314 
315 void freetr(Node *p)	/* free parse tree */
316 {
317 	switch (type(p)) {
318 	ELEAF
319 	LEAF
320 		xfree(p);
321 		break;
322 	UNARY
323 	case ZERO:
324 		freetr(left(p));
325 		xfree(p);
326 		break;
327 	case CAT:
328 	case OR:
329 		freetr(left(p));
330 		freetr(right(p));
331 		xfree(p);
332 		break;
333 	default:	/* can't happen */
334 		FATAL("can't happen: unknown type %d in freetr", type(p));
335 		break;
336 	}
337 }
338 
339 /* in the parsing of regular expressions, metacharacters like . have */
340 /* to be seen literally;  \056 is not a metacharacter. */
341 
342 int hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
343 {			/* only pick up one 8-bit byte (2 chars) */
344 	const uschar *p;
345 	int n = 0;
346 	int i;
347 
348 	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
349 		if (isdigit((int) *p))
350 			n = 16 * n + *p - '0';
351 		else if (*p >= 'a' && *p <= 'f')
352 			n = 16 * n + *p - 'a' + 10;
353 		else if (*p >= 'A' && *p <= 'F')
354 			n = 16 * n + *p - 'A' + 10;
355 	}
356 	*pp = p;
357 	return n;
358 }
359 
360 
361 
362 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
363 
364 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
365 			/* and increment *pp */
366 {
367 	const uschar *p = *pp;
368 	int c;
369 
370 /* BUG: should advance by utf-8 char even if makes no sense */
371 
372 	if ((c = *p++) == 't') {
373 		c = '\t';
374 	} else if (c == 'n') {
375 		c = '\n';
376 	} else if (c == 'f') {
377 		c = '\f';
378 	} else if (c == 'r') {
379 		c = '\r';
380 	} else if (c == 'b') {
381 		c = '\b';
382 	} else if (c == 'v') {
383 		c = '\v';
384 	} else if (c == 'a') {
385 		c = '\a';
386 	} else if (c == '\\') {
387 		c = '\\';
388 	} else if (c == 'x') {	/* 2 hex digits follow */
389 		c = hexstr(&p, 2);	/* this adds a null if number is invalid */
390 	} else if (c == 'u') {	/* unicode char number up to 8 hex digits */
391 		c = hexstr(&p, 8);
392 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
393 		int n = c - '0';
394 		if (isoctdigit(*p)) {
395 			n = 8 * n + *p++ - '0';
396 			if (isoctdigit(*p))
397 				n = 8 * n + *p++ - '0';
398 		}
399 		c = n;
400 	} /* else */
401 		/* c = c; */
402 	*pp = p;
403 	return c;
404 }
405 
406 int *cclenter(const char *argp)	/* add a character class */
407 {
408 	int i, c, c2;
409 	int n;
410 	const uschar *p = (const uschar *) argp;
411 	int *bp, *retp;
412 	static int *buf = NULL;
413 	static int bufsz = 100;
414 
415 	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
416 		FATAL("out of space for character class [%.10s...] 1", p);
417 	bp = buf;
418 	for (i = 0; *p != 0; ) {
419 		n = u8_rune(&c, (const char *) p);
420 		p += n;
421 		if (c == '\\') {
422 			c = quoted(&p);
423 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
424 			if (*p != 0) {
425 				c = bp[-1];
426 				/* c2 = *p++; */
427 				n = u8_rune(&c2, (const char *) p);
428 				p += n;
429 				if (c2 == '\\')
430 					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
431 				if (c > c2) {	/* empty; ignore */
432 					bp--;
433 					i--;
434 					continue;
435 				}
436 				while (c < c2) {
437 					if (i >= bufsz) {
438 						bufsz *= 2;
439 						buf = (int *) realloc(buf, bufsz * sizeof(int));
440 						if (buf == NULL)
441 							FATAL("out of space for character class [%.10s...] 2", p);
442 						bp = buf + i;
443 					}
444 					*bp++ = ++c;
445 					i++;
446 				}
447 				continue;
448 			}
449 		}
450 		if (i >= bufsz) {
451 			bufsz *= 2;
452 			buf = (int *) realloc(buf, bufsz * sizeof(int));
453 			if (buf == NULL)
454 				FATAL("out of space for character class [%.10s...] 2", p);
455 			bp = buf + i;
456 		}
457 		*bp++ = c;
458 		i++;
459 	}
460 	*bp = 0;
461 	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
462 	/* xfree(op);  BUG: what are we freeing here? */
463 	retp = (int *) calloc(bp-buf+1, sizeof(int));
464 	for (i = 0; i < bp-buf+1; i++)
465 		retp[i] = buf[i];
466 	return retp;
467 }
468 
469 void overflo(const char *s)
470 {
471 	FATAL("regular expression too big: out of space in %.30s...", s);
472 }
473 
474 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
475 {
476 	int i;
477 	int *p;
478 
479 	switch (type(v)) {
480 	ELEAF
481 	LEAF
482 		f->re[info(v)].ltype = type(v);
483 		f->re[info(v)].lval.np = right(v);
484 		while (f->accept >= maxsetvec) {	/* guessing here! */
485 			resizesetvec(__func__);
486 		}
487 		for (i = 0; i <= f->accept; i++)
488 			setvec[i] = 0;
489 		setcnt = 0;
490 		follow(v);	/* computes setvec and setcnt */
491 		p = intalloc(setcnt + 1, __func__);
492 		f->re[info(v)].lfollow = p;
493 		*p = setcnt;
494 		for (i = f->accept; i >= 0; i--)
495 			if (setvec[i] == 1)
496 				*++p = i;
497 		break;
498 	UNARY
499 		cfoll(f,left(v));
500 		break;
501 	case CAT:
502 	case OR:
503 		cfoll(f,left(v));
504 		cfoll(f,right(v));
505 		break;
506 	case ZERO:
507 		break;
508 	default:	/* can't happen */
509 		FATAL("can't happen: unknown type %d in cfoll", type(v));
510 	}
511 }
512 
513 int first(Node *p)	/* collects initially active leaves of p into setvec */
514 			/* returns 0 if p matches empty string */
515 {
516 	int b, lp;
517 
518 	switch (type(p)) {
519 	ELEAF
520 	LEAF
521 		lp = info(p);	/* look for high-water mark of subscripts */
522 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
523 			resizesetvec(__func__);
524 		}
525 		if (type(p) == EMPTYRE) {
526 			setvec[lp] = 0;
527 			return(0);
528 		}
529 		if (setvec[lp] != 1) {
530 			setvec[lp] = 1;
531 			setcnt++;
532 		}
533 		if (type(p) == CCL && (*(int *) right(p)) == 0)
534 			return(0);		/* empty CCL */
535 		return(1);
536 	case PLUS:
537 		if (first(left(p)) == 0)
538 			return(0);
539 		return(1);
540 	case STAR:
541 	case QUEST:
542 		first(left(p));
543 		return(0);
544 	case CAT:
545 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
546 		return(1);
547 	case OR:
548 		b = first(right(p));
549 		if (first(left(p)) == 0 || b == 0) return(0);
550 		return(1);
551 	case ZERO:
552 		return 0;
553 	}
554 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
555 	return(-1);
556 }
557 
558 void follow(Node *v)	/* collects leaves that can follow v into setvec */
559 {
560 	Node *p;
561 
562 	if (type(v) == FINAL)
563 		return;
564 	p = parent(v);
565 	switch (type(p)) {
566 	case STAR:
567 	case PLUS:
568 		first(v);
569 		follow(p);
570 		return;
571 
572 	case OR:
573 	case QUEST:
574 		follow(p);
575 		return;
576 
577 	case CAT:
578 		if (v == left(p)) {	/* v is left child of p */
579 			if (first(right(p)) == 0) {
580 				follow(p);
581 				return;
582 			}
583 		} else		/* v is right child */
584 			follow(p);
585 		return;
586 	}
587 }
588 
589 int member(int c, int *sarg)	/* is c in s? */
590 {
591 	int *s = (int *) sarg;
592 
593 	while (*s)
594 		if (c == *s++)
595 			return(1);
596 	return(0);
597 }
598 
599 static void resize_gototab(fa *f, int state)
600 {
601 	size_t new_size = f->gototab[state].allocated * 2;
602 	gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
603 	if (p == NULL)
604 		overflo(__func__);
605 
606 	// need to initialized the new memory to zero
607 	size_t orig_size = f->gototab[state].allocated;		// 2nd half of new mem is this size
608 	memset(p + orig_size, 0, orig_size * sizeof(gtte));	// clean it out
609 
610 	f->gototab[state].allocated = new_size;			// update gototab info
611 	f->gototab[state].entries = p;
612 }
613 
614 static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
615 {
616 	gtte key;
617 	gtte *item;
618 
619 	key.ch = ch;
620 	key.state = 0;	/* irrelevant */
621 	item = (gtte *) bsearch(& key, f->gototab[state].entries,
622 			f->gototab[state].inuse, sizeof(gtte),
623 			entry_cmp);
624 
625 	if (item == NULL)
626 		return 0;
627 	else
628 		return item->state;
629 }
630 
631 static int entry_cmp(const void *l, const void *r)
632 {
633 	const gtte *left, *right;
634 
635 	left = (const gtte *) l;
636 	right = (const gtte *) r;
637 
638 	return left->ch - right->ch;
639 }
640 
641 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
642 {
643 	if (f->gototab[state].inuse == 0) {
644 		f->gototab[state].entries[0].ch = ch;
645 		f->gototab[state].entries[0].state = val;
646 		f->gototab[state].inuse++;
647 		return val;
648 	} else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
649 		// not seen yet, insert and return
650 		gtt *tab = & f->gototab[state];
651 		if (tab->inuse + 1 >= tab->allocated)
652 			resize_gototab(f, state);
653 
654 		f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch;
655 		f->gototab[state].entries[f->gototab[state].inuse-1].state = val;
656 		f->gototab[state].inuse++;
657 		return val;
658 	} else {
659 		// maybe we have it, maybe we don't
660 		gtte key;
661 		gtte *item;
662 
663 		key.ch = ch;
664 		key.state = 0;	/* irrelevant */
665 		item = (gtte *) bsearch(& key, f->gototab[state].entries,
666 				f->gototab[state].inuse, sizeof(gtte),
667 				entry_cmp);
668 
669 		if (item != NULL) {
670 			// we have it, update state and return
671 			item->state = val;
672 			return item->state;
673 		}
674 		// otherwise, fall through to insert and reallocate.
675 	}
676 
677 	gtt *tab = & f->gototab[state];
678 	if (tab->inuse + 1 >= tab->allocated)
679 		resize_gototab(f, state);
680 	++tab->inuse;
681 	f->gototab[state].entries[tab->inuse].ch = ch;
682 	f->gototab[state].entries[tab->inuse].state = val;
683 
684 	qsort(f->gototab[state].entries,
685 		f->gototab[state].inuse, sizeof(gtte), entry_cmp);
686 
687 	return val; /* not used anywhere at the moment */
688 }
689 
690 static void clear_gototab(fa *f, int state)
691 {
692 	memset(f->gototab[state].entries, 0,
693 		f->gototab[state].allocated * sizeof(gtte));
694 	f->gototab[state].inuse = 0;
695 }
696 
697 int match(fa *f, const char *p0)	/* shortest match ? */
698 {
699 	int s, ns;
700 	int n;
701 	int rune;
702 	const uschar *p = (const uschar *) p0;
703 
704 	/* return pmatch(f, p0); does it matter whether longest or shortest? */
705 
706 	s = f->initstat;
707 	assert (s < f->state_count);
708 
709 	if (f->out[s])
710 		return(1);
711 	do {
712 		/* assert(*p < NCHARS); */
713 		n = u8_rune(&rune, (const char *) p);
714 		if ((ns = get_gototab(f, s, rune)) != 0)
715 			s = ns;
716 		else
717 			s = cgoto(f, s, rune);
718 		if (f->out[s])
719 			return(1);
720 		if (*p == 0)
721 			break;
722 		p += n;
723 	} while (1);  /* was *p++ != 0 */
724 	return(0);
725 }
726 
727 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
728 {
729 	int s, ns;
730 	int n;
731 	int rune;
732 	const uschar *p = (const uschar *) p0;
733 	const uschar *q;
734 
735 	s = f->initstat;
736 	assert(s < f->state_count);
737 
738 	patbeg = (const char *)p;
739 	patlen = -1;
740 	do {
741 		q = p;
742 		do {
743 			if (f->out[s])		/* final state */
744 				patlen = q-p;
745 			/* assert(*q < NCHARS); */
746 			n = u8_rune(&rune, (const char *) q);
747 			if ((ns = get_gototab(f, s, rune)) != 0)
748 				s = ns;
749 			else
750 				s = cgoto(f, s, rune);
751 
752 			assert(s < f->state_count);
753 
754 			if (s == 1) {	/* no transition */
755 				if (patlen >= 0) {
756 					patbeg = (const char *) p;
757 					return(1);
758 				}
759 				else
760 					goto nextin;	/* no match */
761 			}
762 			if (*q == 0)
763 				break;
764 			q += n;
765 		} while (1);
766 		q++;  /* was *q++ */
767 		if (f->out[s])
768 			patlen = q-p-1;	/* don't count $ */
769 		if (patlen >= 0) {
770 			patbeg = (const char *) p;
771 			return(1);
772 		}
773 	nextin:
774 		s = 2;
775 		if (*p == 0)
776 			break;
777 		n = u8_rune(&rune, (const char *) p);
778 		p += n;
779 	} while (1); /* was *p++ */
780 	return (0);
781 }
782 
783 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
784 {
785 	int s, ns;
786         int n;
787         int rune;
788 	const uschar *p = (const uschar *) p0;
789 	const uschar *q;
790 
791 	s = f->initstat;
792 	assert(s < f->state_count);
793 
794 	patbeg = (const char *)p;
795 	patlen = -1;
796 	while (*p) {
797 		q = p;
798 		do {
799 			if (f->out[s])		/* final state */
800 				patlen = q-p;
801 			/* assert(*q < NCHARS); */
802 			n = u8_rune(&rune, (const char *) q);
803 			if ((ns = get_gototab(f, s, rune)) != 0)
804 				s = ns;
805 			else
806 				s = cgoto(f, s, rune);
807 			if (s == 1) {	/* no transition */
808 				if (patlen > 0) {
809 					patbeg = (const char *) p;
810 					return(1);
811 				} else
812 					goto nnextin;	/* no nonempty match */
813 			}
814 			if (*q == 0)
815 				break;
816 			q += n;
817 		} while (1);
818 		q++;
819 		if (f->out[s])
820 			patlen = q-p-1;	/* don't count $ */
821 		if (patlen > 0 ) {
822 			patbeg = (const char *) p;
823 			return(1);
824 		}
825 	nnextin:
826 		s = 2;
827 		p++;
828 	}
829 	return (0);
830 }
831 
832 
833 #define MAX_UTF_BYTES	4	// UTF-8 is up to 4 bytes long
834 
835 /*
836  * NAME
837  *     fnematch
838  *
839  * DESCRIPTION
840  *     A stream-fed version of nematch which transfers characters to a
841  *     null-terminated buffer. All characters up to and including the last
842  *     character of the matching text or EOF are placed in the buffer. If
843  *     a match is found, patbeg and patlen are set appropriately.
844  *
845  * RETURN VALUES
846  *     false    No match found.
847  *     true     Match found.
848  */
849 
850 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
851 {
852 	char *i, *j, *k, *buf = *pbuf;
853 	int bufsize = *pbufsize;
854 	int c, n, ns, s;
855 
856 	s = pfa->initstat;
857 	patlen = 0;
858 
859 	/*
860 	 * buf <= i <= j <= k <= buf+bufsize
861 	 *
862 	 * i: origin of active substring
863 	 * j: current character
864 	 * k: destination of the next getc
865 	 */
866 
867 	i = j = k = buf;
868 
869 	do {
870 		/*
871 		 * Call u8_rune with at least MAX_UTF_BYTES ahead in
872 		 * the buffer until EOF interferes.
873 		 */
874 		if (k - j < MAX_UTF_BYTES) {
875 			if (k + MAX_UTF_BYTES > buf + bufsize) {
876 				adjbuf((char **) &buf, &bufsize,
877 				    bufsize + MAX_UTF_BYTES,
878 				    quantum, 0, "fnematch");
879 			}
880 			for (n = MAX_UTF_BYTES ; n > 0; n--) {
881 				*k++ = (c = getc(f)) != EOF ? c : 0;
882 				if (c == EOF) {
883 					if (ferror(f))
884 						FATAL("fnematch: getc error");
885 					break;
886 				}
887 			}
888 		}
889 
890 		j += u8_rune(&c, j);
891 
892 		if ((ns = get_gototab(pfa, s, c)) != 0)
893 			s = ns;
894 		else
895 			s = cgoto(pfa, s, c);
896 
897 		if (pfa->out[s]) {	/* final state */
898 			patbeg = i;
899 			patlen = j - i;
900 			if (c == 0)	/* don't count $ */
901 				patlen--;
902 		}
903 
904 		if (c && s != 1)
905 			continue;  /* origin i still viable, next j */
906 		if (patlen)
907 			break;     /* best match found */
908 
909 		/* no match at origin i, next i and start over */
910 		i += u8_rune(&c, i);
911 		if (c == 0)
912 			break;    /* no match */
913 		j = i;
914 		s = 2;
915 	} while (1);
916 
917 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
918 	*pbuf = buf;
919 	*pbufsize = bufsize;
920 
921 	if (patlen) {
922 		/*
923 		 * Under no circumstances is the last character fed to
924 		 * the automaton part of the match. It is EOF's nullbyte,
925 		 * or it sent the automaton into a state with no further
926 		 * transitions available (s==1), or both. Room for a
927 		 * terminating nullbyte is guaranteed.
928 		 *
929 		 * ungetc any chars after the end of matching text
930 		 * (except for EOF's nullbyte, if present) and null
931 		 * terminate the buffer.
932 		 */
933 		do
934 			if (*--k && ungetc(*k, f) == EOF)
935 				FATAL("unable to ungetc '%c'", *k);
936 		while (k > patbeg + patlen);
937 		*k = '\0';
938 		return true;
939 	}
940 	else
941 		return false;
942 }
943 
944 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
945 {			/* uses relex() to scan regular expression */
946 	Node *np;
947 
948 	DPRINTF("reparse <%s>\n", p);
949 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
950 	rtok = relex();
951 	/* GNU compatibility: an empty regexp matches anything */
952 	if (rtok == '\0') {
953 		/* FATAL("empty regular expression"); previous */
954 		return(op2(EMPTYRE, NIL, NIL));
955 	}
956 	np = regexp();
957 	if (rtok != '\0')
958 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
959 	return(np);
960 }
961 
962 Node *regexp(void)	/* top-level parse of reg expr */
963 {
964 	return (alt(concat(primary())));
965 }
966 
967 Node *primary(void)
968 {
969 	Node *np;
970 	int savelastatom;
971 
972 	switch (rtok) {
973 	case CHAR:
974 		lastatom = starttok;
975 		np = op2(CHAR, NIL, itonp(rlxval));
976 		rtok = relex();
977 		return (unary(np));
978 	case ALL:
979 		rtok = relex();
980 		return (unary(op2(ALL, NIL, NIL)));
981 	case EMPTYRE:
982 		rtok = relex();
983 		return (unary(op2(EMPTYRE, NIL, NIL)));
984 	case DOT:
985 		lastatom = starttok;
986 		rtok = relex();
987 		return (unary(op2(DOT, NIL, NIL)));
988 	case CCL:
989 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
990 		lastatom = starttok;
991 		rtok = relex();
992 		return (unary(np));
993 	case NCCL:
994 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
995 		lastatom = starttok;
996 		rtok = relex();
997 		return (unary(np));
998 	case '^':
999 		rtok = relex();
1000 		return (unary(op2(CHAR, NIL, itonp(HAT))));
1001 	case '$':
1002 		rtok = relex();
1003 		return (unary(op2(CHAR, NIL, NIL)));
1004 	case '(':
1005 		lastatom = starttok;
1006 		savelastatom = starttok - basestr; /* Retain over recursion */
1007 		rtok = relex();
1008 		if (rtok == ')') {	/* special pleading for () */
1009 			rtok = relex();
1010 			return unary(op2(CCL, NIL, (Node *) cclenter("")));
1011 		}
1012 		np = regexp();
1013 		if (rtok == ')') {
1014 			lastatom = basestr + savelastatom; /* Restore */
1015 			rtok = relex();
1016 			return (unary(np));
1017 		}
1018 		else
1019 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1020 	default:
1021 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1022 	}
1023 	return 0;	/*NOTREACHED*/
1024 }
1025 
1026 Node *concat(Node *np)
1027 {
1028 	switch (rtok) {
1029 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1030 		return (concat(op2(CAT, np, primary())));
1031 	case EMPTYRE:
1032 		rtok = relex();
1033 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1034 				primary())));
1035 	}
1036 	return (np);
1037 }
1038 
1039 Node *alt(Node *np)
1040 {
1041 	if (rtok == OR) {
1042 		rtok = relex();
1043 		return (alt(op2(OR, np, concat(primary()))));
1044 	}
1045 	return (np);
1046 }
1047 
1048 Node *unary(Node *np)
1049 {
1050 	switch (rtok) {
1051 	case STAR:
1052 		rtok = relex();
1053 		return (unary(op2(STAR, np, NIL)));
1054 	case PLUS:
1055 		rtok = relex();
1056 		return (unary(op2(PLUS, np, NIL)));
1057 	case QUEST:
1058 		rtok = relex();
1059 		return (unary(op2(QUEST, np, NIL)));
1060 	case ZERO:
1061 		rtok = relex();
1062 		return (unary(op2(ZERO, np, NIL)));
1063 	default:
1064 		return (np);
1065 	}
1066 }
1067 
1068 /*
1069  * Character class definitions conformant to the POSIX locale as
1070  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1071  * and operating character sets are both ASCII (ISO646) or supersets
1072  * thereof.
1073  *
1074  * Note that to avoid overflowing the temporary buffer used in
1075  * relex(), the expanded character class (prior to range expansion)
1076  * must be less than twice the size of their full name.
1077  */
1078 
1079 /* Because isblank doesn't show up in any of the header files on any
1080  * system i use, it's defined here.  if some other locale has a richer
1081  * definition of "blank", define HAS_ISBLANK and provide your own
1082  * version.
1083  * the parentheses here are an attempt to find a path through the maze
1084  * of macro definition and/or function and/or version provided.  thanks
1085  * to nelson beebe for the suggestion; let's see if it works everywhere.
1086  */
1087 
1088 /* #define HAS_ISBLANK */
1089 #ifndef HAS_ISBLANK
1090 
1091 int (xisblank)(int c)
1092 {
1093 	return c==' ' || c=='\t';
1094 }
1095 
1096 #endif
1097 
1098 static const struct charclass {
1099 	const char *cc_name;
1100 	int cc_namelen;
1101 	int (*cc_func)(int);
1102 } charclasses[] = {
1103 	{ "alnum",	5,	isalnum },
1104 	{ "alpha",	5,	isalpha },
1105 #ifndef HAS_ISBLANK
1106 	{ "blank",	5,	xisblank },
1107 #else
1108 	{ "blank",	5,	isblank },
1109 #endif
1110 	{ "cntrl",	5,	iscntrl },
1111 	{ "digit",	5,	isdigit },
1112 	{ "graph",	5,	isgraph },
1113 	{ "lower",	5,	islower },
1114 	{ "print",	5,	isprint },
1115 	{ "punct",	5,	ispunct },
1116 	{ "space",	5,	isspace },
1117 	{ "upper",	5,	isupper },
1118 	{ "xdigit",	6,	isxdigit },
1119 	{ NULL,		0,	NULL },
1120 };
1121 
1122 #define REPEAT_SIMPLE		0
1123 #define REPEAT_PLUS_APPENDED	1
1124 #define REPEAT_WITH_Q		2
1125 #define REPEAT_ZERO		3
1126 
1127 static int
1128 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1129 	       int atomlen, int firstnum, int secondnum, int special_case)
1130 {
1131 	int i, j;
1132 	uschar *buf = 0;
1133 	int ret = 1;
1134 	int init_q = (firstnum == 0);		/* first added char will be ? */
1135 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
1136 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
1137 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
1138 	int size = prefix_length +  suffix_length;
1139 
1140 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
1141 		size += atomlen*(firstnum-1);
1142 	}
1143 
1144 	/* Adjust size of buffer for special cases */
1145 	if (special_case == REPEAT_PLUS_APPENDED) {
1146 		size++;		/* for the final + */
1147 	} else if (special_case == REPEAT_WITH_Q) {
1148 		size += init_q + (atomlen+1)* (n_q_reps-init_q);
1149 	} else if (special_case == REPEAT_ZERO) {
1150 		size += 2;	/* just a null ERE: () */
1151 	}
1152 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
1153 		FATAL("out of space in reg expr %.10s..", lastre);
1154 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
1155 	j = prefix_length;
1156 	if (special_case == REPEAT_ZERO) {
1157 		j -= atomlen;
1158 		buf[j++] = '(';
1159 		buf[j++] = ')';
1160 	}
1161 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
1162 		memcpy(&buf[j], atom, atomlen);
1163 		j += atomlen;
1164 	}
1165 	if (special_case == REPEAT_PLUS_APPENDED) {
1166 		buf[j++] = '+';
1167 	} else if (special_case == REPEAT_WITH_Q) {
1168 		if (init_q)
1169 			buf[j++] = '?';
1170 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
1171 			memcpy(&buf[j], atom, atomlen);
1172 			j += atomlen;
1173 			buf[j++] = '?';
1174 		}
1175 	}
1176 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
1177 	j += suffix_length;
1178 	buf[j] = '\0';
1179 	/* free old basestr */
1180 	if (firstbasestr != basestr) {
1181 		if (basestr)
1182 			xfree(basestr);
1183 	}
1184 	basestr = buf;
1185 	prestr  = buf + prefix_length;
1186 	if (special_case == REPEAT_ZERO) {
1187 		prestr  -= atomlen;
1188 		ret++;
1189 	}
1190 	return ret;
1191 }
1192 
1193 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1194 		  int atomlen, int firstnum, int secondnum)
1195 {
1196 	/*
1197 	   In general, the repetition specifier or "bound" is replaced here
1198 	   by an equivalent ERE string, repeating the immediately previous atom
1199 	   and appending ? and + as needed. Note that the first copy of the
1200 	   atom is left in place, except in the special_case of a zero-repeat
1201 	   (i.e., {0}).
1202 	 */
1203 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1204 		if (firstnum < 2) {
1205 			/* 0 or 1: should be handled before you get here */
1206 			FATAL("internal error");
1207 		} else {
1208 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1209 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1210 		}
1211 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1212 		if (firstnum == 0) {	/* {0} or {0,0} */
1213 			/* This case is unusual because the resulting
1214 			   replacement string might actually be SMALLER than
1215 			   the original ERE */
1216 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1217 					firstnum, secondnum, REPEAT_ZERO);
1218 		} else {		/* (firstnum >= 1) */
1219 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1220 					firstnum, secondnum, REPEAT_SIMPLE);
1221 		}
1222 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1223 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1224 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1225 					firstnum, secondnum, REPEAT_WITH_Q);
1226 	} else {	/* Error - shouldn't be here (n>m) */
1227 		FATAL("internal error");
1228 	}
1229 	return 0;
1230 }
1231 
1232 int relex(void)		/* lexical analyzer for reparse */
1233 {
1234 	int c, n;
1235 	int cflag;
1236 	static uschar *buf = NULL;
1237 	static int bufsz = 100;
1238 	uschar *bp;
1239 	const struct charclass *cc;
1240 	int i;
1241 	int num, m;
1242 	bool commafound, digitfound;
1243 	const uschar *startreptok;
1244 	static int parens = 0;
1245 
1246 rescan:
1247 	starttok = prestr;
1248 
1249 	if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1250 		prestr += n;
1251 		starttok = prestr;
1252 		return CHAR;
1253 	}
1254 
1255 	switch (c = *prestr++) {
1256 	case '|': return OR;
1257 	case '*': return STAR;
1258 	case '+': return PLUS;
1259 	case '?': return QUEST;
1260 	case '.': return DOT;
1261 	case '\0': prestr--; return '\0';
1262 	case '^':
1263 	case '$':
1264 		return c;
1265 	case '(':
1266 		parens++;
1267 		return c;
1268 	case ')':
1269 		if (parens) {
1270 			parens--;
1271 			return c;
1272 		}
1273 		/* unmatched close parenthesis; per POSIX, treat as literal */
1274 		rlxval = c;
1275 		return CHAR;
1276 	case '\\':
1277 		rlxval = quoted(&prestr);
1278 		return CHAR;
1279 	default:
1280 		rlxval = c;
1281 		return CHAR;
1282 	case '[':
1283 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1284 			FATAL("out of space in reg expr %.10s..", lastre);
1285 		bp = buf;
1286 		if (*prestr == '^') {
1287 			cflag = 1;
1288 			prestr++;
1289 		}
1290 		else
1291 			cflag = 0;
1292 		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1293 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1294 			FATAL("out of space for reg expr %.10s...", lastre);
1295 		for (; ; ) {
1296 			if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1297 				for (i = 0; i < n; i++)
1298 					*bp++ = *prestr++;
1299 				continue;
1300 			}
1301 			if ((c = *prestr++) == '\\') {
1302 				*bp++ = '\\';
1303 				if ((c = *prestr++) == '\0')
1304 					FATAL("nonterminated character class %.20s...", lastre);
1305 				*bp++ = c;
1306 			/* } else if (c == '\n') { */
1307 			/* 	FATAL("newline in character class %.20s...", lastre); */
1308 			} else if (c == '[' && *prestr == ':') {
1309 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1310 				for (cc = charclasses; cc->cc_name; cc++)
1311 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1312 						break;
1313 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1314 				    prestr[2 + cc->cc_namelen] == ']') {
1315 					prestr += cc->cc_namelen + 3;
1316 					/*
1317 					 * BUG: We begin at 1, instead of 0, since we
1318 					 * would otherwise prematurely terminate the
1319 					 * string for classes like [[:cntrl:]]. This
1320 					 * means that we can't match the NUL character,
1321 					 * not without first adapting the entire
1322 					 * program to track each string's length.
1323 					 */
1324 					for (i = 1; i <= UCHAR_MAX; i++) {
1325 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1326 						    FATAL("out of space for reg expr %.10s...", lastre);
1327 						if (cc->cc_func(i)) {
1328 							/* escape backslash */
1329 							if (i == '\\') {
1330 								*bp++ = '\\';
1331 								n++;
1332 							}
1333 
1334 							*bp++ = i;
1335 							n++;
1336 						}
1337 					}
1338 				} else
1339 					*bp++ = c;
1340 			} else if (c == '[' && *prestr == '.') {
1341 				char collate_char;
1342 				prestr++;
1343 				collate_char = *prestr++;
1344 				if (*prestr == '.' && prestr[1] == ']') {
1345 					prestr += 2;
1346 					/* Found it: map via locale TBD: for
1347 					   now, simply return this char.  This
1348 					   is sufficient to pass conformance
1349 					   test awk.ex 156
1350 					 */
1351 					if (*prestr == ']') {
1352 						prestr++;
1353 						rlxval = collate_char;
1354 						return CHAR;
1355 					}
1356 				}
1357 			} else if (c == '[' && *prestr == '=') {
1358 				char equiv_char;
1359 				prestr++;
1360 				equiv_char = *prestr++;
1361 				if (*prestr == '=' && prestr[1] == ']') {
1362 					prestr += 2;
1363 					/* Found it: map via locale TBD: for now
1364 					   simply return this char. This is
1365 					   sufficient to pass conformance test
1366 					   awk.ex 156
1367 					 */
1368 					if (*prestr == ']') {
1369 						prestr++;
1370 						rlxval = equiv_char;
1371 						return CHAR;
1372 					}
1373 				}
1374 			} else if (c == '\0') {
1375 				FATAL("nonterminated character class %.20s", lastre);
1376 			} else if (bp == buf) {	/* 1st char is special */
1377 				*bp++ = c;
1378 			} else if (c == ']') {
1379 				*bp++ = 0;
1380 				rlxstr = (uschar *) tostring((char *) buf);
1381 				if (cflag == 0)
1382 					return CCL;
1383 				else
1384 					return NCCL;
1385 			} else
1386 				*bp++ = c;
1387 		}
1388 		break;
1389 	case '{':
1390 		if (isdigit((int) *(prestr))) {
1391 			num = 0;	/* Process as a repetition */
1392 			n = -1; m = -1;
1393 			commafound = false;
1394 			digitfound = false;
1395 			startreptok = prestr-1;
1396 			/* Remember start of previous atom here ? */
1397 		} else {        	/* just a { char, not a repetition */
1398 			rlxval = c;
1399 			return CHAR;
1400                 }
1401 		for (; ; ) {
1402 			if ((c = *prestr++) == '}') {
1403 				if (commafound) {
1404 					if (digitfound) { /* {n,m} */
1405 						m = num;
1406 						if (m < n)
1407 							FATAL("illegal repetition expression: class %.20s",
1408 								lastre);
1409 						if (n == 0 && m == 1) {
1410 							return QUEST;
1411 						}
1412 					} else {	/* {n,} */
1413 						if (n == 0)
1414 							return STAR;
1415 						else if (n == 1)
1416 							return PLUS;
1417 					}
1418 				} else {
1419 					if (digitfound) { /* {n} same as {n,n} */
1420 						n = num;
1421 						m = num;
1422 					} else {	/* {} */
1423 						FATAL("illegal repetition expression: class %.20s",
1424 							lastre);
1425 					}
1426 				}
1427 				if (repeat(starttok, prestr-starttok, lastatom,
1428 					   startreptok - lastatom, n, m) > 0) {
1429 					if (n == 0 && m == 0) {
1430 						return ZERO;
1431 					}
1432 					/* must rescan input for next token */
1433 					goto rescan;
1434 				}
1435 				/* Failed to replace: eat up {...} characters
1436 				   and treat like just PLUS */
1437 				return PLUS;
1438 			} else if (c == '\0') {
1439 				FATAL("nonterminated character class %.20s",
1440 					lastre);
1441 			} else if (isdigit(c)) {
1442 				num = 10 * num + c - '0';
1443 				digitfound = true;
1444 			} else if (c == ',') {
1445 				if (commafound)
1446 					FATAL("illegal repetition expression: class %.20s",
1447 						lastre);
1448 				/* looking for {n,} or {n,m} */
1449 				commafound = true;
1450 				n = num;
1451 				digitfound = false; /* reset */
1452 				num = 0;
1453 			} else {
1454 				FATAL("illegal repetition expression: class %.20s",
1455 					lastre);
1456 			}
1457 		}
1458 		break;
1459 	}
1460 }
1461 
1462 int cgoto(fa *f, int s, int c)
1463 {
1464 	int *p, *q;
1465 	int i, j, k;
1466 
1467 	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
1468 	while (f->accept >= maxsetvec) {	/* guessing here! */
1469 		resizesetvec(__func__);
1470 	}
1471 	for (i = 0; i <= f->accept; i++)
1472 		setvec[i] = 0;
1473 	setcnt = 0;
1474 	resize_state(f, s);
1475 	/* compute positions of gototab[s,c] into setvec */
1476 	p = f->posns[s];
1477 	for (i = 1; i <= *p; i++) {
1478 		if ((k = f->re[p[i]].ltype) != FINAL) {
1479 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1480 			 || (k == DOT && c != 0 && c != HAT)
1481 			 || (k == ALL && c != 0)
1482 			 || (k == EMPTYRE && c != 0)
1483 			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1484 			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1485 				q = f->re[p[i]].lfollow;
1486 				for (j = 1; j <= *q; j++) {
1487 					if (q[j] >= maxsetvec) {
1488 						resizesetvec(__func__);
1489 					}
1490 					if (setvec[q[j]] == 0) {
1491 						setcnt++;
1492 						setvec[q[j]] = 1;
1493 					}
1494 				}
1495 			}
1496 		}
1497 	}
1498 	/* determine if setvec is a previous state */
1499 	tmpset[0] = setcnt;
1500 	j = 1;
1501 	for (i = f->accept; i >= 0; i--)
1502 		if (setvec[i]) {
1503 			tmpset[j++] = i;
1504 		}
1505 	resize_state(f, f->curstat > s ? f->curstat : s);
1506 	/* tmpset == previous state? */
1507 	for (i = 1; i <= f->curstat; i++) {
1508 		p = f->posns[i];
1509 		if ((k = tmpset[0]) != p[0])
1510 			goto different;
1511 		for (j = 1; j <= k; j++)
1512 			if (tmpset[j] != p[j])
1513 				goto different;
1514 		/* setvec is state i */
1515 		if (c != HAT)
1516 			set_gototab(f, s, c, i);
1517 		return i;
1518 	  different:;
1519 	}
1520 
1521 	/* add tmpset to current set of states */
1522 	++(f->curstat);
1523 	resize_state(f, f->curstat);
1524 	clear_gototab(f, f->curstat);
1525 	xfree(f->posns[f->curstat]);
1526 	p = intalloc(setcnt + 1, __func__);
1527 
1528 	f->posns[f->curstat] = p;
1529 	if (c != HAT)
1530 		set_gototab(f, s, c, f->curstat);
1531 	for (i = 0; i <= setcnt; i++)
1532 		p[i] = tmpset[i];
1533 	if (setvec[f->accept])
1534 		f->out[f->curstat] = 1;
1535 	else
1536 		f->out[f->curstat] = 0;
1537 	return f->curstat;
1538 }
1539 
1540 
1541 void freefa(fa *f)	/* free a finite automaton */
1542 {
1543 	int i;
1544 
1545 	if (f == NULL)
1546 		return;
1547 	for (i = 0; i < f->state_count; i++)
1548 		xfree(f->gototab[i].entries);
1549 	xfree(f->gototab);
1550 	for (i = 0; i <= f->curstat; i++)
1551 		xfree(f->posns[i]);
1552 	for (i = 0; i <= f->accept; i++) {
1553 		xfree(f->re[i].lfollow);
1554 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1555 			xfree(f->re[i].lval.np);
1556 	}
1557 	xfree(f->restr);
1558 	xfree(f->out);
1559 	xfree(f->posns);
1560 	xfree(f->gototab);
1561 	xfree(f);
1562 }
1563