xref: /freebsd/contrib/one-true-awk/b.c (revision d316de24faa7453118a90fb0e9839e8026e36a4e)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #define	DEBUG
28 
29 #include <ctype.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include "awk.h"
35 #include "awkgram.tab.h"
36 
37 #define MAXLIN 22
38 
39 #define type(v)		(v)->nobj	/* badly overloaded here */
40 #define info(v)		(v)->ntype	/* badly overloaded here */
41 #define left(v)		(v)->narg[0]
42 #define right(v)	(v)->narg[1]
43 #define parent(v)	(v)->nnext
44 
45 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
47 #define UNARY	case STAR: case PLUS: case QUEST:
48 
49 /* encoding in tree Nodes:
50 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 		left is index, right contains value or pointer to value
52 	unary (STAR, PLUS, QUEST): left is child, right is null
53 	binary (CAT, OR): left and right are children
54 	parent contains pointer to parent
55 */
56 
57 
58 int	*setvec;
59 int	*tmpset;
60 int	maxsetvec = 0;
61 
62 int	rtok;		/* next token in current re */
63 int	rlxval;
64 static const uschar	*rlxstr;
65 static const uschar	*prestr;	/* current position in current re */
66 static const uschar	*lastre;	/* origin of last re */
67 static const uschar	*lastatom;	/* origin of last Atom */
68 static const uschar	*starttok;
69 static const uschar 	*basestr;	/* starts with original, replaced during
70 				   repetition processing */
71 static const uschar 	*firstbasestr;
72 
73 static	int setcnt;
74 static	int poscnt;
75 
76 const char	*patbeg;
77 int	patlen;
78 
79 #define	NFA	128	/* cache this many dynamic fa's */
80 fa	*fatab[NFA];
81 int	nfatab	= 0;	/* entries in fatab */
82 
83 extern int u8_nextlen(const char *s);
84 
85 
86 /* utf-8 mechanism:
87 
88    For most of Awk, utf-8 strings just "work", since they look like
89    null-terminated sequences of 8-bit bytes.
90 
91    Functions like length(), index(), and substr() have to operate
92    in units of utf-8 characters.  The u8_* functions in run.c
93    handle this.
94 
95    Regular expressions are more complicated, since the basic
96    mechanism of the goto table used 8-bit byte indices into the
97    gototab entries to compute the next state.  Unicode is a lot
98    bigger, so the gototab entries are now structs with a character
99    and a next state. These are sorted by code point and binary
100    searched.
101 
102    Throughout the RE mechanism in b.c, utf-8 characters are
103    converted to their utf-32 value.  This mostly shows up in
104    cclenter, which expands character class ranges like a-z and now
105    alpha-omega.  The size of a gototab array is still about 256.
106    This should be dynamic, but for now things work ok for a single
107    code page of Unicode, which is the most likely case.
108 
109    The code changes are localized in run.c and b.c.  I have added a
110    handful of functions to somewhat better hide the implementation,
111    but a lot more could be done.
112 
113  */
114 
115 static int entry_cmp(const void *l, const void *r);
116 static int get_gototab(fa*, int, int);
117 static int set_gototab(fa*, int, int, int);
118 static void clear_gototab(fa*, int);
119 extern int u8_rune(int *, const char *);
120 
121 static int *
122 intalloc(size_t n, const char *f)
123 {
124 	int *p = (int *) calloc(n, sizeof(int));
125 	if (p == NULL)
126 		overflo(f);
127 	return p;
128 }
129 
130 static void
131 resizesetvec(const char *f)
132 {
133 	if (maxsetvec == 0)
134 		maxsetvec = MAXLIN;
135 	else
136 		maxsetvec *= 4;
137 	setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
138 	tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
139 	if (setvec == NULL || tmpset == NULL)
140 		overflo(f);
141 }
142 
143 static void
144 resize_state(fa *f, int state)
145 {
146 	gtt *p;
147 	uschar *p2;
148 	int **p3;
149 	int i, new_count;
150 
151 	if (++state < f->state_count)
152 		return;
153 
154 	new_count = state + 10; /* needs to be tuned */
155 
156 	p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
157 	if (p == NULL)
158 		goto out;
159 	f->gototab = p;
160 
161 	p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
162 	if (p2 == NULL)
163 		goto out;
164 	f->out = p2;
165 
166 	p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
167 	if (p3 == NULL)
168 		goto out;
169 	f->posns = p3;
170 
171 	for (i = f->state_count; i < new_count; ++i) {
172 		f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
173 		if (f->gototab[i].entries == NULL)
174 			goto out;
175 		f->gototab[i].allocated = NCHARS;
176 		f->gototab[i].inuse = 0;
177 		f->out[i] = 0;
178 		f->posns[i] = NULL;
179 	}
180 	f->state_count = new_count;
181 	return;
182 out:
183 	overflo(__func__);
184 }
185 
186 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
187 {
188 	int i, use, nuse;
189 	fa *pfa;
190 	static int now = 1;
191 
192 	if (setvec == NULL) {	/* first time through any RE */
193 		resizesetvec(__func__);
194 	}
195 
196 	if (compile_time != RUNNING)	/* a constant for sure */
197 		return mkdfa(s, anchor);
198 	for (i = 0; i < nfatab; i++)	/* is it there already? */
199 		if (fatab[i]->anchor == anchor
200 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
201 			fatab[i]->use = now++;
202 			return fatab[i];
203 		}
204 	pfa = mkdfa(s, anchor);
205 	if (nfatab < NFA) {	/* room for another */
206 		fatab[nfatab] = pfa;
207 		fatab[nfatab]->use = now++;
208 		nfatab++;
209 		return pfa;
210 	}
211 	use = fatab[0]->use;	/* replace least-recently used */
212 	nuse = 0;
213 	for (i = 1; i < nfatab; i++)
214 		if (fatab[i]->use < use) {
215 			use = fatab[i]->use;
216 			nuse = i;
217 		}
218 	freefa(fatab[nuse]);
219 	fatab[nuse] = pfa;
220 	pfa->use = now++;
221 	return pfa;
222 }
223 
224 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
225 				/* anchor = true for anchored matches, else false */
226 {
227 	Node *p, *p1;
228 	fa *f;
229 
230 	firstbasestr = (const uschar *) s;
231 	basestr = firstbasestr;
232 	p = reparse(s);
233 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
234 		/* put ALL STAR in front of reg.  exp. */
235 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
236 		/* put FINAL after reg.  exp. */
237 
238 	poscnt = 0;
239 	penter(p1);	/* enter parent pointers and leaf indices */
240 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
241 		overflo(__func__);
242 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
243 	cfoll(f, p1);	/* set up follow sets */
244 	freetr(p1);
245 	resize_state(f, 1);
246 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
247 	f->posns[1] = intalloc(1, __func__);
248 	*f->posns[1] = 0;
249 	f->initstat = makeinit(f, anchor);
250 	f->anchor = anchor;
251 	f->restr = (uschar *) tostring(s);
252 	if (firstbasestr != basestr) {
253 		if (basestr)
254 			xfree(basestr);
255 	}
256 	return f;
257 }
258 
259 int makeinit(fa *f, bool anchor)
260 {
261 	int i, k;
262 
263 	f->curstat = 2;
264 	f->out[2] = 0;
265 	k = *(f->re[0].lfollow);
266 	xfree(f->posns[2]);
267 	f->posns[2] = intalloc(k + 1,  __func__);
268 	for (i = 0; i <= k; i++) {
269 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
270 	}
271 	if ((f->posns[2])[1] == f->accept)
272 		f->out[2] = 1;
273 	clear_gototab(f, 2);
274 	f->curstat = cgoto(f, 2, HAT);
275 	if (anchor) {
276 		*f->posns[2] = k-1;	/* leave out position 0 */
277 		for (i = 0; i < k; i++) {
278 			(f->posns[0])[i] = (f->posns[2])[i];
279 		}
280 
281 		f->out[0] = f->out[2];
282 		if (f->curstat != 2)
283 			--(*f->posns[f->curstat]);
284 	}
285 	return f->curstat;
286 }
287 
288 void penter(Node *p)	/* set up parent pointers and leaf indices */
289 {
290 	switch (type(p)) {
291 	ELEAF
292 	LEAF
293 		info(p) = poscnt;
294 		poscnt++;
295 		break;
296 	UNARY
297 		penter(left(p));
298 		parent(left(p)) = p;
299 		break;
300 	case CAT:
301 	case OR:
302 		penter(left(p));
303 		penter(right(p));
304 		parent(left(p)) = p;
305 		parent(right(p)) = p;
306 		break;
307 	case ZERO:
308 		break;
309 	default:	/* can't happen */
310 		FATAL("can't happen: unknown type %d in penter", type(p));
311 		break;
312 	}
313 }
314 
315 void freetr(Node *p)	/* free parse tree */
316 {
317 	switch (type(p)) {
318 	ELEAF
319 	LEAF
320 		xfree(p);
321 		break;
322 	UNARY
323 	case ZERO:
324 		freetr(left(p));
325 		xfree(p);
326 		break;
327 	case CAT:
328 	case OR:
329 		freetr(left(p));
330 		freetr(right(p));
331 		xfree(p);
332 		break;
333 	default:	/* can't happen */
334 		FATAL("can't happen: unknown type %d in freetr", type(p));
335 		break;
336 	}
337 }
338 
339 /* in the parsing of regular expressions, metacharacters like . have */
340 /* to be seen literally;  \056 is not a metacharacter. */
341 
342 int hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
343 {			/* only pick up one 8-bit byte (2 chars) */
344 	const uschar *p;
345 	int n = 0;
346 	int i;
347 
348 	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
349 		if (isdigit((int) *p))
350 			n = 16 * n + *p - '0';
351 		else if (*p >= 'a' && *p <= 'f')
352 			n = 16 * n + *p - 'a' + 10;
353 		else if (*p >= 'A' && *p <= 'F')
354 			n = 16 * n + *p - 'A' + 10;
355 	}
356 	*pp = p;
357 	return n;
358 }
359 
360 
361 
362 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
363 
364 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
365 			/* and increment *pp */
366 {
367 	const uschar *p = *pp;
368 	int c;
369 
370 /* BUG: should advance by utf-8 char even if makes no sense */
371 
372 	if ((c = *p++) == 't') {
373 		c = '\t';
374 	} else if (c == 'n') {
375 		c = '\n';
376 	} else if (c == 'f') {
377 		c = '\f';
378 	} else if (c == 'r') {
379 		c = '\r';
380 	} else if (c == 'b') {
381 		c = '\b';
382 	} else if (c == 'v') {
383 		c = '\v';
384 	} else if (c == 'a') {
385 		c = '\a';
386 	} else if (c == '\\') {
387 		c = '\\';
388 	} else if (c == 'x') {	/* 2 hex digits follow */
389 		c = hexstr(&p, 2);	/* this adds a null if number is invalid */
390 	} else if (c == 'u') {	/* unicode char number up to 8 hex digits */
391 		c = hexstr(&p, 8);
392 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
393 		int n = c - '0';
394 		if (isoctdigit(*p)) {
395 			n = 8 * n + *p++ - '0';
396 			if (isoctdigit(*p))
397 				n = 8 * n + *p++ - '0';
398 		}
399 		c = n;
400 	} /* else */
401 		/* c = c; */
402 	*pp = p;
403 	return c;
404 }
405 
406 int *cclenter(const char *argp)	/* add a character class */
407 {
408 	int i, c, c2;
409 	int n;
410 	const uschar *p = (const uschar *) argp;
411 	int *bp, *retp;
412 	static int *buf = NULL;
413 	static int bufsz = 100;
414 
415 	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
416 		FATAL("out of space for character class [%.10s...] 1", p);
417 	bp = buf;
418 	for (i = 0; *p != 0; ) {
419 		n = u8_rune(&c, (const char *) p);
420 		p += n;
421 		if (c == '\\') {
422 			c = quoted(&p);
423 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
424 			if (*p != 0) {
425 				c = bp[-1];
426 				/* c2 = *p++; */
427 				n = u8_rune(&c2, (const char *) p);
428 				p += n;
429 				if (c2 == '\\')
430 					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
431 				if (c > c2) {	/* empty; ignore */
432 					bp--;
433 					i--;
434 					continue;
435 				}
436 				while (c < c2) {
437 					if (i >= bufsz) {
438 						bufsz *= 2;
439 						buf = (int *) realloc(buf, bufsz * sizeof(int));
440 						if (buf == NULL)
441 							FATAL("out of space for character class [%.10s...] 2", p);
442 						bp = buf + i;
443 					}
444 					*bp++ = ++c;
445 					i++;
446 				}
447 				continue;
448 			}
449 		}
450 		if (i >= bufsz) {
451 			bufsz *= 2;
452 			buf = (int *) realloc(buf, bufsz * sizeof(int));
453 			if (buf == NULL)
454 				FATAL("out of space for character class [%.10s...] 2", p);
455 			bp = buf + i;
456 		}
457 		*bp++ = c;
458 		i++;
459 	}
460 	*bp = 0;
461 	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
462 	/* xfree(op);  BUG: what are we freeing here? */
463 	retp = (int *) calloc(bp-buf+1, sizeof(int));
464 	for (i = 0; i < bp-buf+1; i++)
465 		retp[i] = buf[i];
466 	return retp;
467 }
468 
469 void overflo(const char *s)
470 {
471 	FATAL("regular expression too big: out of space in %.30s...", s);
472 }
473 
474 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
475 {
476 	int i;
477 	int *p;
478 
479 	switch (type(v)) {
480 	ELEAF
481 	LEAF
482 		f->re[info(v)].ltype = type(v);
483 		f->re[info(v)].lval.np = right(v);
484 		while (f->accept >= maxsetvec) {	/* guessing here! */
485 			resizesetvec(__func__);
486 		}
487 		for (i = 0; i <= f->accept; i++)
488 			setvec[i] = 0;
489 		setcnt = 0;
490 		follow(v);	/* computes setvec and setcnt */
491 		p = intalloc(setcnt + 1, __func__);
492 		f->re[info(v)].lfollow = p;
493 		*p = setcnt;
494 		for (i = f->accept; i >= 0; i--)
495 			if (setvec[i] == 1)
496 				*++p = i;
497 		break;
498 	UNARY
499 		cfoll(f,left(v));
500 		break;
501 	case CAT:
502 	case OR:
503 		cfoll(f,left(v));
504 		cfoll(f,right(v));
505 		break;
506 	case ZERO:
507 		break;
508 	default:	/* can't happen */
509 		FATAL("can't happen: unknown type %d in cfoll", type(v));
510 	}
511 }
512 
513 int first(Node *p)	/* collects initially active leaves of p into setvec */
514 			/* returns 0 if p matches empty string */
515 {
516 	int b, lp;
517 
518 	switch (type(p)) {
519 	ELEAF
520 	LEAF
521 		lp = info(p);	/* look for high-water mark of subscripts */
522 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
523 			resizesetvec(__func__);
524 		}
525 		if (type(p) == EMPTYRE) {
526 			setvec[lp] = 0;
527 			return(0);
528 		}
529 		if (setvec[lp] != 1) {
530 			setvec[lp] = 1;
531 			setcnt++;
532 		}
533 		if (type(p) == CCL && (*(int *) right(p)) == 0)
534 			return(0);		/* empty CCL */
535 		return(1);
536 	case PLUS:
537 		if (first(left(p)) == 0)
538 			return(0);
539 		return(1);
540 	case STAR:
541 	case QUEST:
542 		first(left(p));
543 		return(0);
544 	case CAT:
545 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
546 		return(1);
547 	case OR:
548 		b = first(right(p));
549 		if (first(left(p)) == 0 || b == 0) return(0);
550 		return(1);
551 	case ZERO:
552 		return 0;
553 	}
554 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
555 	return(-1);
556 }
557 
558 void follow(Node *v)	/* collects leaves that can follow v into setvec */
559 {
560 	Node *p;
561 
562 	if (type(v) == FINAL)
563 		return;
564 	p = parent(v);
565 	switch (type(p)) {
566 	case STAR:
567 	case PLUS:
568 		first(v);
569 		follow(p);
570 		return;
571 
572 	case OR:
573 	case QUEST:
574 		follow(p);
575 		return;
576 
577 	case CAT:
578 		if (v == left(p)) {	/* v is left child of p */
579 			if (first(right(p)) == 0) {
580 				follow(p);
581 				return;
582 			}
583 		} else		/* v is right child */
584 			follow(p);
585 		return;
586 	}
587 }
588 
589 int member(int c, int *sarg)	/* is c in s? */
590 {
591 	int *s = (int *) sarg;
592 
593 	while (*s)
594 		if (c == *s++)
595 			return(1);
596 	return(0);
597 }
598 
599 static void resize_gototab(fa *f, int state)
600 {
601 	size_t new_size = f->gototab[state].allocated * 2;
602 	gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
603 	if (p == NULL)
604 		overflo(__func__);
605 
606 	// need to initialized the new memory to zero
607 	size_t orig_size = f->gototab[state].allocated;		// 2nd half of new mem is this size
608 	memset(p + orig_size, 0, orig_size * sizeof(gtte));	// clean it out
609 
610 	f->gototab[state].allocated = new_size;			// update gototab info
611 	f->gototab[state].entries = p;
612 }
613 
614 static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
615 {
616 	gtte key;
617 	gtte *item;
618 
619 	key.ch = ch;
620 	key.state = 0;	/* irrelevant */
621 	item = (gtte *) bsearch(& key, f->gototab[state].entries,
622 			f->gototab[state].inuse, sizeof(gtte),
623 			entry_cmp);
624 
625 	if (item == NULL)
626 		return 0;
627 	else
628 		return item->state;
629 }
630 
631 static int entry_cmp(const void *l, const void *r)
632 {
633 	const gtte *left, *right;
634 
635 	left = (const gtte *) l;
636 	right = (const gtte *) r;
637 
638 	return left->ch - right->ch;
639 }
640 
641 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
642 {
643 	if (f->gototab[state].inuse == 0) {
644 		f->gototab[state].entries[0].ch = ch;
645 		f->gototab[state].entries[0].state = val;
646 		f->gototab[state].inuse++;
647 		return val;
648 	} else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
649 		// not seen yet, insert and return
650 		gtt *tab = & f->gototab[state];
651 		if (tab->inuse + 1 >= tab->allocated)
652 			resize_gototab(f, state);
653 
654 		f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
655 		f->gototab[state].entries[f->gototab[state].inuse].state = val;
656 		f->gototab[state].inuse++;
657 		return val;
658 	} else {
659 		// maybe we have it, maybe we don't
660 		gtte key;
661 		gtte *item;
662 
663 		key.ch = ch;
664 		key.state = 0;	/* irrelevant */
665 		item = (gtte *) bsearch(& key, f->gototab[state].entries,
666 				f->gototab[state].inuse, sizeof(gtte),
667 				entry_cmp);
668 
669 		if (item != NULL) {
670 			// we have it, update state and return
671 			item->state = val;
672 			return item->state;
673 		}
674 		// otherwise, fall through to insert and reallocate.
675 	}
676 
677 	gtt *tab = & f->gototab[state];
678 	if (tab->inuse + 1 >= tab->allocated)
679 		resize_gototab(f, state);
680 	f->gototab[state].entries[tab->inuse].ch = ch;
681 	f->gototab[state].entries[tab->inuse].state = val;
682 	++tab->inuse;
683 
684 	qsort(f->gototab[state].entries,
685 		f->gototab[state].inuse, sizeof(gtte), entry_cmp);
686 
687 	return val; /* not used anywhere at the moment */
688 }
689 
690 static void clear_gototab(fa *f, int state)
691 {
692 	memset(f->gototab[state].entries, 0,
693 		f->gototab[state].allocated * sizeof(gtte));
694 	f->gototab[state].inuse = 0;
695 }
696 
697 int match(fa *f, const char *p0)	/* shortest match ? */
698 {
699 	int s, ns;
700 	int n;
701 	int rune;
702 	const uschar *p = (const uschar *) p0;
703 
704 	/* return pmatch(f, p0); does it matter whether longest or shortest? */
705 
706 	s = f->initstat;
707 	assert (s < f->state_count);
708 
709 	if (f->out[s])
710 		return(1);
711 	do {
712 		/* assert(*p < NCHARS); */
713 		n = u8_rune(&rune, (const char *) p);
714 		if ((ns = get_gototab(f, s, rune)) != 0)
715 			s = ns;
716 		else
717 			s = cgoto(f, s, rune);
718 		if (f->out[s])
719 			return(1);
720 		if (*p == 0)
721 			break;
722 		p += n;
723 	} while (1);  /* was *p++ != 0 */
724 	return(0);
725 }
726 
727 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
728 {
729 	int s, ns;
730 	int n;
731 	int rune;
732 	const uschar *p = (const uschar *) p0;
733 	const uschar *q;
734 
735 	s = f->initstat;
736 	assert(s < f->state_count);
737 
738 	patbeg = (const char *)p;
739 	patlen = -1;
740 	do {
741 		q = p;
742 		do {
743 			if (f->out[s])		/* final state */
744 				patlen = q-p;
745 			/* assert(*q < NCHARS); */
746 			n = u8_rune(&rune, (const char *) q);
747 			if ((ns = get_gototab(f, s, rune)) != 0)
748 				s = ns;
749 			else
750 				s = cgoto(f, s, rune);
751 
752 			assert(s < f->state_count);
753 
754 			if (s == 1) {	/* no transition */
755 				if (patlen >= 0) {
756 					patbeg = (const char *) p;
757 					return(1);
758 				}
759 				else
760 					goto nextin;	/* no match */
761 			}
762 			if (*q == 0)
763 				break;
764 			q += n;
765 		} while (1);
766 		q++;  /* was *q++ */
767 		if (f->out[s])
768 			patlen = q-p-1;	/* don't count $ */
769 		if (patlen >= 0) {
770 			patbeg = (const char *) p;
771 			return(1);
772 		}
773 	nextin:
774 		s = 2;
775 		if (*p == 0)
776 			break;
777 		n = u8_rune(&rune, (const char *) p);
778 		p += n;
779 	} while (1); /* was *p++ */
780 	return (0);
781 }
782 
783 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
784 {
785 	int s, ns;
786         int n;
787         int rune;
788 	const uschar *p = (const uschar *) p0;
789 	const uschar *q;
790 
791 	s = f->initstat;
792 	assert(s < f->state_count);
793 
794 	patbeg = (const char *)p;
795 	patlen = -1;
796 	while (*p) {
797 		q = p;
798 		do {
799 			if (f->out[s])		/* final state */
800 				patlen = q-p;
801 			/* assert(*q < NCHARS); */
802 			n = u8_rune(&rune, (const char *) q);
803 			if ((ns = get_gototab(f, s, rune)) != 0)
804 				s = ns;
805 			else
806 				s = cgoto(f, s, rune);
807 			if (s == 1) {	/* no transition */
808 				if (patlen > 0) {
809 					patbeg = (const char *) p;
810 					return(1);
811 				} else
812 					goto nnextin;	/* no nonempty match */
813 			}
814 			if (*q == 0)
815 				break;
816 			q += n;
817 		} while (1);
818 		q++;
819 		if (f->out[s])
820 			patlen = q-p-1;	/* don't count $ */
821 		if (patlen > 0 ) {
822 			patbeg = (const char *) p;
823 			return(1);
824 		}
825 	nnextin:
826 		s = 2;
827 		p++;
828 	}
829 	return (0);
830 }
831 
832 
833 /*
834  * NAME
835  *     fnematch
836  *
837  * DESCRIPTION
838  *     A stream-fed version of nematch which transfers characters to a
839  *     null-terminated buffer. All characters up to and including the last
840  *     character of the matching text or EOF are placed in the buffer. If
841  *     a match is found, patbeg and patlen are set appropriately.
842  *
843  * RETURN VALUES
844  *     false    No match found.
845  *     true     Match found.
846  */
847 
848 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
849 {
850 	char *i, *j, *k, *buf = *pbuf;
851 	int bufsize = *pbufsize;
852 	int c, n, ns, s;
853 
854 	s = pfa->initstat;
855 	patlen = 0;
856 
857 	/*
858 	 * buf <= i <= j <= k <= buf+bufsize
859 	 *
860 	 * i: origin of active substring
861 	 * j: current character
862 	 * k: destination of the next getc
863 	 */
864 
865 	i = j = k = buf;
866 
867 	do {
868 		/*
869 		 * Call u8_rune with at least awk_mb_cur_max ahead in
870 		 * the buffer until EOF interferes.
871 		 */
872 		if (k - j < awk_mb_cur_max) {
873 			if (k + awk_mb_cur_max > buf + bufsize) {
874 				char *obuf = buf;
875 				adjbuf((char **) &buf, &bufsize,
876 				    bufsize + awk_mb_cur_max,
877 				    quantum, 0, "fnematch");
878 
879 				/* buf resized, maybe moved. update pointers */
880 				*pbufsize = bufsize;
881 				if (obuf != buf) {
882 					i = buf + (i - obuf);
883 					j = buf + (j - obuf);
884 					k = buf + (k - obuf);
885 					*pbuf = buf;
886 					if (patlen)
887 						patbeg = buf + (patbeg - obuf);
888 				}
889 			}
890 			for (n = awk_mb_cur_max ; n > 0; n--) {
891 				*k++ = (c = getc(f)) != EOF ? c : 0;
892 				if (c == EOF) {
893 					if (ferror(f))
894 						FATAL("fnematch: getc error");
895 					break;
896 				}
897 			}
898 		}
899 
900 		j += u8_rune(&c, j);
901 
902 		if ((ns = get_gototab(pfa, s, c)) != 0)
903 			s = ns;
904 		else
905 			s = cgoto(pfa, s, c);
906 
907 		if (pfa->out[s]) {	/* final state */
908 			patbeg = i;
909 			patlen = j - i;
910 			if (c == 0)	/* don't count $ */
911 				patlen--;
912 		}
913 
914 		if (c && s != 1)
915 			continue;  /* origin i still viable, next j */
916 		if (patlen)
917 			break;     /* best match found */
918 
919 		/* no match at origin i, next i and start over */
920 		i += u8_rune(&c, i);
921 		if (c == 0)
922 			break;    /* no match */
923 		j = i;
924 		s = 2;
925 	} while (1);
926 
927 	if (patlen) {
928 		/*
929 		 * Under no circumstances is the last character fed to
930 		 * the automaton part of the match. It is EOF's nullbyte,
931 		 * or it sent the automaton into a state with no further
932 		 * transitions available (s==1), or both. Room for a
933 		 * terminating nullbyte is guaranteed.
934 		 *
935 		 * ungetc any chars after the end of matching text
936 		 * (except for EOF's nullbyte, if present) and null
937 		 * terminate the buffer.
938 		 */
939 		do
940 			if (*--k && ungetc(*k, f) == EOF)
941 				FATAL("unable to ungetc '%c'", *k);
942 		while (k > patbeg + patlen);
943 		*k = '\0';
944 		return true;
945 	}
946 	else
947 		return false;
948 }
949 
950 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
951 {			/* uses relex() to scan regular expression */
952 	Node *np;
953 
954 	DPRINTF("reparse <%s>\n", p);
955 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
956 	rtok = relex();
957 	/* GNU compatibility: an empty regexp matches anything */
958 	if (rtok == '\0') {
959 		/* FATAL("empty regular expression"); previous */
960 		return(op2(EMPTYRE, NIL, NIL));
961 	}
962 	np = regexp();
963 	if (rtok != '\0')
964 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
965 	return(np);
966 }
967 
968 Node *regexp(void)	/* top-level parse of reg expr */
969 {
970 	return (alt(concat(primary())));
971 }
972 
973 Node *primary(void)
974 {
975 	Node *np;
976 	int savelastatom;
977 
978 	switch (rtok) {
979 	case CHAR:
980 		lastatom = starttok;
981 		np = op2(CHAR, NIL, itonp(rlxval));
982 		rtok = relex();
983 		return (unary(np));
984 	case ALL:
985 		rtok = relex();
986 		return (unary(op2(ALL, NIL, NIL)));
987 	case EMPTYRE:
988 		rtok = relex();
989 		return (unary(op2(EMPTYRE, NIL, NIL)));
990 	case DOT:
991 		lastatom = starttok;
992 		rtok = relex();
993 		return (unary(op2(DOT, NIL, NIL)));
994 	case CCL:
995 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
996 		lastatom = starttok;
997 		rtok = relex();
998 		return (unary(np));
999 	case NCCL:
1000 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1001 		lastatom = starttok;
1002 		rtok = relex();
1003 		return (unary(np));
1004 	case '^':
1005 		rtok = relex();
1006 		return (unary(op2(CHAR, NIL, itonp(HAT))));
1007 	case '$':
1008 		rtok = relex();
1009 		return (unary(op2(CHAR, NIL, NIL)));
1010 	case '(':
1011 		lastatom = starttok;
1012 		savelastatom = starttok - basestr; /* Retain over recursion */
1013 		rtok = relex();
1014 		if (rtok == ')') {	/* special pleading for () */
1015 			rtok = relex();
1016 			return unary(op2(CCL, NIL, (Node *) cclenter("")));
1017 		}
1018 		np = regexp();
1019 		if (rtok == ')') {
1020 			lastatom = basestr + savelastatom; /* Restore */
1021 			rtok = relex();
1022 			return (unary(np));
1023 		}
1024 		else
1025 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1026 	default:
1027 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1028 	}
1029 	return 0;	/*NOTREACHED*/
1030 }
1031 
1032 Node *concat(Node *np)
1033 {
1034 	switch (rtok) {
1035 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1036 		return (concat(op2(CAT, np, primary())));
1037 	case EMPTYRE:
1038 		rtok = relex();
1039 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1040 				primary())));
1041 	}
1042 	return (np);
1043 }
1044 
1045 Node *alt(Node *np)
1046 {
1047 	if (rtok == OR) {
1048 		rtok = relex();
1049 		return (alt(op2(OR, np, concat(primary()))));
1050 	}
1051 	return (np);
1052 }
1053 
1054 Node *unary(Node *np)
1055 {
1056 	switch (rtok) {
1057 	case STAR:
1058 		rtok = relex();
1059 		return (unary(op2(STAR, np, NIL)));
1060 	case PLUS:
1061 		rtok = relex();
1062 		return (unary(op2(PLUS, np, NIL)));
1063 	case QUEST:
1064 		rtok = relex();
1065 		return (unary(op2(QUEST, np, NIL)));
1066 	case ZERO:
1067 		rtok = relex();
1068 		return (unary(op2(ZERO, np, NIL)));
1069 	default:
1070 		return (np);
1071 	}
1072 }
1073 
1074 /*
1075  * Character class definitions conformant to the POSIX locale as
1076  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1077  * and operating character sets are both ASCII (ISO646) or supersets
1078  * thereof.
1079  *
1080  * Note that to avoid overflowing the temporary buffer used in
1081  * relex(), the expanded character class (prior to range expansion)
1082  * must be less than twice the size of their full name.
1083  */
1084 
1085 /* Because isblank doesn't show up in any of the header files on any
1086  * system i use, it's defined here.  if some other locale has a richer
1087  * definition of "blank", define HAS_ISBLANK and provide your own
1088  * version.
1089  * the parentheses here are an attempt to find a path through the maze
1090  * of macro definition and/or function and/or version provided.  thanks
1091  * to nelson beebe for the suggestion; let's see if it works everywhere.
1092  */
1093 
1094 /* #define HAS_ISBLANK */
1095 #ifndef HAS_ISBLANK
1096 
1097 int (xisblank)(int c)
1098 {
1099 	return c==' ' || c=='\t';
1100 }
1101 
1102 #endif
1103 
1104 static const struct charclass {
1105 	const char *cc_name;
1106 	int cc_namelen;
1107 	int (*cc_func)(int);
1108 } charclasses[] = {
1109 	{ "alnum",	5,	isalnum },
1110 	{ "alpha",	5,	isalpha },
1111 #ifndef HAS_ISBLANK
1112 	{ "blank",	5,	xisblank },
1113 #else
1114 	{ "blank",	5,	isblank },
1115 #endif
1116 	{ "cntrl",	5,	iscntrl },
1117 	{ "digit",	5,	isdigit },
1118 	{ "graph",	5,	isgraph },
1119 	{ "lower",	5,	islower },
1120 	{ "print",	5,	isprint },
1121 	{ "punct",	5,	ispunct },
1122 	{ "space",	5,	isspace },
1123 	{ "upper",	5,	isupper },
1124 	{ "xdigit",	6,	isxdigit },
1125 	{ NULL,		0,	NULL },
1126 };
1127 
1128 #define REPEAT_SIMPLE		0
1129 #define REPEAT_PLUS_APPENDED	1
1130 #define REPEAT_WITH_Q		2
1131 #define REPEAT_ZERO		3
1132 
1133 static int
1134 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1135 	       int atomlen, int firstnum, int secondnum, int special_case)
1136 {
1137 	int i, j;
1138 	uschar *buf = 0;
1139 	int ret = 1;
1140 	int init_q = (firstnum == 0);		/* first added char will be ? */
1141 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
1142 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
1143 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
1144 	int size = prefix_length +  suffix_length;
1145 
1146 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
1147 		size += atomlen*(firstnum-1);
1148 	}
1149 
1150 	/* Adjust size of buffer for special cases */
1151 	if (special_case == REPEAT_PLUS_APPENDED) {
1152 		size++;		/* for the final + */
1153 	} else if (special_case == REPEAT_WITH_Q) {
1154 		size += init_q + (atomlen+1)* (n_q_reps-init_q);
1155 	} else if (special_case == REPEAT_ZERO) {
1156 		size += 2;	/* just a null ERE: () */
1157 	}
1158 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
1159 		FATAL("out of space in reg expr %.10s..", lastre);
1160 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
1161 	j = prefix_length;
1162 	if (special_case == REPEAT_ZERO) {
1163 		j -= atomlen;
1164 		buf[j++] = '(';
1165 		buf[j++] = ')';
1166 	}
1167 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
1168 		memcpy(&buf[j], atom, atomlen);
1169 		j += atomlen;
1170 	}
1171 	if (special_case == REPEAT_PLUS_APPENDED) {
1172 		buf[j++] = '+';
1173 	} else if (special_case == REPEAT_WITH_Q) {
1174 		if (init_q)
1175 			buf[j++] = '?';
1176 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
1177 			memcpy(&buf[j], atom, atomlen);
1178 			j += atomlen;
1179 			buf[j++] = '?';
1180 		}
1181 	}
1182 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
1183 	j += suffix_length;
1184 	buf[j] = '\0';
1185 	/* free old basestr */
1186 	if (firstbasestr != basestr) {
1187 		if (basestr)
1188 			xfree(basestr);
1189 	}
1190 	basestr = buf;
1191 	prestr  = buf + prefix_length;
1192 	if (special_case == REPEAT_ZERO) {
1193 		prestr  -= atomlen;
1194 		ret++;
1195 	}
1196 	return ret;
1197 }
1198 
1199 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1200 		  int atomlen, int firstnum, int secondnum)
1201 {
1202 	/*
1203 	   In general, the repetition specifier or "bound" is replaced here
1204 	   by an equivalent ERE string, repeating the immediately previous atom
1205 	   and appending ? and + as needed. Note that the first copy of the
1206 	   atom is left in place, except in the special_case of a zero-repeat
1207 	   (i.e., {0}).
1208 	 */
1209 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1210 		if (firstnum < 2) {
1211 			/* 0 or 1: should be handled before you get here */
1212 			FATAL("internal error");
1213 		} else {
1214 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1215 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1216 		}
1217 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1218 		if (firstnum == 0) {	/* {0} or {0,0} */
1219 			/* This case is unusual because the resulting
1220 			   replacement string might actually be SMALLER than
1221 			   the original ERE */
1222 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1223 					firstnum, secondnum, REPEAT_ZERO);
1224 		} else {		/* (firstnum >= 1) */
1225 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1226 					firstnum, secondnum, REPEAT_SIMPLE);
1227 		}
1228 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1229 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1230 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1231 					firstnum, secondnum, REPEAT_WITH_Q);
1232 	} else {	/* Error - shouldn't be here (n>m) */
1233 		FATAL("internal error");
1234 	}
1235 	return 0;
1236 }
1237 
1238 int relex(void)		/* lexical analyzer for reparse */
1239 {
1240 	int c, n;
1241 	int cflag;
1242 	static uschar *buf = NULL;
1243 	static int bufsz = 100;
1244 	uschar *bp;
1245 	const struct charclass *cc;
1246 	int i;
1247 	int num, m;
1248 	bool commafound, digitfound;
1249 	const uschar *startreptok;
1250 	static int parens = 0;
1251 
1252 rescan:
1253 	starttok = prestr;
1254 
1255 	if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1256 		prestr += n;
1257 		starttok = prestr;
1258 		return CHAR;
1259 	}
1260 
1261 	switch (c = *prestr++) {
1262 	case '|': return OR;
1263 	case '*': return STAR;
1264 	case '+': return PLUS;
1265 	case '?': return QUEST;
1266 	case '.': return DOT;
1267 	case '\0': prestr--; return '\0';
1268 	case '^':
1269 	case '$':
1270 		return c;
1271 	case '(':
1272 		parens++;
1273 		return c;
1274 	case ')':
1275 		if (parens) {
1276 			parens--;
1277 			return c;
1278 		}
1279 		/* unmatched close parenthesis; per POSIX, treat as literal */
1280 		rlxval = c;
1281 		return CHAR;
1282 	case '\\':
1283 		rlxval = quoted(&prestr);
1284 		return CHAR;
1285 	default:
1286 		rlxval = c;
1287 		return CHAR;
1288 	case '[':
1289 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1290 			FATAL("out of space in reg expr %.10s..", lastre);
1291 		bp = buf;
1292 		if (*prestr == '^') {
1293 			cflag = 1;
1294 			prestr++;
1295 		}
1296 		else
1297 			cflag = 0;
1298 		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1299 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1300 			FATAL("out of space for reg expr %.10s...", lastre);
1301 		for (; ; ) {
1302 			if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1303 				for (i = 0; i < n; i++)
1304 					*bp++ = *prestr++;
1305 				continue;
1306 			}
1307 			if ((c = *prestr++) == '\\') {
1308 				*bp++ = '\\';
1309 				if ((c = *prestr++) == '\0')
1310 					FATAL("nonterminated character class %.20s...", lastre);
1311 				*bp++ = c;
1312 			/* } else if (c == '\n') { */
1313 			/* 	FATAL("newline in character class %.20s...", lastre); */
1314 			} else if (c == '[' && *prestr == ':') {
1315 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1316 				for (cc = charclasses; cc->cc_name; cc++)
1317 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1318 						break;
1319 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1320 				    prestr[2 + cc->cc_namelen] == ']') {
1321 					prestr += cc->cc_namelen + 3;
1322 					/*
1323 					 * BUG: We begin at 1, instead of 0, since we
1324 					 * would otherwise prematurely terminate the
1325 					 * string for classes like [[:cntrl:]]. This
1326 					 * means that we can't match the NUL character,
1327 					 * not without first adapting the entire
1328 					 * program to track each string's length.
1329 					 */
1330 					for (i = 1; i <= UCHAR_MAX; i++) {
1331 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1332 						    FATAL("out of space for reg expr %.10s...", lastre);
1333 						if (cc->cc_func(i)) {
1334 							/* escape backslash */
1335 							if (i == '\\') {
1336 								*bp++ = '\\';
1337 								n++;
1338 							}
1339 
1340 							*bp++ = i;
1341 							n++;
1342 						}
1343 					}
1344 				} else
1345 					*bp++ = c;
1346 			} else if (c == '[' && *prestr == '.') {
1347 				char collate_char;
1348 				prestr++;
1349 				collate_char = *prestr++;
1350 				if (*prestr == '.' && prestr[1] == ']') {
1351 					prestr += 2;
1352 					/* Found it: map via locale TBD: for
1353 					   now, simply return this char.  This
1354 					   is sufficient to pass conformance
1355 					   test awk.ex 156
1356 					 */
1357 					if (*prestr == ']') {
1358 						prestr++;
1359 						rlxval = collate_char;
1360 						return CHAR;
1361 					}
1362 				}
1363 			} else if (c == '[' && *prestr == '=') {
1364 				char equiv_char;
1365 				prestr++;
1366 				equiv_char = *prestr++;
1367 				if (*prestr == '=' && prestr[1] == ']') {
1368 					prestr += 2;
1369 					/* Found it: map via locale TBD: for now
1370 					   simply return this char. This is
1371 					   sufficient to pass conformance test
1372 					   awk.ex 156
1373 					 */
1374 					if (*prestr == ']') {
1375 						prestr++;
1376 						rlxval = equiv_char;
1377 						return CHAR;
1378 					}
1379 				}
1380 			} else if (c == '\0') {
1381 				FATAL("nonterminated character class %.20s", lastre);
1382 			} else if (bp == buf) {	/* 1st char is special */
1383 				*bp++ = c;
1384 			} else if (c == ']') {
1385 				*bp++ = 0;
1386 				rlxstr = (uschar *) tostring((char *) buf);
1387 				if (cflag == 0)
1388 					return CCL;
1389 				else
1390 					return NCCL;
1391 			} else
1392 				*bp++ = c;
1393 		}
1394 		break;
1395 	case '{':
1396 		if (isdigit((int) *(prestr))) {
1397 			num = 0;	/* Process as a repetition */
1398 			n = -1; m = -1;
1399 			commafound = false;
1400 			digitfound = false;
1401 			startreptok = prestr-1;
1402 			/* Remember start of previous atom here ? */
1403 		} else {        	/* just a { char, not a repetition */
1404 			rlxval = c;
1405 			return CHAR;
1406                 }
1407 		for (; ; ) {
1408 			if ((c = *prestr++) == '}') {
1409 				if (commafound) {
1410 					if (digitfound) { /* {n,m} */
1411 						m = num;
1412 						if (m < n)
1413 							FATAL("illegal repetition expression: class %.20s",
1414 								lastre);
1415 						if (n == 0 && m == 1) {
1416 							return QUEST;
1417 						}
1418 					} else {	/* {n,} */
1419 						if (n == 0)
1420 							return STAR;
1421 						else if (n == 1)
1422 							return PLUS;
1423 					}
1424 				} else {
1425 					if (digitfound) { /* {n} same as {n,n} */
1426 						n = num;
1427 						m = num;
1428 					} else {	/* {} */
1429 						FATAL("illegal repetition expression: class %.20s",
1430 							lastre);
1431 					}
1432 				}
1433 				if (repeat(starttok, prestr-starttok, lastatom,
1434 					   startreptok - lastatom, n, m) > 0) {
1435 					if (n == 0 && m == 0) {
1436 						return ZERO;
1437 					}
1438 					/* must rescan input for next token */
1439 					goto rescan;
1440 				}
1441 				/* Failed to replace: eat up {...} characters
1442 				   and treat like just PLUS */
1443 				return PLUS;
1444 			} else if (c == '\0') {
1445 				FATAL("nonterminated character class %.20s",
1446 					lastre);
1447 			} else if (isdigit(c)) {
1448 				num = 10 * num + c - '0';
1449 				digitfound = true;
1450 			} else if (c == ',') {
1451 				if (commafound)
1452 					FATAL("illegal repetition expression: class %.20s",
1453 						lastre);
1454 				/* looking for {n,} or {n,m} */
1455 				commafound = true;
1456 				n = num;
1457 				digitfound = false; /* reset */
1458 				num = 0;
1459 			} else {
1460 				FATAL("illegal repetition expression: class %.20s",
1461 					lastre);
1462 			}
1463 		}
1464 		break;
1465 	}
1466 }
1467 
1468 int cgoto(fa *f, int s, int c)
1469 {
1470 	int *p, *q;
1471 	int i, j, k;
1472 
1473 	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
1474 	while (f->accept >= maxsetvec) {	/* guessing here! */
1475 		resizesetvec(__func__);
1476 	}
1477 	for (i = 0; i <= f->accept; i++)
1478 		setvec[i] = 0;
1479 	setcnt = 0;
1480 	resize_state(f, s);
1481 	/* compute positions of gototab[s,c] into setvec */
1482 	p = f->posns[s];
1483 	for (i = 1; i <= *p; i++) {
1484 		if ((k = f->re[p[i]].ltype) != FINAL) {
1485 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1486 			 || (k == DOT && c != 0 && c != HAT)
1487 			 || (k == ALL && c != 0)
1488 			 || (k == EMPTYRE && c != 0)
1489 			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1490 			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1491 				q = f->re[p[i]].lfollow;
1492 				for (j = 1; j <= *q; j++) {
1493 					if (q[j] >= maxsetvec) {
1494 						resizesetvec(__func__);
1495 					}
1496 					if (setvec[q[j]] == 0) {
1497 						setcnt++;
1498 						setvec[q[j]] = 1;
1499 					}
1500 				}
1501 			}
1502 		}
1503 	}
1504 	/* determine if setvec is a previous state */
1505 	tmpset[0] = setcnt;
1506 	j = 1;
1507 	for (i = f->accept; i >= 0; i--)
1508 		if (setvec[i]) {
1509 			tmpset[j++] = i;
1510 		}
1511 	resize_state(f, f->curstat > s ? f->curstat : s);
1512 	/* tmpset == previous state? */
1513 	for (i = 1; i <= f->curstat; i++) {
1514 		p = f->posns[i];
1515 		if ((k = tmpset[0]) != p[0])
1516 			goto different;
1517 		for (j = 1; j <= k; j++)
1518 			if (tmpset[j] != p[j])
1519 				goto different;
1520 		/* setvec is state i */
1521 		if (c != HAT)
1522 			set_gototab(f, s, c, i);
1523 		return i;
1524 	  different:;
1525 	}
1526 
1527 	/* add tmpset to current set of states */
1528 	++(f->curstat);
1529 	resize_state(f, f->curstat);
1530 	clear_gototab(f, f->curstat);
1531 	xfree(f->posns[f->curstat]);
1532 	p = intalloc(setcnt + 1, __func__);
1533 
1534 	f->posns[f->curstat] = p;
1535 	if (c != HAT)
1536 		set_gototab(f, s, c, f->curstat);
1537 	for (i = 0; i <= setcnt; i++)
1538 		p[i] = tmpset[i];
1539 	if (setvec[f->accept])
1540 		f->out[f->curstat] = 1;
1541 	else
1542 		f->out[f->curstat] = 0;
1543 	return f->curstat;
1544 }
1545 
1546 
1547 void freefa(fa *f)	/* free a finite automaton */
1548 {
1549 	int i;
1550 
1551 	if (f == NULL)
1552 		return;
1553 	for (i = 0; i < f->state_count; i++)
1554 		xfree(f->gototab[i].entries);
1555 	xfree(f->gototab);
1556 	for (i = 0; i <= f->curstat; i++)
1557 		xfree(f->posns[i]);
1558 	for (i = 0; i <= f->accept; i++) {
1559 		xfree(f->re[i].lfollow);
1560 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1561 			xfree(f->re[i].lval.np);
1562 	}
1563 	xfree(f->restr);
1564 	xfree(f->out);
1565 	xfree(f->posns);
1566 	xfree(f->gototab);
1567 	xfree(f);
1568 }
1569