xref: /freebsd/contrib/one-true-awk/b.c (revision 17853db4b0dc36ed32af039cd803f13b692913da)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #define	DEBUG
28 
29 #include <ctype.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include "awk.h"
35 #include "awkgram.tab.h"
36 
37 #define MAXLIN 22
38 
39 #define type(v)		(v)->nobj	/* badly overloaded here */
40 #define info(v)		(v)->ntype	/* badly overloaded here */
41 #define left(v)		(v)->narg[0]
42 #define right(v)	(v)->narg[1]
43 #define parent(v)	(v)->nnext
44 
45 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
47 #define UNARY	case STAR: case PLUS: case QUEST:
48 
49 /* encoding in tree Nodes:
50 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 		left is index, right contains value or pointer to value
52 	unary (STAR, PLUS, QUEST): left is child, right is null
53 	binary (CAT, OR): left and right are children
54 	parent contains pointer to parent
55 */
56 
57 
58 int	*setvec;
59 int	*tmpset;
60 int	maxsetvec = 0;
61 
62 int	rtok;		/* next token in current re */
63 int	rlxval;
64 static const uschar	*rlxstr;
65 static const uschar	*prestr;	/* current position in current re */
66 static const uschar	*lastre;	/* origin of last re */
67 static const uschar	*lastatom;	/* origin of last Atom */
68 static const uschar	*starttok;
69 static const uschar 	*basestr;	/* starts with original, replaced during
70 				   repetition processing */
71 static const uschar 	*firstbasestr;
72 
73 static	int setcnt;
74 static	int poscnt;
75 
76 const char	*patbeg;
77 int	patlen;
78 
79 #define	NFA	128	/* cache this many dynamic fa's */
80 fa	*fatab[NFA];
81 int	nfatab	= 0;	/* entries in fatab */
82 
83 extern int u8_nextlen(const char *s);
84 
85 
86 /* utf-8 mechanism:
87 
88    For most of Awk, utf-8 strings just "work", since they look like
89    null-terminated sequences of 8-bit bytes.
90 
91    Functions like length(), index(), and substr() have to operate
92    in units of utf-8 characters.  The u8_* functions in run.c
93    handle this.
94 
95    Regular expressions are more complicated, since the basic
96    mechanism of the goto table used 8-bit byte indices into the
97    gototab entries to compute the next state.  Unicode is a lot
98    bigger, so the gototab entries are now structs with a character
99    and a next state. These are sorted by code point and binary
100    searched.
101 
102    Throughout the RE mechanism in b.c, utf-8 characters are
103    converted to their utf-32 value.  This mostly shows up in
104    cclenter, which expands character class ranges like a-z and now
105    alpha-omega.  The size of a gototab array is still about 256.
106    This should be dynamic, but for now things work ok for a single
107    code page of Unicode, which is the most likely case.
108 
109    The code changes are localized in run.c and b.c.  I have added a
110    handful of functions to somewhat better hide the implementation,
111    but a lot more could be done.
112 
113  */
114 
115 static int entry_cmp(const void *l, const void *r);
116 static int get_gototab(fa*, int, int);
117 static int set_gototab(fa*, int, int, int);
118 static void clear_gototab(fa*, int);
119 extern int u8_rune(int *, const char *);
120 
121 static int *
intalloc(size_t n,const char * f)122 intalloc(size_t n, const char *f)
123 {
124 	int *p = (int *) calloc(n, sizeof(int));
125 	if (p == NULL)
126 		overflo(f);
127 	return p;
128 }
129 
130 static void
resizesetvec(const char * f)131 resizesetvec(const char *f)
132 {
133 	if (maxsetvec == 0)
134 		maxsetvec = MAXLIN;
135 	else
136 		maxsetvec *= 4;
137 	setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
138 	tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
139 	if (setvec == NULL || tmpset == NULL)
140 		overflo(f);
141 }
142 
143 static void
resize_state(fa * f,int state)144 resize_state(fa *f, int state)
145 {
146 	gtt *p;
147 	uschar *p2;
148 	int **p3;
149 	int i, new_count;
150 
151 	if (++state < f->state_count)
152 		return;
153 
154 	new_count = state + 10; /* needs to be tuned */
155 
156 	p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
157 	if (p == NULL)
158 		goto out;
159 	f->gototab = p;
160 
161 	p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
162 	if (p2 == NULL)
163 		goto out;
164 	f->out = p2;
165 
166 	p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
167 	if (p3 == NULL)
168 		goto out;
169 	f->posns = p3;
170 
171 	for (i = f->state_count; i < new_count; ++i) {
172 		f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
173 		if (f->gototab[i].entries == NULL)
174 			goto out;
175 		f->gototab[i].allocated = NCHARS;
176 		f->gototab[i].inuse = 0;
177 		f->out[i] = 0;
178 		f->posns[i] = NULL;
179 	}
180 	f->state_count = new_count;
181 	return;
182 out:
183 	overflo(__func__);
184 }
185 
makedfa(const char * s,bool anchor)186 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
187 {
188 	int i, use, nuse;
189 	fa *pfa;
190 	static int now = 1;
191 
192 	if (setvec == NULL) {	/* first time through any RE */
193 		resizesetvec(__func__);
194 	}
195 
196 	if (compile_time != RUNNING)	/* a constant for sure */
197 		return mkdfa(s, anchor);
198 	for (i = 0; i < nfatab; i++)	/* is it there already? */
199 		if (fatab[i]->anchor == anchor
200 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
201 			fatab[i]->use = now++;
202 			return fatab[i];
203 		}
204 	pfa = mkdfa(s, anchor);
205 	if (nfatab < NFA) {	/* room for another */
206 		fatab[nfatab] = pfa;
207 		fatab[nfatab]->use = now++;
208 		nfatab++;
209 		return pfa;
210 	}
211 	use = fatab[0]->use;	/* replace least-recently used */
212 	nuse = 0;
213 	for (i = 1; i < nfatab; i++)
214 		if (fatab[i]->use < use) {
215 			use = fatab[i]->use;
216 			nuse = i;
217 		}
218 	freefa(fatab[nuse]);
219 	fatab[nuse] = pfa;
220 	pfa->use = now++;
221 	return pfa;
222 }
223 
mkdfa(const char * s,bool anchor)224 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
225 				/* anchor = true for anchored matches, else false */
226 {
227 	Node *p, *p1;
228 	fa *f;
229 
230 	firstbasestr = (const uschar *) s;
231 	basestr = firstbasestr;
232 	p = reparse(s);
233 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
234 		/* put ALL STAR in front of reg.  exp. */
235 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
236 		/* put FINAL after reg.  exp. */
237 
238 	poscnt = 0;
239 	penter(p1);	/* enter parent pointers and leaf indices */
240 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
241 		overflo(__func__);
242 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
243 	cfoll(f, p1);	/* set up follow sets */
244 	freetr(p1);
245 	resize_state(f, 1);
246 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
247 	f->posns[1] = intalloc(1, __func__);
248 	*f->posns[1] = 0;
249 	f->initstat = makeinit(f, anchor);
250 	f->anchor = anchor;
251 	f->restr = (uschar *) tostring(s);
252 	if (firstbasestr != basestr) {
253 		if (basestr)
254 			xfree(basestr);
255 	}
256 	return f;
257 }
258 
makeinit(fa * f,bool anchor)259 int makeinit(fa *f, bool anchor)
260 {
261 	int i, k;
262 
263 	f->curstat = 2;
264 	f->out[2] = 0;
265 	k = *(f->re[0].lfollow);
266 	xfree(f->posns[2]);
267 	f->posns[2] = intalloc(k + 1,  __func__);
268 	for (i = 0; i <= k; i++) {
269 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
270 	}
271 	if ((f->posns[2])[1] == f->accept)
272 		f->out[2] = 1;
273 	clear_gototab(f, 2);
274 	f->curstat = cgoto(f, 2, HAT);
275 	if (anchor) {
276 		*f->posns[2] = k-1;	/* leave out position 0 */
277 		for (i = 0; i < k; i++) {
278 			(f->posns[0])[i] = (f->posns[2])[i];
279 		}
280 
281 		f->out[0] = f->out[2];
282 		if (f->curstat != 2)
283 			--(*f->posns[f->curstat]);
284 	}
285 	return f->curstat;
286 }
287 
penter(Node * p)288 void penter(Node *p)	/* set up parent pointers and leaf indices */
289 {
290 	switch (type(p)) {
291 	ELEAF
292 	LEAF
293 		info(p) = poscnt;
294 		poscnt++;
295 		break;
296 	UNARY
297 		penter(left(p));
298 		parent(left(p)) = p;
299 		break;
300 	case CAT:
301 	case OR:
302 		penter(left(p));
303 		penter(right(p));
304 		parent(left(p)) = p;
305 		parent(right(p)) = p;
306 		break;
307 	case ZERO:
308 		break;
309 	default:	/* can't happen */
310 		FATAL("can't happen: unknown type %d in penter", type(p));
311 		break;
312 	}
313 }
314 
freetr(Node * p)315 void freetr(Node *p)	/* free parse tree */
316 {
317 	switch (type(p)) {
318 	ELEAF
319 	LEAF
320 		xfree(p);
321 		break;
322 	UNARY
323 	case ZERO:
324 		freetr(left(p));
325 		xfree(p);
326 		break;
327 	case CAT:
328 	case OR:
329 		freetr(left(p));
330 		freetr(right(p));
331 		xfree(p);
332 		break;
333 	default:	/* can't happen */
334 		FATAL("can't happen: unknown type %d in freetr", type(p));
335 		break;
336 	}
337 }
338 
339 /* in the parsing of regular expressions, metacharacters like . have */
340 /* to be seen literally;  \056 is not a metacharacter. */
341 
hexstr(const uschar ** pp,int max)342 int hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
343 {			/* only pick up one 8-bit byte (2 chars) */
344 	const uschar *p;
345 	int n = 0;
346 	int i;
347 
348 	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
349 		if (isdigit((int) *p))
350 			n = 16 * n + *p - '0';
351 		else if (*p >= 'a' && *p <= 'f')
352 			n = 16 * n + *p - 'a' + 10;
353 		else if (*p >= 'A' && *p <= 'F')
354 			n = 16 * n + *p - 'A' + 10;
355 	}
356 	*pp = p;
357 	return n;
358 }
359 
360 
361 
362 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
363 
quoted(const uschar ** pp)364 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
365 			/* and increment *pp */
366 {
367 	const uschar *p = *pp;
368 	int c;
369 
370 /* BUG: should advance by utf-8 char even if makes no sense */
371 
372 	switch ((c = *p++)) {
373 	case 't':
374 		c = '\t';
375 		break;
376 	case 'n':
377 		c = '\n';
378 		break;
379 	case 'f':
380 		c = '\f';
381 		break;
382 	case 'r':
383 		c = '\r';
384 		break;
385 	case 'b':
386 		c = '\b';
387 		break;
388 	case 'v':
389 		c = '\v';
390 		break;
391 	case 'a':
392 		c = '\a';
393 		break;
394 	case '\\':
395 		c = '\\';
396 		break;
397 	case 'x': /* 2 hex digits follow */
398 		c = hexstr(&p, 2); /* this adds a null if number is invalid */
399 		break;
400 	case 'u': /* unicode char number up to 8 hex digits */
401 		c = hexstr(&p, 8);
402 		break;
403 	default:
404 		if (isoctdigit(c)) { /* \d \dd \ddd */
405 			int n = c - '0';
406 			if (isoctdigit(*p)) {
407 				n = 8 * n + *p++ - '0';
408 				if (isoctdigit(*p))
409 					n = 8 * n + *p++ - '0';
410 			}
411 			c = n;
412 		}
413 	}
414 
415 	*pp = p;
416 	return c;
417 }
418 
cclenter(const char * argp)419 int *cclenter(const char *argp)	/* add a character class */
420 {
421 	int i, c, c2;
422 	int n;
423 	const uschar *p = (const uschar *) argp;
424 	int *bp, *retp;
425 	static int *buf = NULL;
426 	static int bufsz = 100;
427 
428 	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
429 		FATAL("out of space for character class [%.10s...] 1", p);
430 	bp = buf;
431 	for (i = 0; *p != 0; ) {
432 		n = u8_rune(&c, (const char *) p);
433 		p += n;
434 		if (c == '\\') {
435 			c = quoted(&p);
436 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
437 			if (*p != 0) {
438 				c = bp[-1];
439 				/* c2 = *p++; */
440 				n = u8_rune(&c2, (const char *) p);
441 				p += n;
442 				if (c2 == '\\')
443 					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
444 				if (c > c2) {	/* empty; ignore */
445 					bp--;
446 					i--;
447 					continue;
448 				}
449 				while (c < c2) {
450 					if (i >= bufsz) {
451 						bufsz *= 2;
452 						buf = (int *) realloc(buf, bufsz * sizeof(int));
453 						if (buf == NULL)
454 							FATAL("out of space for character class [%.10s...] 2", p);
455 						bp = buf + i;
456 					}
457 					*bp++ = ++c;
458 					i++;
459 				}
460 				continue;
461 			}
462 		}
463 		if (i >= bufsz) {
464 			bufsz *= 2;
465 			buf = (int *) realloc(buf, bufsz * sizeof(int));
466 			if (buf == NULL)
467 				FATAL("out of space for character class [%.10s...] 2", p);
468 			bp = buf + i;
469 		}
470 		*bp++ = c;
471 		i++;
472 	}
473 	*bp = 0;
474 	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
475 	/* xfree(op);  BUG: what are we freeing here? */
476 	retp = (int *) calloc(bp-buf+1, sizeof(int));
477 	for (i = 0; i < bp-buf+1; i++)
478 		retp[i] = buf[i];
479 	return retp;
480 }
481 
overflo(const char * s)482 void overflo(const char *s)
483 {
484 	FATAL("regular expression too big: out of space in %.30s...", s);
485 }
486 
cfoll(fa * f,Node * v)487 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
488 {
489 	int i;
490 	int *p;
491 
492 	switch (type(v)) {
493 	ELEAF
494 	LEAF
495 		f->re[info(v)].ltype = type(v);
496 		f->re[info(v)].lval.np = right(v);
497 		while (f->accept >= maxsetvec) {	/* guessing here! */
498 			resizesetvec(__func__);
499 		}
500 		for (i = 0; i <= f->accept; i++)
501 			setvec[i] = 0;
502 		setcnt = 0;
503 		follow(v);	/* computes setvec and setcnt */
504 		p = intalloc(setcnt + 1, __func__);
505 		f->re[info(v)].lfollow = p;
506 		*p = setcnt;
507 		for (i = f->accept; i >= 0; i--)
508 			if (setvec[i] == 1)
509 				*++p = i;
510 		break;
511 	UNARY
512 		cfoll(f,left(v));
513 		break;
514 	case CAT:
515 	case OR:
516 		cfoll(f,left(v));
517 		cfoll(f,right(v));
518 		break;
519 	case ZERO:
520 		break;
521 	default:	/* can't happen */
522 		FATAL("can't happen: unknown type %d in cfoll", type(v));
523 	}
524 }
525 
first(Node * p)526 int first(Node *p)	/* collects initially active leaves of p into setvec */
527 			/* returns 0 if p matches empty string */
528 {
529 	int b, lp;
530 
531 	switch (type(p)) {
532 	ELEAF
533 	LEAF
534 		lp = info(p);	/* look for high-water mark of subscripts */
535 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
536 			resizesetvec(__func__);
537 		}
538 		if (type(p) == EMPTYRE) {
539 			setvec[lp] = 0;
540 			return(0);
541 		}
542 		if (setvec[lp] != 1) {
543 			setvec[lp] = 1;
544 			setcnt++;
545 		}
546 		if (type(p) == CCL && (*(int *) right(p)) == 0)
547 			return(0);		/* empty CCL */
548 		return(1);
549 	case PLUS:
550 		if (first(left(p)) == 0)
551 			return(0);
552 		return(1);
553 	case STAR:
554 	case QUEST:
555 		first(left(p));
556 		return(0);
557 	case CAT:
558 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
559 		return(1);
560 	case OR:
561 		b = first(right(p));
562 		if (first(left(p)) == 0 || b == 0) return(0);
563 		return(1);
564 	case ZERO:
565 		return 0;
566 	}
567 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
568 	return(-1);
569 }
570 
follow(Node * v)571 void follow(Node *v)	/* collects leaves that can follow v into setvec */
572 {
573 	Node *p;
574 
575 	if (type(v) == FINAL)
576 		return;
577 	p = parent(v);
578 	switch (type(p)) {
579 	case STAR:
580 	case PLUS:
581 		first(v);
582 		follow(p);
583 		return;
584 
585 	case OR:
586 	case QUEST:
587 		follow(p);
588 		return;
589 
590 	case CAT:
591 		if (v == left(p)) {	/* v is left child of p */
592 			if (first(right(p)) == 0) {
593 				follow(p);
594 				return;
595 			}
596 		} else		/* v is right child */
597 			follow(p);
598 		return;
599 	}
600 }
601 
member(int c,int * sarg)602 int member(int c, int *sarg)	/* is c in s? */
603 {
604 	int *s = (int *) sarg;
605 
606 	while (*s)
607 		if (c == *s++)
608 			return(1);
609 	return(0);
610 }
611 
resize_gototab(fa * f,int state)612 static void resize_gototab(fa *f, int state)
613 {
614 	size_t new_size = f->gototab[state].allocated * 2;
615 	gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
616 	if (p == NULL)
617 		overflo(__func__);
618 
619 	// need to initialized the new memory to zero
620 	size_t orig_size = f->gototab[state].allocated;		// 2nd half of new mem is this size
621 	memset(p + orig_size, 0, orig_size * sizeof(gtte));	// clean it out
622 
623 	f->gototab[state].allocated = new_size;			// update gototab info
624 	f->gototab[state].entries = p;
625 }
626 
get_gototab(fa * f,int state,int ch)627 static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
628 {
629 	gtte key;
630 	gtte *item;
631 
632 	key.ch = ch;
633 	key.state = 0;	/* irrelevant */
634 	item = (gtte *) bsearch(& key, f->gototab[state].entries,
635 			f->gototab[state].inuse, sizeof(gtte),
636 			entry_cmp);
637 
638 	if (item == NULL)
639 		return 0;
640 	else
641 		return item->state;
642 }
643 
entry_cmp(const void * l,const void * r)644 static int entry_cmp(const void *l, const void *r)
645 {
646 	const gtte *left, *right;
647 
648 	left = (const gtte *) l;
649 	right = (const gtte *) r;
650 
651 	return left->ch - right->ch;
652 }
653 
set_gototab(fa * f,int state,int ch,int val)654 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
655 {
656 	if (f->gototab[state].inuse == 0) {
657 		f->gototab[state].entries[0].ch = ch;
658 		f->gototab[state].entries[0].state = val;
659 		f->gototab[state].inuse++;
660 		return val;
661 	} else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
662 		// not seen yet, insert and return
663 		gtt *tab = & f->gototab[state];
664 		if (tab->inuse + 1 >= tab->allocated)
665 			resize_gototab(f, state);
666 
667 		f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
668 		f->gototab[state].entries[f->gototab[state].inuse].state = val;
669 		f->gototab[state].inuse++;
670 		return val;
671 	} else {
672 		// maybe we have it, maybe we don't
673 		gtte key;
674 		gtte *item;
675 
676 		key.ch = ch;
677 		key.state = 0;	/* irrelevant */
678 		item = (gtte *) bsearch(& key, f->gototab[state].entries,
679 				f->gototab[state].inuse, sizeof(gtte),
680 				entry_cmp);
681 
682 		if (item != NULL) {
683 			// we have it, update state and return
684 			item->state = val;
685 			return item->state;
686 		}
687 		// otherwise, fall through to insert and reallocate.
688 	}
689 
690 	gtt *tab = & f->gototab[state];
691 	if (tab->inuse + 1 >= tab->allocated)
692 		resize_gototab(f, state);
693 	f->gototab[state].entries[tab->inuse].ch = ch;
694 	f->gototab[state].entries[tab->inuse].state = val;
695 	++tab->inuse;
696 
697 	qsort(f->gototab[state].entries,
698 		f->gototab[state].inuse, sizeof(gtte), entry_cmp);
699 
700 	return val; /* not used anywhere at the moment */
701 }
702 
clear_gototab(fa * f,int state)703 static void clear_gototab(fa *f, int state)
704 {
705 	memset(f->gototab[state].entries, 0,
706 		f->gototab[state].allocated * sizeof(gtte));
707 	f->gototab[state].inuse = 0;
708 }
709 
match(fa * f,const char * p0)710 int match(fa *f, const char *p0)	/* shortest match ? */
711 {
712 	int s, ns;
713 	int n;
714 	int rune;
715 	const uschar *p = (const uschar *) p0;
716 
717 	/* return pmatch(f, p0); does it matter whether longest or shortest? */
718 
719 	s = f->initstat;
720 	assert (s < f->state_count);
721 
722 	if (f->out[s])
723 		return(1);
724 	do {
725 		/* assert(*p < NCHARS); */
726 		n = u8_rune(&rune, (const char *) p);
727 		if ((ns = get_gototab(f, s, rune)) != 0)
728 			s = ns;
729 		else
730 			s = cgoto(f, s, rune);
731 		if (f->out[s])
732 			return(1);
733 		if (*p == 0)
734 			break;
735 		p += n;
736 	} while (1);  /* was *p++ != 0 */
737 	return(0);
738 }
739 
pmatch(fa * f,const char * p0)740 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
741 {
742 	int s, ns;
743 	int n;
744 	int rune;
745 	const uschar *p = (const uschar *) p0;
746 	const uschar *q;
747 
748 	s = f->initstat;
749 	assert(s < f->state_count);
750 
751 	patbeg = (const char *)p;
752 	patlen = -1;
753 	do {
754 		q = p;
755 		do {
756 			if (f->out[s])		/* final state */
757 				patlen = q-p;
758 			/* assert(*q < NCHARS); */
759 			n = u8_rune(&rune, (const char *) q);
760 			if ((ns = get_gototab(f, s, rune)) != 0)
761 				s = ns;
762 			else
763 				s = cgoto(f, s, rune);
764 
765 			assert(s < f->state_count);
766 
767 			if (s == 1) {	/* no transition */
768 				if (patlen >= 0) {
769 					patbeg = (const char *) p;
770 					return(1);
771 				}
772 				else
773 					goto nextin;	/* no match */
774 			}
775 			if (*q == 0)
776 				break;
777 			q += n;
778 		} while (1);
779 		q++;  /* was *q++ */
780 		if (f->out[s])
781 			patlen = q-p-1;	/* don't count $ */
782 		if (patlen >= 0) {
783 			patbeg = (const char *) p;
784 			return(1);
785 		}
786 	nextin:
787 		s = 2;
788 		if (*p == 0)
789 			break;
790 		n = u8_rune(&rune, (const char *) p);
791 		p += n;
792 	} while (1); /* was *p++ */
793 	return (0);
794 }
795 
nematch(fa * f,const char * p0)796 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
797 {
798 	int s, ns;
799         int n;
800         int rune;
801 	const uschar *p = (const uschar *) p0;
802 	const uschar *q;
803 
804 	s = f->initstat;
805 	assert(s < f->state_count);
806 
807 	patbeg = (const char *)p;
808 	patlen = -1;
809 	while (*p) {
810 		q = p;
811 		do {
812 			if (f->out[s])		/* final state */
813 				patlen = q-p;
814 			/* assert(*q < NCHARS); */
815 			n = u8_rune(&rune, (const char *) q);
816 			if ((ns = get_gototab(f, s, rune)) != 0)
817 				s = ns;
818 			else
819 				s = cgoto(f, s, rune);
820 			if (s == 1) {	/* no transition */
821 				if (patlen > 0) {
822 					patbeg = (const char *) p;
823 					return(1);
824 				} else
825 					goto nnextin;	/* no nonempty match */
826 			}
827 			if (*q == 0)
828 				break;
829 			q += n;
830 		} while (1);
831 		q++;
832 		if (f->out[s])
833 			patlen = q-p-1;	/* don't count $ */
834 		if (patlen > 0 ) {
835 			patbeg = (const char *) p;
836 			return(1);
837 		}
838 	nnextin:
839 		s = 2;
840 		p++;
841 	}
842 	return (0);
843 }
844 
845 
846 /*
847  * NAME
848  *     fnematch
849  *
850  * DESCRIPTION
851  *     A stream-fed version of nematch which transfers characters to a
852  *     null-terminated buffer. All characters up to and including the last
853  *     character of the matching text or EOF are placed in the buffer. If
854  *     a match is found, patbeg and patlen are set appropriately.
855  *
856  * RETURN VALUES
857  *     false    No match found.
858  *     true     Match found.
859  */
860 
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)861 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
862 {
863 	char *i, *j, *k, *buf = *pbuf;
864 	int bufsize = *pbufsize;
865 	int c, n, ns, s;
866 
867 	s = pfa->initstat;
868 	patlen = 0;
869 
870 	/*
871 	 * buf <= i <= j <= k <= buf+bufsize
872 	 *
873 	 * i: origin of active substring
874 	 * j: current character
875 	 * k: destination of the next getc
876 	 */
877 
878 	i = j = k = buf;
879 
880 	do {
881 		/*
882 		 * Call u8_rune with at least awk_mb_cur_max ahead in
883 		 * the buffer until EOF interferes.
884 		 */
885 		if (k - j < (int)awk_mb_cur_max) {
886 			if (k + awk_mb_cur_max > buf + bufsize) {
887 				char *obuf = buf;
888 				adjbuf((char **) &buf, &bufsize,
889 				    bufsize + awk_mb_cur_max,
890 				    quantum, 0, "fnematch");
891 
892 				/* buf resized, maybe moved. update pointers */
893 				*pbufsize = bufsize;
894 				if (obuf != buf) {
895 					i = buf + (i - obuf);
896 					j = buf + (j - obuf);
897 					k = buf + (k - obuf);
898 					*pbuf = buf;
899 					if (patlen)
900 						patbeg = buf + (patbeg - obuf);
901 				}
902 			}
903 			for (n = awk_mb_cur_max ; n > 0; n--) {
904 				*k++ = (c = getc(f)) != EOF ? c : 0;
905 				if (c == EOF) {
906 					if (ferror(f))
907 						FATAL("fnematch: getc error");
908 					break;
909 				}
910 			}
911 		}
912 
913 		j += u8_rune(&c, j);
914 
915 		if ((ns = get_gototab(pfa, s, c)) != 0)
916 			s = ns;
917 		else
918 			s = cgoto(pfa, s, c);
919 
920 		if (pfa->out[s]) {	/* final state */
921 			patbeg = i;
922 			patlen = j - i;
923 			if (c == 0)	/* don't count $ */
924 				patlen--;
925 		}
926 
927 		if (c && s != 1)
928 			continue;  /* origin i still viable, next j */
929 		if (patlen)
930 			break;     /* best match found */
931 
932 		/* no match at origin i, next i and start over */
933 		i += u8_rune(&c, i);
934 		if (c == 0)
935 			break;    /* no match */
936 		j = i;
937 		s = 2;
938 	} while (1);
939 
940 	if (patlen) {
941 		/*
942 		 * Under no circumstances is the last character fed to
943 		 * the automaton part of the match. It is EOF's nullbyte,
944 		 * or it sent the automaton into a state with no further
945 		 * transitions available (s==1), or both. Room for a
946 		 * terminating nullbyte is guaranteed.
947 		 *
948 		 * ungetc any chars after the end of matching text
949 		 * (except for EOF's nullbyte, if present) and null
950 		 * terminate the buffer.
951 		 */
952 		do
953 			if (*--k && ungetc(*k, f) == EOF)
954 				FATAL("unable to ungetc '%c'", *k);
955 		while (k > patbeg + patlen);
956 		*k = '\0';
957 		return true;
958 	}
959 	else
960 		return false;
961 }
962 
reparse(const char * p)963 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
964 {			/* uses relex() to scan regular expression */
965 	Node *np;
966 
967 	DPRINTF("reparse <%s>\n", p);
968 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
969 	rtok = relex();
970 	/* GNU compatibility: an empty regexp matches anything */
971 	if (rtok == '\0') {
972 		/* FATAL("empty regular expression"); previous */
973 		return(op2(EMPTYRE, NIL, NIL));
974 	}
975 	np = regexp();
976 	if (rtok != '\0')
977 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
978 	return(np);
979 }
980 
regexp(void)981 Node *regexp(void)	/* top-level parse of reg expr */
982 {
983 	return (alt(concat(primary())));
984 }
985 
primary(void)986 Node *primary(void)
987 {
988 	Node *np;
989 	int savelastatom;
990 
991 	switch (rtok) {
992 	case CHAR:
993 		lastatom = starttok;
994 		np = op2(CHAR, NIL, itonp(rlxval));
995 		rtok = relex();
996 		return (unary(np));
997 	case ALL:
998 		rtok = relex();
999 		return (unary(op2(ALL, NIL, NIL)));
1000 	case EMPTYRE:
1001 		rtok = relex();
1002 		return (unary(op2(EMPTYRE, NIL, NIL)));
1003 	case DOT:
1004 		lastatom = starttok;
1005 		rtok = relex();
1006 		return (unary(op2(DOT, NIL, NIL)));
1007 	case CCL:
1008 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1009 		lastatom = starttok;
1010 		rtok = relex();
1011 		return (unary(np));
1012 	case NCCL:
1013 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1014 		lastatom = starttok;
1015 		rtok = relex();
1016 		return (unary(np));
1017 	case '^':
1018 		rtok = relex();
1019 		return (unary(op2(CHAR, NIL, itonp(HAT))));
1020 	case '$':
1021 		rtok = relex();
1022 		return (unary(op2(CHAR, NIL, NIL)));
1023 	case '(':
1024 		lastatom = starttok;
1025 		savelastatom = starttok - basestr; /* Retain over recursion */
1026 		rtok = relex();
1027 		if (rtok == ')') {	/* special pleading for () */
1028 			rtok = relex();
1029 			return unary(op2(CCL, NIL, (Node *) cclenter("")));
1030 		}
1031 		np = regexp();
1032 		if (rtok == ')') {
1033 			lastatom = basestr + savelastatom; /* Restore */
1034 			rtok = relex();
1035 			return (unary(np));
1036 		}
1037 		else
1038 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1039 	default:
1040 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1041 	}
1042 	return 0;	/*NOTREACHED*/
1043 }
1044 
concat(Node * np)1045 Node *concat(Node *np)
1046 {
1047 	switch (rtok) {
1048 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1049 		return (concat(op2(CAT, np, primary())));
1050 	case EMPTYRE:
1051 		rtok = relex();
1052 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1053 				primary())));
1054 	}
1055 	return (np);
1056 }
1057 
alt(Node * np)1058 Node *alt(Node *np)
1059 {
1060 	if (rtok == OR) {
1061 		rtok = relex();
1062 		return (alt(op2(OR, np, concat(primary()))));
1063 	}
1064 	return (np);
1065 }
1066 
unary(Node * np)1067 Node *unary(Node *np)
1068 {
1069 	switch (rtok) {
1070 	case STAR:
1071 		rtok = relex();
1072 		return (unary(op2(STAR, np, NIL)));
1073 	case PLUS:
1074 		rtok = relex();
1075 		return (unary(op2(PLUS, np, NIL)));
1076 	case QUEST:
1077 		rtok = relex();
1078 		return (unary(op2(QUEST, np, NIL)));
1079 	case ZERO:
1080 		rtok = relex();
1081 		return (unary(op2(ZERO, np, NIL)));
1082 	default:
1083 		return (np);
1084 	}
1085 }
1086 
1087 /*
1088  * Character class definitions conformant to the POSIX locale as
1089  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1090  * and operating character sets are both ASCII (ISO646) or supersets
1091  * thereof.
1092  *
1093  * Note that to avoid overflowing the temporary buffer used in
1094  * relex(), the expanded character class (prior to range expansion)
1095  * must be less than twice the size of their full name.
1096  */
1097 
1098 /* Because isblank doesn't show up in any of the header files on any
1099  * system i use, it's defined here.  if some other locale has a richer
1100  * definition of "blank", define HAS_ISBLANK and provide your own
1101  * version.
1102  * the parentheses here are an attempt to find a path through the maze
1103  * of macro definition and/or function and/or version provided.  thanks
1104  * to nelson beebe for the suggestion; let's see if it works everywhere.
1105  */
1106 
1107 /* #define HAS_ISBLANK */
1108 #ifndef HAS_ISBLANK
1109 
1110 int (xisblank)(int c)
1111 {
1112 	return c==' ' || c=='\t';
1113 }
1114 
1115 #endif
1116 
1117 static const struct charclass {
1118 	const char *cc_name;
1119 	int cc_namelen;
1120 	int (*cc_func)(int);
1121 } charclasses[] = {
1122 	{ "alnum",	5,	isalnum },
1123 	{ "alpha",	5,	isalpha },
1124 #ifndef HAS_ISBLANK
1125 	{ "blank",	5,	xisblank },
1126 #else
1127 	{ "blank",	5,	isblank },
1128 #endif
1129 	{ "cntrl",	5,	iscntrl },
1130 	{ "digit",	5,	isdigit },
1131 	{ "graph",	5,	isgraph },
1132 	{ "lower",	5,	islower },
1133 	{ "print",	5,	isprint },
1134 	{ "punct",	5,	ispunct },
1135 	{ "space",	5,	isspace },
1136 	{ "upper",	5,	isupper },
1137 	{ "xdigit",	6,	isxdigit },
1138 	{ NULL,		0,	NULL },
1139 };
1140 
1141 #define REPEAT_SIMPLE		0
1142 #define REPEAT_PLUS_APPENDED	1
1143 #define REPEAT_WITH_Q		2
1144 #define REPEAT_ZERO		3
1145 
1146 static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)1147 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1148 	       int atomlen, int firstnum, int secondnum, int special_case)
1149 {
1150 	int i, j;
1151 	uschar *buf = 0;
1152 	int ret = 1;
1153 	int init_q = (firstnum == 0);		/* first added char will be ? */
1154 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
1155 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
1156 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
1157 	int size = prefix_length +  suffix_length;
1158 
1159 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
1160 		size += atomlen*(firstnum-1);
1161 	}
1162 
1163 	/* Adjust size of buffer for special cases */
1164 	if (special_case == REPEAT_PLUS_APPENDED) {
1165 		size++;		/* for the final + */
1166 	} else if (special_case == REPEAT_WITH_Q) {
1167 		size += init_q + (atomlen+1)* (n_q_reps-init_q);
1168 	} else if (special_case == REPEAT_ZERO) {
1169 		size += 2;	/* just a null ERE: () */
1170 	}
1171 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
1172 		FATAL("out of space in reg expr %.10s..", lastre);
1173 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
1174 	j = prefix_length;
1175 	if (special_case == REPEAT_ZERO) {
1176 		j -= atomlen;
1177 		buf[j++] = '(';
1178 		buf[j++] = ')';
1179 	}
1180 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
1181 		memcpy(&buf[j], atom, atomlen);
1182 		j += atomlen;
1183 	}
1184 	if (special_case == REPEAT_PLUS_APPENDED) {
1185 		buf[j++] = '+';
1186 	} else if (special_case == REPEAT_WITH_Q) {
1187 		if (init_q)
1188 			buf[j++] = '?';
1189 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
1190 			memcpy(&buf[j], atom, atomlen);
1191 			j += atomlen;
1192 			buf[j++] = '?';
1193 		}
1194 	}
1195 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
1196 	j += suffix_length;
1197 	buf[j] = '\0';
1198 	/* free old basestr */
1199 	if (firstbasestr != basestr) {
1200 		if (basestr)
1201 			xfree(basestr);
1202 	}
1203 	basestr = buf;
1204 	prestr  = buf + prefix_length;
1205 	if (special_case == REPEAT_ZERO) {
1206 		prestr  -= atomlen;
1207 		ret++;
1208 	}
1209 	return ret;
1210 }
1211 
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)1212 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1213 		  int atomlen, int firstnum, int secondnum)
1214 {
1215 	/*
1216 	   In general, the repetition specifier or "bound" is replaced here
1217 	   by an equivalent ERE string, repeating the immediately previous atom
1218 	   and appending ? and + as needed. Note that the first copy of the
1219 	   atom is left in place, except in the special_case of a zero-repeat
1220 	   (i.e., {0}).
1221 	 */
1222 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1223 		if (firstnum < 2) {
1224 			/* 0 or 1: should be handled before you get here */
1225 			FATAL("internal error");
1226 		} else {
1227 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1228 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1229 		}
1230 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1231 		if (firstnum == 0) {	/* {0} or {0,0} */
1232 			/* This case is unusual because the resulting
1233 			   replacement string might actually be SMALLER than
1234 			   the original ERE */
1235 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1236 					firstnum, secondnum, REPEAT_ZERO);
1237 		} else {		/* (firstnum >= 1) */
1238 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1239 					firstnum, secondnum, REPEAT_SIMPLE);
1240 		}
1241 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1242 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1243 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1244 					firstnum, secondnum, REPEAT_WITH_Q);
1245 	} else {	/* Error - shouldn't be here (n>m) */
1246 		FATAL("internal error");
1247 	}
1248 	return 0;
1249 }
1250 
relex(void)1251 int relex(void)		/* lexical analyzer for reparse */
1252 {
1253 	int c, n;
1254 	int cflag;
1255 	static uschar *buf = NULL;
1256 	static int bufsz = 100;
1257 	uschar *bp;
1258 	const struct charclass *cc;
1259 	int i;
1260 	int num, m;
1261 	bool commafound, digitfound;
1262 	const uschar *startreptok;
1263 	static int parens = 0;
1264 
1265 rescan:
1266 	starttok = prestr;
1267 
1268 	if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1269 		prestr += n;
1270 		starttok = prestr;
1271 		return CHAR;
1272 	}
1273 
1274 	switch (c = *prestr++) {
1275 	case '|': return OR;
1276 	case '*': return STAR;
1277 	case '+': return PLUS;
1278 	case '?': return QUEST;
1279 	case '.': return DOT;
1280 	case '\0': prestr--; return '\0';
1281 	case '^':
1282 	case '$':
1283 		return c;
1284 	case '(':
1285 		parens++;
1286 		return c;
1287 	case ')':
1288 		if (parens) {
1289 			parens--;
1290 			return c;
1291 		}
1292 		/* unmatched close parenthesis; per POSIX, treat as literal */
1293 		rlxval = c;
1294 		return CHAR;
1295 	case '\\':
1296 		rlxval = quoted(&prestr);
1297 		return CHAR;
1298 	default:
1299 		rlxval = c;
1300 		return CHAR;
1301 	case '[':
1302 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1303 			FATAL("out of space in reg expr %.10s..", lastre);
1304 		bp = buf;
1305 		if (*prestr == '^') {
1306 			cflag = 1;
1307 			prestr++;
1308 		}
1309 		else
1310 			cflag = 0;
1311 		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1312 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1313 			FATAL("out of space for reg expr %.10s...", lastre);
1314 		for (; ; ) {
1315 			if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1316 				for (i = 0; i < n; i++)
1317 					*bp++ = *prestr++;
1318 				continue;
1319 			}
1320 			if ((c = *prestr++) == '\\') {
1321 				*bp++ = '\\';
1322 				if ((c = *prestr++) == '\0')
1323 					FATAL("nonterminated character class %.20s...", lastre);
1324 				*bp++ = c;
1325 			/* } else if (c == '\n') { */
1326 			/* 	FATAL("newline in character class %.20s...", lastre); */
1327 			} else if (c == '[' && *prestr == ':') {
1328 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1329 				for (cc = charclasses; cc->cc_name; cc++)
1330 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1331 						break;
1332 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1333 				    prestr[2 + cc->cc_namelen] == ']') {
1334 					prestr += cc->cc_namelen + 3;
1335 					/*
1336 					 * BUG: We begin at 1, instead of 0, since we
1337 					 * would otherwise prematurely terminate the
1338 					 * string for classes like [[:cntrl:]]. This
1339 					 * means that we can't match the NUL character,
1340 					 * not without first adapting the entire
1341 					 * program to track each string's length.
1342 					 */
1343 					for (i = 1; i <= UCHAR_MAX; i++) {
1344 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1345 						    FATAL("out of space for reg expr %.10s...", lastre);
1346 						if (cc->cc_func(i)) {
1347 							/* escape backslash */
1348 							if (i == '\\') {
1349 								*bp++ = '\\';
1350 								n++;
1351 							}
1352 
1353 							*bp++ = i;
1354 							n++;
1355 						}
1356 					}
1357 				} else
1358 					*bp++ = c;
1359 			} else if (c == '[' && *prestr == '.') {
1360 				char collate_char;
1361 				prestr++;
1362 				collate_char = *prestr++;
1363 				if (*prestr == '.' && prestr[1] == ']') {
1364 					prestr += 2;
1365 					/* Found it: map via locale TBD: for
1366 					   now, simply return this char.  This
1367 					   is sufficient to pass conformance
1368 					   test awk.ex 156
1369 					 */
1370 					if (*prestr == ']') {
1371 						prestr++;
1372 						rlxval = collate_char;
1373 						return CHAR;
1374 					}
1375 				}
1376 			} else if (c == '[' && *prestr == '=') {
1377 				char equiv_char;
1378 				prestr++;
1379 				equiv_char = *prestr++;
1380 				if (*prestr == '=' && prestr[1] == ']') {
1381 					prestr += 2;
1382 					/* Found it: map via locale TBD: for now
1383 					   simply return this char. This is
1384 					   sufficient to pass conformance test
1385 					   awk.ex 156
1386 					 */
1387 					if (*prestr == ']') {
1388 						prestr++;
1389 						rlxval = equiv_char;
1390 						return CHAR;
1391 					}
1392 				}
1393 			} else if (c == '\0') {
1394 				FATAL("nonterminated character class %.20s", lastre);
1395 			} else if (bp == buf) {	/* 1st char is special */
1396 				*bp++ = c;
1397 			} else if (c == ']') {
1398 				*bp++ = 0;
1399 				rlxstr = (uschar *) tostring((char *) buf);
1400 				if (cflag == 0)
1401 					return CCL;
1402 				else
1403 					return NCCL;
1404 			} else
1405 				*bp++ = c;
1406 		}
1407 		break;
1408 	case '{':
1409 		if (isdigit((int) *(prestr))) {
1410 			num = 0;	/* Process as a repetition */
1411 			n = -1; m = -1;
1412 			commafound = false;
1413 			digitfound = false;
1414 			startreptok = prestr-1;
1415 			/* Remember start of previous atom here ? */
1416 		} else {        	/* just a { char, not a repetition */
1417 			rlxval = c;
1418 			return CHAR;
1419                 }
1420 		for (; ; ) {
1421 			if ((c = *prestr++) == '}') {
1422 				if (commafound) {
1423 					if (digitfound) { /* {n,m} */
1424 						m = num;
1425 						if (m < n)
1426 							FATAL("illegal repetition expression: class %.20s",
1427 								lastre);
1428 						if (n == 0 && m == 1) {
1429 							return QUEST;
1430 						}
1431 					} else {	/* {n,} */
1432 						if (n == 0)
1433 							return STAR;
1434 						else if (n == 1)
1435 							return PLUS;
1436 					}
1437 				} else {
1438 					if (digitfound) { /* {n} same as {n,n} */
1439 						n = num;
1440 						m = num;
1441 					} else {	/* {} */
1442 						FATAL("illegal repetition expression: class %.20s",
1443 							lastre);
1444 					}
1445 				}
1446 				if (repeat(starttok, prestr-starttok, lastatom,
1447 					   startreptok - lastatom, n, m) > 0) {
1448 					if (n == 0 && m == 0) {
1449 						return ZERO;
1450 					}
1451 					/* must rescan input for next token */
1452 					goto rescan;
1453 				}
1454 				/* Failed to replace: eat up {...} characters
1455 				   and treat like just PLUS */
1456 				return PLUS;
1457 			} else if (c == '\0') {
1458 				FATAL("nonterminated character class %.20s",
1459 					lastre);
1460 			} else if (isdigit(c)) {
1461 				num = 10 * num + c - '0';
1462 				digitfound = true;
1463 			} else if (c == ',') {
1464 				if (commafound)
1465 					FATAL("illegal repetition expression: class %.20s",
1466 						lastre);
1467 				/* looking for {n,} or {n,m} */
1468 				commafound = true;
1469 				n = num;
1470 				digitfound = false; /* reset */
1471 				num = 0;
1472 			} else {
1473 				FATAL("illegal repetition expression: class %.20s",
1474 					lastre);
1475 			}
1476 		}
1477 		break;
1478 	}
1479 }
1480 
cgoto(fa * f,int s,int c)1481 int cgoto(fa *f, int s, int c)
1482 {
1483 	int *p, *q;
1484 	int i, j, k;
1485 
1486 	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
1487 	while (f->accept >= maxsetvec) {	/* guessing here! */
1488 		resizesetvec(__func__);
1489 	}
1490 	for (i = 0; i <= f->accept; i++)
1491 		setvec[i] = 0;
1492 	setcnt = 0;
1493 	resize_state(f, s);
1494 	/* compute positions of gototab[s,c] into setvec */
1495 	p = f->posns[s];
1496 	for (i = 1; i <= *p; i++) {
1497 		if ((k = f->re[p[i]].ltype) != FINAL) {
1498 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1499 			 || (k == DOT && c != 0 && c != HAT)
1500 			 || (k == ALL && c != 0)
1501 			 || (k == EMPTYRE && c != 0)
1502 			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1503 			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1504 				q = f->re[p[i]].lfollow;
1505 				for (j = 1; j <= *q; j++) {
1506 					if (q[j] >= maxsetvec) {
1507 						resizesetvec(__func__);
1508 					}
1509 					if (setvec[q[j]] == 0) {
1510 						setcnt++;
1511 						setvec[q[j]] = 1;
1512 					}
1513 				}
1514 			}
1515 		}
1516 	}
1517 	/* determine if setvec is a previous state */
1518 	tmpset[0] = setcnt;
1519 	j = 1;
1520 	for (i = f->accept; i >= 0; i--)
1521 		if (setvec[i]) {
1522 			tmpset[j++] = i;
1523 		}
1524 	resize_state(f, f->curstat > s ? f->curstat : s);
1525 	/* tmpset == previous state? */
1526 	for (i = 1; i <= f->curstat; i++) {
1527 		p = f->posns[i];
1528 		if ((k = tmpset[0]) != p[0])
1529 			goto different;
1530 		for (j = 1; j <= k; j++)
1531 			if (tmpset[j] != p[j])
1532 				goto different;
1533 		/* setvec is state i */
1534 		if (c != HAT)
1535 			set_gototab(f, s, c, i);
1536 		return i;
1537 	  different:;
1538 	}
1539 
1540 	/* add tmpset to current set of states */
1541 	++(f->curstat);
1542 	resize_state(f, f->curstat);
1543 	clear_gototab(f, f->curstat);
1544 	xfree(f->posns[f->curstat]);
1545 	p = intalloc(setcnt + 1, __func__);
1546 
1547 	f->posns[f->curstat] = p;
1548 	if (c != HAT)
1549 		set_gototab(f, s, c, f->curstat);
1550 	for (i = 0; i <= setcnt; i++)
1551 		p[i] = tmpset[i];
1552 	if (setvec[f->accept])
1553 		f->out[f->curstat] = 1;
1554 	else
1555 		f->out[f->curstat] = 0;
1556 	return f->curstat;
1557 }
1558 
1559 
freefa(fa * f)1560 void freefa(fa *f)	/* free a finite automaton */
1561 {
1562 	int i;
1563 
1564 	if (f == NULL)
1565 		return;
1566 	for (i = 0; i < f->state_count; i++)
1567 		xfree(f->gototab[i].entries);
1568 	xfree(f->gototab);
1569 	for (i = 0; i <= f->curstat; i++)
1570 		xfree(f->posns[i]);
1571 	for (i = 0; i <= f->accept; i++) {
1572 		xfree(f->re[i].lfollow);
1573 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1574 			xfree(f->re[i].lval.np);
1575 	}
1576 	xfree(f->restr);
1577 	xfree(f->out);
1578 	xfree(f->posns);
1579 	xfree(f->gototab);
1580 	xfree(f);
1581 }
1582