xref: /freebsd/contrib/one-true-awk/b.c (revision 6f63e88c0166ed3e5f2805a9e667c7d24d304cf1)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #define	DEBUG
31 
32 #include <ctype.h>
33 #include <limits.h>
34 #include <stdio.h>
35 #include <string.h>
36 #include <stdlib.h>
37 #include "awk.h"
38 #include "ytab.h"
39 
40 #define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
41 				/* NCHARS is 2**n */
42 #define MAXLIN 22
43 
44 #define type(v)		(v)->nobj	/* badly overloaded here */
45 #define info(v)		(v)->ntype	/* badly overloaded here */
46 #define left(v)		(v)->narg[0]
47 #define right(v)	(v)->narg[1]
48 #define parent(v)	(v)->nnext
49 
50 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
51 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
52 #define UNARY	case STAR: case PLUS: case QUEST:
53 
54 /* encoding in tree Nodes:
55 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
56 		left is index, right contains value or pointer to value
57 	unary (STAR, PLUS, QUEST): left is child, right is null
58 	binary (CAT, OR): left and right are children
59 	parent contains pointer to parent
60 */
61 
62 
63 int	*setvec;
64 int	*tmpset;
65 int	maxsetvec = 0;
66 
67 int	rtok;		/* next token in current re */
68 int	rlxval;
69 static uschar	*rlxstr;
70 static uschar	*prestr;	/* current position in current re */
71 static uschar	*lastre;	/* origin of last re */
72 static uschar	*lastatom;	/* origin of last Atom */
73 static uschar	*starttok;
74 static uschar 	*basestr;	/* starts with original, replaced during
75 				   repetition processing */
76 static uschar 	*firstbasestr;
77 
78 static	int setcnt;
79 static	int poscnt;
80 
81 char	*patbeg;
82 int	patlen;
83 
84 #define	NFA	20	/* cache this many dynamic fa's */
85 fa	*fatab[NFA];
86 int	nfatab	= 0;	/* entries in fatab */
87 
88 fa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
89 {
90 	int i, use, nuse;
91 	fa *pfa;
92 	static int now = 1;
93 
94 	if (setvec == NULL) {	/* first time through any RE */
95 		maxsetvec = MAXLIN;
96 		setvec = (int *) malloc(maxsetvec * sizeof(int));
97 		tmpset = (int *) malloc(maxsetvec * sizeof(int));
98 		if (setvec == NULL || tmpset == NULL)
99 			overflo("out of space initializing makedfa");
100 	}
101 
102 	if (compile_time)	/* a constant for sure */
103 		return mkdfa(s, anchor);
104 	for (i = 0; i < nfatab; i++)	/* is it there already? */
105 		if (fatab[i]->anchor == anchor
106 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
107 			fatab[i]->use = now++;
108 			return fatab[i];
109 		}
110 	pfa = mkdfa(s, anchor);
111 	if (nfatab < NFA) {	/* room for another */
112 		fatab[nfatab] = pfa;
113 		fatab[nfatab]->use = now++;
114 		nfatab++;
115 		return pfa;
116 	}
117 	use = fatab[0]->use;	/* replace least-recently used */
118 	nuse = 0;
119 	for (i = 1; i < nfatab; i++)
120 		if (fatab[i]->use < use) {
121 			use = fatab[i]->use;
122 			nuse = i;
123 		}
124 	freefa(fatab[nuse]);
125 	fatab[nuse] = pfa;
126 	pfa->use = now++;
127 	return pfa;
128 }
129 
130 fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
131 				/* anchor = 1 for anchored matches, else 0 */
132 {
133 	Node *p, *p1;
134 	fa *f;
135 
136 	firstbasestr = (uschar *) s;
137 	basestr = firstbasestr;
138 	p = reparse(s);
139 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
140 		/* put ALL STAR in front of reg.  exp. */
141 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
142 		/* put FINAL after reg.  exp. */
143 
144 	poscnt = 0;
145 	penter(p1);	/* enter parent pointers and leaf indices */
146 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
147 		overflo("out of space for fa");
148 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
149 	cfoll(f, p1);	/* set up follow sets */
150 	freetr(p1);
151 	if ((f->posns[0] = (int *) calloc(*(f->re[0].lfollow), sizeof(int))) == NULL)
152 			overflo("out of space in makedfa");
153 	if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
154 		overflo("out of space in makedfa");
155 	*f->posns[1] = 0;
156 	f->initstat = makeinit(f, anchor);
157 	f->anchor = anchor;
158 	f->restr = (uschar *) tostring(s);
159 	if (firstbasestr != basestr) {
160 		if (basestr)
161 			xfree(basestr);
162 	}
163 	return f;
164 }
165 
166 int makeinit(fa *f, int anchor)
167 {
168 	int i, k;
169 
170 	f->curstat = 2;
171 	f->out[2] = 0;
172 	f->reset = 0;
173 	k = *(f->re[0].lfollow);
174 	xfree(f->posns[2]);
175 	if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
176 		overflo("out of space in makeinit");
177 	for (i=0; i <= k; i++) {
178 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
179 	}
180 	if ((f->posns[2])[1] == f->accept)
181 		f->out[2] = 1;
182 	for (i=0; i < NCHARS; i++)
183 		f->gototab[2][i] = 0;
184 	f->curstat = cgoto(f, 2, HAT);
185 	if (anchor) {
186 		*f->posns[2] = k-1;	/* leave out position 0 */
187 		for (i=0; i < k; i++) {
188 			(f->posns[0])[i] = (f->posns[2])[i];
189 		}
190 
191 		f->out[0] = f->out[2];
192 		if (f->curstat != 2)
193 			--(*f->posns[f->curstat]);
194 	}
195 	return f->curstat;
196 }
197 
198 void penter(Node *p)	/* set up parent pointers and leaf indices */
199 {
200 	switch (type(p)) {
201 	ELEAF
202 	LEAF
203 		info(p) = poscnt;
204 		poscnt++;
205 		break;
206 	UNARY
207 		penter(left(p));
208 		parent(left(p)) = p;
209 		break;
210 	case CAT:
211 	case OR:
212 		penter(left(p));
213 		penter(right(p));
214 		parent(left(p)) = p;
215 		parent(right(p)) = p;
216 		break;
217 	default:	/* can't happen */
218 		FATAL("can't happen: unknown type %d in penter", type(p));
219 		break;
220 	}
221 }
222 
223 void freetr(Node *p)	/* free parse tree */
224 {
225 	switch (type(p)) {
226 	ELEAF
227 	LEAF
228 		xfree(p);
229 		break;
230 	UNARY
231 		freetr(left(p));
232 		xfree(p);
233 		break;
234 	case CAT:
235 	case OR:
236 		freetr(left(p));
237 		freetr(right(p));
238 		xfree(p);
239 		break;
240 	default:	/* can't happen */
241 		FATAL("can't happen: unknown type %d in freetr", type(p));
242 		break;
243 	}
244 }
245 
246 /* in the parsing of regular expressions, metacharacters like . have */
247 /* to be seen literally;  \056 is not a metacharacter. */
248 
249 int hexstr(uschar **pp)	/* find and eval hex string at pp, return new p */
250 {			/* only pick up one 8-bit byte (2 chars) */
251 	uschar *p;
252 	int n = 0;
253 	int i;
254 
255 	for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
256 		if (isdigit(*p))
257 			n = 16 * n + *p - '0';
258 		else if (*p >= 'a' && *p <= 'f')
259 			n = 16 * n + *p - 'a' + 10;
260 		else if (*p >= 'A' && *p <= 'F')
261 			n = 16 * n + *p - 'A' + 10;
262 	}
263 	*pp = (uschar *) p;
264 	return n;
265 }
266 
267 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
268 
269 int quoted(uschar **pp)	/* pick up next thing after a \\ */
270 			/* and increment *pp */
271 {
272 	uschar *p = *pp;
273 	int c;
274 
275 	if ((c = *p++) == 't')
276 		c = '\t';
277 	else if (c == 'n')
278 		c = '\n';
279 	else if (c == 'f')
280 		c = '\f';
281 	else if (c == 'r')
282 		c = '\r';
283 	else if (c == 'b')
284 		c = '\b';
285 	else if (c == '\\')
286 		c = '\\';
287 	else if (c == 'x') {	/* hexadecimal goo follows */
288 		c = hexstr(&p);	/* this adds a null if number is invalid */
289 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
290 		int n = c - '0';
291 		if (isoctdigit(*p)) {
292 			n = 8 * n + *p++ - '0';
293 			if (isoctdigit(*p))
294 				n = 8 * n + *p++ - '0';
295 		}
296 		c = n;
297 	} /* else */
298 		/* c = c; */
299 	*pp = p;
300 	return c;
301 }
302 
303 static int collate_range_cmp(int a, int b)
304 {
305 	static char s[2][2];
306 
307 	if ((uschar)a == (uschar)b)
308 		return 0;
309 	s[0][0] = a;
310 	s[1][0] = b;
311 	return (strcoll(s[0], s[1]));
312 }
313 
314 char *cclenter(const char *argp)	/* add a character class */
315 {
316 	int i, c, c2;
317 	int j;
318 	uschar *p = (uschar *) argp;
319 	uschar *op, *bp;
320 	static uschar *buf = NULL;
321 	static int bufsz = 100;
322 
323 	op = p;
324 	if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
325 		FATAL("out of space for character class [%.10s...] 1", p);
326 	bp = buf;
327 	for (i = 0; (c = *p++) != 0; ) {
328 		if (c == '\\') {
329 			c = quoted(&p);
330 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
331 			if (*p != 0) {
332 				c = bp[-1];
333 				c2 = *p++;
334 				if (c2 == '\\')
335 					c2 = quoted(&p);
336 				if (collate_range_cmp(c, c2) > 0) {
337 					bp--;
338 					i--;
339 					continue;
340 				}
341 				for (j = 0; j < NCHARS; j++) {
342 					if ((collate_range_cmp(c, j) > 0) ||
343 					    collate_range_cmp(j, c2) > 0)
344 						continue;
345 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
346 						FATAL("out of space for character class [%.10s...] 2", p);
347 					*bp++ = j;
348 					i++;
349 				}
350 				continue;
351 			}
352 		}
353 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
354 			FATAL("out of space for character class [%.10s...] 3", p);
355 		*bp++ = c;
356 		i++;
357 	}
358 	*bp = 0;
359 	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
360 	xfree(op);
361 	return (char *) tostring((char *) buf);
362 }
363 
364 void overflo(const char *s)
365 {
366 	FATAL("regular expression too big: %.30s...", s);
367 }
368 
369 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
370 {
371 	int i;
372 	int *p;
373 
374 	switch (type(v)) {
375 	ELEAF
376 	LEAF
377 		f->re[info(v)].ltype = type(v);
378 		f->re[info(v)].lval.np = right(v);
379 		while (f->accept >= maxsetvec) {	/* guessing here! */
380 			maxsetvec *= 4;
381 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
382 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
383 			if (setvec == NULL || tmpset == NULL)
384 				overflo("out of space in cfoll()");
385 		}
386 		for (i = 0; i <= f->accept; i++)
387 			setvec[i] = 0;
388 		setcnt = 0;
389 		follow(v);	/* computes setvec and setcnt */
390 		if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
391 			overflo("out of space building follow set");
392 		f->re[info(v)].lfollow = p;
393 		*p = setcnt;
394 		for (i = f->accept; i >= 0; i--)
395 			if (setvec[i] == 1)
396 				*++p = i;
397 		break;
398 	UNARY
399 		cfoll(f,left(v));
400 		break;
401 	case CAT:
402 	case OR:
403 		cfoll(f,left(v));
404 		cfoll(f,right(v));
405 		break;
406 	default:	/* can't happen */
407 		FATAL("can't happen: unknown type %d in cfoll", type(v));
408 	}
409 }
410 
411 int first(Node *p)	/* collects initially active leaves of p into setvec */
412 			/* returns 0 if p matches empty string */
413 {
414 	int b, lp;
415 
416 	switch (type(p)) {
417 	ELEAF
418 	LEAF
419 		lp = info(p);	/* look for high-water mark of subscripts */
420 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
421 			maxsetvec *= 4;
422 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
423 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
424 			if (setvec == NULL || tmpset == NULL)
425 				overflo("out of space in first()");
426 		}
427 		if (type(p) == EMPTYRE) {
428 			setvec[lp] = 0;
429 			return(0);
430 		}
431 		if (setvec[lp] != 1) {
432 			setvec[lp] = 1;
433 			setcnt++;
434 		}
435 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
436 			return(0);		/* empty CCL */
437 		else return(1);
438 	case PLUS:
439 		if (first(left(p)) == 0) return(0);
440 		return(1);
441 	case STAR:
442 	case QUEST:
443 		first(left(p));
444 		return(0);
445 	case CAT:
446 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
447 		return(1);
448 	case OR:
449 		b = first(right(p));
450 		if (first(left(p)) == 0 || b == 0) return(0);
451 		return(1);
452 	}
453 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
454 	return(-1);
455 }
456 
457 void follow(Node *v)	/* collects leaves that can follow v into setvec */
458 {
459 	Node *p;
460 
461 	if (type(v) == FINAL)
462 		return;
463 	p = parent(v);
464 	switch (type(p)) {
465 	case STAR:
466 	case PLUS:
467 		first(v);
468 		follow(p);
469 		return;
470 
471 	case OR:
472 	case QUEST:
473 		follow(p);
474 		return;
475 
476 	case CAT:
477 		if (v == left(p)) {	/* v is left child of p */
478 			if (first(right(p)) == 0) {
479 				follow(p);
480 				return;
481 			}
482 		} else		/* v is right child */
483 			follow(p);
484 		return;
485 	}
486 }
487 
488 int member(int c, const char *sarg)	/* is c in s? */
489 {
490 	uschar *s = (uschar *) sarg;
491 
492 	while (*s)
493 		if (c == *s++)
494 			return(1);
495 	return(0);
496 }
497 
498 int match(fa *f, const char *p0)	/* shortest match ? */
499 {
500 	int s, ns;
501 	uschar *p = (uschar *) p0;
502 
503 	s = f->reset ? makeinit(f,0) : f->initstat;
504 	if (f->out[s])
505 		return(1);
506 	do {
507 		/* assert(*p < NCHARS); */
508 		if ((ns = f->gototab[s][*p]) != 0)
509 			s = ns;
510 		else
511 			s = cgoto(f, s, *p);
512 		if (f->out[s])
513 			return(1);
514 	} while (*p++ != 0);
515 	return(0);
516 }
517 
518 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
519 {
520 	int s, ns;
521 	uschar *p = (uschar *) p0;
522 	uschar *q;
523 	int i, k;
524 
525 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
526 	if (f->reset) {
527 		f->initstat = s = makeinit(f,1);
528 	} else {
529 		s = f->initstat;
530 	}
531 	patbeg = (char *) p;
532 	patlen = -1;
533 	do {
534 		q = p;
535 		do {
536 			if (f->out[s])		/* final state */
537 				patlen = q-p;
538 			/* assert(*q < NCHARS); */
539 			if ((ns = f->gototab[s][*q]) != 0)
540 				s = ns;
541 			else
542 				s = cgoto(f, s, *q);
543 			if (s == 1) {	/* no transition */
544 				if (patlen >= 0) {
545 					patbeg = (char *) p;
546 					return(1);
547 				}
548 				else
549 					goto nextin;	/* no match */
550 			}
551 		} while (*q++ != 0);
552 		if (f->out[s])
553 			patlen = q-p-1;	/* don't count $ */
554 		if (patlen >= 0) {
555 			patbeg = (char *) p;
556 			return(1);
557 		}
558 	nextin:
559 		s = 2;
560 		if (f->reset) {
561 			for (i = 2; i <= f->curstat; i++)
562 				xfree(f->posns[i]);
563 			k = *f->posns[0];
564 			if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
565 				overflo("out of space in pmatch");
566 			for (i = 0; i <= k; i++)
567 				(f->posns[2])[i] = (f->posns[0])[i];
568 			f->initstat = f->curstat = 2;
569 			f->out[2] = f->out[0];
570 			for (i = 0; i < NCHARS; i++)
571 				f->gototab[2][i] = 0;
572 		}
573 	} while (*p++ != 0);
574 	return (0);
575 }
576 
577 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
578 {
579 	int s, ns;
580 	uschar *p = (uschar *) p0;
581 	uschar *q;
582 	int i, k;
583 
584 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
585 	if (f->reset) {
586 		f->initstat = s = makeinit(f,1);
587 	} else {
588 		s = f->initstat;
589 	}
590 	patlen = -1;
591 	while (*p) {
592 		q = p;
593 		do {
594 			if (f->out[s])		/* final state */
595 				patlen = q-p;
596 			/* assert(*q < NCHARS); */
597 			if ((ns = f->gototab[s][*q]) != 0)
598 				s = ns;
599 			else
600 				s = cgoto(f, s, *q);
601 			if (s == 1) {	/* no transition */
602 				if (patlen > 0) {
603 					patbeg = (char *) p;
604 					return(1);
605 				} else
606 					goto nnextin;	/* no nonempty match */
607 			}
608 		} while (*q++ != 0);
609 		if (f->out[s])
610 			patlen = q-p-1;	/* don't count $ */
611 		if (patlen > 0 ) {
612 			patbeg = (char *) p;
613 			return(1);
614 		}
615 	nnextin:
616 		s = 2;
617 		if (f->reset) {
618 			for (i = 2; i <= f->curstat; i++)
619 				xfree(f->posns[i]);
620 			k = *f->posns[0];
621 			if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
622 				overflo("out of state space");
623 			for (i = 0; i <= k; i++)
624 				(f->posns[2])[i] = (f->posns[0])[i];
625 			f->initstat = f->curstat = 2;
626 			f->out[2] = f->out[0];
627 			for (i = 0; i < NCHARS; i++)
628 				f->gototab[2][i] = 0;
629 		}
630 		p++;
631 	}
632 	return (0);
633 }
634 
635 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
636 {			/* uses relex() to scan regular expression */
637 	Node *np;
638 
639 	dprintf( ("reparse <%s>\n", p) );
640 	lastre = prestr = (uschar *) p;	/* prestr points to string to be parsed */
641 	rtok = relex();
642 	/* GNU compatibility: an empty regexp matches anything */
643 	if (rtok == '\0') {
644 		/* FATAL("empty regular expression"); previous */
645 		return(op2(EMPTYRE, NIL, NIL));
646 	}
647 	np = regexp();
648 	if (rtok != '\0')
649 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
650 	return(np);
651 }
652 
653 Node *regexp(void)	/* top-level parse of reg expr */
654 {
655 	return (alt(concat(primary())));
656 }
657 
658 Node *primary(void)
659 {
660 	Node *np;
661 	int savelastatom;
662 
663 	switch (rtok) {
664 	case CHAR:
665 		lastatom = starttok;
666 		np = op2(CHAR, NIL, itonp(rlxval));
667 		rtok = relex();
668 		return (unary(np));
669 	case ALL:
670 		rtok = relex();
671 		return (unary(op2(ALL, NIL, NIL)));
672 	case EMPTYRE:
673 		rtok = relex();
674 		return (unary(op2(EMPTYRE, NIL, NIL)));
675 	case DOT:
676 		lastatom = starttok;
677 		rtok = relex();
678 		return (unary(op2(DOT, NIL, NIL)));
679 	case CCL:
680 		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
681 		lastatom = starttok;
682 		rtok = relex();
683 		return (unary(np));
684 	case NCCL:
685 		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
686 		lastatom = starttok;
687 		rtok = relex();
688 		return (unary(np));
689 	case '^':
690 		rtok = relex();
691 		return (unary(op2(CHAR, NIL, itonp(HAT))));
692 	case '$':
693 		rtok = relex();
694 		return (unary(op2(CHAR, NIL, NIL)));
695 	case '(':
696 		lastatom = starttok;
697 		savelastatom = starttok - basestr; /* Retain over recursion */
698 		rtok = relex();
699 		if (rtok == ')') {	/* special pleading for () */
700 			rtok = relex();
701 			return unary(op2(CCL, NIL, (Node *) tostring("")));
702 		}
703 		np = regexp();
704 		if (rtok == ')') {
705 			lastatom = basestr + savelastatom; /* Restore */
706 			rtok = relex();
707 			return (unary(np));
708 		}
709 		else
710 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
711 	default:
712 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
713 	}
714 	return 0;	/*NOTREACHED*/
715 }
716 
717 Node *concat(Node *np)
718 {
719 	switch (rtok) {
720 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
721 		return (concat(op2(CAT, np, primary())));
722 	case EMPTYRE:
723 		rtok = relex();
724 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
725 				primary())));
726 	}
727 	return (np);
728 }
729 
730 Node *alt(Node *np)
731 {
732 	if (rtok == OR) {
733 		rtok = relex();
734 		return (alt(op2(OR, np, concat(primary()))));
735 	}
736 	return (np);
737 }
738 
739 Node *unary(Node *np)
740 {
741 	switch (rtok) {
742 	case STAR:
743 		rtok = relex();
744 		return (unary(op2(STAR, np, NIL)));
745 	case PLUS:
746 		rtok = relex();
747 		return (unary(op2(PLUS, np, NIL)));
748 	case QUEST:
749 		rtok = relex();
750 		return (unary(op2(QUEST, np, NIL)));
751 	default:
752 		return (np);
753 	}
754 }
755 
756 /*
757  * Character class definitions conformant to the POSIX locale as
758  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
759  * and operating character sets are both ASCII (ISO646) or supersets
760  * thereof.
761  *
762  * Note that to avoid overflowing the temporary buffer used in
763  * relex(), the expanded character class (prior to range expansion)
764  * must be less than twice the size of their full name.
765  */
766 
767 /* Because isblank doesn't show up in any of the header files on any
768  * system i use, it's defined here.  if some other locale has a richer
769  * definition of "blank", define HAS_ISBLANK and provide your own
770  * version.
771  * the parentheses here are an attempt to find a path through the maze
772  * of macro definition and/or function and/or version provided.  thanks
773  * to nelson beebe for the suggestion; let's see if it works everywhere.
774  */
775 
776 /* #define HAS_ISBLANK */
777 #ifndef HAS_ISBLANK
778 
779 int (xisblank)(int c)
780 {
781 	return c==' ' || c=='\t';
782 }
783 
784 #endif
785 
786 struct charclass {
787 	const char *cc_name;
788 	int cc_namelen;
789 	int (*cc_func)(int);
790 } charclasses[] = {
791 	{ "alnum",	5,	isalnum },
792 	{ "alpha",	5,	isalpha },
793 #ifndef HAS_ISBLANK
794 	{ "blank",	5,	xisblank },
795 #else
796 	{ "blank",	5,	isblank },
797 #endif
798 	{ "cntrl",	5,	iscntrl },
799 	{ "digit",	5,	isdigit },
800 	{ "graph",	5,	isgraph },
801 	{ "lower",	5,	islower },
802 	{ "print",	5,	isprint },
803 	{ "punct",	5,	ispunct },
804 	{ "space",	5,	isspace },
805 	{ "upper",	5,	isupper },
806 	{ "xdigit",	6,	isxdigit },
807 	{ NULL,		0,	NULL },
808 };
809 
810 #define REPEAT_SIMPLE		0
811 #define REPEAT_PLUS_APPENDED	1
812 #define REPEAT_WITH_Q		2
813 #define REPEAT_ZERO		3
814 
815 static int
816 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
817 	       int atomlen, int firstnum, int secondnum, int special_case)
818 {
819 	int i, j;
820 	uschar *buf = 0;
821 	int ret = 1;
822 	int init_q = (firstnum==0);		/* first added char will be ? */
823 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
824 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
825 	int suffix_length = strlen((char *) reptok) - reptoklen;	/* string after rep specifier	*/
826 	int size = prefix_length +  suffix_length;
827 
828 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
829 		size += atomlen*(firstnum-1);
830 	}
831 
832 	/* Adjust size of buffer for special cases */
833 	if (special_case == REPEAT_PLUS_APPENDED) {
834 		size++;		/* for the final + */
835 	} else if (special_case == REPEAT_WITH_Q) {
836 		size += init_q + (atomlen+1)* n_q_reps;
837 	} else if (special_case == REPEAT_ZERO) {
838 		size += 2;	/* just a null ERE: () */
839 	}
840 	if ((buf = (uschar *) malloc(size+1)) == NULL)
841 		FATAL("out of space in reg expr %.10s..", lastre);
842 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
843 	j = prefix_length;
844 	if (special_case == REPEAT_ZERO) {
845 		j -= atomlen;
846 		buf[j++] = '(';
847 		buf[j++] = ')';
848 	}
849 	for (i=1; i < firstnum; i++) {		/* copy x reps 	*/
850 		memcpy(&buf[j], atom, atomlen);
851 		j += atomlen;
852 	}
853 	if (special_case == REPEAT_PLUS_APPENDED) {
854 		buf[j++] = '+';
855 	} else if (special_case == REPEAT_WITH_Q) {
856 		if (init_q) buf[j++] = '?';
857 		for (i=0; i < n_q_reps; i++) {	/* copy x? reps */
858 			memcpy(&buf[j], atom, atomlen);
859 			j += atomlen;
860 			buf[j++] = '?';
861 		}
862 	}
863 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
864 	if (special_case == REPEAT_ZERO) {
865 		buf[j+suffix_length] = '\0';
866 	} else {
867 		buf[size] = '\0';
868 	}
869 	/* free old basestr */
870 	if (firstbasestr != basestr) {
871 		if (basestr)
872 			xfree(basestr);
873 	}
874 	basestr = buf;
875 	prestr  = buf + prefix_length;
876 	if (special_case == REPEAT_ZERO) {
877 		prestr  -= atomlen;
878 		ret++;
879 	}
880 	return ret;
881 }
882 
883 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
884 		  int atomlen, int firstnum, int secondnum)
885 {
886 	/*
887 	   In general, the repetition specifier or "bound" is replaced here
888 	   by an equivalent ERE string, repeating the immediately previous atom
889 	   and appending ? and + as needed. Note that the first copy of the
890 	   atom is left in place, except in the special_case of a zero-repeat
891 	   (i.e., {0}).
892 	 */
893 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
894 		if (firstnum < 2) {
895 			/* 0 or 1: should be handled before you get here */
896 			FATAL("internal error");
897 		} else {
898 			return replace_repeat(reptok, reptoklen, atom, atomlen,
899 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
900 		}
901 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
902 		if (firstnum == 0) {	/* {0} or {0,0} */
903 			/* This case is unusual because the resulting
904 			   replacement string might actually be SMALLER than
905 			   the original ERE */
906 			return replace_repeat(reptok, reptoklen, atom, atomlen,
907 					firstnum, secondnum, REPEAT_ZERO);
908 		} else {		/* (firstnum >= 1) */
909 			return replace_repeat(reptok, reptoklen, atom, atomlen,
910 					firstnum, secondnum, REPEAT_SIMPLE);
911 		}
912 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
913 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
914 		return replace_repeat(reptok, reptoklen, atom, atomlen,
915 					firstnum, secondnum, REPEAT_WITH_Q);
916 	} else {	/* Error - shouldn't be here (n>m) */
917 		FATAL("internal error");
918 	}
919 	return 0;
920 }
921 
922 int relex(void)		/* lexical analyzer for reparse */
923 {
924 	int c, n;
925 	int cflag;
926 	static uschar *buf = NULL;
927 	static int bufsz = 100;
928 	uschar *bp;
929 	struct charclass *cc;
930 	int i;
931 	int num, m, commafound, digitfound;
932 	const uschar *startreptok;
933 
934 rescan:
935 	starttok = prestr;
936 
937 	switch (c = *prestr++) {
938 	case '|': return OR;
939 	case '*': return STAR;
940 	case '+': return PLUS;
941 	case '?': return QUEST;
942 	case '.': return DOT;
943 	case '\0': prestr--; return '\0';
944 	case '^':
945 	case '$':
946 	case '(':
947 	case ')':
948 		return c;
949 	case '\\':
950 		rlxval = quoted(&prestr);
951 		return CHAR;
952 	default:
953 		rlxval = c;
954 		return CHAR;
955 	case '[':
956 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
957 			FATAL("out of space in reg expr %.10s..", lastre);
958 		bp = buf;
959 		if (*prestr == '^') {
960 			cflag = 1;
961 			prestr++;
962 		}
963 		else
964 			cflag = 0;
965 		n = 2 * strlen((const char *) prestr)+1;
966 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
967 			FATAL("out of space for reg expr %.10s...", lastre);
968 		for (; ; ) {
969 			if ((c = *prestr++) == '\\') {
970 				*bp++ = '\\';
971 				if ((c = *prestr++) == '\0')
972 					FATAL("nonterminated character class %.20s...", lastre);
973 				*bp++ = c;
974 			/* } else if (c == '\n') { */
975 			/* 	FATAL("newline in character class %.20s...", lastre); */
976 			} else if (c == '[' && *prestr == ':') {
977 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
978 				for (cc = charclasses; cc->cc_name; cc++)
979 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
980 						break;
981 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
982 				    prestr[2 + cc->cc_namelen] == ']') {
983 					prestr += cc->cc_namelen + 3;
984 					/*
985 					 * BUG: We begin at 1, instead of 0, since we
986 					 * would otherwise prematurely terminate the
987 					 * string for classes like [[:cntrl:]]. This
988 					 * means that we can't match the NUL character,
989 					 * not without first adapting the entire
990 					 * program to track each string's length.
991 					 */
992 					for (i = 1; i <= UCHAR_MAX; i++) {
993 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
994 						    FATAL("out of space for reg expr %.10s...", lastre);
995 						if (cc->cc_func(i)) {
996 							*bp++ = i;
997 							n++;
998 						}
999 					}
1000 				} else
1001 					*bp++ = c;
1002 			} else if (c == '[' && *prestr == '.') {
1003 				char collate_char;
1004 				prestr++;
1005 				collate_char = *prestr++;
1006 				if (*prestr == '.' && prestr[1] == ']') {
1007 					prestr += 2;
1008 					/* Found it: map via locale TBD: for
1009 					   now, simply return this char.  This
1010 					   is sufficient to pass conformance
1011 					   test awk.ex 156
1012 					 */
1013 					if (*prestr == ']') {
1014 						prestr++;
1015 						rlxval = collate_char;
1016 						return CHAR;
1017 					}
1018 				}
1019 			} else if (c == '[' && *prestr == '=') {
1020 				char equiv_char;
1021 				prestr++;
1022 				equiv_char = *prestr++;
1023 				if (*prestr == '=' && prestr[1] == ']') {
1024 					prestr += 2;
1025 					/* Found it: map via locale TBD: for now
1026 					   simply return this char. This is
1027 					   sufficient to pass conformance test
1028 					   awk.ex 156
1029 					 */
1030 					if (*prestr == ']') {
1031 						prestr++;
1032 						rlxval = equiv_char;
1033 						return CHAR;
1034 					}
1035 				}
1036 			} else if (c == '\0') {
1037 				FATAL("nonterminated character class %.20s", lastre);
1038 			} else if (bp == buf) {	/* 1st char is special */
1039 				*bp++ = c;
1040 			} else if (c == ']') {
1041 				*bp++ = 0;
1042 				rlxstr = (uschar *) tostring((char *) buf);
1043 				if (cflag == 0)
1044 					return CCL;
1045 				else
1046 					return NCCL;
1047 			} else
1048 				*bp++ = c;
1049 		}
1050 		break;
1051 	case '{':
1052 		if (isdigit(*(prestr))) {
1053 			num = 0;	/* Process as a repetition */
1054 			n = -1; m = -1;
1055 			commafound = 0;
1056 			digitfound = 0;
1057 			startreptok = prestr-1;
1058 			/* Remember start of previous atom here ? */
1059 		} else {        	/* just a { char, not a repetition */
1060 			rlxval = c;
1061 			return CHAR;
1062                 }
1063 		for (; ; ) {
1064 			if ((c = *prestr++) == '}') {
1065 				if (commafound) {
1066 					if (digitfound) { /* {n,m} */
1067 						m = num;
1068 						if (m<n)
1069 							FATAL("illegal repetition expression: class %.20s",
1070 								lastre);
1071 						if ((n==0) && (m==1)) {
1072 							return QUEST;
1073 						}
1074 					} else {	/* {n,} */
1075 						if (n==0) return STAR;
1076 						if (n==1) return PLUS;
1077 					}
1078 				} else {
1079 					if (digitfound) { /* {n} same as {n,n} */
1080 						n = num;
1081 						m = num;
1082 					} else {	/* {} */
1083 						FATAL("illegal repetition expression: class %.20s",
1084 							lastre);
1085 					}
1086 				}
1087 				if (repeat(starttok, prestr-starttok, lastatom,
1088 					   startreptok - lastatom, n, m) > 0) {
1089 					if ((n==0) && (m==0)) {
1090 						return EMPTYRE;
1091 					}
1092 					/* must rescan input for next token */
1093 					goto rescan;
1094 				}
1095 				/* Failed to replace: eat up {...} characters
1096 				   and treat like just PLUS */
1097 				return PLUS;
1098 			} else if (c == '\0') {
1099 				FATAL("nonterminated character class %.20s",
1100 					lastre);
1101 			} else if (isdigit(c)) {
1102 				num = 10 * num + c - '0';
1103 				digitfound = 1;
1104 			} else if (c == ',') {
1105 				if (commafound)
1106 					FATAL("illegal repetition expression: class %.20s",
1107 						lastre);
1108 				/* looking for {n,} or {n,m} */
1109 				commafound = 1;
1110 				n = num;
1111 				digitfound = 0; /* reset */
1112 				num = 0;
1113 			} else {
1114 				FATAL("illegal repetition expression: class %.20s",
1115 					lastre);
1116 			}
1117 		}
1118 		break;
1119 	}
1120 }
1121 
1122 int cgoto(fa *f, int s, int c)
1123 {
1124 	int i, j, k;
1125 	int *p, *q;
1126 
1127 	assert(c == HAT || c < NCHARS);
1128 	while (f->accept >= maxsetvec) {	/* guessing here! */
1129 		maxsetvec *= 4;
1130 		setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
1131 		tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
1132 		if (setvec == NULL || tmpset == NULL)
1133 			overflo("out of space in cgoto()");
1134 	}
1135 	for (i = 0; i <= f->accept; i++)
1136 		setvec[i] = 0;
1137 	setcnt = 0;
1138 	/* compute positions of gototab[s,c] into setvec */
1139 	p = f->posns[s];
1140 	for (i = 1; i <= *p; i++) {
1141 		if ((k = f->re[p[i]].ltype) != FINAL) {
1142 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1143 			 || (k == DOT && c != 0 && c != HAT)
1144 			 || (k == ALL && c != 0)
1145 			 || (k == EMPTYRE && c != 0)
1146 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1147 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1148 				q = f->re[p[i]].lfollow;
1149 				for (j = 1; j <= *q; j++) {
1150 					if (q[j] >= maxsetvec) {
1151 						maxsetvec *= 4;
1152 						setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
1153 						tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
1154 						if (setvec == NULL || tmpset == NULL)
1155 							overflo("cgoto overflow");
1156 					}
1157 					if (setvec[q[j]] == 0) {
1158 						setcnt++;
1159 						setvec[q[j]] = 1;
1160 					}
1161 				}
1162 			}
1163 		}
1164 	}
1165 	/* determine if setvec is a previous state */
1166 	tmpset[0] = setcnt;
1167 	j = 1;
1168 	for (i = f->accept; i >= 0; i--)
1169 		if (setvec[i]) {
1170 			tmpset[j++] = i;
1171 		}
1172 	/* tmpset == previous state? */
1173 	for (i = 1; i <= f->curstat; i++) {
1174 		p = f->posns[i];
1175 		if ((k = tmpset[0]) != p[0])
1176 			goto different;
1177 		for (j = 1; j <= k; j++)
1178 			if (tmpset[j] != p[j])
1179 				goto different;
1180 		/* setvec is state i */
1181 		f->gototab[s][c] = i;
1182 		return i;
1183 	  different:;
1184 	}
1185 
1186 	/* add tmpset to current set of states */
1187 	if (f->curstat >= NSTATES-1) {
1188 		f->curstat = 2;
1189 		f->reset = 1;
1190 		for (i = 2; i < NSTATES; i++)
1191 			xfree(f->posns[i]);
1192 	} else
1193 		++(f->curstat);
1194 	for (i = 0; i < NCHARS; i++)
1195 		f->gototab[f->curstat][i] = 0;
1196 	xfree(f->posns[f->curstat]);
1197 	if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
1198 		overflo("out of space in cgoto");
1199 
1200 	f->posns[f->curstat] = p;
1201 	f->gototab[s][c] = f->curstat;
1202 	for (i = 0; i <= setcnt; i++)
1203 		p[i] = tmpset[i];
1204 	if (setvec[f->accept])
1205 		f->out[f->curstat] = 1;
1206 	else
1207 		f->out[f->curstat] = 0;
1208 	return f->curstat;
1209 }
1210 
1211 
1212 void freefa(fa *f)	/* free a finite automaton */
1213 {
1214 	int i;
1215 
1216 	if (f == NULL)
1217 		return;
1218 	for (i = 0; i <= f->curstat; i++)
1219 		xfree(f->posns[i]);
1220 	for (i = 0; i <= f->accept; i++) {
1221 		xfree(f->re[i].lfollow);
1222 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1223 			xfree((f->re[i].lval.np));
1224 	}
1225 	xfree(f->restr);
1226 	xfree(f);
1227 }
1228