xref: /freebsd/contrib/one-true-awk/b.c (revision 7aa383846770374466b1dcb2cefd71bde9acf463)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #define	DEBUG
31 
32 #include <ctype.h>
33 #include <stdio.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include "awk.h"
37 #include "ytab.h"
38 
39 #define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
40 				/* NCHARS is 2**n */
41 #define MAXLIN 22
42 
43 #define type(v)		(v)->nobj	/* badly overloaded here */
44 #define info(v)		(v)->ntype	/* badly overloaded here */
45 #define left(v)		(v)->narg[0]
46 #define right(v)	(v)->narg[1]
47 #define parent(v)	(v)->nnext
48 
49 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
50 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
51 #define UNARY	case STAR: case PLUS: case QUEST:
52 
53 /* encoding in tree Nodes:
54 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
55 		left is index, right contains value or pointer to value
56 	unary (STAR, PLUS, QUEST): left is child, right is null
57 	binary (CAT, OR): left and right are children
58 	parent contains pointer to parent
59 */
60 
61 
62 int	*setvec;
63 int	*tmpset;
64 int	maxsetvec = 0;
65 
66 int	rtok;		/* next token in current re */
67 int	rlxval;
68 static uschar	*rlxstr;
69 static uschar	*prestr;	/* current position in current re */
70 static uschar	*lastre;	/* origin of last re */
71 
72 static	int setcnt;
73 static	int poscnt;
74 
75 char	*patbeg;
76 int	patlen;
77 
78 #define	NFA	20	/* cache this many dynamic fa's */
79 fa	*fatab[NFA];
80 int	nfatab	= 0;	/* entries in fatab */
81 
82 fa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
83 {
84 	int i, use, nuse;
85 	fa *pfa;
86 	static int now = 1;
87 
88 	if (setvec == 0) {	/* first time through any RE */
89 		maxsetvec = MAXLIN;
90 		setvec = (int *) malloc(maxsetvec * sizeof(int));
91 		tmpset = (int *) malloc(maxsetvec * sizeof(int));
92 		if (setvec == 0 || tmpset == 0)
93 			overflo("out of space initializing makedfa");
94 	}
95 
96 	if (compile_time)	/* a constant for sure */
97 		return mkdfa(s, anchor);
98 	for (i = 0; i < nfatab; i++)	/* is it there already? */
99 		if (fatab[i]->anchor == anchor
100 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
101 			fatab[i]->use = now++;
102 			return fatab[i];
103 		}
104 	pfa = mkdfa(s, anchor);
105 	if (nfatab < NFA) {	/* room for another */
106 		fatab[nfatab] = pfa;
107 		fatab[nfatab]->use = now++;
108 		nfatab++;
109 		return pfa;
110 	}
111 	use = fatab[0]->use;	/* replace least-recently used */
112 	nuse = 0;
113 	for (i = 1; i < nfatab; i++)
114 		if (fatab[i]->use < use) {
115 			use = fatab[i]->use;
116 			nuse = i;
117 		}
118 	freefa(fatab[nuse]);
119 	fatab[nuse] = pfa;
120 	pfa->use = now++;
121 	return pfa;
122 }
123 
124 fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
125 				/* anchor = 1 for anchored matches, else 0 */
126 {
127 	Node *p, *p1;
128 	fa *f;
129 
130 	p = reparse(s);
131 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
132 		/* put ALL STAR in front of reg.  exp. */
133 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
134 		/* put FINAL after reg.  exp. */
135 
136 	poscnt = 0;
137 	penter(p1);	/* enter parent pointers and leaf indices */
138 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
139 		overflo("out of space for fa");
140 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
141 	cfoll(f, p1);	/* set up follow sets */
142 	freetr(p1);
143 	if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
144 			overflo("out of space in makedfa");
145 	if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
146 		overflo("out of space in makedfa");
147 	*f->posns[1] = 0;
148 	f->initstat = makeinit(f, anchor);
149 	f->anchor = anchor;
150 	f->restr = (uschar *) tostring(s);
151 	return f;
152 }
153 
154 int makeinit(fa *f, int anchor)
155 {
156 	int i, k;
157 
158 	f->curstat = 2;
159 	f->out[2] = 0;
160 	f->reset = 0;
161 	k = *(f->re[0].lfollow);
162 	xfree(f->posns[2]);
163 	if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
164 		overflo("out of space in makeinit");
165 	for (i=0; i <= k; i++) {
166 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
167 	}
168 	if ((f->posns[2])[1] == f->accept)
169 		f->out[2] = 1;
170 	for (i=0; i < NCHARS; i++)
171 		f->gototab[2][i] = 0;
172 	f->curstat = cgoto(f, 2, HAT);
173 	if (anchor) {
174 		*f->posns[2] = k-1;	/* leave out position 0 */
175 		for (i=0; i < k; i++) {
176 			(f->posns[0])[i] = (f->posns[2])[i];
177 		}
178 
179 		f->out[0] = f->out[2];
180 		if (f->curstat != 2)
181 			--(*f->posns[f->curstat]);
182 	}
183 	return f->curstat;
184 }
185 
186 void penter(Node *p)	/* set up parent pointers and leaf indices */
187 {
188 	switch (type(p)) {
189 	ELEAF
190 	LEAF
191 		info(p) = poscnt;
192 		poscnt++;
193 		break;
194 	UNARY
195 		penter(left(p));
196 		parent(left(p)) = p;
197 		break;
198 	case CAT:
199 	case OR:
200 		penter(left(p));
201 		penter(right(p));
202 		parent(left(p)) = p;
203 		parent(right(p)) = p;
204 		break;
205 	default:	/* can't happen */
206 		FATAL("can't happen: unknown type %d in penter", type(p));
207 		break;
208 	}
209 }
210 
211 void freetr(Node *p)	/* free parse tree */
212 {
213 	switch (type(p)) {
214 	ELEAF
215 	LEAF
216 		xfree(p);
217 		break;
218 	UNARY
219 		freetr(left(p));
220 		xfree(p);
221 		break;
222 	case CAT:
223 	case OR:
224 		freetr(left(p));
225 		freetr(right(p));
226 		xfree(p);
227 		break;
228 	default:	/* can't happen */
229 		FATAL("can't happen: unknown type %d in freetr", type(p));
230 		break;
231 	}
232 }
233 
234 /* in the parsing of regular expressions, metacharacters like . have */
235 /* to be seen literally;  \056 is not a metacharacter. */
236 
237 int hexstr(char **pp)	/* find and eval hex string at pp, return new p */
238 {			/* only pick up one 8-bit byte (2 chars) */
239 	uschar *p;
240 	int n = 0;
241 	int i;
242 
243 	for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
244 		if (isdigit(*p))
245 			n = 16 * n + *p - '0';
246 		else if (*p >= 'a' && *p <= 'f')
247 			n = 16 * n + *p - 'a' + 10;
248 		else if (*p >= 'A' && *p <= 'F')
249 			n = 16 * n + *p - 'A' + 10;
250 	}
251 	*pp = (char *) p;
252 	return n;
253 }
254 
255 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
256 
257 int quoted(char **pp)	/* pick up next thing after a \\ */
258 			/* and increment *pp */
259 {
260 	char *p = *pp;
261 	int c;
262 
263 	if ((c = *p++) == 't')
264 		c = '\t';
265 	else if (c == 'n')
266 		c = '\n';
267 	else if (c == 'f')
268 		c = '\f';
269 	else if (c == 'r')
270 		c = '\r';
271 	else if (c == 'b')
272 		c = '\b';
273 	else if (c == '\\')
274 		c = '\\';
275 	else if (c == 'x') {	/* hexadecimal goo follows */
276 		c = hexstr(&p);	/* this adds a null if number is invalid */
277 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
278 		int n = c - '0';
279 		if (isoctdigit(*p)) {
280 			n = 8 * n + *p++ - '0';
281 			if (isoctdigit(*p))
282 				n = 8 * n + *p++ - '0';
283 		}
284 		c = n;
285 	} /* else */
286 		/* c = c; */
287 	*pp = p;
288 	return c;
289 }
290 
291 static int collate_range_cmp(int a, int b)
292 {
293 	static char s[2][2];
294 
295 	if ((uschar)a == (uschar)b)
296 		return 0;
297 	s[0][0] = a;
298 	s[1][0] = b;
299 	return (strcoll(s[0], s[1]));
300 }
301 
302 char *cclenter(const char *argp)	/* add a character class */
303 {
304 	int i, c, c2;
305 	int j;
306 	uschar *p = (uschar *) argp;
307 	uschar *op, *bp;
308 	static uschar *buf = 0;
309 	static int bufsz = 100;
310 
311 	op = p;
312 	if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
313 		FATAL("out of space for character class [%.10s...] 1", p);
314 	bp = buf;
315 	for (i = 0; (c = *p++) != 0; ) {
316 		if (c == '\\') {
317 			c = quoted((char **) &p);
318 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
319 			if (*p != 0) {
320 				c = bp[-1];
321 				c2 = *p++;
322 				if (c2 == '\\')
323 					c2 = quoted((char **) &p);
324 				if (collate_range_cmp(c, c2) > 0) {
325 					bp--;
326 					i--;
327 					continue;
328 				}
329 				for (j = 0; j < NCHARS; j++) {
330 					if ((collate_range_cmp(c, j) > 0) ||
331 					    collate_range_cmp(j, c2) > 0)
332 						continue;
333 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
334 						FATAL("out of space for character class [%.10s...] 2", p);
335 					*bp++ = j;
336 					i++;
337 				}
338 				continue;
339 			}
340 		}
341 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
342 			FATAL("out of space for character class [%.10s...] 3", p);
343 		*bp++ = c;
344 		i++;
345 	}
346 	*bp = 0;
347 	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
348 	xfree(op);
349 	return (char *) tostring((char *) buf);
350 }
351 
352 void overflo(const char *s)
353 {
354 	FATAL("regular expression too big: %.30s...", s);
355 }
356 
357 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
358 {
359 	int i;
360 	int *p;
361 
362 	switch (type(v)) {
363 	ELEAF
364 	LEAF
365 		f->re[info(v)].ltype = type(v);
366 		f->re[info(v)].lval.np = right(v);
367 		while (f->accept >= maxsetvec) {	/* guessing here! */
368 			maxsetvec *= 4;
369 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
370 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
371 			if (setvec == 0 || tmpset == 0)
372 				overflo("out of space in cfoll()");
373 		}
374 		for (i = 0; i <= f->accept; i++)
375 			setvec[i] = 0;
376 		setcnt = 0;
377 		follow(v);	/* computes setvec and setcnt */
378 		if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
379 			overflo("out of space building follow set");
380 		f->re[info(v)].lfollow = p;
381 		*p = setcnt;
382 		for (i = f->accept; i >= 0; i--)
383 			if (setvec[i] == 1)
384 				*++p = i;
385 		break;
386 	UNARY
387 		cfoll(f,left(v));
388 		break;
389 	case CAT:
390 	case OR:
391 		cfoll(f,left(v));
392 		cfoll(f,right(v));
393 		break;
394 	default:	/* can't happen */
395 		FATAL("can't happen: unknown type %d in cfoll", type(v));
396 	}
397 }
398 
399 int first(Node *p)	/* collects initially active leaves of p into setvec */
400 			/* returns 0 if p matches empty string */
401 {
402 	int b, lp;
403 
404 	switch (type(p)) {
405 	ELEAF
406 	LEAF
407 		lp = info(p);	/* look for high-water mark of subscripts */
408 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
409 			maxsetvec *= 4;
410 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
411 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
412 			if (setvec == 0 || tmpset == 0)
413 				overflo("out of space in first()");
414 		}
415 		if (type(p) == EMPTYRE) {
416 			setvec[lp] = 0;
417 			return(0);
418 		}
419 		if (setvec[lp] != 1) {
420 			setvec[lp] = 1;
421 			setcnt++;
422 		}
423 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
424 			return(0);		/* empty CCL */
425 		else return(1);
426 	case PLUS:
427 		if (first(left(p)) == 0) return(0);
428 		return(1);
429 	case STAR:
430 	case QUEST:
431 		first(left(p));
432 		return(0);
433 	case CAT:
434 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
435 		return(1);
436 	case OR:
437 		b = first(right(p));
438 		if (first(left(p)) == 0 || b == 0) return(0);
439 		return(1);
440 	}
441 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
442 	return(-1);
443 }
444 
445 void follow(Node *v)	/* collects leaves that can follow v into setvec */
446 {
447 	Node *p;
448 
449 	if (type(v) == FINAL)
450 		return;
451 	p = parent(v);
452 	switch (type(p)) {
453 	case STAR:
454 	case PLUS:
455 		first(v);
456 		follow(p);
457 		return;
458 
459 	case OR:
460 	case QUEST:
461 		follow(p);
462 		return;
463 
464 	case CAT:
465 		if (v == left(p)) {	/* v is left child of p */
466 			if (first(right(p)) == 0) {
467 				follow(p);
468 				return;
469 			}
470 		} else		/* v is right child */
471 			follow(p);
472 		return;
473 	}
474 }
475 
476 int member(int c, const char *sarg)	/* is c in s? */
477 {
478 	uschar *s = (uschar *) sarg;
479 
480 	while (*s)
481 		if (c == *s++)
482 			return(1);
483 	return(0);
484 }
485 
486 int match(fa *f, const char *p0)	/* shortest match ? */
487 {
488 	int s, ns;
489 	uschar *p = (uschar *) p0;
490 
491 	s = f->reset ? makeinit(f,0) : f->initstat;
492 	if (f->out[s])
493 		return(1);
494 	do {
495 		/* assert(*p < NCHARS); */
496 		if ((ns = f->gototab[s][*p]) != 0)
497 			s = ns;
498 		else
499 			s = cgoto(f, s, *p);
500 		if (f->out[s])
501 			return(1);
502 	} while (*p++ != 0);
503 	return(0);
504 }
505 
506 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
507 {
508 	int s, ns;
509 	uschar *p = (uschar *) p0;
510 	uschar *q;
511 	int i, k;
512 
513 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
514 	if (f->reset) {
515 		f->initstat = s = makeinit(f,1);
516 	} else {
517 		s = f->initstat;
518 	}
519 	patbeg = (char *) p;
520 	patlen = -1;
521 	do {
522 		q = p;
523 		do {
524 			if (f->out[s])		/* final state */
525 				patlen = q-p;
526 			/* assert(*q < NCHARS); */
527 			if ((ns = f->gototab[s][*q]) != 0)
528 				s = ns;
529 			else
530 				s = cgoto(f, s, *q);
531 			if (s == 1) {	/* no transition */
532 				if (patlen >= 0) {
533 					patbeg = (char *) p;
534 					return(1);
535 				}
536 				else
537 					goto nextin;	/* no match */
538 			}
539 		} while (*q++ != 0);
540 		if (f->out[s])
541 			patlen = q-p-1;	/* don't count $ */
542 		if (patlen >= 0) {
543 			patbeg = (char *) p;
544 			return(1);
545 		}
546 	nextin:
547 		s = 2;
548 		if (f->reset) {
549 			for (i = 2; i <= f->curstat; i++)
550 				xfree(f->posns[i]);
551 			k = *f->posns[0];
552 			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
553 				overflo("out of space in pmatch");
554 			for (i = 0; i <= k; i++)
555 				(f->posns[2])[i] = (f->posns[0])[i];
556 			f->initstat = f->curstat = 2;
557 			f->out[2] = f->out[0];
558 			for (i = 0; i < NCHARS; i++)
559 				f->gototab[2][i] = 0;
560 		}
561 	} while (*p++ != 0);
562 	return (0);
563 }
564 
565 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
566 {
567 	int s, ns;
568 	uschar *p = (uschar *) p0;
569 	uschar *q;
570 	int i, k;
571 
572 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
573 	if (f->reset) {
574 		f->initstat = s = makeinit(f,1);
575 	} else {
576 		s = f->initstat;
577 	}
578 	patlen = -1;
579 	while (*p) {
580 		q = p;
581 		do {
582 			if (f->out[s])		/* final state */
583 				patlen = q-p;
584 			/* assert(*q < NCHARS); */
585 			if ((ns = f->gototab[s][*q]) != 0)
586 				s = ns;
587 			else
588 				s = cgoto(f, s, *q);
589 			if (s == 1) {	/* no transition */
590 				if (patlen > 0) {
591 					patbeg = (char *) p;
592 					return(1);
593 				} else
594 					goto nnextin;	/* no nonempty match */
595 			}
596 		} while (*q++ != 0);
597 		if (f->out[s])
598 			patlen = q-p-1;	/* don't count $ */
599 		if (patlen > 0 ) {
600 			patbeg = (char *) p;
601 			return(1);
602 		}
603 	nnextin:
604 		s = 2;
605 		if (f->reset) {
606 			for (i = 2; i <= f->curstat; i++)
607 				xfree(f->posns[i]);
608 			k = *f->posns[0];
609 			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
610 				overflo("out of state space");
611 			for (i = 0; i <= k; i++)
612 				(f->posns[2])[i] = (f->posns[0])[i];
613 			f->initstat = f->curstat = 2;
614 			f->out[2] = f->out[0];
615 			for (i = 0; i < NCHARS; i++)
616 				f->gototab[2][i] = 0;
617 		}
618 		p++;
619 	}
620 	return (0);
621 }
622 
623 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
624 {			/* uses relex() to scan regular expression */
625 	Node *np;
626 
627 	dprintf( ("reparse <%s>\n", p) );
628 	lastre = prestr = (uschar *) p;	/* prestr points to string to be parsed */
629 	rtok = relex();
630 	/* GNU compatibility: an empty regexp matches anything */
631 	if (rtok == '\0') {
632 		/* FATAL("empty regular expression"); previous */
633 		return(op2(EMPTYRE, NIL, NIL));
634 	}
635 	np = regexp();
636 	if (rtok != '\0')
637 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
638 	return(np);
639 }
640 
641 Node *regexp(void)	/* top-level parse of reg expr */
642 {
643 	return (alt(concat(primary())));
644 }
645 
646 Node *primary(void)
647 {
648 	Node *np;
649 
650 	switch (rtok) {
651 	case CHAR:
652 		np = op2(CHAR, NIL, itonp(rlxval));
653 		rtok = relex();
654 		return (unary(np));
655 	case ALL:
656 		rtok = relex();
657 		return (unary(op2(ALL, NIL, NIL)));
658 	case EMPTYRE:
659 		rtok = relex();
660 		return (unary(op2(ALL, NIL, NIL)));
661 	case DOT:
662 		rtok = relex();
663 		return (unary(op2(DOT, NIL, NIL)));
664 	case CCL:
665 		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
666 		rtok = relex();
667 		return (unary(np));
668 	case NCCL:
669 		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
670 		rtok = relex();
671 		return (unary(np));
672 	case '^':
673 		rtok = relex();
674 		return (unary(op2(CHAR, NIL, itonp(HAT))));
675 	case '$':
676 		rtok = relex();
677 		return (unary(op2(CHAR, NIL, NIL)));
678 	case '(':
679 		rtok = relex();
680 		if (rtok == ')') {	/* special pleading for () */
681 			rtok = relex();
682 			return unary(op2(CCL, NIL, (Node *) tostring("")));
683 		}
684 		np = regexp();
685 		if (rtok == ')') {
686 			rtok = relex();
687 			return (unary(np));
688 		}
689 		else
690 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
691 	default:
692 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
693 	}
694 	return 0;	/*NOTREACHED*/
695 }
696 
697 Node *concat(Node *np)
698 {
699 	switch (rtok) {
700 	case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
701 		return (concat(op2(CAT, np, primary())));
702 	}
703 	return (np);
704 }
705 
706 Node *alt(Node *np)
707 {
708 	if (rtok == OR) {
709 		rtok = relex();
710 		return (alt(op2(OR, np, concat(primary()))));
711 	}
712 	return (np);
713 }
714 
715 Node *unary(Node *np)
716 {
717 	switch (rtok) {
718 	case STAR:
719 		rtok = relex();
720 		return (unary(op2(STAR, np, NIL)));
721 	case PLUS:
722 		rtok = relex();
723 		return (unary(op2(PLUS, np, NIL)));
724 	case QUEST:
725 		rtok = relex();
726 		return (unary(op2(QUEST, np, NIL)));
727 	default:
728 		return (np);
729 	}
730 }
731 
732 /*
733  * Character class definitions conformant to the POSIX locale as
734  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
735  * and operating character sets are both ASCII (ISO646) or supersets
736  * thereof.
737  *
738  * Note that to avoid overflowing the temporary buffer used in
739  * relex(), the expanded character class (prior to range expansion)
740  * must be less than twice the size of their full name.
741  */
742 
743 /* Because isblank doesn't show up in any of the header files on any
744  * system i use, it's defined here.  if some other locale has a richer
745  * definition of "blank", define HAS_ISBLANK and provide your own
746  * version.
747  * the parentheses here are an attempt to find a path through the maze
748  * of macro definition and/or function and/or version provided.  thanks
749  * to nelson beebe for the suggestion; let's see if it works everywhere.
750  */
751 
752 /* #define HAS_ISBLANK */
753 #ifndef HAS_ISBLANK
754 
755 int (isblank)(int c)
756 {
757 	return c==' ' || c=='\t';
758 }
759 
760 #endif
761 
762 struct charclass {
763 	const char *cc_name;
764 	int cc_namelen;
765 	int (*cc_func)(int);
766 } charclasses[] = {
767 	{ "alnum",	5,	isalnum },
768 	{ "alpha",	5,	isalpha },
769 	{ "blank",	5,	isblank },
770 	{ "cntrl",	5,	iscntrl },
771 	{ "digit",	5,	isdigit },
772 	{ "graph",	5,	isgraph },
773 	{ "lower",	5,	islower },
774 	{ "print",	5,	isprint },
775 	{ "punct",	5,	ispunct },
776 	{ "space",	5,	isspace },
777 	{ "upper",	5,	isupper },
778 	{ "xdigit",	6,	isxdigit },
779 	{ NULL,		0,	NULL },
780 };
781 
782 
783 int relex(void)		/* lexical analyzer for reparse */
784 {
785 	int c, n;
786 	int cflag;
787 	static uschar *buf = 0;
788 	static int bufsz = 100;
789 	uschar *bp;
790 	struct charclass *cc;
791 	int i;
792 
793 	switch (c = *prestr++) {
794 	case '|': return OR;
795 	case '*': return STAR;
796 	case '+': return PLUS;
797 	case '?': return QUEST;
798 	case '.': return DOT;
799 	case '\0': prestr--; return '\0';
800 	case '^':
801 	case '$':
802 	case '(':
803 	case ')':
804 		return c;
805 	case '\\':
806 		rlxval = quoted((char **) &prestr);
807 		return CHAR;
808 	default:
809 		rlxval = c;
810 		return CHAR;
811 	case '[':
812 		if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
813 			FATAL("out of space in reg expr %.10s..", lastre);
814 		bp = buf;
815 		if (*prestr == '^') {
816 			cflag = 1;
817 			prestr++;
818 		}
819 		else
820 			cflag = 0;
821 		n = 2 * strlen((const char *) prestr)+1;
822 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
823 			FATAL("out of space for reg expr %.10s...", lastre);
824 		for (; ; ) {
825 			if ((c = *prestr++) == '\\') {
826 				*bp++ = '\\';
827 				if ((c = *prestr++) == '\0')
828 					FATAL("nonterminated character class %.20s...", lastre);
829 				*bp++ = c;
830 			/* } else if (c == '\n') { */
831 			/* 	FATAL("newline in character class %.20s...", lastre); */
832 			} else if (c == '[' && *prestr == ':') {
833 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
834 				for (cc = charclasses; cc->cc_name; cc++)
835 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
836 						break;
837 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
838 				    prestr[2 + cc->cc_namelen] == ']') {
839 					prestr += cc->cc_namelen + 3;
840 					for (i = 0; i < NCHARS; i++) {
841 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
842 						    FATAL("out of space for reg expr %.10s...", lastre);
843 						if (cc->cc_func(i)) {
844 							*bp++ = i;
845 							n++;
846 						}
847 					}
848 				} else
849 					*bp++ = c;
850 			} else if (c == '\0') {
851 				FATAL("nonterminated character class %.20s", lastre);
852 			} else if (bp == buf) {	/* 1st char is special */
853 				*bp++ = c;
854 			} else if (c == ']') {
855 				*bp++ = 0;
856 				rlxstr = (uschar *) tostring((char *) buf);
857 				if (cflag == 0)
858 					return CCL;
859 				else
860 					return NCCL;
861 			} else
862 				*bp++ = c;
863 		}
864 	}
865 }
866 
867 int cgoto(fa *f, int s, int c)
868 {
869 	int i, j, k;
870 	int *p, *q;
871 
872 	assert(c == HAT || c < NCHARS);
873 	while (f->accept >= maxsetvec) {	/* guessing here! */
874 		maxsetvec *= 4;
875 		setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
876 		tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
877 		if (setvec == 0 || tmpset == 0)
878 			overflo("out of space in cgoto()");
879 	}
880 	for (i = 0; i <= f->accept; i++)
881 		setvec[i] = 0;
882 	setcnt = 0;
883 	/* compute positions of gototab[s,c] into setvec */
884 	p = f->posns[s];
885 	for (i = 1; i <= *p; i++) {
886 		if ((k = f->re[p[i]].ltype) != FINAL) {
887 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
888 			 || (k == DOT && c != 0 && c != HAT)
889 			 || (k == ALL && c != 0)
890 			 || (k == EMPTYRE && c != 0)
891 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
892 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
893 				q = f->re[p[i]].lfollow;
894 				for (j = 1; j <= *q; j++) {
895 					if (q[j] >= maxsetvec) {
896 						maxsetvec *= 4;
897 						setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
898 						tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
899 						if (setvec == 0 || tmpset == 0)
900 							overflo("cgoto overflow");
901 					}
902 					if (setvec[q[j]] == 0) {
903 						setcnt++;
904 						setvec[q[j]] = 1;
905 					}
906 				}
907 			}
908 		}
909 	}
910 	/* determine if setvec is a previous state */
911 	tmpset[0] = setcnt;
912 	j = 1;
913 	for (i = f->accept; i >= 0; i--)
914 		if (setvec[i]) {
915 			tmpset[j++] = i;
916 		}
917 	/* tmpset == previous state? */
918 	for (i = 1; i <= f->curstat; i++) {
919 		p = f->posns[i];
920 		if ((k = tmpset[0]) != p[0])
921 			goto different;
922 		for (j = 1; j <= k; j++)
923 			if (tmpset[j] != p[j])
924 				goto different;
925 		/* setvec is state i */
926 		f->gototab[s][c] = i;
927 		return i;
928 	  different:;
929 	}
930 
931 	/* add tmpset to current set of states */
932 	if (f->curstat >= NSTATES-1) {
933 		f->curstat = 2;
934 		f->reset = 1;
935 		for (i = 2; i < NSTATES; i++)
936 			xfree(f->posns[i]);
937 	} else
938 		++(f->curstat);
939 	for (i = 0; i < NCHARS; i++)
940 		f->gototab[f->curstat][i] = 0;
941 	xfree(f->posns[f->curstat]);
942 	if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
943 		overflo("out of space in cgoto");
944 
945 	f->posns[f->curstat] = p;
946 	f->gototab[s][c] = f->curstat;
947 	for (i = 0; i <= setcnt; i++)
948 		p[i] = tmpset[i];
949 	if (setvec[f->accept])
950 		f->out[f->curstat] = 1;
951 	else
952 		f->out[f->curstat] = 0;
953 	return f->curstat;
954 }
955 
956 
957 void freefa(fa *f)	/* free a finite automaton */
958 {
959 	int i;
960 
961 	if (f == NULL)
962 		return;
963 	for (i = 0; i <= f->curstat; i++)
964 		xfree(f->posns[i]);
965 	for (i = 0; i <= f->accept; i++) {
966 		xfree(f->re[i].lfollow);
967 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
968 			xfree((f->re[i].lval.np));
969 	}
970 	xfree(f->restr);
971 	xfree(f);
972 }
973