xref: /freebsd/contrib/one-true-awk/b.c (revision 1603881667360c015f6685131f2f25474fa67a72)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #define	DEBUG
31 
32 #include <ctype.h>
33 #include <limits.h>
34 #include <stdio.h>
35 #include <string.h>
36 #include <stdlib.h>
37 #include "awk.h"
38 #include "ytab.h"
39 
40 #define MAXLIN 22
41 
42 #define type(v)		(v)->nobj	/* badly overloaded here */
43 #define info(v)		(v)->ntype	/* badly overloaded here */
44 #define left(v)		(v)->narg[0]
45 #define right(v)	(v)->narg[1]
46 #define parent(v)	(v)->nnext
47 
48 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
49 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
50 #define UNARY	case STAR: case PLUS: case QUEST:
51 
52 /* encoding in tree Nodes:
53 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
54 		left is index, right contains value or pointer to value
55 	unary (STAR, PLUS, QUEST): left is child, right is null
56 	binary (CAT, OR): left and right are children
57 	parent contains pointer to parent
58 */
59 
60 
61 int	*setvec;
62 int	*tmpset;
63 int	maxsetvec = 0;
64 
65 int	rtok;		/* next token in current re */
66 int	rlxval;
67 static uschar	*rlxstr;
68 static uschar	*prestr;	/* current position in current re */
69 static uschar	*lastre;	/* origin of last re */
70 static uschar	*lastatom;	/* origin of last Atom */
71 static uschar	*starttok;
72 static uschar 	*basestr;	/* starts with original, replaced during
73 				   repetition processing */
74 static uschar 	*firstbasestr;
75 
76 static	int setcnt;
77 static	int poscnt;
78 
79 char	*patbeg;
80 int	patlen;
81 
82 #define	NFA	20	/* cache this many dynamic fa's */
83 fa	*fatab[NFA];
84 int	nfatab	= 0;	/* entries in fatab */
85 
86 fa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
87 {
88 	int i, use, nuse;
89 	fa *pfa;
90 	static int now = 1;
91 
92 	if (setvec == NULL) {	/* first time through any RE */
93 		maxsetvec = MAXLIN;
94 		setvec = (int *) malloc(maxsetvec * sizeof(int));
95 		tmpset = (int *) malloc(maxsetvec * sizeof(int));
96 		if (setvec == NULL || tmpset == NULL)
97 			overflo("out of space initializing makedfa");
98 	}
99 
100 	if (compile_time)	/* a constant for sure */
101 		return mkdfa(s, anchor);
102 	for (i = 0; i < nfatab; i++)	/* is it there already? */
103 		if (fatab[i]->anchor == anchor
104 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
105 			fatab[i]->use = now++;
106 			return fatab[i];
107 		}
108 	pfa = mkdfa(s, anchor);
109 	if (nfatab < NFA) {	/* room for another */
110 		fatab[nfatab] = pfa;
111 		fatab[nfatab]->use = now++;
112 		nfatab++;
113 		return pfa;
114 	}
115 	use = fatab[0]->use;	/* replace least-recently used */
116 	nuse = 0;
117 	for (i = 1; i < nfatab; i++)
118 		if (fatab[i]->use < use) {
119 			use = fatab[i]->use;
120 			nuse = i;
121 		}
122 	freefa(fatab[nuse]);
123 	fatab[nuse] = pfa;
124 	pfa->use = now++;
125 	return pfa;
126 }
127 
128 fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
129 				/* anchor = 1 for anchored matches, else 0 */
130 {
131 	Node *p, *p1;
132 	fa *f;
133 
134 	firstbasestr = (uschar *) s;
135 	basestr = firstbasestr;
136 	p = reparse(s);
137 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
138 		/* put ALL STAR in front of reg.  exp. */
139 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
140 		/* put FINAL after reg.  exp. */
141 
142 	poscnt = 0;
143 	penter(p1);	/* enter parent pointers and leaf indices */
144 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
145 		overflo("out of space for fa");
146 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
147 	cfoll(f, p1);	/* set up follow sets */
148 	freetr(p1);
149 	if ((f->posns[0] = (int *) calloc(*(f->re[0].lfollow), sizeof(int))) == NULL)
150 			overflo("out of space in makedfa");
151 	if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
152 		overflo("out of space in makedfa");
153 	*f->posns[1] = 0;
154 	f->initstat = makeinit(f, anchor);
155 	f->anchor = anchor;
156 	f->restr = (uschar *) tostring(s);
157 	if (firstbasestr != basestr) {
158 		if (basestr)
159 			xfree(basestr);
160 	}
161 	return f;
162 }
163 
164 int makeinit(fa *f, int anchor)
165 {
166 	int i, k;
167 
168 	f->curstat = 2;
169 	f->out[2] = 0;
170 	f->reset = 0;
171 	k = *(f->re[0].lfollow);
172 	xfree(f->posns[2]);
173 	if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
174 		overflo("out of space in makeinit");
175 	for (i=0; i <= k; i++) {
176 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
177 	}
178 	if ((f->posns[2])[1] == f->accept)
179 		f->out[2] = 1;
180 	for (i=0; i < NCHARS; i++)
181 		f->gototab[2][i] = 0;
182 	f->curstat = cgoto(f, 2, HAT);
183 	if (anchor) {
184 		*f->posns[2] = k-1;	/* leave out position 0 */
185 		for (i=0; i < k; i++) {
186 			(f->posns[0])[i] = (f->posns[2])[i];
187 		}
188 
189 		f->out[0] = f->out[2];
190 		if (f->curstat != 2)
191 			--(*f->posns[f->curstat]);
192 	}
193 	return f->curstat;
194 }
195 
196 void penter(Node *p)	/* set up parent pointers and leaf indices */
197 {
198 	switch (type(p)) {
199 	ELEAF
200 	LEAF
201 		info(p) = poscnt;
202 		poscnt++;
203 		break;
204 	UNARY
205 		penter(left(p));
206 		parent(left(p)) = p;
207 		break;
208 	case CAT:
209 	case OR:
210 		penter(left(p));
211 		penter(right(p));
212 		parent(left(p)) = p;
213 		parent(right(p)) = p;
214 		break;
215 	default:	/* can't happen */
216 		FATAL("can't happen: unknown type %d in penter", type(p));
217 		break;
218 	}
219 }
220 
221 void freetr(Node *p)	/* free parse tree */
222 {
223 	switch (type(p)) {
224 	ELEAF
225 	LEAF
226 		xfree(p);
227 		break;
228 	UNARY
229 		freetr(left(p));
230 		xfree(p);
231 		break;
232 	case CAT:
233 	case OR:
234 		freetr(left(p));
235 		freetr(right(p));
236 		xfree(p);
237 		break;
238 	default:	/* can't happen */
239 		FATAL("can't happen: unknown type %d in freetr", type(p));
240 		break;
241 	}
242 }
243 
244 /* in the parsing of regular expressions, metacharacters like . have */
245 /* to be seen literally;  \056 is not a metacharacter. */
246 
247 int hexstr(uschar **pp)	/* find and eval hex string at pp, return new p */
248 {			/* only pick up one 8-bit byte (2 chars) */
249 	uschar *p;
250 	int n = 0;
251 	int i;
252 
253 	for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
254 		if (isdigit(*p))
255 			n = 16 * n + *p - '0';
256 		else if (*p >= 'a' && *p <= 'f')
257 			n = 16 * n + *p - 'a' + 10;
258 		else if (*p >= 'A' && *p <= 'F')
259 			n = 16 * n + *p - 'A' + 10;
260 	}
261 	*pp = (uschar *) p;
262 	return n;
263 }
264 
265 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
266 
267 int quoted(uschar **pp)	/* pick up next thing after a \\ */
268 			/* and increment *pp */
269 {
270 	uschar *p = *pp;
271 	int c;
272 
273 	if ((c = *p++) == 't')
274 		c = '\t';
275 	else if (c == 'n')
276 		c = '\n';
277 	else if (c == 'f')
278 		c = '\f';
279 	else if (c == 'r')
280 		c = '\r';
281 	else if (c == 'b')
282 		c = '\b';
283 	else if (c == '\\')
284 		c = '\\';
285 	else if (c == 'x') {	/* hexadecimal goo follows */
286 		c = hexstr(&p);	/* this adds a null if number is invalid */
287 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
288 		int n = c - '0';
289 		if (isoctdigit(*p)) {
290 			n = 8 * n + *p++ - '0';
291 			if (isoctdigit(*p))
292 				n = 8 * n + *p++ - '0';
293 		}
294 		c = n;
295 	} /* else */
296 		/* c = c; */
297 	*pp = p;
298 	return c;
299 }
300 
301 static int collate_range_cmp(int a, int b)
302 {
303 	static char s[2][2];
304 
305 	if ((uschar)a == (uschar)b)
306 		return 0;
307 	s[0][0] = a;
308 	s[1][0] = b;
309 	return (strcoll(s[0], s[1]));
310 }
311 
312 char *cclenter(const char *argp)	/* add a character class */
313 {
314 	int i, c, c2;
315 	int j;
316 	uschar *p = (uschar *) argp;
317 	uschar *op, *bp;
318 	static uschar *buf = NULL;
319 	static int bufsz = 100;
320 
321 	op = p;
322 	if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
323 		FATAL("out of space for character class [%.10s...] 1", p);
324 	bp = buf;
325 	for (i = 0; (c = *p++) != 0; ) {
326 		if (c == '\\') {
327 			c = quoted(&p);
328 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
329 			if (*p != 0) {
330 				c = bp[-1];
331 				c2 = *p++;
332 				if (c2 == '\\')
333 					c2 = quoted(&p);
334 				if (collate_range_cmp(c, c2) > 0) {
335 					bp--;
336 					i--;
337 					continue;
338 				}
339 				for (j = 0; j < NCHARS; j++) {
340 					if ((collate_range_cmp(c, j) > 0) ||
341 					    collate_range_cmp(j, c2) > 0)
342 						continue;
343 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
344 						FATAL("out of space for character class [%.10s...] 2", p);
345 					*bp++ = j;
346 					i++;
347 				}
348 				continue;
349 			}
350 		}
351 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
352 			FATAL("out of space for character class [%.10s...] 3", p);
353 		*bp++ = c;
354 		i++;
355 	}
356 	*bp = 0;
357 	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
358 	xfree(op);
359 	return (char *) tostring((char *) buf);
360 }
361 
362 void overflo(const char *s)
363 {
364 	FATAL("regular expression too big: %.30s...", s);
365 }
366 
367 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
368 {
369 	int i;
370 	int *p;
371 
372 	switch (type(v)) {
373 	ELEAF
374 	LEAF
375 		f->re[info(v)].ltype = type(v);
376 		f->re[info(v)].lval.np = right(v);
377 		while (f->accept >= maxsetvec) {	/* guessing here! */
378 			maxsetvec *= 4;
379 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
380 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
381 			if (setvec == NULL || tmpset == NULL)
382 				overflo("out of space in cfoll()");
383 		}
384 		for (i = 0; i <= f->accept; i++)
385 			setvec[i] = 0;
386 		setcnt = 0;
387 		follow(v);	/* computes setvec and setcnt */
388 		if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
389 			overflo("out of space building follow set");
390 		f->re[info(v)].lfollow = p;
391 		*p = setcnt;
392 		for (i = f->accept; i >= 0; i--)
393 			if (setvec[i] == 1)
394 				*++p = i;
395 		break;
396 	UNARY
397 		cfoll(f,left(v));
398 		break;
399 	case CAT:
400 	case OR:
401 		cfoll(f,left(v));
402 		cfoll(f,right(v));
403 		break;
404 	default:	/* can't happen */
405 		FATAL("can't happen: unknown type %d in cfoll", type(v));
406 	}
407 }
408 
409 int first(Node *p)	/* collects initially active leaves of p into setvec */
410 			/* returns 0 if p matches empty string */
411 {
412 	int b, lp;
413 
414 	switch (type(p)) {
415 	ELEAF
416 	LEAF
417 		lp = info(p);	/* look for high-water mark of subscripts */
418 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
419 			maxsetvec *= 4;
420 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
421 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
422 			if (setvec == NULL || tmpset == NULL)
423 				overflo("out of space in first()");
424 		}
425 		if (type(p) == EMPTYRE) {
426 			setvec[lp] = 0;
427 			return(0);
428 		}
429 		if (setvec[lp] != 1) {
430 			setvec[lp] = 1;
431 			setcnt++;
432 		}
433 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
434 			return(0);		/* empty CCL */
435 		else return(1);
436 	case PLUS:
437 		if (first(left(p)) == 0) return(0);
438 		return(1);
439 	case STAR:
440 	case QUEST:
441 		first(left(p));
442 		return(0);
443 	case CAT:
444 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
445 		return(1);
446 	case OR:
447 		b = first(right(p));
448 		if (first(left(p)) == 0 || b == 0) return(0);
449 		return(1);
450 	}
451 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
452 	return(-1);
453 }
454 
455 void follow(Node *v)	/* collects leaves that can follow v into setvec */
456 {
457 	Node *p;
458 
459 	if (type(v) == FINAL)
460 		return;
461 	p = parent(v);
462 	switch (type(p)) {
463 	case STAR:
464 	case PLUS:
465 		first(v);
466 		follow(p);
467 		return;
468 
469 	case OR:
470 	case QUEST:
471 		follow(p);
472 		return;
473 
474 	case CAT:
475 		if (v == left(p)) {	/* v is left child of p */
476 			if (first(right(p)) == 0) {
477 				follow(p);
478 				return;
479 			}
480 		} else		/* v is right child */
481 			follow(p);
482 		return;
483 	}
484 }
485 
486 int member(int c, const char *sarg)	/* is c in s? */
487 {
488 	uschar *s = (uschar *) sarg;
489 
490 	while (*s)
491 		if (c == *s++)
492 			return(1);
493 	return(0);
494 }
495 
496 int match(fa *f, const char *p0)	/* shortest match ? */
497 {
498 	int s, ns;
499 	uschar *p = (uschar *) p0;
500 
501 	s = f->reset ? makeinit(f,0) : f->initstat;
502 	if (f->out[s])
503 		return(1);
504 	do {
505 		/* assert(*p < NCHARS); */
506 		if ((ns = f->gototab[s][*p]) != 0)
507 			s = ns;
508 		else
509 			s = cgoto(f, s, *p);
510 		if (f->out[s])
511 			return(1);
512 	} while (*p++ != 0);
513 	return(0);
514 }
515 
516 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
517 {
518 	int s, ns;
519 	uschar *p = (uschar *) p0;
520 	uschar *q;
521 	int i, k;
522 
523 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
524 	if (f->reset) {
525 		f->initstat = s = makeinit(f,1);
526 	} else {
527 		s = f->initstat;
528 	}
529 	patbeg = (char *) p;
530 	patlen = -1;
531 	do {
532 		q = p;
533 		do {
534 			if (f->out[s])		/* final state */
535 				patlen = q-p;
536 			/* assert(*q < NCHARS); */
537 			if ((ns = f->gototab[s][*q]) != 0)
538 				s = ns;
539 			else
540 				s = cgoto(f, s, *q);
541 			if (s == 1) {	/* no transition */
542 				if (patlen >= 0) {
543 					patbeg = (char *) p;
544 					return(1);
545 				}
546 				else
547 					goto nextin;	/* no match */
548 			}
549 		} while (*q++ != 0);
550 		if (f->out[s])
551 			patlen = q-p-1;	/* don't count $ */
552 		if (patlen >= 0) {
553 			patbeg = (char *) p;
554 			return(1);
555 		}
556 	nextin:
557 		s = 2;
558 		if (f->reset) {
559 			for (i = 2; i <= f->curstat; i++)
560 				xfree(f->posns[i]);
561 			k = *f->posns[0];
562 			if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
563 				overflo("out of space in pmatch");
564 			for (i = 0; i <= k; i++)
565 				(f->posns[2])[i] = (f->posns[0])[i];
566 			f->initstat = f->curstat = 2;
567 			f->out[2] = f->out[0];
568 			for (i = 0; i < NCHARS; i++)
569 				f->gototab[2][i] = 0;
570 		}
571 	} while (*p++ != 0);
572 	return (0);
573 }
574 
575 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
576 {
577 	int s, ns;
578 	uschar *p = (uschar *) p0;
579 	uschar *q;
580 	int i, k;
581 
582 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
583 	if (f->reset) {
584 		f->initstat = s = makeinit(f,1);
585 	} else {
586 		s = f->initstat;
587 	}
588 	patlen = -1;
589 	while (*p) {
590 		q = p;
591 		do {
592 			if (f->out[s])		/* final state */
593 				patlen = q-p;
594 			/* assert(*q < NCHARS); */
595 			if ((ns = f->gototab[s][*q]) != 0)
596 				s = ns;
597 			else
598 				s = cgoto(f, s, *q);
599 			if (s == 1) {	/* no transition */
600 				if (patlen > 0) {
601 					patbeg = (char *) p;
602 					return(1);
603 				} else
604 					goto nnextin;	/* no nonempty match */
605 			}
606 		} while (*q++ != 0);
607 		if (f->out[s])
608 			patlen = q-p-1;	/* don't count $ */
609 		if (patlen > 0 ) {
610 			patbeg = (char *) p;
611 			return(1);
612 		}
613 	nnextin:
614 		s = 2;
615 		if (f->reset) {
616 			for (i = 2; i <= f->curstat; i++)
617 				xfree(f->posns[i]);
618 			k = *f->posns[0];
619 			if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
620 				overflo("out of state space");
621 			for (i = 0; i <= k; i++)
622 				(f->posns[2])[i] = (f->posns[0])[i];
623 			f->initstat = f->curstat = 2;
624 			f->out[2] = f->out[0];
625 			for (i = 0; i < NCHARS; i++)
626 				f->gototab[2][i] = 0;
627 		}
628 		p++;
629 	}
630 	return (0);
631 }
632 
633 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
634 {			/* uses relex() to scan regular expression */
635 	Node *np;
636 
637 	dprintf( ("reparse <%s>\n", p) );
638 	lastre = prestr = (uschar *) p;	/* prestr points to string to be parsed */
639 	rtok = relex();
640 	/* GNU compatibility: an empty regexp matches anything */
641 	if (rtok == '\0') {
642 		/* FATAL("empty regular expression"); previous */
643 		return(op2(EMPTYRE, NIL, NIL));
644 	}
645 	np = regexp();
646 	if (rtok != '\0')
647 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
648 	return(np);
649 }
650 
651 Node *regexp(void)	/* top-level parse of reg expr */
652 {
653 	return (alt(concat(primary())));
654 }
655 
656 Node *primary(void)
657 {
658 	Node *np;
659 	int savelastatom;
660 
661 	switch (rtok) {
662 	case CHAR:
663 		lastatom = starttok;
664 		np = op2(CHAR, NIL, itonp(rlxval));
665 		rtok = relex();
666 		return (unary(np));
667 	case ALL:
668 		rtok = relex();
669 		return (unary(op2(ALL, NIL, NIL)));
670 	case EMPTYRE:
671 		rtok = relex();
672 		return (unary(op2(EMPTYRE, NIL, NIL)));
673 	case DOT:
674 		lastatom = starttok;
675 		rtok = relex();
676 		return (unary(op2(DOT, NIL, NIL)));
677 	case CCL:
678 		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
679 		lastatom = starttok;
680 		rtok = relex();
681 		return (unary(np));
682 	case NCCL:
683 		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
684 		lastatom = starttok;
685 		rtok = relex();
686 		return (unary(np));
687 	case '^':
688 		rtok = relex();
689 		return (unary(op2(CHAR, NIL, itonp(HAT))));
690 	case '$':
691 		rtok = relex();
692 		return (unary(op2(CHAR, NIL, NIL)));
693 	case '(':
694 		lastatom = starttok;
695 		savelastatom = starttok - basestr; /* Retain over recursion */
696 		rtok = relex();
697 		if (rtok == ')') {	/* special pleading for () */
698 			rtok = relex();
699 			return unary(op2(CCL, NIL, (Node *) tostring("")));
700 		}
701 		np = regexp();
702 		if (rtok == ')') {
703 			lastatom = basestr + savelastatom; /* Restore */
704 			rtok = relex();
705 			return (unary(np));
706 		}
707 		else
708 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
709 	default:
710 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
711 	}
712 	return 0;	/*NOTREACHED*/
713 }
714 
715 Node *concat(Node *np)
716 {
717 	switch (rtok) {
718 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
719 		return (concat(op2(CAT, np, primary())));
720 	case EMPTYRE:
721 		rtok = relex();
722 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
723 				primary())));
724 	}
725 	return (np);
726 }
727 
728 Node *alt(Node *np)
729 {
730 	if (rtok == OR) {
731 		rtok = relex();
732 		return (alt(op2(OR, np, concat(primary()))));
733 	}
734 	return (np);
735 }
736 
737 Node *unary(Node *np)
738 {
739 	switch (rtok) {
740 	case STAR:
741 		rtok = relex();
742 		return (unary(op2(STAR, np, NIL)));
743 	case PLUS:
744 		rtok = relex();
745 		return (unary(op2(PLUS, np, NIL)));
746 	case QUEST:
747 		rtok = relex();
748 		return (unary(op2(QUEST, np, NIL)));
749 	default:
750 		return (np);
751 	}
752 }
753 
754 /*
755  * Character class definitions conformant to the POSIX locale as
756  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
757  * and operating character sets are both ASCII (ISO646) or supersets
758  * thereof.
759  *
760  * Note that to avoid overflowing the temporary buffer used in
761  * relex(), the expanded character class (prior to range expansion)
762  * must be less than twice the size of their full name.
763  */
764 
765 /* Because isblank doesn't show up in any of the header files on any
766  * system i use, it's defined here.  if some other locale has a richer
767  * definition of "blank", define HAS_ISBLANK and provide your own
768  * version.
769  * the parentheses here are an attempt to find a path through the maze
770  * of macro definition and/or function and/or version provided.  thanks
771  * to nelson beebe for the suggestion; let's see if it works everywhere.
772  */
773 
774 /* #define HAS_ISBLANK */
775 #ifndef HAS_ISBLANK
776 
777 int (xisblank)(int c)
778 {
779 	return c==' ' || c=='\t';
780 }
781 
782 #endif
783 
784 struct charclass {
785 	const char *cc_name;
786 	int cc_namelen;
787 	int (*cc_func)(int);
788 } charclasses[] = {
789 	{ "alnum",	5,	isalnum },
790 	{ "alpha",	5,	isalpha },
791 #ifndef HAS_ISBLANK
792 	{ "blank",	5,	xisblank },
793 #else
794 	{ "blank",	5,	isblank },
795 #endif
796 	{ "cntrl",	5,	iscntrl },
797 	{ "digit",	5,	isdigit },
798 	{ "graph",	5,	isgraph },
799 	{ "lower",	5,	islower },
800 	{ "print",	5,	isprint },
801 	{ "punct",	5,	ispunct },
802 	{ "space",	5,	isspace },
803 	{ "upper",	5,	isupper },
804 	{ "xdigit",	6,	isxdigit },
805 	{ NULL,		0,	NULL },
806 };
807 
808 #define REPEAT_SIMPLE		0
809 #define REPEAT_PLUS_APPENDED	1
810 #define REPEAT_WITH_Q		2
811 #define REPEAT_ZERO		3
812 
813 static int
814 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
815 	       int atomlen, int firstnum, int secondnum, int special_case)
816 {
817 	int i, j;
818 	uschar *buf = 0;
819 	int ret = 1;
820 	int init_q = (firstnum==0);		/* first added char will be ? */
821 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
822 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
823 	int suffix_length = strlen((char *) reptok) - reptoklen;	/* string after rep specifier	*/
824 	int size = prefix_length +  suffix_length;
825 
826 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
827 		size += atomlen*(firstnum-1);
828 	}
829 
830 	/* Adjust size of buffer for special cases */
831 	if (special_case == REPEAT_PLUS_APPENDED) {
832 		size++;		/* for the final + */
833 	} else if (special_case == REPEAT_WITH_Q) {
834 		size += init_q + (atomlen+1)* n_q_reps;
835 	} else if (special_case == REPEAT_ZERO) {
836 		size += 2;	/* just a null ERE: () */
837 	}
838 	if ((buf = (uschar *) malloc(size+1)) == NULL)
839 		FATAL("out of space in reg expr %.10s..", lastre);
840 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
841 	j = prefix_length;
842 	if (special_case == REPEAT_ZERO) {
843 		j -= atomlen;
844 		buf[j++] = '(';
845 		buf[j++] = ')';
846 	}
847 	for (i=1; i < firstnum; i++) {		/* copy x reps 	*/
848 		memcpy(&buf[j], atom, atomlen);
849 		j += atomlen;
850 	}
851 	if (special_case == REPEAT_PLUS_APPENDED) {
852 		buf[j++] = '+';
853 	} else if (special_case == REPEAT_WITH_Q) {
854 		if (init_q) buf[j++] = '?';
855 		for (i=0; i < n_q_reps; i++) {	/* copy x? reps */
856 			memcpy(&buf[j], atom, atomlen);
857 			j += atomlen;
858 			buf[j++] = '?';
859 		}
860 	}
861 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
862 	if (special_case == REPEAT_ZERO) {
863 		buf[j+suffix_length] = '\0';
864 	} else {
865 		buf[size] = '\0';
866 	}
867 	/* free old basestr */
868 	if (firstbasestr != basestr) {
869 		if (basestr)
870 			xfree(basestr);
871 	}
872 	basestr = buf;
873 	prestr  = buf + prefix_length;
874 	if (special_case == REPEAT_ZERO) {
875 		prestr  -= atomlen;
876 		ret++;
877 	}
878 	return ret;
879 }
880 
881 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
882 		  int atomlen, int firstnum, int secondnum)
883 {
884 	/*
885 	   In general, the repetition specifier or "bound" is replaced here
886 	   by an equivalent ERE string, repeating the immediately previous atom
887 	   and appending ? and + as needed. Note that the first copy of the
888 	   atom is left in place, except in the special_case of a zero-repeat
889 	   (i.e., {0}).
890 	 */
891 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
892 		if (firstnum < 2) {
893 			/* 0 or 1: should be handled before you get here */
894 			FATAL("internal error");
895 		} else {
896 			return replace_repeat(reptok, reptoklen, atom, atomlen,
897 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
898 		}
899 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
900 		if (firstnum == 0) {	/* {0} or {0,0} */
901 			/* This case is unusual because the resulting
902 			   replacement string might actually be SMALLER than
903 			   the original ERE */
904 			return replace_repeat(reptok, reptoklen, atom, atomlen,
905 					firstnum, secondnum, REPEAT_ZERO);
906 		} else {		/* (firstnum >= 1) */
907 			return replace_repeat(reptok, reptoklen, atom, atomlen,
908 					firstnum, secondnum, REPEAT_SIMPLE);
909 		}
910 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
911 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
912 		return replace_repeat(reptok, reptoklen, atom, atomlen,
913 					firstnum, secondnum, REPEAT_WITH_Q);
914 	} else {	/* Error - shouldn't be here (n>m) */
915 		FATAL("internal error");
916 	}
917 	return 0;
918 }
919 
920 int relex(void)		/* lexical analyzer for reparse */
921 {
922 	int c, n;
923 	int cflag;
924 	static uschar *buf = NULL;
925 	static int bufsz = 100;
926 	uschar *bp;
927 	struct charclass *cc;
928 	int i;
929 	int num, m, commafound, digitfound;
930 	const uschar *startreptok;
931 
932 rescan:
933 	starttok = prestr;
934 
935 	switch (c = *prestr++) {
936 	case '|': return OR;
937 	case '*': return STAR;
938 	case '+': return PLUS;
939 	case '?': return QUEST;
940 	case '.': return DOT;
941 	case '\0': prestr--; return '\0';
942 	case '^':
943 	case '$':
944 	case '(':
945 	case ')':
946 		return c;
947 	case '\\':
948 		rlxval = quoted(&prestr);
949 		return CHAR;
950 	default:
951 		rlxval = c;
952 		return CHAR;
953 	case '[':
954 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
955 			FATAL("out of space in reg expr %.10s..", lastre);
956 		bp = buf;
957 		if (*prestr == '^') {
958 			cflag = 1;
959 			prestr++;
960 		}
961 		else
962 			cflag = 0;
963 		n = 2 * strlen((const char *) prestr)+1;
964 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
965 			FATAL("out of space for reg expr %.10s...", lastre);
966 		for (; ; ) {
967 			if ((c = *prestr++) == '\\') {
968 				*bp++ = '\\';
969 				if ((c = *prestr++) == '\0')
970 					FATAL("nonterminated character class %.20s...", lastre);
971 				*bp++ = c;
972 			/* } else if (c == '\n') { */
973 			/* 	FATAL("newline in character class %.20s...", lastre); */
974 			} else if (c == '[' && *prestr == ':') {
975 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
976 				for (cc = charclasses; cc->cc_name; cc++)
977 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
978 						break;
979 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
980 				    prestr[2 + cc->cc_namelen] == ']') {
981 					prestr += cc->cc_namelen + 3;
982 					/*
983 					 * BUG: We begin at 1, instead of 0, since we
984 					 * would otherwise prematurely terminate the
985 					 * string for classes like [[:cntrl:]]. This
986 					 * means that we can't match the NUL character,
987 					 * not without first adapting the entire
988 					 * program to track each string's length.
989 					 */
990 					for (i = 1; i <= UCHAR_MAX; i++) {
991 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
992 						    FATAL("out of space for reg expr %.10s...", lastre);
993 						if (cc->cc_func(i)) {
994 							*bp++ = i;
995 							n++;
996 						}
997 					}
998 				} else
999 					*bp++ = c;
1000 			} else if (c == '[' && *prestr == '.') {
1001 				char collate_char;
1002 				prestr++;
1003 				collate_char = *prestr++;
1004 				if (*prestr == '.' && prestr[1] == ']') {
1005 					prestr += 2;
1006 					/* Found it: map via locale TBD: for
1007 					   now, simply return this char.  This
1008 					   is sufficient to pass conformance
1009 					   test awk.ex 156
1010 					 */
1011 					if (*prestr == ']') {
1012 						prestr++;
1013 						rlxval = collate_char;
1014 						return CHAR;
1015 					}
1016 				}
1017 			} else if (c == '[' && *prestr == '=') {
1018 				char equiv_char;
1019 				prestr++;
1020 				equiv_char = *prestr++;
1021 				if (*prestr == '=' && prestr[1] == ']') {
1022 					prestr += 2;
1023 					/* Found it: map via locale TBD: for now
1024 					   simply return this char. This is
1025 					   sufficient to pass conformance test
1026 					   awk.ex 156
1027 					 */
1028 					if (*prestr == ']') {
1029 						prestr++;
1030 						rlxval = equiv_char;
1031 						return CHAR;
1032 					}
1033 				}
1034 			} else if (c == '\0') {
1035 				FATAL("nonterminated character class %.20s", lastre);
1036 			} else if (bp == buf) {	/* 1st char is special */
1037 				*bp++ = c;
1038 			} else if (c == ']') {
1039 				*bp++ = 0;
1040 				rlxstr = (uschar *) tostring((char *) buf);
1041 				if (cflag == 0)
1042 					return CCL;
1043 				else
1044 					return NCCL;
1045 			} else
1046 				*bp++ = c;
1047 		}
1048 		break;
1049 	case '{':
1050 		if (isdigit(*(prestr))) {
1051 			num = 0;	/* Process as a repetition */
1052 			n = -1; m = -1;
1053 			commafound = 0;
1054 			digitfound = 0;
1055 			startreptok = prestr-1;
1056 			/* Remember start of previous atom here ? */
1057 		} else {        	/* just a { char, not a repetition */
1058 			rlxval = c;
1059 			return CHAR;
1060                 }
1061 		for (; ; ) {
1062 			if ((c = *prestr++) == '}') {
1063 				if (commafound) {
1064 					if (digitfound) { /* {n,m} */
1065 						m = num;
1066 						if (m<n)
1067 							FATAL("illegal repetition expression: class %.20s",
1068 								lastre);
1069 						if ((n==0) && (m==1)) {
1070 							return QUEST;
1071 						}
1072 					} else {	/* {n,} */
1073 						if (n==0) return STAR;
1074 						if (n==1) return PLUS;
1075 					}
1076 				} else {
1077 					if (digitfound) { /* {n} same as {n,n} */
1078 						n = num;
1079 						m = num;
1080 					} else {	/* {} */
1081 						FATAL("illegal repetition expression: class %.20s",
1082 							lastre);
1083 					}
1084 				}
1085 				if (repeat(starttok, prestr-starttok, lastatom,
1086 					   startreptok - lastatom, n, m) > 0) {
1087 					if ((n==0) && (m==0)) {
1088 						return EMPTYRE;
1089 					}
1090 					/* must rescan input for next token */
1091 					goto rescan;
1092 				}
1093 				/* Failed to replace: eat up {...} characters
1094 				   and treat like just PLUS */
1095 				return PLUS;
1096 			} else if (c == '\0') {
1097 				FATAL("nonterminated character class %.20s",
1098 					lastre);
1099 			} else if (isdigit(c)) {
1100 				num = 10 * num + c - '0';
1101 				digitfound = 1;
1102 			} else if (c == ',') {
1103 				if (commafound)
1104 					FATAL("illegal repetition expression: class %.20s",
1105 						lastre);
1106 				/* looking for {n,} or {n,m} */
1107 				commafound = 1;
1108 				n = num;
1109 				digitfound = 0; /* reset */
1110 				num = 0;
1111 			} else {
1112 				FATAL("illegal repetition expression: class %.20s",
1113 					lastre);
1114 			}
1115 		}
1116 		break;
1117 	}
1118 }
1119 
1120 int cgoto(fa *f, int s, int c)
1121 {
1122 	int i, j, k;
1123 	int *p, *q;
1124 
1125 	assert(c == HAT || c < NCHARS);
1126 	while (f->accept >= maxsetvec) {	/* guessing here! */
1127 		maxsetvec *= 4;
1128 		setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
1129 		tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
1130 		if (setvec == NULL || tmpset == NULL)
1131 			overflo("out of space in cgoto()");
1132 	}
1133 	for (i = 0; i <= f->accept; i++)
1134 		setvec[i] = 0;
1135 	setcnt = 0;
1136 	/* compute positions of gototab[s,c] into setvec */
1137 	p = f->posns[s];
1138 	for (i = 1; i <= *p; i++) {
1139 		if ((k = f->re[p[i]].ltype) != FINAL) {
1140 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1141 			 || (k == DOT && c != 0 && c != HAT)
1142 			 || (k == ALL && c != 0)
1143 			 || (k == EMPTYRE && c != 0)
1144 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1145 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1146 				q = f->re[p[i]].lfollow;
1147 				for (j = 1; j <= *q; j++) {
1148 					if (q[j] >= maxsetvec) {
1149 						maxsetvec *= 4;
1150 						setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
1151 						tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
1152 						if (setvec == NULL || tmpset == NULL)
1153 							overflo("cgoto overflow");
1154 					}
1155 					if (setvec[q[j]] == 0) {
1156 						setcnt++;
1157 						setvec[q[j]] = 1;
1158 					}
1159 				}
1160 			}
1161 		}
1162 	}
1163 	/* determine if setvec is a previous state */
1164 	tmpset[0] = setcnt;
1165 	j = 1;
1166 	for (i = f->accept; i >= 0; i--)
1167 		if (setvec[i]) {
1168 			tmpset[j++] = i;
1169 		}
1170 	/* tmpset == previous state? */
1171 	for (i = 1; i <= f->curstat; i++) {
1172 		p = f->posns[i];
1173 		if ((k = tmpset[0]) != p[0])
1174 			goto different;
1175 		for (j = 1; j <= k; j++)
1176 			if (tmpset[j] != p[j])
1177 				goto different;
1178 		/* setvec is state i */
1179 		f->gototab[s][c] = i;
1180 		return i;
1181 	  different:;
1182 	}
1183 
1184 	/* add tmpset to current set of states */
1185 	if (f->curstat >= NSTATES-1) {
1186 		f->curstat = 2;
1187 		f->reset = 1;
1188 		for (i = 2; i < NSTATES; i++)
1189 			xfree(f->posns[i]);
1190 	} else
1191 		++(f->curstat);
1192 	for (i = 0; i < NCHARS; i++)
1193 		f->gototab[f->curstat][i] = 0;
1194 	xfree(f->posns[f->curstat]);
1195 	if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
1196 		overflo("out of space in cgoto");
1197 
1198 	f->posns[f->curstat] = p;
1199 	f->gototab[s][c] = f->curstat;
1200 	for (i = 0; i <= setcnt; i++)
1201 		p[i] = tmpset[i];
1202 	if (setvec[f->accept])
1203 		f->out[f->curstat] = 1;
1204 	else
1205 		f->out[f->curstat] = 0;
1206 	return f->curstat;
1207 }
1208 
1209 
1210 void freefa(fa *f)	/* free a finite automaton */
1211 {
1212 	int i;
1213 
1214 	if (f == NULL)
1215 		return;
1216 	for (i = 0; i <= f->curstat; i++)
1217 		xfree(f->posns[i]);
1218 	for (i = 0; i <= f->accept; i++) {
1219 		xfree(f->re[i].lfollow);
1220 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1221 			xfree((f->re[i].lval.np));
1222 	}
1223 	xfree(f->restr);
1224 	xfree(f);
1225 }
1226