xref: /freebsd/contrib/one-true-awk/b.c (revision b78ee15e9f04ae15c3e1200df974473167524d17)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #define	DEBUG
31 
32 #include <ctype.h>
33 #include <stdio.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include "awk.h"
37 #include "ytab.h"
38 
39 #define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
40 				/* NCHARS is 2**n */
41 #define MAXLIN 22
42 
43 #define type(v)		(v)->nobj	/* badly overloaded here */
44 #define info(v)		(v)->ntype	/* badly overloaded here */
45 #define left(v)		(v)->narg[0]
46 #define right(v)	(v)->narg[1]
47 #define parent(v)	(v)->nnext
48 
49 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
50 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
51 #define UNARY	case STAR: case PLUS: case QUEST:
52 
53 /* encoding in tree Nodes:
54 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
55 		left is index, right contains value or pointer to value
56 	unary (STAR, PLUS, QUEST): left is child, right is null
57 	binary (CAT, OR): left and right are children
58 	parent contains pointer to parent
59 */
60 
61 
62 int	*setvec;
63 int	*tmpset;
64 int	maxsetvec = 0;
65 
66 int	rtok;		/* next token in current re */
67 int	rlxval;
68 static uschar	*rlxstr;
69 static uschar	*prestr;	/* current position in current re */
70 static uschar	*lastre;	/* origin of last re */
71 
72 static	int setcnt;
73 static	int poscnt;
74 
75 char	*patbeg;
76 int	patlen;
77 
78 #define	NFA	20	/* cache this many dynamic fa's */
79 fa	*fatab[NFA];
80 int	nfatab	= 0;	/* entries in fatab */
81 
82 fa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
83 {
84 	int i, use, nuse;
85 	fa *pfa;
86 	static int now = 1;
87 
88 	if (setvec == 0) {	/* first time through any RE */
89 		maxsetvec = MAXLIN;
90 		setvec = (int *) malloc(maxsetvec * sizeof(int));
91 		tmpset = (int *) malloc(maxsetvec * sizeof(int));
92 		if (setvec == 0 || tmpset == 0)
93 			overflo("out of space initializing makedfa");
94 	}
95 
96 	if (compile_time)	/* a constant for sure */
97 		return mkdfa(s, anchor);
98 	for (i = 0; i < nfatab; i++)	/* is it there already? */
99 		if (fatab[i]->anchor == anchor
100 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
101 			fatab[i]->use = now++;
102 			return fatab[i];
103 		}
104 	pfa = mkdfa(s, anchor);
105 	if (nfatab < NFA) {	/* room for another */
106 		fatab[nfatab] = pfa;
107 		fatab[nfatab]->use = now++;
108 		nfatab++;
109 		return pfa;
110 	}
111 	use = fatab[0]->use;	/* replace least-recently used */
112 	nuse = 0;
113 	for (i = 1; i < nfatab; i++)
114 		if (fatab[i]->use < use) {
115 			use = fatab[i]->use;
116 			nuse = i;
117 		}
118 	freefa(fatab[nuse]);
119 	fatab[nuse] = pfa;
120 	pfa->use = now++;
121 	return pfa;
122 }
123 
124 fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
125 				/* anchor = 1 for anchored matches, else 0 */
126 {
127 	Node *p, *p1;
128 	fa *f;
129 
130 	p = reparse(s);
131 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
132 		/* put ALL STAR in front of reg.  exp. */
133 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
134 		/* put FINAL after reg.  exp. */
135 
136 	poscnt = 0;
137 	penter(p1);	/* enter parent pointers and leaf indices */
138 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
139 		overflo("out of space for fa");
140 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
141 	cfoll(f, p1);	/* set up follow sets */
142 	freetr(p1);
143 	if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
144 			overflo("out of space in makedfa");
145 	if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
146 		overflo("out of space in makedfa");
147 	*f->posns[1] = 0;
148 	f->initstat = makeinit(f, anchor);
149 	f->anchor = anchor;
150 	f->restr = (uschar *) tostring(s);
151 	return f;
152 }
153 
154 int makeinit(fa *f, int anchor)
155 {
156 	int i, k;
157 
158 	f->curstat = 2;
159 	f->out[2] = 0;
160 	f->reset = 0;
161 	k = *(f->re[0].lfollow);
162 	xfree(f->posns[2]);
163 	if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
164 		overflo("out of space in makeinit");
165 	for (i=0; i <= k; i++) {
166 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
167 	}
168 	if ((f->posns[2])[1] == f->accept)
169 		f->out[2] = 1;
170 	for (i=0; i < NCHARS; i++)
171 		f->gototab[2][i] = 0;
172 	f->curstat = cgoto(f, 2, HAT);
173 	if (anchor) {
174 		*f->posns[2] = k-1;	/* leave out position 0 */
175 		for (i=0; i < k; i++) {
176 			(f->posns[0])[i] = (f->posns[2])[i];
177 		}
178 
179 		f->out[0] = f->out[2];
180 		if (f->curstat != 2)
181 			--(*f->posns[f->curstat]);
182 	}
183 	return f->curstat;
184 }
185 
186 void penter(Node *p)	/* set up parent pointers and leaf indices */
187 {
188 	switch (type(p)) {
189 	ELEAF
190 	LEAF
191 		info(p) = poscnt;
192 		poscnt++;
193 		break;
194 	UNARY
195 		penter(left(p));
196 		parent(left(p)) = p;
197 		break;
198 	case CAT:
199 	case OR:
200 		penter(left(p));
201 		penter(right(p));
202 		parent(left(p)) = p;
203 		parent(right(p)) = p;
204 		break;
205 	default:	/* can't happen */
206 		FATAL("can't happen: unknown type %d in penter", type(p));
207 		break;
208 	}
209 }
210 
211 void freetr(Node *p)	/* free parse tree */
212 {
213 	switch (type(p)) {
214 	ELEAF
215 	LEAF
216 		xfree(p);
217 		break;
218 	UNARY
219 		freetr(left(p));
220 		xfree(p);
221 		break;
222 	case CAT:
223 	case OR:
224 		freetr(left(p));
225 		freetr(right(p));
226 		xfree(p);
227 		break;
228 	default:	/* can't happen */
229 		FATAL("can't happen: unknown type %d in freetr", type(p));
230 		break;
231 	}
232 }
233 
234 /* in the parsing of regular expressions, metacharacters like . have */
235 /* to be seen literally;  \056 is not a metacharacter. */
236 
237 int hexstr(uschar **pp)	/* find and eval hex string at pp, return new p */
238 {			/* only pick up one 8-bit byte (2 chars) */
239 	uschar *p;
240 	int n = 0;
241 	int i;
242 
243 	for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
244 		if (isdigit(*p))
245 			n = 16 * n + *p - '0';
246 		else if (*p >= 'a' && *p <= 'f')
247 			n = 16 * n + *p - 'a' + 10;
248 		else if (*p >= 'A' && *p <= 'F')
249 			n = 16 * n + *p - 'A' + 10;
250 	}
251 	*pp = (uschar *) p;
252 	return n;
253 }
254 
255 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
256 
257 int quoted(uschar **pp)	/* pick up next thing after a \\ */
258 			/* and increment *pp */
259 {
260 	uschar *p = *pp;
261 	int c;
262 
263 	if ((c = *p++) == 't')
264 		c = '\t';
265 	else if (c == 'n')
266 		c = '\n';
267 	else if (c == 'f')
268 		c = '\f';
269 	else if (c == 'r')
270 		c = '\r';
271 	else if (c == 'b')
272 		c = '\b';
273 	else if (c == '\\')
274 		c = '\\';
275 	else if (c == 'x') {	/* hexadecimal goo follows */
276 		c = hexstr(&p);	/* this adds a null if number is invalid */
277 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
278 		int n = c - '0';
279 		if (isoctdigit(*p)) {
280 			n = 8 * n + *p++ - '0';
281 			if (isoctdigit(*p))
282 				n = 8 * n + *p++ - '0';
283 		}
284 		c = n;
285 	} /* else */
286 		/* c = c; */
287 	*pp = p;
288 	return c;
289 }
290 
291 static int collate_range_cmp(int a, int b)
292 {
293 	static char s[2][2];
294 
295 	if ((uschar)a == (uschar)b)
296 		return 0;
297 	s[0][0] = a;
298 	s[1][0] = b;
299 	return (strcoll(s[0], s[1]));
300 }
301 
302 char *cclenter(const char *argp)	/* add a character class */
303 {
304 	int i, c, c2;
305 	int j;
306 	uschar *p = (uschar *) argp;
307 	uschar *op, *bp;
308 	static uschar *buf = 0;
309 	static int bufsz = 100;
310 
311 	op = p;
312 	if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
313 		FATAL("out of space for character class [%.10s...] 1", p);
314 	bp = buf;
315 	for (i = 0; (c = *p++) != 0; ) {
316 		if (c == '\\') {
317 			c = quoted(&p);
318 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
319 			if (*p != 0) {
320 				c = bp[-1];
321 				c2 = *p++;
322 				if (c2 == '\\')
323 					c2 = quoted(&p);
324 				if (collate_range_cmp(c, c2) > 0) {
325 					bp--;
326 					i--;
327 					continue;
328 				}
329 				for (j = 0; j < NCHARS; j++) {
330 					if ((collate_range_cmp(c, j) > 0) ||
331 					    collate_range_cmp(j, c2) > 0)
332 						continue;
333 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
334 						FATAL("out of space for character class [%.10s...] 2", p);
335 					*bp++ = j;
336 					i++;
337 				}
338 				continue;
339 			}
340 		}
341 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
342 			FATAL("out of space for character class [%.10s...] 3", p);
343 		*bp++ = c;
344 		i++;
345 	}
346 	*bp = 0;
347 	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
348 	xfree(op);
349 	return (char *) tostring((char *) buf);
350 }
351 
352 void overflo(const char *s)
353 {
354 	FATAL("regular expression too big: %.30s...", s);
355 }
356 
357 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
358 {
359 	int i;
360 	int *p;
361 
362 	switch (type(v)) {
363 	ELEAF
364 	LEAF
365 		f->re[info(v)].ltype = type(v);
366 		f->re[info(v)].lval.np = right(v);
367 		while (f->accept >= maxsetvec) {	/* guessing here! */
368 			maxsetvec *= 4;
369 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
370 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
371 			if (setvec == 0 || tmpset == 0)
372 				overflo("out of space in cfoll()");
373 		}
374 		for (i = 0; i <= f->accept; i++)
375 			setvec[i] = 0;
376 		setcnt = 0;
377 		follow(v);	/* computes setvec and setcnt */
378 		if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
379 			overflo("out of space building follow set");
380 		f->re[info(v)].lfollow = p;
381 		*p = setcnt;
382 		for (i = f->accept; i >= 0; i--)
383 			if (setvec[i] == 1)
384 				*++p = i;
385 		break;
386 	UNARY
387 		cfoll(f,left(v));
388 		break;
389 	case CAT:
390 	case OR:
391 		cfoll(f,left(v));
392 		cfoll(f,right(v));
393 		break;
394 	default:	/* can't happen */
395 		FATAL("can't happen: unknown type %d in cfoll", type(v));
396 	}
397 }
398 
399 int first(Node *p)	/* collects initially active leaves of p into setvec */
400 			/* returns 0 if p matches empty string */
401 {
402 	int b, lp;
403 
404 	switch (type(p)) {
405 	ELEAF
406 	LEAF
407 		lp = info(p);	/* look for high-water mark of subscripts */
408 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
409 			maxsetvec *= 4;
410 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
411 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
412 			if (setvec == 0 || tmpset == 0)
413 				overflo("out of space in first()");
414 		}
415 		if (type(p) == EMPTYRE) {
416 			setvec[lp] = 0;
417 			return(0);
418 		}
419 		if (setvec[lp] != 1) {
420 			setvec[lp] = 1;
421 			setcnt++;
422 		}
423 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
424 			return(0);		/* empty CCL */
425 		else return(1);
426 	case PLUS:
427 		if (first(left(p)) == 0) return(0);
428 		return(1);
429 	case STAR:
430 	case QUEST:
431 		first(left(p));
432 		return(0);
433 	case CAT:
434 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
435 		return(1);
436 	case OR:
437 		b = first(right(p));
438 		if (first(left(p)) == 0 || b == 0) return(0);
439 		return(1);
440 	}
441 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
442 	return(-1);
443 }
444 
445 void follow(Node *v)	/* collects leaves that can follow v into setvec */
446 {
447 	Node *p;
448 
449 	if (type(v) == FINAL)
450 		return;
451 	p = parent(v);
452 	switch (type(p)) {
453 	case STAR:
454 	case PLUS:
455 		first(v);
456 		follow(p);
457 		return;
458 
459 	case OR:
460 	case QUEST:
461 		follow(p);
462 		return;
463 
464 	case CAT:
465 		if (v == left(p)) {	/* v is left child of p */
466 			if (first(right(p)) == 0) {
467 				follow(p);
468 				return;
469 			}
470 		} else		/* v is right child */
471 			follow(p);
472 		return;
473 	}
474 }
475 
476 int member(int c, const char *sarg)	/* is c in s? */
477 {
478 	uschar *s = (uschar *) sarg;
479 
480 	while (*s)
481 		if (c == *s++)
482 			return(1);
483 	return(0);
484 }
485 
486 int match(fa *f, const char *p0)	/* shortest match ? */
487 {
488 	int s, ns;
489 	uschar *p = (uschar *) p0;
490 
491 	s = f->reset ? makeinit(f,0) : f->initstat;
492 	if (f->out[s])
493 		return(1);
494 	do {
495 		/* assert(*p < NCHARS); */
496 		if ((ns = f->gototab[s][*p]) != 0)
497 			s = ns;
498 		else
499 			s = cgoto(f, s, *p);
500 		if (f->out[s])
501 			return(1);
502 	} while (*p++ != 0);
503 	return(0);
504 }
505 
506 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
507 {
508 	int s, ns;
509 	uschar *p = (uschar *) p0;
510 	uschar *q;
511 	int i, k;
512 
513 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
514 	if (f->reset) {
515 		f->initstat = s = makeinit(f,1);
516 	} else {
517 		s = f->initstat;
518 	}
519 	patbeg = (char *) p;
520 	patlen = -1;
521 	do {
522 		q = p;
523 		do {
524 			if (f->out[s])		/* final state */
525 				patlen = q-p;
526 			/* assert(*q < NCHARS); */
527 			if ((ns = f->gototab[s][*q]) != 0)
528 				s = ns;
529 			else
530 				s = cgoto(f, s, *q);
531 			if (s == 1) {	/* no transition */
532 				if (patlen >= 0) {
533 					patbeg = (char *) p;
534 					return(1);
535 				}
536 				else
537 					goto nextin;	/* no match */
538 			}
539 		} while (*q++ != 0);
540 		if (f->out[s])
541 			patlen = q-p-1;	/* don't count $ */
542 		if (patlen >= 0) {
543 			patbeg = (char *) p;
544 			return(1);
545 		}
546 	nextin:
547 		s = 2;
548 		if (f->reset) {
549 			for (i = 2; i <= f->curstat; i++)
550 				xfree(f->posns[i]);
551 			k = *f->posns[0];
552 			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
553 				overflo("out of space in pmatch");
554 			for (i = 0; i <= k; i++)
555 				(f->posns[2])[i] = (f->posns[0])[i];
556 			f->initstat = f->curstat = 2;
557 			f->out[2] = f->out[0];
558 			for (i = 0; i < NCHARS; i++)
559 				f->gototab[2][i] = 0;
560 		}
561 	} while (*p++ != 0);
562 	return (0);
563 }
564 
565 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
566 {
567 	int s, ns;
568 	uschar *p = (uschar *) p0;
569 	uschar *q;
570 	int i, k;
571 
572 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
573 	if (f->reset) {
574 		f->initstat = s = makeinit(f,1);
575 	} else {
576 		s = f->initstat;
577 	}
578 	patlen = -1;
579 	while (*p) {
580 		q = p;
581 		do {
582 			if (f->out[s])		/* final state */
583 				patlen = q-p;
584 			/* assert(*q < NCHARS); */
585 			if ((ns = f->gototab[s][*q]) != 0)
586 				s = ns;
587 			else
588 				s = cgoto(f, s, *q);
589 			if (s == 1) {	/* no transition */
590 				if (patlen > 0) {
591 					patbeg = (char *) p;
592 					return(1);
593 				} else
594 					goto nnextin;	/* no nonempty match */
595 			}
596 		} while (*q++ != 0);
597 		if (f->out[s])
598 			patlen = q-p-1;	/* don't count $ */
599 		if (patlen > 0 ) {
600 			patbeg = (char *) p;
601 			return(1);
602 		}
603 	nnextin:
604 		s = 2;
605 		if (f->reset) {
606 			for (i = 2; i <= f->curstat; i++)
607 				xfree(f->posns[i]);
608 			k = *f->posns[0];
609 			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
610 				overflo("out of state space");
611 			for (i = 0; i <= k; i++)
612 				(f->posns[2])[i] = (f->posns[0])[i];
613 			f->initstat = f->curstat = 2;
614 			f->out[2] = f->out[0];
615 			for (i = 0; i < NCHARS; i++)
616 				f->gototab[2][i] = 0;
617 		}
618 		p++;
619 	}
620 	return (0);
621 }
622 
623 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
624 {			/* uses relex() to scan regular expression */
625 	Node *np;
626 
627 	dprintf( ("reparse <%s>\n", p) );
628 	lastre = prestr = (uschar *) p;	/* prestr points to string to be parsed */
629 	rtok = relex();
630 	/* GNU compatibility: an empty regexp matches anything */
631 	if (rtok == '\0') {
632 		/* FATAL("empty regular expression"); previous */
633 		return(op2(EMPTYRE, NIL, NIL));
634 	}
635 	np = regexp();
636 	if (rtok != '\0')
637 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
638 	return(np);
639 }
640 
641 Node *regexp(void)	/* top-level parse of reg expr */
642 {
643 	return (alt(concat(primary())));
644 }
645 
646 Node *primary(void)
647 {
648 	Node *np;
649 
650 	switch (rtok) {
651 	case CHAR:
652 		np = op2(CHAR, NIL, itonp(rlxval));
653 		rtok = relex();
654 		return (unary(np));
655 	case ALL:
656 		rtok = relex();
657 		return (unary(op2(ALL, NIL, NIL)));
658 	case EMPTYRE:
659 		rtok = relex();
660 		return (unary(op2(ALL, NIL, NIL)));
661 	case DOT:
662 		rtok = relex();
663 		return (unary(op2(DOT, NIL, NIL)));
664 	case CCL:
665 		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
666 		rtok = relex();
667 		return (unary(np));
668 	case NCCL:
669 		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
670 		rtok = relex();
671 		return (unary(np));
672 	case '^':
673 		rtok = relex();
674 		return (unary(op2(CHAR, NIL, itonp(HAT))));
675 	case '$':
676 		rtok = relex();
677 		return (unary(op2(CHAR, NIL, NIL)));
678 	case '(':
679 		rtok = relex();
680 		if (rtok == ')') {	/* special pleading for () */
681 			rtok = relex();
682 			return unary(op2(CCL, NIL, (Node *) tostring("")));
683 		}
684 		np = regexp();
685 		if (rtok == ')') {
686 			rtok = relex();
687 			return (unary(np));
688 		}
689 		else
690 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
691 	default:
692 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
693 	}
694 	return 0;	/*NOTREACHED*/
695 }
696 
697 Node *concat(Node *np)
698 {
699 	switch (rtok) {
700 	case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
701 		return (concat(op2(CAT, np, primary())));
702 	}
703 	return (np);
704 }
705 
706 Node *alt(Node *np)
707 {
708 	if (rtok == OR) {
709 		rtok = relex();
710 		return (alt(op2(OR, np, concat(primary()))));
711 	}
712 	return (np);
713 }
714 
715 Node *unary(Node *np)
716 {
717 	switch (rtok) {
718 	case STAR:
719 		rtok = relex();
720 		return (unary(op2(STAR, np, NIL)));
721 	case PLUS:
722 		rtok = relex();
723 		return (unary(op2(PLUS, np, NIL)));
724 	case QUEST:
725 		rtok = relex();
726 		return (unary(op2(QUEST, np, NIL)));
727 	default:
728 		return (np);
729 	}
730 }
731 
732 /*
733  * Character class definitions conformant to the POSIX locale as
734  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
735  * and operating character sets are both ASCII (ISO646) or supersets
736  * thereof.
737  *
738  * Note that to avoid overflowing the temporary buffer used in
739  * relex(), the expanded character class (prior to range expansion)
740  * must be less than twice the size of their full name.
741  */
742 
743 /* Because isblank doesn't show up in any of the header files on any
744  * system i use, it's defined here.  if some other locale has a richer
745  * definition of "blank", define HAS_ISBLANK and provide your own
746  * version.
747  * the parentheses here are an attempt to find a path through the maze
748  * of macro definition and/or function and/or version provided.  thanks
749  * to nelson beebe for the suggestion; let's see if it works everywhere.
750  */
751 
752 /* #define HAS_ISBLANK */
753 #ifndef HAS_ISBLANK
754 
755 int (xisblank)(int c)
756 {
757 	return c==' ' || c=='\t';
758 }
759 
760 #endif
761 
762 struct charclass {
763 	const char *cc_name;
764 	int cc_namelen;
765 	int (*cc_func)(int);
766 } charclasses[] = {
767 	{ "alnum",	5,	isalnum },
768 	{ "alpha",	5,	isalpha },
769 #ifndef HAS_ISBLANK
770 	{ "blank",	5,	isspace }, /* was isblank */
771 #else
772 	{ "blank",	5,	isblank },
773 #endif
774 	{ "cntrl",	5,	iscntrl },
775 	{ "digit",	5,	isdigit },
776 	{ "graph",	5,	isgraph },
777 	{ "lower",	5,	islower },
778 	{ "print",	5,	isprint },
779 	{ "punct",	5,	ispunct },
780 	{ "space",	5,	isspace },
781 	{ "upper",	5,	isupper },
782 	{ "xdigit",	6,	isxdigit },
783 	{ NULL,		0,	NULL },
784 };
785 
786 
787 int relex(void)		/* lexical analyzer for reparse */
788 {
789 	int c, n;
790 	int cflag;
791 	static uschar *buf = 0;
792 	static int bufsz = 100;
793 	uschar *bp;
794 	struct charclass *cc;
795 	int i;
796 
797 	switch (c = *prestr++) {
798 	case '|': return OR;
799 	case '*': return STAR;
800 	case '+': return PLUS;
801 	case '?': return QUEST;
802 	case '.': return DOT;
803 	case '\0': prestr--; return '\0';
804 	case '^':
805 	case '$':
806 	case '(':
807 	case ')':
808 		return c;
809 	case '\\':
810 		rlxval = quoted(&prestr);
811 		return CHAR;
812 	default:
813 		rlxval = c;
814 		return CHAR;
815 	case '[':
816 		if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
817 			FATAL("out of space in reg expr %.10s..", lastre);
818 		bp = buf;
819 		if (*prestr == '^') {
820 			cflag = 1;
821 			prestr++;
822 		}
823 		else
824 			cflag = 0;
825 		n = 2 * strlen((const char *) prestr)+1;
826 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
827 			FATAL("out of space for reg expr %.10s...", lastre);
828 		for (; ; ) {
829 			if ((c = *prestr++) == '\\') {
830 				*bp++ = '\\';
831 				if ((c = *prestr++) == '\0')
832 					FATAL("nonterminated character class %.20s...", lastre);
833 				*bp++ = c;
834 			/* } else if (c == '\n') { */
835 			/* 	FATAL("newline in character class %.20s...", lastre); */
836 			} else if (c == '[' && *prestr == ':') {
837 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
838 				for (cc = charclasses; cc->cc_name; cc++)
839 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
840 						break;
841 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
842 				    prestr[2 + cc->cc_namelen] == ']') {
843 					prestr += cc->cc_namelen + 3;
844 					for (i = 0; i < NCHARS; i++) {
845 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
846 						    FATAL("out of space for reg expr %.10s...", lastre);
847 						if (cc->cc_func(i)) {
848 							*bp++ = i;
849 							n++;
850 						}
851 					}
852 				} else
853 					*bp++ = c;
854 			} else if (c == '\0') {
855 				FATAL("nonterminated character class %.20s", lastre);
856 			} else if (bp == buf) {	/* 1st char is special */
857 				*bp++ = c;
858 			} else if (c == ']') {
859 				*bp++ = 0;
860 				rlxstr = (uschar *) tostring((char *) buf);
861 				if (cflag == 0)
862 					return CCL;
863 				else
864 					return NCCL;
865 			} else
866 				*bp++ = c;
867 		}
868 	}
869 }
870 
871 int cgoto(fa *f, int s, int c)
872 {
873 	int i, j, k;
874 	int *p, *q;
875 
876 	assert(c == HAT || c < NCHARS);
877 	while (f->accept >= maxsetvec) {	/* guessing here! */
878 		maxsetvec *= 4;
879 		setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
880 		tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
881 		if (setvec == 0 || tmpset == 0)
882 			overflo("out of space in cgoto()");
883 	}
884 	for (i = 0; i <= f->accept; i++)
885 		setvec[i] = 0;
886 	setcnt = 0;
887 	/* compute positions of gototab[s,c] into setvec */
888 	p = f->posns[s];
889 	for (i = 1; i <= *p; i++) {
890 		if ((k = f->re[p[i]].ltype) != FINAL) {
891 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
892 			 || (k == DOT && c != 0 && c != HAT)
893 			 || (k == ALL && c != 0)
894 			 || (k == EMPTYRE && c != 0)
895 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
896 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
897 				q = f->re[p[i]].lfollow;
898 				for (j = 1; j <= *q; j++) {
899 					if (q[j] >= maxsetvec) {
900 						maxsetvec *= 4;
901 						setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
902 						tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
903 						if (setvec == 0 || tmpset == 0)
904 							overflo("cgoto overflow");
905 					}
906 					if (setvec[q[j]] == 0) {
907 						setcnt++;
908 						setvec[q[j]] = 1;
909 					}
910 				}
911 			}
912 		}
913 	}
914 	/* determine if setvec is a previous state */
915 	tmpset[0] = setcnt;
916 	j = 1;
917 	for (i = f->accept; i >= 0; i--)
918 		if (setvec[i]) {
919 			tmpset[j++] = i;
920 		}
921 	/* tmpset == previous state? */
922 	for (i = 1; i <= f->curstat; i++) {
923 		p = f->posns[i];
924 		if ((k = tmpset[0]) != p[0])
925 			goto different;
926 		for (j = 1; j <= k; j++)
927 			if (tmpset[j] != p[j])
928 				goto different;
929 		/* setvec is state i */
930 		f->gototab[s][c] = i;
931 		return i;
932 	  different:;
933 	}
934 
935 	/* add tmpset to current set of states */
936 	if (f->curstat >= NSTATES-1) {
937 		f->curstat = 2;
938 		f->reset = 1;
939 		for (i = 2; i < NSTATES; i++)
940 			xfree(f->posns[i]);
941 	} else
942 		++(f->curstat);
943 	for (i = 0; i < NCHARS; i++)
944 		f->gototab[f->curstat][i] = 0;
945 	xfree(f->posns[f->curstat]);
946 	if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
947 		overflo("out of space in cgoto");
948 
949 	f->posns[f->curstat] = p;
950 	f->gototab[s][c] = f->curstat;
951 	for (i = 0; i <= setcnt; i++)
952 		p[i] = tmpset[i];
953 	if (setvec[f->accept])
954 		f->out[f->curstat] = 1;
955 	else
956 		f->out[f->curstat] = 0;
957 	return f->curstat;
958 }
959 
960 
961 void freefa(fa *f)	/* free a finite automaton */
962 {
963 	int i;
964 
965 	if (f == NULL)
966 		return;
967 	for (i = 0; i <= f->curstat; i++)
968 		xfree(f->posns[i]);
969 	for (i = 0; i <= f->accept; i++) {
970 		xfree(f->re[i].lfollow);
971 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
972 			xfree((f->re[i].lval.np));
973 	}
974 	xfree(f->restr);
975 	xfree(f);
976 }
977