xref: /freebsd/contrib/one-true-awk/b.c (revision d056fa046c6a91b90cd98165face0e42a33a5173)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'entrate. */
26 
27 #define	DEBUG
28 
29 #include <ctype.h>
30 #include <stdio.h>
31 #include <string.h>
32 #include <stdlib.h>
33 #include "awk.h"
34 #include "ytab.h"
35 
36 #define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
37 				/* NCHARS is 2**n */
38 #define MAXLIN 22
39 
40 #define type(v)		(v)->nobj	/* badly overloaded here */
41 #define info(v)		(v)->ntype	/* badly overloaded here */
42 #define left(v)		(v)->narg[0]
43 #define right(v)	(v)->narg[1]
44 #define parent(v)	(v)->nnext
45 
46 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
47 #define UNARY	case STAR: case PLUS: case QUEST:
48 
49 /* encoding in tree Nodes:
50 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
51 		left is index, right contains value or pointer to value
52 	unary (STAR, PLUS, QUEST): left is child, right is null
53 	binary (CAT, OR): left and right are children
54 	parent contains pointer to parent
55 */
56 
57 
58 int	*setvec;
59 int	*tmpset;
60 int	maxsetvec = 0;
61 
62 int	rtok;		/* next token in current re */
63 int	rlxval;
64 static uschar	*rlxstr;
65 static uschar	*prestr;	/* current position in current re */
66 static uschar	*lastre;	/* origin of last re */
67 
68 static	int setcnt;
69 static	int poscnt;
70 
71 char	*patbeg;
72 int	patlen;
73 
74 #define	NFA	20	/* cache this many dynamic fa's */
75 fa	*fatab[NFA];
76 int	nfatab	= 0;	/* entries in fatab */
77 
78 fa *makedfa(const char *s, int anchor)	/* returns dfa for reg expr s */
79 {
80 	int i, use, nuse;
81 	fa *pfa;
82 	static int now = 1;
83 
84 	if (setvec == 0) {	/* first time through any RE */
85 		maxsetvec = MAXLIN;
86 		setvec = (int *) malloc(maxsetvec * sizeof(int));
87 		tmpset = (int *) malloc(maxsetvec * sizeof(int));
88 		if (setvec == 0 || tmpset == 0)
89 			overflo("out of space initializing makedfa");
90 	}
91 
92 	if (compile_time)	/* a constant for sure */
93 		return mkdfa(s, anchor);
94 	for (i = 0; i < nfatab; i++)	/* is it there already? */
95 		if (fatab[i]->anchor == anchor
96 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
97 			fatab[i]->use = now++;
98 			return fatab[i];
99 		}
100 	pfa = mkdfa(s, anchor);
101 	if (nfatab < NFA) {	/* room for another */
102 		fatab[nfatab] = pfa;
103 		fatab[nfatab]->use = now++;
104 		nfatab++;
105 		return pfa;
106 	}
107 	use = fatab[0]->use;	/* replace least-recently used */
108 	nuse = 0;
109 	for (i = 1; i < nfatab; i++)
110 		if (fatab[i]->use < use) {
111 			use = fatab[i]->use;
112 			nuse = i;
113 		}
114 	freefa(fatab[nuse]);
115 	fatab[nuse] = pfa;
116 	pfa->use = now++;
117 	return pfa;
118 }
119 
120 fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
121 				/* anchor = 1 for anchored matches, else 0 */
122 {
123 	Node *p, *p1;
124 	fa *f;
125 
126 	p = reparse(s);
127 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
128 		/* put ALL STAR in front of reg.  exp. */
129 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
130 		/* put FINAL after reg.  exp. */
131 
132 	poscnt = 0;
133 	penter(p1);	/* enter parent pointers and leaf indices */
134 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
135 		overflo("out of space for fa");
136 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
137 	cfoll(f, p1);	/* set up follow sets */
138 	freetr(p1);
139 	if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
140 			overflo("out of space in makedfa");
141 	if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
142 		overflo("out of space in makedfa");
143 	*f->posns[1] = 0;
144 	f->initstat = makeinit(f, anchor);
145 	f->anchor = anchor;
146 	f->restr = (uschar *) tostring(s);
147 	return f;
148 }
149 
150 int makeinit(fa *f, int anchor)
151 {
152 	int i, k;
153 
154 	f->curstat = 2;
155 	f->out[2] = 0;
156 	f->reset = 0;
157 	k = *(f->re[0].lfollow);
158 	xfree(f->posns[2]);
159 	if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
160 		overflo("out of space in makeinit");
161 	for (i=0; i <= k; i++) {
162 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
163 	}
164 	if ((f->posns[2])[1] == f->accept)
165 		f->out[2] = 1;
166 	for (i=0; i < NCHARS; i++)
167 		f->gototab[2][i] = 0;
168 	f->curstat = cgoto(f, 2, HAT);
169 	if (anchor) {
170 		*f->posns[2] = k-1;	/* leave out position 0 */
171 		for (i=0; i < k; i++) {
172 			(f->posns[0])[i] = (f->posns[2])[i];
173 		}
174 
175 		f->out[0] = f->out[2];
176 		if (f->curstat != 2)
177 			--(*f->posns[f->curstat]);
178 	}
179 	return f->curstat;
180 }
181 
182 void penter(Node *p)	/* set up parent pointers and leaf indices */
183 {
184 	switch (type(p)) {
185 	LEAF
186 		info(p) = poscnt;
187 		poscnt++;
188 		break;
189 	UNARY
190 		penter(left(p));
191 		parent(left(p)) = p;
192 		break;
193 	case CAT:
194 	case OR:
195 		penter(left(p));
196 		penter(right(p));
197 		parent(left(p)) = p;
198 		parent(right(p)) = p;
199 		break;
200 	default:	/* can't happen */
201 		FATAL("can't happen: unknown type %d in penter", type(p));
202 		break;
203 	}
204 }
205 
206 void freetr(Node *p)	/* free parse tree */
207 {
208 	switch (type(p)) {
209 	LEAF
210 		xfree(p);
211 		break;
212 	UNARY
213 		freetr(left(p));
214 		xfree(p);
215 		break;
216 	case CAT:
217 	case OR:
218 		freetr(left(p));
219 		freetr(right(p));
220 		xfree(p);
221 		break;
222 	default:	/* can't happen */
223 		FATAL("can't happen: unknown type %d in freetr", type(p));
224 		break;
225 	}
226 }
227 
228 /* in the parsing of regular expressions, metacharacters like . have */
229 /* to be seen literally;  \056 is not a metacharacter. */
230 
231 int hexstr(char **pp)	/* find and eval hex string at pp, return new p */
232 {			/* only pick up one 8-bit byte (2 chars) */
233 	uschar *p;
234 	int n = 0;
235 	int i;
236 
237 	for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
238 		if (isdigit(*p))
239 			n = 16 * n + *p - '0';
240 		else if (*p >= 'a' && *p <= 'f')
241 			n = 16 * n + *p - 'a' + 10;
242 		else if (*p >= 'A' && *p <= 'F')
243 			n = 16 * n + *p - 'A' + 10;
244 	}
245 	*pp = (char *) p;
246 	return n;
247 }
248 
249 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
250 
251 int quoted(char **pp)	/* pick up next thing after a \\ */
252 			/* and increment *pp */
253 {
254 	char *p = *pp;
255 	int c;
256 
257 	if ((c = *p++) == 't')
258 		c = '\t';
259 	else if (c == 'n')
260 		c = '\n';
261 	else if (c == 'f')
262 		c = '\f';
263 	else if (c == 'r')
264 		c = '\r';
265 	else if (c == 'b')
266 		c = '\b';
267 	else if (c == '\\')
268 		c = '\\';
269 	else if (c == 'x') {	/* hexadecimal goo follows */
270 		c = hexstr(&p);	/* this adds a null if number is invalid */
271 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
272 		int n = c - '0';
273 		if (isoctdigit(*p)) {
274 			n = 8 * n + *p++ - '0';
275 			if (isoctdigit(*p))
276 				n = 8 * n + *p++ - '0';
277 		}
278 		c = n;
279 	} /* else */
280 		/* c = c; */
281 	*pp = p;
282 	return c;
283 }
284 
285 char *cclenter(const char *argp)	/* add a character class */
286 {
287 	int i, c, c2;
288 	uschar *p = (uschar *) argp;
289 	uschar *op, *bp;
290 	static uschar *buf = 0;
291 	static int bufsz = 100;
292 
293 	op = p;
294 	if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
295 		FATAL("out of space for character class [%.10s...] 1", p);
296 	bp = buf;
297 	for (i = 0; (c = *p++) != 0; ) {
298 		if (c == '\\') {
299 			c = quoted((char **) &p);
300 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
301 			if (*p != 0) {
302 				c = bp[-1];
303 				c2 = *p++;
304 				if (c2 == '\\')
305 					c2 = quoted((char **) &p);
306 				if (c > c2) {	/* empty; ignore */
307 					bp--;
308 					i--;
309 					continue;
310 				}
311 				while (c < c2) {
312 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
313 						FATAL("out of space for character class [%.10s...] 2", p);
314 					*bp++ = ++c;
315 					i++;
316 				}
317 				continue;
318 			}
319 		}
320 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
321 			FATAL("out of space for character class [%.10s...] 3", p);
322 		*bp++ = c;
323 		i++;
324 	}
325 	*bp = 0;
326 	dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
327 	xfree(op);
328 	return (char *) tostring((char *) buf);
329 }
330 
331 void overflo(const char *s)
332 {
333 	FATAL("regular expression too big: %.30s...", s);
334 }
335 
336 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
337 {
338 	int i;
339 	int *p;
340 
341 	switch (type(v)) {
342 	LEAF
343 		f->re[info(v)].ltype = type(v);
344 		f->re[info(v)].lval.np = right(v);
345 		while (f->accept >= maxsetvec) {	/* guessing here! */
346 			maxsetvec *= 4;
347 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
348 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
349 			if (setvec == 0 || tmpset == 0)
350 				overflo("out of space in cfoll()");
351 		}
352 		for (i = 0; i <= f->accept; i++)
353 			setvec[i] = 0;
354 		setcnt = 0;
355 		follow(v);	/* computes setvec and setcnt */
356 		if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
357 			overflo("out of space building follow set");
358 		f->re[info(v)].lfollow = p;
359 		*p = setcnt;
360 		for (i = f->accept; i >= 0; i--)
361 			if (setvec[i] == 1)
362 				*++p = i;
363 		break;
364 	UNARY
365 		cfoll(f,left(v));
366 		break;
367 	case CAT:
368 	case OR:
369 		cfoll(f,left(v));
370 		cfoll(f,right(v));
371 		break;
372 	default:	/* can't happen */
373 		FATAL("can't happen: unknown type %d in cfoll", type(v));
374 	}
375 }
376 
377 int first(Node *p)	/* collects initially active leaves of p into setvec */
378 			/* returns 1 if p matches empty string */
379 {
380 	int b, lp;
381 
382 	switch (type(p)) {
383 	LEAF
384 		lp = info(p);	/* look for high-water mark of subscripts */
385 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
386 			maxsetvec *= 4;
387 			setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
388 			tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
389 			if (setvec == 0 || tmpset == 0)
390 				overflo("out of space in first()");
391 		}
392 		if (setvec[lp] != 1) {
393 			setvec[lp] = 1;
394 			setcnt++;
395 		}
396 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
397 			return(0);		/* empty CCL */
398 		else return(1);
399 	case PLUS:
400 		if (first(left(p)) == 0) return(0);
401 		return(1);
402 	case STAR:
403 	case QUEST:
404 		first(left(p));
405 		return(0);
406 	case CAT:
407 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
408 		return(1);
409 	case OR:
410 		b = first(right(p));
411 		if (first(left(p)) == 0 || b == 0) return(0);
412 		return(1);
413 	}
414 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
415 	return(-1);
416 }
417 
418 void follow(Node *v)	/* collects leaves that can follow v into setvec */
419 {
420 	Node *p;
421 
422 	if (type(v) == FINAL)
423 		return;
424 	p = parent(v);
425 	switch (type(p)) {
426 	case STAR:
427 	case PLUS:
428 		first(v);
429 		follow(p);
430 		return;
431 
432 	case OR:
433 	case QUEST:
434 		follow(p);
435 		return;
436 
437 	case CAT:
438 		if (v == left(p)) {	/* v is left child of p */
439 			if (first(right(p)) == 0) {
440 				follow(p);
441 				return;
442 			}
443 		} else		/* v is right child */
444 			follow(p);
445 		return;
446 	}
447 }
448 
449 int member(int c, const char *sarg)	/* is c in s? */
450 {
451 	uschar *s = (uschar *) sarg;
452 
453 	while (*s)
454 		if (c == *s++)
455 			return(1);
456 	return(0);
457 }
458 
459 int match(fa *f, const char *p0)	/* shortest match ? */
460 {
461 	int s, ns;
462 	uschar *p = (uschar *) p0;
463 
464 	s = f->reset ? makeinit(f,0) : f->initstat;
465 	if (f->out[s])
466 		return(1);
467 	do {
468 		assert(*p < NCHARS);
469 		if ((ns = f->gototab[s][*p]) != 0)
470 			s = ns;
471 		else
472 			s = cgoto(f, s, *p);
473 		if (f->out[s])
474 			return(1);
475 	} while (*p++ != 0);
476 	return(0);
477 }
478 
479 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
480 {
481 	int s, ns;
482 	uschar *p = (uschar *) p0;
483 	uschar *q;
484 	int i, k;
485 
486 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
487 	if (f->reset) {
488 		f->initstat = s = makeinit(f,1);
489 	} else {
490 		s = f->initstat;
491 	}
492 	patbeg = (char *) p;
493 	patlen = -1;
494 	do {
495 		q = p;
496 		do {
497 			if (f->out[s])		/* final state */
498 				patlen = q-p;
499 			assert(*q < NCHARS);
500 			if ((ns = f->gototab[s][*q]) != 0)
501 				s = ns;
502 			else
503 				s = cgoto(f, s, *q);
504 			if (s == 1) {	/* no transition */
505 				if (patlen >= 0) {
506 					patbeg = (char *) p;
507 					return(1);
508 				}
509 				else
510 					goto nextin;	/* no match */
511 			}
512 		} while (*q++ != 0);
513 		if (f->out[s])
514 			patlen = q-p-1;	/* don't count $ */
515 		if (patlen >= 0) {
516 			patbeg = (char *) p;
517 			return(1);
518 		}
519 	nextin:
520 		s = 2;
521 		if (f->reset) {
522 			for (i = 2; i <= f->curstat; i++)
523 				xfree(f->posns[i]);
524 			k = *f->posns[0];
525 			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
526 				overflo("out of space in pmatch");
527 			for (i = 0; i <= k; i++)
528 				(f->posns[2])[i] = (f->posns[0])[i];
529 			f->initstat = f->curstat = 2;
530 			f->out[2] = f->out[0];
531 			for (i = 0; i < NCHARS; i++)
532 				f->gototab[2][i] = 0;
533 		}
534 	} while (*p++ != 0);
535 	return (0);
536 }
537 
538 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
539 {
540 	int s, ns;
541 	uschar *p = (uschar *) p0;
542 	uschar *q;
543 	int i, k;
544 
545 	/* s = f->reset ? makeinit(f,1) : f->initstat; */
546 	if (f->reset) {
547 		f->initstat = s = makeinit(f,1);
548 	} else {
549 		s = f->initstat;
550 	}
551 	patlen = -1;
552 	while (*p) {
553 		q = p;
554 		do {
555 			if (f->out[s])		/* final state */
556 				patlen = q-p;
557 			assert(*q < NCHARS);
558 			if ((ns = f->gototab[s][*q]) != 0)
559 				s = ns;
560 			else
561 				s = cgoto(f, s, *q);
562 			if (s == 1) {	/* no transition */
563 				if (patlen > 0) {
564 					patbeg = (char *) p;
565 					return(1);
566 				} else
567 					goto nnextin;	/* no nonempty match */
568 			}
569 		} while (*q++ != 0);
570 		if (f->out[s])
571 			patlen = q-p-1;	/* don't count $ */
572 		if (patlen > 0 ) {
573 			patbeg = (char *) p;
574 			return(1);
575 		}
576 	nnextin:
577 		s = 2;
578 		if (f->reset) {
579 			for (i = 2; i <= f->curstat; i++)
580 				xfree(f->posns[i]);
581 			k = *f->posns[0];
582 			if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
583 				overflo("out of state space");
584 			for (i = 0; i <= k; i++)
585 				(f->posns[2])[i] = (f->posns[0])[i];
586 			f->initstat = f->curstat = 2;
587 			f->out[2] = f->out[0];
588 			for (i = 0; i < NCHARS; i++)
589 				f->gototab[2][i] = 0;
590 		}
591 		p++;
592 	}
593 	return (0);
594 }
595 
596 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
597 {			/* uses relex() to scan regular expression */
598 	Node *np;
599 
600 	dprintf( ("reparse <%s>\n", p) );
601 	lastre = prestr = (uschar *) p;	/* prestr points to string to be parsed */
602 	rtok = relex();
603 	/* GNU compatibility: an empty regexp matches anything */
604 	if (rtok == '\0')
605 		/* FATAL("empty regular expression"); previous */
606 		return(op2(ALL, NIL, NIL));
607 	np = regexp();
608 	if (rtok != '\0')
609 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
610 	return(np);
611 }
612 
613 Node *regexp(void)	/* top-level parse of reg expr */
614 {
615 	return (alt(concat(primary())));
616 }
617 
618 Node *primary(void)
619 {
620 	Node *np;
621 
622 	switch (rtok) {
623 	case CHAR:
624 		np = op2(CHAR, NIL, itonp(rlxval));
625 		rtok = relex();
626 		return (unary(np));
627 	case ALL:
628 		rtok = relex();
629 		return (unary(op2(ALL, NIL, NIL)));
630 	case DOT:
631 		rtok = relex();
632 		return (unary(op2(DOT, NIL, NIL)));
633 	case CCL:
634 		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
635 		rtok = relex();
636 		return (unary(np));
637 	case NCCL:
638 		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
639 		rtok = relex();
640 		return (unary(np));
641 	case '^':
642 		rtok = relex();
643 		return (unary(op2(CHAR, NIL, itonp(HAT))));
644 	case '$':
645 		rtok = relex();
646 		return (unary(op2(CHAR, NIL, NIL)));
647 	case '(':
648 		rtok = relex();
649 		if (rtok == ')') {	/* special pleading for () */
650 			rtok = relex();
651 			return unary(op2(CCL, NIL, (Node *) tostring("")));
652 		}
653 		np = regexp();
654 		if (rtok == ')') {
655 			rtok = relex();
656 			return (unary(np));
657 		}
658 		else
659 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
660 	default:
661 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
662 	}
663 	return 0;	/*NOTREACHED*/
664 }
665 
666 Node *concat(Node *np)
667 {
668 	switch (rtok) {
669 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
670 		return (concat(op2(CAT, np, primary())));
671 	}
672 	return (np);
673 }
674 
675 Node *alt(Node *np)
676 {
677 	if (rtok == OR) {
678 		rtok = relex();
679 		return (alt(op2(OR, np, concat(primary()))));
680 	}
681 	return (np);
682 }
683 
684 Node *unary(Node *np)
685 {
686 	switch (rtok) {
687 	case STAR:
688 		rtok = relex();
689 		return (unary(op2(STAR, np, NIL)));
690 	case PLUS:
691 		rtok = relex();
692 		return (unary(op2(PLUS, np, NIL)));
693 	case QUEST:
694 		rtok = relex();
695 		return (unary(op2(QUEST, np, NIL)));
696 	default:
697 		return (np);
698 	}
699 }
700 
701 /*
702  * Character class definitions conformant to the POSIX locale as
703  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
704  * and operating character sets are both ASCII (ISO646) or supersets
705  * thereof.
706  *
707  * Note that to avoid overflowing the temporary buffer used in
708  * relex(), the expanded character class (prior to range expansion)
709  * must be less than twice the size of their full name.
710  */
711 
712 /* Because isblank doesn't show up in any of the header files on any
713  * system i use, it's defined here.  if some other locale has a richer
714  * definition of "blank", define HAS_ISBLANK and provide your own
715  * version.
716  * the parentheses here are an attempt to find a path through the maze
717  * of macro definition and/or function and/or version provided.  thanks
718  * to nelson beebe for the suggestion; let's see if it works everywhere.
719  */
720 
721 #ifndef HAS_ISBLANK
722 
723 int (isblank)(int c)
724 {
725 	return c==' ' || c=='\t';
726 }
727 
728 #endif
729 
730 struct charclass {
731 	const char *cc_name;
732 	int cc_namelen;
733 	int (*cc_func)(int);
734 } charclasses[] = {
735 	{ "alnum",	5,	isalnum },
736 	{ "alpha",	5,	isalpha },
737 	{ "blank",	5,	isblank },
738 	{ "cntrl",	5,	iscntrl },
739 	{ "digit",	5,	isdigit },
740 	{ "graph",	5,	isgraph },
741 	{ "lower",	5,	islower },
742 	{ "print",	5,	isprint },
743 	{ "punct",	5,	ispunct },
744 	{ "space",	5,	isspace },
745 	{ "upper",	5,	isupper },
746 	{ "xdigit",	6,	isxdigit },
747 	{ NULL,		0,	NULL },
748 };
749 
750 
751 int relex(void)		/* lexical analyzer for reparse */
752 {
753 	int c, n;
754 	int cflag;
755 	static uschar *buf = 0;
756 	static int bufsz = 100;
757 	uschar *bp;
758 	struct charclass *cc;
759 	int i;
760 
761 	switch (c = *prestr++) {
762 	case '|': return OR;
763 	case '*': return STAR;
764 	case '+': return PLUS;
765 	case '?': return QUEST;
766 	case '.': return DOT;
767 	case '\0': prestr--; return '\0';
768 	case '^':
769 	case '$':
770 	case '(':
771 	case ')':
772 		return c;
773 	case '\\':
774 		rlxval = quoted((char **) &prestr);
775 		return CHAR;
776 	default:
777 		rlxval = c;
778 		return CHAR;
779 	case '[':
780 		if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
781 			FATAL("out of space in reg expr %.10s..", lastre);
782 		bp = buf;
783 		if (*prestr == '^') {
784 			cflag = 1;
785 			prestr++;
786 		}
787 		else
788 			cflag = 0;
789 		n = 2 * strlen((const char *) prestr)+1;
790 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, 0))
791 			FATAL("out of space for reg expr %.10s...", lastre);
792 		for (; ; ) {
793 			if ((c = *prestr++) == '\\') {
794 				*bp++ = '\\';
795 				if ((c = *prestr++) == '\0')
796 					FATAL("nonterminated character class %.20s...", lastre);
797 				*bp++ = c;
798 			/* } else if (c == '\n') { */
799 			/* 	FATAL("newline in character class %.20s...", lastre); */
800 			} else if (c == '[' && *prestr == ':') {
801 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
802 				for (cc = charclasses; cc->cc_name; cc++)
803 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
804 						break;
805 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
806 				    prestr[2 + cc->cc_namelen] == ']') {
807 					prestr += cc->cc_namelen + 3;
808 					for (i = 0; i < NCHARS; i++) {
809 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, 0))
810 						    FATAL("out of space for reg expr %.10s...", lastre);
811 						if (cc->cc_func(i)) {
812 							*bp++ = i;
813 							n++;
814 						}
815 					}
816 				} else
817 					*bp++ = c;
818 			} else if (c == '\0') {
819 				FATAL("nonterminated character class %.20s", lastre);
820 			} else if (bp == buf) {	/* 1st char is special */
821 				*bp++ = c;
822 			} else if (c == ']') {
823 				*bp++ = 0;
824 				rlxstr = (uschar *) tostring((char *) buf);
825 				if (cflag == 0)
826 					return CCL;
827 				else
828 					return NCCL;
829 			} else
830 				*bp++ = c;
831 		}
832 	}
833 }
834 
835 int cgoto(fa *f, int s, int c)
836 {
837 	int i, j, k;
838 	int *p, *q;
839 
840 	assert(c == HAT || c < NCHARS);
841 	while (f->accept >= maxsetvec) {	/* guessing here! */
842 		maxsetvec *= 4;
843 		setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
844 		tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
845 		if (setvec == 0 || tmpset == 0)
846 			overflo("out of space in cgoto()");
847 	}
848 	for (i = 0; i <= f->accept; i++)
849 		setvec[i] = 0;
850 	setcnt = 0;
851 	/* compute positions of gototab[s,c] into setvec */
852 	p = f->posns[s];
853 	for (i = 1; i <= *p; i++) {
854 		if ((k = f->re[p[i]].ltype) != FINAL) {
855 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
856 			 || (k == DOT && c != 0 && c != HAT)
857 			 || (k == ALL && c != 0)
858 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
859 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
860 				q = f->re[p[i]].lfollow;
861 				for (j = 1; j <= *q; j++) {
862 					if (q[j] >= maxsetvec) {
863 						maxsetvec *= 4;
864 						setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
865 						tmpset = (int *) realloc(setvec, maxsetvec * sizeof(int));
866 						if (setvec == 0 || tmpset == 0)
867 							overflo("cgoto overflow");
868 					}
869 					if (setvec[q[j]] == 0) {
870 						setcnt++;
871 						setvec[q[j]] = 1;
872 					}
873 				}
874 			}
875 		}
876 	}
877 	/* determine if setvec is a previous state */
878 	tmpset[0] = setcnt;
879 	j = 1;
880 	for (i = f->accept; i >= 0; i--)
881 		if (setvec[i]) {
882 			tmpset[j++] = i;
883 		}
884 	/* tmpset == previous state? */
885 	for (i = 1; i <= f->curstat; i++) {
886 		p = f->posns[i];
887 		if ((k = tmpset[0]) != p[0])
888 			goto different;
889 		for (j = 1; j <= k; j++)
890 			if (tmpset[j] != p[j])
891 				goto different;
892 		/* setvec is state i */
893 		f->gototab[s][c] = i;
894 		return i;
895 	  different:;
896 	}
897 
898 	/* add tmpset to current set of states */
899 	if (f->curstat >= NSTATES-1) {
900 		f->curstat = 2;
901 		f->reset = 1;
902 		for (i = 2; i < NSTATES; i++)
903 			xfree(f->posns[i]);
904 	} else
905 		++(f->curstat);
906 	for (i = 0; i < NCHARS; i++)
907 		f->gototab[f->curstat][i] = 0;
908 	xfree(f->posns[f->curstat]);
909 	if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
910 		overflo("out of space in cgoto");
911 
912 	f->posns[f->curstat] = p;
913 	f->gototab[s][c] = f->curstat;
914 	for (i = 0; i <= setcnt; i++)
915 		p[i] = tmpset[i];
916 	if (setvec[f->accept])
917 		f->out[f->curstat] = 1;
918 	else
919 		f->out[f->curstat] = 0;
920 	return f->curstat;
921 }
922 
923 
924 void freefa(fa *f)	/* free a finite automaton */
925 {
926 	int i;
927 
928 	if (f == NULL)
929 		return;
930 	for (i = 0; i <= f->curstat; i++)
931 		xfree(f->posns[i]);
932 	for (i = 0; i <= f->accept; i++) {
933 		xfree(f->re[i].lfollow);
934 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
935 			xfree((f->re[i].lval.np));
936 	}
937 	xfree(f->restr);
938 	xfree(f);
939 }
940