xref: /freebsd/contrib/one-true-awk/b.c (revision 8881d206f4e68b564c2c5f50fc717086fc3e827a)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #define	DEBUG
28 
29 #include <ctype.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include "awk.h"
35 #include "awkgram.tab.h"
36 
37 #define MAXLIN 22
38 
39 #define type(v)		(v)->nobj	/* badly overloaded here */
40 #define info(v)		(v)->ntype	/* badly overloaded here */
41 #define left(v)		(v)->narg[0]
42 #define right(v)	(v)->narg[1]
43 #define parent(v)	(v)->nnext
44 
45 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
47 #define UNARY	case STAR: case PLUS: case QUEST:
48 
49 /* encoding in tree Nodes:
50 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 		left is index, right contains value or pointer to value
52 	unary (STAR, PLUS, QUEST): left is child, right is null
53 	binary (CAT, OR): left and right are children
54 	parent contains pointer to parent
55 */
56 
57 
58 int	*setvec;
59 int	*tmpset;
60 int	maxsetvec = 0;
61 
62 int	rtok;		/* next token in current re */
63 int	rlxval;
64 static const uschar	*rlxstr;
65 static const uschar	*prestr;	/* current position in current re */
66 static const uschar	*lastre;	/* origin of last re */
67 static const uschar	*lastatom;	/* origin of last Atom */
68 static const uschar	*starttok;
69 static const uschar 	*basestr;	/* starts with original, replaced during
70 				   repetition processing */
71 static const uschar 	*firstbasestr;
72 
73 static	int setcnt;
74 static	int poscnt;
75 
76 const char	*patbeg;
77 int	patlen;
78 
79 #define	NFA	128	/* cache this many dynamic fa's */
80 fa	*fatab[NFA];
81 int	nfatab	= 0;	/* entries in fatab */
82 
83 static int *
84 intalloc(size_t n, const char *f)
85 {
86 	int *p = (int *) calloc(n, sizeof(int));
87 	if (p == NULL)
88 		overflo(f);
89 	return p;
90 }
91 
92 static void
93 resizesetvec(const char *f)
94 {
95 	if (maxsetvec == 0)
96 		maxsetvec = MAXLIN;
97 	else
98 		maxsetvec *= 4;
99 	setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
100 	tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
101 	if (setvec == NULL || tmpset == NULL)
102 		overflo(f);
103 }
104 
105 static void
106 resize_state(fa *f, int state)
107 {
108 	unsigned int **p;
109 	uschar *p2;
110 	int **p3;
111 	int i, new_count;
112 
113 	if (++state < f->state_count)
114 		return;
115 
116 	new_count = state + 10; /* needs to be tuned */
117 
118 	p = (unsigned int **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
119 	if (p == NULL)
120 		goto out;
121 	f->gototab = p;
122 
123 	p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
124 	if (p2 == NULL)
125 		goto out;
126 	f->out = p2;
127 
128 	p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
129 	if (p3 == NULL)
130 		goto out;
131 	f->posns = p3;
132 
133 	for (i = f->state_count; i < new_count; ++i) {
134 		f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
135 		if (f->gototab[i] == NULL)
136 			goto out;
137 		f->out[i]  = 0;
138 		f->posns[i] = NULL;
139 	}
140 	f->state_count = new_count;
141 	return;
142 out:
143 	overflo(__func__);
144 }
145 
146 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
147 {
148 	int i, use, nuse;
149 	fa *pfa;
150 	static int now = 1;
151 
152 	if (setvec == NULL) {	/* first time through any RE */
153 		resizesetvec(__func__);
154 	}
155 
156 	if (compile_time != RUNNING)	/* a constant for sure */
157 		return mkdfa(s, anchor);
158 	for (i = 0; i < nfatab; i++)	/* is it there already? */
159 		if (fatab[i]->anchor == anchor
160 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
161 			fatab[i]->use = now++;
162 			return fatab[i];
163 		}
164 	pfa = mkdfa(s, anchor);
165 	if (nfatab < NFA) {	/* room for another */
166 		fatab[nfatab] = pfa;
167 		fatab[nfatab]->use = now++;
168 		nfatab++;
169 		return pfa;
170 	}
171 	use = fatab[0]->use;	/* replace least-recently used */
172 	nuse = 0;
173 	for (i = 1; i < nfatab; i++)
174 		if (fatab[i]->use < use) {
175 			use = fatab[i]->use;
176 			nuse = i;
177 		}
178 	freefa(fatab[nuse]);
179 	fatab[nuse] = pfa;
180 	pfa->use = now++;
181 	return pfa;
182 }
183 
184 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
185 				/* anchor = true for anchored matches, else false */
186 {
187 	Node *p, *p1;
188 	fa *f;
189 
190 	firstbasestr = (const uschar *) s;
191 	basestr = firstbasestr;
192 	p = reparse(s);
193 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
194 		/* put ALL STAR in front of reg.  exp. */
195 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
196 		/* put FINAL after reg.  exp. */
197 
198 	poscnt = 0;
199 	penter(p1);	/* enter parent pointers and leaf indices */
200 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
201 		overflo(__func__);
202 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
203 	cfoll(f, p1);	/* set up follow sets */
204 	freetr(p1);
205 	resize_state(f, 1);
206 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
207 	f->posns[1] = intalloc(1, __func__);
208 	*f->posns[1] = 0;
209 	f->initstat = makeinit(f, anchor);
210 	f->anchor = anchor;
211 	f->restr = (uschar *) tostring(s);
212 	if (firstbasestr != basestr) {
213 		if (basestr)
214 			xfree(basestr);
215 	}
216 	return f;
217 }
218 
219 int makeinit(fa *f, bool anchor)
220 {
221 	int i, k;
222 
223 	f->curstat = 2;
224 	f->out[2] = 0;
225 	k = *(f->re[0].lfollow);
226 	xfree(f->posns[2]);
227 	f->posns[2] = intalloc(k + 1,  __func__);
228 	for (i = 0; i <= k; i++) {
229 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
230 	}
231 	if ((f->posns[2])[1] == f->accept)
232 		f->out[2] = 1;
233 	for (i = 0; i < NCHARS; i++)
234 		f->gototab[2][i] = 0;
235 	f->curstat = cgoto(f, 2, HAT);
236 	if (anchor) {
237 		*f->posns[2] = k-1;	/* leave out position 0 */
238 		for (i = 0; i < k; i++) {
239 			(f->posns[0])[i] = (f->posns[2])[i];
240 		}
241 
242 		f->out[0] = f->out[2];
243 		if (f->curstat != 2)
244 			--(*f->posns[f->curstat]);
245 	}
246 	return f->curstat;
247 }
248 
249 void penter(Node *p)	/* set up parent pointers and leaf indices */
250 {
251 	switch (type(p)) {
252 	ELEAF
253 	LEAF
254 		info(p) = poscnt;
255 		poscnt++;
256 		break;
257 	UNARY
258 		penter(left(p));
259 		parent(left(p)) = p;
260 		break;
261 	case CAT:
262 	case OR:
263 		penter(left(p));
264 		penter(right(p));
265 		parent(left(p)) = p;
266 		parent(right(p)) = p;
267 		break;
268 	case ZERO:
269 		break;
270 	default:	/* can't happen */
271 		FATAL("can't happen: unknown type %d in penter", type(p));
272 		break;
273 	}
274 }
275 
276 void freetr(Node *p)	/* free parse tree */
277 {
278 	switch (type(p)) {
279 	ELEAF
280 	LEAF
281 		xfree(p);
282 		break;
283 	UNARY
284 	case ZERO:
285 		freetr(left(p));
286 		xfree(p);
287 		break;
288 	case CAT:
289 	case OR:
290 		freetr(left(p));
291 		freetr(right(p));
292 		xfree(p);
293 		break;
294 	default:	/* can't happen */
295 		FATAL("can't happen: unknown type %d in freetr", type(p));
296 		break;
297 	}
298 }
299 
300 /* in the parsing of regular expressions, metacharacters like . have */
301 /* to be seen literally;  \056 is not a metacharacter. */
302 
303 int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
304 {			/* only pick up one 8-bit byte (2 chars) */
305 	const uschar *p;
306 	int n = 0;
307 	int i;
308 
309 	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
310 		if (isdigit(*p))
311 			n = 16 * n + *p - '0';
312 		else if (*p >= 'a' && *p <= 'f')
313 			n = 16 * n + *p - 'a' + 10;
314 		else if (*p >= 'A' && *p <= 'F')
315 			n = 16 * n + *p - 'A' + 10;
316 	}
317 	*pp = p;
318 	return n;
319 }
320 
321 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
322 
323 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
324 			/* and increment *pp */
325 {
326 	const uschar *p = *pp;
327 	int c;
328 
329 	if ((c = *p++) == 't')
330 		c = '\t';
331 	else if (c == 'n')
332 		c = '\n';
333 	else if (c == 'f')
334 		c = '\f';
335 	else if (c == 'r')
336 		c = '\r';
337 	else if (c == 'b')
338 		c = '\b';
339 	else if (c == 'v')
340 		c = '\v';
341 	else if (c == 'a')
342 		c = '\a';
343 	else if (c == '\\')
344 		c = '\\';
345 	else if (c == 'x') {	/* hexadecimal goo follows */
346 		c = hexstr(&p);	/* this adds a null if number is invalid */
347 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
348 		int n = c - '0';
349 		if (isoctdigit(*p)) {
350 			n = 8 * n + *p++ - '0';
351 			if (isoctdigit(*p))
352 				n = 8 * n + *p++ - '0';
353 		}
354 		c = n;
355 	} /* else */
356 		/* c = c; */
357 	*pp = p;
358 	return c;
359 }
360 
361 char *cclenter(const char *argp)	/* add a character class */
362 {
363 	int i, c, c2;
364 	const uschar *op, *p = (const uschar *) argp;
365 	uschar *bp;
366 	static uschar *buf = NULL;
367 	static int bufsz = 100;
368 
369 	op = p;
370 	if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
371 		FATAL("out of space for character class [%.10s...] 1", p);
372 	bp = buf;
373 	for (i = 0; (c = *p++) != 0; ) {
374 		if (c == '\\') {
375 			c = quoted(&p);
376 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
377 			if (*p != 0) {
378 				c = bp[-1];
379 				c2 = *p++;
380 				if (c2 == '\\')
381 					c2 = quoted(&p);
382 				if (c > c2) {	/* empty; ignore */
383 					bp--;
384 					i--;
385 					continue;
386 				}
387 				while (c < c2) {
388 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
389 						FATAL("out of space for character class [%.10s...] 2", p);
390 					*bp++ = ++c;
391 					i++;
392 				}
393 				continue;
394 			}
395 		}
396 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
397 			FATAL("out of space for character class [%.10s...] 3", p);
398 		*bp++ = c;
399 		i++;
400 	}
401 	*bp = 0;
402 	DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
403 	xfree(op);
404 	return (char *) tostring((char *) buf);
405 }
406 
407 void overflo(const char *s)
408 {
409 	FATAL("regular expression too big: out of space in %.30s...", s);
410 }
411 
412 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
413 {
414 	int i;
415 	int *p;
416 
417 	switch (type(v)) {
418 	ELEAF
419 	LEAF
420 		f->re[info(v)].ltype = type(v);
421 		f->re[info(v)].lval.np = right(v);
422 		while (f->accept >= maxsetvec) {	/* guessing here! */
423 			resizesetvec(__func__);
424 		}
425 		for (i = 0; i <= f->accept; i++)
426 			setvec[i] = 0;
427 		setcnt = 0;
428 		follow(v);	/* computes setvec and setcnt */
429 		p = intalloc(setcnt + 1, __func__);
430 		f->re[info(v)].lfollow = p;
431 		*p = setcnt;
432 		for (i = f->accept; i >= 0; i--)
433 			if (setvec[i] == 1)
434 				*++p = i;
435 		break;
436 	UNARY
437 		cfoll(f,left(v));
438 		break;
439 	case CAT:
440 	case OR:
441 		cfoll(f,left(v));
442 		cfoll(f,right(v));
443 		break;
444 	case ZERO:
445 		break;
446 	default:	/* can't happen */
447 		FATAL("can't happen: unknown type %d in cfoll", type(v));
448 	}
449 }
450 
451 int first(Node *p)	/* collects initially active leaves of p into setvec */
452 			/* returns 0 if p matches empty string */
453 {
454 	int b, lp;
455 
456 	switch (type(p)) {
457 	ELEAF
458 	LEAF
459 		lp = info(p);	/* look for high-water mark of subscripts */
460 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
461 			resizesetvec(__func__);
462 		}
463 		if (type(p) == EMPTYRE) {
464 			setvec[lp] = 0;
465 			return(0);
466 		}
467 		if (setvec[lp] != 1) {
468 			setvec[lp] = 1;
469 			setcnt++;
470 		}
471 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
472 			return(0);		/* empty CCL */
473 		return(1);
474 	case PLUS:
475 		if (first(left(p)) == 0)
476 			return(0);
477 		return(1);
478 	case STAR:
479 	case QUEST:
480 		first(left(p));
481 		return(0);
482 	case CAT:
483 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
484 		return(1);
485 	case OR:
486 		b = first(right(p));
487 		if (first(left(p)) == 0 || b == 0) return(0);
488 		return(1);
489 	case ZERO:
490 		return 0;
491 	}
492 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
493 	return(-1);
494 }
495 
496 void follow(Node *v)	/* collects leaves that can follow v into setvec */
497 {
498 	Node *p;
499 
500 	if (type(v) == FINAL)
501 		return;
502 	p = parent(v);
503 	switch (type(p)) {
504 	case STAR:
505 	case PLUS:
506 		first(v);
507 		follow(p);
508 		return;
509 
510 	case OR:
511 	case QUEST:
512 		follow(p);
513 		return;
514 
515 	case CAT:
516 		if (v == left(p)) {	/* v is left child of p */
517 			if (first(right(p)) == 0) {
518 				follow(p);
519 				return;
520 			}
521 		} else		/* v is right child */
522 			follow(p);
523 		return;
524 	}
525 }
526 
527 int member(int c, const char *sarg)	/* is c in s? */
528 {
529 	const uschar *s = (const uschar *) sarg;
530 
531 	while (*s)
532 		if (c == *s++)
533 			return(1);
534 	return(0);
535 }
536 
537 int match(fa *f, const char *p0)	/* shortest match ? */
538 {
539 	int s, ns;
540 	const uschar *p = (const uschar *) p0;
541 
542 	s = f->initstat;
543 	assert (s < f->state_count);
544 
545 	if (f->out[s])
546 		return(1);
547 	do {
548 		/* assert(*p < NCHARS); */
549 		if ((ns = f->gototab[s][*p]) != 0)
550 			s = ns;
551 		else
552 			s = cgoto(f, s, *p);
553 		if (f->out[s])
554 			return(1);
555 	} while (*p++ != 0);
556 	return(0);
557 }
558 
559 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
560 {
561 	int s, ns;
562 	const uschar *p = (const uschar *) p0;
563 	const uschar *q;
564 
565 	s = f->initstat;
566 	assert(s < f->state_count);
567 
568 	patbeg = (const char *)p;
569 	patlen = -1;
570 	do {
571 		q = p;
572 		do {
573 			if (f->out[s])		/* final state */
574 				patlen = q-p;
575 			/* assert(*q < NCHARS); */
576 			if ((ns = f->gototab[s][*q]) != 0)
577 				s = ns;
578 			else
579 				s = cgoto(f, s, *q);
580 
581 			assert(s < f->state_count);
582 
583 			if (s == 1) {	/* no transition */
584 				if (patlen >= 0) {
585 					patbeg = (const char *) p;
586 					return(1);
587 				}
588 				else
589 					goto nextin;	/* no match */
590 			}
591 		} while (*q++ != 0);
592 		if (f->out[s])
593 			patlen = q-p-1;	/* don't count $ */
594 		if (patlen >= 0) {
595 			patbeg = (const char *) p;
596 			return(1);
597 		}
598 	nextin:
599 		s = 2;
600 	} while (*p++);
601 	return (0);
602 }
603 
604 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
605 {
606 	int s, ns;
607 	const uschar *p = (const uschar *) p0;
608 	const uschar *q;
609 
610 	s = f->initstat;
611 	assert(s < f->state_count);
612 
613 	patbeg = (const char *)p;
614 	patlen = -1;
615 	while (*p) {
616 		q = p;
617 		do {
618 			if (f->out[s])		/* final state */
619 				patlen = q-p;
620 			/* assert(*q < NCHARS); */
621 			if ((ns = f->gototab[s][*q]) != 0)
622 				s = ns;
623 			else
624 				s = cgoto(f, s, *q);
625 			if (s == 1) {	/* no transition */
626 				if (patlen > 0) {
627 					patbeg = (const char *) p;
628 					return(1);
629 				} else
630 					goto nnextin;	/* no nonempty match */
631 			}
632 		} while (*q++ != 0);
633 		if (f->out[s])
634 			patlen = q-p-1;	/* don't count $ */
635 		if (patlen > 0 ) {
636 			patbeg = (const char *) p;
637 			return(1);
638 		}
639 	nnextin:
640 		s = 2;
641 		p++;
642 	}
643 	return (0);
644 }
645 
646 
647 /*
648  * NAME
649  *     fnematch
650  *
651  * DESCRIPTION
652  *     A stream-fed version of nematch which transfers characters to a
653  *     null-terminated buffer. All characters up to and including the last
654  *     character of the matching text or EOF are placed in the buffer. If
655  *     a match is found, patbeg and patlen are set appropriately.
656  *
657  * RETURN VALUES
658  *     false    No match found.
659  *     true     Match found.
660  */
661 
662 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
663 {
664 	char *buf = *pbuf;
665 	int bufsize = *pbufsize;
666 	int c, i, j, k, ns, s;
667 
668 	s = pfa->initstat;
669 	patlen = 0;
670 
671 	/*
672 	 * All indices relative to buf.
673 	 * i <= j <= k <= bufsize
674 	 *
675 	 * i: origin of active substring
676 	 * j: current character
677 	 * k: destination of next getc()
678 	 */
679 	i = -1, k = 0;
680         do {
681 		j = i++;
682 		do {
683 			if (++j == k) {
684 				if (k == bufsize)
685 					if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
686 						FATAL("stream '%.30s...' too long", buf);
687 				buf[k++] = (c = getc(f)) != EOF ? c : 0;
688 			}
689 			c = (uschar)buf[j];
690 			/* assert(c < NCHARS); */
691 
692 			if ((ns = pfa->gototab[s][c]) != 0)
693 				s = ns;
694 			else
695 				s = cgoto(pfa, s, c);
696 
697 			if (pfa->out[s]) {	/* final state */
698 				patlen = j - i + 1;
699 				if (c == 0)	/* don't count $ */
700 					patlen--;
701 			}
702 		} while (buf[j] && s != 1);
703 		s = 2;
704 	} while (buf[i] && !patlen);
705 
706 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
707 	*pbuf = buf;
708 	*pbufsize = bufsize;
709 
710 	if (patlen) {
711 		patbeg = (char *) buf + i;
712 		/*
713 		 * Under no circumstances is the last character fed to
714 		 * the automaton part of the match. It is EOF's nullbyte,
715 		 * or it sent the automaton into a state with no further
716 		 * transitions available (s==1), or both. Room for a
717 		 * terminating nullbyte is guaranteed.
718 		 *
719 		 * ungetc any chars after the end of matching text
720 		 * (except for EOF's nullbyte, if present) and null
721 		 * terminate the buffer.
722 		 */
723 		do
724 			if (buf[--k] && ungetc(buf[k], f) == EOF)
725 				FATAL("unable to ungetc '%c'", buf[k]);
726 		while (k > i + patlen);
727 		buf[k] = '\0';
728 		return true;
729 	}
730 	else
731 		return false;
732 }
733 
734 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
735 {			/* uses relex() to scan regular expression */
736 	Node *np;
737 
738 	DPRINTF("reparse <%s>\n", p);
739 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
740 	rtok = relex();
741 	/* GNU compatibility: an empty regexp matches anything */
742 	if (rtok == '\0') {
743 		/* FATAL("empty regular expression"); previous */
744 		return(op2(EMPTYRE, NIL, NIL));
745 	}
746 	np = regexp();
747 	if (rtok != '\0')
748 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
749 	return(np);
750 }
751 
752 Node *regexp(void)	/* top-level parse of reg expr */
753 {
754 	return (alt(concat(primary())));
755 }
756 
757 Node *primary(void)
758 {
759 	Node *np;
760 	int savelastatom;
761 
762 	switch (rtok) {
763 	case CHAR:
764 		lastatom = starttok;
765 		np = op2(CHAR, NIL, itonp(rlxval));
766 		rtok = relex();
767 		return (unary(np));
768 	case ALL:
769 		rtok = relex();
770 		return (unary(op2(ALL, NIL, NIL)));
771 	case EMPTYRE:
772 		rtok = relex();
773 		return (unary(op2(EMPTYRE, NIL, NIL)));
774 	case DOT:
775 		lastatom = starttok;
776 		rtok = relex();
777 		return (unary(op2(DOT, NIL, NIL)));
778 	case CCL:
779 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
780 		lastatom = starttok;
781 		rtok = relex();
782 		return (unary(np));
783 	case NCCL:
784 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
785 		lastatom = starttok;
786 		rtok = relex();
787 		return (unary(np));
788 	case '^':
789 		rtok = relex();
790 		return (unary(op2(CHAR, NIL, itonp(HAT))));
791 	case '$':
792 		rtok = relex();
793 		return (unary(op2(CHAR, NIL, NIL)));
794 	case '(':
795 		lastatom = starttok;
796 		savelastatom = starttok - basestr; /* Retain over recursion */
797 		rtok = relex();
798 		if (rtok == ')') {	/* special pleading for () */
799 			rtok = relex();
800 			return unary(op2(CCL, NIL, (Node *) tostring("")));
801 		}
802 		np = regexp();
803 		if (rtok == ')') {
804 			lastatom = basestr + savelastatom; /* Restore */
805 			rtok = relex();
806 			return (unary(np));
807 		}
808 		else
809 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
810 	default:
811 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
812 	}
813 	return 0;	/*NOTREACHED*/
814 }
815 
816 Node *concat(Node *np)
817 {
818 	switch (rtok) {
819 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
820 		return (concat(op2(CAT, np, primary())));
821 	case EMPTYRE:
822 		rtok = relex();
823 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
824 				primary())));
825 	}
826 	return (np);
827 }
828 
829 Node *alt(Node *np)
830 {
831 	if (rtok == OR) {
832 		rtok = relex();
833 		return (alt(op2(OR, np, concat(primary()))));
834 	}
835 	return (np);
836 }
837 
838 Node *unary(Node *np)
839 {
840 	switch (rtok) {
841 	case STAR:
842 		rtok = relex();
843 		return (unary(op2(STAR, np, NIL)));
844 	case PLUS:
845 		rtok = relex();
846 		return (unary(op2(PLUS, np, NIL)));
847 	case QUEST:
848 		rtok = relex();
849 		return (unary(op2(QUEST, np, NIL)));
850 	case ZERO:
851 		rtok = relex();
852 		return (unary(op2(ZERO, np, NIL)));
853 	default:
854 		return (np);
855 	}
856 }
857 
858 /*
859  * Character class definitions conformant to the POSIX locale as
860  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
861  * and operating character sets are both ASCII (ISO646) or supersets
862  * thereof.
863  *
864  * Note that to avoid overflowing the temporary buffer used in
865  * relex(), the expanded character class (prior to range expansion)
866  * must be less than twice the size of their full name.
867  */
868 
869 /* Because isblank doesn't show up in any of the header files on any
870  * system i use, it's defined here.  if some other locale has a richer
871  * definition of "blank", define HAS_ISBLANK and provide your own
872  * version.
873  * the parentheses here are an attempt to find a path through the maze
874  * of macro definition and/or function and/or version provided.  thanks
875  * to nelson beebe for the suggestion; let's see if it works everywhere.
876  */
877 
878 /* #define HAS_ISBLANK */
879 #ifndef HAS_ISBLANK
880 
881 int (xisblank)(int c)
882 {
883 	return c==' ' || c=='\t';
884 }
885 
886 #endif
887 
888 static const struct charclass {
889 	const char *cc_name;
890 	int cc_namelen;
891 	int (*cc_func)(int);
892 } charclasses[] = {
893 	{ "alnum",	5,	isalnum },
894 	{ "alpha",	5,	isalpha },
895 #ifndef HAS_ISBLANK
896 	{ "blank",	5,	xisblank },
897 #else
898 	{ "blank",	5,	isblank },
899 #endif
900 	{ "cntrl",	5,	iscntrl },
901 	{ "digit",	5,	isdigit },
902 	{ "graph",	5,	isgraph },
903 	{ "lower",	5,	islower },
904 	{ "print",	5,	isprint },
905 	{ "punct",	5,	ispunct },
906 	{ "space",	5,	isspace },
907 	{ "upper",	5,	isupper },
908 	{ "xdigit",	6,	isxdigit },
909 	{ NULL,		0,	NULL },
910 };
911 
912 #define REPEAT_SIMPLE		0
913 #define REPEAT_PLUS_APPENDED	1
914 #define REPEAT_WITH_Q		2
915 #define REPEAT_ZERO		3
916 
917 static int
918 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
919 	       int atomlen, int firstnum, int secondnum, int special_case)
920 {
921 	int i, j;
922 	uschar *buf = 0;
923 	int ret = 1;
924 	int init_q = (firstnum == 0);		/* first added char will be ? */
925 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
926 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
927 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
928 	int size = prefix_length +  suffix_length;
929 
930 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
931 		size += atomlen*(firstnum-1);
932 	}
933 
934 	/* Adjust size of buffer for special cases */
935 	if (special_case == REPEAT_PLUS_APPENDED) {
936 		size++;		/* for the final + */
937 	} else if (special_case == REPEAT_WITH_Q) {
938 		size += init_q + (atomlen+1)* (n_q_reps-init_q);
939 	} else if (special_case == REPEAT_ZERO) {
940 		size += 2;	/* just a null ERE: () */
941 	}
942 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
943 		FATAL("out of space in reg expr %.10s..", lastre);
944 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
945 	j = prefix_length;
946 	if (special_case == REPEAT_ZERO) {
947 		j -= atomlen;
948 		buf[j++] = '(';
949 		buf[j++] = ')';
950 	}
951 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
952 		memcpy(&buf[j], atom, atomlen);
953 		j += atomlen;
954 	}
955 	if (special_case == REPEAT_PLUS_APPENDED) {
956 		buf[j++] = '+';
957 	} else if (special_case == REPEAT_WITH_Q) {
958 		if (init_q)
959 			buf[j++] = '?';
960 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
961 			memcpy(&buf[j], atom, atomlen);
962 			j += atomlen;
963 			buf[j++] = '?';
964 		}
965 	}
966 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
967 	j += suffix_length;
968 	buf[j] = '\0';
969 	/* free old basestr */
970 	if (firstbasestr != basestr) {
971 		if (basestr)
972 			xfree(basestr);
973 	}
974 	basestr = buf;
975 	prestr  = buf + prefix_length;
976 	if (special_case == REPEAT_ZERO) {
977 		prestr  -= atomlen;
978 		ret++;
979 	}
980 	return ret;
981 }
982 
983 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
984 		  int atomlen, int firstnum, int secondnum)
985 {
986 	/*
987 	   In general, the repetition specifier or "bound" is replaced here
988 	   by an equivalent ERE string, repeating the immediately previous atom
989 	   and appending ? and + as needed. Note that the first copy of the
990 	   atom is left in place, except in the special_case of a zero-repeat
991 	   (i.e., {0}).
992 	 */
993 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
994 		if (firstnum < 2) {
995 			/* 0 or 1: should be handled before you get here */
996 			FATAL("internal error");
997 		} else {
998 			return replace_repeat(reptok, reptoklen, atom, atomlen,
999 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1000 		}
1001 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1002 		if (firstnum == 0) {	/* {0} or {0,0} */
1003 			/* This case is unusual because the resulting
1004 			   replacement string might actually be SMALLER than
1005 			   the original ERE */
1006 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1007 					firstnum, secondnum, REPEAT_ZERO);
1008 		} else {		/* (firstnum >= 1) */
1009 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1010 					firstnum, secondnum, REPEAT_SIMPLE);
1011 		}
1012 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1013 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1014 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1015 					firstnum, secondnum, REPEAT_WITH_Q);
1016 	} else {	/* Error - shouldn't be here (n>m) */
1017 		FATAL("internal error");
1018 	}
1019 	return 0;
1020 }
1021 
1022 int relex(void)		/* lexical analyzer for reparse */
1023 {
1024 	int c, n;
1025 	int cflag;
1026 	static uschar *buf = NULL;
1027 	static int bufsz = 100;
1028 	uschar *bp;
1029 	const struct charclass *cc;
1030 	int i;
1031 	int num, m;
1032 	bool commafound, digitfound;
1033 	const uschar *startreptok;
1034 	static int parens = 0;
1035 
1036 rescan:
1037 	starttok = prestr;
1038 
1039 	switch (c = *prestr++) {
1040 	case '|': return OR;
1041 	case '*': return STAR;
1042 	case '+': return PLUS;
1043 	case '?': return QUEST;
1044 	case '.': return DOT;
1045 	case '\0': prestr--; return '\0';
1046 	case '^':
1047 	case '$':
1048 		return c;
1049 	case '(':
1050 		parens++;
1051 		return c;
1052 	case ')':
1053 		if (parens) {
1054 			parens--;
1055 			return c;
1056 		}
1057 		/* unmatched close parenthesis; per POSIX, treat as literal */
1058 		rlxval = c;
1059 		return CHAR;
1060 	case '\\':
1061 		rlxval = quoted(&prestr);
1062 		return CHAR;
1063 	default:
1064 		rlxval = c;
1065 		return CHAR;
1066 	case '[':
1067 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1068 			FATAL("out of space in reg expr %.10s..", lastre);
1069 		bp = buf;
1070 		if (*prestr == '^') {
1071 			cflag = 1;
1072 			prestr++;
1073 		}
1074 		else
1075 			cflag = 0;
1076 		n = 2 * strlen((const char *) prestr)+1;
1077 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1078 			FATAL("out of space for reg expr %.10s...", lastre);
1079 		for (; ; ) {
1080 			if ((c = *prestr++) == '\\') {
1081 				*bp++ = '\\';
1082 				if ((c = *prestr++) == '\0')
1083 					FATAL("nonterminated character class %.20s...", lastre);
1084 				*bp++ = c;
1085 			/* } else if (c == '\n') { */
1086 			/* 	FATAL("newline in character class %.20s...", lastre); */
1087 			} else if (c == '[' && *prestr == ':') {
1088 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1089 				for (cc = charclasses; cc->cc_name; cc++)
1090 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1091 						break;
1092 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1093 				    prestr[2 + cc->cc_namelen] == ']') {
1094 					prestr += cc->cc_namelen + 3;
1095 					/*
1096 					 * BUG: We begin at 1, instead of 0, since we
1097 					 * would otherwise prematurely terminate the
1098 					 * string for classes like [[:cntrl:]]. This
1099 					 * means that we can't match the NUL character,
1100 					 * not without first adapting the entire
1101 					 * program to track each string's length.
1102 					 */
1103 					for (i = 1; i <= UCHAR_MAX; i++) {
1104 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1105 						    FATAL("out of space for reg expr %.10s...", lastre);
1106 						if (cc->cc_func(i)) {
1107 							/* escape backslash */
1108 							if (i == '\\') {
1109 								*bp++ = '\\';
1110 								n++;
1111 							}
1112 
1113 							*bp++ = i;
1114 							n++;
1115 						}
1116 					}
1117 				} else
1118 					*bp++ = c;
1119 			} else if (c == '[' && *prestr == '.') {
1120 				char collate_char;
1121 				prestr++;
1122 				collate_char = *prestr++;
1123 				if (*prestr == '.' && prestr[1] == ']') {
1124 					prestr += 2;
1125 					/* Found it: map via locale TBD: for
1126 					   now, simply return this char.  This
1127 					   is sufficient to pass conformance
1128 					   test awk.ex 156
1129 					 */
1130 					if (*prestr == ']') {
1131 						prestr++;
1132 						rlxval = collate_char;
1133 						return CHAR;
1134 					}
1135 				}
1136 			} else if (c == '[' && *prestr == '=') {
1137 				char equiv_char;
1138 				prestr++;
1139 				equiv_char = *prestr++;
1140 				if (*prestr == '=' && prestr[1] == ']') {
1141 					prestr += 2;
1142 					/* Found it: map via locale TBD: for now
1143 					   simply return this char. This is
1144 					   sufficient to pass conformance test
1145 					   awk.ex 156
1146 					 */
1147 					if (*prestr == ']') {
1148 						prestr++;
1149 						rlxval = equiv_char;
1150 						return CHAR;
1151 					}
1152 				}
1153 			} else if (c == '\0') {
1154 				FATAL("nonterminated character class %.20s", lastre);
1155 			} else if (bp == buf) {	/* 1st char is special */
1156 				*bp++ = c;
1157 			} else if (c == ']') {
1158 				*bp++ = 0;
1159 				rlxstr = (uschar *) tostring((char *) buf);
1160 				if (cflag == 0)
1161 					return CCL;
1162 				else
1163 					return NCCL;
1164 			} else
1165 				*bp++ = c;
1166 		}
1167 		break;
1168 	case '{':
1169 		if (isdigit(*(prestr))) {
1170 			num = 0;	/* Process as a repetition */
1171 			n = -1; m = -1;
1172 			commafound = false;
1173 			digitfound = false;
1174 			startreptok = prestr-1;
1175 			/* Remember start of previous atom here ? */
1176 		} else {        	/* just a { char, not a repetition */
1177 			rlxval = c;
1178 			return CHAR;
1179                 }
1180 		for (; ; ) {
1181 			if ((c = *prestr++) == '}') {
1182 				if (commafound) {
1183 					if (digitfound) { /* {n,m} */
1184 						m = num;
1185 						if (m < n)
1186 							FATAL("illegal repetition expression: class %.20s",
1187 								lastre);
1188 						if (n == 0 && m == 1) {
1189 							return QUEST;
1190 						}
1191 					} else {	/* {n,} */
1192 						if (n == 0)
1193 							return STAR;
1194 						else if (n == 1)
1195 							return PLUS;
1196 					}
1197 				} else {
1198 					if (digitfound) { /* {n} same as {n,n} */
1199 						n = num;
1200 						m = num;
1201 					} else {	/* {} */
1202 						FATAL("illegal repetition expression: class %.20s",
1203 							lastre);
1204 					}
1205 				}
1206 				if (repeat(starttok, prestr-starttok, lastatom,
1207 					   startreptok - lastatom, n, m) > 0) {
1208 					if (n == 0 && m == 0) {
1209 						return ZERO;
1210 					}
1211 					/* must rescan input for next token */
1212 					goto rescan;
1213 				}
1214 				/* Failed to replace: eat up {...} characters
1215 				   and treat like just PLUS */
1216 				return PLUS;
1217 			} else if (c == '\0') {
1218 				FATAL("nonterminated character class %.20s",
1219 					lastre);
1220 			} else if (isdigit(c)) {
1221 				num = 10 * num + c - '0';
1222 				digitfound = true;
1223 			} else if (c == ',') {
1224 				if (commafound)
1225 					FATAL("illegal repetition expression: class %.20s",
1226 						lastre);
1227 				/* looking for {n,} or {n,m} */
1228 				commafound = true;
1229 				n = num;
1230 				digitfound = false; /* reset */
1231 				num = 0;
1232 			} else {
1233 				FATAL("illegal repetition expression: class %.20s",
1234 					lastre);
1235 			}
1236 		}
1237 		break;
1238 	}
1239 }
1240 
1241 int cgoto(fa *f, int s, int c)
1242 {
1243 	int *p, *q;
1244 	int i, j, k;
1245 
1246 	assert(c == HAT || c < NCHARS);
1247 	while (f->accept >= maxsetvec) {	/* guessing here! */
1248 		resizesetvec(__func__);
1249 	}
1250 	for (i = 0; i <= f->accept; i++)
1251 		setvec[i] = 0;
1252 	setcnt = 0;
1253 	resize_state(f, s);
1254 	/* compute positions of gototab[s,c] into setvec */
1255 	p = f->posns[s];
1256 	for (i = 1; i <= *p; i++) {
1257 		if ((k = f->re[p[i]].ltype) != FINAL) {
1258 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1259 			 || (k == DOT && c != 0 && c != HAT)
1260 			 || (k == ALL && c != 0)
1261 			 || (k == EMPTYRE && c != 0)
1262 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1263 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1264 				q = f->re[p[i]].lfollow;
1265 				for (j = 1; j <= *q; j++) {
1266 					if (q[j] >= maxsetvec) {
1267 						resizesetvec(__func__);
1268 					}
1269 					if (setvec[q[j]] == 0) {
1270 						setcnt++;
1271 						setvec[q[j]] = 1;
1272 					}
1273 				}
1274 			}
1275 		}
1276 	}
1277 	/* determine if setvec is a previous state */
1278 	tmpset[0] = setcnt;
1279 	j = 1;
1280 	for (i = f->accept; i >= 0; i--)
1281 		if (setvec[i]) {
1282 			tmpset[j++] = i;
1283 		}
1284 	resize_state(f, f->curstat > s ? f->curstat : s);
1285 	/* tmpset == previous state? */
1286 	for (i = 1; i <= f->curstat; i++) {
1287 		p = f->posns[i];
1288 		if ((k = tmpset[0]) != p[0])
1289 			goto different;
1290 		for (j = 1; j <= k; j++)
1291 			if (tmpset[j] != p[j])
1292 				goto different;
1293 		/* setvec is state i */
1294 		if (c != HAT)
1295 			f->gototab[s][c] = i;
1296 		return i;
1297 	  different:;
1298 	}
1299 
1300 	/* add tmpset to current set of states */
1301 	++(f->curstat);
1302 	resize_state(f, f->curstat);
1303 	for (i = 0; i < NCHARS; i++)
1304 		f->gototab[f->curstat][i] = 0;
1305 	xfree(f->posns[f->curstat]);
1306 	p = intalloc(setcnt + 1, __func__);
1307 
1308 	f->posns[f->curstat] = p;
1309 	if (c != HAT)
1310 		f->gototab[s][c] = f->curstat;
1311 	for (i = 0; i <= setcnt; i++)
1312 		p[i] = tmpset[i];
1313 	if (setvec[f->accept])
1314 		f->out[f->curstat] = 1;
1315 	else
1316 		f->out[f->curstat] = 0;
1317 	return f->curstat;
1318 }
1319 
1320 
1321 void freefa(fa *f)	/* free a finite automaton */
1322 {
1323 	int i;
1324 
1325 	if (f == NULL)
1326 		return;
1327 	for (i = 0; i < f->state_count; i++)
1328 		xfree(f->gototab[i])
1329 	for (i = 0; i <= f->curstat; i++)
1330 		xfree(f->posns[i]);
1331 	for (i = 0; i <= f->accept; i++) {
1332 		xfree(f->re[i].lfollow);
1333 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1334 			xfree(f->re[i].lval.np);
1335 	}
1336 	xfree(f->restr);
1337 	xfree(f->out);
1338 	xfree(f->posns);
1339 	xfree(f->gototab);
1340 	xfree(f);
1341 }
1342