xref: /freebsd/contrib/one-true-awk/b.c (revision 17853db4b0dc36ed32af039cd803f13b692913da)
12a55deb1SDavid E. O'Brien /****************************************************************
22a55deb1SDavid E. O'Brien Copyright (C) Lucent Technologies 1997
32a55deb1SDavid E. O'Brien All Rights Reserved
42a55deb1SDavid E. O'Brien 
52a55deb1SDavid E. O'Brien Permission to use, copy, modify, and distribute this software and
62a55deb1SDavid E. O'Brien its documentation for any purpose and without fee is hereby
72a55deb1SDavid E. O'Brien granted, provided that the above copyright notice appear in all
82a55deb1SDavid E. O'Brien copies and that both that the copyright notice and this
92a55deb1SDavid E. O'Brien permission notice and warranty disclaimer appear in supporting
102a55deb1SDavid E. O'Brien documentation, and that the name Lucent Technologies or any of
112a55deb1SDavid E. O'Brien its entities not be used in advertising or publicity pertaining
122a55deb1SDavid E. O'Brien to distribution of the software without specific, written prior
132a55deb1SDavid E. O'Brien permission.
142a55deb1SDavid E. O'Brien 
152a55deb1SDavid E. O'Brien LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
162a55deb1SDavid E. O'Brien INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
172a55deb1SDavid E. O'Brien IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
182a55deb1SDavid E. O'Brien SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
192a55deb1SDavid E. O'Brien WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
202a55deb1SDavid E. O'Brien IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
212a55deb1SDavid E. O'Brien ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
222a55deb1SDavid E. O'Brien THIS SOFTWARE.
232a55deb1SDavid E. O'Brien ****************************************************************/
242a55deb1SDavid E. O'Brien 
25addad6afSRong-En Fan /* lasciate ogne speranza, voi ch'intrate. */
262a55deb1SDavid E. O'Brien 
272a55deb1SDavid E. O'Brien #define	DEBUG
282a55deb1SDavid E. O'Brien 
292a55deb1SDavid E. O'Brien #include <ctype.h>
30b5253557SWarner Losh #include <limits.h>
312a55deb1SDavid E. O'Brien #include <stdio.h>
322a55deb1SDavid E. O'Brien #include <string.h>
332a55deb1SDavid E. O'Brien #include <stdlib.h>
342a55deb1SDavid E. O'Brien #include "awk.h"
35f39dd6a9SWarner Losh #include "awkgram.tab.h"
362a55deb1SDavid E. O'Brien 
372a55deb1SDavid E. O'Brien #define MAXLIN 22
382a55deb1SDavid E. O'Brien 
392a55deb1SDavid E. O'Brien #define type(v)		(v)->nobj	/* badly overloaded here */
402a55deb1SDavid E. O'Brien #define info(v)		(v)->ntype	/* badly overloaded here */
412a55deb1SDavid E. O'Brien #define left(v)		(v)->narg[0]
422a55deb1SDavid E. O'Brien #define right(v)	(v)->narg[1]
432a55deb1SDavid E. O'Brien #define parent(v)	(v)->nnext
442a55deb1SDavid E. O'Brien 
452a55deb1SDavid E. O'Brien #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46addad6afSRong-En Fan #define ELEAF	case EMPTYRE:		/* empty string in regexp */
472a55deb1SDavid E. O'Brien #define UNARY	case STAR: case PLUS: case QUEST:
482a55deb1SDavid E. O'Brien 
492a55deb1SDavid E. O'Brien /* encoding in tree Nodes:
50addad6afSRong-En Fan 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
512a55deb1SDavid E. O'Brien 		left is index, right contains value or pointer to value
522a55deb1SDavid E. O'Brien 	unary (STAR, PLUS, QUEST): left is child, right is null
532a55deb1SDavid E. O'Brien 	binary (CAT, OR): left and right are children
542a55deb1SDavid E. O'Brien 	parent contains pointer to parent
552a55deb1SDavid E. O'Brien */
562a55deb1SDavid E. O'Brien 
572a55deb1SDavid E. O'Brien 
582a55deb1SDavid E. O'Brien int	*setvec;
592a55deb1SDavid E. O'Brien int	*tmpset;
602a55deb1SDavid E. O'Brien int	maxsetvec = 0;
612a55deb1SDavid E. O'Brien 
622a55deb1SDavid E. O'Brien int	rtok;		/* next token in current re */
632a55deb1SDavid E. O'Brien int	rlxval;
64f39dd6a9SWarner Losh static const uschar	*rlxstr;
65f39dd6a9SWarner Losh static const uschar	*prestr;	/* current position in current re */
66f39dd6a9SWarner Losh static const uschar	*lastre;	/* origin of last re */
67f39dd6a9SWarner Losh static const uschar	*lastatom;	/* origin of last Atom */
68f39dd6a9SWarner Losh static const uschar	*starttok;
69f39dd6a9SWarner Losh static const uschar 	*basestr;	/* starts with original, replaced during
70b5253557SWarner Losh 				   repetition processing */
71f39dd6a9SWarner Losh static const uschar 	*firstbasestr;
722a55deb1SDavid E. O'Brien 
732a55deb1SDavid E. O'Brien static	int setcnt;
742a55deb1SDavid E. O'Brien static	int poscnt;
752a55deb1SDavid E. O'Brien 
76f39dd6a9SWarner Losh const char	*patbeg;
772a55deb1SDavid E. O'Brien int	patlen;
782a55deb1SDavid E. O'Brien 
79f39dd6a9SWarner Losh #define	NFA	128	/* cache this many dynamic fa's */
802a55deb1SDavid E. O'Brien fa	*fatab[NFA];
812a55deb1SDavid E. O'Brien int	nfatab	= 0;	/* entries in fatab */
822a55deb1SDavid E. O'Brien 
83f32a6403SWarner Losh extern int u8_nextlen(const char *s);
84f32a6403SWarner Losh 
85f32a6403SWarner Losh 
86f32a6403SWarner Losh /* utf-8 mechanism:
87f32a6403SWarner Losh 
88f32a6403SWarner Losh    For most of Awk, utf-8 strings just "work", since they look like
89f32a6403SWarner Losh    null-terminated sequences of 8-bit bytes.
90f32a6403SWarner Losh 
91f32a6403SWarner Losh    Functions like length(), index(), and substr() have to operate
92f32a6403SWarner Losh    in units of utf-8 characters.  The u8_* functions in run.c
93f32a6403SWarner Losh    handle this.
94f32a6403SWarner Losh 
95f32a6403SWarner Losh    Regular expressions are more complicated, since the basic
96f32a6403SWarner Losh    mechanism of the goto table used 8-bit byte indices into the
97f32a6403SWarner Losh    gototab entries to compute the next state.  Unicode is a lot
98f32a6403SWarner Losh    bigger, so the gototab entries are now structs with a character
99f32a6403SWarner Losh    and a next state. These are sorted by code point and binary
100f32a6403SWarner Losh    searched.
101f32a6403SWarner Losh 
102f32a6403SWarner Losh    Throughout the RE mechanism in b.c, utf-8 characters are
103f32a6403SWarner Losh    converted to their utf-32 value.  This mostly shows up in
104f32a6403SWarner Losh    cclenter, which expands character class ranges like a-z and now
105f32a6403SWarner Losh    alpha-omega.  The size of a gototab array is still about 256.
106f32a6403SWarner Losh    This should be dynamic, but for now things work ok for a single
107f32a6403SWarner Losh    code page of Unicode, which is the most likely case.
108f32a6403SWarner Losh 
109f32a6403SWarner Losh    The code changes are localized in run.c and b.c.  I have added a
110f32a6403SWarner Losh    handful of functions to somewhat better hide the implementation,
111f32a6403SWarner Losh    but a lot more could be done.
112f32a6403SWarner Losh 
113f32a6403SWarner Losh  */
114f32a6403SWarner Losh 
115f32a6403SWarner Losh static int entry_cmp(const void *l, const void *r);
116f32a6403SWarner Losh static int get_gototab(fa*, int, int);
117f32a6403SWarner Losh static int set_gototab(fa*, int, int, int);
118f32a6403SWarner Losh static void clear_gototab(fa*, int);
119f32a6403SWarner Losh extern int u8_rune(int *, const char *);
120f32a6403SWarner Losh 
121f39dd6a9SWarner Losh static int *
intalloc(size_t n,const char * f)122f39dd6a9SWarner Losh intalloc(size_t n, const char *f)
123f39dd6a9SWarner Losh {
124f39dd6a9SWarner Losh 	int *p = (int *) calloc(n, sizeof(int));
125f39dd6a9SWarner Losh 	if (p == NULL)
126f39dd6a9SWarner Losh 		overflo(f);
127f39dd6a9SWarner Losh 	return p;
128f39dd6a9SWarner Losh }
129f39dd6a9SWarner Losh 
130f39dd6a9SWarner Losh static void
resizesetvec(const char * f)131f39dd6a9SWarner Losh resizesetvec(const char *f)
132f39dd6a9SWarner Losh {
133f39dd6a9SWarner Losh 	if (maxsetvec == 0)
134f39dd6a9SWarner Losh 		maxsetvec = MAXLIN;
135f39dd6a9SWarner Losh 	else
136f39dd6a9SWarner Losh 		maxsetvec *= 4;
137f39dd6a9SWarner Losh 	setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
138f39dd6a9SWarner Losh 	tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
139f39dd6a9SWarner Losh 	if (setvec == NULL || tmpset == NULL)
140f39dd6a9SWarner Losh 		overflo(f);
141f39dd6a9SWarner Losh }
142f39dd6a9SWarner Losh 
143f39dd6a9SWarner Losh static void
resize_state(fa * f,int state)144f39dd6a9SWarner Losh resize_state(fa *f, int state)
145f39dd6a9SWarner Losh {
146f32a6403SWarner Losh 	gtt *p;
147f39dd6a9SWarner Losh 	uschar *p2;
148f39dd6a9SWarner Losh 	int **p3;
149f39dd6a9SWarner Losh 	int i, new_count;
150f39dd6a9SWarner Losh 
151f39dd6a9SWarner Losh 	if (++state < f->state_count)
152f39dd6a9SWarner Losh 		return;
153f39dd6a9SWarner Losh 
154f39dd6a9SWarner Losh 	new_count = state + 10; /* needs to be tuned */
155f39dd6a9SWarner Losh 
156f32a6403SWarner Losh 	p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
157f39dd6a9SWarner Losh 	if (p == NULL)
158f39dd6a9SWarner Losh 		goto out;
159f39dd6a9SWarner Losh 	f->gototab = p;
160f39dd6a9SWarner Losh 
161f39dd6a9SWarner Losh 	p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
162f39dd6a9SWarner Losh 	if (p2 == NULL)
163f39dd6a9SWarner Losh 		goto out;
164f39dd6a9SWarner Losh 	f->out = p2;
165f39dd6a9SWarner Losh 
166f39dd6a9SWarner Losh 	p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
167f39dd6a9SWarner Losh 	if (p3 == NULL)
168f39dd6a9SWarner Losh 		goto out;
169f39dd6a9SWarner Losh 	f->posns = p3;
170f39dd6a9SWarner Losh 
171f39dd6a9SWarner Losh 	for (i = f->state_count; i < new_count; ++i) {
172f32a6403SWarner Losh 		f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
173f32a6403SWarner Losh 		if (f->gototab[i].entries == NULL)
174f39dd6a9SWarner Losh 			goto out;
175f32a6403SWarner Losh 		f->gototab[i].allocated = NCHARS;
176f32a6403SWarner Losh 		f->gototab[i].inuse = 0;
177f39dd6a9SWarner Losh 		f->out[i] = 0;
178f39dd6a9SWarner Losh 		f->posns[i] = NULL;
179f39dd6a9SWarner Losh 	}
180f39dd6a9SWarner Losh 	f->state_count = new_count;
181f39dd6a9SWarner Losh 	return;
182f39dd6a9SWarner Losh out:
183f39dd6a9SWarner Losh 	overflo(__func__);
184f39dd6a9SWarner Losh }
185f39dd6a9SWarner Losh 
makedfa(const char * s,bool anchor)186f39dd6a9SWarner Losh fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
1872a55deb1SDavid E. O'Brien {
1882a55deb1SDavid E. O'Brien 	int i, use, nuse;
1892a55deb1SDavid E. O'Brien 	fa *pfa;
1902a55deb1SDavid E. O'Brien 	static int now = 1;
1912a55deb1SDavid E. O'Brien 
19210ce5b99SWarner Losh 	if (setvec == NULL) {	/* first time through any RE */
193f39dd6a9SWarner Losh 		resizesetvec(__func__);
1942a55deb1SDavid E. O'Brien 	}
1952a55deb1SDavid E. O'Brien 
196f39dd6a9SWarner Losh 	if (compile_time != RUNNING)	/* a constant for sure */
1972a55deb1SDavid E. O'Brien 		return mkdfa(s, anchor);
1982a55deb1SDavid E. O'Brien 	for (i = 0; i < nfatab; i++)	/* is it there already? */
1992a55deb1SDavid E. O'Brien 		if (fatab[i]->anchor == anchor
200007c6572SDag-Erling Smørgrav 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
2012a55deb1SDavid E. O'Brien 			fatab[i]->use = now++;
2022a55deb1SDavid E. O'Brien 			return fatab[i];
2032a55deb1SDavid E. O'Brien 		}
2042a55deb1SDavid E. O'Brien 	pfa = mkdfa(s, anchor);
2052a55deb1SDavid E. O'Brien 	if (nfatab < NFA) {	/* room for another */
2062a55deb1SDavid E. O'Brien 		fatab[nfatab] = pfa;
2072a55deb1SDavid E. O'Brien 		fatab[nfatab]->use = now++;
2082a55deb1SDavid E. O'Brien 		nfatab++;
2092a55deb1SDavid E. O'Brien 		return pfa;
2102a55deb1SDavid E. O'Brien 	}
2112a55deb1SDavid E. O'Brien 	use = fatab[0]->use;	/* replace least-recently used */
2122a55deb1SDavid E. O'Brien 	nuse = 0;
2132a55deb1SDavid E. O'Brien 	for (i = 1; i < nfatab; i++)
2142a55deb1SDavid E. O'Brien 		if (fatab[i]->use < use) {
2152a55deb1SDavid E. O'Brien 			use = fatab[i]->use;
2162a55deb1SDavid E. O'Brien 			nuse = i;
2172a55deb1SDavid E. O'Brien 		}
2182a55deb1SDavid E. O'Brien 	freefa(fatab[nuse]);
2192a55deb1SDavid E. O'Brien 	fatab[nuse] = pfa;
2202a55deb1SDavid E. O'Brien 	pfa->use = now++;
2212a55deb1SDavid E. O'Brien 	return pfa;
2222a55deb1SDavid E. O'Brien }
2232a55deb1SDavid E. O'Brien 
mkdfa(const char * s,bool anchor)224f39dd6a9SWarner Losh fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
225f39dd6a9SWarner Losh 				/* anchor = true for anchored matches, else false */
2262a55deb1SDavid E. O'Brien {
2272a55deb1SDavid E. O'Brien 	Node *p, *p1;
2282a55deb1SDavid E. O'Brien 	fa *f;
2292a55deb1SDavid E. O'Brien 
230f39dd6a9SWarner Losh 	firstbasestr = (const uschar *) s;
231b5253557SWarner Losh 	basestr = firstbasestr;
2322a55deb1SDavid E. O'Brien 	p = reparse(s);
2332a55deb1SDavid E. O'Brien 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
2342a55deb1SDavid E. O'Brien 		/* put ALL STAR in front of reg.  exp. */
2352a55deb1SDavid E. O'Brien 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
2362a55deb1SDavid E. O'Brien 		/* put FINAL after reg.  exp. */
2372a55deb1SDavid E. O'Brien 
2382a55deb1SDavid E. O'Brien 	poscnt = 0;
2392a55deb1SDavid E. O'Brien 	penter(p1);	/* enter parent pointers and leaf indices */
2402a55deb1SDavid E. O'Brien 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
241f39dd6a9SWarner Losh 		overflo(__func__);
2422a55deb1SDavid E. O'Brien 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
2432a55deb1SDavid E. O'Brien 	cfoll(f, p1);	/* set up follow sets */
2442a55deb1SDavid E. O'Brien 	freetr(p1);
245f39dd6a9SWarner Losh 	resize_state(f, 1);
246f39dd6a9SWarner Losh 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
247f39dd6a9SWarner Losh 	f->posns[1] = intalloc(1, __func__);
2482a55deb1SDavid E. O'Brien 	*f->posns[1] = 0;
2492a55deb1SDavid E. O'Brien 	f->initstat = makeinit(f, anchor);
2502a55deb1SDavid E. O'Brien 	f->anchor = anchor;
2512a55deb1SDavid E. O'Brien 	f->restr = (uschar *) tostring(s);
252b5253557SWarner Losh 	if (firstbasestr != basestr) {
253b5253557SWarner Losh 		if (basestr)
254b5253557SWarner Losh 			xfree(basestr);
255b5253557SWarner Losh 	}
2562a55deb1SDavid E. O'Brien 	return f;
2572a55deb1SDavid E. O'Brien }
2582a55deb1SDavid E. O'Brien 
makeinit(fa * f,bool anchor)259f39dd6a9SWarner Losh int makeinit(fa *f, bool anchor)
2602a55deb1SDavid E. O'Brien {
2612a55deb1SDavid E. O'Brien 	int i, k;
2622a55deb1SDavid E. O'Brien 
2632a55deb1SDavid E. O'Brien 	f->curstat = 2;
2642a55deb1SDavid E. O'Brien 	f->out[2] = 0;
2652a55deb1SDavid E. O'Brien 	k = *(f->re[0].lfollow);
2662a55deb1SDavid E. O'Brien 	xfree(f->posns[2]);
267f39dd6a9SWarner Losh 	f->posns[2] = intalloc(k + 1,  __func__);
2682a55deb1SDavid E. O'Brien 	for (i = 0; i <= k; i++) {
2692a55deb1SDavid E. O'Brien 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
2702a55deb1SDavid E. O'Brien 	}
2712a55deb1SDavid E. O'Brien 	if ((f->posns[2])[1] == f->accept)
2722a55deb1SDavid E. O'Brien 		f->out[2] = 1;
273f32a6403SWarner Losh 	clear_gototab(f, 2);
2742a55deb1SDavid E. O'Brien 	f->curstat = cgoto(f, 2, HAT);
2752a55deb1SDavid E. O'Brien 	if (anchor) {
2762a55deb1SDavid E. O'Brien 		*f->posns[2] = k-1;	/* leave out position 0 */
2772a55deb1SDavid E. O'Brien 		for (i = 0; i < k; i++) {
2782a55deb1SDavid E. O'Brien 			(f->posns[0])[i] = (f->posns[2])[i];
2792a55deb1SDavid E. O'Brien 		}
2802a55deb1SDavid E. O'Brien 
2812a55deb1SDavid E. O'Brien 		f->out[0] = f->out[2];
2822a55deb1SDavid E. O'Brien 		if (f->curstat != 2)
2832a55deb1SDavid E. O'Brien 			--(*f->posns[f->curstat]);
2842a55deb1SDavid E. O'Brien 	}
2852a55deb1SDavid E. O'Brien 	return f->curstat;
2862a55deb1SDavid E. O'Brien }
2872a55deb1SDavid E. O'Brien 
penter(Node * p)2882a55deb1SDavid E. O'Brien void penter(Node *p)	/* set up parent pointers and leaf indices */
2892a55deb1SDavid E. O'Brien {
2902a55deb1SDavid E. O'Brien 	switch (type(p)) {
291addad6afSRong-En Fan 	ELEAF
2922a55deb1SDavid E. O'Brien 	LEAF
2932a55deb1SDavid E. O'Brien 		info(p) = poscnt;
2942a55deb1SDavid E. O'Brien 		poscnt++;
2952a55deb1SDavid E. O'Brien 		break;
2962a55deb1SDavid E. O'Brien 	UNARY
2972a55deb1SDavid E. O'Brien 		penter(left(p));
2982a55deb1SDavid E. O'Brien 		parent(left(p)) = p;
2992a55deb1SDavid E. O'Brien 		break;
3002a55deb1SDavid E. O'Brien 	case CAT:
3012a55deb1SDavid E. O'Brien 	case OR:
3022a55deb1SDavid E. O'Brien 		penter(left(p));
3032a55deb1SDavid E. O'Brien 		penter(right(p));
3042a55deb1SDavid E. O'Brien 		parent(left(p)) = p;
3052a55deb1SDavid E. O'Brien 		parent(right(p)) = p;
3062a55deb1SDavid E. O'Brien 		break;
307f39dd6a9SWarner Losh 	case ZERO:
308f39dd6a9SWarner Losh 		break;
3092a55deb1SDavid E. O'Brien 	default:	/* can't happen */
3102a55deb1SDavid E. O'Brien 		FATAL("can't happen: unknown type %d in penter", type(p));
3112a55deb1SDavid E. O'Brien 		break;
3122a55deb1SDavid E. O'Brien 	}
3132a55deb1SDavid E. O'Brien }
3142a55deb1SDavid E. O'Brien 
freetr(Node * p)3152a55deb1SDavid E. O'Brien void freetr(Node *p)	/* free parse tree */
3162a55deb1SDavid E. O'Brien {
3172a55deb1SDavid E. O'Brien 	switch (type(p)) {
318addad6afSRong-En Fan 	ELEAF
3192a55deb1SDavid E. O'Brien 	LEAF
3202a55deb1SDavid E. O'Brien 		xfree(p);
3212a55deb1SDavid E. O'Brien 		break;
3222a55deb1SDavid E. O'Brien 	UNARY
323f39dd6a9SWarner Losh 	case ZERO:
3242a55deb1SDavid E. O'Brien 		freetr(left(p));
3252a55deb1SDavid E. O'Brien 		xfree(p);
3262a55deb1SDavid E. O'Brien 		break;
3272a55deb1SDavid E. O'Brien 	case CAT:
3282a55deb1SDavid E. O'Brien 	case OR:
3292a55deb1SDavid E. O'Brien 		freetr(left(p));
3302a55deb1SDavid E. O'Brien 		freetr(right(p));
3312a55deb1SDavid E. O'Brien 		xfree(p);
3322a55deb1SDavid E. O'Brien 		break;
3332a55deb1SDavid E. O'Brien 	default:	/* can't happen */
3342a55deb1SDavid E. O'Brien 		FATAL("can't happen: unknown type %d in freetr", type(p));
3352a55deb1SDavid E. O'Brien 		break;
3362a55deb1SDavid E. O'Brien 	}
3372a55deb1SDavid E. O'Brien }
3382a55deb1SDavid E. O'Brien 
3392a55deb1SDavid E. O'Brien /* in the parsing of regular expressions, metacharacters like . have */
3402a55deb1SDavid E. O'Brien /* to be seen literally;  \056 is not a metacharacter. */
3412a55deb1SDavid E. O'Brien 
hexstr(const uschar ** pp,int max)342f32a6403SWarner Losh int hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
3432a55deb1SDavid E. O'Brien {			/* only pick up one 8-bit byte (2 chars) */
344f39dd6a9SWarner Losh 	const uschar *p;
3452a55deb1SDavid E. O'Brien 	int n = 0;
3462a55deb1SDavid E. O'Brien 	int i;
3472a55deb1SDavid E. O'Brien 
348f32a6403SWarner Losh 	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
349f32a6403SWarner Losh 		if (isdigit((int) *p))
3502a55deb1SDavid E. O'Brien 			n = 16 * n + *p - '0';
3512a55deb1SDavid E. O'Brien 		else if (*p >= 'a' && *p <= 'f')
3522a55deb1SDavid E. O'Brien 			n = 16 * n + *p - 'a' + 10;
3532a55deb1SDavid E. O'Brien 		else if (*p >= 'A' && *p <= 'F')
3542a55deb1SDavid E. O'Brien 			n = 16 * n + *p - 'A' + 10;
3552a55deb1SDavid E. O'Brien 	}
356f39dd6a9SWarner Losh 	*pp = p;
3572a55deb1SDavid E. O'Brien 	return n;
3582a55deb1SDavid E. O'Brien }
3592a55deb1SDavid E. O'Brien 
360f32a6403SWarner Losh 
361f32a6403SWarner Losh 
3622a55deb1SDavid E. O'Brien #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
3632a55deb1SDavid E. O'Brien 
quoted(const uschar ** pp)364f39dd6a9SWarner Losh int quoted(const uschar **pp)	/* pick up next thing after a \\ */
3652a55deb1SDavid E. O'Brien 			/* and increment *pp */
3662a55deb1SDavid E. O'Brien {
367f39dd6a9SWarner Losh 	const uschar *p = *pp;
3682a55deb1SDavid E. O'Brien 	int c;
3692a55deb1SDavid E. O'Brien 
370f32a6403SWarner Losh /* BUG: should advance by utf-8 char even if makes no sense */
371f32a6403SWarner Losh 
372*17853db4SWarner Losh 	switch ((c = *p++)) {
373*17853db4SWarner Losh 	case 't':
3742a55deb1SDavid E. O'Brien 		c = '\t';
375*17853db4SWarner Losh 		break;
376*17853db4SWarner Losh 	case 'n':
3772a55deb1SDavid E. O'Brien 		c = '\n';
378*17853db4SWarner Losh 		break;
379*17853db4SWarner Losh 	case 'f':
3802a55deb1SDavid E. O'Brien 		c = '\f';
381*17853db4SWarner Losh 		break;
382*17853db4SWarner Losh 	case 'r':
3832a55deb1SDavid E. O'Brien 		c = '\r';
384*17853db4SWarner Losh 		break;
385*17853db4SWarner Losh 	case 'b':
3862a55deb1SDavid E. O'Brien 		c = '\b';
387*17853db4SWarner Losh 		break;
388*17853db4SWarner Losh 	case 'v':
389f39dd6a9SWarner Losh 		c = '\v';
390*17853db4SWarner Losh 		break;
391*17853db4SWarner Losh 	case 'a':
392f39dd6a9SWarner Losh 		c = '\a';
393*17853db4SWarner Losh 		break;
394*17853db4SWarner Losh 	case '\\':
3952a55deb1SDavid E. O'Brien 		c = '\\';
396*17853db4SWarner Losh 		break;
397*17853db4SWarner Losh 	case 'x': /* 2 hex digits follow */
398f32a6403SWarner Losh 		c = hexstr(&p, 2); /* this adds a null if number is invalid */
399*17853db4SWarner Losh 		break;
400*17853db4SWarner Losh 	case 'u': /* unicode char number up to 8 hex digits */
401f32a6403SWarner Losh 		c = hexstr(&p, 8);
402*17853db4SWarner Losh 		break;
403*17853db4SWarner Losh 	default:
404*17853db4SWarner Losh 		if (isoctdigit(c)) { /* \d \dd \ddd */
4052a55deb1SDavid E. O'Brien 			int n = c - '0';
4062a55deb1SDavid E. O'Brien 			if (isoctdigit(*p)) {
4072a55deb1SDavid E. O'Brien 				n = 8 * n + *p++ - '0';
4082a55deb1SDavid E. O'Brien 				if (isoctdigit(*p))
4092a55deb1SDavid E. O'Brien 					n = 8 * n + *p++ - '0';
4102a55deb1SDavid E. O'Brien 			}
4112a55deb1SDavid E. O'Brien 			c = n;
412*17853db4SWarner Losh 		}
413*17853db4SWarner Losh 	}
414*17853db4SWarner Losh 
4152a55deb1SDavid E. O'Brien 	*pp = p;
4162a55deb1SDavid E. O'Brien 	return c;
4172a55deb1SDavid E. O'Brien }
4182a55deb1SDavid E. O'Brien 
cclenter(const char * argp)419f32a6403SWarner Losh int *cclenter(const char *argp)	/* add a character class */
4202a55deb1SDavid E. O'Brien {
421628bd30aSWarner Losh 	int i, c, c2;
422f32a6403SWarner Losh 	int n;
423f32a6403SWarner Losh 	const uschar *p = (const uschar *) argp;
424f32a6403SWarner Losh 	int *bp, *retp;
425f32a6403SWarner Losh 	static int *buf = NULL;
4262a55deb1SDavid E. O'Brien 	static int bufsz = 100;
4272a55deb1SDavid E. O'Brien 
428f32a6403SWarner Losh 	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
4292a55deb1SDavid E. O'Brien 		FATAL("out of space for character class [%.10s...] 1", p);
4302a55deb1SDavid E. O'Brien 	bp = buf;
431f32a6403SWarner Losh 	for (i = 0; *p != 0; ) {
432f32a6403SWarner Losh 		n = u8_rune(&c, (const char *) p);
433f32a6403SWarner Losh 		p += n;
4342a55deb1SDavid E. O'Brien 		if (c == '\\') {
435d86a0988SRuslan Ermilov 			c = quoted(&p);
4362a55deb1SDavid E. O'Brien 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
4372a55deb1SDavid E. O'Brien 			if (*p != 0) {
4382a55deb1SDavid E. O'Brien 				c = bp[-1];
439f32a6403SWarner Losh 				/* c2 = *p++; */
440f32a6403SWarner Losh 				n = u8_rune(&c2, (const char *) p);
441f32a6403SWarner Losh 				p += n;
4422a55deb1SDavid E. O'Brien 				if (c2 == '\\')
443f32a6403SWarner Losh 					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
444628bd30aSWarner Losh 				if (c > c2) {	/* empty; ignore */
4452a55deb1SDavid E. O'Brien 					bp--;
4462a55deb1SDavid E. O'Brien 					i--;
4472a55deb1SDavid E. O'Brien 					continue;
4482a55deb1SDavid E. O'Brien 				}
449628bd30aSWarner Losh 				while (c < c2) {
450f32a6403SWarner Losh 					if (i >= bufsz) {
451f32a6403SWarner Losh 						bufsz *= 2;
452f32a6403SWarner Losh 						buf = (int *) realloc(buf, bufsz * sizeof(int));
453f32a6403SWarner Losh 						if (buf == NULL)
4542a55deb1SDavid E. O'Brien 							FATAL("out of space for character class [%.10s...] 2", p);
455f32a6403SWarner Losh 						bp = buf + i;
456f32a6403SWarner Losh 					}
457628bd30aSWarner Losh 					*bp++ = ++c;
4582a55deb1SDavid E. O'Brien 					i++;
4592a55deb1SDavid E. O'Brien 				}
4602a55deb1SDavid E. O'Brien 				continue;
4612a55deb1SDavid E. O'Brien 			}
4622a55deb1SDavid E. O'Brien 		}
463f32a6403SWarner Losh 		if (i >= bufsz) {
464f32a6403SWarner Losh 			bufsz *= 2;
465f32a6403SWarner Losh 			buf = (int *) realloc(buf, bufsz * sizeof(int));
466f32a6403SWarner Losh 			if (buf == NULL)
467f32a6403SWarner Losh 				FATAL("out of space for character class [%.10s...] 2", p);
468f32a6403SWarner Losh 			bp = buf + i;
469f32a6403SWarner Losh 		}
4702a55deb1SDavid E. O'Brien 		*bp++ = c;
4712a55deb1SDavid E. O'Brien 		i++;
4722a55deb1SDavid E. O'Brien 	}
4732a55deb1SDavid E. O'Brien 	*bp = 0;
474f32a6403SWarner Losh 	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
475f32a6403SWarner Losh 	/* xfree(op);  BUG: what are we freeing here? */
476f32a6403SWarner Losh 	retp = (int *) calloc(bp-buf+1, sizeof(int));
477f32a6403SWarner Losh 	for (i = 0; i < bp-buf+1; i++)
478f32a6403SWarner Losh 		retp[i] = buf[i];
479f32a6403SWarner Losh 	return retp;
4802a55deb1SDavid E. O'Brien }
4812a55deb1SDavid E. O'Brien 
overflo(const char * s)482813da98dSDavid E. O'Brien void overflo(const char *s)
4832a55deb1SDavid E. O'Brien {
484f39dd6a9SWarner Losh 	FATAL("regular expression too big: out of space in %.30s...", s);
4852a55deb1SDavid E. O'Brien }
4862a55deb1SDavid E. O'Brien 
cfoll(fa * f,Node * v)4872a55deb1SDavid E. O'Brien void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
4882a55deb1SDavid E. O'Brien {
4892a55deb1SDavid E. O'Brien 	int i;
4902a55deb1SDavid E. O'Brien 	int *p;
4912a55deb1SDavid E. O'Brien 
4922a55deb1SDavid E. O'Brien 	switch (type(v)) {
493addad6afSRong-En Fan 	ELEAF
4942a55deb1SDavid E. O'Brien 	LEAF
4952a55deb1SDavid E. O'Brien 		f->re[info(v)].ltype = type(v);
4962a55deb1SDavid E. O'Brien 		f->re[info(v)].lval.np = right(v);
4972a55deb1SDavid E. O'Brien 		while (f->accept >= maxsetvec) {	/* guessing here! */
498f39dd6a9SWarner Losh 			resizesetvec(__func__);
4992a55deb1SDavid E. O'Brien 		}
5002a55deb1SDavid E. O'Brien 		for (i = 0; i <= f->accept; i++)
5012a55deb1SDavid E. O'Brien 			setvec[i] = 0;
5022a55deb1SDavid E. O'Brien 		setcnt = 0;
5032a55deb1SDavid E. O'Brien 		follow(v);	/* computes setvec and setcnt */
504f39dd6a9SWarner Losh 		p = intalloc(setcnt + 1, __func__);
5052a55deb1SDavid E. O'Brien 		f->re[info(v)].lfollow = p;
5062a55deb1SDavid E. O'Brien 		*p = setcnt;
5072a55deb1SDavid E. O'Brien 		for (i = f->accept; i >= 0; i--)
5082a55deb1SDavid E. O'Brien 			if (setvec[i] == 1)
5092a55deb1SDavid E. O'Brien 				*++p = i;
5102a55deb1SDavid E. O'Brien 		break;
5112a55deb1SDavid E. O'Brien 	UNARY
5122a55deb1SDavid E. O'Brien 		cfoll(f,left(v));
5132a55deb1SDavid E. O'Brien 		break;
5142a55deb1SDavid E. O'Brien 	case CAT:
5152a55deb1SDavid E. O'Brien 	case OR:
5162a55deb1SDavid E. O'Brien 		cfoll(f,left(v));
5172a55deb1SDavid E. O'Brien 		cfoll(f,right(v));
5182a55deb1SDavid E. O'Brien 		break;
519f39dd6a9SWarner Losh 	case ZERO:
520f39dd6a9SWarner Losh 		break;
5212a55deb1SDavid E. O'Brien 	default:	/* can't happen */
5222a55deb1SDavid E. O'Brien 		FATAL("can't happen: unknown type %d in cfoll", type(v));
5232a55deb1SDavid E. O'Brien 	}
5242a55deb1SDavid E. O'Brien }
5252a55deb1SDavid E. O'Brien 
first(Node * p)5262a55deb1SDavid E. O'Brien int first(Node *p)	/* collects initially active leaves of p into setvec */
527addad6afSRong-En Fan 			/* returns 0 if p matches empty string */
5282a55deb1SDavid E. O'Brien {
5292a55deb1SDavid E. O'Brien 	int b, lp;
5302a55deb1SDavid E. O'Brien 
5312a55deb1SDavid E. O'Brien 	switch (type(p)) {
532addad6afSRong-En Fan 	ELEAF
5332a55deb1SDavid E. O'Brien 	LEAF
5342a55deb1SDavid E. O'Brien 		lp = info(p);	/* look for high-water mark of subscripts */
5352a55deb1SDavid E. O'Brien 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
536f39dd6a9SWarner Losh 			resizesetvec(__func__);
5372a55deb1SDavid E. O'Brien 		}
538addad6afSRong-En Fan 		if (type(p) == EMPTYRE) {
539addad6afSRong-En Fan 			setvec[lp] = 0;
540addad6afSRong-En Fan 			return(0);
541addad6afSRong-En Fan 		}
5422a55deb1SDavid E. O'Brien 		if (setvec[lp] != 1) {
5432a55deb1SDavid E. O'Brien 			setvec[lp] = 1;
5442a55deb1SDavid E. O'Brien 			setcnt++;
5452a55deb1SDavid E. O'Brien 		}
546f32a6403SWarner Losh 		if (type(p) == CCL && (*(int *) right(p)) == 0)
5472a55deb1SDavid E. O'Brien 			return(0);		/* empty CCL */
548f39dd6a9SWarner Losh 		return(1);
5492a55deb1SDavid E. O'Brien 	case PLUS:
550f39dd6a9SWarner Losh 		if (first(left(p)) == 0)
551f39dd6a9SWarner Losh 			return(0);
5522a55deb1SDavid E. O'Brien 		return(1);
5532a55deb1SDavid E. O'Brien 	case STAR:
5542a55deb1SDavid E. O'Brien 	case QUEST:
5552a55deb1SDavid E. O'Brien 		first(left(p));
5562a55deb1SDavid E. O'Brien 		return(0);
5572a55deb1SDavid E. O'Brien 	case CAT:
5582a55deb1SDavid E. O'Brien 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
5592a55deb1SDavid E. O'Brien 		return(1);
5602a55deb1SDavid E. O'Brien 	case OR:
5612a55deb1SDavid E. O'Brien 		b = first(right(p));
5622a55deb1SDavid E. O'Brien 		if (first(left(p)) == 0 || b == 0) return(0);
5632a55deb1SDavid E. O'Brien 		return(1);
564f39dd6a9SWarner Losh 	case ZERO:
565f39dd6a9SWarner Losh 		return 0;
5662a55deb1SDavid E. O'Brien 	}
5672a55deb1SDavid E. O'Brien 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
5682a55deb1SDavid E. O'Brien 	return(-1);
5692a55deb1SDavid E. O'Brien }
5702a55deb1SDavid E. O'Brien 
follow(Node * v)5712a55deb1SDavid E. O'Brien void follow(Node *v)	/* collects leaves that can follow v into setvec */
5722a55deb1SDavid E. O'Brien {
5732a55deb1SDavid E. O'Brien 	Node *p;
5742a55deb1SDavid E. O'Brien 
5752a55deb1SDavid E. O'Brien 	if (type(v) == FINAL)
5762a55deb1SDavid E. O'Brien 		return;
5772a55deb1SDavid E. O'Brien 	p = parent(v);
5782a55deb1SDavid E. O'Brien 	switch (type(p)) {
5792a55deb1SDavid E. O'Brien 	case STAR:
5802a55deb1SDavid E. O'Brien 	case PLUS:
5812a55deb1SDavid E. O'Brien 		first(v);
5822a55deb1SDavid E. O'Brien 		follow(p);
5832a55deb1SDavid E. O'Brien 		return;
5842a55deb1SDavid E. O'Brien 
5852a55deb1SDavid E. O'Brien 	case OR:
5862a55deb1SDavid E. O'Brien 	case QUEST:
5872a55deb1SDavid E. O'Brien 		follow(p);
5882a55deb1SDavid E. O'Brien 		return;
5892a55deb1SDavid E. O'Brien 
5902a55deb1SDavid E. O'Brien 	case CAT:
5912a55deb1SDavid E. O'Brien 		if (v == left(p)) {	/* v is left child of p */
5922a55deb1SDavid E. O'Brien 			if (first(right(p)) == 0) {
5932a55deb1SDavid E. O'Brien 				follow(p);
5942a55deb1SDavid E. O'Brien 				return;
5952a55deb1SDavid E. O'Brien 			}
5962a55deb1SDavid E. O'Brien 		} else		/* v is right child */
5972a55deb1SDavid E. O'Brien 			follow(p);
5982a55deb1SDavid E. O'Brien 		return;
5992a55deb1SDavid E. O'Brien 	}
6002a55deb1SDavid E. O'Brien }
6012a55deb1SDavid E. O'Brien 
member(int c,int * sarg)602f32a6403SWarner Losh int member(int c, int *sarg)	/* is c in s? */
6032a55deb1SDavid E. O'Brien {
604f32a6403SWarner Losh 	int *s = (int *) sarg;
6052a55deb1SDavid E. O'Brien 
6062a55deb1SDavid E. O'Brien 	while (*s)
6072a55deb1SDavid E. O'Brien 		if (c == *s++)
6082a55deb1SDavid E. O'Brien 			return(1);
6092a55deb1SDavid E. O'Brien 	return(0);
6102a55deb1SDavid E. O'Brien }
6112a55deb1SDavid E. O'Brien 
resize_gototab(fa * f,int state)612f32a6403SWarner Losh static void resize_gototab(fa *f, int state)
613f32a6403SWarner Losh {
614f32a6403SWarner Losh 	size_t new_size = f->gototab[state].allocated * 2;
615f32a6403SWarner Losh 	gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
616f32a6403SWarner Losh 	if (p == NULL)
617f32a6403SWarner Losh 		overflo(__func__);
618f32a6403SWarner Losh 
619f32a6403SWarner Losh 	// need to initialized the new memory to zero
620f32a6403SWarner Losh 	size_t orig_size = f->gototab[state].allocated;		// 2nd half of new mem is this size
621f32a6403SWarner Losh 	memset(p + orig_size, 0, orig_size * sizeof(gtte));	// clean it out
622f32a6403SWarner Losh 
623f32a6403SWarner Losh 	f->gototab[state].allocated = new_size;			// update gototab info
624f32a6403SWarner Losh 	f->gototab[state].entries = p;
625f32a6403SWarner Losh }
626f32a6403SWarner Losh 
get_gototab(fa * f,int state,int ch)627f32a6403SWarner Losh static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
628f32a6403SWarner Losh {
629f32a6403SWarner Losh 	gtte key;
630f32a6403SWarner Losh 	gtte *item;
631f32a6403SWarner Losh 
632f32a6403SWarner Losh 	key.ch = ch;
633f32a6403SWarner Losh 	key.state = 0;	/* irrelevant */
634f32a6403SWarner Losh 	item = (gtte *) bsearch(& key, f->gototab[state].entries,
635f32a6403SWarner Losh 			f->gototab[state].inuse, sizeof(gtte),
636f32a6403SWarner Losh 			entry_cmp);
637f32a6403SWarner Losh 
638f32a6403SWarner Losh 	if (item == NULL)
639f32a6403SWarner Losh 		return 0;
640f32a6403SWarner Losh 	else
641f32a6403SWarner Losh 		return item->state;
642f32a6403SWarner Losh }
643f32a6403SWarner Losh 
entry_cmp(const void * l,const void * r)644f32a6403SWarner Losh static int entry_cmp(const void *l, const void *r)
645f32a6403SWarner Losh {
646f32a6403SWarner Losh 	const gtte *left, *right;
647f32a6403SWarner Losh 
648f32a6403SWarner Losh 	left = (const gtte *) l;
649f32a6403SWarner Losh 	right = (const gtte *) r;
650f32a6403SWarner Losh 
651f32a6403SWarner Losh 	return left->ch - right->ch;
652f32a6403SWarner Losh }
653f32a6403SWarner Losh 
set_gototab(fa * f,int state,int ch,int val)654f32a6403SWarner Losh static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
655f32a6403SWarner Losh {
656f32a6403SWarner Losh 	if (f->gototab[state].inuse == 0) {
657f32a6403SWarner Losh 		f->gototab[state].entries[0].ch = ch;
658f32a6403SWarner Losh 		f->gototab[state].entries[0].state = val;
659f32a6403SWarner Losh 		f->gototab[state].inuse++;
660f32a6403SWarner Losh 		return val;
661*17853db4SWarner Losh 	} else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
662f32a6403SWarner Losh 		// not seen yet, insert and return
663f32a6403SWarner Losh 		gtt *tab = & f->gototab[state];
664f32a6403SWarner Losh 		if (tab->inuse + 1 >= tab->allocated)
665f32a6403SWarner Losh 			resize_gototab(f, state);
666f32a6403SWarner Losh 
6671023317aSWarner Losh 		f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
6681023317aSWarner Losh 		f->gototab[state].entries[f->gototab[state].inuse].state = val;
669f32a6403SWarner Losh 		f->gototab[state].inuse++;
670f32a6403SWarner Losh 		return val;
671f32a6403SWarner Losh 	} else {
672f32a6403SWarner Losh 		// maybe we have it, maybe we don't
673f32a6403SWarner Losh 		gtte key;
674f32a6403SWarner Losh 		gtte *item;
675f32a6403SWarner Losh 
676f32a6403SWarner Losh 		key.ch = ch;
677f32a6403SWarner Losh 		key.state = 0;	/* irrelevant */
678f32a6403SWarner Losh 		item = (gtte *) bsearch(& key, f->gototab[state].entries,
679f32a6403SWarner Losh 				f->gototab[state].inuse, sizeof(gtte),
680f32a6403SWarner Losh 				entry_cmp);
681f32a6403SWarner Losh 
682f32a6403SWarner Losh 		if (item != NULL) {
683f32a6403SWarner Losh 			// we have it, update state and return
684f32a6403SWarner Losh 			item->state = val;
685f32a6403SWarner Losh 			return item->state;
686f32a6403SWarner Losh 		}
687f32a6403SWarner Losh 		// otherwise, fall through to insert and reallocate.
688f32a6403SWarner Losh 	}
689f32a6403SWarner Losh 
690f32a6403SWarner Losh 	gtt *tab = & f->gototab[state];
691f32a6403SWarner Losh 	if (tab->inuse + 1 >= tab->allocated)
692f32a6403SWarner Losh 		resize_gototab(f, state);
693f32a6403SWarner Losh 	f->gototab[state].entries[tab->inuse].ch = ch;
694f32a6403SWarner Losh 	f->gototab[state].entries[tab->inuse].state = val;
6951023317aSWarner Losh 	++tab->inuse;
696f32a6403SWarner Losh 
697f32a6403SWarner Losh 	qsort(f->gototab[state].entries,
698f32a6403SWarner Losh 		f->gototab[state].inuse, sizeof(gtte), entry_cmp);
699f32a6403SWarner Losh 
700f32a6403SWarner Losh 	return val; /* not used anywhere at the moment */
701f32a6403SWarner Losh }
702f32a6403SWarner Losh 
clear_gototab(fa * f,int state)703f32a6403SWarner Losh static void clear_gototab(fa *f, int state)
704f32a6403SWarner Losh {
705f32a6403SWarner Losh 	memset(f->gototab[state].entries, 0,
706f32a6403SWarner Losh 		f->gototab[state].allocated * sizeof(gtte));
707f32a6403SWarner Losh 	f->gototab[state].inuse = 0;
708f32a6403SWarner Losh }
709f32a6403SWarner Losh 
match(fa * f,const char * p0)710813da98dSDavid E. O'Brien int match(fa *f, const char *p0)	/* shortest match ? */
7112a55deb1SDavid E. O'Brien {
7122a55deb1SDavid E. O'Brien 	int s, ns;
713f32a6403SWarner Losh 	int n;
714f32a6403SWarner Losh 	int rune;
715f39dd6a9SWarner Losh 	const uschar *p = (const uschar *) p0;
7162a55deb1SDavid E. O'Brien 
717f32a6403SWarner Losh 	/* return pmatch(f, p0); does it matter whether longest or shortest? */
718f32a6403SWarner Losh 
719f39dd6a9SWarner Losh 	s = f->initstat;
720f39dd6a9SWarner Losh 	assert (s < f->state_count);
721f39dd6a9SWarner Losh 
7222a55deb1SDavid E. O'Brien 	if (f->out[s])
7232a55deb1SDavid E. O'Brien 		return(1);
7242a55deb1SDavid E. O'Brien 	do {
725addad6afSRong-En Fan 		/* assert(*p < NCHARS); */
726f32a6403SWarner Losh 		n = u8_rune(&rune, (const char *) p);
727f32a6403SWarner Losh 		if ((ns = get_gototab(f, s, rune)) != 0)
7282a55deb1SDavid E. O'Brien 			s = ns;
7292a55deb1SDavid E. O'Brien 		else
730f32a6403SWarner Losh 			s = cgoto(f, s, rune);
7312a55deb1SDavid E. O'Brien 		if (f->out[s])
7322a55deb1SDavid E. O'Brien 			return(1);
733f32a6403SWarner Losh 		if (*p == 0)
734f32a6403SWarner Losh 			break;
735f32a6403SWarner Losh 		p += n;
736f32a6403SWarner Losh 	} while (1);  /* was *p++ != 0 */
7372a55deb1SDavid E. O'Brien 	return(0);
7382a55deb1SDavid E. O'Brien }
7392a55deb1SDavid E. O'Brien 
pmatch(fa * f,const char * p0)740813da98dSDavid E. O'Brien int pmatch(fa *f, const char *p0)	/* longest match, for sub */
7412a55deb1SDavid E. O'Brien {
7422a55deb1SDavid E. O'Brien 	int s, ns;
743f32a6403SWarner Losh 	int n;
744f32a6403SWarner Losh 	int rune;
745f39dd6a9SWarner Losh 	const uschar *p = (const uschar *) p0;
746f39dd6a9SWarner Losh 	const uschar *q;
7472a55deb1SDavid E. O'Brien 
74862ebc626SRuslan Ermilov 	s = f->initstat;
749f39dd6a9SWarner Losh 	assert(s < f->state_count);
750f39dd6a9SWarner Losh 
751f39dd6a9SWarner Losh 	patbeg = (const char *)p;
7522a55deb1SDavid E. O'Brien 	patlen = -1;
7532a55deb1SDavid E. O'Brien 	do {
7542a55deb1SDavid E. O'Brien 		q = p;
7552a55deb1SDavid E. O'Brien 		do {
7562a55deb1SDavid E. O'Brien 			if (f->out[s])		/* final state */
7572a55deb1SDavid E. O'Brien 				patlen = q-p;
758addad6afSRong-En Fan 			/* assert(*q < NCHARS); */
759f32a6403SWarner Losh 			n = u8_rune(&rune, (const char *) q);
760f32a6403SWarner Losh 			if ((ns = get_gototab(f, s, rune)) != 0)
7612a55deb1SDavid E. O'Brien 				s = ns;
7622a55deb1SDavid E. O'Brien 			else
763f32a6403SWarner Losh 				s = cgoto(f, s, rune);
764f39dd6a9SWarner Losh 
765f39dd6a9SWarner Losh 			assert(s < f->state_count);
766f39dd6a9SWarner Losh 
7672a55deb1SDavid E. O'Brien 			if (s == 1) {	/* no transition */
7682a55deb1SDavid E. O'Brien 				if (patlen >= 0) {
769f39dd6a9SWarner Losh 					patbeg = (const char *) p;
7702a55deb1SDavid E. O'Brien 					return(1);
7712a55deb1SDavid E. O'Brien 				}
7722a55deb1SDavid E. O'Brien 				else
7732a55deb1SDavid E. O'Brien 					goto nextin;	/* no match */
7742a55deb1SDavid E. O'Brien 			}
775f32a6403SWarner Losh 			if (*q == 0)
776f32a6403SWarner Losh 				break;
777f32a6403SWarner Losh 			q += n;
778f32a6403SWarner Losh 		} while (1);
779f32a6403SWarner Losh 		q++;  /* was *q++ */
7802a55deb1SDavid E. O'Brien 		if (f->out[s])
7812a55deb1SDavid E. O'Brien 			patlen = q-p-1;	/* don't count $ */
7822a55deb1SDavid E. O'Brien 		if (patlen >= 0) {
783f39dd6a9SWarner Losh 			patbeg = (const char *) p;
7842a55deb1SDavid E. O'Brien 			return(1);
7852a55deb1SDavid E. O'Brien 		}
7862a55deb1SDavid E. O'Brien 	nextin:
7872a55deb1SDavid E. O'Brien 		s = 2;
788f32a6403SWarner Losh 		if (*p == 0)
789f32a6403SWarner Losh 			break;
790f32a6403SWarner Losh 		n = u8_rune(&rune, (const char *) p);
791f32a6403SWarner Losh 		p += n;
792f32a6403SWarner Losh 	} while (1); /* was *p++ */
7932a55deb1SDavid E. O'Brien 	return (0);
7942a55deb1SDavid E. O'Brien }
7952a55deb1SDavid E. O'Brien 
nematch(fa * f,const char * p0)796813da98dSDavid E. O'Brien int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
7972a55deb1SDavid E. O'Brien {
7982a55deb1SDavid E. O'Brien 	int s, ns;
799f32a6403SWarner Losh         int n;
800f32a6403SWarner Losh         int rune;
801f39dd6a9SWarner Losh 	const uschar *p = (const uschar *) p0;
802f39dd6a9SWarner Losh 	const uschar *q;
8032a55deb1SDavid E. O'Brien 
80462ebc626SRuslan Ermilov 	s = f->initstat;
805f39dd6a9SWarner Losh 	assert(s < f->state_count);
806f39dd6a9SWarner Losh 
807f39dd6a9SWarner Losh 	patbeg = (const char *)p;
8082a55deb1SDavid E. O'Brien 	patlen = -1;
8092a55deb1SDavid E. O'Brien 	while (*p) {
8102a55deb1SDavid E. O'Brien 		q = p;
8112a55deb1SDavid E. O'Brien 		do {
8122a55deb1SDavid E. O'Brien 			if (f->out[s])		/* final state */
8132a55deb1SDavid E. O'Brien 				patlen = q-p;
814addad6afSRong-En Fan 			/* assert(*q < NCHARS); */
815f32a6403SWarner Losh 			n = u8_rune(&rune, (const char *) q);
816f32a6403SWarner Losh 			if ((ns = get_gototab(f, s, rune)) != 0)
8172a55deb1SDavid E. O'Brien 				s = ns;
8182a55deb1SDavid E. O'Brien 			else
819f32a6403SWarner Losh 				s = cgoto(f, s, rune);
8202a55deb1SDavid E. O'Brien 			if (s == 1) {	/* no transition */
8212a55deb1SDavid E. O'Brien 				if (patlen > 0) {
822f39dd6a9SWarner Losh 					patbeg = (const char *) p;
8232a55deb1SDavid E. O'Brien 					return(1);
8242a55deb1SDavid E. O'Brien 				} else
8252a55deb1SDavid E. O'Brien 					goto nnextin;	/* no nonempty match */
8262a55deb1SDavid E. O'Brien 			}
827f32a6403SWarner Losh 			if (*q == 0)
828f32a6403SWarner Losh 				break;
829f32a6403SWarner Losh 			q += n;
830f32a6403SWarner Losh 		} while (1);
831f32a6403SWarner Losh 		q++;
8322a55deb1SDavid E. O'Brien 		if (f->out[s])
8332a55deb1SDavid E. O'Brien 			patlen = q-p-1;	/* don't count $ */
8342a55deb1SDavid E. O'Brien 		if (patlen > 0 ) {
835f39dd6a9SWarner Losh 			patbeg = (const char *) p;
8362a55deb1SDavid E. O'Brien 			return(1);
8372a55deb1SDavid E. O'Brien 		}
8382a55deb1SDavid E. O'Brien 	nnextin:
8392a55deb1SDavid E. O'Brien 		s = 2;
8402a55deb1SDavid E. O'Brien 		p++;
8412a55deb1SDavid E. O'Brien 	}
8422a55deb1SDavid E. O'Brien 	return (0);
8432a55deb1SDavid E. O'Brien }
8442a55deb1SDavid E. O'Brien 
845f39dd6a9SWarner Losh 
846f39dd6a9SWarner Losh /*
847f39dd6a9SWarner Losh  * NAME
848f39dd6a9SWarner Losh  *     fnematch
849f39dd6a9SWarner Losh  *
850f39dd6a9SWarner Losh  * DESCRIPTION
851f39dd6a9SWarner Losh  *     A stream-fed version of nematch which transfers characters to a
852f39dd6a9SWarner Losh  *     null-terminated buffer. All characters up to and including the last
853f39dd6a9SWarner Losh  *     character of the matching text or EOF are placed in the buffer. If
854f39dd6a9SWarner Losh  *     a match is found, patbeg and patlen are set appropriately.
855f39dd6a9SWarner Losh  *
856f39dd6a9SWarner Losh  * RETURN VALUES
857f39dd6a9SWarner Losh  *     false    No match found.
858f39dd6a9SWarner Losh  *     true     Match found.
859f39dd6a9SWarner Losh  */
860f39dd6a9SWarner Losh 
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)861f39dd6a9SWarner Losh bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
862f39dd6a9SWarner Losh {
863f32a6403SWarner Losh 	char *i, *j, *k, *buf = *pbuf;
864f39dd6a9SWarner Losh 	int bufsize = *pbufsize;
865f32a6403SWarner Losh 	int c, n, ns, s;
866f39dd6a9SWarner Losh 
867f39dd6a9SWarner Losh 	s = pfa->initstat;
868f39dd6a9SWarner Losh 	patlen = 0;
869f39dd6a9SWarner Losh 
870f39dd6a9SWarner Losh 	/*
871f32a6403SWarner Losh 	 * buf <= i <= j <= k <= buf+bufsize
872f39dd6a9SWarner Losh 	 *
873b2376a5fSWarner Losh 	 * i: origin of active substring
874b2376a5fSWarner Losh 	 * j: current character
875f32a6403SWarner Losh 	 * k: destination of the next getc
876f39dd6a9SWarner Losh 	 */
877f39dd6a9SWarner Losh 
878f32a6403SWarner Losh 	i = j = k = buf;
879f32a6403SWarner Losh 
880f32a6403SWarner Losh 	do {
881f32a6403SWarner Losh 		/*
8821023317aSWarner Losh 		 * Call u8_rune with at least awk_mb_cur_max ahead in
883f32a6403SWarner Losh 		 * the buffer until EOF interferes.
884f32a6403SWarner Losh 		 */
885*17853db4SWarner Losh 		if (k - j < (int)awk_mb_cur_max) {
8861023317aSWarner Losh 			if (k + awk_mb_cur_max > buf + bufsize) {
8871023317aSWarner Losh 				char *obuf = buf;
888f32a6403SWarner Losh 				adjbuf((char **) &buf, &bufsize,
8891023317aSWarner Losh 				    bufsize + awk_mb_cur_max,
890f32a6403SWarner Losh 				    quantum, 0, "fnematch");
8911023317aSWarner Losh 
8921023317aSWarner Losh 				/* buf resized, maybe moved. update pointers */
8931023317aSWarner Losh 				*pbufsize = bufsize;
8941023317aSWarner Losh 				if (obuf != buf) {
8951023317aSWarner Losh 					i = buf + (i - obuf);
8961023317aSWarner Losh 					j = buf + (j - obuf);
8971023317aSWarner Losh 					k = buf + (k - obuf);
8981023317aSWarner Losh 					*pbuf = buf;
8991023317aSWarner Losh 					if (patlen)
9001023317aSWarner Losh 						patbeg = buf + (patbeg - obuf);
901f32a6403SWarner Losh 				}
9021023317aSWarner Losh 			}
9031023317aSWarner Losh 			for (n = awk_mb_cur_max ; n > 0; n--) {
904f32a6403SWarner Losh 				*k++ = (c = getc(f)) != EOF ? c : 0;
905f32a6403SWarner Losh 				if (c == EOF) {
906f32a6403SWarner Losh 					if (ferror(f))
907f32a6403SWarner Losh 						FATAL("fnematch: getc error");
908f32a6403SWarner Losh 					break;
909f32a6403SWarner Losh 				}
910f32a6403SWarner Losh 			}
911f32a6403SWarner Losh 		}
912f32a6403SWarner Losh 
913f32a6403SWarner Losh 		j += u8_rune(&c, j);
914f32a6403SWarner Losh 
915f32a6403SWarner Losh 		if ((ns = get_gototab(pfa, s, c)) != 0)
916f39dd6a9SWarner Losh 			s = ns;
917f39dd6a9SWarner Losh 		else
918b2376a5fSWarner Losh 			s = cgoto(pfa, s, c);
919f39dd6a9SWarner Losh 
920f39dd6a9SWarner Losh 		if (pfa->out[s]) {	/* final state */
921f32a6403SWarner Losh 			patbeg = i;
922f32a6403SWarner Losh 			patlen = j - i;
923b2376a5fSWarner Losh 			if (c == 0)	/* don't count $ */
924f39dd6a9SWarner Losh 				patlen--;
925f39dd6a9SWarner Losh 		}
926f32a6403SWarner Losh 
927f32a6403SWarner Losh 		if (c && s != 1)
928f32a6403SWarner Losh 			continue;  /* origin i still viable, next j */
929f32a6403SWarner Losh 		if (patlen)
930f32a6403SWarner Losh 			break;     /* best match found */
931f32a6403SWarner Losh 
932f32a6403SWarner Losh 		/* no match at origin i, next i and start over */
933f32a6403SWarner Losh 		i += u8_rune(&c, i);
934f32a6403SWarner Losh 		if (c == 0)
935f32a6403SWarner Losh 			break;    /* no match */
936f32a6403SWarner Losh 		j = i;
937f39dd6a9SWarner Losh 		s = 2;
938f32a6403SWarner Losh 	} while (1);
939f39dd6a9SWarner Losh 
940f39dd6a9SWarner Losh 	if (patlen) {
941f39dd6a9SWarner Losh 		/*
942f39dd6a9SWarner Losh 		 * Under no circumstances is the last character fed to
943f39dd6a9SWarner Losh 		 * the automaton part of the match. It is EOF's nullbyte,
944f39dd6a9SWarner Losh 		 * or it sent the automaton into a state with no further
945f39dd6a9SWarner Losh 		 * transitions available (s==1), or both. Room for a
946f39dd6a9SWarner Losh 		 * terminating nullbyte is guaranteed.
947f39dd6a9SWarner Losh 		 *
948f39dd6a9SWarner Losh 		 * ungetc any chars after the end of matching text
949f39dd6a9SWarner Losh 		 * (except for EOF's nullbyte, if present) and null
950f39dd6a9SWarner Losh 		 * terminate the buffer.
951f39dd6a9SWarner Losh 		 */
952f39dd6a9SWarner Losh 		do
953f32a6403SWarner Losh 			if (*--k && ungetc(*k, f) == EOF)
954f32a6403SWarner Losh 				FATAL("unable to ungetc '%c'", *k);
955f32a6403SWarner Losh 		while (k > patbeg + patlen);
956f32a6403SWarner Losh 		*k = '\0';
957f39dd6a9SWarner Losh 		return true;
958f39dd6a9SWarner Losh 	}
959f39dd6a9SWarner Losh 	else
960f39dd6a9SWarner Losh 		return false;
961f39dd6a9SWarner Losh }
962f39dd6a9SWarner Losh 
reparse(const char * p)963813da98dSDavid E. O'Brien Node *reparse(const char *p)	/* parses regular expression pointed to by p */
9642a55deb1SDavid E. O'Brien {			/* uses relex() to scan regular expression */
9652a55deb1SDavid E. O'Brien 	Node *np;
9662a55deb1SDavid E. O'Brien 
967f39dd6a9SWarner Losh 	DPRINTF("reparse <%s>\n", p);
968f39dd6a9SWarner Losh 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
9692a55deb1SDavid E. O'Brien 	rtok = relex();
970813da98dSDavid E. O'Brien 	/* GNU compatibility: an empty regexp matches anything */
971addad6afSRong-En Fan 	if (rtok == '\0') {
972813da98dSDavid E. O'Brien 		/* FATAL("empty regular expression"); previous */
973addad6afSRong-En Fan 		return(op2(EMPTYRE, NIL, NIL));
974addad6afSRong-En Fan 	}
9752a55deb1SDavid E. O'Brien 	np = regexp();
9762a55deb1SDavid E. O'Brien 	if (rtok != '\0')
9772a55deb1SDavid E. O'Brien 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
9782a55deb1SDavid E. O'Brien 	return(np);
9792a55deb1SDavid E. O'Brien }
9802a55deb1SDavid E. O'Brien 
regexp(void)9812a55deb1SDavid E. O'Brien Node *regexp(void)	/* top-level parse of reg expr */
9822a55deb1SDavid E. O'Brien {
9832a55deb1SDavid E. O'Brien 	return (alt(concat(primary())));
9842a55deb1SDavid E. O'Brien }
9852a55deb1SDavid E. O'Brien 
primary(void)9862a55deb1SDavid E. O'Brien Node *primary(void)
9872a55deb1SDavid E. O'Brien {
9882a55deb1SDavid E. O'Brien 	Node *np;
989b5253557SWarner Losh 	int savelastatom;
9902a55deb1SDavid E. O'Brien 
9912a55deb1SDavid E. O'Brien 	switch (rtok) {
9922a55deb1SDavid E. O'Brien 	case CHAR:
993b5253557SWarner Losh 		lastatom = starttok;
9942a55deb1SDavid E. O'Brien 		np = op2(CHAR, NIL, itonp(rlxval));
9952a55deb1SDavid E. O'Brien 		rtok = relex();
9962a55deb1SDavid E. O'Brien 		return (unary(np));
9972a55deb1SDavid E. O'Brien 	case ALL:
9982a55deb1SDavid E. O'Brien 		rtok = relex();
9992a55deb1SDavid E. O'Brien 		return (unary(op2(ALL, NIL, NIL)));
1000addad6afSRong-En Fan 	case EMPTYRE:
1001addad6afSRong-En Fan 		rtok = relex();
1002b5253557SWarner Losh 		return (unary(op2(EMPTYRE, NIL, NIL)));
10032a55deb1SDavid E. O'Brien 	case DOT:
1004b5253557SWarner Losh 		lastatom = starttok;
10052a55deb1SDavid E. O'Brien 		rtok = relex();
10062a55deb1SDavid E. O'Brien 		return (unary(op2(DOT, NIL, NIL)));
10072a55deb1SDavid E. O'Brien 	case CCL:
1008f39dd6a9SWarner Losh 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1009b5253557SWarner Losh 		lastatom = starttok;
10102a55deb1SDavid E. O'Brien 		rtok = relex();
10112a55deb1SDavid E. O'Brien 		return (unary(np));
10122a55deb1SDavid E. O'Brien 	case NCCL:
1013f39dd6a9SWarner Losh 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1014b5253557SWarner Losh 		lastatom = starttok;
10152a55deb1SDavid E. O'Brien 		rtok = relex();
10162a55deb1SDavid E. O'Brien 		return (unary(np));
10172a55deb1SDavid E. O'Brien 	case '^':
10182a55deb1SDavid E. O'Brien 		rtok = relex();
10192a55deb1SDavid E. O'Brien 		return (unary(op2(CHAR, NIL, itonp(HAT))));
10202a55deb1SDavid E. O'Brien 	case '$':
10212a55deb1SDavid E. O'Brien 		rtok = relex();
10222a55deb1SDavid E. O'Brien 		return (unary(op2(CHAR, NIL, NIL)));
10232a55deb1SDavid E. O'Brien 	case '(':
1024b5253557SWarner Losh 		lastatom = starttok;
1025b5253557SWarner Losh 		savelastatom = starttok - basestr; /* Retain over recursion */
10262a55deb1SDavid E. O'Brien 		rtok = relex();
10272a55deb1SDavid E. O'Brien 		if (rtok == ')') {	/* special pleading for () */
10282a55deb1SDavid E. O'Brien 			rtok = relex();
1029f32a6403SWarner Losh 			return unary(op2(CCL, NIL, (Node *) cclenter("")));
10302a55deb1SDavid E. O'Brien 		}
10312a55deb1SDavid E. O'Brien 		np = regexp();
10322a55deb1SDavid E. O'Brien 		if (rtok == ')') {
1033b5253557SWarner Losh 			lastatom = basestr + savelastatom; /* Restore */
10342a55deb1SDavid E. O'Brien 			rtok = relex();
10352a55deb1SDavid E. O'Brien 			return (unary(np));
10362a55deb1SDavid E. O'Brien 		}
10372a55deb1SDavid E. O'Brien 		else
10382a55deb1SDavid E. O'Brien 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
10392a55deb1SDavid E. O'Brien 	default:
10402a55deb1SDavid E. O'Brien 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
10412a55deb1SDavid E. O'Brien 	}
10422a55deb1SDavid E. O'Brien 	return 0;	/*NOTREACHED*/
10432a55deb1SDavid E. O'Brien }
10442a55deb1SDavid E. O'Brien 
concat(Node * np)10452a55deb1SDavid E. O'Brien Node *concat(Node *np)
10462a55deb1SDavid E. O'Brien {
10472a55deb1SDavid E. O'Brien 	switch (rtok) {
1048b5253557SWarner Losh 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
10492a55deb1SDavid E. O'Brien 		return (concat(op2(CAT, np, primary())));
1050b5253557SWarner Losh 	case EMPTYRE:
1051b5253557SWarner Losh 		rtok = relex();
1052f32a6403SWarner Losh 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1053b5253557SWarner Losh 				primary())));
10542a55deb1SDavid E. O'Brien 	}
10552a55deb1SDavid E. O'Brien 	return (np);
10562a55deb1SDavid E. O'Brien }
10572a55deb1SDavid E. O'Brien 
alt(Node * np)10582a55deb1SDavid E. O'Brien Node *alt(Node *np)
10592a55deb1SDavid E. O'Brien {
10602a55deb1SDavid E. O'Brien 	if (rtok == OR) {
10612a55deb1SDavid E. O'Brien 		rtok = relex();
10622a55deb1SDavid E. O'Brien 		return (alt(op2(OR, np, concat(primary()))));
10632a55deb1SDavid E. O'Brien 	}
10642a55deb1SDavid E. O'Brien 	return (np);
10652a55deb1SDavid E. O'Brien }
10662a55deb1SDavid E. O'Brien 
unary(Node * np)10672a55deb1SDavid E. O'Brien Node *unary(Node *np)
10682a55deb1SDavid E. O'Brien {
10692a55deb1SDavid E. O'Brien 	switch (rtok) {
10702a55deb1SDavid E. O'Brien 	case STAR:
10712a55deb1SDavid E. O'Brien 		rtok = relex();
10722a55deb1SDavid E. O'Brien 		return (unary(op2(STAR, np, NIL)));
10732a55deb1SDavid E. O'Brien 	case PLUS:
10742a55deb1SDavid E. O'Brien 		rtok = relex();
10752a55deb1SDavid E. O'Brien 		return (unary(op2(PLUS, np, NIL)));
10762a55deb1SDavid E. O'Brien 	case QUEST:
10772a55deb1SDavid E. O'Brien 		rtok = relex();
10782a55deb1SDavid E. O'Brien 		return (unary(op2(QUEST, np, NIL)));
1079f39dd6a9SWarner Losh 	case ZERO:
1080f39dd6a9SWarner Losh 		rtok = relex();
1081f39dd6a9SWarner Losh 		return (unary(op2(ZERO, np, NIL)));
10822a55deb1SDavid E. O'Brien 	default:
10832a55deb1SDavid E. O'Brien 		return (np);
10842a55deb1SDavid E. O'Brien 	}
10852a55deb1SDavid E. O'Brien }
10862a55deb1SDavid E. O'Brien 
1087007c6572SDag-Erling Smørgrav /*
1088007c6572SDag-Erling Smørgrav  * Character class definitions conformant to the POSIX locale as
1089007c6572SDag-Erling Smørgrav  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1090007c6572SDag-Erling Smørgrav  * and operating character sets are both ASCII (ISO646) or supersets
1091007c6572SDag-Erling Smørgrav  * thereof.
1092007c6572SDag-Erling Smørgrav  *
1093007c6572SDag-Erling Smørgrav  * Note that to avoid overflowing the temporary buffer used in
1094007c6572SDag-Erling Smørgrav  * relex(), the expanded character class (prior to range expansion)
1095007c6572SDag-Erling Smørgrav  * must be less than twice the size of their full name.
1096007c6572SDag-Erling Smørgrav  */
1097fc6b1dfeSDavid E. O'Brien 
1098fc6b1dfeSDavid E. O'Brien /* Because isblank doesn't show up in any of the header files on any
1099fc6b1dfeSDavid E. O'Brien  * system i use, it's defined here.  if some other locale has a richer
1100fc6b1dfeSDavid E. O'Brien  * definition of "blank", define HAS_ISBLANK and provide your own
1101fc6b1dfeSDavid E. O'Brien  * version.
110288b8d487SRuslan Ermilov  * the parentheses here are an attempt to find a path through the maze
110388b8d487SRuslan Ermilov  * of macro definition and/or function and/or version provided.  thanks
110488b8d487SRuslan Ermilov  * to nelson beebe for the suggestion; let's see if it works everywhere.
1105fc6b1dfeSDavid E. O'Brien  */
1106fc6b1dfeSDavid E. O'Brien 
110791217c1cSRuslan Ermilov /* #define HAS_ISBLANK */
1108fc6b1dfeSDavid E. O'Brien #ifndef HAS_ISBLANK
1109fc6b1dfeSDavid E. O'Brien 
11101b11b783SRuslan Ermilov int (xisblank)(int c)
1111fc6b1dfeSDavid E. O'Brien {
1112fc6b1dfeSDavid E. O'Brien 	return c==' ' || c=='\t';
1113fc6b1dfeSDavid E. O'Brien }
1114fc6b1dfeSDavid E. O'Brien 
1115fc6b1dfeSDavid E. O'Brien #endif
1116fc6b1dfeSDavid E. O'Brien 
1117f39dd6a9SWarner Losh static const struct charclass {
1118007c6572SDag-Erling Smørgrav 	const char *cc_name;
1119007c6572SDag-Erling Smørgrav 	int cc_namelen;
1120fc6b1dfeSDavid E. O'Brien 	int (*cc_func)(int);
1121007c6572SDag-Erling Smørgrav } charclasses[] = {
1122fc6b1dfeSDavid E. O'Brien 	{ "alnum",	5,	isalnum },
1123fc6b1dfeSDavid E. O'Brien 	{ "alpha",	5,	isalpha },
11241b11b783SRuslan Ermilov #ifndef HAS_ISBLANK
1125b5253557SWarner Losh 	{ "blank",	5,	xisblank },
11261b11b783SRuslan Ermilov #else
1127fc6b1dfeSDavid E. O'Brien 	{ "blank",	5,	isblank },
11281b11b783SRuslan Ermilov #endif
1129fc6b1dfeSDavid E. O'Brien 	{ "cntrl",	5,	iscntrl },
1130fc6b1dfeSDavid E. O'Brien 	{ "digit",	5,	isdigit },
1131fc6b1dfeSDavid E. O'Brien 	{ "graph",	5,	isgraph },
1132fc6b1dfeSDavid E. O'Brien 	{ "lower",	5,	islower },
1133fc6b1dfeSDavid E. O'Brien 	{ "print",	5,	isprint },
1134fc6b1dfeSDavid E. O'Brien 	{ "punct",	5,	ispunct },
1135fc6b1dfeSDavid E. O'Brien 	{ "space",	5,	isspace },
1136fc6b1dfeSDavid E. O'Brien 	{ "upper",	5,	isupper },
1137fc6b1dfeSDavid E. O'Brien 	{ "xdigit",	6,	isxdigit },
1138007c6572SDag-Erling Smørgrav 	{ NULL,		0,	NULL },
1139007c6572SDag-Erling Smørgrav };
1140007c6572SDag-Erling Smørgrav 
1141b5253557SWarner Losh #define REPEAT_SIMPLE		0
1142b5253557SWarner Losh #define REPEAT_PLUS_APPENDED	1
1143b5253557SWarner Losh #define REPEAT_WITH_Q		2
1144b5253557SWarner Losh #define REPEAT_ZERO		3
1145b5253557SWarner Losh 
1146b5253557SWarner Losh static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)1147b5253557SWarner Losh replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1148b5253557SWarner Losh 	       int atomlen, int firstnum, int secondnum, int special_case)
1149b5253557SWarner Losh {
1150b5253557SWarner Losh 	int i, j;
1151b5253557SWarner Losh 	uschar *buf = 0;
1152b5253557SWarner Losh 	int ret = 1;
1153b5253557SWarner Losh 	int init_q = (firstnum == 0);		/* first added char will be ? */
1154b5253557SWarner Losh 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
1155b5253557SWarner Losh 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
1156f39dd6a9SWarner Losh 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
1157b5253557SWarner Losh 	int size = prefix_length +  suffix_length;
1158b5253557SWarner Losh 
1159b5253557SWarner Losh 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
1160b5253557SWarner Losh 		size += atomlen*(firstnum-1);
1161b5253557SWarner Losh 	}
1162b5253557SWarner Losh 
1163b5253557SWarner Losh 	/* Adjust size of buffer for special cases */
1164b5253557SWarner Losh 	if (special_case == REPEAT_PLUS_APPENDED) {
1165b5253557SWarner Losh 		size++;		/* for the final + */
1166b5253557SWarner Losh 	} else if (special_case == REPEAT_WITH_Q) {
116723f24377SWarner Losh 		size += init_q + (atomlen+1)* (n_q_reps-init_q);
1168b5253557SWarner Losh 	} else if (special_case == REPEAT_ZERO) {
1169b5253557SWarner Losh 		size += 2;	/* just a null ERE: () */
1170b5253557SWarner Losh 	}
1171b5253557SWarner Losh 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
1172b5253557SWarner Losh 		FATAL("out of space in reg expr %.10s..", lastre);
1173b5253557SWarner Losh 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
1174b5253557SWarner Losh 	j = prefix_length;
1175b5253557SWarner Losh 	if (special_case == REPEAT_ZERO) {
1176b5253557SWarner Losh 		j -= atomlen;
1177b5253557SWarner Losh 		buf[j++] = '(';
1178b5253557SWarner Losh 		buf[j++] = ')';
1179b5253557SWarner Losh 	}
1180b5253557SWarner Losh 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
1181b5253557SWarner Losh 		memcpy(&buf[j], atom, atomlen);
1182b5253557SWarner Losh 		j += atomlen;
1183b5253557SWarner Losh 	}
1184b5253557SWarner Losh 	if (special_case == REPEAT_PLUS_APPENDED) {
1185b5253557SWarner Losh 		buf[j++] = '+';
1186b5253557SWarner Losh 	} else if (special_case == REPEAT_WITH_Q) {
1187f39dd6a9SWarner Losh 		if (init_q)
1188f39dd6a9SWarner Losh 			buf[j++] = '?';
1189f39dd6a9SWarner Losh 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
1190b5253557SWarner Losh 			memcpy(&buf[j], atom, atomlen);
1191b5253557SWarner Losh 			j += atomlen;
1192b5253557SWarner Losh 			buf[j++] = '?';
1193b5253557SWarner Losh 		}
1194b5253557SWarner Losh 	}
1195b5253557SWarner Losh 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
119623f24377SWarner Losh 	j += suffix_length;
119723f24377SWarner Losh 	buf[j] = '\0';
1198b5253557SWarner Losh 	/* free old basestr */
1199b5253557SWarner Losh 	if (firstbasestr != basestr) {
1200b5253557SWarner Losh 		if (basestr)
1201b5253557SWarner Losh 			xfree(basestr);
1202b5253557SWarner Losh 	}
1203b5253557SWarner Losh 	basestr = buf;
1204b5253557SWarner Losh 	prestr  = buf + prefix_length;
1205b5253557SWarner Losh 	if (special_case == REPEAT_ZERO) {
1206b5253557SWarner Losh 		prestr  -= atomlen;
1207b5253557SWarner Losh 		ret++;
1208b5253557SWarner Losh 	}
1209b5253557SWarner Losh 	return ret;
1210b5253557SWarner Losh }
1211b5253557SWarner Losh 
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)1212b5253557SWarner Losh static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1213b5253557SWarner Losh 		  int atomlen, int firstnum, int secondnum)
1214b5253557SWarner Losh {
1215b5253557SWarner Losh 	/*
1216b5253557SWarner Losh 	   In general, the repetition specifier or "bound" is replaced here
1217b5253557SWarner Losh 	   by an equivalent ERE string, repeating the immediately previous atom
1218b5253557SWarner Losh 	   and appending ? and + as needed. Note that the first copy of the
1219b5253557SWarner Losh 	   atom is left in place, except in the special_case of a zero-repeat
1220b5253557SWarner Losh 	   (i.e., {0}).
1221b5253557SWarner Losh 	 */
1222b5253557SWarner Losh 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1223b5253557SWarner Losh 		if (firstnum < 2) {
1224b5253557SWarner Losh 			/* 0 or 1: should be handled before you get here */
1225b5253557SWarner Losh 			FATAL("internal error");
1226b5253557SWarner Losh 		} else {
1227b5253557SWarner Losh 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1228b5253557SWarner Losh 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1229b5253557SWarner Losh 		}
1230b5253557SWarner Losh 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1231b5253557SWarner Losh 		if (firstnum == 0) {	/* {0} or {0,0} */
1232b5253557SWarner Losh 			/* This case is unusual because the resulting
1233b5253557SWarner Losh 			   replacement string might actually be SMALLER than
1234b5253557SWarner Losh 			   the original ERE */
1235b5253557SWarner Losh 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1236b5253557SWarner Losh 					firstnum, secondnum, REPEAT_ZERO);
1237b5253557SWarner Losh 		} else {		/* (firstnum >= 1) */
1238b5253557SWarner Losh 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1239b5253557SWarner Losh 					firstnum, secondnum, REPEAT_SIMPLE);
1240b5253557SWarner Losh 		}
1241b5253557SWarner Losh 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1242b5253557SWarner Losh 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1243b5253557SWarner Losh 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1244b5253557SWarner Losh 					firstnum, secondnum, REPEAT_WITH_Q);
1245b5253557SWarner Losh 	} else {	/* Error - shouldn't be here (n>m) */
1246b5253557SWarner Losh 		FATAL("internal error");
1247b5253557SWarner Losh 	}
1248b5253557SWarner Losh 	return 0;
1249b5253557SWarner Losh }
1250007c6572SDag-Erling Smørgrav 
relex(void)12512a55deb1SDavid E. O'Brien int relex(void)		/* lexical analyzer for reparse */
12522a55deb1SDavid E. O'Brien {
12532a55deb1SDavid E. O'Brien 	int c, n;
12542a55deb1SDavid E. O'Brien 	int cflag;
125510ce5b99SWarner Losh 	static uschar *buf = NULL;
12562a55deb1SDavid E. O'Brien 	static int bufsz = 100;
12572a55deb1SDavid E. O'Brien 	uschar *bp;
1258f39dd6a9SWarner Losh 	const struct charclass *cc;
1259fc6b1dfeSDavid E. O'Brien 	int i;
1260f39dd6a9SWarner Losh 	int num, m;
1261f39dd6a9SWarner Losh 	bool commafound, digitfound;
1262b5253557SWarner Losh 	const uschar *startreptok;
1263f39dd6a9SWarner Losh 	static int parens = 0;
1264b5253557SWarner Losh 
1265b5253557SWarner Losh rescan:
1266b5253557SWarner Losh 	starttok = prestr;
12672a55deb1SDavid E. O'Brien 
1268f32a6403SWarner Losh 	if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1269f32a6403SWarner Losh 		prestr += n;
1270f32a6403SWarner Losh 		starttok = prestr;
1271f32a6403SWarner Losh 		return CHAR;
1272f32a6403SWarner Losh 	}
1273f32a6403SWarner Losh 
12742a55deb1SDavid E. O'Brien 	switch (c = *prestr++) {
12752a55deb1SDavid E. O'Brien 	case '|': return OR;
12762a55deb1SDavid E. O'Brien 	case '*': return STAR;
12772a55deb1SDavid E. O'Brien 	case '+': return PLUS;
12782a55deb1SDavid E. O'Brien 	case '?': return QUEST;
12792a55deb1SDavid E. O'Brien 	case '.': return DOT;
12802a55deb1SDavid E. O'Brien 	case '\0': prestr--; return '\0';
12812a55deb1SDavid E. O'Brien 	case '^':
12822a55deb1SDavid E. O'Brien 	case '$':
12832a55deb1SDavid E. O'Brien 		return c;
1284f39dd6a9SWarner Losh 	case '(':
1285f39dd6a9SWarner Losh 		parens++;
1286f39dd6a9SWarner Losh 		return c;
1287f39dd6a9SWarner Losh 	case ')':
1288f39dd6a9SWarner Losh 		if (parens) {
1289f39dd6a9SWarner Losh 			parens--;
1290f39dd6a9SWarner Losh 			return c;
1291f39dd6a9SWarner Losh 		}
1292f39dd6a9SWarner Losh 		/* unmatched close parenthesis; per POSIX, treat as literal */
1293f39dd6a9SWarner Losh 		rlxval = c;
1294f39dd6a9SWarner Losh 		return CHAR;
12952a55deb1SDavid E. O'Brien 	case '\\':
1296d86a0988SRuslan Ermilov 		rlxval = quoted(&prestr);
12972a55deb1SDavid E. O'Brien 		return CHAR;
12982a55deb1SDavid E. O'Brien 	default:
12992a55deb1SDavid E. O'Brien 		rlxval = c;
13002a55deb1SDavid E. O'Brien 		return CHAR;
13012a55deb1SDavid E. O'Brien 	case '[':
130210ce5b99SWarner Losh 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
13032a55deb1SDavid E. O'Brien 			FATAL("out of space in reg expr %.10s..", lastre);
13042a55deb1SDavid E. O'Brien 		bp = buf;
13052a55deb1SDavid E. O'Brien 		if (*prestr == '^') {
13062a55deb1SDavid E. O'Brien 			cflag = 1;
13072a55deb1SDavid E. O'Brien 			prestr++;
13082a55deb1SDavid E. O'Brien 		}
13092a55deb1SDavid E. O'Brien 		else
13102a55deb1SDavid E. O'Brien 			cflag = 0;
1311f32a6403SWarner Losh 		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1312addad6afSRong-En Fan 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
13132a55deb1SDavid E. O'Brien 			FATAL("out of space for reg expr %.10s...", lastre);
13142a55deb1SDavid E. O'Brien 		for (; ; ) {
1315f32a6403SWarner Losh 			if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1316f32a6403SWarner Losh 				for (i = 0; i < n; i++)
1317f32a6403SWarner Losh 					*bp++ = *prestr++;
1318f32a6403SWarner Losh 				continue;
1319f32a6403SWarner Losh 			}
13202a55deb1SDavid E. O'Brien 			if ((c = *prestr++) == '\\') {
13212a55deb1SDavid E. O'Brien 				*bp++ = '\\';
13222a55deb1SDavid E. O'Brien 				if ((c = *prestr++) == '\0')
13232a55deb1SDavid E. O'Brien 					FATAL("nonterminated character class %.20s...", lastre);
13242a55deb1SDavid E. O'Brien 				*bp++ = c;
13252a55deb1SDavid E. O'Brien 			/* } else if (c == '\n') { */
13262a55deb1SDavid E. O'Brien 			/* 	FATAL("newline in character class %.20s...", lastre); */
1327007c6572SDag-Erling Smørgrav 			} else if (c == '[' && *prestr == ':') {
1328007c6572SDag-Erling Smørgrav 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1329007c6572SDag-Erling Smørgrav 				for (cc = charclasses; cc->cc_name; cc++)
1330007c6572SDag-Erling Smørgrav 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1331007c6572SDag-Erling Smørgrav 						break;
1332007c6572SDag-Erling Smørgrav 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1333007c6572SDag-Erling Smørgrav 				    prestr[2 + cc->cc_namelen] == ']') {
1334007c6572SDag-Erling Smørgrav 					prestr += cc->cc_namelen + 3;
1335b5253557SWarner Losh 					/*
1336b5253557SWarner Losh 					 * BUG: We begin at 1, instead of 0, since we
1337b5253557SWarner Losh 					 * would otherwise prematurely terminate the
1338b5253557SWarner Losh 					 * string for classes like [[:cntrl:]]. This
1339b5253557SWarner Losh 					 * means that we can't match the NUL character,
1340b5253557SWarner Losh 					 * not without first adapting the entire
1341b5253557SWarner Losh 					 * program to track each string's length.
1342b5253557SWarner Losh 					 */
1343b5253557SWarner Losh 					for (i = 1; i <= UCHAR_MAX; i++) {
1344f32a6403SWarner Losh 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1345fc6b1dfeSDavid E. O'Brien 						    FATAL("out of space for reg expr %.10s...", lastre);
1346fc6b1dfeSDavid E. O'Brien 						if (cc->cc_func(i)) {
1347f39dd6a9SWarner Losh 							/* escape backslash */
1348f39dd6a9SWarner Losh 							if (i == '\\') {
1349f39dd6a9SWarner Losh 								*bp++ = '\\';
1350f39dd6a9SWarner Losh 								n++;
1351f39dd6a9SWarner Losh 							}
1352f39dd6a9SWarner Losh 
1353fc6b1dfeSDavid E. O'Brien 							*bp++ = i;
1354fc6b1dfeSDavid E. O'Brien 							n++;
1355fc6b1dfeSDavid E. O'Brien 						}
1356fc6b1dfeSDavid E. O'Brien 					}
1357007c6572SDag-Erling Smørgrav 				} else
1358007c6572SDag-Erling Smørgrav 					*bp++ = c;
1359b5253557SWarner Losh 			} else if (c == '[' && *prestr == '.') {
1360b5253557SWarner Losh 				char collate_char;
1361b5253557SWarner Losh 				prestr++;
1362b5253557SWarner Losh 				collate_char = *prestr++;
1363b5253557SWarner Losh 				if (*prestr == '.' && prestr[1] == ']') {
1364b5253557SWarner Losh 					prestr += 2;
1365b5253557SWarner Losh 					/* Found it: map via locale TBD: for
1366b5253557SWarner Losh 					   now, simply return this char.  This
1367b5253557SWarner Losh 					   is sufficient to pass conformance
1368b5253557SWarner Losh 					   test awk.ex 156
1369b5253557SWarner Losh 					 */
1370b5253557SWarner Losh 					if (*prestr == ']') {
1371b5253557SWarner Losh 						prestr++;
1372b5253557SWarner Losh 						rlxval = collate_char;
1373b5253557SWarner Losh 						return CHAR;
1374b5253557SWarner Losh 					}
1375b5253557SWarner Losh 				}
1376b5253557SWarner Losh 			} else if (c == '[' && *prestr == '=') {
1377b5253557SWarner Losh 				char equiv_char;
1378b5253557SWarner Losh 				prestr++;
1379b5253557SWarner Losh 				equiv_char = *prestr++;
1380b5253557SWarner Losh 				if (*prestr == '=' && prestr[1] == ']') {
1381b5253557SWarner Losh 					prestr += 2;
1382b5253557SWarner Losh 					/* Found it: map via locale TBD: for now
1383b5253557SWarner Losh 					   simply return this char. This is
1384b5253557SWarner Losh 					   sufficient to pass conformance test
1385b5253557SWarner Losh 					   awk.ex 156
1386b5253557SWarner Losh 					 */
1387b5253557SWarner Losh 					if (*prestr == ']') {
1388b5253557SWarner Losh 						prestr++;
1389b5253557SWarner Losh 						rlxval = equiv_char;
1390b5253557SWarner Losh 						return CHAR;
1391b5253557SWarner Losh 					}
1392b5253557SWarner Losh 				}
13932a55deb1SDavid E. O'Brien 			} else if (c == '\0') {
13942a55deb1SDavid E. O'Brien 				FATAL("nonterminated character class %.20s", lastre);
13952a55deb1SDavid E. O'Brien 			} else if (bp == buf) {	/* 1st char is special */
13962a55deb1SDavid E. O'Brien 				*bp++ = c;
13972a55deb1SDavid E. O'Brien 			} else if (c == ']') {
13982a55deb1SDavid E. O'Brien 				*bp++ = 0;
13992a55deb1SDavid E. O'Brien 				rlxstr = (uschar *) tostring((char *) buf);
14002a55deb1SDavid E. O'Brien 				if (cflag == 0)
14012a55deb1SDavid E. O'Brien 					return CCL;
14022a55deb1SDavid E. O'Brien 				else
14032a55deb1SDavid E. O'Brien 					return NCCL;
14042a55deb1SDavid E. O'Brien 			} else
14052a55deb1SDavid E. O'Brien 				*bp++ = c;
14062a55deb1SDavid E. O'Brien 		}
1407b5253557SWarner Losh 		break;
1408b5253557SWarner Losh 	case '{':
1409f32a6403SWarner Losh 		if (isdigit((int) *(prestr))) {
1410b5253557SWarner Losh 			num = 0;	/* Process as a repetition */
1411b5253557SWarner Losh 			n = -1; m = -1;
1412f39dd6a9SWarner Losh 			commafound = false;
1413f39dd6a9SWarner Losh 			digitfound = false;
1414b5253557SWarner Losh 			startreptok = prestr-1;
1415b5253557SWarner Losh 			/* Remember start of previous atom here ? */
1416b5253557SWarner Losh 		} else {        	/* just a { char, not a repetition */
1417b5253557SWarner Losh 			rlxval = c;
1418b5253557SWarner Losh 			return CHAR;
1419b5253557SWarner Losh                 }
1420b5253557SWarner Losh 		for (; ; ) {
1421b5253557SWarner Losh 			if ((c = *prestr++) == '}') {
1422b5253557SWarner Losh 				if (commafound) {
1423b5253557SWarner Losh 					if (digitfound) { /* {n,m} */
1424b5253557SWarner Losh 						m = num;
1425b5253557SWarner Losh 						if (m < n)
1426b5253557SWarner Losh 							FATAL("illegal repetition expression: class %.20s",
1427b5253557SWarner Losh 								lastre);
1428f39dd6a9SWarner Losh 						if (n == 0 && m == 1) {
1429b5253557SWarner Losh 							return QUEST;
1430b5253557SWarner Losh 						}
1431b5253557SWarner Losh 					} else {	/* {n,} */
1432f39dd6a9SWarner Losh 						if (n == 0)
1433f39dd6a9SWarner Losh 							return STAR;
1434f39dd6a9SWarner Losh 						else if (n == 1)
1435f39dd6a9SWarner Losh 							return PLUS;
1436b5253557SWarner Losh 					}
1437b5253557SWarner Losh 				} else {
1438b5253557SWarner Losh 					if (digitfound) { /* {n} same as {n,n} */
1439b5253557SWarner Losh 						n = num;
1440b5253557SWarner Losh 						m = num;
1441b5253557SWarner Losh 					} else {	/* {} */
1442b5253557SWarner Losh 						FATAL("illegal repetition expression: class %.20s",
1443b5253557SWarner Losh 							lastre);
1444b5253557SWarner Losh 					}
1445b5253557SWarner Losh 				}
1446b5253557SWarner Losh 				if (repeat(starttok, prestr-starttok, lastatom,
1447b5253557SWarner Losh 					   startreptok - lastatom, n, m) > 0) {
1448f39dd6a9SWarner Losh 					if (n == 0 && m == 0) {
1449f39dd6a9SWarner Losh 						return ZERO;
1450b5253557SWarner Losh 					}
1451b5253557SWarner Losh 					/* must rescan input for next token */
1452b5253557SWarner Losh 					goto rescan;
1453b5253557SWarner Losh 				}
1454b5253557SWarner Losh 				/* Failed to replace: eat up {...} characters
1455b5253557SWarner Losh 				   and treat like just PLUS */
1456b5253557SWarner Losh 				return PLUS;
1457b5253557SWarner Losh 			} else if (c == '\0') {
1458b5253557SWarner Losh 				FATAL("nonterminated character class %.20s",
1459b5253557SWarner Losh 					lastre);
1460b5253557SWarner Losh 			} else if (isdigit(c)) {
1461b5253557SWarner Losh 				num = 10 * num + c - '0';
1462f39dd6a9SWarner Losh 				digitfound = true;
1463b5253557SWarner Losh 			} else if (c == ',') {
1464b5253557SWarner Losh 				if (commafound)
1465b5253557SWarner Losh 					FATAL("illegal repetition expression: class %.20s",
1466b5253557SWarner Losh 						lastre);
1467b5253557SWarner Losh 				/* looking for {n,} or {n,m} */
1468f39dd6a9SWarner Losh 				commafound = true;
1469b5253557SWarner Losh 				n = num;
1470f39dd6a9SWarner Losh 				digitfound = false; /* reset */
1471b5253557SWarner Losh 				num = 0;
1472b5253557SWarner Losh 			} else {
1473b5253557SWarner Losh 				FATAL("illegal repetition expression: class %.20s",
1474b5253557SWarner Losh 					lastre);
1475b5253557SWarner Losh 			}
1476b5253557SWarner Losh 		}
1477b5253557SWarner Losh 		break;
14782a55deb1SDavid E. O'Brien 	}
14792a55deb1SDavid E. O'Brien }
14802a55deb1SDavid E. O'Brien 
cgoto(fa * f,int s,int c)14812a55deb1SDavid E. O'Brien int cgoto(fa *f, int s, int c)
14822a55deb1SDavid E. O'Brien {
14832a55deb1SDavid E. O'Brien 	int *p, *q;
1484f39dd6a9SWarner Losh 	int i, j, k;
14852a55deb1SDavid E. O'Brien 
1486f32a6403SWarner Losh 	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
14872a55deb1SDavid E. O'Brien 	while (f->accept >= maxsetvec) {	/* guessing here! */
1488f39dd6a9SWarner Losh 		resizesetvec(__func__);
14892a55deb1SDavid E. O'Brien 	}
14902a55deb1SDavid E. O'Brien 	for (i = 0; i <= f->accept; i++)
14912a55deb1SDavid E. O'Brien 		setvec[i] = 0;
14922a55deb1SDavid E. O'Brien 	setcnt = 0;
1493f39dd6a9SWarner Losh 	resize_state(f, s);
14942a55deb1SDavid E. O'Brien 	/* compute positions of gototab[s,c] into setvec */
14952a55deb1SDavid E. O'Brien 	p = f->posns[s];
14962a55deb1SDavid E. O'Brien 	for (i = 1; i <= *p; i++) {
14972a55deb1SDavid E. O'Brien 		if ((k = f->re[p[i]].ltype) != FINAL) {
14982a55deb1SDavid E. O'Brien 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
14992a55deb1SDavid E. O'Brien 			 || (k == DOT && c != 0 && c != HAT)
15002a55deb1SDavid E. O'Brien 			 || (k == ALL && c != 0)
1501addad6afSRong-En Fan 			 || (k == EMPTYRE && c != 0)
1502f32a6403SWarner Losh 			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1503f32a6403SWarner Losh 			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
15042a55deb1SDavid E. O'Brien 				q = f->re[p[i]].lfollow;
15052a55deb1SDavid E. O'Brien 				for (j = 1; j <= *q; j++) {
15062a55deb1SDavid E. O'Brien 					if (q[j] >= maxsetvec) {
1507f39dd6a9SWarner Losh 						resizesetvec(__func__);
15082a55deb1SDavid E. O'Brien 					}
15092a55deb1SDavid E. O'Brien 					if (setvec[q[j]] == 0) {
15102a55deb1SDavid E. O'Brien 						setcnt++;
15112a55deb1SDavid E. O'Brien 						setvec[q[j]] = 1;
15122a55deb1SDavid E. O'Brien 					}
15132a55deb1SDavid E. O'Brien 				}
15142a55deb1SDavid E. O'Brien 			}
15152a55deb1SDavid E. O'Brien 		}
15162a55deb1SDavid E. O'Brien 	}
15172a55deb1SDavid E. O'Brien 	/* determine if setvec is a previous state */
15182a55deb1SDavid E. O'Brien 	tmpset[0] = setcnt;
15192a55deb1SDavid E. O'Brien 	j = 1;
15202a55deb1SDavid E. O'Brien 	for (i = f->accept; i >= 0; i--)
15212a55deb1SDavid E. O'Brien 		if (setvec[i]) {
15222a55deb1SDavid E. O'Brien 			tmpset[j++] = i;
15232a55deb1SDavid E. O'Brien 		}
1524f39dd6a9SWarner Losh 	resize_state(f, f->curstat > s ? f->curstat : s);
15252a55deb1SDavid E. O'Brien 	/* tmpset == previous state? */
15262a55deb1SDavid E. O'Brien 	for (i = 1; i <= f->curstat; i++) {
15272a55deb1SDavid E. O'Brien 		p = f->posns[i];
15282a55deb1SDavid E. O'Brien 		if ((k = tmpset[0]) != p[0])
15292a55deb1SDavid E. O'Brien 			goto different;
15302a55deb1SDavid E. O'Brien 		for (j = 1; j <= k; j++)
15312a55deb1SDavid E. O'Brien 			if (tmpset[j] != p[j])
15322a55deb1SDavid E. O'Brien 				goto different;
15332a55deb1SDavid E. O'Brien 		/* setvec is state i */
1534f39dd6a9SWarner Losh 		if (c != HAT)
1535f32a6403SWarner Losh 			set_gototab(f, s, c, i);
15362a55deb1SDavid E. O'Brien 		return i;
15372a55deb1SDavid E. O'Brien 	  different:;
15382a55deb1SDavid E. O'Brien 	}
15392a55deb1SDavid E. O'Brien 
15402a55deb1SDavid E. O'Brien 	/* add tmpset to current set of states */
15412a55deb1SDavid E. O'Brien 	++(f->curstat);
1542f39dd6a9SWarner Losh 	resize_state(f, f->curstat);
1543f32a6403SWarner Losh 	clear_gototab(f, f->curstat);
15442a55deb1SDavid E. O'Brien 	xfree(f->posns[f->curstat]);
1545f39dd6a9SWarner Losh 	p = intalloc(setcnt + 1, __func__);
15462a55deb1SDavid E. O'Brien 
15472a55deb1SDavid E. O'Brien 	f->posns[f->curstat] = p;
1548f39dd6a9SWarner Losh 	if (c != HAT)
1549f32a6403SWarner Losh 		set_gototab(f, s, c, f->curstat);
15502a55deb1SDavid E. O'Brien 	for (i = 0; i <= setcnt; i++)
15512a55deb1SDavid E. O'Brien 		p[i] = tmpset[i];
15522a55deb1SDavid E. O'Brien 	if (setvec[f->accept])
15532a55deb1SDavid E. O'Brien 		f->out[f->curstat] = 1;
15542a55deb1SDavid E. O'Brien 	else
15552a55deb1SDavid E. O'Brien 		f->out[f->curstat] = 0;
15562a55deb1SDavid E. O'Brien 	return f->curstat;
15572a55deb1SDavid E. O'Brien }
15582a55deb1SDavid E. O'Brien 
15592a55deb1SDavid E. O'Brien 
freefa(fa * f)15602a55deb1SDavid E. O'Brien void freefa(fa *f)	/* free a finite automaton */
15612a55deb1SDavid E. O'Brien {
15622a55deb1SDavid E. O'Brien 	int i;
15632a55deb1SDavid E. O'Brien 
15642a55deb1SDavid E. O'Brien 	if (f == NULL)
15652a55deb1SDavid E. O'Brien 		return;
1566f39dd6a9SWarner Losh 	for (i = 0; i < f->state_count; i++)
1567f32a6403SWarner Losh 		xfree(f->gototab[i].entries);
1568f32a6403SWarner Losh 	xfree(f->gototab);
15692a55deb1SDavid E. O'Brien 	for (i = 0; i <= f->curstat; i++)
15702a55deb1SDavid E. O'Brien 		xfree(f->posns[i]);
15712a55deb1SDavid E. O'Brien 	for (i = 0; i <= f->accept; i++) {
15722a55deb1SDavid E. O'Brien 		xfree(f->re[i].lfollow);
15732a55deb1SDavid E. O'Brien 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1574f39dd6a9SWarner Losh 			xfree(f->re[i].lval.np);
15752a55deb1SDavid E. O'Brien 	}
15762a55deb1SDavid E. O'Brien 	xfree(f->restr);
1577f39dd6a9SWarner Losh 	xfree(f->out);
1578f39dd6a9SWarner Losh 	xfree(f->posns);
1579f39dd6a9SWarner Losh 	xfree(f->gototab);
15802a55deb1SDavid E. O'Brien 	xfree(f);
15812a55deb1SDavid E. O'Brien }
1582