xref: /freebsd/contrib/one-true-awk/b.c (revision f32a6403d34654ac6e61182d09abb5e85850e1ee)
12a55deb1SDavid E. O'Brien /****************************************************************
22a55deb1SDavid E. O'Brien Copyright (C) Lucent Technologies 1997
32a55deb1SDavid E. O'Brien All Rights Reserved
42a55deb1SDavid E. O'Brien 
52a55deb1SDavid E. O'Brien Permission to use, copy, modify, and distribute this software and
62a55deb1SDavid E. O'Brien its documentation for any purpose and without fee is hereby
72a55deb1SDavid E. O'Brien granted, provided that the above copyright notice appear in all
82a55deb1SDavid E. O'Brien copies and that both that the copyright notice and this
92a55deb1SDavid E. O'Brien permission notice and warranty disclaimer appear in supporting
102a55deb1SDavid E. O'Brien documentation, and that the name Lucent Technologies or any of
112a55deb1SDavid E. O'Brien its entities not be used in advertising or publicity pertaining
122a55deb1SDavid E. O'Brien to distribution of the software without specific, written prior
132a55deb1SDavid E. O'Brien permission.
142a55deb1SDavid E. O'Brien 
152a55deb1SDavid E. O'Brien LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
162a55deb1SDavid E. O'Brien INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
172a55deb1SDavid E. O'Brien IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
182a55deb1SDavid E. O'Brien SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
192a55deb1SDavid E. O'Brien WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
202a55deb1SDavid E. O'Brien IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
212a55deb1SDavid E. O'Brien ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
222a55deb1SDavid E. O'Brien THIS SOFTWARE.
232a55deb1SDavid E. O'Brien ****************************************************************/
242a55deb1SDavid E. O'Brien 
25addad6afSRong-En Fan /* lasciate ogne speranza, voi ch'intrate. */
262a55deb1SDavid E. O'Brien 
272a55deb1SDavid E. O'Brien #define	DEBUG
282a55deb1SDavid E. O'Brien 
292a55deb1SDavid E. O'Brien #include <ctype.h>
30b5253557SWarner Losh #include <limits.h>
312a55deb1SDavid E. O'Brien #include <stdio.h>
322a55deb1SDavid E. O'Brien #include <string.h>
332a55deb1SDavid E. O'Brien #include <stdlib.h>
342a55deb1SDavid E. O'Brien #include "awk.h"
35f39dd6a9SWarner Losh #include "awkgram.tab.h"
362a55deb1SDavid E. O'Brien 
372a55deb1SDavid E. O'Brien #define MAXLIN 22
382a55deb1SDavid E. O'Brien 
392a55deb1SDavid E. O'Brien #define type(v)		(v)->nobj	/* badly overloaded here */
402a55deb1SDavid E. O'Brien #define info(v)		(v)->ntype	/* badly overloaded here */
412a55deb1SDavid E. O'Brien #define left(v)		(v)->narg[0]
422a55deb1SDavid E. O'Brien #define right(v)	(v)->narg[1]
432a55deb1SDavid E. O'Brien #define parent(v)	(v)->nnext
442a55deb1SDavid E. O'Brien 
452a55deb1SDavid E. O'Brien #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46addad6afSRong-En Fan #define ELEAF	case EMPTYRE:		/* empty string in regexp */
472a55deb1SDavid E. O'Brien #define UNARY	case STAR: case PLUS: case QUEST:
482a55deb1SDavid E. O'Brien 
492a55deb1SDavid E. O'Brien /* encoding in tree Nodes:
50addad6afSRong-En Fan 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
512a55deb1SDavid E. O'Brien 		left is index, right contains value or pointer to value
522a55deb1SDavid E. O'Brien 	unary (STAR, PLUS, QUEST): left is child, right is null
532a55deb1SDavid E. O'Brien 	binary (CAT, OR): left and right are children
542a55deb1SDavid E. O'Brien 	parent contains pointer to parent
552a55deb1SDavid E. O'Brien */
562a55deb1SDavid E. O'Brien 
572a55deb1SDavid E. O'Brien 
582a55deb1SDavid E. O'Brien int	*setvec;
592a55deb1SDavid E. O'Brien int	*tmpset;
602a55deb1SDavid E. O'Brien int	maxsetvec = 0;
612a55deb1SDavid E. O'Brien 
622a55deb1SDavid E. O'Brien int	rtok;		/* next token in current re */
632a55deb1SDavid E. O'Brien int	rlxval;
64f39dd6a9SWarner Losh static const uschar	*rlxstr;
65f39dd6a9SWarner Losh static const uschar	*prestr;	/* current position in current re */
66f39dd6a9SWarner Losh static const uschar	*lastre;	/* origin of last re */
67f39dd6a9SWarner Losh static const uschar	*lastatom;	/* origin of last Atom */
68f39dd6a9SWarner Losh static const uschar	*starttok;
69f39dd6a9SWarner Losh static const uschar 	*basestr;	/* starts with original, replaced during
70b5253557SWarner Losh 				   repetition processing */
71f39dd6a9SWarner Losh static const uschar 	*firstbasestr;
722a55deb1SDavid E. O'Brien 
732a55deb1SDavid E. O'Brien static	int setcnt;
742a55deb1SDavid E. O'Brien static	int poscnt;
752a55deb1SDavid E. O'Brien 
76f39dd6a9SWarner Losh const char	*patbeg;
772a55deb1SDavid E. O'Brien int	patlen;
782a55deb1SDavid E. O'Brien 
79f39dd6a9SWarner Losh #define	NFA	128	/* cache this many dynamic fa's */
802a55deb1SDavid E. O'Brien fa	*fatab[NFA];
812a55deb1SDavid E. O'Brien int	nfatab	= 0;	/* entries in fatab */
822a55deb1SDavid E. O'Brien 
83*f32a6403SWarner Losh extern int u8_nextlen(const char *s);
84*f32a6403SWarner Losh 
85*f32a6403SWarner Losh 
86*f32a6403SWarner Losh /* utf-8 mechanism:
87*f32a6403SWarner Losh 
88*f32a6403SWarner Losh    For most of Awk, utf-8 strings just "work", since they look like
89*f32a6403SWarner Losh    null-terminated sequences of 8-bit bytes.
90*f32a6403SWarner Losh 
91*f32a6403SWarner Losh    Functions like length(), index(), and substr() have to operate
92*f32a6403SWarner Losh    in units of utf-8 characters.  The u8_* functions in run.c
93*f32a6403SWarner Losh    handle this.
94*f32a6403SWarner Losh 
95*f32a6403SWarner Losh    Regular expressions are more complicated, since the basic
96*f32a6403SWarner Losh    mechanism of the goto table used 8-bit byte indices into the
97*f32a6403SWarner Losh    gototab entries to compute the next state.  Unicode is a lot
98*f32a6403SWarner Losh    bigger, so the gototab entries are now structs with a character
99*f32a6403SWarner Losh    and a next state. These are sorted by code point and binary
100*f32a6403SWarner Losh    searched.
101*f32a6403SWarner Losh 
102*f32a6403SWarner Losh    Throughout the RE mechanism in b.c, utf-8 characters are
103*f32a6403SWarner Losh    converted to their utf-32 value.  This mostly shows up in
104*f32a6403SWarner Losh    cclenter, which expands character class ranges like a-z and now
105*f32a6403SWarner Losh    alpha-omega.  The size of a gototab array is still about 256.
106*f32a6403SWarner Losh    This should be dynamic, but for now things work ok for a single
107*f32a6403SWarner Losh    code page of Unicode, which is the most likely case.
108*f32a6403SWarner Losh 
109*f32a6403SWarner Losh    The code changes are localized in run.c and b.c.  I have added a
110*f32a6403SWarner Losh    handful of functions to somewhat better hide the implementation,
111*f32a6403SWarner Losh    but a lot more could be done.
112*f32a6403SWarner Losh 
113*f32a6403SWarner Losh  */
114*f32a6403SWarner Losh 
115*f32a6403SWarner Losh static int entry_cmp(const void *l, const void *r);
116*f32a6403SWarner Losh static int get_gototab(fa*, int, int);
117*f32a6403SWarner Losh static int set_gototab(fa*, int, int, int);
118*f32a6403SWarner Losh static void clear_gototab(fa*, int);
119*f32a6403SWarner Losh extern int u8_rune(int *, const char *);
120*f32a6403SWarner Losh 
121f39dd6a9SWarner Losh static int *
122f39dd6a9SWarner Losh intalloc(size_t n, const char *f)
123f39dd6a9SWarner Losh {
124f39dd6a9SWarner Losh 	int *p = (int *) calloc(n, sizeof(int));
125f39dd6a9SWarner Losh 	if (p == NULL)
126f39dd6a9SWarner Losh 		overflo(f);
127f39dd6a9SWarner Losh 	return p;
128f39dd6a9SWarner Losh }
129f39dd6a9SWarner Losh 
130f39dd6a9SWarner Losh static void
131f39dd6a9SWarner Losh resizesetvec(const char *f)
132f39dd6a9SWarner Losh {
133f39dd6a9SWarner Losh 	if (maxsetvec == 0)
134f39dd6a9SWarner Losh 		maxsetvec = MAXLIN;
135f39dd6a9SWarner Losh 	else
136f39dd6a9SWarner Losh 		maxsetvec *= 4;
137f39dd6a9SWarner Losh 	setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
138f39dd6a9SWarner Losh 	tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
139f39dd6a9SWarner Losh 	if (setvec == NULL || tmpset == NULL)
140f39dd6a9SWarner Losh 		overflo(f);
141f39dd6a9SWarner Losh }
142f39dd6a9SWarner Losh 
143f39dd6a9SWarner Losh static void
144f39dd6a9SWarner Losh resize_state(fa *f, int state)
145f39dd6a9SWarner Losh {
146*f32a6403SWarner Losh 	gtt *p;
147f39dd6a9SWarner Losh 	uschar *p2;
148f39dd6a9SWarner Losh 	int **p3;
149f39dd6a9SWarner Losh 	int i, new_count;
150f39dd6a9SWarner Losh 
151f39dd6a9SWarner Losh 	if (++state < f->state_count)
152f39dd6a9SWarner Losh 		return;
153f39dd6a9SWarner Losh 
154f39dd6a9SWarner Losh 	new_count = state + 10; /* needs to be tuned */
155f39dd6a9SWarner Losh 
156*f32a6403SWarner Losh 	p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
157f39dd6a9SWarner Losh 	if (p == NULL)
158f39dd6a9SWarner Losh 		goto out;
159f39dd6a9SWarner Losh 	f->gototab = p;
160f39dd6a9SWarner Losh 
161f39dd6a9SWarner Losh 	p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
162f39dd6a9SWarner Losh 	if (p2 == NULL)
163f39dd6a9SWarner Losh 		goto out;
164f39dd6a9SWarner Losh 	f->out = p2;
165f39dd6a9SWarner Losh 
166f39dd6a9SWarner Losh 	p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
167f39dd6a9SWarner Losh 	if (p3 == NULL)
168f39dd6a9SWarner Losh 		goto out;
169f39dd6a9SWarner Losh 	f->posns = p3;
170f39dd6a9SWarner Losh 
171f39dd6a9SWarner Losh 	for (i = f->state_count; i < new_count; ++i) {
172*f32a6403SWarner Losh 		f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
173*f32a6403SWarner Losh 		if (f->gototab[i].entries == NULL)
174f39dd6a9SWarner Losh 			goto out;
175*f32a6403SWarner Losh 		f->gototab[i].allocated = NCHARS;
176*f32a6403SWarner Losh 		f->gototab[i].inuse = 0;
177f39dd6a9SWarner Losh 		f->out[i] = 0;
178f39dd6a9SWarner Losh 		f->posns[i] = NULL;
179f39dd6a9SWarner Losh 	}
180f39dd6a9SWarner Losh 	f->state_count = new_count;
181f39dd6a9SWarner Losh 	return;
182f39dd6a9SWarner Losh out:
183f39dd6a9SWarner Losh 	overflo(__func__);
184f39dd6a9SWarner Losh }
185f39dd6a9SWarner Losh 
186f39dd6a9SWarner Losh fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
1872a55deb1SDavid E. O'Brien {
1882a55deb1SDavid E. O'Brien 	int i, use, nuse;
1892a55deb1SDavid E. O'Brien 	fa *pfa;
1902a55deb1SDavid E. O'Brien 	static int now = 1;
1912a55deb1SDavid E. O'Brien 
19210ce5b99SWarner Losh 	if (setvec == NULL) {	/* first time through any RE */
193f39dd6a9SWarner Losh 		resizesetvec(__func__);
1942a55deb1SDavid E. O'Brien 	}
1952a55deb1SDavid E. O'Brien 
196f39dd6a9SWarner Losh 	if (compile_time != RUNNING)	/* a constant for sure */
1972a55deb1SDavid E. O'Brien 		return mkdfa(s, anchor);
1982a55deb1SDavid E. O'Brien 	for (i = 0; i < nfatab; i++)	/* is it there already? */
1992a55deb1SDavid E. O'Brien 		if (fatab[i]->anchor == anchor
200007c6572SDag-Erling Smørgrav 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
2012a55deb1SDavid E. O'Brien 			fatab[i]->use = now++;
2022a55deb1SDavid E. O'Brien 			return fatab[i];
2032a55deb1SDavid E. O'Brien 		}
2042a55deb1SDavid E. O'Brien 	pfa = mkdfa(s, anchor);
2052a55deb1SDavid E. O'Brien 	if (nfatab < NFA) {	/* room for another */
2062a55deb1SDavid E. O'Brien 		fatab[nfatab] = pfa;
2072a55deb1SDavid E. O'Brien 		fatab[nfatab]->use = now++;
2082a55deb1SDavid E. O'Brien 		nfatab++;
2092a55deb1SDavid E. O'Brien 		return pfa;
2102a55deb1SDavid E. O'Brien 	}
2112a55deb1SDavid E. O'Brien 	use = fatab[0]->use;	/* replace least-recently used */
2122a55deb1SDavid E. O'Brien 	nuse = 0;
2132a55deb1SDavid E. O'Brien 	for (i = 1; i < nfatab; i++)
2142a55deb1SDavid E. O'Brien 		if (fatab[i]->use < use) {
2152a55deb1SDavid E. O'Brien 			use = fatab[i]->use;
2162a55deb1SDavid E. O'Brien 			nuse = i;
2172a55deb1SDavid E. O'Brien 		}
2182a55deb1SDavid E. O'Brien 	freefa(fatab[nuse]);
2192a55deb1SDavid E. O'Brien 	fatab[nuse] = pfa;
2202a55deb1SDavid E. O'Brien 	pfa->use = now++;
2212a55deb1SDavid E. O'Brien 	return pfa;
2222a55deb1SDavid E. O'Brien }
2232a55deb1SDavid E. O'Brien 
224f39dd6a9SWarner Losh fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
225f39dd6a9SWarner Losh 				/* anchor = true for anchored matches, else false */
2262a55deb1SDavid E. O'Brien {
2272a55deb1SDavid E. O'Brien 	Node *p, *p1;
2282a55deb1SDavid E. O'Brien 	fa *f;
2292a55deb1SDavid E. O'Brien 
230f39dd6a9SWarner Losh 	firstbasestr = (const uschar *) s;
231b5253557SWarner Losh 	basestr = firstbasestr;
2322a55deb1SDavid E. O'Brien 	p = reparse(s);
2332a55deb1SDavid E. O'Brien 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
2342a55deb1SDavid E. O'Brien 		/* put ALL STAR in front of reg.  exp. */
2352a55deb1SDavid E. O'Brien 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
2362a55deb1SDavid E. O'Brien 		/* put FINAL after reg.  exp. */
2372a55deb1SDavid E. O'Brien 
2382a55deb1SDavid E. O'Brien 	poscnt = 0;
2392a55deb1SDavid E. O'Brien 	penter(p1);	/* enter parent pointers and leaf indices */
2402a55deb1SDavid E. O'Brien 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
241f39dd6a9SWarner Losh 		overflo(__func__);
2422a55deb1SDavid E. O'Brien 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
2432a55deb1SDavid E. O'Brien 	cfoll(f, p1);	/* set up follow sets */
2442a55deb1SDavid E. O'Brien 	freetr(p1);
245f39dd6a9SWarner Losh 	resize_state(f, 1);
246f39dd6a9SWarner Losh 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
247f39dd6a9SWarner Losh 	f->posns[1] = intalloc(1, __func__);
2482a55deb1SDavid E. O'Brien 	*f->posns[1] = 0;
2492a55deb1SDavid E. O'Brien 	f->initstat = makeinit(f, anchor);
2502a55deb1SDavid E. O'Brien 	f->anchor = anchor;
2512a55deb1SDavid E. O'Brien 	f->restr = (uschar *) tostring(s);
252b5253557SWarner Losh 	if (firstbasestr != basestr) {
253b5253557SWarner Losh 		if (basestr)
254b5253557SWarner Losh 			xfree(basestr);
255b5253557SWarner Losh 	}
2562a55deb1SDavid E. O'Brien 	return f;
2572a55deb1SDavid E. O'Brien }
2582a55deb1SDavid E. O'Brien 
259f39dd6a9SWarner Losh int makeinit(fa *f, bool anchor)
2602a55deb1SDavid E. O'Brien {
2612a55deb1SDavid E. O'Brien 	int i, k;
2622a55deb1SDavid E. O'Brien 
2632a55deb1SDavid E. O'Brien 	f->curstat = 2;
2642a55deb1SDavid E. O'Brien 	f->out[2] = 0;
2652a55deb1SDavid E. O'Brien 	k = *(f->re[0].lfollow);
2662a55deb1SDavid E. O'Brien 	xfree(f->posns[2]);
267f39dd6a9SWarner Losh 	f->posns[2] = intalloc(k + 1,  __func__);
2682a55deb1SDavid E. O'Brien 	for (i = 0; i <= k; i++) {
2692a55deb1SDavid E. O'Brien 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
2702a55deb1SDavid E. O'Brien 	}
2712a55deb1SDavid E. O'Brien 	if ((f->posns[2])[1] == f->accept)
2722a55deb1SDavid E. O'Brien 		f->out[2] = 1;
273*f32a6403SWarner Losh 	clear_gototab(f, 2);
2742a55deb1SDavid E. O'Brien 	f->curstat = cgoto(f, 2, HAT);
2752a55deb1SDavid E. O'Brien 	if (anchor) {
2762a55deb1SDavid E. O'Brien 		*f->posns[2] = k-1;	/* leave out position 0 */
2772a55deb1SDavid E. O'Brien 		for (i = 0; i < k; i++) {
2782a55deb1SDavid E. O'Brien 			(f->posns[0])[i] = (f->posns[2])[i];
2792a55deb1SDavid E. O'Brien 		}
2802a55deb1SDavid E. O'Brien 
2812a55deb1SDavid E. O'Brien 		f->out[0] = f->out[2];
2822a55deb1SDavid E. O'Brien 		if (f->curstat != 2)
2832a55deb1SDavid E. O'Brien 			--(*f->posns[f->curstat]);
2842a55deb1SDavid E. O'Brien 	}
2852a55deb1SDavid E. O'Brien 	return f->curstat;
2862a55deb1SDavid E. O'Brien }
2872a55deb1SDavid E. O'Brien 
2882a55deb1SDavid E. O'Brien void penter(Node *p)	/* set up parent pointers and leaf indices */
2892a55deb1SDavid E. O'Brien {
2902a55deb1SDavid E. O'Brien 	switch (type(p)) {
291addad6afSRong-En Fan 	ELEAF
2922a55deb1SDavid E. O'Brien 	LEAF
2932a55deb1SDavid E. O'Brien 		info(p) = poscnt;
2942a55deb1SDavid E. O'Brien 		poscnt++;
2952a55deb1SDavid E. O'Brien 		break;
2962a55deb1SDavid E. O'Brien 	UNARY
2972a55deb1SDavid E. O'Brien 		penter(left(p));
2982a55deb1SDavid E. O'Brien 		parent(left(p)) = p;
2992a55deb1SDavid E. O'Brien 		break;
3002a55deb1SDavid E. O'Brien 	case CAT:
3012a55deb1SDavid E. O'Brien 	case OR:
3022a55deb1SDavid E. O'Brien 		penter(left(p));
3032a55deb1SDavid E. O'Brien 		penter(right(p));
3042a55deb1SDavid E. O'Brien 		parent(left(p)) = p;
3052a55deb1SDavid E. O'Brien 		parent(right(p)) = p;
3062a55deb1SDavid E. O'Brien 		break;
307f39dd6a9SWarner Losh 	case ZERO:
308f39dd6a9SWarner Losh 		break;
3092a55deb1SDavid E. O'Brien 	default:	/* can't happen */
3102a55deb1SDavid E. O'Brien 		FATAL("can't happen: unknown type %d in penter", type(p));
3112a55deb1SDavid E. O'Brien 		break;
3122a55deb1SDavid E. O'Brien 	}
3132a55deb1SDavid E. O'Brien }
3142a55deb1SDavid E. O'Brien 
3152a55deb1SDavid E. O'Brien void freetr(Node *p)	/* free parse tree */
3162a55deb1SDavid E. O'Brien {
3172a55deb1SDavid E. O'Brien 	switch (type(p)) {
318addad6afSRong-En Fan 	ELEAF
3192a55deb1SDavid E. O'Brien 	LEAF
3202a55deb1SDavid E. O'Brien 		xfree(p);
3212a55deb1SDavid E. O'Brien 		break;
3222a55deb1SDavid E. O'Brien 	UNARY
323f39dd6a9SWarner Losh 	case ZERO:
3242a55deb1SDavid E. O'Brien 		freetr(left(p));
3252a55deb1SDavid E. O'Brien 		xfree(p);
3262a55deb1SDavid E. O'Brien 		break;
3272a55deb1SDavid E. O'Brien 	case CAT:
3282a55deb1SDavid E. O'Brien 	case OR:
3292a55deb1SDavid E. O'Brien 		freetr(left(p));
3302a55deb1SDavid E. O'Brien 		freetr(right(p));
3312a55deb1SDavid E. O'Brien 		xfree(p);
3322a55deb1SDavid E. O'Brien 		break;
3332a55deb1SDavid E. O'Brien 	default:	/* can't happen */
3342a55deb1SDavid E. O'Brien 		FATAL("can't happen: unknown type %d in freetr", type(p));
3352a55deb1SDavid E. O'Brien 		break;
3362a55deb1SDavid E. O'Brien 	}
3372a55deb1SDavid E. O'Brien }
3382a55deb1SDavid E. O'Brien 
3392a55deb1SDavid E. O'Brien /* in the parsing of regular expressions, metacharacters like . have */
3402a55deb1SDavid E. O'Brien /* to be seen literally;  \056 is not a metacharacter. */
3412a55deb1SDavid E. O'Brien 
342*f32a6403SWarner Losh int hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
3432a55deb1SDavid E. O'Brien {			/* only pick up one 8-bit byte (2 chars) */
344f39dd6a9SWarner Losh 	const uschar *p;
3452a55deb1SDavid E. O'Brien 	int n = 0;
3462a55deb1SDavid E. O'Brien 	int i;
3472a55deb1SDavid E. O'Brien 
348*f32a6403SWarner Losh 	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
349*f32a6403SWarner Losh 		if (isdigit((int) *p))
3502a55deb1SDavid E. O'Brien 			n = 16 * n + *p - '0';
3512a55deb1SDavid E. O'Brien 		else if (*p >= 'a' && *p <= 'f')
3522a55deb1SDavid E. O'Brien 			n = 16 * n + *p - 'a' + 10;
3532a55deb1SDavid E. O'Brien 		else if (*p >= 'A' && *p <= 'F')
3542a55deb1SDavid E. O'Brien 			n = 16 * n + *p - 'A' + 10;
3552a55deb1SDavid E. O'Brien 	}
356f39dd6a9SWarner Losh 	*pp = p;
3572a55deb1SDavid E. O'Brien 	return n;
3582a55deb1SDavid E. O'Brien }
3592a55deb1SDavid E. O'Brien 
360*f32a6403SWarner Losh 
361*f32a6403SWarner Losh 
3622a55deb1SDavid E. O'Brien #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
3632a55deb1SDavid E. O'Brien 
364f39dd6a9SWarner Losh int quoted(const uschar **pp)	/* pick up next thing after a \\ */
3652a55deb1SDavid E. O'Brien 			/* and increment *pp */
3662a55deb1SDavid E. O'Brien {
367f39dd6a9SWarner Losh 	const uschar *p = *pp;
3682a55deb1SDavid E. O'Brien 	int c;
3692a55deb1SDavid E. O'Brien 
370*f32a6403SWarner Losh /* BUG: should advance by utf-8 char even if makes no sense */
371*f32a6403SWarner Losh 
372*f32a6403SWarner Losh 	if ((c = *p++) == 't') {
3732a55deb1SDavid E. O'Brien 		c = '\t';
374*f32a6403SWarner Losh 	} else if (c == 'n') {
3752a55deb1SDavid E. O'Brien 		c = '\n';
376*f32a6403SWarner Losh 	} else if (c == 'f') {
3772a55deb1SDavid E. O'Brien 		c = '\f';
378*f32a6403SWarner Losh 	} else if (c == 'r') {
3792a55deb1SDavid E. O'Brien 		c = '\r';
380*f32a6403SWarner Losh 	} else if (c == 'b') {
3812a55deb1SDavid E. O'Brien 		c = '\b';
382*f32a6403SWarner Losh 	} else if (c == 'v') {
383f39dd6a9SWarner Losh 		c = '\v';
384*f32a6403SWarner Losh 	} else if (c == 'a') {
385f39dd6a9SWarner Losh 		c = '\a';
386*f32a6403SWarner Losh 	} else if (c == '\\') {
3872a55deb1SDavid E. O'Brien 		c = '\\';
388*f32a6403SWarner Losh 	} else if (c == 'x') {	/* 2 hex digits follow */
389*f32a6403SWarner Losh 		c = hexstr(&p, 2);	/* this adds a null if number is invalid */
390*f32a6403SWarner Losh 	} else if (c == 'u') {	/* unicode char number up to 8 hex digits */
391*f32a6403SWarner Losh 		c = hexstr(&p, 8);
3922a55deb1SDavid E. O'Brien 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
3932a55deb1SDavid E. O'Brien 		int n = c - '0';
3942a55deb1SDavid E. O'Brien 		if (isoctdigit(*p)) {
3952a55deb1SDavid E. O'Brien 			n = 8 * n + *p++ - '0';
3962a55deb1SDavid E. O'Brien 			if (isoctdigit(*p))
3972a55deb1SDavid E. O'Brien 				n = 8 * n + *p++ - '0';
3982a55deb1SDavid E. O'Brien 		}
3992a55deb1SDavid E. O'Brien 		c = n;
4002a55deb1SDavid E. O'Brien 	} /* else */
4012a55deb1SDavid E. O'Brien 		/* c = c; */
4022a55deb1SDavid E. O'Brien 	*pp = p;
4032a55deb1SDavid E. O'Brien 	return c;
4042a55deb1SDavid E. O'Brien }
4052a55deb1SDavid E. O'Brien 
406*f32a6403SWarner Losh int *cclenter(const char *argp)	/* add a character class */
4072a55deb1SDavid E. O'Brien {
408628bd30aSWarner Losh 	int i, c, c2;
409*f32a6403SWarner Losh 	int n;
410*f32a6403SWarner Losh 	const uschar *p = (const uschar *) argp;
411*f32a6403SWarner Losh 	int *bp, *retp;
412*f32a6403SWarner Losh 	static int *buf = NULL;
4132a55deb1SDavid E. O'Brien 	static int bufsz = 100;
4142a55deb1SDavid E. O'Brien 
415*f32a6403SWarner Losh 	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
4162a55deb1SDavid E. O'Brien 		FATAL("out of space for character class [%.10s...] 1", p);
4172a55deb1SDavid E. O'Brien 	bp = buf;
418*f32a6403SWarner Losh 	for (i = 0; *p != 0; ) {
419*f32a6403SWarner Losh 		n = u8_rune(&c, (const char *) p);
420*f32a6403SWarner Losh 		p += n;
4212a55deb1SDavid E. O'Brien 		if (c == '\\') {
422d86a0988SRuslan Ermilov 			c = quoted(&p);
4232a55deb1SDavid E. O'Brien 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
4242a55deb1SDavid E. O'Brien 			if (*p != 0) {
4252a55deb1SDavid E. O'Brien 				c = bp[-1];
426*f32a6403SWarner Losh 				/* c2 = *p++; */
427*f32a6403SWarner Losh 				n = u8_rune(&c2, (const char *) p);
428*f32a6403SWarner Losh 				p += n;
4292a55deb1SDavid E. O'Brien 				if (c2 == '\\')
430*f32a6403SWarner Losh 					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
431628bd30aSWarner Losh 				if (c > c2) {	/* empty; ignore */
4322a55deb1SDavid E. O'Brien 					bp--;
4332a55deb1SDavid E. O'Brien 					i--;
4342a55deb1SDavid E. O'Brien 					continue;
4352a55deb1SDavid E. O'Brien 				}
436628bd30aSWarner Losh 				while (c < c2) {
437*f32a6403SWarner Losh 					if (i >= bufsz) {
438*f32a6403SWarner Losh 						bufsz *= 2;
439*f32a6403SWarner Losh 						buf = (int *) realloc(buf, bufsz * sizeof(int));
440*f32a6403SWarner Losh 						if (buf == NULL)
4412a55deb1SDavid E. O'Brien 							FATAL("out of space for character class [%.10s...] 2", p);
442*f32a6403SWarner Losh 						bp = buf + i;
443*f32a6403SWarner Losh 					}
444628bd30aSWarner Losh 					*bp++ = ++c;
4452a55deb1SDavid E. O'Brien 					i++;
4462a55deb1SDavid E. O'Brien 				}
4472a55deb1SDavid E. O'Brien 				continue;
4482a55deb1SDavid E. O'Brien 			}
4492a55deb1SDavid E. O'Brien 		}
450*f32a6403SWarner Losh 		if (i >= bufsz) {
451*f32a6403SWarner Losh 			bufsz *= 2;
452*f32a6403SWarner Losh 			buf = (int *) realloc(buf, bufsz * sizeof(int));
453*f32a6403SWarner Losh 			if (buf == NULL)
454*f32a6403SWarner Losh 				FATAL("out of space for character class [%.10s...] 2", p);
455*f32a6403SWarner Losh 			bp = buf + i;
456*f32a6403SWarner Losh 		}
4572a55deb1SDavid E. O'Brien 		*bp++ = c;
4582a55deb1SDavid E. O'Brien 		i++;
4592a55deb1SDavid E. O'Brien 	}
4602a55deb1SDavid E. O'Brien 	*bp = 0;
461*f32a6403SWarner Losh 	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
462*f32a6403SWarner Losh 	/* xfree(op);  BUG: what are we freeing here? */
463*f32a6403SWarner Losh 	retp = (int *) calloc(bp-buf+1, sizeof(int));
464*f32a6403SWarner Losh 	for (i = 0; i < bp-buf+1; i++)
465*f32a6403SWarner Losh 		retp[i] = buf[i];
466*f32a6403SWarner Losh 	return retp;
4672a55deb1SDavid E. O'Brien }
4682a55deb1SDavid E. O'Brien 
469813da98dSDavid E. O'Brien void overflo(const char *s)
4702a55deb1SDavid E. O'Brien {
471f39dd6a9SWarner Losh 	FATAL("regular expression too big: out of space in %.30s...", s);
4722a55deb1SDavid E. O'Brien }
4732a55deb1SDavid E. O'Brien 
4742a55deb1SDavid E. O'Brien void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
4752a55deb1SDavid E. O'Brien {
4762a55deb1SDavid E. O'Brien 	int i;
4772a55deb1SDavid E. O'Brien 	int *p;
4782a55deb1SDavid E. O'Brien 
4792a55deb1SDavid E. O'Brien 	switch (type(v)) {
480addad6afSRong-En Fan 	ELEAF
4812a55deb1SDavid E. O'Brien 	LEAF
4822a55deb1SDavid E. O'Brien 		f->re[info(v)].ltype = type(v);
4832a55deb1SDavid E. O'Brien 		f->re[info(v)].lval.np = right(v);
4842a55deb1SDavid E. O'Brien 		while (f->accept >= maxsetvec) {	/* guessing here! */
485f39dd6a9SWarner Losh 			resizesetvec(__func__);
4862a55deb1SDavid E. O'Brien 		}
4872a55deb1SDavid E. O'Brien 		for (i = 0; i <= f->accept; i++)
4882a55deb1SDavid E. O'Brien 			setvec[i] = 0;
4892a55deb1SDavid E. O'Brien 		setcnt = 0;
4902a55deb1SDavid E. O'Brien 		follow(v);	/* computes setvec and setcnt */
491f39dd6a9SWarner Losh 		p = intalloc(setcnt + 1, __func__);
4922a55deb1SDavid E. O'Brien 		f->re[info(v)].lfollow = p;
4932a55deb1SDavid E. O'Brien 		*p = setcnt;
4942a55deb1SDavid E. O'Brien 		for (i = f->accept; i >= 0; i--)
4952a55deb1SDavid E. O'Brien 			if (setvec[i] == 1)
4962a55deb1SDavid E. O'Brien 				*++p = i;
4972a55deb1SDavid E. O'Brien 		break;
4982a55deb1SDavid E. O'Brien 	UNARY
4992a55deb1SDavid E. O'Brien 		cfoll(f,left(v));
5002a55deb1SDavid E. O'Brien 		break;
5012a55deb1SDavid E. O'Brien 	case CAT:
5022a55deb1SDavid E. O'Brien 	case OR:
5032a55deb1SDavid E. O'Brien 		cfoll(f,left(v));
5042a55deb1SDavid E. O'Brien 		cfoll(f,right(v));
5052a55deb1SDavid E. O'Brien 		break;
506f39dd6a9SWarner Losh 	case ZERO:
507f39dd6a9SWarner Losh 		break;
5082a55deb1SDavid E. O'Brien 	default:	/* can't happen */
5092a55deb1SDavid E. O'Brien 		FATAL("can't happen: unknown type %d in cfoll", type(v));
5102a55deb1SDavid E. O'Brien 	}
5112a55deb1SDavid E. O'Brien }
5122a55deb1SDavid E. O'Brien 
5132a55deb1SDavid E. O'Brien int first(Node *p)	/* collects initially active leaves of p into setvec */
514addad6afSRong-En Fan 			/* returns 0 if p matches empty string */
5152a55deb1SDavid E. O'Brien {
5162a55deb1SDavid E. O'Brien 	int b, lp;
5172a55deb1SDavid E. O'Brien 
5182a55deb1SDavid E. O'Brien 	switch (type(p)) {
519addad6afSRong-En Fan 	ELEAF
5202a55deb1SDavid E. O'Brien 	LEAF
5212a55deb1SDavid E. O'Brien 		lp = info(p);	/* look for high-water mark of subscripts */
5222a55deb1SDavid E. O'Brien 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
523f39dd6a9SWarner Losh 			resizesetvec(__func__);
5242a55deb1SDavid E. O'Brien 		}
525addad6afSRong-En Fan 		if (type(p) == EMPTYRE) {
526addad6afSRong-En Fan 			setvec[lp] = 0;
527addad6afSRong-En Fan 			return(0);
528addad6afSRong-En Fan 		}
5292a55deb1SDavid E. O'Brien 		if (setvec[lp] != 1) {
5302a55deb1SDavid E. O'Brien 			setvec[lp] = 1;
5312a55deb1SDavid E. O'Brien 			setcnt++;
5322a55deb1SDavid E. O'Brien 		}
533*f32a6403SWarner Losh 		if (type(p) == CCL && (*(int *) right(p)) == 0)
5342a55deb1SDavid E. O'Brien 			return(0);		/* empty CCL */
535f39dd6a9SWarner Losh 		return(1);
5362a55deb1SDavid E. O'Brien 	case PLUS:
537f39dd6a9SWarner Losh 		if (first(left(p)) == 0)
538f39dd6a9SWarner Losh 			return(0);
5392a55deb1SDavid E. O'Brien 		return(1);
5402a55deb1SDavid E. O'Brien 	case STAR:
5412a55deb1SDavid E. O'Brien 	case QUEST:
5422a55deb1SDavid E. O'Brien 		first(left(p));
5432a55deb1SDavid E. O'Brien 		return(0);
5442a55deb1SDavid E. O'Brien 	case CAT:
5452a55deb1SDavid E. O'Brien 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
5462a55deb1SDavid E. O'Brien 		return(1);
5472a55deb1SDavid E. O'Brien 	case OR:
5482a55deb1SDavid E. O'Brien 		b = first(right(p));
5492a55deb1SDavid E. O'Brien 		if (first(left(p)) == 0 || b == 0) return(0);
5502a55deb1SDavid E. O'Brien 		return(1);
551f39dd6a9SWarner Losh 	case ZERO:
552f39dd6a9SWarner Losh 		return 0;
5532a55deb1SDavid E. O'Brien 	}
5542a55deb1SDavid E. O'Brien 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
5552a55deb1SDavid E. O'Brien 	return(-1);
5562a55deb1SDavid E. O'Brien }
5572a55deb1SDavid E. O'Brien 
5582a55deb1SDavid E. O'Brien void follow(Node *v)	/* collects leaves that can follow v into setvec */
5592a55deb1SDavid E. O'Brien {
5602a55deb1SDavid E. O'Brien 	Node *p;
5612a55deb1SDavid E. O'Brien 
5622a55deb1SDavid E. O'Brien 	if (type(v) == FINAL)
5632a55deb1SDavid E. O'Brien 		return;
5642a55deb1SDavid E. O'Brien 	p = parent(v);
5652a55deb1SDavid E. O'Brien 	switch (type(p)) {
5662a55deb1SDavid E. O'Brien 	case STAR:
5672a55deb1SDavid E. O'Brien 	case PLUS:
5682a55deb1SDavid E. O'Brien 		first(v);
5692a55deb1SDavid E. O'Brien 		follow(p);
5702a55deb1SDavid E. O'Brien 		return;
5712a55deb1SDavid E. O'Brien 
5722a55deb1SDavid E. O'Brien 	case OR:
5732a55deb1SDavid E. O'Brien 	case QUEST:
5742a55deb1SDavid E. O'Brien 		follow(p);
5752a55deb1SDavid E. O'Brien 		return;
5762a55deb1SDavid E. O'Brien 
5772a55deb1SDavid E. O'Brien 	case CAT:
5782a55deb1SDavid E. O'Brien 		if (v == left(p)) {	/* v is left child of p */
5792a55deb1SDavid E. O'Brien 			if (first(right(p)) == 0) {
5802a55deb1SDavid E. O'Brien 				follow(p);
5812a55deb1SDavid E. O'Brien 				return;
5822a55deb1SDavid E. O'Brien 			}
5832a55deb1SDavid E. O'Brien 		} else		/* v is right child */
5842a55deb1SDavid E. O'Brien 			follow(p);
5852a55deb1SDavid E. O'Brien 		return;
5862a55deb1SDavid E. O'Brien 	}
5872a55deb1SDavid E. O'Brien }
5882a55deb1SDavid E. O'Brien 
589*f32a6403SWarner Losh int member(int c, int *sarg)	/* is c in s? */
5902a55deb1SDavid E. O'Brien {
591*f32a6403SWarner Losh 	int *s = (int *) sarg;
5922a55deb1SDavid E. O'Brien 
5932a55deb1SDavid E. O'Brien 	while (*s)
5942a55deb1SDavid E. O'Brien 		if (c == *s++)
5952a55deb1SDavid E. O'Brien 			return(1);
5962a55deb1SDavid E. O'Brien 	return(0);
5972a55deb1SDavid E. O'Brien }
5982a55deb1SDavid E. O'Brien 
599*f32a6403SWarner Losh static void resize_gototab(fa *f, int state)
600*f32a6403SWarner Losh {
601*f32a6403SWarner Losh 	size_t new_size = f->gototab[state].allocated * 2;
602*f32a6403SWarner Losh 	gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
603*f32a6403SWarner Losh 	if (p == NULL)
604*f32a6403SWarner Losh 		overflo(__func__);
605*f32a6403SWarner Losh 
606*f32a6403SWarner Losh 	// need to initialized the new memory to zero
607*f32a6403SWarner Losh 	size_t orig_size = f->gototab[state].allocated;		// 2nd half of new mem is this size
608*f32a6403SWarner Losh 	memset(p + orig_size, 0, orig_size * sizeof(gtte));	// clean it out
609*f32a6403SWarner Losh 
610*f32a6403SWarner Losh 	f->gototab[state].allocated = new_size;			// update gototab info
611*f32a6403SWarner Losh 	f->gototab[state].entries = p;
612*f32a6403SWarner Losh }
613*f32a6403SWarner Losh 
614*f32a6403SWarner Losh static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
615*f32a6403SWarner Losh {
616*f32a6403SWarner Losh 	gtte key;
617*f32a6403SWarner Losh 	gtte *item;
618*f32a6403SWarner Losh 
619*f32a6403SWarner Losh 	key.ch = ch;
620*f32a6403SWarner Losh 	key.state = 0;	/* irrelevant */
621*f32a6403SWarner Losh 	item = (gtte *) bsearch(& key, f->gototab[state].entries,
622*f32a6403SWarner Losh 			f->gototab[state].inuse, sizeof(gtte),
623*f32a6403SWarner Losh 			entry_cmp);
624*f32a6403SWarner Losh 
625*f32a6403SWarner Losh 	if (item == NULL)
626*f32a6403SWarner Losh 		return 0;
627*f32a6403SWarner Losh 	else
628*f32a6403SWarner Losh 		return item->state;
629*f32a6403SWarner Losh }
630*f32a6403SWarner Losh 
631*f32a6403SWarner Losh static int entry_cmp(const void *l, const void *r)
632*f32a6403SWarner Losh {
633*f32a6403SWarner Losh 	const gtte *left, *right;
634*f32a6403SWarner Losh 
635*f32a6403SWarner Losh 	left = (const gtte *) l;
636*f32a6403SWarner Losh 	right = (const gtte *) r;
637*f32a6403SWarner Losh 
638*f32a6403SWarner Losh 	return left->ch - right->ch;
639*f32a6403SWarner Losh }
640*f32a6403SWarner Losh 
641*f32a6403SWarner Losh static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
642*f32a6403SWarner Losh {
643*f32a6403SWarner Losh 	if (f->gototab[state].inuse == 0) {
644*f32a6403SWarner Losh 		f->gototab[state].entries[0].ch = ch;
645*f32a6403SWarner Losh 		f->gototab[state].entries[0].state = val;
646*f32a6403SWarner Losh 		f->gototab[state].inuse++;
647*f32a6403SWarner Losh 		return val;
648*f32a6403SWarner Losh 	} else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
649*f32a6403SWarner Losh 		// not seen yet, insert and return
650*f32a6403SWarner Losh 		gtt *tab = & f->gototab[state];
651*f32a6403SWarner Losh 		if (tab->inuse + 1 >= tab->allocated)
652*f32a6403SWarner Losh 			resize_gototab(f, state);
653*f32a6403SWarner Losh 
654*f32a6403SWarner Losh 		f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch;
655*f32a6403SWarner Losh 		f->gototab[state].entries[f->gototab[state].inuse-1].state = val;
656*f32a6403SWarner Losh 		f->gototab[state].inuse++;
657*f32a6403SWarner Losh 		return val;
658*f32a6403SWarner Losh 	} else {
659*f32a6403SWarner Losh 		// maybe we have it, maybe we don't
660*f32a6403SWarner Losh 		gtte key;
661*f32a6403SWarner Losh 		gtte *item;
662*f32a6403SWarner Losh 
663*f32a6403SWarner Losh 		key.ch = ch;
664*f32a6403SWarner Losh 		key.state = 0;	/* irrelevant */
665*f32a6403SWarner Losh 		item = (gtte *) bsearch(& key, f->gototab[state].entries,
666*f32a6403SWarner Losh 				f->gototab[state].inuse, sizeof(gtte),
667*f32a6403SWarner Losh 				entry_cmp);
668*f32a6403SWarner Losh 
669*f32a6403SWarner Losh 		if (item != NULL) {
670*f32a6403SWarner Losh 			// we have it, update state and return
671*f32a6403SWarner Losh 			item->state = val;
672*f32a6403SWarner Losh 			return item->state;
673*f32a6403SWarner Losh 		}
674*f32a6403SWarner Losh 		// otherwise, fall through to insert and reallocate.
675*f32a6403SWarner Losh 	}
676*f32a6403SWarner Losh 
677*f32a6403SWarner Losh 	gtt *tab = & f->gototab[state];
678*f32a6403SWarner Losh 	if (tab->inuse + 1 >= tab->allocated)
679*f32a6403SWarner Losh 		resize_gototab(f, state);
680*f32a6403SWarner Losh 	++tab->inuse;
681*f32a6403SWarner Losh 	f->gototab[state].entries[tab->inuse].ch = ch;
682*f32a6403SWarner Losh 	f->gototab[state].entries[tab->inuse].state = val;
683*f32a6403SWarner Losh 
684*f32a6403SWarner Losh 	qsort(f->gototab[state].entries,
685*f32a6403SWarner Losh 		f->gototab[state].inuse, sizeof(gtte), entry_cmp);
686*f32a6403SWarner Losh 
687*f32a6403SWarner Losh 	return val; /* not used anywhere at the moment */
688*f32a6403SWarner Losh }
689*f32a6403SWarner Losh 
690*f32a6403SWarner Losh static void clear_gototab(fa *f, int state)
691*f32a6403SWarner Losh {
692*f32a6403SWarner Losh 	memset(f->gototab[state].entries, 0,
693*f32a6403SWarner Losh 		f->gototab[state].allocated * sizeof(gtte));
694*f32a6403SWarner Losh 	f->gototab[state].inuse = 0;
695*f32a6403SWarner Losh }
696*f32a6403SWarner Losh 
697813da98dSDavid E. O'Brien int match(fa *f, const char *p0)	/* shortest match ? */
6982a55deb1SDavid E. O'Brien {
6992a55deb1SDavid E. O'Brien 	int s, ns;
700*f32a6403SWarner Losh 	int n;
701*f32a6403SWarner Losh 	int rune;
702f39dd6a9SWarner Losh 	const uschar *p = (const uschar *) p0;
7032a55deb1SDavid E. O'Brien 
704*f32a6403SWarner Losh 	/* return pmatch(f, p0); does it matter whether longest or shortest? */
705*f32a6403SWarner Losh 
706f39dd6a9SWarner Losh 	s = f->initstat;
707f39dd6a9SWarner Losh 	assert (s < f->state_count);
708f39dd6a9SWarner Losh 
7092a55deb1SDavid E. O'Brien 	if (f->out[s])
7102a55deb1SDavid E. O'Brien 		return(1);
7112a55deb1SDavid E. O'Brien 	do {
712addad6afSRong-En Fan 		/* assert(*p < NCHARS); */
713*f32a6403SWarner Losh 		n = u8_rune(&rune, (const char *) p);
714*f32a6403SWarner Losh 		if ((ns = get_gototab(f, s, rune)) != 0)
7152a55deb1SDavid E. O'Brien 			s = ns;
7162a55deb1SDavid E. O'Brien 		else
717*f32a6403SWarner Losh 			s = cgoto(f, s, rune);
7182a55deb1SDavid E. O'Brien 		if (f->out[s])
7192a55deb1SDavid E. O'Brien 			return(1);
720*f32a6403SWarner Losh 		if (*p == 0)
721*f32a6403SWarner Losh 			break;
722*f32a6403SWarner Losh 		p += n;
723*f32a6403SWarner Losh 	} while (1);  /* was *p++ != 0 */
7242a55deb1SDavid E. O'Brien 	return(0);
7252a55deb1SDavid E. O'Brien }
7262a55deb1SDavid E. O'Brien 
727813da98dSDavid E. O'Brien int pmatch(fa *f, const char *p0)	/* longest match, for sub */
7282a55deb1SDavid E. O'Brien {
7292a55deb1SDavid E. O'Brien 	int s, ns;
730*f32a6403SWarner Losh 	int n;
731*f32a6403SWarner Losh 	int rune;
732f39dd6a9SWarner Losh 	const uschar *p = (const uschar *) p0;
733f39dd6a9SWarner Losh 	const uschar *q;
7342a55deb1SDavid E. O'Brien 
73562ebc626SRuslan Ermilov 	s = f->initstat;
736f39dd6a9SWarner Losh 	assert(s < f->state_count);
737f39dd6a9SWarner Losh 
738f39dd6a9SWarner Losh 	patbeg = (const char *)p;
7392a55deb1SDavid E. O'Brien 	patlen = -1;
7402a55deb1SDavid E. O'Brien 	do {
7412a55deb1SDavid E. O'Brien 		q = p;
7422a55deb1SDavid E. O'Brien 		do {
7432a55deb1SDavid E. O'Brien 			if (f->out[s])		/* final state */
7442a55deb1SDavid E. O'Brien 				patlen = q-p;
745addad6afSRong-En Fan 			/* assert(*q < NCHARS); */
746*f32a6403SWarner Losh 			n = u8_rune(&rune, (const char *) q);
747*f32a6403SWarner Losh 			if ((ns = get_gototab(f, s, rune)) != 0)
7482a55deb1SDavid E. O'Brien 				s = ns;
7492a55deb1SDavid E. O'Brien 			else
750*f32a6403SWarner Losh 				s = cgoto(f, s, rune);
751f39dd6a9SWarner Losh 
752f39dd6a9SWarner Losh 			assert(s < f->state_count);
753f39dd6a9SWarner Losh 
7542a55deb1SDavid E. O'Brien 			if (s == 1) {	/* no transition */
7552a55deb1SDavid E. O'Brien 				if (patlen >= 0) {
756f39dd6a9SWarner Losh 					patbeg = (const char *) p;
7572a55deb1SDavid E. O'Brien 					return(1);
7582a55deb1SDavid E. O'Brien 				}
7592a55deb1SDavid E. O'Brien 				else
7602a55deb1SDavid E. O'Brien 					goto nextin;	/* no match */
7612a55deb1SDavid E. O'Brien 			}
762*f32a6403SWarner Losh 			if (*q == 0)
763*f32a6403SWarner Losh 				break;
764*f32a6403SWarner Losh 			q += n;
765*f32a6403SWarner Losh 		} while (1);
766*f32a6403SWarner Losh 		q++;  /* was *q++ */
7672a55deb1SDavid E. O'Brien 		if (f->out[s])
7682a55deb1SDavid E. O'Brien 			patlen = q-p-1;	/* don't count $ */
7692a55deb1SDavid E. O'Brien 		if (patlen >= 0) {
770f39dd6a9SWarner Losh 			patbeg = (const char *) p;
7712a55deb1SDavid E. O'Brien 			return(1);
7722a55deb1SDavid E. O'Brien 		}
7732a55deb1SDavid E. O'Brien 	nextin:
7742a55deb1SDavid E. O'Brien 		s = 2;
775*f32a6403SWarner Losh 		if (*p == 0)
776*f32a6403SWarner Losh 			break;
777*f32a6403SWarner Losh 		n = u8_rune(&rune, (const char *) p);
778*f32a6403SWarner Losh 		p += n;
779*f32a6403SWarner Losh 	} while (1); /* was *p++ */
7802a55deb1SDavid E. O'Brien 	return (0);
7812a55deb1SDavid E. O'Brien }
7822a55deb1SDavid E. O'Brien 
783813da98dSDavid E. O'Brien int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
7842a55deb1SDavid E. O'Brien {
7852a55deb1SDavid E. O'Brien 	int s, ns;
786*f32a6403SWarner Losh         int n;
787*f32a6403SWarner Losh         int rune;
788f39dd6a9SWarner Losh 	const uschar *p = (const uschar *) p0;
789f39dd6a9SWarner Losh 	const uschar *q;
7902a55deb1SDavid E. O'Brien 
79162ebc626SRuslan Ermilov 	s = f->initstat;
792f39dd6a9SWarner Losh 	assert(s < f->state_count);
793f39dd6a9SWarner Losh 
794f39dd6a9SWarner Losh 	patbeg = (const char *)p;
7952a55deb1SDavid E. O'Brien 	patlen = -1;
7962a55deb1SDavid E. O'Brien 	while (*p) {
7972a55deb1SDavid E. O'Brien 		q = p;
7982a55deb1SDavid E. O'Brien 		do {
7992a55deb1SDavid E. O'Brien 			if (f->out[s])		/* final state */
8002a55deb1SDavid E. O'Brien 				patlen = q-p;
801addad6afSRong-En Fan 			/* assert(*q < NCHARS); */
802*f32a6403SWarner Losh 			n = u8_rune(&rune, (const char *) q);
803*f32a6403SWarner Losh 			if ((ns = get_gototab(f, s, rune)) != 0)
8042a55deb1SDavid E. O'Brien 				s = ns;
8052a55deb1SDavid E. O'Brien 			else
806*f32a6403SWarner Losh 				s = cgoto(f, s, rune);
8072a55deb1SDavid E. O'Brien 			if (s == 1) {	/* no transition */
8082a55deb1SDavid E. O'Brien 				if (patlen > 0) {
809f39dd6a9SWarner Losh 					patbeg = (const char *) p;
8102a55deb1SDavid E. O'Brien 					return(1);
8112a55deb1SDavid E. O'Brien 				} else
8122a55deb1SDavid E. O'Brien 					goto nnextin;	/* no nonempty match */
8132a55deb1SDavid E. O'Brien 			}
814*f32a6403SWarner Losh 			if (*q == 0)
815*f32a6403SWarner Losh 				break;
816*f32a6403SWarner Losh 			q += n;
817*f32a6403SWarner Losh 		} while (1);
818*f32a6403SWarner Losh 		q++;
8192a55deb1SDavid E. O'Brien 		if (f->out[s])
8202a55deb1SDavid E. O'Brien 			patlen = q-p-1;	/* don't count $ */
8212a55deb1SDavid E. O'Brien 		if (patlen > 0 ) {
822f39dd6a9SWarner Losh 			patbeg = (const char *) p;
8232a55deb1SDavid E. O'Brien 			return(1);
8242a55deb1SDavid E. O'Brien 		}
8252a55deb1SDavid E. O'Brien 	nnextin:
8262a55deb1SDavid E. O'Brien 		s = 2;
8272a55deb1SDavid E. O'Brien 		p++;
8282a55deb1SDavid E. O'Brien 	}
8292a55deb1SDavid E. O'Brien 	return (0);
8302a55deb1SDavid E. O'Brien }
8312a55deb1SDavid E. O'Brien 
832f39dd6a9SWarner Losh 
833*f32a6403SWarner Losh #define MAX_UTF_BYTES	4	// UTF-8 is up to 4 bytes long
834*f32a6403SWarner Losh 
835f39dd6a9SWarner Losh /*
836f39dd6a9SWarner Losh  * NAME
837f39dd6a9SWarner Losh  *     fnematch
838f39dd6a9SWarner Losh  *
839f39dd6a9SWarner Losh  * DESCRIPTION
840f39dd6a9SWarner Losh  *     A stream-fed version of nematch which transfers characters to a
841f39dd6a9SWarner Losh  *     null-terminated buffer. All characters up to and including the last
842f39dd6a9SWarner Losh  *     character of the matching text or EOF are placed in the buffer. If
843f39dd6a9SWarner Losh  *     a match is found, patbeg and patlen are set appropriately.
844f39dd6a9SWarner Losh  *
845f39dd6a9SWarner Losh  * RETURN VALUES
846f39dd6a9SWarner Losh  *     false    No match found.
847f39dd6a9SWarner Losh  *     true     Match found.
848f39dd6a9SWarner Losh  */
849f39dd6a9SWarner Losh 
850f39dd6a9SWarner Losh bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
851f39dd6a9SWarner Losh {
852*f32a6403SWarner Losh 	char *i, *j, *k, *buf = *pbuf;
853f39dd6a9SWarner Losh 	int bufsize = *pbufsize;
854*f32a6403SWarner Losh 	int c, n, ns, s;
855f39dd6a9SWarner Losh 
856f39dd6a9SWarner Losh 	s = pfa->initstat;
857f39dd6a9SWarner Losh 	patlen = 0;
858f39dd6a9SWarner Losh 
859f39dd6a9SWarner Losh 	/*
860*f32a6403SWarner Losh 	 * buf <= i <= j <= k <= buf+bufsize
861f39dd6a9SWarner Losh 	 *
862b2376a5fSWarner Losh 	 * i: origin of active substring
863b2376a5fSWarner Losh 	 * j: current character
864*f32a6403SWarner Losh 	 * k: destination of the next getc
865f39dd6a9SWarner Losh 	 */
866f39dd6a9SWarner Losh 
867*f32a6403SWarner Losh 	i = j = k = buf;
868*f32a6403SWarner Losh 
869*f32a6403SWarner Losh 	do {
870*f32a6403SWarner Losh 		/*
871*f32a6403SWarner Losh 		 * Call u8_rune with at least MAX_UTF_BYTES ahead in
872*f32a6403SWarner Losh 		 * the buffer until EOF interferes.
873*f32a6403SWarner Losh 		 */
874*f32a6403SWarner Losh 		if (k - j < MAX_UTF_BYTES) {
875*f32a6403SWarner Losh 			if (k + MAX_UTF_BYTES > buf + bufsize) {
876*f32a6403SWarner Losh 				adjbuf((char **) &buf, &bufsize,
877*f32a6403SWarner Losh 				    bufsize + MAX_UTF_BYTES,
878*f32a6403SWarner Losh 				    quantum, 0, "fnematch");
879*f32a6403SWarner Losh 			}
880*f32a6403SWarner Losh 			for (n = MAX_UTF_BYTES ; n > 0; n--) {
881*f32a6403SWarner Losh 				*k++ = (c = getc(f)) != EOF ? c : 0;
882*f32a6403SWarner Losh 				if (c == EOF) {
883*f32a6403SWarner Losh 					if (ferror(f))
884*f32a6403SWarner Losh 						FATAL("fnematch: getc error");
885*f32a6403SWarner Losh 					break;
886*f32a6403SWarner Losh 				}
887*f32a6403SWarner Losh 			}
888*f32a6403SWarner Losh 		}
889*f32a6403SWarner Losh 
890*f32a6403SWarner Losh 		j += u8_rune(&c, j);
891*f32a6403SWarner Losh 
892*f32a6403SWarner Losh 		if ((ns = get_gototab(pfa, s, c)) != 0)
893f39dd6a9SWarner Losh 			s = ns;
894f39dd6a9SWarner Losh 		else
895b2376a5fSWarner Losh 			s = cgoto(pfa, s, c);
896f39dd6a9SWarner Losh 
897f39dd6a9SWarner Losh 		if (pfa->out[s]) {	/* final state */
898*f32a6403SWarner Losh 			patbeg = i;
899*f32a6403SWarner Losh 			patlen = j - i;
900b2376a5fSWarner Losh 			if (c == 0)	/* don't count $ */
901f39dd6a9SWarner Losh 				patlen--;
902f39dd6a9SWarner Losh 		}
903*f32a6403SWarner Losh 
904*f32a6403SWarner Losh 		if (c && s != 1)
905*f32a6403SWarner Losh 			continue;  /* origin i still viable, next j */
906*f32a6403SWarner Losh 		if (patlen)
907*f32a6403SWarner Losh 			break;     /* best match found */
908*f32a6403SWarner Losh 
909*f32a6403SWarner Losh 		/* no match at origin i, next i and start over */
910*f32a6403SWarner Losh 		i += u8_rune(&c, i);
911*f32a6403SWarner Losh 		if (c == 0)
912*f32a6403SWarner Losh 			break;    /* no match */
913*f32a6403SWarner Losh 		j = i;
914f39dd6a9SWarner Losh 		s = 2;
915*f32a6403SWarner Losh 	} while (1);
916f39dd6a9SWarner Losh 
917f39dd6a9SWarner Losh 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
918f39dd6a9SWarner Losh 	*pbuf = buf;
919f39dd6a9SWarner Losh 	*pbufsize = bufsize;
920f39dd6a9SWarner Losh 
921f39dd6a9SWarner Losh 	if (patlen) {
922f39dd6a9SWarner Losh 		/*
923f39dd6a9SWarner Losh 		 * Under no circumstances is the last character fed to
924f39dd6a9SWarner Losh 		 * the automaton part of the match. It is EOF's nullbyte,
925f39dd6a9SWarner Losh 		 * or it sent the automaton into a state with no further
926f39dd6a9SWarner Losh 		 * transitions available (s==1), or both. Room for a
927f39dd6a9SWarner Losh 		 * terminating nullbyte is guaranteed.
928f39dd6a9SWarner Losh 		 *
929f39dd6a9SWarner Losh 		 * ungetc any chars after the end of matching text
930f39dd6a9SWarner Losh 		 * (except for EOF's nullbyte, if present) and null
931f39dd6a9SWarner Losh 		 * terminate the buffer.
932f39dd6a9SWarner Losh 		 */
933f39dd6a9SWarner Losh 		do
934*f32a6403SWarner Losh 			if (*--k && ungetc(*k, f) == EOF)
935*f32a6403SWarner Losh 				FATAL("unable to ungetc '%c'", *k);
936*f32a6403SWarner Losh 		while (k > patbeg + patlen);
937*f32a6403SWarner Losh 		*k = '\0';
938f39dd6a9SWarner Losh 		return true;
939f39dd6a9SWarner Losh 	}
940f39dd6a9SWarner Losh 	else
941f39dd6a9SWarner Losh 		return false;
942f39dd6a9SWarner Losh }
943f39dd6a9SWarner Losh 
944813da98dSDavid E. O'Brien Node *reparse(const char *p)	/* parses regular expression pointed to by p */
9452a55deb1SDavid E. O'Brien {			/* uses relex() to scan regular expression */
9462a55deb1SDavid E. O'Brien 	Node *np;
9472a55deb1SDavid E. O'Brien 
948f39dd6a9SWarner Losh 	DPRINTF("reparse <%s>\n", p);
949f39dd6a9SWarner Losh 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
9502a55deb1SDavid E. O'Brien 	rtok = relex();
951813da98dSDavid E. O'Brien 	/* GNU compatibility: an empty regexp matches anything */
952addad6afSRong-En Fan 	if (rtok == '\0') {
953813da98dSDavid E. O'Brien 		/* FATAL("empty regular expression"); previous */
954addad6afSRong-En Fan 		return(op2(EMPTYRE, NIL, NIL));
955addad6afSRong-En Fan 	}
9562a55deb1SDavid E. O'Brien 	np = regexp();
9572a55deb1SDavid E. O'Brien 	if (rtok != '\0')
9582a55deb1SDavid E. O'Brien 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
9592a55deb1SDavid E. O'Brien 	return(np);
9602a55deb1SDavid E. O'Brien }
9612a55deb1SDavid E. O'Brien 
9622a55deb1SDavid E. O'Brien Node *regexp(void)	/* top-level parse of reg expr */
9632a55deb1SDavid E. O'Brien {
9642a55deb1SDavid E. O'Brien 	return (alt(concat(primary())));
9652a55deb1SDavid E. O'Brien }
9662a55deb1SDavid E. O'Brien 
9672a55deb1SDavid E. O'Brien Node *primary(void)
9682a55deb1SDavid E. O'Brien {
9692a55deb1SDavid E. O'Brien 	Node *np;
970b5253557SWarner Losh 	int savelastatom;
9712a55deb1SDavid E. O'Brien 
9722a55deb1SDavid E. O'Brien 	switch (rtok) {
9732a55deb1SDavid E. O'Brien 	case CHAR:
974b5253557SWarner Losh 		lastatom = starttok;
9752a55deb1SDavid E. O'Brien 		np = op2(CHAR, NIL, itonp(rlxval));
9762a55deb1SDavid E. O'Brien 		rtok = relex();
9772a55deb1SDavid E. O'Brien 		return (unary(np));
9782a55deb1SDavid E. O'Brien 	case ALL:
9792a55deb1SDavid E. O'Brien 		rtok = relex();
9802a55deb1SDavid E. O'Brien 		return (unary(op2(ALL, NIL, NIL)));
981addad6afSRong-En Fan 	case EMPTYRE:
982addad6afSRong-En Fan 		rtok = relex();
983b5253557SWarner Losh 		return (unary(op2(EMPTYRE, NIL, NIL)));
9842a55deb1SDavid E. O'Brien 	case DOT:
985b5253557SWarner Losh 		lastatom = starttok;
9862a55deb1SDavid E. O'Brien 		rtok = relex();
9872a55deb1SDavid E. O'Brien 		return (unary(op2(DOT, NIL, NIL)));
9882a55deb1SDavid E. O'Brien 	case CCL:
989f39dd6a9SWarner Losh 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
990b5253557SWarner Losh 		lastatom = starttok;
9912a55deb1SDavid E. O'Brien 		rtok = relex();
9922a55deb1SDavid E. O'Brien 		return (unary(np));
9932a55deb1SDavid E. O'Brien 	case NCCL:
994f39dd6a9SWarner Losh 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
995b5253557SWarner Losh 		lastatom = starttok;
9962a55deb1SDavid E. O'Brien 		rtok = relex();
9972a55deb1SDavid E. O'Brien 		return (unary(np));
9982a55deb1SDavid E. O'Brien 	case '^':
9992a55deb1SDavid E. O'Brien 		rtok = relex();
10002a55deb1SDavid E. O'Brien 		return (unary(op2(CHAR, NIL, itonp(HAT))));
10012a55deb1SDavid E. O'Brien 	case '$':
10022a55deb1SDavid E. O'Brien 		rtok = relex();
10032a55deb1SDavid E. O'Brien 		return (unary(op2(CHAR, NIL, NIL)));
10042a55deb1SDavid E. O'Brien 	case '(':
1005b5253557SWarner Losh 		lastatom = starttok;
1006b5253557SWarner Losh 		savelastatom = starttok - basestr; /* Retain over recursion */
10072a55deb1SDavid E. O'Brien 		rtok = relex();
10082a55deb1SDavid E. O'Brien 		if (rtok == ')') {	/* special pleading for () */
10092a55deb1SDavid E. O'Brien 			rtok = relex();
1010*f32a6403SWarner Losh 			return unary(op2(CCL, NIL, (Node *) cclenter("")));
10112a55deb1SDavid E. O'Brien 		}
10122a55deb1SDavid E. O'Brien 		np = regexp();
10132a55deb1SDavid E. O'Brien 		if (rtok == ')') {
1014b5253557SWarner Losh 			lastatom = basestr + savelastatom; /* Restore */
10152a55deb1SDavid E. O'Brien 			rtok = relex();
10162a55deb1SDavid E. O'Brien 			return (unary(np));
10172a55deb1SDavid E. O'Brien 		}
10182a55deb1SDavid E. O'Brien 		else
10192a55deb1SDavid E. O'Brien 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
10202a55deb1SDavid E. O'Brien 	default:
10212a55deb1SDavid E. O'Brien 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
10222a55deb1SDavid E. O'Brien 	}
10232a55deb1SDavid E. O'Brien 	return 0;	/*NOTREACHED*/
10242a55deb1SDavid E. O'Brien }
10252a55deb1SDavid E. O'Brien 
10262a55deb1SDavid E. O'Brien Node *concat(Node *np)
10272a55deb1SDavid E. O'Brien {
10282a55deb1SDavid E. O'Brien 	switch (rtok) {
1029b5253557SWarner Losh 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
10302a55deb1SDavid E. O'Brien 		return (concat(op2(CAT, np, primary())));
1031b5253557SWarner Losh 	case EMPTYRE:
1032b5253557SWarner Losh 		rtok = relex();
1033*f32a6403SWarner Losh 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1034b5253557SWarner Losh 				primary())));
10352a55deb1SDavid E. O'Brien 	}
10362a55deb1SDavid E. O'Brien 	return (np);
10372a55deb1SDavid E. O'Brien }
10382a55deb1SDavid E. O'Brien 
10392a55deb1SDavid E. O'Brien Node *alt(Node *np)
10402a55deb1SDavid E. O'Brien {
10412a55deb1SDavid E. O'Brien 	if (rtok == OR) {
10422a55deb1SDavid E. O'Brien 		rtok = relex();
10432a55deb1SDavid E. O'Brien 		return (alt(op2(OR, np, concat(primary()))));
10442a55deb1SDavid E. O'Brien 	}
10452a55deb1SDavid E. O'Brien 	return (np);
10462a55deb1SDavid E. O'Brien }
10472a55deb1SDavid E. O'Brien 
10482a55deb1SDavid E. O'Brien Node *unary(Node *np)
10492a55deb1SDavid E. O'Brien {
10502a55deb1SDavid E. O'Brien 	switch (rtok) {
10512a55deb1SDavid E. O'Brien 	case STAR:
10522a55deb1SDavid E. O'Brien 		rtok = relex();
10532a55deb1SDavid E. O'Brien 		return (unary(op2(STAR, np, NIL)));
10542a55deb1SDavid E. O'Brien 	case PLUS:
10552a55deb1SDavid E. O'Brien 		rtok = relex();
10562a55deb1SDavid E. O'Brien 		return (unary(op2(PLUS, np, NIL)));
10572a55deb1SDavid E. O'Brien 	case QUEST:
10582a55deb1SDavid E. O'Brien 		rtok = relex();
10592a55deb1SDavid E. O'Brien 		return (unary(op2(QUEST, np, NIL)));
1060f39dd6a9SWarner Losh 	case ZERO:
1061f39dd6a9SWarner Losh 		rtok = relex();
1062f39dd6a9SWarner Losh 		return (unary(op2(ZERO, np, NIL)));
10632a55deb1SDavid E. O'Brien 	default:
10642a55deb1SDavid E. O'Brien 		return (np);
10652a55deb1SDavid E. O'Brien 	}
10662a55deb1SDavid E. O'Brien }
10672a55deb1SDavid E. O'Brien 
1068007c6572SDag-Erling Smørgrav /*
1069007c6572SDag-Erling Smørgrav  * Character class definitions conformant to the POSIX locale as
1070007c6572SDag-Erling Smørgrav  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1071007c6572SDag-Erling Smørgrav  * and operating character sets are both ASCII (ISO646) or supersets
1072007c6572SDag-Erling Smørgrav  * thereof.
1073007c6572SDag-Erling Smørgrav  *
1074007c6572SDag-Erling Smørgrav  * Note that to avoid overflowing the temporary buffer used in
1075007c6572SDag-Erling Smørgrav  * relex(), the expanded character class (prior to range expansion)
1076007c6572SDag-Erling Smørgrav  * must be less than twice the size of their full name.
1077007c6572SDag-Erling Smørgrav  */
1078fc6b1dfeSDavid E. O'Brien 
1079fc6b1dfeSDavid E. O'Brien /* Because isblank doesn't show up in any of the header files on any
1080fc6b1dfeSDavid E. O'Brien  * system i use, it's defined here.  if some other locale has a richer
1081fc6b1dfeSDavid E. O'Brien  * definition of "blank", define HAS_ISBLANK and provide your own
1082fc6b1dfeSDavid E. O'Brien  * version.
108388b8d487SRuslan Ermilov  * the parentheses here are an attempt to find a path through the maze
108488b8d487SRuslan Ermilov  * of macro definition and/or function and/or version provided.  thanks
108588b8d487SRuslan Ermilov  * to nelson beebe for the suggestion; let's see if it works everywhere.
1086fc6b1dfeSDavid E. O'Brien  */
1087fc6b1dfeSDavid E. O'Brien 
108891217c1cSRuslan Ermilov /* #define HAS_ISBLANK */
1089fc6b1dfeSDavid E. O'Brien #ifndef HAS_ISBLANK
1090fc6b1dfeSDavid E. O'Brien 
10911b11b783SRuslan Ermilov int (xisblank)(int c)
1092fc6b1dfeSDavid E. O'Brien {
1093fc6b1dfeSDavid E. O'Brien 	return c==' ' || c=='\t';
1094fc6b1dfeSDavid E. O'Brien }
1095fc6b1dfeSDavid E. O'Brien 
1096fc6b1dfeSDavid E. O'Brien #endif
1097fc6b1dfeSDavid E. O'Brien 
1098f39dd6a9SWarner Losh static const struct charclass {
1099007c6572SDag-Erling Smørgrav 	const char *cc_name;
1100007c6572SDag-Erling Smørgrav 	int cc_namelen;
1101fc6b1dfeSDavid E. O'Brien 	int (*cc_func)(int);
1102007c6572SDag-Erling Smørgrav } charclasses[] = {
1103fc6b1dfeSDavid E. O'Brien 	{ "alnum",	5,	isalnum },
1104fc6b1dfeSDavid E. O'Brien 	{ "alpha",	5,	isalpha },
11051b11b783SRuslan Ermilov #ifndef HAS_ISBLANK
1106b5253557SWarner Losh 	{ "blank",	5,	xisblank },
11071b11b783SRuslan Ermilov #else
1108fc6b1dfeSDavid E. O'Brien 	{ "blank",	5,	isblank },
11091b11b783SRuslan Ermilov #endif
1110fc6b1dfeSDavid E. O'Brien 	{ "cntrl",	5,	iscntrl },
1111fc6b1dfeSDavid E. O'Brien 	{ "digit",	5,	isdigit },
1112fc6b1dfeSDavid E. O'Brien 	{ "graph",	5,	isgraph },
1113fc6b1dfeSDavid E. O'Brien 	{ "lower",	5,	islower },
1114fc6b1dfeSDavid E. O'Brien 	{ "print",	5,	isprint },
1115fc6b1dfeSDavid E. O'Brien 	{ "punct",	5,	ispunct },
1116fc6b1dfeSDavid E. O'Brien 	{ "space",	5,	isspace },
1117fc6b1dfeSDavid E. O'Brien 	{ "upper",	5,	isupper },
1118fc6b1dfeSDavid E. O'Brien 	{ "xdigit",	6,	isxdigit },
1119007c6572SDag-Erling Smørgrav 	{ NULL,		0,	NULL },
1120007c6572SDag-Erling Smørgrav };
1121007c6572SDag-Erling Smørgrav 
1122b5253557SWarner Losh #define REPEAT_SIMPLE		0
1123b5253557SWarner Losh #define REPEAT_PLUS_APPENDED	1
1124b5253557SWarner Losh #define REPEAT_WITH_Q		2
1125b5253557SWarner Losh #define REPEAT_ZERO		3
1126b5253557SWarner Losh 
1127b5253557SWarner Losh static int
1128b5253557SWarner Losh replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1129b5253557SWarner Losh 	       int atomlen, int firstnum, int secondnum, int special_case)
1130b5253557SWarner Losh {
1131b5253557SWarner Losh 	int i, j;
1132b5253557SWarner Losh 	uschar *buf = 0;
1133b5253557SWarner Losh 	int ret = 1;
1134b5253557SWarner Losh 	int init_q = (firstnum == 0);		/* first added char will be ? */
1135b5253557SWarner Losh 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
1136b5253557SWarner Losh 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
1137f39dd6a9SWarner Losh 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
1138b5253557SWarner Losh 	int size = prefix_length +  suffix_length;
1139b5253557SWarner Losh 
1140b5253557SWarner Losh 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
1141b5253557SWarner Losh 		size += atomlen*(firstnum-1);
1142b5253557SWarner Losh 	}
1143b5253557SWarner Losh 
1144b5253557SWarner Losh 	/* Adjust size of buffer for special cases */
1145b5253557SWarner Losh 	if (special_case == REPEAT_PLUS_APPENDED) {
1146b5253557SWarner Losh 		size++;		/* for the final + */
1147b5253557SWarner Losh 	} else if (special_case == REPEAT_WITH_Q) {
114823f24377SWarner Losh 		size += init_q + (atomlen+1)* (n_q_reps-init_q);
1149b5253557SWarner Losh 	} else if (special_case == REPEAT_ZERO) {
1150b5253557SWarner Losh 		size += 2;	/* just a null ERE: () */
1151b5253557SWarner Losh 	}
1152b5253557SWarner Losh 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
1153b5253557SWarner Losh 		FATAL("out of space in reg expr %.10s..", lastre);
1154b5253557SWarner Losh 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
1155b5253557SWarner Losh 	j = prefix_length;
1156b5253557SWarner Losh 	if (special_case == REPEAT_ZERO) {
1157b5253557SWarner Losh 		j -= atomlen;
1158b5253557SWarner Losh 		buf[j++] = '(';
1159b5253557SWarner Losh 		buf[j++] = ')';
1160b5253557SWarner Losh 	}
1161b5253557SWarner Losh 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
1162b5253557SWarner Losh 		memcpy(&buf[j], atom, atomlen);
1163b5253557SWarner Losh 		j += atomlen;
1164b5253557SWarner Losh 	}
1165b5253557SWarner Losh 	if (special_case == REPEAT_PLUS_APPENDED) {
1166b5253557SWarner Losh 		buf[j++] = '+';
1167b5253557SWarner Losh 	} else if (special_case == REPEAT_WITH_Q) {
1168f39dd6a9SWarner Losh 		if (init_q)
1169f39dd6a9SWarner Losh 			buf[j++] = '?';
1170f39dd6a9SWarner Losh 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
1171b5253557SWarner Losh 			memcpy(&buf[j], atom, atomlen);
1172b5253557SWarner Losh 			j += atomlen;
1173b5253557SWarner Losh 			buf[j++] = '?';
1174b5253557SWarner Losh 		}
1175b5253557SWarner Losh 	}
1176b5253557SWarner Losh 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
117723f24377SWarner Losh 	j += suffix_length;
117823f24377SWarner Losh 	buf[j] = '\0';
1179b5253557SWarner Losh 	/* free old basestr */
1180b5253557SWarner Losh 	if (firstbasestr != basestr) {
1181b5253557SWarner Losh 		if (basestr)
1182b5253557SWarner Losh 			xfree(basestr);
1183b5253557SWarner Losh 	}
1184b5253557SWarner Losh 	basestr = buf;
1185b5253557SWarner Losh 	prestr  = buf + prefix_length;
1186b5253557SWarner Losh 	if (special_case == REPEAT_ZERO) {
1187b5253557SWarner Losh 		prestr  -= atomlen;
1188b5253557SWarner Losh 		ret++;
1189b5253557SWarner Losh 	}
1190b5253557SWarner Losh 	return ret;
1191b5253557SWarner Losh }
1192b5253557SWarner Losh 
1193b5253557SWarner Losh static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1194b5253557SWarner Losh 		  int atomlen, int firstnum, int secondnum)
1195b5253557SWarner Losh {
1196b5253557SWarner Losh 	/*
1197b5253557SWarner Losh 	   In general, the repetition specifier or "bound" is replaced here
1198b5253557SWarner Losh 	   by an equivalent ERE string, repeating the immediately previous atom
1199b5253557SWarner Losh 	   and appending ? and + as needed. Note that the first copy of the
1200b5253557SWarner Losh 	   atom is left in place, except in the special_case of a zero-repeat
1201b5253557SWarner Losh 	   (i.e., {0}).
1202b5253557SWarner Losh 	 */
1203b5253557SWarner Losh 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
1204b5253557SWarner Losh 		if (firstnum < 2) {
1205b5253557SWarner Losh 			/* 0 or 1: should be handled before you get here */
1206b5253557SWarner Losh 			FATAL("internal error");
1207b5253557SWarner Losh 		} else {
1208b5253557SWarner Losh 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1209b5253557SWarner Losh 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1210b5253557SWarner Losh 		}
1211b5253557SWarner Losh 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1212b5253557SWarner Losh 		if (firstnum == 0) {	/* {0} or {0,0} */
1213b5253557SWarner Losh 			/* This case is unusual because the resulting
1214b5253557SWarner Losh 			   replacement string might actually be SMALLER than
1215b5253557SWarner Losh 			   the original ERE */
1216b5253557SWarner Losh 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1217b5253557SWarner Losh 					firstnum, secondnum, REPEAT_ZERO);
1218b5253557SWarner Losh 		} else {		/* (firstnum >= 1) */
1219b5253557SWarner Losh 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1220b5253557SWarner Losh 					firstnum, secondnum, REPEAT_SIMPLE);
1221b5253557SWarner Losh 		}
1222b5253557SWarner Losh 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1223b5253557SWarner Losh 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1224b5253557SWarner Losh 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1225b5253557SWarner Losh 					firstnum, secondnum, REPEAT_WITH_Q);
1226b5253557SWarner Losh 	} else {	/* Error - shouldn't be here (n>m) */
1227b5253557SWarner Losh 		FATAL("internal error");
1228b5253557SWarner Losh 	}
1229b5253557SWarner Losh 	return 0;
1230b5253557SWarner Losh }
1231007c6572SDag-Erling Smørgrav 
12322a55deb1SDavid E. O'Brien int relex(void)		/* lexical analyzer for reparse */
12332a55deb1SDavid E. O'Brien {
12342a55deb1SDavid E. O'Brien 	int c, n;
12352a55deb1SDavid E. O'Brien 	int cflag;
123610ce5b99SWarner Losh 	static uschar *buf = NULL;
12372a55deb1SDavid E. O'Brien 	static int bufsz = 100;
12382a55deb1SDavid E. O'Brien 	uschar *bp;
1239f39dd6a9SWarner Losh 	const struct charclass *cc;
1240fc6b1dfeSDavid E. O'Brien 	int i;
1241f39dd6a9SWarner Losh 	int num, m;
1242f39dd6a9SWarner Losh 	bool commafound, digitfound;
1243b5253557SWarner Losh 	const uschar *startreptok;
1244f39dd6a9SWarner Losh 	static int parens = 0;
1245b5253557SWarner Losh 
1246b5253557SWarner Losh rescan:
1247b5253557SWarner Losh 	starttok = prestr;
12482a55deb1SDavid E. O'Brien 
1249*f32a6403SWarner Losh 	if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1250*f32a6403SWarner Losh 		prestr += n;
1251*f32a6403SWarner Losh 		starttok = prestr;
1252*f32a6403SWarner Losh 		return CHAR;
1253*f32a6403SWarner Losh 	}
1254*f32a6403SWarner Losh 
12552a55deb1SDavid E. O'Brien 	switch (c = *prestr++) {
12562a55deb1SDavid E. O'Brien 	case '|': return OR;
12572a55deb1SDavid E. O'Brien 	case '*': return STAR;
12582a55deb1SDavid E. O'Brien 	case '+': return PLUS;
12592a55deb1SDavid E. O'Brien 	case '?': return QUEST;
12602a55deb1SDavid E. O'Brien 	case '.': return DOT;
12612a55deb1SDavid E. O'Brien 	case '\0': prestr--; return '\0';
12622a55deb1SDavid E. O'Brien 	case '^':
12632a55deb1SDavid E. O'Brien 	case '$':
12642a55deb1SDavid E. O'Brien 		return c;
1265f39dd6a9SWarner Losh 	case '(':
1266f39dd6a9SWarner Losh 		parens++;
1267f39dd6a9SWarner Losh 		return c;
1268f39dd6a9SWarner Losh 	case ')':
1269f39dd6a9SWarner Losh 		if (parens) {
1270f39dd6a9SWarner Losh 			parens--;
1271f39dd6a9SWarner Losh 			return c;
1272f39dd6a9SWarner Losh 		}
1273f39dd6a9SWarner Losh 		/* unmatched close parenthesis; per POSIX, treat as literal */
1274f39dd6a9SWarner Losh 		rlxval = c;
1275f39dd6a9SWarner Losh 		return CHAR;
12762a55deb1SDavid E. O'Brien 	case '\\':
1277d86a0988SRuslan Ermilov 		rlxval = quoted(&prestr);
12782a55deb1SDavid E. O'Brien 		return CHAR;
12792a55deb1SDavid E. O'Brien 	default:
12802a55deb1SDavid E. O'Brien 		rlxval = c;
12812a55deb1SDavid E. O'Brien 		return CHAR;
12822a55deb1SDavid E. O'Brien 	case '[':
128310ce5b99SWarner Losh 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
12842a55deb1SDavid E. O'Brien 			FATAL("out of space in reg expr %.10s..", lastre);
12852a55deb1SDavid E. O'Brien 		bp = buf;
12862a55deb1SDavid E. O'Brien 		if (*prestr == '^') {
12872a55deb1SDavid E. O'Brien 			cflag = 1;
12882a55deb1SDavid E. O'Brien 			prestr++;
12892a55deb1SDavid E. O'Brien 		}
12902a55deb1SDavid E. O'Brien 		else
12912a55deb1SDavid E. O'Brien 			cflag = 0;
1292*f32a6403SWarner Losh 		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1293addad6afSRong-En Fan 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
12942a55deb1SDavid E. O'Brien 			FATAL("out of space for reg expr %.10s...", lastre);
12952a55deb1SDavid E. O'Brien 		for (; ; ) {
1296*f32a6403SWarner Losh 			if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1297*f32a6403SWarner Losh 				for (i = 0; i < n; i++)
1298*f32a6403SWarner Losh 					*bp++ = *prestr++;
1299*f32a6403SWarner Losh 				continue;
1300*f32a6403SWarner Losh 			}
13012a55deb1SDavid E. O'Brien 			if ((c = *prestr++) == '\\') {
13022a55deb1SDavid E. O'Brien 				*bp++ = '\\';
13032a55deb1SDavid E. O'Brien 				if ((c = *prestr++) == '\0')
13042a55deb1SDavid E. O'Brien 					FATAL("nonterminated character class %.20s...", lastre);
13052a55deb1SDavid E. O'Brien 				*bp++ = c;
13062a55deb1SDavid E. O'Brien 			/* } else if (c == '\n') { */
13072a55deb1SDavid E. O'Brien 			/* 	FATAL("newline in character class %.20s...", lastre); */
1308007c6572SDag-Erling Smørgrav 			} else if (c == '[' && *prestr == ':') {
1309007c6572SDag-Erling Smørgrav 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1310007c6572SDag-Erling Smørgrav 				for (cc = charclasses; cc->cc_name; cc++)
1311007c6572SDag-Erling Smørgrav 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1312007c6572SDag-Erling Smørgrav 						break;
1313007c6572SDag-Erling Smørgrav 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1314007c6572SDag-Erling Smørgrav 				    prestr[2 + cc->cc_namelen] == ']') {
1315007c6572SDag-Erling Smørgrav 					prestr += cc->cc_namelen + 3;
1316b5253557SWarner Losh 					/*
1317b5253557SWarner Losh 					 * BUG: We begin at 1, instead of 0, since we
1318b5253557SWarner Losh 					 * would otherwise prematurely terminate the
1319b5253557SWarner Losh 					 * string for classes like [[:cntrl:]]. This
1320b5253557SWarner Losh 					 * means that we can't match the NUL character,
1321b5253557SWarner Losh 					 * not without first adapting the entire
1322b5253557SWarner Losh 					 * program to track each string's length.
1323b5253557SWarner Losh 					 */
1324b5253557SWarner Losh 					for (i = 1; i <= UCHAR_MAX; i++) {
1325*f32a6403SWarner Losh 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1326fc6b1dfeSDavid E. O'Brien 						    FATAL("out of space for reg expr %.10s...", lastre);
1327fc6b1dfeSDavid E. O'Brien 						if (cc->cc_func(i)) {
1328f39dd6a9SWarner Losh 							/* escape backslash */
1329f39dd6a9SWarner Losh 							if (i == '\\') {
1330f39dd6a9SWarner Losh 								*bp++ = '\\';
1331f39dd6a9SWarner Losh 								n++;
1332f39dd6a9SWarner Losh 							}
1333f39dd6a9SWarner Losh 
1334fc6b1dfeSDavid E. O'Brien 							*bp++ = i;
1335fc6b1dfeSDavid E. O'Brien 							n++;
1336fc6b1dfeSDavid E. O'Brien 						}
1337fc6b1dfeSDavid E. O'Brien 					}
1338007c6572SDag-Erling Smørgrav 				} else
1339007c6572SDag-Erling Smørgrav 					*bp++ = c;
1340b5253557SWarner Losh 			} else if (c == '[' && *prestr == '.') {
1341b5253557SWarner Losh 				char collate_char;
1342b5253557SWarner Losh 				prestr++;
1343b5253557SWarner Losh 				collate_char = *prestr++;
1344b5253557SWarner Losh 				if (*prestr == '.' && prestr[1] == ']') {
1345b5253557SWarner Losh 					prestr += 2;
1346b5253557SWarner Losh 					/* Found it: map via locale TBD: for
1347b5253557SWarner Losh 					   now, simply return this char.  This
1348b5253557SWarner Losh 					   is sufficient to pass conformance
1349b5253557SWarner Losh 					   test awk.ex 156
1350b5253557SWarner Losh 					 */
1351b5253557SWarner Losh 					if (*prestr == ']') {
1352b5253557SWarner Losh 						prestr++;
1353b5253557SWarner Losh 						rlxval = collate_char;
1354b5253557SWarner Losh 						return CHAR;
1355b5253557SWarner Losh 					}
1356b5253557SWarner Losh 				}
1357b5253557SWarner Losh 			} else if (c == '[' && *prestr == '=') {
1358b5253557SWarner Losh 				char equiv_char;
1359b5253557SWarner Losh 				prestr++;
1360b5253557SWarner Losh 				equiv_char = *prestr++;
1361b5253557SWarner Losh 				if (*prestr == '=' && prestr[1] == ']') {
1362b5253557SWarner Losh 					prestr += 2;
1363b5253557SWarner Losh 					/* Found it: map via locale TBD: for now
1364b5253557SWarner Losh 					   simply return this char. This is
1365b5253557SWarner Losh 					   sufficient to pass conformance test
1366b5253557SWarner Losh 					   awk.ex 156
1367b5253557SWarner Losh 					 */
1368b5253557SWarner Losh 					if (*prestr == ']') {
1369b5253557SWarner Losh 						prestr++;
1370b5253557SWarner Losh 						rlxval = equiv_char;
1371b5253557SWarner Losh 						return CHAR;
1372b5253557SWarner Losh 					}
1373b5253557SWarner Losh 				}
13742a55deb1SDavid E. O'Brien 			} else if (c == '\0') {
13752a55deb1SDavid E. O'Brien 				FATAL("nonterminated character class %.20s", lastre);
13762a55deb1SDavid E. O'Brien 			} else if (bp == buf) {	/* 1st char is special */
13772a55deb1SDavid E. O'Brien 				*bp++ = c;
13782a55deb1SDavid E. O'Brien 			} else if (c == ']') {
13792a55deb1SDavid E. O'Brien 				*bp++ = 0;
13802a55deb1SDavid E. O'Brien 				rlxstr = (uschar *) tostring((char *) buf);
13812a55deb1SDavid E. O'Brien 				if (cflag == 0)
13822a55deb1SDavid E. O'Brien 					return CCL;
13832a55deb1SDavid E. O'Brien 				else
13842a55deb1SDavid E. O'Brien 					return NCCL;
13852a55deb1SDavid E. O'Brien 			} else
13862a55deb1SDavid E. O'Brien 				*bp++ = c;
13872a55deb1SDavid E. O'Brien 		}
1388b5253557SWarner Losh 		break;
1389b5253557SWarner Losh 	case '{':
1390*f32a6403SWarner Losh 		if (isdigit((int) *(prestr))) {
1391b5253557SWarner Losh 			num = 0;	/* Process as a repetition */
1392b5253557SWarner Losh 			n = -1; m = -1;
1393f39dd6a9SWarner Losh 			commafound = false;
1394f39dd6a9SWarner Losh 			digitfound = false;
1395b5253557SWarner Losh 			startreptok = prestr-1;
1396b5253557SWarner Losh 			/* Remember start of previous atom here ? */
1397b5253557SWarner Losh 		} else {        	/* just a { char, not a repetition */
1398b5253557SWarner Losh 			rlxval = c;
1399b5253557SWarner Losh 			return CHAR;
1400b5253557SWarner Losh                 }
1401b5253557SWarner Losh 		for (; ; ) {
1402b5253557SWarner Losh 			if ((c = *prestr++) == '}') {
1403b5253557SWarner Losh 				if (commafound) {
1404b5253557SWarner Losh 					if (digitfound) { /* {n,m} */
1405b5253557SWarner Losh 						m = num;
1406b5253557SWarner Losh 						if (m < n)
1407b5253557SWarner Losh 							FATAL("illegal repetition expression: class %.20s",
1408b5253557SWarner Losh 								lastre);
1409f39dd6a9SWarner Losh 						if (n == 0 && m == 1) {
1410b5253557SWarner Losh 							return QUEST;
1411b5253557SWarner Losh 						}
1412b5253557SWarner Losh 					} else {	/* {n,} */
1413f39dd6a9SWarner Losh 						if (n == 0)
1414f39dd6a9SWarner Losh 							return STAR;
1415f39dd6a9SWarner Losh 						else if (n == 1)
1416f39dd6a9SWarner Losh 							return PLUS;
1417b5253557SWarner Losh 					}
1418b5253557SWarner Losh 				} else {
1419b5253557SWarner Losh 					if (digitfound) { /* {n} same as {n,n} */
1420b5253557SWarner Losh 						n = num;
1421b5253557SWarner Losh 						m = num;
1422b5253557SWarner Losh 					} else {	/* {} */
1423b5253557SWarner Losh 						FATAL("illegal repetition expression: class %.20s",
1424b5253557SWarner Losh 							lastre);
1425b5253557SWarner Losh 					}
1426b5253557SWarner Losh 				}
1427b5253557SWarner Losh 				if (repeat(starttok, prestr-starttok, lastatom,
1428b5253557SWarner Losh 					   startreptok - lastatom, n, m) > 0) {
1429f39dd6a9SWarner Losh 					if (n == 0 && m == 0) {
1430f39dd6a9SWarner Losh 						return ZERO;
1431b5253557SWarner Losh 					}
1432b5253557SWarner Losh 					/* must rescan input for next token */
1433b5253557SWarner Losh 					goto rescan;
1434b5253557SWarner Losh 				}
1435b5253557SWarner Losh 				/* Failed to replace: eat up {...} characters
1436b5253557SWarner Losh 				   and treat like just PLUS */
1437b5253557SWarner Losh 				return PLUS;
1438b5253557SWarner Losh 			} else if (c == '\0') {
1439b5253557SWarner Losh 				FATAL("nonterminated character class %.20s",
1440b5253557SWarner Losh 					lastre);
1441b5253557SWarner Losh 			} else if (isdigit(c)) {
1442b5253557SWarner Losh 				num = 10 * num + c - '0';
1443f39dd6a9SWarner Losh 				digitfound = true;
1444b5253557SWarner Losh 			} else if (c == ',') {
1445b5253557SWarner Losh 				if (commafound)
1446b5253557SWarner Losh 					FATAL("illegal repetition expression: class %.20s",
1447b5253557SWarner Losh 						lastre);
1448b5253557SWarner Losh 				/* looking for {n,} or {n,m} */
1449f39dd6a9SWarner Losh 				commafound = true;
1450b5253557SWarner Losh 				n = num;
1451f39dd6a9SWarner Losh 				digitfound = false; /* reset */
1452b5253557SWarner Losh 				num = 0;
1453b5253557SWarner Losh 			} else {
1454b5253557SWarner Losh 				FATAL("illegal repetition expression: class %.20s",
1455b5253557SWarner Losh 					lastre);
1456b5253557SWarner Losh 			}
1457b5253557SWarner Losh 		}
1458b5253557SWarner Losh 		break;
14592a55deb1SDavid E. O'Brien 	}
14602a55deb1SDavid E. O'Brien }
14612a55deb1SDavid E. O'Brien 
14622a55deb1SDavid E. O'Brien int cgoto(fa *f, int s, int c)
14632a55deb1SDavid E. O'Brien {
14642a55deb1SDavid E. O'Brien 	int *p, *q;
1465f39dd6a9SWarner Losh 	int i, j, k;
14662a55deb1SDavid E. O'Brien 
1467*f32a6403SWarner Losh 	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
14682a55deb1SDavid E. O'Brien 	while (f->accept >= maxsetvec) {	/* guessing here! */
1469f39dd6a9SWarner Losh 		resizesetvec(__func__);
14702a55deb1SDavid E. O'Brien 	}
14712a55deb1SDavid E. O'Brien 	for (i = 0; i <= f->accept; i++)
14722a55deb1SDavid E. O'Brien 		setvec[i] = 0;
14732a55deb1SDavid E. O'Brien 	setcnt = 0;
1474f39dd6a9SWarner Losh 	resize_state(f, s);
14752a55deb1SDavid E. O'Brien 	/* compute positions of gototab[s,c] into setvec */
14762a55deb1SDavid E. O'Brien 	p = f->posns[s];
14772a55deb1SDavid E. O'Brien 	for (i = 1; i <= *p; i++) {
14782a55deb1SDavid E. O'Brien 		if ((k = f->re[p[i]].ltype) != FINAL) {
14792a55deb1SDavid E. O'Brien 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
14802a55deb1SDavid E. O'Brien 			 || (k == DOT && c != 0 && c != HAT)
14812a55deb1SDavid E. O'Brien 			 || (k == ALL && c != 0)
1482addad6afSRong-En Fan 			 || (k == EMPTYRE && c != 0)
1483*f32a6403SWarner Losh 			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1484*f32a6403SWarner Losh 			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
14852a55deb1SDavid E. O'Brien 				q = f->re[p[i]].lfollow;
14862a55deb1SDavid E. O'Brien 				for (j = 1; j <= *q; j++) {
14872a55deb1SDavid E. O'Brien 					if (q[j] >= maxsetvec) {
1488f39dd6a9SWarner Losh 						resizesetvec(__func__);
14892a55deb1SDavid E. O'Brien 					}
14902a55deb1SDavid E. O'Brien 					if (setvec[q[j]] == 0) {
14912a55deb1SDavid E. O'Brien 						setcnt++;
14922a55deb1SDavid E. O'Brien 						setvec[q[j]] = 1;
14932a55deb1SDavid E. O'Brien 					}
14942a55deb1SDavid E. O'Brien 				}
14952a55deb1SDavid E. O'Brien 			}
14962a55deb1SDavid E. O'Brien 		}
14972a55deb1SDavid E. O'Brien 	}
14982a55deb1SDavid E. O'Brien 	/* determine if setvec is a previous state */
14992a55deb1SDavid E. O'Brien 	tmpset[0] = setcnt;
15002a55deb1SDavid E. O'Brien 	j = 1;
15012a55deb1SDavid E. O'Brien 	for (i = f->accept; i >= 0; i--)
15022a55deb1SDavid E. O'Brien 		if (setvec[i]) {
15032a55deb1SDavid E. O'Brien 			tmpset[j++] = i;
15042a55deb1SDavid E. O'Brien 		}
1505f39dd6a9SWarner Losh 	resize_state(f, f->curstat > s ? f->curstat : s);
15062a55deb1SDavid E. O'Brien 	/* tmpset == previous state? */
15072a55deb1SDavid E. O'Brien 	for (i = 1; i <= f->curstat; i++) {
15082a55deb1SDavid E. O'Brien 		p = f->posns[i];
15092a55deb1SDavid E. O'Brien 		if ((k = tmpset[0]) != p[0])
15102a55deb1SDavid E. O'Brien 			goto different;
15112a55deb1SDavid E. O'Brien 		for (j = 1; j <= k; j++)
15122a55deb1SDavid E. O'Brien 			if (tmpset[j] != p[j])
15132a55deb1SDavid E. O'Brien 				goto different;
15142a55deb1SDavid E. O'Brien 		/* setvec is state i */
1515f39dd6a9SWarner Losh 		if (c != HAT)
1516*f32a6403SWarner Losh 			set_gototab(f, s, c, i);
15172a55deb1SDavid E. O'Brien 		return i;
15182a55deb1SDavid E. O'Brien 	  different:;
15192a55deb1SDavid E. O'Brien 	}
15202a55deb1SDavid E. O'Brien 
15212a55deb1SDavid E. O'Brien 	/* add tmpset to current set of states */
15222a55deb1SDavid E. O'Brien 	++(f->curstat);
1523f39dd6a9SWarner Losh 	resize_state(f, f->curstat);
1524*f32a6403SWarner Losh 	clear_gototab(f, f->curstat);
15252a55deb1SDavid E. O'Brien 	xfree(f->posns[f->curstat]);
1526f39dd6a9SWarner Losh 	p = intalloc(setcnt + 1, __func__);
15272a55deb1SDavid E. O'Brien 
15282a55deb1SDavid E. O'Brien 	f->posns[f->curstat] = p;
1529f39dd6a9SWarner Losh 	if (c != HAT)
1530*f32a6403SWarner Losh 		set_gototab(f, s, c, f->curstat);
15312a55deb1SDavid E. O'Brien 	for (i = 0; i <= setcnt; i++)
15322a55deb1SDavid E. O'Brien 		p[i] = tmpset[i];
15332a55deb1SDavid E. O'Brien 	if (setvec[f->accept])
15342a55deb1SDavid E. O'Brien 		f->out[f->curstat] = 1;
15352a55deb1SDavid E. O'Brien 	else
15362a55deb1SDavid E. O'Brien 		f->out[f->curstat] = 0;
15372a55deb1SDavid E. O'Brien 	return f->curstat;
15382a55deb1SDavid E. O'Brien }
15392a55deb1SDavid E. O'Brien 
15402a55deb1SDavid E. O'Brien 
15412a55deb1SDavid E. O'Brien void freefa(fa *f)	/* free a finite automaton */
15422a55deb1SDavid E. O'Brien {
15432a55deb1SDavid E. O'Brien 	int i;
15442a55deb1SDavid E. O'Brien 
15452a55deb1SDavid E. O'Brien 	if (f == NULL)
15462a55deb1SDavid E. O'Brien 		return;
1547f39dd6a9SWarner Losh 	for (i = 0; i < f->state_count; i++)
1548*f32a6403SWarner Losh 		xfree(f->gototab[i].entries);
1549*f32a6403SWarner Losh 	xfree(f->gototab);
15502a55deb1SDavid E. O'Brien 	for (i = 0; i <= f->curstat; i++)
15512a55deb1SDavid E. O'Brien 		xfree(f->posns[i]);
15522a55deb1SDavid E. O'Brien 	for (i = 0; i <= f->accept; i++) {
15532a55deb1SDavid E. O'Brien 		xfree(f->re[i].lfollow);
15542a55deb1SDavid E. O'Brien 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1555f39dd6a9SWarner Losh 			xfree(f->re[i].lval.np);
15562a55deb1SDavid E. O'Brien 	}
15572a55deb1SDavid E. O'Brien 	xfree(f->restr);
1558f39dd6a9SWarner Losh 	xfree(f->out);
1559f39dd6a9SWarner Losh 	xfree(f->posns);
1560f39dd6a9SWarner Losh 	xfree(f->gototab);
15612a55deb1SDavid E. O'Brien 	xfree(f);
15622a55deb1SDavid E. O'Brien }
1563