12a55deb1SDavid E. O'Brien /****************************************************************
22a55deb1SDavid E. O'Brien Copyright (C) Lucent Technologies 1997
32a55deb1SDavid E. O'Brien All Rights Reserved
42a55deb1SDavid E. O'Brien
52a55deb1SDavid E. O'Brien Permission to use, copy, modify, and distribute this software and
62a55deb1SDavid E. O'Brien its documentation for any purpose and without fee is hereby
72a55deb1SDavid E. O'Brien granted, provided that the above copyright notice appear in all
82a55deb1SDavid E. O'Brien copies and that both that the copyright notice and this
92a55deb1SDavid E. O'Brien permission notice and warranty disclaimer appear in supporting
102a55deb1SDavid E. O'Brien documentation, and that the name Lucent Technologies or any of
112a55deb1SDavid E. O'Brien its entities not be used in advertising or publicity pertaining
122a55deb1SDavid E. O'Brien to distribution of the software without specific, written prior
132a55deb1SDavid E. O'Brien permission.
142a55deb1SDavid E. O'Brien
152a55deb1SDavid E. O'Brien LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
162a55deb1SDavid E. O'Brien INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
172a55deb1SDavid E. O'Brien IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
182a55deb1SDavid E. O'Brien SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
192a55deb1SDavid E. O'Brien WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
202a55deb1SDavid E. O'Brien IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
212a55deb1SDavid E. O'Brien ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
222a55deb1SDavid E. O'Brien THIS SOFTWARE.
232a55deb1SDavid E. O'Brien ****************************************************************/
242a55deb1SDavid E. O'Brien
25addad6afSRong-En Fan /* lasciate ogne speranza, voi ch'intrate. */
262a55deb1SDavid E. O'Brien
272a55deb1SDavid E. O'Brien #define DEBUG
282a55deb1SDavid E. O'Brien
292a55deb1SDavid E. O'Brien #include <ctype.h>
30b5253557SWarner Losh #include <limits.h>
312a55deb1SDavid E. O'Brien #include <stdio.h>
322a55deb1SDavid E. O'Brien #include <string.h>
332a55deb1SDavid E. O'Brien #include <stdlib.h>
342a55deb1SDavid E. O'Brien #include "awk.h"
35f39dd6a9SWarner Losh #include "awkgram.tab.h"
362a55deb1SDavid E. O'Brien
372a55deb1SDavid E. O'Brien #define MAXLIN 22
382a55deb1SDavid E. O'Brien
392a55deb1SDavid E. O'Brien #define type(v) (v)->nobj /* badly overloaded here */
402a55deb1SDavid E. O'Brien #define info(v) (v)->ntype /* badly overloaded here */
412a55deb1SDavid E. O'Brien #define left(v) (v)->narg[0]
422a55deb1SDavid E. O'Brien #define right(v) (v)->narg[1]
432a55deb1SDavid E. O'Brien #define parent(v) (v)->nnext
442a55deb1SDavid E. O'Brien
452a55deb1SDavid E. O'Brien #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46addad6afSRong-En Fan #define ELEAF case EMPTYRE: /* empty string in regexp */
472a55deb1SDavid E. O'Brien #define UNARY case STAR: case PLUS: case QUEST:
482a55deb1SDavid E. O'Brien
492a55deb1SDavid E. O'Brien /* encoding in tree Nodes:
50addad6afSRong-En Fan leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
512a55deb1SDavid E. O'Brien left is index, right contains value or pointer to value
522a55deb1SDavid E. O'Brien unary (STAR, PLUS, QUEST): left is child, right is null
532a55deb1SDavid E. O'Brien binary (CAT, OR): left and right are children
542a55deb1SDavid E. O'Brien parent contains pointer to parent
552a55deb1SDavid E. O'Brien */
562a55deb1SDavid E. O'Brien
572a55deb1SDavid E. O'Brien
582a55deb1SDavid E. O'Brien int *setvec;
592a55deb1SDavid E. O'Brien int *tmpset;
602a55deb1SDavid E. O'Brien int maxsetvec = 0;
612a55deb1SDavid E. O'Brien
622a55deb1SDavid E. O'Brien int rtok; /* next token in current re */
632a55deb1SDavid E. O'Brien int rlxval;
64f39dd6a9SWarner Losh static const uschar *rlxstr;
65f39dd6a9SWarner Losh static const uschar *prestr; /* current position in current re */
66f39dd6a9SWarner Losh static const uschar *lastre; /* origin of last re */
67f39dd6a9SWarner Losh static const uschar *lastatom; /* origin of last Atom */
68f39dd6a9SWarner Losh static const uschar *starttok;
69f39dd6a9SWarner Losh static const uschar *basestr; /* starts with original, replaced during
70b5253557SWarner Losh repetition processing */
71f39dd6a9SWarner Losh static const uschar *firstbasestr;
722a55deb1SDavid E. O'Brien
732a55deb1SDavid E. O'Brien static int setcnt;
742a55deb1SDavid E. O'Brien static int poscnt;
752a55deb1SDavid E. O'Brien
76f39dd6a9SWarner Losh const char *patbeg;
772a55deb1SDavid E. O'Brien int patlen;
782a55deb1SDavid E. O'Brien
79f39dd6a9SWarner Losh #define NFA 128 /* cache this many dynamic fa's */
802a55deb1SDavid E. O'Brien fa *fatab[NFA];
812a55deb1SDavid E. O'Brien int nfatab = 0; /* entries in fatab */
822a55deb1SDavid E. O'Brien
83f32a6403SWarner Losh extern int u8_nextlen(const char *s);
84f32a6403SWarner Losh
85f32a6403SWarner Losh
86f32a6403SWarner Losh /* utf-8 mechanism:
87f32a6403SWarner Losh
88f32a6403SWarner Losh For most of Awk, utf-8 strings just "work", since they look like
89f32a6403SWarner Losh null-terminated sequences of 8-bit bytes.
90f32a6403SWarner Losh
91f32a6403SWarner Losh Functions like length(), index(), and substr() have to operate
92f32a6403SWarner Losh in units of utf-8 characters. The u8_* functions in run.c
93f32a6403SWarner Losh handle this.
94f32a6403SWarner Losh
95f32a6403SWarner Losh Regular expressions are more complicated, since the basic
96f32a6403SWarner Losh mechanism of the goto table used 8-bit byte indices into the
97f32a6403SWarner Losh gototab entries to compute the next state. Unicode is a lot
98f32a6403SWarner Losh bigger, so the gototab entries are now structs with a character
99f32a6403SWarner Losh and a next state. These are sorted by code point and binary
100f32a6403SWarner Losh searched.
101f32a6403SWarner Losh
102f32a6403SWarner Losh Throughout the RE mechanism in b.c, utf-8 characters are
103f32a6403SWarner Losh converted to their utf-32 value. This mostly shows up in
104f32a6403SWarner Losh cclenter, which expands character class ranges like a-z and now
105f32a6403SWarner Losh alpha-omega. The size of a gototab array is still about 256.
106f32a6403SWarner Losh This should be dynamic, but for now things work ok for a single
107f32a6403SWarner Losh code page of Unicode, which is the most likely case.
108f32a6403SWarner Losh
109f32a6403SWarner Losh The code changes are localized in run.c and b.c. I have added a
110f32a6403SWarner Losh handful of functions to somewhat better hide the implementation,
111f32a6403SWarner Losh but a lot more could be done.
112f32a6403SWarner Losh
113f32a6403SWarner Losh */
114f32a6403SWarner Losh
115f32a6403SWarner Losh static int entry_cmp(const void *l, const void *r);
116f32a6403SWarner Losh static int get_gototab(fa*, int, int);
117f32a6403SWarner Losh static int set_gototab(fa*, int, int, int);
118f32a6403SWarner Losh static void clear_gototab(fa*, int);
119f32a6403SWarner Losh extern int u8_rune(int *, const char *);
120f32a6403SWarner Losh
121f39dd6a9SWarner Losh static int *
intalloc(size_t n,const char * f)122f39dd6a9SWarner Losh intalloc(size_t n, const char *f)
123f39dd6a9SWarner Losh {
124f39dd6a9SWarner Losh int *p = (int *) calloc(n, sizeof(int));
125f39dd6a9SWarner Losh if (p == NULL)
126f39dd6a9SWarner Losh overflo(f);
127f39dd6a9SWarner Losh return p;
128f39dd6a9SWarner Losh }
129f39dd6a9SWarner Losh
130f39dd6a9SWarner Losh static void
resizesetvec(const char * f)131f39dd6a9SWarner Losh resizesetvec(const char *f)
132f39dd6a9SWarner Losh {
133f39dd6a9SWarner Losh if (maxsetvec == 0)
134f39dd6a9SWarner Losh maxsetvec = MAXLIN;
135f39dd6a9SWarner Losh else
136f39dd6a9SWarner Losh maxsetvec *= 4;
137f39dd6a9SWarner Losh setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
138f39dd6a9SWarner Losh tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
139f39dd6a9SWarner Losh if (setvec == NULL || tmpset == NULL)
140f39dd6a9SWarner Losh overflo(f);
141f39dd6a9SWarner Losh }
142f39dd6a9SWarner Losh
143f39dd6a9SWarner Losh static void
resize_state(fa * f,int state)144f39dd6a9SWarner Losh resize_state(fa *f, int state)
145f39dd6a9SWarner Losh {
146f32a6403SWarner Losh gtt *p;
147f39dd6a9SWarner Losh uschar *p2;
148f39dd6a9SWarner Losh int **p3;
149f39dd6a9SWarner Losh int i, new_count;
150f39dd6a9SWarner Losh
151f39dd6a9SWarner Losh if (++state < f->state_count)
152f39dd6a9SWarner Losh return;
153f39dd6a9SWarner Losh
154f39dd6a9SWarner Losh new_count = state + 10; /* needs to be tuned */
155f39dd6a9SWarner Losh
156f32a6403SWarner Losh p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
157f39dd6a9SWarner Losh if (p == NULL)
158f39dd6a9SWarner Losh goto out;
159f39dd6a9SWarner Losh f->gototab = p;
160f39dd6a9SWarner Losh
161f39dd6a9SWarner Losh p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
162f39dd6a9SWarner Losh if (p2 == NULL)
163f39dd6a9SWarner Losh goto out;
164f39dd6a9SWarner Losh f->out = p2;
165f39dd6a9SWarner Losh
166f39dd6a9SWarner Losh p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
167f39dd6a9SWarner Losh if (p3 == NULL)
168f39dd6a9SWarner Losh goto out;
169f39dd6a9SWarner Losh f->posns = p3;
170f39dd6a9SWarner Losh
171f39dd6a9SWarner Losh for (i = f->state_count; i < new_count; ++i) {
172f32a6403SWarner Losh f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
173f32a6403SWarner Losh if (f->gototab[i].entries == NULL)
174f39dd6a9SWarner Losh goto out;
175f32a6403SWarner Losh f->gototab[i].allocated = NCHARS;
176f32a6403SWarner Losh f->gototab[i].inuse = 0;
177f39dd6a9SWarner Losh f->out[i] = 0;
178f39dd6a9SWarner Losh f->posns[i] = NULL;
179f39dd6a9SWarner Losh }
180f39dd6a9SWarner Losh f->state_count = new_count;
181f39dd6a9SWarner Losh return;
182f39dd6a9SWarner Losh out:
183f39dd6a9SWarner Losh overflo(__func__);
184f39dd6a9SWarner Losh }
185f39dd6a9SWarner Losh
makedfa(const char * s,bool anchor)186f39dd6a9SWarner Losh fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
1872a55deb1SDavid E. O'Brien {
1882a55deb1SDavid E. O'Brien int i, use, nuse;
1892a55deb1SDavid E. O'Brien fa *pfa;
1902a55deb1SDavid E. O'Brien static int now = 1;
1912a55deb1SDavid E. O'Brien
19210ce5b99SWarner Losh if (setvec == NULL) { /* first time through any RE */
193f39dd6a9SWarner Losh resizesetvec(__func__);
1942a55deb1SDavid E. O'Brien }
1952a55deb1SDavid E. O'Brien
196f39dd6a9SWarner Losh if (compile_time != RUNNING) /* a constant for sure */
1972a55deb1SDavid E. O'Brien return mkdfa(s, anchor);
1982a55deb1SDavid E. O'Brien for (i = 0; i < nfatab; i++) /* is it there already? */
1992a55deb1SDavid E. O'Brien if (fatab[i]->anchor == anchor
200007c6572SDag-Erling Smørgrav && strcmp((const char *) fatab[i]->restr, s) == 0) {
2012a55deb1SDavid E. O'Brien fatab[i]->use = now++;
2022a55deb1SDavid E. O'Brien return fatab[i];
2032a55deb1SDavid E. O'Brien }
2042a55deb1SDavid E. O'Brien pfa = mkdfa(s, anchor);
2052a55deb1SDavid E. O'Brien if (nfatab < NFA) { /* room for another */
2062a55deb1SDavid E. O'Brien fatab[nfatab] = pfa;
2072a55deb1SDavid E. O'Brien fatab[nfatab]->use = now++;
2082a55deb1SDavid E. O'Brien nfatab++;
2092a55deb1SDavid E. O'Brien return pfa;
2102a55deb1SDavid E. O'Brien }
2112a55deb1SDavid E. O'Brien use = fatab[0]->use; /* replace least-recently used */
2122a55deb1SDavid E. O'Brien nuse = 0;
2132a55deb1SDavid E. O'Brien for (i = 1; i < nfatab; i++)
2142a55deb1SDavid E. O'Brien if (fatab[i]->use < use) {
2152a55deb1SDavid E. O'Brien use = fatab[i]->use;
2162a55deb1SDavid E. O'Brien nuse = i;
2172a55deb1SDavid E. O'Brien }
2182a55deb1SDavid E. O'Brien freefa(fatab[nuse]);
2192a55deb1SDavid E. O'Brien fatab[nuse] = pfa;
2202a55deb1SDavid E. O'Brien pfa->use = now++;
2212a55deb1SDavid E. O'Brien return pfa;
2222a55deb1SDavid E. O'Brien }
2232a55deb1SDavid E. O'Brien
mkdfa(const char * s,bool anchor)224f39dd6a9SWarner Losh fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
225f39dd6a9SWarner Losh /* anchor = true for anchored matches, else false */
2262a55deb1SDavid E. O'Brien {
2272a55deb1SDavid E. O'Brien Node *p, *p1;
2282a55deb1SDavid E. O'Brien fa *f;
2292a55deb1SDavid E. O'Brien
230f39dd6a9SWarner Losh firstbasestr = (const uschar *) s;
231b5253557SWarner Losh basestr = firstbasestr;
2322a55deb1SDavid E. O'Brien p = reparse(s);
2332a55deb1SDavid E. O'Brien p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
2342a55deb1SDavid E. O'Brien /* put ALL STAR in front of reg. exp. */
2352a55deb1SDavid E. O'Brien p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
2362a55deb1SDavid E. O'Brien /* put FINAL after reg. exp. */
2372a55deb1SDavid E. O'Brien
2382a55deb1SDavid E. O'Brien poscnt = 0;
2392a55deb1SDavid E. O'Brien penter(p1); /* enter parent pointers and leaf indices */
2402a55deb1SDavid E. O'Brien if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
241f39dd6a9SWarner Losh overflo(__func__);
2422a55deb1SDavid E. O'Brien f->accept = poscnt-1; /* penter has computed number of positions in re */
2432a55deb1SDavid E. O'Brien cfoll(f, p1); /* set up follow sets */
2442a55deb1SDavid E. O'Brien freetr(p1);
245f39dd6a9SWarner Losh resize_state(f, 1);
246f39dd6a9SWarner Losh f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
247f39dd6a9SWarner Losh f->posns[1] = intalloc(1, __func__);
2482a55deb1SDavid E. O'Brien *f->posns[1] = 0;
2492a55deb1SDavid E. O'Brien f->initstat = makeinit(f, anchor);
2502a55deb1SDavid E. O'Brien f->anchor = anchor;
2512a55deb1SDavid E. O'Brien f->restr = (uschar *) tostring(s);
252b5253557SWarner Losh if (firstbasestr != basestr) {
253b5253557SWarner Losh if (basestr)
254b5253557SWarner Losh xfree(basestr);
255b5253557SWarner Losh }
2562a55deb1SDavid E. O'Brien return f;
2572a55deb1SDavid E. O'Brien }
2582a55deb1SDavid E. O'Brien
makeinit(fa * f,bool anchor)259f39dd6a9SWarner Losh int makeinit(fa *f, bool anchor)
2602a55deb1SDavid E. O'Brien {
2612a55deb1SDavid E. O'Brien int i, k;
2622a55deb1SDavid E. O'Brien
2632a55deb1SDavid E. O'Brien f->curstat = 2;
2642a55deb1SDavid E. O'Brien f->out[2] = 0;
2652a55deb1SDavid E. O'Brien k = *(f->re[0].lfollow);
2662a55deb1SDavid E. O'Brien xfree(f->posns[2]);
267f39dd6a9SWarner Losh f->posns[2] = intalloc(k + 1, __func__);
2682a55deb1SDavid E. O'Brien for (i = 0; i <= k; i++) {
2692a55deb1SDavid E. O'Brien (f->posns[2])[i] = (f->re[0].lfollow)[i];
2702a55deb1SDavid E. O'Brien }
2712a55deb1SDavid E. O'Brien if ((f->posns[2])[1] == f->accept)
2722a55deb1SDavid E. O'Brien f->out[2] = 1;
273f32a6403SWarner Losh clear_gototab(f, 2);
2742a55deb1SDavid E. O'Brien f->curstat = cgoto(f, 2, HAT);
2752a55deb1SDavid E. O'Brien if (anchor) {
2762a55deb1SDavid E. O'Brien *f->posns[2] = k-1; /* leave out position 0 */
2772a55deb1SDavid E. O'Brien for (i = 0; i < k; i++) {
2782a55deb1SDavid E. O'Brien (f->posns[0])[i] = (f->posns[2])[i];
2792a55deb1SDavid E. O'Brien }
2802a55deb1SDavid E. O'Brien
2812a55deb1SDavid E. O'Brien f->out[0] = f->out[2];
2822a55deb1SDavid E. O'Brien if (f->curstat != 2)
2832a55deb1SDavid E. O'Brien --(*f->posns[f->curstat]);
2842a55deb1SDavid E. O'Brien }
2852a55deb1SDavid E. O'Brien return f->curstat;
2862a55deb1SDavid E. O'Brien }
2872a55deb1SDavid E. O'Brien
penter(Node * p)2882a55deb1SDavid E. O'Brien void penter(Node *p) /* set up parent pointers and leaf indices */
2892a55deb1SDavid E. O'Brien {
2902a55deb1SDavid E. O'Brien switch (type(p)) {
291addad6afSRong-En Fan ELEAF
2922a55deb1SDavid E. O'Brien LEAF
2932a55deb1SDavid E. O'Brien info(p) = poscnt;
2942a55deb1SDavid E. O'Brien poscnt++;
2952a55deb1SDavid E. O'Brien break;
2962a55deb1SDavid E. O'Brien UNARY
2972a55deb1SDavid E. O'Brien penter(left(p));
2982a55deb1SDavid E. O'Brien parent(left(p)) = p;
2992a55deb1SDavid E. O'Brien break;
3002a55deb1SDavid E. O'Brien case CAT:
3012a55deb1SDavid E. O'Brien case OR:
3022a55deb1SDavid E. O'Brien penter(left(p));
3032a55deb1SDavid E. O'Brien penter(right(p));
3042a55deb1SDavid E. O'Brien parent(left(p)) = p;
3052a55deb1SDavid E. O'Brien parent(right(p)) = p;
3062a55deb1SDavid E. O'Brien break;
307f39dd6a9SWarner Losh case ZERO:
308f39dd6a9SWarner Losh break;
3092a55deb1SDavid E. O'Brien default: /* can't happen */
3102a55deb1SDavid E. O'Brien FATAL("can't happen: unknown type %d in penter", type(p));
3112a55deb1SDavid E. O'Brien break;
3122a55deb1SDavid E. O'Brien }
3132a55deb1SDavid E. O'Brien }
3142a55deb1SDavid E. O'Brien
freetr(Node * p)3152a55deb1SDavid E. O'Brien void freetr(Node *p) /* free parse tree */
3162a55deb1SDavid E. O'Brien {
3172a55deb1SDavid E. O'Brien switch (type(p)) {
318addad6afSRong-En Fan ELEAF
3192a55deb1SDavid E. O'Brien LEAF
3202a55deb1SDavid E. O'Brien xfree(p);
3212a55deb1SDavid E. O'Brien break;
3222a55deb1SDavid E. O'Brien UNARY
323f39dd6a9SWarner Losh case ZERO:
3242a55deb1SDavid E. O'Brien freetr(left(p));
3252a55deb1SDavid E. O'Brien xfree(p);
3262a55deb1SDavid E. O'Brien break;
3272a55deb1SDavid E. O'Brien case CAT:
3282a55deb1SDavid E. O'Brien case OR:
3292a55deb1SDavid E. O'Brien freetr(left(p));
3302a55deb1SDavid E. O'Brien freetr(right(p));
3312a55deb1SDavid E. O'Brien xfree(p);
3322a55deb1SDavid E. O'Brien break;
3332a55deb1SDavid E. O'Brien default: /* can't happen */
3342a55deb1SDavid E. O'Brien FATAL("can't happen: unknown type %d in freetr", type(p));
3352a55deb1SDavid E. O'Brien break;
3362a55deb1SDavid E. O'Brien }
3372a55deb1SDavid E. O'Brien }
3382a55deb1SDavid E. O'Brien
3392a55deb1SDavid E. O'Brien /* in the parsing of regular expressions, metacharacters like . have */
3402a55deb1SDavid E. O'Brien /* to be seen literally; \056 is not a metacharacter. */
3412a55deb1SDavid E. O'Brien
hexstr(const uschar ** pp,int max)342f32a6403SWarner Losh int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
3432a55deb1SDavid E. O'Brien { /* only pick up one 8-bit byte (2 chars) */
344f39dd6a9SWarner Losh const uschar *p;
3452a55deb1SDavid E. O'Brien int n = 0;
3462a55deb1SDavid E. O'Brien int i;
3472a55deb1SDavid E. O'Brien
348f32a6403SWarner Losh for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
349f32a6403SWarner Losh if (isdigit((int) *p))
3502a55deb1SDavid E. O'Brien n = 16 * n + *p - '0';
3512a55deb1SDavid E. O'Brien else if (*p >= 'a' && *p <= 'f')
3522a55deb1SDavid E. O'Brien n = 16 * n + *p - 'a' + 10;
3532a55deb1SDavid E. O'Brien else if (*p >= 'A' && *p <= 'F')
3542a55deb1SDavid E. O'Brien n = 16 * n + *p - 'A' + 10;
3552a55deb1SDavid E. O'Brien }
356f39dd6a9SWarner Losh *pp = p;
3572a55deb1SDavid E. O'Brien return n;
3582a55deb1SDavid E. O'Brien }
3592a55deb1SDavid E. O'Brien
360f32a6403SWarner Losh
361f32a6403SWarner Losh
3622a55deb1SDavid E. O'Brien #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
3632a55deb1SDavid E. O'Brien
quoted(const uschar ** pp)364f39dd6a9SWarner Losh int quoted(const uschar **pp) /* pick up next thing after a \\ */
3652a55deb1SDavid E. O'Brien /* and increment *pp */
3662a55deb1SDavid E. O'Brien {
367f39dd6a9SWarner Losh const uschar *p = *pp;
3682a55deb1SDavid E. O'Brien int c;
3692a55deb1SDavid E. O'Brien
370f32a6403SWarner Losh /* BUG: should advance by utf-8 char even if makes no sense */
371f32a6403SWarner Losh
372*17853db4SWarner Losh switch ((c = *p++)) {
373*17853db4SWarner Losh case 't':
3742a55deb1SDavid E. O'Brien c = '\t';
375*17853db4SWarner Losh break;
376*17853db4SWarner Losh case 'n':
3772a55deb1SDavid E. O'Brien c = '\n';
378*17853db4SWarner Losh break;
379*17853db4SWarner Losh case 'f':
3802a55deb1SDavid E. O'Brien c = '\f';
381*17853db4SWarner Losh break;
382*17853db4SWarner Losh case 'r':
3832a55deb1SDavid E. O'Brien c = '\r';
384*17853db4SWarner Losh break;
385*17853db4SWarner Losh case 'b':
3862a55deb1SDavid E. O'Brien c = '\b';
387*17853db4SWarner Losh break;
388*17853db4SWarner Losh case 'v':
389f39dd6a9SWarner Losh c = '\v';
390*17853db4SWarner Losh break;
391*17853db4SWarner Losh case 'a':
392f39dd6a9SWarner Losh c = '\a';
393*17853db4SWarner Losh break;
394*17853db4SWarner Losh case '\\':
3952a55deb1SDavid E. O'Brien c = '\\';
396*17853db4SWarner Losh break;
397*17853db4SWarner Losh case 'x': /* 2 hex digits follow */
398f32a6403SWarner Losh c = hexstr(&p, 2); /* this adds a null if number is invalid */
399*17853db4SWarner Losh break;
400*17853db4SWarner Losh case 'u': /* unicode char number up to 8 hex digits */
401f32a6403SWarner Losh c = hexstr(&p, 8);
402*17853db4SWarner Losh break;
403*17853db4SWarner Losh default:
404*17853db4SWarner Losh if (isoctdigit(c)) { /* \d \dd \ddd */
4052a55deb1SDavid E. O'Brien int n = c - '0';
4062a55deb1SDavid E. O'Brien if (isoctdigit(*p)) {
4072a55deb1SDavid E. O'Brien n = 8 * n + *p++ - '0';
4082a55deb1SDavid E. O'Brien if (isoctdigit(*p))
4092a55deb1SDavid E. O'Brien n = 8 * n + *p++ - '0';
4102a55deb1SDavid E. O'Brien }
4112a55deb1SDavid E. O'Brien c = n;
412*17853db4SWarner Losh }
413*17853db4SWarner Losh }
414*17853db4SWarner Losh
4152a55deb1SDavid E. O'Brien *pp = p;
4162a55deb1SDavid E. O'Brien return c;
4172a55deb1SDavid E. O'Brien }
4182a55deb1SDavid E. O'Brien
cclenter(const char * argp)419f32a6403SWarner Losh int *cclenter(const char *argp) /* add a character class */
4202a55deb1SDavid E. O'Brien {
421628bd30aSWarner Losh int i, c, c2;
422f32a6403SWarner Losh int n;
423f32a6403SWarner Losh const uschar *p = (const uschar *) argp;
424f32a6403SWarner Losh int *bp, *retp;
425f32a6403SWarner Losh static int *buf = NULL;
4262a55deb1SDavid E. O'Brien static int bufsz = 100;
4272a55deb1SDavid E. O'Brien
428f32a6403SWarner Losh if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
4292a55deb1SDavid E. O'Brien FATAL("out of space for character class [%.10s...] 1", p);
4302a55deb1SDavid E. O'Brien bp = buf;
431f32a6403SWarner Losh for (i = 0; *p != 0; ) {
432f32a6403SWarner Losh n = u8_rune(&c, (const char *) p);
433f32a6403SWarner Losh p += n;
4342a55deb1SDavid E. O'Brien if (c == '\\') {
435d86a0988SRuslan Ermilov c = quoted(&p);
4362a55deb1SDavid E. O'Brien } else if (c == '-' && i > 0 && bp[-1] != 0) {
4372a55deb1SDavid E. O'Brien if (*p != 0) {
4382a55deb1SDavid E. O'Brien c = bp[-1];
439f32a6403SWarner Losh /* c2 = *p++; */
440f32a6403SWarner Losh n = u8_rune(&c2, (const char *) p);
441f32a6403SWarner Losh p += n;
4422a55deb1SDavid E. O'Brien if (c2 == '\\')
443f32a6403SWarner Losh c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
444628bd30aSWarner Losh if (c > c2) { /* empty; ignore */
4452a55deb1SDavid E. O'Brien bp--;
4462a55deb1SDavid E. O'Brien i--;
4472a55deb1SDavid E. O'Brien continue;
4482a55deb1SDavid E. O'Brien }
449628bd30aSWarner Losh while (c < c2) {
450f32a6403SWarner Losh if (i >= bufsz) {
451f32a6403SWarner Losh bufsz *= 2;
452f32a6403SWarner Losh buf = (int *) realloc(buf, bufsz * sizeof(int));
453f32a6403SWarner Losh if (buf == NULL)
4542a55deb1SDavid E. O'Brien FATAL("out of space for character class [%.10s...] 2", p);
455f32a6403SWarner Losh bp = buf + i;
456f32a6403SWarner Losh }
457628bd30aSWarner Losh *bp++ = ++c;
4582a55deb1SDavid E. O'Brien i++;
4592a55deb1SDavid E. O'Brien }
4602a55deb1SDavid E. O'Brien continue;
4612a55deb1SDavid E. O'Brien }
4622a55deb1SDavid E. O'Brien }
463f32a6403SWarner Losh if (i >= bufsz) {
464f32a6403SWarner Losh bufsz *= 2;
465f32a6403SWarner Losh buf = (int *) realloc(buf, bufsz * sizeof(int));
466f32a6403SWarner Losh if (buf == NULL)
467f32a6403SWarner Losh FATAL("out of space for character class [%.10s...] 2", p);
468f32a6403SWarner Losh bp = buf + i;
469f32a6403SWarner Losh }
4702a55deb1SDavid E. O'Brien *bp++ = c;
4712a55deb1SDavid E. O'Brien i++;
4722a55deb1SDavid E. O'Brien }
4732a55deb1SDavid E. O'Brien *bp = 0;
474f32a6403SWarner Losh /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
475f32a6403SWarner Losh /* xfree(op); BUG: what are we freeing here? */
476f32a6403SWarner Losh retp = (int *) calloc(bp-buf+1, sizeof(int));
477f32a6403SWarner Losh for (i = 0; i < bp-buf+1; i++)
478f32a6403SWarner Losh retp[i] = buf[i];
479f32a6403SWarner Losh return retp;
4802a55deb1SDavid E. O'Brien }
4812a55deb1SDavid E. O'Brien
overflo(const char * s)482813da98dSDavid E. O'Brien void overflo(const char *s)
4832a55deb1SDavid E. O'Brien {
484f39dd6a9SWarner Losh FATAL("regular expression too big: out of space in %.30s...", s);
4852a55deb1SDavid E. O'Brien }
4862a55deb1SDavid E. O'Brien
cfoll(fa * f,Node * v)4872a55deb1SDavid E. O'Brien void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
4882a55deb1SDavid E. O'Brien {
4892a55deb1SDavid E. O'Brien int i;
4902a55deb1SDavid E. O'Brien int *p;
4912a55deb1SDavid E. O'Brien
4922a55deb1SDavid E. O'Brien switch (type(v)) {
493addad6afSRong-En Fan ELEAF
4942a55deb1SDavid E. O'Brien LEAF
4952a55deb1SDavid E. O'Brien f->re[info(v)].ltype = type(v);
4962a55deb1SDavid E. O'Brien f->re[info(v)].lval.np = right(v);
4972a55deb1SDavid E. O'Brien while (f->accept >= maxsetvec) { /* guessing here! */
498f39dd6a9SWarner Losh resizesetvec(__func__);
4992a55deb1SDavid E. O'Brien }
5002a55deb1SDavid E. O'Brien for (i = 0; i <= f->accept; i++)
5012a55deb1SDavid E. O'Brien setvec[i] = 0;
5022a55deb1SDavid E. O'Brien setcnt = 0;
5032a55deb1SDavid E. O'Brien follow(v); /* computes setvec and setcnt */
504f39dd6a9SWarner Losh p = intalloc(setcnt + 1, __func__);
5052a55deb1SDavid E. O'Brien f->re[info(v)].lfollow = p;
5062a55deb1SDavid E. O'Brien *p = setcnt;
5072a55deb1SDavid E. O'Brien for (i = f->accept; i >= 0; i--)
5082a55deb1SDavid E. O'Brien if (setvec[i] == 1)
5092a55deb1SDavid E. O'Brien *++p = i;
5102a55deb1SDavid E. O'Brien break;
5112a55deb1SDavid E. O'Brien UNARY
5122a55deb1SDavid E. O'Brien cfoll(f,left(v));
5132a55deb1SDavid E. O'Brien break;
5142a55deb1SDavid E. O'Brien case CAT:
5152a55deb1SDavid E. O'Brien case OR:
5162a55deb1SDavid E. O'Brien cfoll(f,left(v));
5172a55deb1SDavid E. O'Brien cfoll(f,right(v));
5182a55deb1SDavid E. O'Brien break;
519f39dd6a9SWarner Losh case ZERO:
520f39dd6a9SWarner Losh break;
5212a55deb1SDavid E. O'Brien default: /* can't happen */
5222a55deb1SDavid E. O'Brien FATAL("can't happen: unknown type %d in cfoll", type(v));
5232a55deb1SDavid E. O'Brien }
5242a55deb1SDavid E. O'Brien }
5252a55deb1SDavid E. O'Brien
first(Node * p)5262a55deb1SDavid E. O'Brien int first(Node *p) /* collects initially active leaves of p into setvec */
527addad6afSRong-En Fan /* returns 0 if p matches empty string */
5282a55deb1SDavid E. O'Brien {
5292a55deb1SDavid E. O'Brien int b, lp;
5302a55deb1SDavid E. O'Brien
5312a55deb1SDavid E. O'Brien switch (type(p)) {
532addad6afSRong-En Fan ELEAF
5332a55deb1SDavid E. O'Brien LEAF
5342a55deb1SDavid E. O'Brien lp = info(p); /* look for high-water mark of subscripts */
5352a55deb1SDavid E. O'Brien while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
536f39dd6a9SWarner Losh resizesetvec(__func__);
5372a55deb1SDavid E. O'Brien }
538addad6afSRong-En Fan if (type(p) == EMPTYRE) {
539addad6afSRong-En Fan setvec[lp] = 0;
540addad6afSRong-En Fan return(0);
541addad6afSRong-En Fan }
5422a55deb1SDavid E. O'Brien if (setvec[lp] != 1) {
5432a55deb1SDavid E. O'Brien setvec[lp] = 1;
5442a55deb1SDavid E. O'Brien setcnt++;
5452a55deb1SDavid E. O'Brien }
546f32a6403SWarner Losh if (type(p) == CCL && (*(int *) right(p)) == 0)
5472a55deb1SDavid E. O'Brien return(0); /* empty CCL */
548f39dd6a9SWarner Losh return(1);
5492a55deb1SDavid E. O'Brien case PLUS:
550f39dd6a9SWarner Losh if (first(left(p)) == 0)
551f39dd6a9SWarner Losh return(0);
5522a55deb1SDavid E. O'Brien return(1);
5532a55deb1SDavid E. O'Brien case STAR:
5542a55deb1SDavid E. O'Brien case QUEST:
5552a55deb1SDavid E. O'Brien first(left(p));
5562a55deb1SDavid E. O'Brien return(0);
5572a55deb1SDavid E. O'Brien case CAT:
5582a55deb1SDavid E. O'Brien if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
5592a55deb1SDavid E. O'Brien return(1);
5602a55deb1SDavid E. O'Brien case OR:
5612a55deb1SDavid E. O'Brien b = first(right(p));
5622a55deb1SDavid E. O'Brien if (first(left(p)) == 0 || b == 0) return(0);
5632a55deb1SDavid E. O'Brien return(1);
564f39dd6a9SWarner Losh case ZERO:
565f39dd6a9SWarner Losh return 0;
5662a55deb1SDavid E. O'Brien }
5672a55deb1SDavid E. O'Brien FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
5682a55deb1SDavid E. O'Brien return(-1);
5692a55deb1SDavid E. O'Brien }
5702a55deb1SDavid E. O'Brien
follow(Node * v)5712a55deb1SDavid E. O'Brien void follow(Node *v) /* collects leaves that can follow v into setvec */
5722a55deb1SDavid E. O'Brien {
5732a55deb1SDavid E. O'Brien Node *p;
5742a55deb1SDavid E. O'Brien
5752a55deb1SDavid E. O'Brien if (type(v) == FINAL)
5762a55deb1SDavid E. O'Brien return;
5772a55deb1SDavid E. O'Brien p = parent(v);
5782a55deb1SDavid E. O'Brien switch (type(p)) {
5792a55deb1SDavid E. O'Brien case STAR:
5802a55deb1SDavid E. O'Brien case PLUS:
5812a55deb1SDavid E. O'Brien first(v);
5822a55deb1SDavid E. O'Brien follow(p);
5832a55deb1SDavid E. O'Brien return;
5842a55deb1SDavid E. O'Brien
5852a55deb1SDavid E. O'Brien case OR:
5862a55deb1SDavid E. O'Brien case QUEST:
5872a55deb1SDavid E. O'Brien follow(p);
5882a55deb1SDavid E. O'Brien return;
5892a55deb1SDavid E. O'Brien
5902a55deb1SDavid E. O'Brien case CAT:
5912a55deb1SDavid E. O'Brien if (v == left(p)) { /* v is left child of p */
5922a55deb1SDavid E. O'Brien if (first(right(p)) == 0) {
5932a55deb1SDavid E. O'Brien follow(p);
5942a55deb1SDavid E. O'Brien return;
5952a55deb1SDavid E. O'Brien }
5962a55deb1SDavid E. O'Brien } else /* v is right child */
5972a55deb1SDavid E. O'Brien follow(p);
5982a55deb1SDavid E. O'Brien return;
5992a55deb1SDavid E. O'Brien }
6002a55deb1SDavid E. O'Brien }
6012a55deb1SDavid E. O'Brien
member(int c,int * sarg)602f32a6403SWarner Losh int member(int c, int *sarg) /* is c in s? */
6032a55deb1SDavid E. O'Brien {
604f32a6403SWarner Losh int *s = (int *) sarg;
6052a55deb1SDavid E. O'Brien
6062a55deb1SDavid E. O'Brien while (*s)
6072a55deb1SDavid E. O'Brien if (c == *s++)
6082a55deb1SDavid E. O'Brien return(1);
6092a55deb1SDavid E. O'Brien return(0);
6102a55deb1SDavid E. O'Brien }
6112a55deb1SDavid E. O'Brien
resize_gototab(fa * f,int state)612f32a6403SWarner Losh static void resize_gototab(fa *f, int state)
613f32a6403SWarner Losh {
614f32a6403SWarner Losh size_t new_size = f->gototab[state].allocated * 2;
615f32a6403SWarner Losh gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
616f32a6403SWarner Losh if (p == NULL)
617f32a6403SWarner Losh overflo(__func__);
618f32a6403SWarner Losh
619f32a6403SWarner Losh // need to initialized the new memory to zero
620f32a6403SWarner Losh size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size
621f32a6403SWarner Losh memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out
622f32a6403SWarner Losh
623f32a6403SWarner Losh f->gototab[state].allocated = new_size; // update gototab info
624f32a6403SWarner Losh f->gototab[state].entries = p;
625f32a6403SWarner Losh }
626f32a6403SWarner Losh
get_gototab(fa * f,int state,int ch)627f32a6403SWarner Losh static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
628f32a6403SWarner Losh {
629f32a6403SWarner Losh gtte key;
630f32a6403SWarner Losh gtte *item;
631f32a6403SWarner Losh
632f32a6403SWarner Losh key.ch = ch;
633f32a6403SWarner Losh key.state = 0; /* irrelevant */
634f32a6403SWarner Losh item = (gtte *) bsearch(& key, f->gototab[state].entries,
635f32a6403SWarner Losh f->gototab[state].inuse, sizeof(gtte),
636f32a6403SWarner Losh entry_cmp);
637f32a6403SWarner Losh
638f32a6403SWarner Losh if (item == NULL)
639f32a6403SWarner Losh return 0;
640f32a6403SWarner Losh else
641f32a6403SWarner Losh return item->state;
642f32a6403SWarner Losh }
643f32a6403SWarner Losh
entry_cmp(const void * l,const void * r)644f32a6403SWarner Losh static int entry_cmp(const void *l, const void *r)
645f32a6403SWarner Losh {
646f32a6403SWarner Losh const gtte *left, *right;
647f32a6403SWarner Losh
648f32a6403SWarner Losh left = (const gtte *) l;
649f32a6403SWarner Losh right = (const gtte *) r;
650f32a6403SWarner Losh
651f32a6403SWarner Losh return left->ch - right->ch;
652f32a6403SWarner Losh }
653f32a6403SWarner Losh
set_gototab(fa * f,int state,int ch,int val)654f32a6403SWarner Losh static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
655f32a6403SWarner Losh {
656f32a6403SWarner Losh if (f->gototab[state].inuse == 0) {
657f32a6403SWarner Losh f->gototab[state].entries[0].ch = ch;
658f32a6403SWarner Losh f->gototab[state].entries[0].state = val;
659f32a6403SWarner Losh f->gototab[state].inuse++;
660f32a6403SWarner Losh return val;
661*17853db4SWarner Losh } else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
662f32a6403SWarner Losh // not seen yet, insert and return
663f32a6403SWarner Losh gtt *tab = & f->gototab[state];
664f32a6403SWarner Losh if (tab->inuse + 1 >= tab->allocated)
665f32a6403SWarner Losh resize_gototab(f, state);
666f32a6403SWarner Losh
6671023317aSWarner Losh f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
6681023317aSWarner Losh f->gototab[state].entries[f->gototab[state].inuse].state = val;
669f32a6403SWarner Losh f->gototab[state].inuse++;
670f32a6403SWarner Losh return val;
671f32a6403SWarner Losh } else {
672f32a6403SWarner Losh // maybe we have it, maybe we don't
673f32a6403SWarner Losh gtte key;
674f32a6403SWarner Losh gtte *item;
675f32a6403SWarner Losh
676f32a6403SWarner Losh key.ch = ch;
677f32a6403SWarner Losh key.state = 0; /* irrelevant */
678f32a6403SWarner Losh item = (gtte *) bsearch(& key, f->gototab[state].entries,
679f32a6403SWarner Losh f->gototab[state].inuse, sizeof(gtte),
680f32a6403SWarner Losh entry_cmp);
681f32a6403SWarner Losh
682f32a6403SWarner Losh if (item != NULL) {
683f32a6403SWarner Losh // we have it, update state and return
684f32a6403SWarner Losh item->state = val;
685f32a6403SWarner Losh return item->state;
686f32a6403SWarner Losh }
687f32a6403SWarner Losh // otherwise, fall through to insert and reallocate.
688f32a6403SWarner Losh }
689f32a6403SWarner Losh
690f32a6403SWarner Losh gtt *tab = & f->gototab[state];
691f32a6403SWarner Losh if (tab->inuse + 1 >= tab->allocated)
692f32a6403SWarner Losh resize_gototab(f, state);
693f32a6403SWarner Losh f->gototab[state].entries[tab->inuse].ch = ch;
694f32a6403SWarner Losh f->gototab[state].entries[tab->inuse].state = val;
6951023317aSWarner Losh ++tab->inuse;
696f32a6403SWarner Losh
697f32a6403SWarner Losh qsort(f->gototab[state].entries,
698f32a6403SWarner Losh f->gototab[state].inuse, sizeof(gtte), entry_cmp);
699f32a6403SWarner Losh
700f32a6403SWarner Losh return val; /* not used anywhere at the moment */
701f32a6403SWarner Losh }
702f32a6403SWarner Losh
clear_gototab(fa * f,int state)703f32a6403SWarner Losh static void clear_gototab(fa *f, int state)
704f32a6403SWarner Losh {
705f32a6403SWarner Losh memset(f->gototab[state].entries, 0,
706f32a6403SWarner Losh f->gototab[state].allocated * sizeof(gtte));
707f32a6403SWarner Losh f->gototab[state].inuse = 0;
708f32a6403SWarner Losh }
709f32a6403SWarner Losh
match(fa * f,const char * p0)710813da98dSDavid E. O'Brien int match(fa *f, const char *p0) /* shortest match ? */
7112a55deb1SDavid E. O'Brien {
7122a55deb1SDavid E. O'Brien int s, ns;
713f32a6403SWarner Losh int n;
714f32a6403SWarner Losh int rune;
715f39dd6a9SWarner Losh const uschar *p = (const uschar *) p0;
7162a55deb1SDavid E. O'Brien
717f32a6403SWarner Losh /* return pmatch(f, p0); does it matter whether longest or shortest? */
718f32a6403SWarner Losh
719f39dd6a9SWarner Losh s = f->initstat;
720f39dd6a9SWarner Losh assert (s < f->state_count);
721f39dd6a9SWarner Losh
7222a55deb1SDavid E. O'Brien if (f->out[s])
7232a55deb1SDavid E. O'Brien return(1);
7242a55deb1SDavid E. O'Brien do {
725addad6afSRong-En Fan /* assert(*p < NCHARS); */
726f32a6403SWarner Losh n = u8_rune(&rune, (const char *) p);
727f32a6403SWarner Losh if ((ns = get_gototab(f, s, rune)) != 0)
7282a55deb1SDavid E. O'Brien s = ns;
7292a55deb1SDavid E. O'Brien else
730f32a6403SWarner Losh s = cgoto(f, s, rune);
7312a55deb1SDavid E. O'Brien if (f->out[s])
7322a55deb1SDavid E. O'Brien return(1);
733f32a6403SWarner Losh if (*p == 0)
734f32a6403SWarner Losh break;
735f32a6403SWarner Losh p += n;
736f32a6403SWarner Losh } while (1); /* was *p++ != 0 */
7372a55deb1SDavid E. O'Brien return(0);
7382a55deb1SDavid E. O'Brien }
7392a55deb1SDavid E. O'Brien
pmatch(fa * f,const char * p0)740813da98dSDavid E. O'Brien int pmatch(fa *f, const char *p0) /* longest match, for sub */
7412a55deb1SDavid E. O'Brien {
7422a55deb1SDavid E. O'Brien int s, ns;
743f32a6403SWarner Losh int n;
744f32a6403SWarner Losh int rune;
745f39dd6a9SWarner Losh const uschar *p = (const uschar *) p0;
746f39dd6a9SWarner Losh const uschar *q;
7472a55deb1SDavid E. O'Brien
74862ebc626SRuslan Ermilov s = f->initstat;
749f39dd6a9SWarner Losh assert(s < f->state_count);
750f39dd6a9SWarner Losh
751f39dd6a9SWarner Losh patbeg = (const char *)p;
7522a55deb1SDavid E. O'Brien patlen = -1;
7532a55deb1SDavid E. O'Brien do {
7542a55deb1SDavid E. O'Brien q = p;
7552a55deb1SDavid E. O'Brien do {
7562a55deb1SDavid E. O'Brien if (f->out[s]) /* final state */
7572a55deb1SDavid E. O'Brien patlen = q-p;
758addad6afSRong-En Fan /* assert(*q < NCHARS); */
759f32a6403SWarner Losh n = u8_rune(&rune, (const char *) q);
760f32a6403SWarner Losh if ((ns = get_gototab(f, s, rune)) != 0)
7612a55deb1SDavid E. O'Brien s = ns;
7622a55deb1SDavid E. O'Brien else
763f32a6403SWarner Losh s = cgoto(f, s, rune);
764f39dd6a9SWarner Losh
765f39dd6a9SWarner Losh assert(s < f->state_count);
766f39dd6a9SWarner Losh
7672a55deb1SDavid E. O'Brien if (s == 1) { /* no transition */
7682a55deb1SDavid E. O'Brien if (patlen >= 0) {
769f39dd6a9SWarner Losh patbeg = (const char *) p;
7702a55deb1SDavid E. O'Brien return(1);
7712a55deb1SDavid E. O'Brien }
7722a55deb1SDavid E. O'Brien else
7732a55deb1SDavid E. O'Brien goto nextin; /* no match */
7742a55deb1SDavid E. O'Brien }
775f32a6403SWarner Losh if (*q == 0)
776f32a6403SWarner Losh break;
777f32a6403SWarner Losh q += n;
778f32a6403SWarner Losh } while (1);
779f32a6403SWarner Losh q++; /* was *q++ */
7802a55deb1SDavid E. O'Brien if (f->out[s])
7812a55deb1SDavid E. O'Brien patlen = q-p-1; /* don't count $ */
7822a55deb1SDavid E. O'Brien if (patlen >= 0) {
783f39dd6a9SWarner Losh patbeg = (const char *) p;
7842a55deb1SDavid E. O'Brien return(1);
7852a55deb1SDavid E. O'Brien }
7862a55deb1SDavid E. O'Brien nextin:
7872a55deb1SDavid E. O'Brien s = 2;
788f32a6403SWarner Losh if (*p == 0)
789f32a6403SWarner Losh break;
790f32a6403SWarner Losh n = u8_rune(&rune, (const char *) p);
791f32a6403SWarner Losh p += n;
792f32a6403SWarner Losh } while (1); /* was *p++ */
7932a55deb1SDavid E. O'Brien return (0);
7942a55deb1SDavid E. O'Brien }
7952a55deb1SDavid E. O'Brien
nematch(fa * f,const char * p0)796813da98dSDavid E. O'Brien int nematch(fa *f, const char *p0) /* non-empty match, for sub */
7972a55deb1SDavid E. O'Brien {
7982a55deb1SDavid E. O'Brien int s, ns;
799f32a6403SWarner Losh int n;
800f32a6403SWarner Losh int rune;
801f39dd6a9SWarner Losh const uschar *p = (const uschar *) p0;
802f39dd6a9SWarner Losh const uschar *q;
8032a55deb1SDavid E. O'Brien
80462ebc626SRuslan Ermilov s = f->initstat;
805f39dd6a9SWarner Losh assert(s < f->state_count);
806f39dd6a9SWarner Losh
807f39dd6a9SWarner Losh patbeg = (const char *)p;
8082a55deb1SDavid E. O'Brien patlen = -1;
8092a55deb1SDavid E. O'Brien while (*p) {
8102a55deb1SDavid E. O'Brien q = p;
8112a55deb1SDavid E. O'Brien do {
8122a55deb1SDavid E. O'Brien if (f->out[s]) /* final state */
8132a55deb1SDavid E. O'Brien patlen = q-p;
814addad6afSRong-En Fan /* assert(*q < NCHARS); */
815f32a6403SWarner Losh n = u8_rune(&rune, (const char *) q);
816f32a6403SWarner Losh if ((ns = get_gototab(f, s, rune)) != 0)
8172a55deb1SDavid E. O'Brien s = ns;
8182a55deb1SDavid E. O'Brien else
819f32a6403SWarner Losh s = cgoto(f, s, rune);
8202a55deb1SDavid E. O'Brien if (s == 1) { /* no transition */
8212a55deb1SDavid E. O'Brien if (patlen > 0) {
822f39dd6a9SWarner Losh patbeg = (const char *) p;
8232a55deb1SDavid E. O'Brien return(1);
8242a55deb1SDavid E. O'Brien } else
8252a55deb1SDavid E. O'Brien goto nnextin; /* no nonempty match */
8262a55deb1SDavid E. O'Brien }
827f32a6403SWarner Losh if (*q == 0)
828f32a6403SWarner Losh break;
829f32a6403SWarner Losh q += n;
830f32a6403SWarner Losh } while (1);
831f32a6403SWarner Losh q++;
8322a55deb1SDavid E. O'Brien if (f->out[s])
8332a55deb1SDavid E. O'Brien patlen = q-p-1; /* don't count $ */
8342a55deb1SDavid E. O'Brien if (patlen > 0 ) {
835f39dd6a9SWarner Losh patbeg = (const char *) p;
8362a55deb1SDavid E. O'Brien return(1);
8372a55deb1SDavid E. O'Brien }
8382a55deb1SDavid E. O'Brien nnextin:
8392a55deb1SDavid E. O'Brien s = 2;
8402a55deb1SDavid E. O'Brien p++;
8412a55deb1SDavid E. O'Brien }
8422a55deb1SDavid E. O'Brien return (0);
8432a55deb1SDavid E. O'Brien }
8442a55deb1SDavid E. O'Brien
845f39dd6a9SWarner Losh
846f39dd6a9SWarner Losh /*
847f39dd6a9SWarner Losh * NAME
848f39dd6a9SWarner Losh * fnematch
849f39dd6a9SWarner Losh *
850f39dd6a9SWarner Losh * DESCRIPTION
851f39dd6a9SWarner Losh * A stream-fed version of nematch which transfers characters to a
852f39dd6a9SWarner Losh * null-terminated buffer. All characters up to and including the last
853f39dd6a9SWarner Losh * character of the matching text or EOF are placed in the buffer. If
854f39dd6a9SWarner Losh * a match is found, patbeg and patlen are set appropriately.
855f39dd6a9SWarner Losh *
856f39dd6a9SWarner Losh * RETURN VALUES
857f39dd6a9SWarner Losh * false No match found.
858f39dd6a9SWarner Losh * true Match found.
859f39dd6a9SWarner Losh */
860f39dd6a9SWarner Losh
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)861f39dd6a9SWarner Losh bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
862f39dd6a9SWarner Losh {
863f32a6403SWarner Losh char *i, *j, *k, *buf = *pbuf;
864f39dd6a9SWarner Losh int bufsize = *pbufsize;
865f32a6403SWarner Losh int c, n, ns, s;
866f39dd6a9SWarner Losh
867f39dd6a9SWarner Losh s = pfa->initstat;
868f39dd6a9SWarner Losh patlen = 0;
869f39dd6a9SWarner Losh
870f39dd6a9SWarner Losh /*
871f32a6403SWarner Losh * buf <= i <= j <= k <= buf+bufsize
872f39dd6a9SWarner Losh *
873b2376a5fSWarner Losh * i: origin of active substring
874b2376a5fSWarner Losh * j: current character
875f32a6403SWarner Losh * k: destination of the next getc
876f39dd6a9SWarner Losh */
877f39dd6a9SWarner Losh
878f32a6403SWarner Losh i = j = k = buf;
879f32a6403SWarner Losh
880f32a6403SWarner Losh do {
881f32a6403SWarner Losh /*
8821023317aSWarner Losh * Call u8_rune with at least awk_mb_cur_max ahead in
883f32a6403SWarner Losh * the buffer until EOF interferes.
884f32a6403SWarner Losh */
885*17853db4SWarner Losh if (k - j < (int)awk_mb_cur_max) {
8861023317aSWarner Losh if (k + awk_mb_cur_max > buf + bufsize) {
8871023317aSWarner Losh char *obuf = buf;
888f32a6403SWarner Losh adjbuf((char **) &buf, &bufsize,
8891023317aSWarner Losh bufsize + awk_mb_cur_max,
890f32a6403SWarner Losh quantum, 0, "fnematch");
8911023317aSWarner Losh
8921023317aSWarner Losh /* buf resized, maybe moved. update pointers */
8931023317aSWarner Losh *pbufsize = bufsize;
8941023317aSWarner Losh if (obuf != buf) {
8951023317aSWarner Losh i = buf + (i - obuf);
8961023317aSWarner Losh j = buf + (j - obuf);
8971023317aSWarner Losh k = buf + (k - obuf);
8981023317aSWarner Losh *pbuf = buf;
8991023317aSWarner Losh if (patlen)
9001023317aSWarner Losh patbeg = buf + (patbeg - obuf);
901f32a6403SWarner Losh }
9021023317aSWarner Losh }
9031023317aSWarner Losh for (n = awk_mb_cur_max ; n > 0; n--) {
904f32a6403SWarner Losh *k++ = (c = getc(f)) != EOF ? c : 0;
905f32a6403SWarner Losh if (c == EOF) {
906f32a6403SWarner Losh if (ferror(f))
907f32a6403SWarner Losh FATAL("fnematch: getc error");
908f32a6403SWarner Losh break;
909f32a6403SWarner Losh }
910f32a6403SWarner Losh }
911f32a6403SWarner Losh }
912f32a6403SWarner Losh
913f32a6403SWarner Losh j += u8_rune(&c, j);
914f32a6403SWarner Losh
915f32a6403SWarner Losh if ((ns = get_gototab(pfa, s, c)) != 0)
916f39dd6a9SWarner Losh s = ns;
917f39dd6a9SWarner Losh else
918b2376a5fSWarner Losh s = cgoto(pfa, s, c);
919f39dd6a9SWarner Losh
920f39dd6a9SWarner Losh if (pfa->out[s]) { /* final state */
921f32a6403SWarner Losh patbeg = i;
922f32a6403SWarner Losh patlen = j - i;
923b2376a5fSWarner Losh if (c == 0) /* don't count $ */
924f39dd6a9SWarner Losh patlen--;
925f39dd6a9SWarner Losh }
926f32a6403SWarner Losh
927f32a6403SWarner Losh if (c && s != 1)
928f32a6403SWarner Losh continue; /* origin i still viable, next j */
929f32a6403SWarner Losh if (patlen)
930f32a6403SWarner Losh break; /* best match found */
931f32a6403SWarner Losh
932f32a6403SWarner Losh /* no match at origin i, next i and start over */
933f32a6403SWarner Losh i += u8_rune(&c, i);
934f32a6403SWarner Losh if (c == 0)
935f32a6403SWarner Losh break; /* no match */
936f32a6403SWarner Losh j = i;
937f39dd6a9SWarner Losh s = 2;
938f32a6403SWarner Losh } while (1);
939f39dd6a9SWarner Losh
940f39dd6a9SWarner Losh if (patlen) {
941f39dd6a9SWarner Losh /*
942f39dd6a9SWarner Losh * Under no circumstances is the last character fed to
943f39dd6a9SWarner Losh * the automaton part of the match. It is EOF's nullbyte,
944f39dd6a9SWarner Losh * or it sent the automaton into a state with no further
945f39dd6a9SWarner Losh * transitions available (s==1), or both. Room for a
946f39dd6a9SWarner Losh * terminating nullbyte is guaranteed.
947f39dd6a9SWarner Losh *
948f39dd6a9SWarner Losh * ungetc any chars after the end of matching text
949f39dd6a9SWarner Losh * (except for EOF's nullbyte, if present) and null
950f39dd6a9SWarner Losh * terminate the buffer.
951f39dd6a9SWarner Losh */
952f39dd6a9SWarner Losh do
953f32a6403SWarner Losh if (*--k && ungetc(*k, f) == EOF)
954f32a6403SWarner Losh FATAL("unable to ungetc '%c'", *k);
955f32a6403SWarner Losh while (k > patbeg + patlen);
956f32a6403SWarner Losh *k = '\0';
957f39dd6a9SWarner Losh return true;
958f39dd6a9SWarner Losh }
959f39dd6a9SWarner Losh else
960f39dd6a9SWarner Losh return false;
961f39dd6a9SWarner Losh }
962f39dd6a9SWarner Losh
reparse(const char * p)963813da98dSDavid E. O'Brien Node *reparse(const char *p) /* parses regular expression pointed to by p */
9642a55deb1SDavid E. O'Brien { /* uses relex() to scan regular expression */
9652a55deb1SDavid E. O'Brien Node *np;
9662a55deb1SDavid E. O'Brien
967f39dd6a9SWarner Losh DPRINTF("reparse <%s>\n", p);
968f39dd6a9SWarner Losh lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
9692a55deb1SDavid E. O'Brien rtok = relex();
970813da98dSDavid E. O'Brien /* GNU compatibility: an empty regexp matches anything */
971addad6afSRong-En Fan if (rtok == '\0') {
972813da98dSDavid E. O'Brien /* FATAL("empty regular expression"); previous */
973addad6afSRong-En Fan return(op2(EMPTYRE, NIL, NIL));
974addad6afSRong-En Fan }
9752a55deb1SDavid E. O'Brien np = regexp();
9762a55deb1SDavid E. O'Brien if (rtok != '\0')
9772a55deb1SDavid E. O'Brien FATAL("syntax error in regular expression %s at %s", lastre, prestr);
9782a55deb1SDavid E. O'Brien return(np);
9792a55deb1SDavid E. O'Brien }
9802a55deb1SDavid E. O'Brien
regexp(void)9812a55deb1SDavid E. O'Brien Node *regexp(void) /* top-level parse of reg expr */
9822a55deb1SDavid E. O'Brien {
9832a55deb1SDavid E. O'Brien return (alt(concat(primary())));
9842a55deb1SDavid E. O'Brien }
9852a55deb1SDavid E. O'Brien
primary(void)9862a55deb1SDavid E. O'Brien Node *primary(void)
9872a55deb1SDavid E. O'Brien {
9882a55deb1SDavid E. O'Brien Node *np;
989b5253557SWarner Losh int savelastatom;
9902a55deb1SDavid E. O'Brien
9912a55deb1SDavid E. O'Brien switch (rtok) {
9922a55deb1SDavid E. O'Brien case CHAR:
993b5253557SWarner Losh lastatom = starttok;
9942a55deb1SDavid E. O'Brien np = op2(CHAR, NIL, itonp(rlxval));
9952a55deb1SDavid E. O'Brien rtok = relex();
9962a55deb1SDavid E. O'Brien return (unary(np));
9972a55deb1SDavid E. O'Brien case ALL:
9982a55deb1SDavid E. O'Brien rtok = relex();
9992a55deb1SDavid E. O'Brien return (unary(op2(ALL, NIL, NIL)));
1000addad6afSRong-En Fan case EMPTYRE:
1001addad6afSRong-En Fan rtok = relex();
1002b5253557SWarner Losh return (unary(op2(EMPTYRE, NIL, NIL)));
10032a55deb1SDavid E. O'Brien case DOT:
1004b5253557SWarner Losh lastatom = starttok;
10052a55deb1SDavid E. O'Brien rtok = relex();
10062a55deb1SDavid E. O'Brien return (unary(op2(DOT, NIL, NIL)));
10072a55deb1SDavid E. O'Brien case CCL:
1008f39dd6a9SWarner Losh np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1009b5253557SWarner Losh lastatom = starttok;
10102a55deb1SDavid E. O'Brien rtok = relex();
10112a55deb1SDavid E. O'Brien return (unary(np));
10122a55deb1SDavid E. O'Brien case NCCL:
1013f39dd6a9SWarner Losh np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1014b5253557SWarner Losh lastatom = starttok;
10152a55deb1SDavid E. O'Brien rtok = relex();
10162a55deb1SDavid E. O'Brien return (unary(np));
10172a55deb1SDavid E. O'Brien case '^':
10182a55deb1SDavid E. O'Brien rtok = relex();
10192a55deb1SDavid E. O'Brien return (unary(op2(CHAR, NIL, itonp(HAT))));
10202a55deb1SDavid E. O'Brien case '$':
10212a55deb1SDavid E. O'Brien rtok = relex();
10222a55deb1SDavid E. O'Brien return (unary(op2(CHAR, NIL, NIL)));
10232a55deb1SDavid E. O'Brien case '(':
1024b5253557SWarner Losh lastatom = starttok;
1025b5253557SWarner Losh savelastatom = starttok - basestr; /* Retain over recursion */
10262a55deb1SDavid E. O'Brien rtok = relex();
10272a55deb1SDavid E. O'Brien if (rtok == ')') { /* special pleading for () */
10282a55deb1SDavid E. O'Brien rtok = relex();
1029f32a6403SWarner Losh return unary(op2(CCL, NIL, (Node *) cclenter("")));
10302a55deb1SDavid E. O'Brien }
10312a55deb1SDavid E. O'Brien np = regexp();
10322a55deb1SDavid E. O'Brien if (rtok == ')') {
1033b5253557SWarner Losh lastatom = basestr + savelastatom; /* Restore */
10342a55deb1SDavid E. O'Brien rtok = relex();
10352a55deb1SDavid E. O'Brien return (unary(np));
10362a55deb1SDavid E. O'Brien }
10372a55deb1SDavid E. O'Brien else
10382a55deb1SDavid E. O'Brien FATAL("syntax error in regular expression %s at %s", lastre, prestr);
10392a55deb1SDavid E. O'Brien default:
10402a55deb1SDavid E. O'Brien FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
10412a55deb1SDavid E. O'Brien }
10422a55deb1SDavid E. O'Brien return 0; /*NOTREACHED*/
10432a55deb1SDavid E. O'Brien }
10442a55deb1SDavid E. O'Brien
concat(Node * np)10452a55deb1SDavid E. O'Brien Node *concat(Node *np)
10462a55deb1SDavid E. O'Brien {
10472a55deb1SDavid E. O'Brien switch (rtok) {
1048b5253557SWarner Losh case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
10492a55deb1SDavid E. O'Brien return (concat(op2(CAT, np, primary())));
1050b5253557SWarner Losh case EMPTYRE:
1051b5253557SWarner Losh rtok = relex();
1052f32a6403SWarner Losh return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1053b5253557SWarner Losh primary())));
10542a55deb1SDavid E. O'Brien }
10552a55deb1SDavid E. O'Brien return (np);
10562a55deb1SDavid E. O'Brien }
10572a55deb1SDavid E. O'Brien
alt(Node * np)10582a55deb1SDavid E. O'Brien Node *alt(Node *np)
10592a55deb1SDavid E. O'Brien {
10602a55deb1SDavid E. O'Brien if (rtok == OR) {
10612a55deb1SDavid E. O'Brien rtok = relex();
10622a55deb1SDavid E. O'Brien return (alt(op2(OR, np, concat(primary()))));
10632a55deb1SDavid E. O'Brien }
10642a55deb1SDavid E. O'Brien return (np);
10652a55deb1SDavid E. O'Brien }
10662a55deb1SDavid E. O'Brien
unary(Node * np)10672a55deb1SDavid E. O'Brien Node *unary(Node *np)
10682a55deb1SDavid E. O'Brien {
10692a55deb1SDavid E. O'Brien switch (rtok) {
10702a55deb1SDavid E. O'Brien case STAR:
10712a55deb1SDavid E. O'Brien rtok = relex();
10722a55deb1SDavid E. O'Brien return (unary(op2(STAR, np, NIL)));
10732a55deb1SDavid E. O'Brien case PLUS:
10742a55deb1SDavid E. O'Brien rtok = relex();
10752a55deb1SDavid E. O'Brien return (unary(op2(PLUS, np, NIL)));
10762a55deb1SDavid E. O'Brien case QUEST:
10772a55deb1SDavid E. O'Brien rtok = relex();
10782a55deb1SDavid E. O'Brien return (unary(op2(QUEST, np, NIL)));
1079f39dd6a9SWarner Losh case ZERO:
1080f39dd6a9SWarner Losh rtok = relex();
1081f39dd6a9SWarner Losh return (unary(op2(ZERO, np, NIL)));
10822a55deb1SDavid E. O'Brien default:
10832a55deb1SDavid E. O'Brien return (np);
10842a55deb1SDavid E. O'Brien }
10852a55deb1SDavid E. O'Brien }
10862a55deb1SDavid E. O'Brien
1087007c6572SDag-Erling Smørgrav /*
1088007c6572SDag-Erling Smørgrav * Character class definitions conformant to the POSIX locale as
1089007c6572SDag-Erling Smørgrav * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1090007c6572SDag-Erling Smørgrav * and operating character sets are both ASCII (ISO646) or supersets
1091007c6572SDag-Erling Smørgrav * thereof.
1092007c6572SDag-Erling Smørgrav *
1093007c6572SDag-Erling Smørgrav * Note that to avoid overflowing the temporary buffer used in
1094007c6572SDag-Erling Smørgrav * relex(), the expanded character class (prior to range expansion)
1095007c6572SDag-Erling Smørgrav * must be less than twice the size of their full name.
1096007c6572SDag-Erling Smørgrav */
1097fc6b1dfeSDavid E. O'Brien
1098fc6b1dfeSDavid E. O'Brien /* Because isblank doesn't show up in any of the header files on any
1099fc6b1dfeSDavid E. O'Brien * system i use, it's defined here. if some other locale has a richer
1100fc6b1dfeSDavid E. O'Brien * definition of "blank", define HAS_ISBLANK and provide your own
1101fc6b1dfeSDavid E. O'Brien * version.
110288b8d487SRuslan Ermilov * the parentheses here are an attempt to find a path through the maze
110388b8d487SRuslan Ermilov * of macro definition and/or function and/or version provided. thanks
110488b8d487SRuslan Ermilov * to nelson beebe for the suggestion; let's see if it works everywhere.
1105fc6b1dfeSDavid E. O'Brien */
1106fc6b1dfeSDavid E. O'Brien
110791217c1cSRuslan Ermilov /* #define HAS_ISBLANK */
1108fc6b1dfeSDavid E. O'Brien #ifndef HAS_ISBLANK
1109fc6b1dfeSDavid E. O'Brien
11101b11b783SRuslan Ermilov int (xisblank)(int c)
1111fc6b1dfeSDavid E. O'Brien {
1112fc6b1dfeSDavid E. O'Brien return c==' ' || c=='\t';
1113fc6b1dfeSDavid E. O'Brien }
1114fc6b1dfeSDavid E. O'Brien
1115fc6b1dfeSDavid E. O'Brien #endif
1116fc6b1dfeSDavid E. O'Brien
1117f39dd6a9SWarner Losh static const struct charclass {
1118007c6572SDag-Erling Smørgrav const char *cc_name;
1119007c6572SDag-Erling Smørgrav int cc_namelen;
1120fc6b1dfeSDavid E. O'Brien int (*cc_func)(int);
1121007c6572SDag-Erling Smørgrav } charclasses[] = {
1122fc6b1dfeSDavid E. O'Brien { "alnum", 5, isalnum },
1123fc6b1dfeSDavid E. O'Brien { "alpha", 5, isalpha },
11241b11b783SRuslan Ermilov #ifndef HAS_ISBLANK
1125b5253557SWarner Losh { "blank", 5, xisblank },
11261b11b783SRuslan Ermilov #else
1127fc6b1dfeSDavid E. O'Brien { "blank", 5, isblank },
11281b11b783SRuslan Ermilov #endif
1129fc6b1dfeSDavid E. O'Brien { "cntrl", 5, iscntrl },
1130fc6b1dfeSDavid E. O'Brien { "digit", 5, isdigit },
1131fc6b1dfeSDavid E. O'Brien { "graph", 5, isgraph },
1132fc6b1dfeSDavid E. O'Brien { "lower", 5, islower },
1133fc6b1dfeSDavid E. O'Brien { "print", 5, isprint },
1134fc6b1dfeSDavid E. O'Brien { "punct", 5, ispunct },
1135fc6b1dfeSDavid E. O'Brien { "space", 5, isspace },
1136fc6b1dfeSDavid E. O'Brien { "upper", 5, isupper },
1137fc6b1dfeSDavid E. O'Brien { "xdigit", 6, isxdigit },
1138007c6572SDag-Erling Smørgrav { NULL, 0, NULL },
1139007c6572SDag-Erling Smørgrav };
1140007c6572SDag-Erling Smørgrav
1141b5253557SWarner Losh #define REPEAT_SIMPLE 0
1142b5253557SWarner Losh #define REPEAT_PLUS_APPENDED 1
1143b5253557SWarner Losh #define REPEAT_WITH_Q 2
1144b5253557SWarner Losh #define REPEAT_ZERO 3
1145b5253557SWarner Losh
1146b5253557SWarner Losh static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)1147b5253557SWarner Losh replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1148b5253557SWarner Losh int atomlen, int firstnum, int secondnum, int special_case)
1149b5253557SWarner Losh {
1150b5253557SWarner Losh int i, j;
1151b5253557SWarner Losh uschar *buf = 0;
1152b5253557SWarner Losh int ret = 1;
1153b5253557SWarner Losh int init_q = (firstnum == 0); /* first added char will be ? */
1154b5253557SWarner Losh int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
1155b5253557SWarner Losh int prefix_length = reptok - basestr; /* prefix includes first rep */
1156f39dd6a9SWarner Losh int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1157b5253557SWarner Losh int size = prefix_length + suffix_length;
1158b5253557SWarner Losh
1159b5253557SWarner Losh if (firstnum > 1) { /* add room for reps 2 through firstnum */
1160b5253557SWarner Losh size += atomlen*(firstnum-1);
1161b5253557SWarner Losh }
1162b5253557SWarner Losh
1163b5253557SWarner Losh /* Adjust size of buffer for special cases */
1164b5253557SWarner Losh if (special_case == REPEAT_PLUS_APPENDED) {
1165b5253557SWarner Losh size++; /* for the final + */
1166b5253557SWarner Losh } else if (special_case == REPEAT_WITH_Q) {
116723f24377SWarner Losh size += init_q + (atomlen+1)* (n_q_reps-init_q);
1168b5253557SWarner Losh } else if (special_case == REPEAT_ZERO) {
1169b5253557SWarner Losh size += 2; /* just a null ERE: () */
1170b5253557SWarner Losh }
1171b5253557SWarner Losh if ((buf = (uschar *) malloc(size + 1)) == NULL)
1172b5253557SWarner Losh FATAL("out of space in reg expr %.10s..", lastre);
1173b5253557SWarner Losh memcpy(buf, basestr, prefix_length); /* copy prefix */
1174b5253557SWarner Losh j = prefix_length;
1175b5253557SWarner Losh if (special_case == REPEAT_ZERO) {
1176b5253557SWarner Losh j -= atomlen;
1177b5253557SWarner Losh buf[j++] = '(';
1178b5253557SWarner Losh buf[j++] = ')';
1179b5253557SWarner Losh }
1180b5253557SWarner Losh for (i = 1; i < firstnum; i++) { /* copy x reps */
1181b5253557SWarner Losh memcpy(&buf[j], atom, atomlen);
1182b5253557SWarner Losh j += atomlen;
1183b5253557SWarner Losh }
1184b5253557SWarner Losh if (special_case == REPEAT_PLUS_APPENDED) {
1185b5253557SWarner Losh buf[j++] = '+';
1186b5253557SWarner Losh } else if (special_case == REPEAT_WITH_Q) {
1187f39dd6a9SWarner Losh if (init_q)
1188f39dd6a9SWarner Losh buf[j++] = '?';
1189f39dd6a9SWarner Losh for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1190b5253557SWarner Losh memcpy(&buf[j], atom, atomlen);
1191b5253557SWarner Losh j += atomlen;
1192b5253557SWarner Losh buf[j++] = '?';
1193b5253557SWarner Losh }
1194b5253557SWarner Losh }
1195b5253557SWarner Losh memcpy(&buf[j], reptok+reptoklen, suffix_length);
119623f24377SWarner Losh j += suffix_length;
119723f24377SWarner Losh buf[j] = '\0';
1198b5253557SWarner Losh /* free old basestr */
1199b5253557SWarner Losh if (firstbasestr != basestr) {
1200b5253557SWarner Losh if (basestr)
1201b5253557SWarner Losh xfree(basestr);
1202b5253557SWarner Losh }
1203b5253557SWarner Losh basestr = buf;
1204b5253557SWarner Losh prestr = buf + prefix_length;
1205b5253557SWarner Losh if (special_case == REPEAT_ZERO) {
1206b5253557SWarner Losh prestr -= atomlen;
1207b5253557SWarner Losh ret++;
1208b5253557SWarner Losh }
1209b5253557SWarner Losh return ret;
1210b5253557SWarner Losh }
1211b5253557SWarner Losh
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)1212b5253557SWarner Losh static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1213b5253557SWarner Losh int atomlen, int firstnum, int secondnum)
1214b5253557SWarner Losh {
1215b5253557SWarner Losh /*
1216b5253557SWarner Losh In general, the repetition specifier or "bound" is replaced here
1217b5253557SWarner Losh by an equivalent ERE string, repeating the immediately previous atom
1218b5253557SWarner Losh and appending ? and + as needed. Note that the first copy of the
1219b5253557SWarner Losh atom is left in place, except in the special_case of a zero-repeat
1220b5253557SWarner Losh (i.e., {0}).
1221b5253557SWarner Losh */
1222b5253557SWarner Losh if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1223b5253557SWarner Losh if (firstnum < 2) {
1224b5253557SWarner Losh /* 0 or 1: should be handled before you get here */
1225b5253557SWarner Losh FATAL("internal error");
1226b5253557SWarner Losh } else {
1227b5253557SWarner Losh return replace_repeat(reptok, reptoklen, atom, atomlen,
1228b5253557SWarner Losh firstnum, secondnum, REPEAT_PLUS_APPENDED);
1229b5253557SWarner Losh }
1230b5253557SWarner Losh } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1231b5253557SWarner Losh if (firstnum == 0) { /* {0} or {0,0} */
1232b5253557SWarner Losh /* This case is unusual because the resulting
1233b5253557SWarner Losh replacement string might actually be SMALLER than
1234b5253557SWarner Losh the original ERE */
1235b5253557SWarner Losh return replace_repeat(reptok, reptoklen, atom, atomlen,
1236b5253557SWarner Losh firstnum, secondnum, REPEAT_ZERO);
1237b5253557SWarner Losh } else { /* (firstnum >= 1) */
1238b5253557SWarner Losh return replace_repeat(reptok, reptoklen, atom, atomlen,
1239b5253557SWarner Losh firstnum, secondnum, REPEAT_SIMPLE);
1240b5253557SWarner Losh }
1241b5253557SWarner Losh } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1242b5253557SWarner Losh /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1243b5253557SWarner Losh return replace_repeat(reptok, reptoklen, atom, atomlen,
1244b5253557SWarner Losh firstnum, secondnum, REPEAT_WITH_Q);
1245b5253557SWarner Losh } else { /* Error - shouldn't be here (n>m) */
1246b5253557SWarner Losh FATAL("internal error");
1247b5253557SWarner Losh }
1248b5253557SWarner Losh return 0;
1249b5253557SWarner Losh }
1250007c6572SDag-Erling Smørgrav
relex(void)12512a55deb1SDavid E. O'Brien int relex(void) /* lexical analyzer for reparse */
12522a55deb1SDavid E. O'Brien {
12532a55deb1SDavid E. O'Brien int c, n;
12542a55deb1SDavid E. O'Brien int cflag;
125510ce5b99SWarner Losh static uschar *buf = NULL;
12562a55deb1SDavid E. O'Brien static int bufsz = 100;
12572a55deb1SDavid E. O'Brien uschar *bp;
1258f39dd6a9SWarner Losh const struct charclass *cc;
1259fc6b1dfeSDavid E. O'Brien int i;
1260f39dd6a9SWarner Losh int num, m;
1261f39dd6a9SWarner Losh bool commafound, digitfound;
1262b5253557SWarner Losh const uschar *startreptok;
1263f39dd6a9SWarner Losh static int parens = 0;
1264b5253557SWarner Losh
1265b5253557SWarner Losh rescan:
1266b5253557SWarner Losh starttok = prestr;
12672a55deb1SDavid E. O'Brien
1268f32a6403SWarner Losh if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1269f32a6403SWarner Losh prestr += n;
1270f32a6403SWarner Losh starttok = prestr;
1271f32a6403SWarner Losh return CHAR;
1272f32a6403SWarner Losh }
1273f32a6403SWarner Losh
12742a55deb1SDavid E. O'Brien switch (c = *prestr++) {
12752a55deb1SDavid E. O'Brien case '|': return OR;
12762a55deb1SDavid E. O'Brien case '*': return STAR;
12772a55deb1SDavid E. O'Brien case '+': return PLUS;
12782a55deb1SDavid E. O'Brien case '?': return QUEST;
12792a55deb1SDavid E. O'Brien case '.': return DOT;
12802a55deb1SDavid E. O'Brien case '\0': prestr--; return '\0';
12812a55deb1SDavid E. O'Brien case '^':
12822a55deb1SDavid E. O'Brien case '$':
12832a55deb1SDavid E. O'Brien return c;
1284f39dd6a9SWarner Losh case '(':
1285f39dd6a9SWarner Losh parens++;
1286f39dd6a9SWarner Losh return c;
1287f39dd6a9SWarner Losh case ')':
1288f39dd6a9SWarner Losh if (parens) {
1289f39dd6a9SWarner Losh parens--;
1290f39dd6a9SWarner Losh return c;
1291f39dd6a9SWarner Losh }
1292f39dd6a9SWarner Losh /* unmatched close parenthesis; per POSIX, treat as literal */
1293f39dd6a9SWarner Losh rlxval = c;
1294f39dd6a9SWarner Losh return CHAR;
12952a55deb1SDavid E. O'Brien case '\\':
1296d86a0988SRuslan Ermilov rlxval = quoted(&prestr);
12972a55deb1SDavid E. O'Brien return CHAR;
12982a55deb1SDavid E. O'Brien default:
12992a55deb1SDavid E. O'Brien rlxval = c;
13002a55deb1SDavid E. O'Brien return CHAR;
13012a55deb1SDavid E. O'Brien case '[':
130210ce5b99SWarner Losh if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
13032a55deb1SDavid E. O'Brien FATAL("out of space in reg expr %.10s..", lastre);
13042a55deb1SDavid E. O'Brien bp = buf;
13052a55deb1SDavid E. O'Brien if (*prestr == '^') {
13062a55deb1SDavid E. O'Brien cflag = 1;
13072a55deb1SDavid E. O'Brien prestr++;
13082a55deb1SDavid E. O'Brien }
13092a55deb1SDavid E. O'Brien else
13102a55deb1SDavid E. O'Brien cflag = 0;
1311f32a6403SWarner Losh n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
1312addad6afSRong-En Fan if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
13132a55deb1SDavid E. O'Brien FATAL("out of space for reg expr %.10s...", lastre);
13142a55deb1SDavid E. O'Brien for (; ; ) {
1315f32a6403SWarner Losh if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1316f32a6403SWarner Losh for (i = 0; i < n; i++)
1317f32a6403SWarner Losh *bp++ = *prestr++;
1318f32a6403SWarner Losh continue;
1319f32a6403SWarner Losh }
13202a55deb1SDavid E. O'Brien if ((c = *prestr++) == '\\') {
13212a55deb1SDavid E. O'Brien *bp++ = '\\';
13222a55deb1SDavid E. O'Brien if ((c = *prestr++) == '\0')
13232a55deb1SDavid E. O'Brien FATAL("nonterminated character class %.20s...", lastre);
13242a55deb1SDavid E. O'Brien *bp++ = c;
13252a55deb1SDavid E. O'Brien /* } else if (c == '\n') { */
13262a55deb1SDavid E. O'Brien /* FATAL("newline in character class %.20s...", lastre); */
1327007c6572SDag-Erling Smørgrav } else if (c == '[' && *prestr == ':') {
1328007c6572SDag-Erling Smørgrav /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1329007c6572SDag-Erling Smørgrav for (cc = charclasses; cc->cc_name; cc++)
1330007c6572SDag-Erling Smørgrav if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1331007c6572SDag-Erling Smørgrav break;
1332007c6572SDag-Erling Smørgrav if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1333007c6572SDag-Erling Smørgrav prestr[2 + cc->cc_namelen] == ']') {
1334007c6572SDag-Erling Smørgrav prestr += cc->cc_namelen + 3;
1335b5253557SWarner Losh /*
1336b5253557SWarner Losh * BUG: We begin at 1, instead of 0, since we
1337b5253557SWarner Losh * would otherwise prematurely terminate the
1338b5253557SWarner Losh * string for classes like [[:cntrl:]]. This
1339b5253557SWarner Losh * means that we can't match the NUL character,
1340b5253557SWarner Losh * not without first adapting the entire
1341b5253557SWarner Losh * program to track each string's length.
1342b5253557SWarner Losh */
1343b5253557SWarner Losh for (i = 1; i <= UCHAR_MAX; i++) {
1344f32a6403SWarner Losh if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1345fc6b1dfeSDavid E. O'Brien FATAL("out of space for reg expr %.10s...", lastre);
1346fc6b1dfeSDavid E. O'Brien if (cc->cc_func(i)) {
1347f39dd6a9SWarner Losh /* escape backslash */
1348f39dd6a9SWarner Losh if (i == '\\') {
1349f39dd6a9SWarner Losh *bp++ = '\\';
1350f39dd6a9SWarner Losh n++;
1351f39dd6a9SWarner Losh }
1352f39dd6a9SWarner Losh
1353fc6b1dfeSDavid E. O'Brien *bp++ = i;
1354fc6b1dfeSDavid E. O'Brien n++;
1355fc6b1dfeSDavid E. O'Brien }
1356fc6b1dfeSDavid E. O'Brien }
1357007c6572SDag-Erling Smørgrav } else
1358007c6572SDag-Erling Smørgrav *bp++ = c;
1359b5253557SWarner Losh } else if (c == '[' && *prestr == '.') {
1360b5253557SWarner Losh char collate_char;
1361b5253557SWarner Losh prestr++;
1362b5253557SWarner Losh collate_char = *prestr++;
1363b5253557SWarner Losh if (*prestr == '.' && prestr[1] == ']') {
1364b5253557SWarner Losh prestr += 2;
1365b5253557SWarner Losh /* Found it: map via locale TBD: for
1366b5253557SWarner Losh now, simply return this char. This
1367b5253557SWarner Losh is sufficient to pass conformance
1368b5253557SWarner Losh test awk.ex 156
1369b5253557SWarner Losh */
1370b5253557SWarner Losh if (*prestr == ']') {
1371b5253557SWarner Losh prestr++;
1372b5253557SWarner Losh rlxval = collate_char;
1373b5253557SWarner Losh return CHAR;
1374b5253557SWarner Losh }
1375b5253557SWarner Losh }
1376b5253557SWarner Losh } else if (c == '[' && *prestr == '=') {
1377b5253557SWarner Losh char equiv_char;
1378b5253557SWarner Losh prestr++;
1379b5253557SWarner Losh equiv_char = *prestr++;
1380b5253557SWarner Losh if (*prestr == '=' && prestr[1] == ']') {
1381b5253557SWarner Losh prestr += 2;
1382b5253557SWarner Losh /* Found it: map via locale TBD: for now
1383b5253557SWarner Losh simply return this char. This is
1384b5253557SWarner Losh sufficient to pass conformance test
1385b5253557SWarner Losh awk.ex 156
1386b5253557SWarner Losh */
1387b5253557SWarner Losh if (*prestr == ']') {
1388b5253557SWarner Losh prestr++;
1389b5253557SWarner Losh rlxval = equiv_char;
1390b5253557SWarner Losh return CHAR;
1391b5253557SWarner Losh }
1392b5253557SWarner Losh }
13932a55deb1SDavid E. O'Brien } else if (c == '\0') {
13942a55deb1SDavid E. O'Brien FATAL("nonterminated character class %.20s", lastre);
13952a55deb1SDavid E. O'Brien } else if (bp == buf) { /* 1st char is special */
13962a55deb1SDavid E. O'Brien *bp++ = c;
13972a55deb1SDavid E. O'Brien } else if (c == ']') {
13982a55deb1SDavid E. O'Brien *bp++ = 0;
13992a55deb1SDavid E. O'Brien rlxstr = (uschar *) tostring((char *) buf);
14002a55deb1SDavid E. O'Brien if (cflag == 0)
14012a55deb1SDavid E. O'Brien return CCL;
14022a55deb1SDavid E. O'Brien else
14032a55deb1SDavid E. O'Brien return NCCL;
14042a55deb1SDavid E. O'Brien } else
14052a55deb1SDavid E. O'Brien *bp++ = c;
14062a55deb1SDavid E. O'Brien }
1407b5253557SWarner Losh break;
1408b5253557SWarner Losh case '{':
1409f32a6403SWarner Losh if (isdigit((int) *(prestr))) {
1410b5253557SWarner Losh num = 0; /* Process as a repetition */
1411b5253557SWarner Losh n = -1; m = -1;
1412f39dd6a9SWarner Losh commafound = false;
1413f39dd6a9SWarner Losh digitfound = false;
1414b5253557SWarner Losh startreptok = prestr-1;
1415b5253557SWarner Losh /* Remember start of previous atom here ? */
1416b5253557SWarner Losh } else { /* just a { char, not a repetition */
1417b5253557SWarner Losh rlxval = c;
1418b5253557SWarner Losh return CHAR;
1419b5253557SWarner Losh }
1420b5253557SWarner Losh for (; ; ) {
1421b5253557SWarner Losh if ((c = *prestr++) == '}') {
1422b5253557SWarner Losh if (commafound) {
1423b5253557SWarner Losh if (digitfound) { /* {n,m} */
1424b5253557SWarner Losh m = num;
1425b5253557SWarner Losh if (m < n)
1426b5253557SWarner Losh FATAL("illegal repetition expression: class %.20s",
1427b5253557SWarner Losh lastre);
1428f39dd6a9SWarner Losh if (n == 0 && m == 1) {
1429b5253557SWarner Losh return QUEST;
1430b5253557SWarner Losh }
1431b5253557SWarner Losh } else { /* {n,} */
1432f39dd6a9SWarner Losh if (n == 0)
1433f39dd6a9SWarner Losh return STAR;
1434f39dd6a9SWarner Losh else if (n == 1)
1435f39dd6a9SWarner Losh return PLUS;
1436b5253557SWarner Losh }
1437b5253557SWarner Losh } else {
1438b5253557SWarner Losh if (digitfound) { /* {n} same as {n,n} */
1439b5253557SWarner Losh n = num;
1440b5253557SWarner Losh m = num;
1441b5253557SWarner Losh } else { /* {} */
1442b5253557SWarner Losh FATAL("illegal repetition expression: class %.20s",
1443b5253557SWarner Losh lastre);
1444b5253557SWarner Losh }
1445b5253557SWarner Losh }
1446b5253557SWarner Losh if (repeat(starttok, prestr-starttok, lastatom,
1447b5253557SWarner Losh startreptok - lastatom, n, m) > 0) {
1448f39dd6a9SWarner Losh if (n == 0 && m == 0) {
1449f39dd6a9SWarner Losh return ZERO;
1450b5253557SWarner Losh }
1451b5253557SWarner Losh /* must rescan input for next token */
1452b5253557SWarner Losh goto rescan;
1453b5253557SWarner Losh }
1454b5253557SWarner Losh /* Failed to replace: eat up {...} characters
1455b5253557SWarner Losh and treat like just PLUS */
1456b5253557SWarner Losh return PLUS;
1457b5253557SWarner Losh } else if (c == '\0') {
1458b5253557SWarner Losh FATAL("nonterminated character class %.20s",
1459b5253557SWarner Losh lastre);
1460b5253557SWarner Losh } else if (isdigit(c)) {
1461b5253557SWarner Losh num = 10 * num + c - '0';
1462f39dd6a9SWarner Losh digitfound = true;
1463b5253557SWarner Losh } else if (c == ',') {
1464b5253557SWarner Losh if (commafound)
1465b5253557SWarner Losh FATAL("illegal repetition expression: class %.20s",
1466b5253557SWarner Losh lastre);
1467b5253557SWarner Losh /* looking for {n,} or {n,m} */
1468f39dd6a9SWarner Losh commafound = true;
1469b5253557SWarner Losh n = num;
1470f39dd6a9SWarner Losh digitfound = false; /* reset */
1471b5253557SWarner Losh num = 0;
1472b5253557SWarner Losh } else {
1473b5253557SWarner Losh FATAL("illegal repetition expression: class %.20s",
1474b5253557SWarner Losh lastre);
1475b5253557SWarner Losh }
1476b5253557SWarner Losh }
1477b5253557SWarner Losh break;
14782a55deb1SDavid E. O'Brien }
14792a55deb1SDavid E. O'Brien }
14802a55deb1SDavid E. O'Brien
cgoto(fa * f,int s,int c)14812a55deb1SDavid E. O'Brien int cgoto(fa *f, int s, int c)
14822a55deb1SDavid E. O'Brien {
14832a55deb1SDavid E. O'Brien int *p, *q;
1484f39dd6a9SWarner Losh int i, j, k;
14852a55deb1SDavid E. O'Brien
1486f32a6403SWarner Losh /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
14872a55deb1SDavid E. O'Brien while (f->accept >= maxsetvec) { /* guessing here! */
1488f39dd6a9SWarner Losh resizesetvec(__func__);
14892a55deb1SDavid E. O'Brien }
14902a55deb1SDavid E. O'Brien for (i = 0; i <= f->accept; i++)
14912a55deb1SDavid E. O'Brien setvec[i] = 0;
14922a55deb1SDavid E. O'Brien setcnt = 0;
1493f39dd6a9SWarner Losh resize_state(f, s);
14942a55deb1SDavid E. O'Brien /* compute positions of gototab[s,c] into setvec */
14952a55deb1SDavid E. O'Brien p = f->posns[s];
14962a55deb1SDavid E. O'Brien for (i = 1; i <= *p; i++) {
14972a55deb1SDavid E. O'Brien if ((k = f->re[p[i]].ltype) != FINAL) {
14982a55deb1SDavid E. O'Brien if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
14992a55deb1SDavid E. O'Brien || (k == DOT && c != 0 && c != HAT)
15002a55deb1SDavid E. O'Brien || (k == ALL && c != 0)
1501addad6afSRong-En Fan || (k == EMPTYRE && c != 0)
1502f32a6403SWarner Losh || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1503f32a6403SWarner Losh || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
15042a55deb1SDavid E. O'Brien q = f->re[p[i]].lfollow;
15052a55deb1SDavid E. O'Brien for (j = 1; j <= *q; j++) {
15062a55deb1SDavid E. O'Brien if (q[j] >= maxsetvec) {
1507f39dd6a9SWarner Losh resizesetvec(__func__);
15082a55deb1SDavid E. O'Brien }
15092a55deb1SDavid E. O'Brien if (setvec[q[j]] == 0) {
15102a55deb1SDavid E. O'Brien setcnt++;
15112a55deb1SDavid E. O'Brien setvec[q[j]] = 1;
15122a55deb1SDavid E. O'Brien }
15132a55deb1SDavid E. O'Brien }
15142a55deb1SDavid E. O'Brien }
15152a55deb1SDavid E. O'Brien }
15162a55deb1SDavid E. O'Brien }
15172a55deb1SDavid E. O'Brien /* determine if setvec is a previous state */
15182a55deb1SDavid E. O'Brien tmpset[0] = setcnt;
15192a55deb1SDavid E. O'Brien j = 1;
15202a55deb1SDavid E. O'Brien for (i = f->accept; i >= 0; i--)
15212a55deb1SDavid E. O'Brien if (setvec[i]) {
15222a55deb1SDavid E. O'Brien tmpset[j++] = i;
15232a55deb1SDavid E. O'Brien }
1524f39dd6a9SWarner Losh resize_state(f, f->curstat > s ? f->curstat : s);
15252a55deb1SDavid E. O'Brien /* tmpset == previous state? */
15262a55deb1SDavid E. O'Brien for (i = 1; i <= f->curstat; i++) {
15272a55deb1SDavid E. O'Brien p = f->posns[i];
15282a55deb1SDavid E. O'Brien if ((k = tmpset[0]) != p[0])
15292a55deb1SDavid E. O'Brien goto different;
15302a55deb1SDavid E. O'Brien for (j = 1; j <= k; j++)
15312a55deb1SDavid E. O'Brien if (tmpset[j] != p[j])
15322a55deb1SDavid E. O'Brien goto different;
15332a55deb1SDavid E. O'Brien /* setvec is state i */
1534f39dd6a9SWarner Losh if (c != HAT)
1535f32a6403SWarner Losh set_gototab(f, s, c, i);
15362a55deb1SDavid E. O'Brien return i;
15372a55deb1SDavid E. O'Brien different:;
15382a55deb1SDavid E. O'Brien }
15392a55deb1SDavid E. O'Brien
15402a55deb1SDavid E. O'Brien /* add tmpset to current set of states */
15412a55deb1SDavid E. O'Brien ++(f->curstat);
1542f39dd6a9SWarner Losh resize_state(f, f->curstat);
1543f32a6403SWarner Losh clear_gototab(f, f->curstat);
15442a55deb1SDavid E. O'Brien xfree(f->posns[f->curstat]);
1545f39dd6a9SWarner Losh p = intalloc(setcnt + 1, __func__);
15462a55deb1SDavid E. O'Brien
15472a55deb1SDavid E. O'Brien f->posns[f->curstat] = p;
1548f39dd6a9SWarner Losh if (c != HAT)
1549f32a6403SWarner Losh set_gototab(f, s, c, f->curstat);
15502a55deb1SDavid E. O'Brien for (i = 0; i <= setcnt; i++)
15512a55deb1SDavid E. O'Brien p[i] = tmpset[i];
15522a55deb1SDavid E. O'Brien if (setvec[f->accept])
15532a55deb1SDavid E. O'Brien f->out[f->curstat] = 1;
15542a55deb1SDavid E. O'Brien else
15552a55deb1SDavid E. O'Brien f->out[f->curstat] = 0;
15562a55deb1SDavid E. O'Brien return f->curstat;
15572a55deb1SDavid E. O'Brien }
15582a55deb1SDavid E. O'Brien
15592a55deb1SDavid E. O'Brien
freefa(fa * f)15602a55deb1SDavid E. O'Brien void freefa(fa *f) /* free a finite automaton */
15612a55deb1SDavid E. O'Brien {
15622a55deb1SDavid E. O'Brien int i;
15632a55deb1SDavid E. O'Brien
15642a55deb1SDavid E. O'Brien if (f == NULL)
15652a55deb1SDavid E. O'Brien return;
1566f39dd6a9SWarner Losh for (i = 0; i < f->state_count; i++)
1567f32a6403SWarner Losh xfree(f->gototab[i].entries);
1568f32a6403SWarner Losh xfree(f->gototab);
15692a55deb1SDavid E. O'Brien for (i = 0; i <= f->curstat; i++)
15702a55deb1SDavid E. O'Brien xfree(f->posns[i]);
15712a55deb1SDavid E. O'Brien for (i = 0; i <= f->accept; i++) {
15722a55deb1SDavid E. O'Brien xfree(f->re[i].lfollow);
15732a55deb1SDavid E. O'Brien if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1574f39dd6a9SWarner Losh xfree(f->re[i].lval.np);
15752a55deb1SDavid E. O'Brien }
15762a55deb1SDavid E. O'Brien xfree(f->restr);
1577f39dd6a9SWarner Losh xfree(f->out);
1578f39dd6a9SWarner Losh xfree(f->posns);
1579f39dd6a9SWarner Losh xfree(f->gototab);
15802a55deb1SDavid E. O'Brien xfree(f);
15812a55deb1SDavid E. O'Brien }
1582