12a55deb1SDavid E. O'Brien /**************************************************************** 22a55deb1SDavid E. O'Brien Copyright (C) Lucent Technologies 1997 32a55deb1SDavid E. O'Brien All Rights Reserved 42a55deb1SDavid E. O'Brien 52a55deb1SDavid E. O'Brien Permission to use, copy, modify, and distribute this software and 62a55deb1SDavid E. O'Brien its documentation for any purpose and without fee is hereby 72a55deb1SDavid E. O'Brien granted, provided that the above copyright notice appear in all 82a55deb1SDavid E. O'Brien copies and that both that the copyright notice and this 92a55deb1SDavid E. O'Brien permission notice and warranty disclaimer appear in supporting 102a55deb1SDavid E. O'Brien documentation, and that the name Lucent Technologies or any of 112a55deb1SDavid E. O'Brien its entities not be used in advertising or publicity pertaining 122a55deb1SDavid E. O'Brien to distribution of the software without specific, written prior 132a55deb1SDavid E. O'Brien permission. 142a55deb1SDavid E. O'Brien 152a55deb1SDavid E. O'Brien LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 162a55deb1SDavid E. O'Brien INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 172a55deb1SDavid E. O'Brien IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 182a55deb1SDavid E. O'Brien SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 192a55deb1SDavid E. O'Brien WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 202a55deb1SDavid E. O'Brien IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 212a55deb1SDavid E. O'Brien ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 222a55deb1SDavid E. O'Brien THIS SOFTWARE. 232a55deb1SDavid E. O'Brien ****************************************************************/ 242a55deb1SDavid E. O'Brien 25addad6afSRong-En Fan /* lasciate ogne speranza, voi ch'intrate. */ 262a55deb1SDavid E. O'Brien 272a55deb1SDavid E. O'Brien #define DEBUG 282a55deb1SDavid E. O'Brien 292a55deb1SDavid E. O'Brien #include <ctype.h> 30b5253557SWarner Losh #include <limits.h> 312a55deb1SDavid E. O'Brien #include <stdio.h> 322a55deb1SDavid E. O'Brien #include <string.h> 332a55deb1SDavid E. O'Brien #include <stdlib.h> 342a55deb1SDavid E. O'Brien #include "awk.h" 35f39dd6a9SWarner Losh #include "awkgram.tab.h" 362a55deb1SDavid E. O'Brien 372a55deb1SDavid E. O'Brien #define MAXLIN 22 382a55deb1SDavid E. O'Brien 392a55deb1SDavid E. O'Brien #define type(v) (v)->nobj /* badly overloaded here */ 402a55deb1SDavid E. O'Brien #define info(v) (v)->ntype /* badly overloaded here */ 412a55deb1SDavid E. O'Brien #define left(v) (v)->narg[0] 422a55deb1SDavid E. O'Brien #define right(v) (v)->narg[1] 432a55deb1SDavid E. O'Brien #define parent(v) (v)->nnext 442a55deb1SDavid E. O'Brien 452a55deb1SDavid E. O'Brien #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 46addad6afSRong-En Fan #define ELEAF case EMPTYRE: /* empty string in regexp */ 472a55deb1SDavid E. O'Brien #define UNARY case STAR: case PLUS: case QUEST: 482a55deb1SDavid E. O'Brien 492a55deb1SDavid E. O'Brien /* encoding in tree Nodes: 50addad6afSRong-En Fan leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 512a55deb1SDavid E. O'Brien left is index, right contains value or pointer to value 522a55deb1SDavid E. O'Brien unary (STAR, PLUS, QUEST): left is child, right is null 532a55deb1SDavid E. O'Brien binary (CAT, OR): left and right are children 542a55deb1SDavid E. O'Brien parent contains pointer to parent 552a55deb1SDavid E. O'Brien */ 562a55deb1SDavid E. O'Brien 572a55deb1SDavid E. O'Brien 582a55deb1SDavid E. O'Brien int *setvec; 592a55deb1SDavid E. O'Brien int *tmpset; 602a55deb1SDavid E. O'Brien int maxsetvec = 0; 612a55deb1SDavid E. O'Brien 622a55deb1SDavid E. O'Brien int rtok; /* next token in current re */ 632a55deb1SDavid E. O'Brien int rlxval; 64f39dd6a9SWarner Losh static const uschar *rlxstr; 65f39dd6a9SWarner Losh static const uschar *prestr; /* current position in current re */ 66f39dd6a9SWarner Losh static const uschar *lastre; /* origin of last re */ 67f39dd6a9SWarner Losh static const uschar *lastatom; /* origin of last Atom */ 68f39dd6a9SWarner Losh static const uschar *starttok; 69f39dd6a9SWarner Losh static const uschar *basestr; /* starts with original, replaced during 70b5253557SWarner Losh repetition processing */ 71f39dd6a9SWarner Losh static const uschar *firstbasestr; 722a55deb1SDavid E. O'Brien 732a55deb1SDavid E. O'Brien static int setcnt; 742a55deb1SDavid E. O'Brien static int poscnt; 752a55deb1SDavid E. O'Brien 76f39dd6a9SWarner Losh const char *patbeg; 772a55deb1SDavid E. O'Brien int patlen; 782a55deb1SDavid E. O'Brien 79f39dd6a9SWarner Losh #define NFA 128 /* cache this many dynamic fa's */ 802a55deb1SDavid E. O'Brien fa *fatab[NFA]; 812a55deb1SDavid E. O'Brien int nfatab = 0; /* entries in fatab */ 822a55deb1SDavid E. O'Brien 83*f32a6403SWarner Losh extern int u8_nextlen(const char *s); 84*f32a6403SWarner Losh 85*f32a6403SWarner Losh 86*f32a6403SWarner Losh /* utf-8 mechanism: 87*f32a6403SWarner Losh 88*f32a6403SWarner Losh For most of Awk, utf-8 strings just "work", since they look like 89*f32a6403SWarner Losh null-terminated sequences of 8-bit bytes. 90*f32a6403SWarner Losh 91*f32a6403SWarner Losh Functions like length(), index(), and substr() have to operate 92*f32a6403SWarner Losh in units of utf-8 characters. The u8_* functions in run.c 93*f32a6403SWarner Losh handle this. 94*f32a6403SWarner Losh 95*f32a6403SWarner Losh Regular expressions are more complicated, since the basic 96*f32a6403SWarner Losh mechanism of the goto table used 8-bit byte indices into the 97*f32a6403SWarner Losh gototab entries to compute the next state. Unicode is a lot 98*f32a6403SWarner Losh bigger, so the gototab entries are now structs with a character 99*f32a6403SWarner Losh and a next state. These are sorted by code point and binary 100*f32a6403SWarner Losh searched. 101*f32a6403SWarner Losh 102*f32a6403SWarner Losh Throughout the RE mechanism in b.c, utf-8 characters are 103*f32a6403SWarner Losh converted to their utf-32 value. This mostly shows up in 104*f32a6403SWarner Losh cclenter, which expands character class ranges like a-z and now 105*f32a6403SWarner Losh alpha-omega. The size of a gototab array is still about 256. 106*f32a6403SWarner Losh This should be dynamic, but for now things work ok for a single 107*f32a6403SWarner Losh code page of Unicode, which is the most likely case. 108*f32a6403SWarner Losh 109*f32a6403SWarner Losh The code changes are localized in run.c and b.c. I have added a 110*f32a6403SWarner Losh handful of functions to somewhat better hide the implementation, 111*f32a6403SWarner Losh but a lot more could be done. 112*f32a6403SWarner Losh 113*f32a6403SWarner Losh */ 114*f32a6403SWarner Losh 115*f32a6403SWarner Losh static int entry_cmp(const void *l, const void *r); 116*f32a6403SWarner Losh static int get_gototab(fa*, int, int); 117*f32a6403SWarner Losh static int set_gototab(fa*, int, int, int); 118*f32a6403SWarner Losh static void clear_gototab(fa*, int); 119*f32a6403SWarner Losh extern int u8_rune(int *, const char *); 120*f32a6403SWarner Losh 121f39dd6a9SWarner Losh static int * 122f39dd6a9SWarner Losh intalloc(size_t n, const char *f) 123f39dd6a9SWarner Losh { 124f39dd6a9SWarner Losh int *p = (int *) calloc(n, sizeof(int)); 125f39dd6a9SWarner Losh if (p == NULL) 126f39dd6a9SWarner Losh overflo(f); 127f39dd6a9SWarner Losh return p; 128f39dd6a9SWarner Losh } 129f39dd6a9SWarner Losh 130f39dd6a9SWarner Losh static void 131f39dd6a9SWarner Losh resizesetvec(const char *f) 132f39dd6a9SWarner Losh { 133f39dd6a9SWarner Losh if (maxsetvec == 0) 134f39dd6a9SWarner Losh maxsetvec = MAXLIN; 135f39dd6a9SWarner Losh else 136f39dd6a9SWarner Losh maxsetvec *= 4; 137f39dd6a9SWarner Losh setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec)); 138f39dd6a9SWarner Losh tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset)); 139f39dd6a9SWarner Losh if (setvec == NULL || tmpset == NULL) 140f39dd6a9SWarner Losh overflo(f); 141f39dd6a9SWarner Losh } 142f39dd6a9SWarner Losh 143f39dd6a9SWarner Losh static void 144f39dd6a9SWarner Losh resize_state(fa *f, int state) 145f39dd6a9SWarner Losh { 146*f32a6403SWarner Losh gtt *p; 147f39dd6a9SWarner Losh uschar *p2; 148f39dd6a9SWarner Losh int **p3; 149f39dd6a9SWarner Losh int i, new_count; 150f39dd6a9SWarner Losh 151f39dd6a9SWarner Losh if (++state < f->state_count) 152f39dd6a9SWarner Losh return; 153f39dd6a9SWarner Losh 154f39dd6a9SWarner Losh new_count = state + 10; /* needs to be tuned */ 155f39dd6a9SWarner Losh 156*f32a6403SWarner Losh p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt)); 157f39dd6a9SWarner Losh if (p == NULL) 158f39dd6a9SWarner Losh goto out; 159f39dd6a9SWarner Losh f->gototab = p; 160f39dd6a9SWarner Losh 161f39dd6a9SWarner Losh p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0])); 162f39dd6a9SWarner Losh if (p2 == NULL) 163f39dd6a9SWarner Losh goto out; 164f39dd6a9SWarner Losh f->out = p2; 165f39dd6a9SWarner Losh 166f39dd6a9SWarner Losh p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0])); 167f39dd6a9SWarner Losh if (p3 == NULL) 168f39dd6a9SWarner Losh goto out; 169f39dd6a9SWarner Losh f->posns = p3; 170f39dd6a9SWarner Losh 171f39dd6a9SWarner Losh for (i = f->state_count; i < new_count; ++i) { 172*f32a6403SWarner Losh f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte)); 173*f32a6403SWarner Losh if (f->gototab[i].entries == NULL) 174f39dd6a9SWarner Losh goto out; 175*f32a6403SWarner Losh f->gototab[i].allocated = NCHARS; 176*f32a6403SWarner Losh f->gototab[i].inuse = 0; 177f39dd6a9SWarner Losh f->out[i] = 0; 178f39dd6a9SWarner Losh f->posns[i] = NULL; 179f39dd6a9SWarner Losh } 180f39dd6a9SWarner Losh f->state_count = new_count; 181f39dd6a9SWarner Losh return; 182f39dd6a9SWarner Losh out: 183f39dd6a9SWarner Losh overflo(__func__); 184f39dd6a9SWarner Losh } 185f39dd6a9SWarner Losh 186f39dd6a9SWarner Losh fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */ 1872a55deb1SDavid E. O'Brien { 1882a55deb1SDavid E. O'Brien int i, use, nuse; 1892a55deb1SDavid E. O'Brien fa *pfa; 1902a55deb1SDavid E. O'Brien static int now = 1; 1912a55deb1SDavid E. O'Brien 19210ce5b99SWarner Losh if (setvec == NULL) { /* first time through any RE */ 193f39dd6a9SWarner Losh resizesetvec(__func__); 1942a55deb1SDavid E. O'Brien } 1952a55deb1SDavid E. O'Brien 196f39dd6a9SWarner Losh if (compile_time != RUNNING) /* a constant for sure */ 1972a55deb1SDavid E. O'Brien return mkdfa(s, anchor); 1982a55deb1SDavid E. O'Brien for (i = 0; i < nfatab; i++) /* is it there already? */ 1992a55deb1SDavid E. O'Brien if (fatab[i]->anchor == anchor 200007c6572SDag-Erling Smørgrav && strcmp((const char *) fatab[i]->restr, s) == 0) { 2012a55deb1SDavid E. O'Brien fatab[i]->use = now++; 2022a55deb1SDavid E. O'Brien return fatab[i]; 2032a55deb1SDavid E. O'Brien } 2042a55deb1SDavid E. O'Brien pfa = mkdfa(s, anchor); 2052a55deb1SDavid E. O'Brien if (nfatab < NFA) { /* room for another */ 2062a55deb1SDavid E. O'Brien fatab[nfatab] = pfa; 2072a55deb1SDavid E. O'Brien fatab[nfatab]->use = now++; 2082a55deb1SDavid E. O'Brien nfatab++; 2092a55deb1SDavid E. O'Brien return pfa; 2102a55deb1SDavid E. O'Brien } 2112a55deb1SDavid E. O'Brien use = fatab[0]->use; /* replace least-recently used */ 2122a55deb1SDavid E. O'Brien nuse = 0; 2132a55deb1SDavid E. O'Brien for (i = 1; i < nfatab; i++) 2142a55deb1SDavid E. O'Brien if (fatab[i]->use < use) { 2152a55deb1SDavid E. O'Brien use = fatab[i]->use; 2162a55deb1SDavid E. O'Brien nuse = i; 2172a55deb1SDavid E. O'Brien } 2182a55deb1SDavid E. O'Brien freefa(fatab[nuse]); 2192a55deb1SDavid E. O'Brien fatab[nuse] = pfa; 2202a55deb1SDavid E. O'Brien pfa->use = now++; 2212a55deb1SDavid E. O'Brien return pfa; 2222a55deb1SDavid E. O'Brien } 2232a55deb1SDavid E. O'Brien 224f39dd6a9SWarner Losh fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */ 225f39dd6a9SWarner Losh /* anchor = true for anchored matches, else false */ 2262a55deb1SDavid E. O'Brien { 2272a55deb1SDavid E. O'Brien Node *p, *p1; 2282a55deb1SDavid E. O'Brien fa *f; 2292a55deb1SDavid E. O'Brien 230f39dd6a9SWarner Losh firstbasestr = (const uschar *) s; 231b5253557SWarner Losh basestr = firstbasestr; 2322a55deb1SDavid E. O'Brien p = reparse(s); 2332a55deb1SDavid E. O'Brien p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 2342a55deb1SDavid E. O'Brien /* put ALL STAR in front of reg. exp. */ 2352a55deb1SDavid E. O'Brien p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 2362a55deb1SDavid E. O'Brien /* put FINAL after reg. exp. */ 2372a55deb1SDavid E. O'Brien 2382a55deb1SDavid E. O'Brien poscnt = 0; 2392a55deb1SDavid E. O'Brien penter(p1); /* enter parent pointers and leaf indices */ 2402a55deb1SDavid E. O'Brien if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL) 241f39dd6a9SWarner Losh overflo(__func__); 2422a55deb1SDavid E. O'Brien f->accept = poscnt-1; /* penter has computed number of positions in re */ 2432a55deb1SDavid E. O'Brien cfoll(f, p1); /* set up follow sets */ 2442a55deb1SDavid E. O'Brien freetr(p1); 245f39dd6a9SWarner Losh resize_state(f, 1); 246f39dd6a9SWarner Losh f->posns[0] = intalloc(*(f->re[0].lfollow), __func__); 247f39dd6a9SWarner Losh f->posns[1] = intalloc(1, __func__); 2482a55deb1SDavid E. O'Brien *f->posns[1] = 0; 2492a55deb1SDavid E. O'Brien f->initstat = makeinit(f, anchor); 2502a55deb1SDavid E. O'Brien f->anchor = anchor; 2512a55deb1SDavid E. O'Brien f->restr = (uschar *) tostring(s); 252b5253557SWarner Losh if (firstbasestr != basestr) { 253b5253557SWarner Losh if (basestr) 254b5253557SWarner Losh xfree(basestr); 255b5253557SWarner Losh } 2562a55deb1SDavid E. O'Brien return f; 2572a55deb1SDavid E. O'Brien } 2582a55deb1SDavid E. O'Brien 259f39dd6a9SWarner Losh int makeinit(fa *f, bool anchor) 2602a55deb1SDavid E. O'Brien { 2612a55deb1SDavid E. O'Brien int i, k; 2622a55deb1SDavid E. O'Brien 2632a55deb1SDavid E. O'Brien f->curstat = 2; 2642a55deb1SDavid E. O'Brien f->out[2] = 0; 2652a55deb1SDavid E. O'Brien k = *(f->re[0].lfollow); 2662a55deb1SDavid E. O'Brien xfree(f->posns[2]); 267f39dd6a9SWarner Losh f->posns[2] = intalloc(k + 1, __func__); 2682a55deb1SDavid E. O'Brien for (i = 0; i <= k; i++) { 2692a55deb1SDavid E. O'Brien (f->posns[2])[i] = (f->re[0].lfollow)[i]; 2702a55deb1SDavid E. O'Brien } 2712a55deb1SDavid E. O'Brien if ((f->posns[2])[1] == f->accept) 2722a55deb1SDavid E. O'Brien f->out[2] = 1; 273*f32a6403SWarner Losh clear_gototab(f, 2); 2742a55deb1SDavid E. O'Brien f->curstat = cgoto(f, 2, HAT); 2752a55deb1SDavid E. O'Brien if (anchor) { 2762a55deb1SDavid E. O'Brien *f->posns[2] = k-1; /* leave out position 0 */ 2772a55deb1SDavid E. O'Brien for (i = 0; i < k; i++) { 2782a55deb1SDavid E. O'Brien (f->posns[0])[i] = (f->posns[2])[i]; 2792a55deb1SDavid E. O'Brien } 2802a55deb1SDavid E. O'Brien 2812a55deb1SDavid E. O'Brien f->out[0] = f->out[2]; 2822a55deb1SDavid E. O'Brien if (f->curstat != 2) 2832a55deb1SDavid E. O'Brien --(*f->posns[f->curstat]); 2842a55deb1SDavid E. O'Brien } 2852a55deb1SDavid E. O'Brien return f->curstat; 2862a55deb1SDavid E. O'Brien } 2872a55deb1SDavid E. O'Brien 2882a55deb1SDavid E. O'Brien void penter(Node *p) /* set up parent pointers and leaf indices */ 2892a55deb1SDavid E. O'Brien { 2902a55deb1SDavid E. O'Brien switch (type(p)) { 291addad6afSRong-En Fan ELEAF 2922a55deb1SDavid E. O'Brien LEAF 2932a55deb1SDavid E. O'Brien info(p) = poscnt; 2942a55deb1SDavid E. O'Brien poscnt++; 2952a55deb1SDavid E. O'Brien break; 2962a55deb1SDavid E. O'Brien UNARY 2972a55deb1SDavid E. O'Brien penter(left(p)); 2982a55deb1SDavid E. O'Brien parent(left(p)) = p; 2992a55deb1SDavid E. O'Brien break; 3002a55deb1SDavid E. O'Brien case CAT: 3012a55deb1SDavid E. O'Brien case OR: 3022a55deb1SDavid E. O'Brien penter(left(p)); 3032a55deb1SDavid E. O'Brien penter(right(p)); 3042a55deb1SDavid E. O'Brien parent(left(p)) = p; 3052a55deb1SDavid E. O'Brien parent(right(p)) = p; 3062a55deb1SDavid E. O'Brien break; 307f39dd6a9SWarner Losh case ZERO: 308f39dd6a9SWarner Losh break; 3092a55deb1SDavid E. O'Brien default: /* can't happen */ 3102a55deb1SDavid E. O'Brien FATAL("can't happen: unknown type %d in penter", type(p)); 3112a55deb1SDavid E. O'Brien break; 3122a55deb1SDavid E. O'Brien } 3132a55deb1SDavid E. O'Brien } 3142a55deb1SDavid E. O'Brien 3152a55deb1SDavid E. O'Brien void freetr(Node *p) /* free parse tree */ 3162a55deb1SDavid E. O'Brien { 3172a55deb1SDavid E. O'Brien switch (type(p)) { 318addad6afSRong-En Fan ELEAF 3192a55deb1SDavid E. O'Brien LEAF 3202a55deb1SDavid E. O'Brien xfree(p); 3212a55deb1SDavid E. O'Brien break; 3222a55deb1SDavid E. O'Brien UNARY 323f39dd6a9SWarner Losh case ZERO: 3242a55deb1SDavid E. O'Brien freetr(left(p)); 3252a55deb1SDavid E. O'Brien xfree(p); 3262a55deb1SDavid E. O'Brien break; 3272a55deb1SDavid E. O'Brien case CAT: 3282a55deb1SDavid E. O'Brien case OR: 3292a55deb1SDavid E. O'Brien freetr(left(p)); 3302a55deb1SDavid E. O'Brien freetr(right(p)); 3312a55deb1SDavid E. O'Brien xfree(p); 3322a55deb1SDavid E. O'Brien break; 3332a55deb1SDavid E. O'Brien default: /* can't happen */ 3342a55deb1SDavid E. O'Brien FATAL("can't happen: unknown type %d in freetr", type(p)); 3352a55deb1SDavid E. O'Brien break; 3362a55deb1SDavid E. O'Brien } 3372a55deb1SDavid E. O'Brien } 3382a55deb1SDavid E. O'Brien 3392a55deb1SDavid E. O'Brien /* in the parsing of regular expressions, metacharacters like . have */ 3402a55deb1SDavid E. O'Brien /* to be seen literally; \056 is not a metacharacter. */ 3412a55deb1SDavid E. O'Brien 342*f32a6403SWarner Losh int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */ 3432a55deb1SDavid E. O'Brien { /* only pick up one 8-bit byte (2 chars) */ 344f39dd6a9SWarner Losh const uschar *p; 3452a55deb1SDavid E. O'Brien int n = 0; 3462a55deb1SDavid E. O'Brien int i; 3472a55deb1SDavid E. O'Brien 348*f32a6403SWarner Losh for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) { 349*f32a6403SWarner Losh if (isdigit((int) *p)) 3502a55deb1SDavid E. O'Brien n = 16 * n + *p - '0'; 3512a55deb1SDavid E. O'Brien else if (*p >= 'a' && *p <= 'f') 3522a55deb1SDavid E. O'Brien n = 16 * n + *p - 'a' + 10; 3532a55deb1SDavid E. O'Brien else if (*p >= 'A' && *p <= 'F') 3542a55deb1SDavid E. O'Brien n = 16 * n + *p - 'A' + 10; 3552a55deb1SDavid E. O'Brien } 356f39dd6a9SWarner Losh *pp = p; 3572a55deb1SDavid E. O'Brien return n; 3582a55deb1SDavid E. O'Brien } 3592a55deb1SDavid E. O'Brien 360*f32a6403SWarner Losh 361*f32a6403SWarner Losh 3622a55deb1SDavid E. O'Brien #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 3632a55deb1SDavid E. O'Brien 364f39dd6a9SWarner Losh int quoted(const uschar **pp) /* pick up next thing after a \\ */ 3652a55deb1SDavid E. O'Brien /* and increment *pp */ 3662a55deb1SDavid E. O'Brien { 367f39dd6a9SWarner Losh const uschar *p = *pp; 3682a55deb1SDavid E. O'Brien int c; 3692a55deb1SDavid E. O'Brien 370*f32a6403SWarner Losh /* BUG: should advance by utf-8 char even if makes no sense */ 371*f32a6403SWarner Losh 372*f32a6403SWarner Losh if ((c = *p++) == 't') { 3732a55deb1SDavid E. O'Brien c = '\t'; 374*f32a6403SWarner Losh } else if (c == 'n') { 3752a55deb1SDavid E. O'Brien c = '\n'; 376*f32a6403SWarner Losh } else if (c == 'f') { 3772a55deb1SDavid E. O'Brien c = '\f'; 378*f32a6403SWarner Losh } else if (c == 'r') { 3792a55deb1SDavid E. O'Brien c = '\r'; 380*f32a6403SWarner Losh } else if (c == 'b') { 3812a55deb1SDavid E. O'Brien c = '\b'; 382*f32a6403SWarner Losh } else if (c == 'v') { 383f39dd6a9SWarner Losh c = '\v'; 384*f32a6403SWarner Losh } else if (c == 'a') { 385f39dd6a9SWarner Losh c = '\a'; 386*f32a6403SWarner Losh } else if (c == '\\') { 3872a55deb1SDavid E. O'Brien c = '\\'; 388*f32a6403SWarner Losh } else if (c == 'x') { /* 2 hex digits follow */ 389*f32a6403SWarner Losh c = hexstr(&p, 2); /* this adds a null if number is invalid */ 390*f32a6403SWarner Losh } else if (c == 'u') { /* unicode char number up to 8 hex digits */ 391*f32a6403SWarner Losh c = hexstr(&p, 8); 3922a55deb1SDavid E. O'Brien } else if (isoctdigit(c)) { /* \d \dd \ddd */ 3932a55deb1SDavid E. O'Brien int n = c - '0'; 3942a55deb1SDavid E. O'Brien if (isoctdigit(*p)) { 3952a55deb1SDavid E. O'Brien n = 8 * n + *p++ - '0'; 3962a55deb1SDavid E. O'Brien if (isoctdigit(*p)) 3972a55deb1SDavid E. O'Brien n = 8 * n + *p++ - '0'; 3982a55deb1SDavid E. O'Brien } 3992a55deb1SDavid E. O'Brien c = n; 4002a55deb1SDavid E. O'Brien } /* else */ 4012a55deb1SDavid E. O'Brien /* c = c; */ 4022a55deb1SDavid E. O'Brien *pp = p; 4032a55deb1SDavid E. O'Brien return c; 4042a55deb1SDavid E. O'Brien } 4052a55deb1SDavid E. O'Brien 406*f32a6403SWarner Losh int *cclenter(const char *argp) /* add a character class */ 4072a55deb1SDavid E. O'Brien { 408628bd30aSWarner Losh int i, c, c2; 409*f32a6403SWarner Losh int n; 410*f32a6403SWarner Losh const uschar *p = (const uschar *) argp; 411*f32a6403SWarner Losh int *bp, *retp; 412*f32a6403SWarner Losh static int *buf = NULL; 4132a55deb1SDavid E. O'Brien static int bufsz = 100; 4142a55deb1SDavid E. O'Brien 415*f32a6403SWarner Losh if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL) 4162a55deb1SDavid E. O'Brien FATAL("out of space for character class [%.10s...] 1", p); 4172a55deb1SDavid E. O'Brien bp = buf; 418*f32a6403SWarner Losh for (i = 0; *p != 0; ) { 419*f32a6403SWarner Losh n = u8_rune(&c, (const char *) p); 420*f32a6403SWarner Losh p += n; 4212a55deb1SDavid E. O'Brien if (c == '\\') { 422d86a0988SRuslan Ermilov c = quoted(&p); 4232a55deb1SDavid E. O'Brien } else if (c == '-' && i > 0 && bp[-1] != 0) { 4242a55deb1SDavid E. O'Brien if (*p != 0) { 4252a55deb1SDavid E. O'Brien c = bp[-1]; 426*f32a6403SWarner Losh /* c2 = *p++; */ 427*f32a6403SWarner Losh n = u8_rune(&c2, (const char *) p); 428*f32a6403SWarner Losh p += n; 4292a55deb1SDavid E. O'Brien if (c2 == '\\') 430*f32a6403SWarner Losh c2 = quoted(&p); /* BUG: sets p, has to be u8 size */ 431628bd30aSWarner Losh if (c > c2) { /* empty; ignore */ 4322a55deb1SDavid E. O'Brien bp--; 4332a55deb1SDavid E. O'Brien i--; 4342a55deb1SDavid E. O'Brien continue; 4352a55deb1SDavid E. O'Brien } 436628bd30aSWarner Losh while (c < c2) { 437*f32a6403SWarner Losh if (i >= bufsz) { 438*f32a6403SWarner Losh bufsz *= 2; 439*f32a6403SWarner Losh buf = (int *) realloc(buf, bufsz * sizeof(int)); 440*f32a6403SWarner Losh if (buf == NULL) 4412a55deb1SDavid E. O'Brien FATAL("out of space for character class [%.10s...] 2", p); 442*f32a6403SWarner Losh bp = buf + i; 443*f32a6403SWarner Losh } 444628bd30aSWarner Losh *bp++ = ++c; 4452a55deb1SDavid E. O'Brien i++; 4462a55deb1SDavid E. O'Brien } 4472a55deb1SDavid E. O'Brien continue; 4482a55deb1SDavid E. O'Brien } 4492a55deb1SDavid E. O'Brien } 450*f32a6403SWarner Losh if (i >= bufsz) { 451*f32a6403SWarner Losh bufsz *= 2; 452*f32a6403SWarner Losh buf = (int *) realloc(buf, bufsz * sizeof(int)); 453*f32a6403SWarner Losh if (buf == NULL) 454*f32a6403SWarner Losh FATAL("out of space for character class [%.10s...] 2", p); 455*f32a6403SWarner Losh bp = buf + i; 456*f32a6403SWarner Losh } 4572a55deb1SDavid E. O'Brien *bp++ = c; 4582a55deb1SDavid E. O'Brien i++; 4592a55deb1SDavid E. O'Brien } 4602a55deb1SDavid E. O'Brien *bp = 0; 461*f32a6403SWarner Losh /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */ 462*f32a6403SWarner Losh /* xfree(op); BUG: what are we freeing here? */ 463*f32a6403SWarner Losh retp = (int *) calloc(bp-buf+1, sizeof(int)); 464*f32a6403SWarner Losh for (i = 0; i < bp-buf+1; i++) 465*f32a6403SWarner Losh retp[i] = buf[i]; 466*f32a6403SWarner Losh return retp; 4672a55deb1SDavid E. O'Brien } 4682a55deb1SDavid E. O'Brien 469813da98dSDavid E. O'Brien void overflo(const char *s) 4702a55deb1SDavid E. O'Brien { 471f39dd6a9SWarner Losh FATAL("regular expression too big: out of space in %.30s...", s); 4722a55deb1SDavid E. O'Brien } 4732a55deb1SDavid E. O'Brien 4742a55deb1SDavid E. O'Brien void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 4752a55deb1SDavid E. O'Brien { 4762a55deb1SDavid E. O'Brien int i; 4772a55deb1SDavid E. O'Brien int *p; 4782a55deb1SDavid E. O'Brien 4792a55deb1SDavid E. O'Brien switch (type(v)) { 480addad6afSRong-En Fan ELEAF 4812a55deb1SDavid E. O'Brien LEAF 4822a55deb1SDavid E. O'Brien f->re[info(v)].ltype = type(v); 4832a55deb1SDavid E. O'Brien f->re[info(v)].lval.np = right(v); 4842a55deb1SDavid E. O'Brien while (f->accept >= maxsetvec) { /* guessing here! */ 485f39dd6a9SWarner Losh resizesetvec(__func__); 4862a55deb1SDavid E. O'Brien } 4872a55deb1SDavid E. O'Brien for (i = 0; i <= f->accept; i++) 4882a55deb1SDavid E. O'Brien setvec[i] = 0; 4892a55deb1SDavid E. O'Brien setcnt = 0; 4902a55deb1SDavid E. O'Brien follow(v); /* computes setvec and setcnt */ 491f39dd6a9SWarner Losh p = intalloc(setcnt + 1, __func__); 4922a55deb1SDavid E. O'Brien f->re[info(v)].lfollow = p; 4932a55deb1SDavid E. O'Brien *p = setcnt; 4942a55deb1SDavid E. O'Brien for (i = f->accept; i >= 0; i--) 4952a55deb1SDavid E. O'Brien if (setvec[i] == 1) 4962a55deb1SDavid E. O'Brien *++p = i; 4972a55deb1SDavid E. O'Brien break; 4982a55deb1SDavid E. O'Brien UNARY 4992a55deb1SDavid E. O'Brien cfoll(f,left(v)); 5002a55deb1SDavid E. O'Brien break; 5012a55deb1SDavid E. O'Brien case CAT: 5022a55deb1SDavid E. O'Brien case OR: 5032a55deb1SDavid E. O'Brien cfoll(f,left(v)); 5042a55deb1SDavid E. O'Brien cfoll(f,right(v)); 5052a55deb1SDavid E. O'Brien break; 506f39dd6a9SWarner Losh case ZERO: 507f39dd6a9SWarner Losh break; 5082a55deb1SDavid E. O'Brien default: /* can't happen */ 5092a55deb1SDavid E. O'Brien FATAL("can't happen: unknown type %d in cfoll", type(v)); 5102a55deb1SDavid E. O'Brien } 5112a55deb1SDavid E. O'Brien } 5122a55deb1SDavid E. O'Brien 5132a55deb1SDavid E. O'Brien int first(Node *p) /* collects initially active leaves of p into setvec */ 514addad6afSRong-En Fan /* returns 0 if p matches empty string */ 5152a55deb1SDavid E. O'Brien { 5162a55deb1SDavid E. O'Brien int b, lp; 5172a55deb1SDavid E. O'Brien 5182a55deb1SDavid E. O'Brien switch (type(p)) { 519addad6afSRong-En Fan ELEAF 5202a55deb1SDavid E. O'Brien LEAF 5212a55deb1SDavid E. O'Brien lp = info(p); /* look for high-water mark of subscripts */ 5222a55deb1SDavid E. O'Brien while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 523f39dd6a9SWarner Losh resizesetvec(__func__); 5242a55deb1SDavid E. O'Brien } 525addad6afSRong-En Fan if (type(p) == EMPTYRE) { 526addad6afSRong-En Fan setvec[lp] = 0; 527addad6afSRong-En Fan return(0); 528addad6afSRong-En Fan } 5292a55deb1SDavid E. O'Brien if (setvec[lp] != 1) { 5302a55deb1SDavid E. O'Brien setvec[lp] = 1; 5312a55deb1SDavid E. O'Brien setcnt++; 5322a55deb1SDavid E. O'Brien } 533*f32a6403SWarner Losh if (type(p) == CCL && (*(int *) right(p)) == 0) 5342a55deb1SDavid E. O'Brien return(0); /* empty CCL */ 535f39dd6a9SWarner Losh return(1); 5362a55deb1SDavid E. O'Brien case PLUS: 537f39dd6a9SWarner Losh if (first(left(p)) == 0) 538f39dd6a9SWarner Losh return(0); 5392a55deb1SDavid E. O'Brien return(1); 5402a55deb1SDavid E. O'Brien case STAR: 5412a55deb1SDavid E. O'Brien case QUEST: 5422a55deb1SDavid E. O'Brien first(left(p)); 5432a55deb1SDavid E. O'Brien return(0); 5442a55deb1SDavid E. O'Brien case CAT: 5452a55deb1SDavid E. O'Brien if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 5462a55deb1SDavid E. O'Brien return(1); 5472a55deb1SDavid E. O'Brien case OR: 5482a55deb1SDavid E. O'Brien b = first(right(p)); 5492a55deb1SDavid E. O'Brien if (first(left(p)) == 0 || b == 0) return(0); 5502a55deb1SDavid E. O'Brien return(1); 551f39dd6a9SWarner Losh case ZERO: 552f39dd6a9SWarner Losh return 0; 5532a55deb1SDavid E. O'Brien } 5542a55deb1SDavid E. O'Brien FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 5552a55deb1SDavid E. O'Brien return(-1); 5562a55deb1SDavid E. O'Brien } 5572a55deb1SDavid E. O'Brien 5582a55deb1SDavid E. O'Brien void follow(Node *v) /* collects leaves that can follow v into setvec */ 5592a55deb1SDavid E. O'Brien { 5602a55deb1SDavid E. O'Brien Node *p; 5612a55deb1SDavid E. O'Brien 5622a55deb1SDavid E. O'Brien if (type(v) == FINAL) 5632a55deb1SDavid E. O'Brien return; 5642a55deb1SDavid E. O'Brien p = parent(v); 5652a55deb1SDavid E. O'Brien switch (type(p)) { 5662a55deb1SDavid E. O'Brien case STAR: 5672a55deb1SDavid E. O'Brien case PLUS: 5682a55deb1SDavid E. O'Brien first(v); 5692a55deb1SDavid E. O'Brien follow(p); 5702a55deb1SDavid E. O'Brien return; 5712a55deb1SDavid E. O'Brien 5722a55deb1SDavid E. O'Brien case OR: 5732a55deb1SDavid E. O'Brien case QUEST: 5742a55deb1SDavid E. O'Brien follow(p); 5752a55deb1SDavid E. O'Brien return; 5762a55deb1SDavid E. O'Brien 5772a55deb1SDavid E. O'Brien case CAT: 5782a55deb1SDavid E. O'Brien if (v == left(p)) { /* v is left child of p */ 5792a55deb1SDavid E. O'Brien if (first(right(p)) == 0) { 5802a55deb1SDavid E. O'Brien follow(p); 5812a55deb1SDavid E. O'Brien return; 5822a55deb1SDavid E. O'Brien } 5832a55deb1SDavid E. O'Brien } else /* v is right child */ 5842a55deb1SDavid E. O'Brien follow(p); 5852a55deb1SDavid E. O'Brien return; 5862a55deb1SDavid E. O'Brien } 5872a55deb1SDavid E. O'Brien } 5882a55deb1SDavid E. O'Brien 589*f32a6403SWarner Losh int member(int c, int *sarg) /* is c in s? */ 5902a55deb1SDavid E. O'Brien { 591*f32a6403SWarner Losh int *s = (int *) sarg; 5922a55deb1SDavid E. O'Brien 5932a55deb1SDavid E. O'Brien while (*s) 5942a55deb1SDavid E. O'Brien if (c == *s++) 5952a55deb1SDavid E. O'Brien return(1); 5962a55deb1SDavid E. O'Brien return(0); 5972a55deb1SDavid E. O'Brien } 5982a55deb1SDavid E. O'Brien 599*f32a6403SWarner Losh static void resize_gototab(fa *f, int state) 600*f32a6403SWarner Losh { 601*f32a6403SWarner Losh size_t new_size = f->gototab[state].allocated * 2; 602*f32a6403SWarner Losh gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte)); 603*f32a6403SWarner Losh if (p == NULL) 604*f32a6403SWarner Losh overflo(__func__); 605*f32a6403SWarner Losh 606*f32a6403SWarner Losh // need to initialized the new memory to zero 607*f32a6403SWarner Losh size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size 608*f32a6403SWarner Losh memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out 609*f32a6403SWarner Losh 610*f32a6403SWarner Losh f->gototab[state].allocated = new_size; // update gototab info 611*f32a6403SWarner Losh f->gototab[state].entries = p; 612*f32a6403SWarner Losh } 613*f32a6403SWarner Losh 614*f32a6403SWarner Losh static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */ 615*f32a6403SWarner Losh { 616*f32a6403SWarner Losh gtte key; 617*f32a6403SWarner Losh gtte *item; 618*f32a6403SWarner Losh 619*f32a6403SWarner Losh key.ch = ch; 620*f32a6403SWarner Losh key.state = 0; /* irrelevant */ 621*f32a6403SWarner Losh item = (gtte *) bsearch(& key, f->gototab[state].entries, 622*f32a6403SWarner Losh f->gototab[state].inuse, sizeof(gtte), 623*f32a6403SWarner Losh entry_cmp); 624*f32a6403SWarner Losh 625*f32a6403SWarner Losh if (item == NULL) 626*f32a6403SWarner Losh return 0; 627*f32a6403SWarner Losh else 628*f32a6403SWarner Losh return item->state; 629*f32a6403SWarner Losh } 630*f32a6403SWarner Losh 631*f32a6403SWarner Losh static int entry_cmp(const void *l, const void *r) 632*f32a6403SWarner Losh { 633*f32a6403SWarner Losh const gtte *left, *right; 634*f32a6403SWarner Losh 635*f32a6403SWarner Losh left = (const gtte *) l; 636*f32a6403SWarner Losh right = (const gtte *) r; 637*f32a6403SWarner Losh 638*f32a6403SWarner Losh return left->ch - right->ch; 639*f32a6403SWarner Losh } 640*f32a6403SWarner Losh 641*f32a6403SWarner Losh static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */ 642*f32a6403SWarner Losh { 643*f32a6403SWarner Losh if (f->gototab[state].inuse == 0) { 644*f32a6403SWarner Losh f->gototab[state].entries[0].ch = ch; 645*f32a6403SWarner Losh f->gototab[state].entries[0].state = val; 646*f32a6403SWarner Losh f->gototab[state].inuse++; 647*f32a6403SWarner Losh return val; 648*f32a6403SWarner Losh } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) { 649*f32a6403SWarner Losh // not seen yet, insert and return 650*f32a6403SWarner Losh gtt *tab = & f->gototab[state]; 651*f32a6403SWarner Losh if (tab->inuse + 1 >= tab->allocated) 652*f32a6403SWarner Losh resize_gototab(f, state); 653*f32a6403SWarner Losh 654*f32a6403SWarner Losh f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch; 655*f32a6403SWarner Losh f->gototab[state].entries[f->gototab[state].inuse-1].state = val; 656*f32a6403SWarner Losh f->gototab[state].inuse++; 657*f32a6403SWarner Losh return val; 658*f32a6403SWarner Losh } else { 659*f32a6403SWarner Losh // maybe we have it, maybe we don't 660*f32a6403SWarner Losh gtte key; 661*f32a6403SWarner Losh gtte *item; 662*f32a6403SWarner Losh 663*f32a6403SWarner Losh key.ch = ch; 664*f32a6403SWarner Losh key.state = 0; /* irrelevant */ 665*f32a6403SWarner Losh item = (gtte *) bsearch(& key, f->gototab[state].entries, 666*f32a6403SWarner Losh f->gototab[state].inuse, sizeof(gtte), 667*f32a6403SWarner Losh entry_cmp); 668*f32a6403SWarner Losh 669*f32a6403SWarner Losh if (item != NULL) { 670*f32a6403SWarner Losh // we have it, update state and return 671*f32a6403SWarner Losh item->state = val; 672*f32a6403SWarner Losh return item->state; 673*f32a6403SWarner Losh } 674*f32a6403SWarner Losh // otherwise, fall through to insert and reallocate. 675*f32a6403SWarner Losh } 676*f32a6403SWarner Losh 677*f32a6403SWarner Losh gtt *tab = & f->gototab[state]; 678*f32a6403SWarner Losh if (tab->inuse + 1 >= tab->allocated) 679*f32a6403SWarner Losh resize_gototab(f, state); 680*f32a6403SWarner Losh ++tab->inuse; 681*f32a6403SWarner Losh f->gototab[state].entries[tab->inuse].ch = ch; 682*f32a6403SWarner Losh f->gototab[state].entries[tab->inuse].state = val; 683*f32a6403SWarner Losh 684*f32a6403SWarner Losh qsort(f->gototab[state].entries, 685*f32a6403SWarner Losh f->gototab[state].inuse, sizeof(gtte), entry_cmp); 686*f32a6403SWarner Losh 687*f32a6403SWarner Losh return val; /* not used anywhere at the moment */ 688*f32a6403SWarner Losh } 689*f32a6403SWarner Losh 690*f32a6403SWarner Losh static void clear_gototab(fa *f, int state) 691*f32a6403SWarner Losh { 692*f32a6403SWarner Losh memset(f->gototab[state].entries, 0, 693*f32a6403SWarner Losh f->gototab[state].allocated * sizeof(gtte)); 694*f32a6403SWarner Losh f->gototab[state].inuse = 0; 695*f32a6403SWarner Losh } 696*f32a6403SWarner Losh 697813da98dSDavid E. O'Brien int match(fa *f, const char *p0) /* shortest match ? */ 6982a55deb1SDavid E. O'Brien { 6992a55deb1SDavid E. O'Brien int s, ns; 700*f32a6403SWarner Losh int n; 701*f32a6403SWarner Losh int rune; 702f39dd6a9SWarner Losh const uschar *p = (const uschar *) p0; 7032a55deb1SDavid E. O'Brien 704*f32a6403SWarner Losh /* return pmatch(f, p0); does it matter whether longest or shortest? */ 705*f32a6403SWarner Losh 706f39dd6a9SWarner Losh s = f->initstat; 707f39dd6a9SWarner Losh assert (s < f->state_count); 708f39dd6a9SWarner Losh 7092a55deb1SDavid E. O'Brien if (f->out[s]) 7102a55deb1SDavid E. O'Brien return(1); 7112a55deb1SDavid E. O'Brien do { 712addad6afSRong-En Fan /* assert(*p < NCHARS); */ 713*f32a6403SWarner Losh n = u8_rune(&rune, (const char *) p); 714*f32a6403SWarner Losh if ((ns = get_gototab(f, s, rune)) != 0) 7152a55deb1SDavid E. O'Brien s = ns; 7162a55deb1SDavid E. O'Brien else 717*f32a6403SWarner Losh s = cgoto(f, s, rune); 7182a55deb1SDavid E. O'Brien if (f->out[s]) 7192a55deb1SDavid E. O'Brien return(1); 720*f32a6403SWarner Losh if (*p == 0) 721*f32a6403SWarner Losh break; 722*f32a6403SWarner Losh p += n; 723*f32a6403SWarner Losh } while (1); /* was *p++ != 0 */ 7242a55deb1SDavid E. O'Brien return(0); 7252a55deb1SDavid E. O'Brien } 7262a55deb1SDavid E. O'Brien 727813da98dSDavid E. O'Brien int pmatch(fa *f, const char *p0) /* longest match, for sub */ 7282a55deb1SDavid E. O'Brien { 7292a55deb1SDavid E. O'Brien int s, ns; 730*f32a6403SWarner Losh int n; 731*f32a6403SWarner Losh int rune; 732f39dd6a9SWarner Losh const uschar *p = (const uschar *) p0; 733f39dd6a9SWarner Losh const uschar *q; 7342a55deb1SDavid E. O'Brien 73562ebc626SRuslan Ermilov s = f->initstat; 736f39dd6a9SWarner Losh assert(s < f->state_count); 737f39dd6a9SWarner Losh 738f39dd6a9SWarner Losh patbeg = (const char *)p; 7392a55deb1SDavid E. O'Brien patlen = -1; 7402a55deb1SDavid E. O'Brien do { 7412a55deb1SDavid E. O'Brien q = p; 7422a55deb1SDavid E. O'Brien do { 7432a55deb1SDavid E. O'Brien if (f->out[s]) /* final state */ 7442a55deb1SDavid E. O'Brien patlen = q-p; 745addad6afSRong-En Fan /* assert(*q < NCHARS); */ 746*f32a6403SWarner Losh n = u8_rune(&rune, (const char *) q); 747*f32a6403SWarner Losh if ((ns = get_gototab(f, s, rune)) != 0) 7482a55deb1SDavid E. O'Brien s = ns; 7492a55deb1SDavid E. O'Brien else 750*f32a6403SWarner Losh s = cgoto(f, s, rune); 751f39dd6a9SWarner Losh 752f39dd6a9SWarner Losh assert(s < f->state_count); 753f39dd6a9SWarner Losh 7542a55deb1SDavid E. O'Brien if (s == 1) { /* no transition */ 7552a55deb1SDavid E. O'Brien if (patlen >= 0) { 756f39dd6a9SWarner Losh patbeg = (const char *) p; 7572a55deb1SDavid E. O'Brien return(1); 7582a55deb1SDavid E. O'Brien } 7592a55deb1SDavid E. O'Brien else 7602a55deb1SDavid E. O'Brien goto nextin; /* no match */ 7612a55deb1SDavid E. O'Brien } 762*f32a6403SWarner Losh if (*q == 0) 763*f32a6403SWarner Losh break; 764*f32a6403SWarner Losh q += n; 765*f32a6403SWarner Losh } while (1); 766*f32a6403SWarner Losh q++; /* was *q++ */ 7672a55deb1SDavid E. O'Brien if (f->out[s]) 7682a55deb1SDavid E. O'Brien patlen = q-p-1; /* don't count $ */ 7692a55deb1SDavid E. O'Brien if (patlen >= 0) { 770f39dd6a9SWarner Losh patbeg = (const char *) p; 7712a55deb1SDavid E. O'Brien return(1); 7722a55deb1SDavid E. O'Brien } 7732a55deb1SDavid E. O'Brien nextin: 7742a55deb1SDavid E. O'Brien s = 2; 775*f32a6403SWarner Losh if (*p == 0) 776*f32a6403SWarner Losh break; 777*f32a6403SWarner Losh n = u8_rune(&rune, (const char *) p); 778*f32a6403SWarner Losh p += n; 779*f32a6403SWarner Losh } while (1); /* was *p++ */ 7802a55deb1SDavid E. O'Brien return (0); 7812a55deb1SDavid E. O'Brien } 7822a55deb1SDavid E. O'Brien 783813da98dSDavid E. O'Brien int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 7842a55deb1SDavid E. O'Brien { 7852a55deb1SDavid E. O'Brien int s, ns; 786*f32a6403SWarner Losh int n; 787*f32a6403SWarner Losh int rune; 788f39dd6a9SWarner Losh const uschar *p = (const uschar *) p0; 789f39dd6a9SWarner Losh const uschar *q; 7902a55deb1SDavid E. O'Brien 79162ebc626SRuslan Ermilov s = f->initstat; 792f39dd6a9SWarner Losh assert(s < f->state_count); 793f39dd6a9SWarner Losh 794f39dd6a9SWarner Losh patbeg = (const char *)p; 7952a55deb1SDavid E. O'Brien patlen = -1; 7962a55deb1SDavid E. O'Brien while (*p) { 7972a55deb1SDavid E. O'Brien q = p; 7982a55deb1SDavid E. O'Brien do { 7992a55deb1SDavid E. O'Brien if (f->out[s]) /* final state */ 8002a55deb1SDavid E. O'Brien patlen = q-p; 801addad6afSRong-En Fan /* assert(*q < NCHARS); */ 802*f32a6403SWarner Losh n = u8_rune(&rune, (const char *) q); 803*f32a6403SWarner Losh if ((ns = get_gototab(f, s, rune)) != 0) 8042a55deb1SDavid E. O'Brien s = ns; 8052a55deb1SDavid E. O'Brien else 806*f32a6403SWarner Losh s = cgoto(f, s, rune); 8072a55deb1SDavid E. O'Brien if (s == 1) { /* no transition */ 8082a55deb1SDavid E. O'Brien if (patlen > 0) { 809f39dd6a9SWarner Losh patbeg = (const char *) p; 8102a55deb1SDavid E. O'Brien return(1); 8112a55deb1SDavid E. O'Brien } else 8122a55deb1SDavid E. O'Brien goto nnextin; /* no nonempty match */ 8132a55deb1SDavid E. O'Brien } 814*f32a6403SWarner Losh if (*q == 0) 815*f32a6403SWarner Losh break; 816*f32a6403SWarner Losh q += n; 817*f32a6403SWarner Losh } while (1); 818*f32a6403SWarner Losh q++; 8192a55deb1SDavid E. O'Brien if (f->out[s]) 8202a55deb1SDavid E. O'Brien patlen = q-p-1; /* don't count $ */ 8212a55deb1SDavid E. O'Brien if (patlen > 0 ) { 822f39dd6a9SWarner Losh patbeg = (const char *) p; 8232a55deb1SDavid E. O'Brien return(1); 8242a55deb1SDavid E. O'Brien } 8252a55deb1SDavid E. O'Brien nnextin: 8262a55deb1SDavid E. O'Brien s = 2; 8272a55deb1SDavid E. O'Brien p++; 8282a55deb1SDavid E. O'Brien } 8292a55deb1SDavid E. O'Brien return (0); 8302a55deb1SDavid E. O'Brien } 8312a55deb1SDavid E. O'Brien 832f39dd6a9SWarner Losh 833*f32a6403SWarner Losh #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long 834*f32a6403SWarner Losh 835f39dd6a9SWarner Losh /* 836f39dd6a9SWarner Losh * NAME 837f39dd6a9SWarner Losh * fnematch 838f39dd6a9SWarner Losh * 839f39dd6a9SWarner Losh * DESCRIPTION 840f39dd6a9SWarner Losh * A stream-fed version of nematch which transfers characters to a 841f39dd6a9SWarner Losh * null-terminated buffer. All characters up to and including the last 842f39dd6a9SWarner Losh * character of the matching text or EOF are placed in the buffer. If 843f39dd6a9SWarner Losh * a match is found, patbeg and patlen are set appropriately. 844f39dd6a9SWarner Losh * 845f39dd6a9SWarner Losh * RETURN VALUES 846f39dd6a9SWarner Losh * false No match found. 847f39dd6a9SWarner Losh * true Match found. 848f39dd6a9SWarner Losh */ 849f39dd6a9SWarner Losh 850f39dd6a9SWarner Losh bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) 851f39dd6a9SWarner Losh { 852*f32a6403SWarner Losh char *i, *j, *k, *buf = *pbuf; 853f39dd6a9SWarner Losh int bufsize = *pbufsize; 854*f32a6403SWarner Losh int c, n, ns, s; 855f39dd6a9SWarner Losh 856f39dd6a9SWarner Losh s = pfa->initstat; 857f39dd6a9SWarner Losh patlen = 0; 858f39dd6a9SWarner Losh 859f39dd6a9SWarner Losh /* 860*f32a6403SWarner Losh * buf <= i <= j <= k <= buf+bufsize 861f39dd6a9SWarner Losh * 862b2376a5fSWarner Losh * i: origin of active substring 863b2376a5fSWarner Losh * j: current character 864*f32a6403SWarner Losh * k: destination of the next getc 865f39dd6a9SWarner Losh */ 866f39dd6a9SWarner Losh 867*f32a6403SWarner Losh i = j = k = buf; 868*f32a6403SWarner Losh 869*f32a6403SWarner Losh do { 870*f32a6403SWarner Losh /* 871*f32a6403SWarner Losh * Call u8_rune with at least MAX_UTF_BYTES ahead in 872*f32a6403SWarner Losh * the buffer until EOF interferes. 873*f32a6403SWarner Losh */ 874*f32a6403SWarner Losh if (k - j < MAX_UTF_BYTES) { 875*f32a6403SWarner Losh if (k + MAX_UTF_BYTES > buf + bufsize) { 876*f32a6403SWarner Losh adjbuf((char **) &buf, &bufsize, 877*f32a6403SWarner Losh bufsize + MAX_UTF_BYTES, 878*f32a6403SWarner Losh quantum, 0, "fnematch"); 879*f32a6403SWarner Losh } 880*f32a6403SWarner Losh for (n = MAX_UTF_BYTES ; n > 0; n--) { 881*f32a6403SWarner Losh *k++ = (c = getc(f)) != EOF ? c : 0; 882*f32a6403SWarner Losh if (c == EOF) { 883*f32a6403SWarner Losh if (ferror(f)) 884*f32a6403SWarner Losh FATAL("fnematch: getc error"); 885*f32a6403SWarner Losh break; 886*f32a6403SWarner Losh } 887*f32a6403SWarner Losh } 888*f32a6403SWarner Losh } 889*f32a6403SWarner Losh 890*f32a6403SWarner Losh j += u8_rune(&c, j); 891*f32a6403SWarner Losh 892*f32a6403SWarner Losh if ((ns = get_gototab(pfa, s, c)) != 0) 893f39dd6a9SWarner Losh s = ns; 894f39dd6a9SWarner Losh else 895b2376a5fSWarner Losh s = cgoto(pfa, s, c); 896f39dd6a9SWarner Losh 897f39dd6a9SWarner Losh if (pfa->out[s]) { /* final state */ 898*f32a6403SWarner Losh patbeg = i; 899*f32a6403SWarner Losh patlen = j - i; 900b2376a5fSWarner Losh if (c == 0) /* don't count $ */ 901f39dd6a9SWarner Losh patlen--; 902f39dd6a9SWarner Losh } 903*f32a6403SWarner Losh 904*f32a6403SWarner Losh if (c && s != 1) 905*f32a6403SWarner Losh continue; /* origin i still viable, next j */ 906*f32a6403SWarner Losh if (patlen) 907*f32a6403SWarner Losh break; /* best match found */ 908*f32a6403SWarner Losh 909*f32a6403SWarner Losh /* no match at origin i, next i and start over */ 910*f32a6403SWarner Losh i += u8_rune(&c, i); 911*f32a6403SWarner Losh if (c == 0) 912*f32a6403SWarner Losh break; /* no match */ 913*f32a6403SWarner Losh j = i; 914f39dd6a9SWarner Losh s = 2; 915*f32a6403SWarner Losh } while (1); 916f39dd6a9SWarner Losh 917f39dd6a9SWarner Losh /* adjbuf() may have relocated a resized buffer. Inform the world. */ 918f39dd6a9SWarner Losh *pbuf = buf; 919f39dd6a9SWarner Losh *pbufsize = bufsize; 920f39dd6a9SWarner Losh 921f39dd6a9SWarner Losh if (patlen) { 922f39dd6a9SWarner Losh /* 923f39dd6a9SWarner Losh * Under no circumstances is the last character fed to 924f39dd6a9SWarner Losh * the automaton part of the match. It is EOF's nullbyte, 925f39dd6a9SWarner Losh * or it sent the automaton into a state with no further 926f39dd6a9SWarner Losh * transitions available (s==1), or both. Room for a 927f39dd6a9SWarner Losh * terminating nullbyte is guaranteed. 928f39dd6a9SWarner Losh * 929f39dd6a9SWarner Losh * ungetc any chars after the end of matching text 930f39dd6a9SWarner Losh * (except for EOF's nullbyte, if present) and null 931f39dd6a9SWarner Losh * terminate the buffer. 932f39dd6a9SWarner Losh */ 933f39dd6a9SWarner Losh do 934*f32a6403SWarner Losh if (*--k && ungetc(*k, f) == EOF) 935*f32a6403SWarner Losh FATAL("unable to ungetc '%c'", *k); 936*f32a6403SWarner Losh while (k > patbeg + patlen); 937*f32a6403SWarner Losh *k = '\0'; 938f39dd6a9SWarner Losh return true; 939f39dd6a9SWarner Losh } 940f39dd6a9SWarner Losh else 941f39dd6a9SWarner Losh return false; 942f39dd6a9SWarner Losh } 943f39dd6a9SWarner Losh 944813da98dSDavid E. O'Brien Node *reparse(const char *p) /* parses regular expression pointed to by p */ 9452a55deb1SDavid E. O'Brien { /* uses relex() to scan regular expression */ 9462a55deb1SDavid E. O'Brien Node *np; 9472a55deb1SDavid E. O'Brien 948f39dd6a9SWarner Losh DPRINTF("reparse <%s>\n", p); 949f39dd6a9SWarner Losh lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 9502a55deb1SDavid E. O'Brien rtok = relex(); 951813da98dSDavid E. O'Brien /* GNU compatibility: an empty regexp matches anything */ 952addad6afSRong-En Fan if (rtok == '\0') { 953813da98dSDavid E. O'Brien /* FATAL("empty regular expression"); previous */ 954addad6afSRong-En Fan return(op2(EMPTYRE, NIL, NIL)); 955addad6afSRong-En Fan } 9562a55deb1SDavid E. O'Brien np = regexp(); 9572a55deb1SDavid E. O'Brien if (rtok != '\0') 9582a55deb1SDavid E. O'Brien FATAL("syntax error in regular expression %s at %s", lastre, prestr); 9592a55deb1SDavid E. O'Brien return(np); 9602a55deb1SDavid E. O'Brien } 9612a55deb1SDavid E. O'Brien 9622a55deb1SDavid E. O'Brien Node *regexp(void) /* top-level parse of reg expr */ 9632a55deb1SDavid E. O'Brien { 9642a55deb1SDavid E. O'Brien return (alt(concat(primary()))); 9652a55deb1SDavid E. O'Brien } 9662a55deb1SDavid E. O'Brien 9672a55deb1SDavid E. O'Brien Node *primary(void) 9682a55deb1SDavid E. O'Brien { 9692a55deb1SDavid E. O'Brien Node *np; 970b5253557SWarner Losh int savelastatom; 9712a55deb1SDavid E. O'Brien 9722a55deb1SDavid E. O'Brien switch (rtok) { 9732a55deb1SDavid E. O'Brien case CHAR: 974b5253557SWarner Losh lastatom = starttok; 9752a55deb1SDavid E. O'Brien np = op2(CHAR, NIL, itonp(rlxval)); 9762a55deb1SDavid E. O'Brien rtok = relex(); 9772a55deb1SDavid E. O'Brien return (unary(np)); 9782a55deb1SDavid E. O'Brien case ALL: 9792a55deb1SDavid E. O'Brien rtok = relex(); 9802a55deb1SDavid E. O'Brien return (unary(op2(ALL, NIL, NIL))); 981addad6afSRong-En Fan case EMPTYRE: 982addad6afSRong-En Fan rtok = relex(); 983b5253557SWarner Losh return (unary(op2(EMPTYRE, NIL, NIL))); 9842a55deb1SDavid E. O'Brien case DOT: 985b5253557SWarner Losh lastatom = starttok; 9862a55deb1SDavid E. O'Brien rtok = relex(); 9872a55deb1SDavid E. O'Brien return (unary(op2(DOT, NIL, NIL))); 9882a55deb1SDavid E. O'Brien case CCL: 989f39dd6a9SWarner Losh np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 990b5253557SWarner Losh lastatom = starttok; 9912a55deb1SDavid E. O'Brien rtok = relex(); 9922a55deb1SDavid E. O'Brien return (unary(np)); 9932a55deb1SDavid E. O'Brien case NCCL: 994f39dd6a9SWarner Losh np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 995b5253557SWarner Losh lastatom = starttok; 9962a55deb1SDavid E. O'Brien rtok = relex(); 9972a55deb1SDavid E. O'Brien return (unary(np)); 9982a55deb1SDavid E. O'Brien case '^': 9992a55deb1SDavid E. O'Brien rtok = relex(); 10002a55deb1SDavid E. O'Brien return (unary(op2(CHAR, NIL, itonp(HAT)))); 10012a55deb1SDavid E. O'Brien case '$': 10022a55deb1SDavid E. O'Brien rtok = relex(); 10032a55deb1SDavid E. O'Brien return (unary(op2(CHAR, NIL, NIL))); 10042a55deb1SDavid E. O'Brien case '(': 1005b5253557SWarner Losh lastatom = starttok; 1006b5253557SWarner Losh savelastatom = starttok - basestr; /* Retain over recursion */ 10072a55deb1SDavid E. O'Brien rtok = relex(); 10082a55deb1SDavid E. O'Brien if (rtok == ')') { /* special pleading for () */ 10092a55deb1SDavid E. O'Brien rtok = relex(); 1010*f32a6403SWarner Losh return unary(op2(CCL, NIL, (Node *) cclenter(""))); 10112a55deb1SDavid E. O'Brien } 10122a55deb1SDavid E. O'Brien np = regexp(); 10132a55deb1SDavid E. O'Brien if (rtok == ')') { 1014b5253557SWarner Losh lastatom = basestr + savelastatom; /* Restore */ 10152a55deb1SDavid E. O'Brien rtok = relex(); 10162a55deb1SDavid E. O'Brien return (unary(np)); 10172a55deb1SDavid E. O'Brien } 10182a55deb1SDavid E. O'Brien else 10192a55deb1SDavid E. O'Brien FATAL("syntax error in regular expression %s at %s", lastre, prestr); 10202a55deb1SDavid E. O'Brien default: 10212a55deb1SDavid E. O'Brien FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 10222a55deb1SDavid E. O'Brien } 10232a55deb1SDavid E. O'Brien return 0; /*NOTREACHED*/ 10242a55deb1SDavid E. O'Brien } 10252a55deb1SDavid E. O'Brien 10262a55deb1SDavid E. O'Brien Node *concat(Node *np) 10272a55deb1SDavid E. O'Brien { 10282a55deb1SDavid E. O'Brien switch (rtok) { 1029b5253557SWarner Losh case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 10302a55deb1SDavid E. O'Brien return (concat(op2(CAT, np, primary()))); 1031b5253557SWarner Losh case EMPTYRE: 1032b5253557SWarner Losh rtok = relex(); 1033*f32a6403SWarner Losh return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")), 1034b5253557SWarner Losh primary()))); 10352a55deb1SDavid E. O'Brien } 10362a55deb1SDavid E. O'Brien return (np); 10372a55deb1SDavid E. O'Brien } 10382a55deb1SDavid E. O'Brien 10392a55deb1SDavid E. O'Brien Node *alt(Node *np) 10402a55deb1SDavid E. O'Brien { 10412a55deb1SDavid E. O'Brien if (rtok == OR) { 10422a55deb1SDavid E. O'Brien rtok = relex(); 10432a55deb1SDavid E. O'Brien return (alt(op2(OR, np, concat(primary())))); 10442a55deb1SDavid E. O'Brien } 10452a55deb1SDavid E. O'Brien return (np); 10462a55deb1SDavid E. O'Brien } 10472a55deb1SDavid E. O'Brien 10482a55deb1SDavid E. O'Brien Node *unary(Node *np) 10492a55deb1SDavid E. O'Brien { 10502a55deb1SDavid E. O'Brien switch (rtok) { 10512a55deb1SDavid E. O'Brien case STAR: 10522a55deb1SDavid E. O'Brien rtok = relex(); 10532a55deb1SDavid E. O'Brien return (unary(op2(STAR, np, NIL))); 10542a55deb1SDavid E. O'Brien case PLUS: 10552a55deb1SDavid E. O'Brien rtok = relex(); 10562a55deb1SDavid E. O'Brien return (unary(op2(PLUS, np, NIL))); 10572a55deb1SDavid E. O'Brien case QUEST: 10582a55deb1SDavid E. O'Brien rtok = relex(); 10592a55deb1SDavid E. O'Brien return (unary(op2(QUEST, np, NIL))); 1060f39dd6a9SWarner Losh case ZERO: 1061f39dd6a9SWarner Losh rtok = relex(); 1062f39dd6a9SWarner Losh return (unary(op2(ZERO, np, NIL))); 10632a55deb1SDavid E. O'Brien default: 10642a55deb1SDavid E. O'Brien return (np); 10652a55deb1SDavid E. O'Brien } 10662a55deb1SDavid E. O'Brien } 10672a55deb1SDavid E. O'Brien 1068007c6572SDag-Erling Smørgrav /* 1069007c6572SDag-Erling Smørgrav * Character class definitions conformant to the POSIX locale as 1070007c6572SDag-Erling Smørgrav * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 1071007c6572SDag-Erling Smørgrav * and operating character sets are both ASCII (ISO646) or supersets 1072007c6572SDag-Erling Smørgrav * thereof. 1073007c6572SDag-Erling Smørgrav * 1074007c6572SDag-Erling Smørgrav * Note that to avoid overflowing the temporary buffer used in 1075007c6572SDag-Erling Smørgrav * relex(), the expanded character class (prior to range expansion) 1076007c6572SDag-Erling Smørgrav * must be less than twice the size of their full name. 1077007c6572SDag-Erling Smørgrav */ 1078fc6b1dfeSDavid E. O'Brien 1079fc6b1dfeSDavid E. O'Brien /* Because isblank doesn't show up in any of the header files on any 1080fc6b1dfeSDavid E. O'Brien * system i use, it's defined here. if some other locale has a richer 1081fc6b1dfeSDavid E. O'Brien * definition of "blank", define HAS_ISBLANK and provide your own 1082fc6b1dfeSDavid E. O'Brien * version. 108388b8d487SRuslan Ermilov * the parentheses here are an attempt to find a path through the maze 108488b8d487SRuslan Ermilov * of macro definition and/or function and/or version provided. thanks 108588b8d487SRuslan Ermilov * to nelson beebe for the suggestion; let's see if it works everywhere. 1086fc6b1dfeSDavid E. O'Brien */ 1087fc6b1dfeSDavid E. O'Brien 108891217c1cSRuslan Ermilov /* #define HAS_ISBLANK */ 1089fc6b1dfeSDavid E. O'Brien #ifndef HAS_ISBLANK 1090fc6b1dfeSDavid E. O'Brien 10911b11b783SRuslan Ermilov int (xisblank)(int c) 1092fc6b1dfeSDavid E. O'Brien { 1093fc6b1dfeSDavid E. O'Brien return c==' ' || c=='\t'; 1094fc6b1dfeSDavid E. O'Brien } 1095fc6b1dfeSDavid E. O'Brien 1096fc6b1dfeSDavid E. O'Brien #endif 1097fc6b1dfeSDavid E. O'Brien 1098f39dd6a9SWarner Losh static const struct charclass { 1099007c6572SDag-Erling Smørgrav const char *cc_name; 1100007c6572SDag-Erling Smørgrav int cc_namelen; 1101fc6b1dfeSDavid E. O'Brien int (*cc_func)(int); 1102007c6572SDag-Erling Smørgrav } charclasses[] = { 1103fc6b1dfeSDavid E. O'Brien { "alnum", 5, isalnum }, 1104fc6b1dfeSDavid E. O'Brien { "alpha", 5, isalpha }, 11051b11b783SRuslan Ermilov #ifndef HAS_ISBLANK 1106b5253557SWarner Losh { "blank", 5, xisblank }, 11071b11b783SRuslan Ermilov #else 1108fc6b1dfeSDavid E. O'Brien { "blank", 5, isblank }, 11091b11b783SRuslan Ermilov #endif 1110fc6b1dfeSDavid E. O'Brien { "cntrl", 5, iscntrl }, 1111fc6b1dfeSDavid E. O'Brien { "digit", 5, isdigit }, 1112fc6b1dfeSDavid E. O'Brien { "graph", 5, isgraph }, 1113fc6b1dfeSDavid E. O'Brien { "lower", 5, islower }, 1114fc6b1dfeSDavid E. O'Brien { "print", 5, isprint }, 1115fc6b1dfeSDavid E. O'Brien { "punct", 5, ispunct }, 1116fc6b1dfeSDavid E. O'Brien { "space", 5, isspace }, 1117fc6b1dfeSDavid E. O'Brien { "upper", 5, isupper }, 1118fc6b1dfeSDavid E. O'Brien { "xdigit", 6, isxdigit }, 1119007c6572SDag-Erling Smørgrav { NULL, 0, NULL }, 1120007c6572SDag-Erling Smørgrav }; 1121007c6572SDag-Erling Smørgrav 1122b5253557SWarner Losh #define REPEAT_SIMPLE 0 1123b5253557SWarner Losh #define REPEAT_PLUS_APPENDED 1 1124b5253557SWarner Losh #define REPEAT_WITH_Q 2 1125b5253557SWarner Losh #define REPEAT_ZERO 3 1126b5253557SWarner Losh 1127b5253557SWarner Losh static int 1128b5253557SWarner Losh replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1129b5253557SWarner Losh int atomlen, int firstnum, int secondnum, int special_case) 1130b5253557SWarner Losh { 1131b5253557SWarner Losh int i, j; 1132b5253557SWarner Losh uschar *buf = 0; 1133b5253557SWarner Losh int ret = 1; 1134b5253557SWarner Losh int init_q = (firstnum == 0); /* first added char will be ? */ 1135b5253557SWarner Losh int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 1136b5253557SWarner Losh int prefix_length = reptok - basestr; /* prefix includes first rep */ 1137f39dd6a9SWarner Losh int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */ 1138b5253557SWarner Losh int size = prefix_length + suffix_length; 1139b5253557SWarner Losh 1140b5253557SWarner Losh if (firstnum > 1) { /* add room for reps 2 through firstnum */ 1141b5253557SWarner Losh size += atomlen*(firstnum-1); 1142b5253557SWarner Losh } 1143b5253557SWarner Losh 1144b5253557SWarner Losh /* Adjust size of buffer for special cases */ 1145b5253557SWarner Losh if (special_case == REPEAT_PLUS_APPENDED) { 1146b5253557SWarner Losh size++; /* for the final + */ 1147b5253557SWarner Losh } else if (special_case == REPEAT_WITH_Q) { 114823f24377SWarner Losh size += init_q + (atomlen+1)* (n_q_reps-init_q); 1149b5253557SWarner Losh } else if (special_case == REPEAT_ZERO) { 1150b5253557SWarner Losh size += 2; /* just a null ERE: () */ 1151b5253557SWarner Losh } 1152b5253557SWarner Losh if ((buf = (uschar *) malloc(size + 1)) == NULL) 1153b5253557SWarner Losh FATAL("out of space in reg expr %.10s..", lastre); 1154b5253557SWarner Losh memcpy(buf, basestr, prefix_length); /* copy prefix */ 1155b5253557SWarner Losh j = prefix_length; 1156b5253557SWarner Losh if (special_case == REPEAT_ZERO) { 1157b5253557SWarner Losh j -= atomlen; 1158b5253557SWarner Losh buf[j++] = '('; 1159b5253557SWarner Losh buf[j++] = ')'; 1160b5253557SWarner Losh } 1161b5253557SWarner Losh for (i = 1; i < firstnum; i++) { /* copy x reps */ 1162b5253557SWarner Losh memcpy(&buf[j], atom, atomlen); 1163b5253557SWarner Losh j += atomlen; 1164b5253557SWarner Losh } 1165b5253557SWarner Losh if (special_case == REPEAT_PLUS_APPENDED) { 1166b5253557SWarner Losh buf[j++] = '+'; 1167b5253557SWarner Losh } else if (special_case == REPEAT_WITH_Q) { 1168f39dd6a9SWarner Losh if (init_q) 1169f39dd6a9SWarner Losh buf[j++] = '?'; 1170f39dd6a9SWarner Losh for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */ 1171b5253557SWarner Losh memcpy(&buf[j], atom, atomlen); 1172b5253557SWarner Losh j += atomlen; 1173b5253557SWarner Losh buf[j++] = '?'; 1174b5253557SWarner Losh } 1175b5253557SWarner Losh } 1176b5253557SWarner Losh memcpy(&buf[j], reptok+reptoklen, suffix_length); 117723f24377SWarner Losh j += suffix_length; 117823f24377SWarner Losh buf[j] = '\0'; 1179b5253557SWarner Losh /* free old basestr */ 1180b5253557SWarner Losh if (firstbasestr != basestr) { 1181b5253557SWarner Losh if (basestr) 1182b5253557SWarner Losh xfree(basestr); 1183b5253557SWarner Losh } 1184b5253557SWarner Losh basestr = buf; 1185b5253557SWarner Losh prestr = buf + prefix_length; 1186b5253557SWarner Losh if (special_case == REPEAT_ZERO) { 1187b5253557SWarner Losh prestr -= atomlen; 1188b5253557SWarner Losh ret++; 1189b5253557SWarner Losh } 1190b5253557SWarner Losh return ret; 1191b5253557SWarner Losh } 1192b5253557SWarner Losh 1193b5253557SWarner Losh static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1194b5253557SWarner Losh int atomlen, int firstnum, int secondnum) 1195b5253557SWarner Losh { 1196b5253557SWarner Losh /* 1197b5253557SWarner Losh In general, the repetition specifier or "bound" is replaced here 1198b5253557SWarner Losh by an equivalent ERE string, repeating the immediately previous atom 1199b5253557SWarner Losh and appending ? and + as needed. Note that the first copy of the 1200b5253557SWarner Losh atom is left in place, except in the special_case of a zero-repeat 1201b5253557SWarner Losh (i.e., {0}). 1202b5253557SWarner Losh */ 1203b5253557SWarner Losh if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 1204b5253557SWarner Losh if (firstnum < 2) { 1205b5253557SWarner Losh /* 0 or 1: should be handled before you get here */ 1206b5253557SWarner Losh FATAL("internal error"); 1207b5253557SWarner Losh } else { 1208b5253557SWarner Losh return replace_repeat(reptok, reptoklen, atom, atomlen, 1209b5253557SWarner Losh firstnum, secondnum, REPEAT_PLUS_APPENDED); 1210b5253557SWarner Losh } 1211b5253557SWarner Losh } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 1212b5253557SWarner Losh if (firstnum == 0) { /* {0} or {0,0} */ 1213b5253557SWarner Losh /* This case is unusual because the resulting 1214b5253557SWarner Losh replacement string might actually be SMALLER than 1215b5253557SWarner Losh the original ERE */ 1216b5253557SWarner Losh return replace_repeat(reptok, reptoklen, atom, atomlen, 1217b5253557SWarner Losh firstnum, secondnum, REPEAT_ZERO); 1218b5253557SWarner Losh } else { /* (firstnum >= 1) */ 1219b5253557SWarner Losh return replace_repeat(reptok, reptoklen, atom, atomlen, 1220b5253557SWarner Losh firstnum, secondnum, REPEAT_SIMPLE); 1221b5253557SWarner Losh } 1222b5253557SWarner Losh } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 1223b5253557SWarner Losh /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 1224b5253557SWarner Losh return replace_repeat(reptok, reptoklen, atom, atomlen, 1225b5253557SWarner Losh firstnum, secondnum, REPEAT_WITH_Q); 1226b5253557SWarner Losh } else { /* Error - shouldn't be here (n>m) */ 1227b5253557SWarner Losh FATAL("internal error"); 1228b5253557SWarner Losh } 1229b5253557SWarner Losh return 0; 1230b5253557SWarner Losh } 1231007c6572SDag-Erling Smørgrav 12322a55deb1SDavid E. O'Brien int relex(void) /* lexical analyzer for reparse */ 12332a55deb1SDavid E. O'Brien { 12342a55deb1SDavid E. O'Brien int c, n; 12352a55deb1SDavid E. O'Brien int cflag; 123610ce5b99SWarner Losh static uschar *buf = NULL; 12372a55deb1SDavid E. O'Brien static int bufsz = 100; 12382a55deb1SDavid E. O'Brien uschar *bp; 1239f39dd6a9SWarner Losh const struct charclass *cc; 1240fc6b1dfeSDavid E. O'Brien int i; 1241f39dd6a9SWarner Losh int num, m; 1242f39dd6a9SWarner Losh bool commafound, digitfound; 1243b5253557SWarner Losh const uschar *startreptok; 1244f39dd6a9SWarner Losh static int parens = 0; 1245b5253557SWarner Losh 1246b5253557SWarner Losh rescan: 1247b5253557SWarner Losh starttok = prestr; 12482a55deb1SDavid E. O'Brien 1249*f32a6403SWarner Losh if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) { 1250*f32a6403SWarner Losh prestr += n; 1251*f32a6403SWarner Losh starttok = prestr; 1252*f32a6403SWarner Losh return CHAR; 1253*f32a6403SWarner Losh } 1254*f32a6403SWarner Losh 12552a55deb1SDavid E. O'Brien switch (c = *prestr++) { 12562a55deb1SDavid E. O'Brien case '|': return OR; 12572a55deb1SDavid E. O'Brien case '*': return STAR; 12582a55deb1SDavid E. O'Brien case '+': return PLUS; 12592a55deb1SDavid E. O'Brien case '?': return QUEST; 12602a55deb1SDavid E. O'Brien case '.': return DOT; 12612a55deb1SDavid E. O'Brien case '\0': prestr--; return '\0'; 12622a55deb1SDavid E. O'Brien case '^': 12632a55deb1SDavid E. O'Brien case '$': 12642a55deb1SDavid E. O'Brien return c; 1265f39dd6a9SWarner Losh case '(': 1266f39dd6a9SWarner Losh parens++; 1267f39dd6a9SWarner Losh return c; 1268f39dd6a9SWarner Losh case ')': 1269f39dd6a9SWarner Losh if (parens) { 1270f39dd6a9SWarner Losh parens--; 1271f39dd6a9SWarner Losh return c; 1272f39dd6a9SWarner Losh } 1273f39dd6a9SWarner Losh /* unmatched close parenthesis; per POSIX, treat as literal */ 1274f39dd6a9SWarner Losh rlxval = c; 1275f39dd6a9SWarner Losh return CHAR; 12762a55deb1SDavid E. O'Brien case '\\': 1277d86a0988SRuslan Ermilov rlxval = quoted(&prestr); 12782a55deb1SDavid E. O'Brien return CHAR; 12792a55deb1SDavid E. O'Brien default: 12802a55deb1SDavid E. O'Brien rlxval = c; 12812a55deb1SDavid E. O'Brien return CHAR; 12822a55deb1SDavid E. O'Brien case '[': 128310ce5b99SWarner Losh if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 12842a55deb1SDavid E. O'Brien FATAL("out of space in reg expr %.10s..", lastre); 12852a55deb1SDavid E. O'Brien bp = buf; 12862a55deb1SDavid E. O'Brien if (*prestr == '^') { 12872a55deb1SDavid E. O'Brien cflag = 1; 12882a55deb1SDavid E. O'Brien prestr++; 12892a55deb1SDavid E. O'Brien } 12902a55deb1SDavid E. O'Brien else 12912a55deb1SDavid E. O'Brien cflag = 0; 1292*f32a6403SWarner Losh n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */ 1293addad6afSRong-En Fan if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 12942a55deb1SDavid E. O'Brien FATAL("out of space for reg expr %.10s...", lastre); 12952a55deb1SDavid E. O'Brien for (; ; ) { 1296*f32a6403SWarner Losh if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) { 1297*f32a6403SWarner Losh for (i = 0; i < n; i++) 1298*f32a6403SWarner Losh *bp++ = *prestr++; 1299*f32a6403SWarner Losh continue; 1300*f32a6403SWarner Losh } 13012a55deb1SDavid E. O'Brien if ((c = *prestr++) == '\\') { 13022a55deb1SDavid E. O'Brien *bp++ = '\\'; 13032a55deb1SDavid E. O'Brien if ((c = *prestr++) == '\0') 13042a55deb1SDavid E. O'Brien FATAL("nonterminated character class %.20s...", lastre); 13052a55deb1SDavid E. O'Brien *bp++ = c; 13062a55deb1SDavid E. O'Brien /* } else if (c == '\n') { */ 13072a55deb1SDavid E. O'Brien /* FATAL("newline in character class %.20s...", lastre); */ 1308007c6572SDag-Erling Smørgrav } else if (c == '[' && *prestr == ':') { 1309007c6572SDag-Erling Smørgrav /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1310007c6572SDag-Erling Smørgrav for (cc = charclasses; cc->cc_name; cc++) 1311007c6572SDag-Erling Smørgrav if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1312007c6572SDag-Erling Smørgrav break; 1313007c6572SDag-Erling Smørgrav if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1314007c6572SDag-Erling Smørgrav prestr[2 + cc->cc_namelen] == ']') { 1315007c6572SDag-Erling Smørgrav prestr += cc->cc_namelen + 3; 1316b5253557SWarner Losh /* 1317b5253557SWarner Losh * BUG: We begin at 1, instead of 0, since we 1318b5253557SWarner Losh * would otherwise prematurely terminate the 1319b5253557SWarner Losh * string for classes like [[:cntrl:]]. This 1320b5253557SWarner Losh * means that we can't match the NUL character, 1321b5253557SWarner Losh * not without first adapting the entire 1322b5253557SWarner Losh * program to track each string's length. 1323b5253557SWarner Losh */ 1324b5253557SWarner Losh for (i = 1; i <= UCHAR_MAX; i++) { 1325*f32a6403SWarner Losh if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2")) 1326fc6b1dfeSDavid E. O'Brien FATAL("out of space for reg expr %.10s...", lastre); 1327fc6b1dfeSDavid E. O'Brien if (cc->cc_func(i)) { 1328f39dd6a9SWarner Losh /* escape backslash */ 1329f39dd6a9SWarner Losh if (i == '\\') { 1330f39dd6a9SWarner Losh *bp++ = '\\'; 1331f39dd6a9SWarner Losh n++; 1332f39dd6a9SWarner Losh } 1333f39dd6a9SWarner Losh 1334fc6b1dfeSDavid E. O'Brien *bp++ = i; 1335fc6b1dfeSDavid E. O'Brien n++; 1336fc6b1dfeSDavid E. O'Brien } 1337fc6b1dfeSDavid E. O'Brien } 1338007c6572SDag-Erling Smørgrav } else 1339007c6572SDag-Erling Smørgrav *bp++ = c; 1340b5253557SWarner Losh } else if (c == '[' && *prestr == '.') { 1341b5253557SWarner Losh char collate_char; 1342b5253557SWarner Losh prestr++; 1343b5253557SWarner Losh collate_char = *prestr++; 1344b5253557SWarner Losh if (*prestr == '.' && prestr[1] == ']') { 1345b5253557SWarner Losh prestr += 2; 1346b5253557SWarner Losh /* Found it: map via locale TBD: for 1347b5253557SWarner Losh now, simply return this char. This 1348b5253557SWarner Losh is sufficient to pass conformance 1349b5253557SWarner Losh test awk.ex 156 1350b5253557SWarner Losh */ 1351b5253557SWarner Losh if (*prestr == ']') { 1352b5253557SWarner Losh prestr++; 1353b5253557SWarner Losh rlxval = collate_char; 1354b5253557SWarner Losh return CHAR; 1355b5253557SWarner Losh } 1356b5253557SWarner Losh } 1357b5253557SWarner Losh } else if (c == '[' && *prestr == '=') { 1358b5253557SWarner Losh char equiv_char; 1359b5253557SWarner Losh prestr++; 1360b5253557SWarner Losh equiv_char = *prestr++; 1361b5253557SWarner Losh if (*prestr == '=' && prestr[1] == ']') { 1362b5253557SWarner Losh prestr += 2; 1363b5253557SWarner Losh /* Found it: map via locale TBD: for now 1364b5253557SWarner Losh simply return this char. This is 1365b5253557SWarner Losh sufficient to pass conformance test 1366b5253557SWarner Losh awk.ex 156 1367b5253557SWarner Losh */ 1368b5253557SWarner Losh if (*prestr == ']') { 1369b5253557SWarner Losh prestr++; 1370b5253557SWarner Losh rlxval = equiv_char; 1371b5253557SWarner Losh return CHAR; 1372b5253557SWarner Losh } 1373b5253557SWarner Losh } 13742a55deb1SDavid E. O'Brien } else if (c == '\0') { 13752a55deb1SDavid E. O'Brien FATAL("nonterminated character class %.20s", lastre); 13762a55deb1SDavid E. O'Brien } else if (bp == buf) { /* 1st char is special */ 13772a55deb1SDavid E. O'Brien *bp++ = c; 13782a55deb1SDavid E. O'Brien } else if (c == ']') { 13792a55deb1SDavid E. O'Brien *bp++ = 0; 13802a55deb1SDavid E. O'Brien rlxstr = (uschar *) tostring((char *) buf); 13812a55deb1SDavid E. O'Brien if (cflag == 0) 13822a55deb1SDavid E. O'Brien return CCL; 13832a55deb1SDavid E. O'Brien else 13842a55deb1SDavid E. O'Brien return NCCL; 13852a55deb1SDavid E. O'Brien } else 13862a55deb1SDavid E. O'Brien *bp++ = c; 13872a55deb1SDavid E. O'Brien } 1388b5253557SWarner Losh break; 1389b5253557SWarner Losh case '{': 1390*f32a6403SWarner Losh if (isdigit((int) *(prestr))) { 1391b5253557SWarner Losh num = 0; /* Process as a repetition */ 1392b5253557SWarner Losh n = -1; m = -1; 1393f39dd6a9SWarner Losh commafound = false; 1394f39dd6a9SWarner Losh digitfound = false; 1395b5253557SWarner Losh startreptok = prestr-1; 1396b5253557SWarner Losh /* Remember start of previous atom here ? */ 1397b5253557SWarner Losh } else { /* just a { char, not a repetition */ 1398b5253557SWarner Losh rlxval = c; 1399b5253557SWarner Losh return CHAR; 1400b5253557SWarner Losh } 1401b5253557SWarner Losh for (; ; ) { 1402b5253557SWarner Losh if ((c = *prestr++) == '}') { 1403b5253557SWarner Losh if (commafound) { 1404b5253557SWarner Losh if (digitfound) { /* {n,m} */ 1405b5253557SWarner Losh m = num; 1406b5253557SWarner Losh if (m < n) 1407b5253557SWarner Losh FATAL("illegal repetition expression: class %.20s", 1408b5253557SWarner Losh lastre); 1409f39dd6a9SWarner Losh if (n == 0 && m == 1) { 1410b5253557SWarner Losh return QUEST; 1411b5253557SWarner Losh } 1412b5253557SWarner Losh } else { /* {n,} */ 1413f39dd6a9SWarner Losh if (n == 0) 1414f39dd6a9SWarner Losh return STAR; 1415f39dd6a9SWarner Losh else if (n == 1) 1416f39dd6a9SWarner Losh return PLUS; 1417b5253557SWarner Losh } 1418b5253557SWarner Losh } else { 1419b5253557SWarner Losh if (digitfound) { /* {n} same as {n,n} */ 1420b5253557SWarner Losh n = num; 1421b5253557SWarner Losh m = num; 1422b5253557SWarner Losh } else { /* {} */ 1423b5253557SWarner Losh FATAL("illegal repetition expression: class %.20s", 1424b5253557SWarner Losh lastre); 1425b5253557SWarner Losh } 1426b5253557SWarner Losh } 1427b5253557SWarner Losh if (repeat(starttok, prestr-starttok, lastatom, 1428b5253557SWarner Losh startreptok - lastatom, n, m) > 0) { 1429f39dd6a9SWarner Losh if (n == 0 && m == 0) { 1430f39dd6a9SWarner Losh return ZERO; 1431b5253557SWarner Losh } 1432b5253557SWarner Losh /* must rescan input for next token */ 1433b5253557SWarner Losh goto rescan; 1434b5253557SWarner Losh } 1435b5253557SWarner Losh /* Failed to replace: eat up {...} characters 1436b5253557SWarner Losh and treat like just PLUS */ 1437b5253557SWarner Losh return PLUS; 1438b5253557SWarner Losh } else if (c == '\0') { 1439b5253557SWarner Losh FATAL("nonterminated character class %.20s", 1440b5253557SWarner Losh lastre); 1441b5253557SWarner Losh } else if (isdigit(c)) { 1442b5253557SWarner Losh num = 10 * num + c - '0'; 1443f39dd6a9SWarner Losh digitfound = true; 1444b5253557SWarner Losh } else if (c == ',') { 1445b5253557SWarner Losh if (commafound) 1446b5253557SWarner Losh FATAL("illegal repetition expression: class %.20s", 1447b5253557SWarner Losh lastre); 1448b5253557SWarner Losh /* looking for {n,} or {n,m} */ 1449f39dd6a9SWarner Losh commafound = true; 1450b5253557SWarner Losh n = num; 1451f39dd6a9SWarner Losh digitfound = false; /* reset */ 1452b5253557SWarner Losh num = 0; 1453b5253557SWarner Losh } else { 1454b5253557SWarner Losh FATAL("illegal repetition expression: class %.20s", 1455b5253557SWarner Losh lastre); 1456b5253557SWarner Losh } 1457b5253557SWarner Losh } 1458b5253557SWarner Losh break; 14592a55deb1SDavid E. O'Brien } 14602a55deb1SDavid E. O'Brien } 14612a55deb1SDavid E. O'Brien 14622a55deb1SDavid E. O'Brien int cgoto(fa *f, int s, int c) 14632a55deb1SDavid E. O'Brien { 14642a55deb1SDavid E. O'Brien int *p, *q; 1465f39dd6a9SWarner Losh int i, j, k; 14662a55deb1SDavid E. O'Brien 1467*f32a6403SWarner Losh /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */ 14682a55deb1SDavid E. O'Brien while (f->accept >= maxsetvec) { /* guessing here! */ 1469f39dd6a9SWarner Losh resizesetvec(__func__); 14702a55deb1SDavid E. O'Brien } 14712a55deb1SDavid E. O'Brien for (i = 0; i <= f->accept; i++) 14722a55deb1SDavid E. O'Brien setvec[i] = 0; 14732a55deb1SDavid E. O'Brien setcnt = 0; 1474f39dd6a9SWarner Losh resize_state(f, s); 14752a55deb1SDavid E. O'Brien /* compute positions of gototab[s,c] into setvec */ 14762a55deb1SDavid E. O'Brien p = f->posns[s]; 14772a55deb1SDavid E. O'Brien for (i = 1; i <= *p; i++) { 14782a55deb1SDavid E. O'Brien if ((k = f->re[p[i]].ltype) != FINAL) { 14792a55deb1SDavid E. O'Brien if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 14802a55deb1SDavid E. O'Brien || (k == DOT && c != 0 && c != HAT) 14812a55deb1SDavid E. O'Brien || (k == ALL && c != 0) 1482addad6afSRong-En Fan || (k == EMPTYRE && c != 0) 1483*f32a6403SWarner Losh || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp)) 1484*f32a6403SWarner Losh || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) { 14852a55deb1SDavid E. O'Brien q = f->re[p[i]].lfollow; 14862a55deb1SDavid E. O'Brien for (j = 1; j <= *q; j++) { 14872a55deb1SDavid E. O'Brien if (q[j] >= maxsetvec) { 1488f39dd6a9SWarner Losh resizesetvec(__func__); 14892a55deb1SDavid E. O'Brien } 14902a55deb1SDavid E. O'Brien if (setvec[q[j]] == 0) { 14912a55deb1SDavid E. O'Brien setcnt++; 14922a55deb1SDavid E. O'Brien setvec[q[j]] = 1; 14932a55deb1SDavid E. O'Brien } 14942a55deb1SDavid E. O'Brien } 14952a55deb1SDavid E. O'Brien } 14962a55deb1SDavid E. O'Brien } 14972a55deb1SDavid E. O'Brien } 14982a55deb1SDavid E. O'Brien /* determine if setvec is a previous state */ 14992a55deb1SDavid E. O'Brien tmpset[0] = setcnt; 15002a55deb1SDavid E. O'Brien j = 1; 15012a55deb1SDavid E. O'Brien for (i = f->accept; i >= 0; i--) 15022a55deb1SDavid E. O'Brien if (setvec[i]) { 15032a55deb1SDavid E. O'Brien tmpset[j++] = i; 15042a55deb1SDavid E. O'Brien } 1505f39dd6a9SWarner Losh resize_state(f, f->curstat > s ? f->curstat : s); 15062a55deb1SDavid E. O'Brien /* tmpset == previous state? */ 15072a55deb1SDavid E. O'Brien for (i = 1; i <= f->curstat; i++) { 15082a55deb1SDavid E. O'Brien p = f->posns[i]; 15092a55deb1SDavid E. O'Brien if ((k = tmpset[0]) != p[0]) 15102a55deb1SDavid E. O'Brien goto different; 15112a55deb1SDavid E. O'Brien for (j = 1; j <= k; j++) 15122a55deb1SDavid E. O'Brien if (tmpset[j] != p[j]) 15132a55deb1SDavid E. O'Brien goto different; 15142a55deb1SDavid E. O'Brien /* setvec is state i */ 1515f39dd6a9SWarner Losh if (c != HAT) 1516*f32a6403SWarner Losh set_gototab(f, s, c, i); 15172a55deb1SDavid E. O'Brien return i; 15182a55deb1SDavid E. O'Brien different:; 15192a55deb1SDavid E. O'Brien } 15202a55deb1SDavid E. O'Brien 15212a55deb1SDavid E. O'Brien /* add tmpset to current set of states */ 15222a55deb1SDavid E. O'Brien ++(f->curstat); 1523f39dd6a9SWarner Losh resize_state(f, f->curstat); 1524*f32a6403SWarner Losh clear_gototab(f, f->curstat); 15252a55deb1SDavid E. O'Brien xfree(f->posns[f->curstat]); 1526f39dd6a9SWarner Losh p = intalloc(setcnt + 1, __func__); 15272a55deb1SDavid E. O'Brien 15282a55deb1SDavid E. O'Brien f->posns[f->curstat] = p; 1529f39dd6a9SWarner Losh if (c != HAT) 1530*f32a6403SWarner Losh set_gototab(f, s, c, f->curstat); 15312a55deb1SDavid E. O'Brien for (i = 0; i <= setcnt; i++) 15322a55deb1SDavid E. O'Brien p[i] = tmpset[i]; 15332a55deb1SDavid E. O'Brien if (setvec[f->accept]) 15342a55deb1SDavid E. O'Brien f->out[f->curstat] = 1; 15352a55deb1SDavid E. O'Brien else 15362a55deb1SDavid E. O'Brien f->out[f->curstat] = 0; 15372a55deb1SDavid E. O'Brien return f->curstat; 15382a55deb1SDavid E. O'Brien } 15392a55deb1SDavid E. O'Brien 15402a55deb1SDavid E. O'Brien 15412a55deb1SDavid E. O'Brien void freefa(fa *f) /* free a finite automaton */ 15422a55deb1SDavid E. O'Brien { 15432a55deb1SDavid E. O'Brien int i; 15442a55deb1SDavid E. O'Brien 15452a55deb1SDavid E. O'Brien if (f == NULL) 15462a55deb1SDavid E. O'Brien return; 1547f39dd6a9SWarner Losh for (i = 0; i < f->state_count; i++) 1548*f32a6403SWarner Losh xfree(f->gototab[i].entries); 1549*f32a6403SWarner Losh xfree(f->gototab); 15502a55deb1SDavid E. O'Brien for (i = 0; i <= f->curstat; i++) 15512a55deb1SDavid E. O'Brien xfree(f->posns[i]); 15522a55deb1SDavid E. O'Brien for (i = 0; i <= f->accept; i++) { 15532a55deb1SDavid E. O'Brien xfree(f->re[i].lfollow); 15542a55deb1SDavid E. O'Brien if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1555f39dd6a9SWarner Losh xfree(f->re[i].lval.np); 15562a55deb1SDavid E. O'Brien } 15572a55deb1SDavid E. O'Brien xfree(f->restr); 1558f39dd6a9SWarner Losh xfree(f->out); 1559f39dd6a9SWarner Losh xfree(f->posns); 1560f39dd6a9SWarner Losh xfree(f->gototab); 15612a55deb1SDavid E. O'Brien xfree(f); 15622a55deb1SDavid E. O'Brien } 1563