12a55deb1SDavid E. O'Brien /**************************************************************** 22a55deb1SDavid E. O'Brien Copyright (C) Lucent Technologies 1997 32a55deb1SDavid E. O'Brien All Rights Reserved 42a55deb1SDavid E. O'Brien 52a55deb1SDavid E. O'Brien Permission to use, copy, modify, and distribute this software and 62a55deb1SDavid E. O'Brien its documentation for any purpose and without fee is hereby 72a55deb1SDavid E. O'Brien granted, provided that the above copyright notice appear in all 82a55deb1SDavid E. O'Brien copies and that both that the copyright notice and this 92a55deb1SDavid E. O'Brien permission notice and warranty disclaimer appear in supporting 102a55deb1SDavid E. O'Brien documentation, and that the name Lucent Technologies or any of 112a55deb1SDavid E. O'Brien its entities not be used in advertising or publicity pertaining 122a55deb1SDavid E. O'Brien to distribution of the software without specific, written prior 132a55deb1SDavid E. O'Brien permission. 142a55deb1SDavid E. O'Brien 152a55deb1SDavid E. O'Brien LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 162a55deb1SDavid E. O'Brien INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 172a55deb1SDavid E. O'Brien IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 182a55deb1SDavid E. O'Brien SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 192a55deb1SDavid E. O'Brien WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 202a55deb1SDavid E. O'Brien IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 212a55deb1SDavid E. O'Brien ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 222a55deb1SDavid E. O'Brien THIS SOFTWARE. 232a55deb1SDavid E. O'Brien ****************************************************************/ 242a55deb1SDavid E. O'Brien 25addad6afSRong-En Fan /* lasciate ogne speranza, voi ch'intrate. */ 262a55deb1SDavid E. O'Brien 27d98dd8e5SRuslan Ermilov #include <sys/cdefs.h> 28d98dd8e5SRuslan Ermilov __FBSDID("$FreeBSD$"); 29d98dd8e5SRuslan Ermilov 302a55deb1SDavid E. O'Brien #define DEBUG 312a55deb1SDavid E. O'Brien 322a55deb1SDavid E. O'Brien #include <ctype.h> 33*b5253557SWarner Losh #include <limits.h> 342a55deb1SDavid E. O'Brien #include <stdio.h> 352a55deb1SDavid E. O'Brien #include <string.h> 362a55deb1SDavid E. O'Brien #include <stdlib.h> 372a55deb1SDavid E. O'Brien #include "awk.h" 382a55deb1SDavid E. O'Brien #include "ytab.h" 392a55deb1SDavid E. O'Brien 4088b8d487SRuslan Ermilov #define HAT (NCHARS+2) /* matches ^ in regular expr */ 412a55deb1SDavid E. O'Brien /* NCHARS is 2**n */ 422a55deb1SDavid E. O'Brien #define MAXLIN 22 432a55deb1SDavid E. O'Brien 442a55deb1SDavid E. O'Brien #define type(v) (v)->nobj /* badly overloaded here */ 452a55deb1SDavid E. O'Brien #define info(v) (v)->ntype /* badly overloaded here */ 462a55deb1SDavid E. O'Brien #define left(v) (v)->narg[0] 472a55deb1SDavid E. O'Brien #define right(v) (v)->narg[1] 482a55deb1SDavid E. O'Brien #define parent(v) (v)->nnext 492a55deb1SDavid E. O'Brien 502a55deb1SDavid E. O'Brien #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 51addad6afSRong-En Fan #define ELEAF case EMPTYRE: /* empty string in regexp */ 522a55deb1SDavid E. O'Brien #define UNARY case STAR: case PLUS: case QUEST: 532a55deb1SDavid E. O'Brien 542a55deb1SDavid E. O'Brien /* encoding in tree Nodes: 55addad6afSRong-En Fan leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 562a55deb1SDavid E. O'Brien left is index, right contains value or pointer to value 572a55deb1SDavid E. O'Brien unary (STAR, PLUS, QUEST): left is child, right is null 582a55deb1SDavid E. O'Brien binary (CAT, OR): left and right are children 592a55deb1SDavid E. O'Brien parent contains pointer to parent 602a55deb1SDavid E. O'Brien */ 612a55deb1SDavid E. O'Brien 622a55deb1SDavid E. O'Brien 632a55deb1SDavid E. O'Brien int *setvec; 642a55deb1SDavid E. O'Brien int *tmpset; 652a55deb1SDavid E. O'Brien int maxsetvec = 0; 662a55deb1SDavid E. O'Brien 672a55deb1SDavid E. O'Brien int rtok; /* next token in current re */ 682a55deb1SDavid E. O'Brien int rlxval; 692a55deb1SDavid E. O'Brien static uschar *rlxstr; 702a55deb1SDavid E. O'Brien static uschar *prestr; /* current position in current re */ 712a55deb1SDavid E. O'Brien static uschar *lastre; /* origin of last re */ 72*b5253557SWarner Losh static uschar *lastatom; /* origin of last Atom */ 73*b5253557SWarner Losh static uschar *starttok; 74*b5253557SWarner Losh static uschar *basestr; /* starts with original, replaced during 75*b5253557SWarner Losh repetition processing */ 76*b5253557SWarner Losh static uschar *firstbasestr; 772a55deb1SDavid E. O'Brien 782a55deb1SDavid E. O'Brien static int setcnt; 792a55deb1SDavid E. O'Brien static int poscnt; 802a55deb1SDavid E. O'Brien 812a55deb1SDavid E. O'Brien char *patbeg; 822a55deb1SDavid E. O'Brien int patlen; 832a55deb1SDavid E. O'Brien 842a55deb1SDavid E. O'Brien #define NFA 20 /* cache this many dynamic fa's */ 852a55deb1SDavid E. O'Brien fa *fatab[NFA]; 862a55deb1SDavid E. O'Brien int nfatab = 0; /* entries in fatab */ 872a55deb1SDavid E. O'Brien 88813da98dSDavid E. O'Brien fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */ 892a55deb1SDavid E. O'Brien { 902a55deb1SDavid E. O'Brien int i, use, nuse; 912a55deb1SDavid E. O'Brien fa *pfa; 922a55deb1SDavid E. O'Brien static int now = 1; 932a55deb1SDavid E. O'Brien 94*b5253557SWarner Losh if (setvec == 0) { /* first time through any RE */ 952a55deb1SDavid E. O'Brien maxsetvec = MAXLIN; 962a55deb1SDavid E. O'Brien setvec = (int *) malloc(maxsetvec * sizeof(int)); 972a55deb1SDavid E. O'Brien tmpset = (int *) malloc(maxsetvec * sizeof(int)); 98*b5253557SWarner Losh if (setvec == 0 || tmpset == 0) 992a55deb1SDavid E. O'Brien overflo("out of space initializing makedfa"); 1002a55deb1SDavid E. O'Brien } 1012a55deb1SDavid E. O'Brien 1022a55deb1SDavid E. O'Brien if (compile_time) /* a constant for sure */ 1032a55deb1SDavid E. O'Brien return mkdfa(s, anchor); 1042a55deb1SDavid E. O'Brien for (i = 0; i < nfatab; i++) /* is it there already? */ 1052a55deb1SDavid E. O'Brien if (fatab[i]->anchor == anchor 106007c6572SDag-Erling Smørgrav && strcmp((const char *) fatab[i]->restr, s) == 0) { 1072a55deb1SDavid E. O'Brien fatab[i]->use = now++; 1082a55deb1SDavid E. O'Brien return fatab[i]; 1092a55deb1SDavid E. O'Brien } 1102a55deb1SDavid E. O'Brien pfa = mkdfa(s, anchor); 1112a55deb1SDavid E. O'Brien if (nfatab < NFA) { /* room for another */ 1122a55deb1SDavid E. O'Brien fatab[nfatab] = pfa; 1132a55deb1SDavid E. O'Brien fatab[nfatab]->use = now++; 1142a55deb1SDavid E. O'Brien nfatab++; 1152a55deb1SDavid E. O'Brien return pfa; 1162a55deb1SDavid E. O'Brien } 1172a55deb1SDavid E. O'Brien use = fatab[0]->use; /* replace least-recently used */ 1182a55deb1SDavid E. O'Brien nuse = 0; 1192a55deb1SDavid E. O'Brien for (i = 1; i < nfatab; i++) 1202a55deb1SDavid E. O'Brien if (fatab[i]->use < use) { 1212a55deb1SDavid E. O'Brien use = fatab[i]->use; 1222a55deb1SDavid E. O'Brien nuse = i; 1232a55deb1SDavid E. O'Brien } 1242a55deb1SDavid E. O'Brien freefa(fatab[nuse]); 1252a55deb1SDavid E. O'Brien fatab[nuse] = pfa; 1262a55deb1SDavid E. O'Brien pfa->use = now++; 1272a55deb1SDavid E. O'Brien return pfa; 1282a55deb1SDavid E. O'Brien } 1292a55deb1SDavid E. O'Brien 130813da98dSDavid E. O'Brien fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */ 1312a55deb1SDavid E. O'Brien /* anchor = 1 for anchored matches, else 0 */ 1322a55deb1SDavid E. O'Brien { 1332a55deb1SDavid E. O'Brien Node *p, *p1; 1342a55deb1SDavid E. O'Brien fa *f; 1352a55deb1SDavid E. O'Brien 136*b5253557SWarner Losh firstbasestr = (uschar *) s; 137*b5253557SWarner Losh basestr = firstbasestr; 1382a55deb1SDavid E. O'Brien p = reparse(s); 1392a55deb1SDavid E. O'Brien p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 1402a55deb1SDavid E. O'Brien /* put ALL STAR in front of reg. exp. */ 1412a55deb1SDavid E. O'Brien p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 1422a55deb1SDavid E. O'Brien /* put FINAL after reg. exp. */ 1432a55deb1SDavid E. O'Brien 1442a55deb1SDavid E. O'Brien poscnt = 0; 1452a55deb1SDavid E. O'Brien penter(p1); /* enter parent pointers and leaf indices */ 1462a55deb1SDavid E. O'Brien if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL) 1472a55deb1SDavid E. O'Brien overflo("out of space for fa"); 1482a55deb1SDavid E. O'Brien f->accept = poscnt-1; /* penter has computed number of positions in re */ 1492a55deb1SDavid E. O'Brien cfoll(f, p1); /* set up follow sets */ 1502a55deb1SDavid E. O'Brien freetr(p1); 151*b5253557SWarner Losh if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL) 1522a55deb1SDavid E. O'Brien overflo("out of space in makedfa"); 1532a55deb1SDavid E. O'Brien if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL) 1542a55deb1SDavid E. O'Brien overflo("out of space in makedfa"); 1552a55deb1SDavid E. O'Brien *f->posns[1] = 0; 1562a55deb1SDavid E. O'Brien f->initstat = makeinit(f, anchor); 1572a55deb1SDavid E. O'Brien f->anchor = anchor; 1582a55deb1SDavid E. O'Brien f->restr = (uschar *) tostring(s); 159*b5253557SWarner Losh if (firstbasestr != basestr) { 160*b5253557SWarner Losh if (basestr) 161*b5253557SWarner Losh xfree(basestr); 162*b5253557SWarner Losh } 1632a55deb1SDavid E. O'Brien return f; 1642a55deb1SDavid E. O'Brien } 1652a55deb1SDavid E. O'Brien 1662a55deb1SDavid E. O'Brien int makeinit(fa *f, int anchor) 1672a55deb1SDavid E. O'Brien { 1682a55deb1SDavid E. O'Brien int i, k; 1692a55deb1SDavid E. O'Brien 1702a55deb1SDavid E. O'Brien f->curstat = 2; 1712a55deb1SDavid E. O'Brien f->out[2] = 0; 1722a55deb1SDavid E. O'Brien f->reset = 0; 1732a55deb1SDavid E. O'Brien k = *(f->re[0].lfollow); 1742a55deb1SDavid E. O'Brien xfree(f->posns[2]); 175*b5253557SWarner Losh if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 1762a55deb1SDavid E. O'Brien overflo("out of space in makeinit"); 1772a55deb1SDavid E. O'Brien for (i=0; i <= k; i++) { 1782a55deb1SDavid E. O'Brien (f->posns[2])[i] = (f->re[0].lfollow)[i]; 1792a55deb1SDavid E. O'Brien } 1802a55deb1SDavid E. O'Brien if ((f->posns[2])[1] == f->accept) 1812a55deb1SDavid E. O'Brien f->out[2] = 1; 1822a55deb1SDavid E. O'Brien for (i=0; i < NCHARS; i++) 1832a55deb1SDavid E. O'Brien f->gototab[2][i] = 0; 1842a55deb1SDavid E. O'Brien f->curstat = cgoto(f, 2, HAT); 1852a55deb1SDavid E. O'Brien if (anchor) { 1862a55deb1SDavid E. O'Brien *f->posns[2] = k-1; /* leave out position 0 */ 1872a55deb1SDavid E. O'Brien for (i=0; i < k; i++) { 1882a55deb1SDavid E. O'Brien (f->posns[0])[i] = (f->posns[2])[i]; 1892a55deb1SDavid E. O'Brien } 1902a55deb1SDavid E. O'Brien 1912a55deb1SDavid E. O'Brien f->out[0] = f->out[2]; 1922a55deb1SDavid E. O'Brien if (f->curstat != 2) 1932a55deb1SDavid E. O'Brien --(*f->posns[f->curstat]); 1942a55deb1SDavid E. O'Brien } 1952a55deb1SDavid E. O'Brien return f->curstat; 1962a55deb1SDavid E. O'Brien } 1972a55deb1SDavid E. O'Brien 1982a55deb1SDavid E. O'Brien void penter(Node *p) /* set up parent pointers and leaf indices */ 1992a55deb1SDavid E. O'Brien { 2002a55deb1SDavid E. O'Brien switch (type(p)) { 201addad6afSRong-En Fan ELEAF 2022a55deb1SDavid E. O'Brien LEAF 2032a55deb1SDavid E. O'Brien info(p) = poscnt; 2042a55deb1SDavid E. O'Brien poscnt++; 2052a55deb1SDavid E. O'Brien break; 2062a55deb1SDavid E. O'Brien UNARY 2072a55deb1SDavid E. O'Brien penter(left(p)); 2082a55deb1SDavid E. O'Brien parent(left(p)) = p; 2092a55deb1SDavid E. O'Brien break; 2102a55deb1SDavid E. O'Brien case CAT: 2112a55deb1SDavid E. O'Brien case OR: 2122a55deb1SDavid E. O'Brien penter(left(p)); 2132a55deb1SDavid E. O'Brien penter(right(p)); 2142a55deb1SDavid E. O'Brien parent(left(p)) = p; 2152a55deb1SDavid E. O'Brien parent(right(p)) = p; 2162a55deb1SDavid E. O'Brien break; 2172a55deb1SDavid E. O'Brien default: /* can't happen */ 2182a55deb1SDavid E. O'Brien FATAL("can't happen: unknown type %d in penter", type(p)); 2192a55deb1SDavid E. O'Brien break; 2202a55deb1SDavid E. O'Brien } 2212a55deb1SDavid E. O'Brien } 2222a55deb1SDavid E. O'Brien 2232a55deb1SDavid E. O'Brien void freetr(Node *p) /* free parse tree */ 2242a55deb1SDavid E. O'Brien { 2252a55deb1SDavid E. O'Brien switch (type(p)) { 226addad6afSRong-En Fan ELEAF 2272a55deb1SDavid E. O'Brien LEAF 2282a55deb1SDavid E. O'Brien xfree(p); 2292a55deb1SDavid E. O'Brien break; 2302a55deb1SDavid E. O'Brien UNARY 2312a55deb1SDavid E. O'Brien freetr(left(p)); 2322a55deb1SDavid E. O'Brien xfree(p); 2332a55deb1SDavid E. O'Brien break; 2342a55deb1SDavid E. O'Brien case CAT: 2352a55deb1SDavid E. O'Brien case OR: 2362a55deb1SDavid E. O'Brien freetr(left(p)); 2372a55deb1SDavid E. O'Brien freetr(right(p)); 2382a55deb1SDavid E. O'Brien xfree(p); 2392a55deb1SDavid E. O'Brien break; 2402a55deb1SDavid E. O'Brien default: /* can't happen */ 2412a55deb1SDavid E. O'Brien FATAL("can't happen: unknown type %d in freetr", type(p)); 2422a55deb1SDavid E. O'Brien break; 2432a55deb1SDavid E. O'Brien } 2442a55deb1SDavid E. O'Brien } 2452a55deb1SDavid E. O'Brien 2462a55deb1SDavid E. O'Brien /* in the parsing of regular expressions, metacharacters like . have */ 2472a55deb1SDavid E. O'Brien /* to be seen literally; \056 is not a metacharacter. */ 2482a55deb1SDavid E. O'Brien 249d86a0988SRuslan Ermilov int hexstr(uschar **pp) /* find and eval hex string at pp, return new p */ 2502a55deb1SDavid E. O'Brien { /* only pick up one 8-bit byte (2 chars) */ 2512a55deb1SDavid E. O'Brien uschar *p; 2522a55deb1SDavid E. O'Brien int n = 0; 2532a55deb1SDavid E. O'Brien int i; 2542a55deb1SDavid E. O'Brien 2552a55deb1SDavid E. O'Brien for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) { 2562a55deb1SDavid E. O'Brien if (isdigit(*p)) 2572a55deb1SDavid E. O'Brien n = 16 * n + *p - '0'; 2582a55deb1SDavid E. O'Brien else if (*p >= 'a' && *p <= 'f') 2592a55deb1SDavid E. O'Brien n = 16 * n + *p - 'a' + 10; 2602a55deb1SDavid E. O'Brien else if (*p >= 'A' && *p <= 'F') 2612a55deb1SDavid E. O'Brien n = 16 * n + *p - 'A' + 10; 2622a55deb1SDavid E. O'Brien } 263d86a0988SRuslan Ermilov *pp = (uschar *) p; 2642a55deb1SDavid E. O'Brien return n; 2652a55deb1SDavid E. O'Brien } 2662a55deb1SDavid E. O'Brien 2672a55deb1SDavid E. O'Brien #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 2682a55deb1SDavid E. O'Brien 269d86a0988SRuslan Ermilov int quoted(uschar **pp) /* pick up next thing after a \\ */ 2702a55deb1SDavid E. O'Brien /* and increment *pp */ 2712a55deb1SDavid E. O'Brien { 272d86a0988SRuslan Ermilov uschar *p = *pp; 2732a55deb1SDavid E. O'Brien int c; 2742a55deb1SDavid E. O'Brien 2752a55deb1SDavid E. O'Brien if ((c = *p++) == 't') 2762a55deb1SDavid E. O'Brien c = '\t'; 2772a55deb1SDavid E. O'Brien else if (c == 'n') 2782a55deb1SDavid E. O'Brien c = '\n'; 2792a55deb1SDavid E. O'Brien else if (c == 'f') 2802a55deb1SDavid E. O'Brien c = '\f'; 2812a55deb1SDavid E. O'Brien else if (c == 'r') 2822a55deb1SDavid E. O'Brien c = '\r'; 2832a55deb1SDavid E. O'Brien else if (c == 'b') 2842a55deb1SDavid E. O'Brien c = '\b'; 2852a55deb1SDavid E. O'Brien else if (c == '\\') 2862a55deb1SDavid E. O'Brien c = '\\'; 2872a55deb1SDavid E. O'Brien else if (c == 'x') { /* hexadecimal goo follows */ 2882a55deb1SDavid E. O'Brien c = hexstr(&p); /* this adds a null if number is invalid */ 2892a55deb1SDavid E. O'Brien } else if (isoctdigit(c)) { /* \d \dd \ddd */ 2902a55deb1SDavid E. O'Brien int n = c - '0'; 2912a55deb1SDavid E. O'Brien if (isoctdigit(*p)) { 2922a55deb1SDavid E. O'Brien n = 8 * n + *p++ - '0'; 2932a55deb1SDavid E. O'Brien if (isoctdigit(*p)) 2942a55deb1SDavid E. O'Brien n = 8 * n + *p++ - '0'; 2952a55deb1SDavid E. O'Brien } 2962a55deb1SDavid E. O'Brien c = n; 2972a55deb1SDavid E. O'Brien } /* else */ 2982a55deb1SDavid E. O'Brien /* c = c; */ 2992a55deb1SDavid E. O'Brien *pp = p; 3002a55deb1SDavid E. O'Brien return c; 3012a55deb1SDavid E. O'Brien } 3022a55deb1SDavid E. O'Brien 303d98dd8e5SRuslan Ermilov static int collate_range_cmp(int a, int b) 304d98dd8e5SRuslan Ermilov { 305d98dd8e5SRuslan Ermilov static char s[2][2]; 306d98dd8e5SRuslan Ermilov 307d98dd8e5SRuslan Ermilov if ((uschar)a == (uschar)b) 308d98dd8e5SRuslan Ermilov return 0; 309d98dd8e5SRuslan Ermilov s[0][0] = a; 310d98dd8e5SRuslan Ermilov s[1][0] = b; 311d98dd8e5SRuslan Ermilov return (strcoll(s[0], s[1])); 312d98dd8e5SRuslan Ermilov } 313d98dd8e5SRuslan Ermilov 314813da98dSDavid E. O'Brien char *cclenter(const char *argp) /* add a character class */ 3152a55deb1SDavid E. O'Brien { 3162a55deb1SDavid E. O'Brien int i, c, c2; 317d98dd8e5SRuslan Ermilov int j; 3182a55deb1SDavid E. O'Brien uschar *p = (uschar *) argp; 3192a55deb1SDavid E. O'Brien uschar *op, *bp; 320*b5253557SWarner Losh static uschar *buf = 0; 3212a55deb1SDavid E. O'Brien static int bufsz = 100; 3222a55deb1SDavid E. O'Brien 3232a55deb1SDavid E. O'Brien op = p; 324*b5253557SWarner Losh if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL) 3252a55deb1SDavid E. O'Brien FATAL("out of space for character class [%.10s...] 1", p); 3262a55deb1SDavid E. O'Brien bp = buf; 3272a55deb1SDavid E. O'Brien for (i = 0; (c = *p++) != 0; ) { 3282a55deb1SDavid E. O'Brien if (c == '\\') { 329d86a0988SRuslan Ermilov c = quoted(&p); 3302a55deb1SDavid E. O'Brien } else if (c == '-' && i > 0 && bp[-1] != 0) { 3312a55deb1SDavid E. O'Brien if (*p != 0) { 3322a55deb1SDavid E. O'Brien c = bp[-1]; 3332a55deb1SDavid E. O'Brien c2 = *p++; 3342a55deb1SDavid E. O'Brien if (c2 == '\\') 335d86a0988SRuslan Ermilov c2 = quoted(&p); 336d98dd8e5SRuslan Ermilov if (collate_range_cmp(c, c2) > 0) { 3372a55deb1SDavid E. O'Brien bp--; 3382a55deb1SDavid E. O'Brien i--; 3392a55deb1SDavid E. O'Brien continue; 3402a55deb1SDavid E. O'Brien } 341d98dd8e5SRuslan Ermilov for (j = 0; j < NCHARS; j++) { 342d98dd8e5SRuslan Ermilov if ((collate_range_cmp(c, j) > 0) || 343d98dd8e5SRuslan Ermilov collate_range_cmp(j, c2) > 0) 344d98dd8e5SRuslan Ermilov continue; 345addad6afSRong-En Fan if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) 3462a55deb1SDavid E. O'Brien FATAL("out of space for character class [%.10s...] 2", p); 347d98dd8e5SRuslan Ermilov *bp++ = j; 3482a55deb1SDavid E. O'Brien i++; 3492a55deb1SDavid E. O'Brien } 3502a55deb1SDavid E. O'Brien continue; 3512a55deb1SDavid E. O'Brien } 3522a55deb1SDavid E. O'Brien } 353addad6afSRong-En Fan if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) 3542a55deb1SDavid E. O'Brien FATAL("out of space for character class [%.10s...] 3", p); 3552a55deb1SDavid E. O'Brien *bp++ = c; 3562a55deb1SDavid E. O'Brien i++; 3572a55deb1SDavid E. O'Brien } 3582a55deb1SDavid E. O'Brien *bp = 0; 3592a55deb1SDavid E. O'Brien dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) ); 3602a55deb1SDavid E. O'Brien xfree(op); 3612a55deb1SDavid E. O'Brien return (char *) tostring((char *) buf); 3622a55deb1SDavid E. O'Brien } 3632a55deb1SDavid E. O'Brien 364813da98dSDavid E. O'Brien void overflo(const char *s) 3652a55deb1SDavid E. O'Brien { 3662a55deb1SDavid E. O'Brien FATAL("regular expression too big: %.30s...", s); 3672a55deb1SDavid E. O'Brien } 3682a55deb1SDavid E. O'Brien 3692a55deb1SDavid E. O'Brien void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 3702a55deb1SDavid E. O'Brien { 3712a55deb1SDavid E. O'Brien int i; 3722a55deb1SDavid E. O'Brien int *p; 3732a55deb1SDavid E. O'Brien 3742a55deb1SDavid E. O'Brien switch (type(v)) { 375addad6afSRong-En Fan ELEAF 3762a55deb1SDavid E. O'Brien LEAF 3772a55deb1SDavid E. O'Brien f->re[info(v)].ltype = type(v); 3782a55deb1SDavid E. O'Brien f->re[info(v)].lval.np = right(v); 3792a55deb1SDavid E. O'Brien while (f->accept >= maxsetvec) { /* guessing here! */ 3802a55deb1SDavid E. O'Brien maxsetvec *= 4; 3812a55deb1SDavid E. O'Brien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 3822a55deb1SDavid E. O'Brien tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 383*b5253557SWarner Losh if (setvec == 0 || tmpset == 0) 3842a55deb1SDavid E. O'Brien overflo("out of space in cfoll()"); 3852a55deb1SDavid E. O'Brien } 3862a55deb1SDavid E. O'Brien for (i = 0; i <= f->accept; i++) 3872a55deb1SDavid E. O'Brien setvec[i] = 0; 3882a55deb1SDavid E. O'Brien setcnt = 0; 3892a55deb1SDavid E. O'Brien follow(v); /* computes setvec and setcnt */ 390*b5253557SWarner Losh if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) 3912a55deb1SDavid E. O'Brien overflo("out of space building follow set"); 3922a55deb1SDavid E. O'Brien f->re[info(v)].lfollow = p; 3932a55deb1SDavid E. O'Brien *p = setcnt; 3942a55deb1SDavid E. O'Brien for (i = f->accept; i >= 0; i--) 3952a55deb1SDavid E. O'Brien if (setvec[i] == 1) 3962a55deb1SDavid E. O'Brien *++p = i; 3972a55deb1SDavid E. O'Brien break; 3982a55deb1SDavid E. O'Brien UNARY 3992a55deb1SDavid E. O'Brien cfoll(f,left(v)); 4002a55deb1SDavid E. O'Brien break; 4012a55deb1SDavid E. O'Brien case CAT: 4022a55deb1SDavid E. O'Brien case OR: 4032a55deb1SDavid E. O'Brien cfoll(f,left(v)); 4042a55deb1SDavid E. O'Brien cfoll(f,right(v)); 4052a55deb1SDavid E. O'Brien break; 4062a55deb1SDavid E. O'Brien default: /* can't happen */ 4072a55deb1SDavid E. O'Brien FATAL("can't happen: unknown type %d in cfoll", type(v)); 4082a55deb1SDavid E. O'Brien } 4092a55deb1SDavid E. O'Brien } 4102a55deb1SDavid E. O'Brien 4112a55deb1SDavid E. O'Brien int first(Node *p) /* collects initially active leaves of p into setvec */ 412addad6afSRong-En Fan /* returns 0 if p matches empty string */ 4132a55deb1SDavid E. O'Brien { 4142a55deb1SDavid E. O'Brien int b, lp; 4152a55deb1SDavid E. O'Brien 4162a55deb1SDavid E. O'Brien switch (type(p)) { 417addad6afSRong-En Fan ELEAF 4182a55deb1SDavid E. O'Brien LEAF 4192a55deb1SDavid E. O'Brien lp = info(p); /* look for high-water mark of subscripts */ 4202a55deb1SDavid E. O'Brien while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 4212a55deb1SDavid E. O'Brien maxsetvec *= 4; 4222a55deb1SDavid E. O'Brien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 4232a55deb1SDavid E. O'Brien tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 424*b5253557SWarner Losh if (setvec == 0 || tmpset == 0) 4252a55deb1SDavid E. O'Brien overflo("out of space in first()"); 4262a55deb1SDavid E. O'Brien } 427addad6afSRong-En Fan if (type(p) == EMPTYRE) { 428addad6afSRong-En Fan setvec[lp] = 0; 429addad6afSRong-En Fan return(0); 430addad6afSRong-En Fan } 4312a55deb1SDavid E. O'Brien if (setvec[lp] != 1) { 4322a55deb1SDavid E. O'Brien setvec[lp] = 1; 4332a55deb1SDavid E. O'Brien setcnt++; 4342a55deb1SDavid E. O'Brien } 4352a55deb1SDavid E. O'Brien if (type(p) == CCL && (*(char *) right(p)) == '\0') 4362a55deb1SDavid E. O'Brien return(0); /* empty CCL */ 4372a55deb1SDavid E. O'Brien else return(1); 4382a55deb1SDavid E. O'Brien case PLUS: 4392a55deb1SDavid E. O'Brien if (first(left(p)) == 0) return(0); 4402a55deb1SDavid E. O'Brien return(1); 4412a55deb1SDavid E. O'Brien case STAR: 4422a55deb1SDavid E. O'Brien case QUEST: 4432a55deb1SDavid E. O'Brien first(left(p)); 4442a55deb1SDavid E. O'Brien return(0); 4452a55deb1SDavid E. O'Brien case CAT: 4462a55deb1SDavid E. O'Brien if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 4472a55deb1SDavid E. O'Brien return(1); 4482a55deb1SDavid E. O'Brien case OR: 4492a55deb1SDavid E. O'Brien b = first(right(p)); 4502a55deb1SDavid E. O'Brien if (first(left(p)) == 0 || b == 0) return(0); 4512a55deb1SDavid E. O'Brien return(1); 4522a55deb1SDavid E. O'Brien } 4532a55deb1SDavid E. O'Brien FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 4542a55deb1SDavid E. O'Brien return(-1); 4552a55deb1SDavid E. O'Brien } 4562a55deb1SDavid E. O'Brien 4572a55deb1SDavid E. O'Brien void follow(Node *v) /* collects leaves that can follow v into setvec */ 4582a55deb1SDavid E. O'Brien { 4592a55deb1SDavid E. O'Brien Node *p; 4602a55deb1SDavid E. O'Brien 4612a55deb1SDavid E. O'Brien if (type(v) == FINAL) 4622a55deb1SDavid E. O'Brien return; 4632a55deb1SDavid E. O'Brien p = parent(v); 4642a55deb1SDavid E. O'Brien switch (type(p)) { 4652a55deb1SDavid E. O'Brien case STAR: 4662a55deb1SDavid E. O'Brien case PLUS: 4672a55deb1SDavid E. O'Brien first(v); 4682a55deb1SDavid E. O'Brien follow(p); 4692a55deb1SDavid E. O'Brien return; 4702a55deb1SDavid E. O'Brien 4712a55deb1SDavid E. O'Brien case OR: 4722a55deb1SDavid E. O'Brien case QUEST: 4732a55deb1SDavid E. O'Brien follow(p); 4742a55deb1SDavid E. O'Brien return; 4752a55deb1SDavid E. O'Brien 4762a55deb1SDavid E. O'Brien case CAT: 4772a55deb1SDavid E. O'Brien if (v == left(p)) { /* v is left child of p */ 4782a55deb1SDavid E. O'Brien if (first(right(p)) == 0) { 4792a55deb1SDavid E. O'Brien follow(p); 4802a55deb1SDavid E. O'Brien return; 4812a55deb1SDavid E. O'Brien } 4822a55deb1SDavid E. O'Brien } else /* v is right child */ 4832a55deb1SDavid E. O'Brien follow(p); 4842a55deb1SDavid E. O'Brien return; 4852a55deb1SDavid E. O'Brien } 4862a55deb1SDavid E. O'Brien } 4872a55deb1SDavid E. O'Brien 488813da98dSDavid E. O'Brien int member(int c, const char *sarg) /* is c in s? */ 4892a55deb1SDavid E. O'Brien { 4902a55deb1SDavid E. O'Brien uschar *s = (uschar *) sarg; 4912a55deb1SDavid E. O'Brien 4922a55deb1SDavid E. O'Brien while (*s) 4932a55deb1SDavid E. O'Brien if (c == *s++) 4942a55deb1SDavid E. O'Brien return(1); 4952a55deb1SDavid E. O'Brien return(0); 4962a55deb1SDavid E. O'Brien } 4972a55deb1SDavid E. O'Brien 498813da98dSDavid E. O'Brien int match(fa *f, const char *p0) /* shortest match ? */ 4992a55deb1SDavid E. O'Brien { 5002a55deb1SDavid E. O'Brien int s, ns; 5012a55deb1SDavid E. O'Brien uschar *p = (uschar *) p0; 5022a55deb1SDavid E. O'Brien 5032a55deb1SDavid E. O'Brien s = f->reset ? makeinit(f,0) : f->initstat; 5042a55deb1SDavid E. O'Brien if (f->out[s]) 5052a55deb1SDavid E. O'Brien return(1); 5062a55deb1SDavid E. O'Brien do { 507addad6afSRong-En Fan /* assert(*p < NCHARS); */ 5082a55deb1SDavid E. O'Brien if ((ns = f->gototab[s][*p]) != 0) 5092a55deb1SDavid E. O'Brien s = ns; 5102a55deb1SDavid E. O'Brien else 5112a55deb1SDavid E. O'Brien s = cgoto(f, s, *p); 5122a55deb1SDavid E. O'Brien if (f->out[s]) 5132a55deb1SDavid E. O'Brien return(1); 5142a55deb1SDavid E. O'Brien } while (*p++ != 0); 5152a55deb1SDavid E. O'Brien return(0); 5162a55deb1SDavid E. O'Brien } 5172a55deb1SDavid E. O'Brien 518813da98dSDavid E. O'Brien int pmatch(fa *f, const char *p0) /* longest match, for sub */ 5192a55deb1SDavid E. O'Brien { 5202a55deb1SDavid E. O'Brien int s, ns; 5212a55deb1SDavid E. O'Brien uschar *p = (uschar *) p0; 5222a55deb1SDavid E. O'Brien uschar *q; 5232a55deb1SDavid E. O'Brien int i, k; 5242a55deb1SDavid E. O'Brien 52562ebc626SRuslan Ermilov /* s = f->reset ? makeinit(f,1) : f->initstat; */ 52662ebc626SRuslan Ermilov if (f->reset) { 52762ebc626SRuslan Ermilov f->initstat = s = makeinit(f,1); 52862ebc626SRuslan Ermilov } else { 52962ebc626SRuslan Ermilov s = f->initstat; 53062ebc626SRuslan Ermilov } 5312a55deb1SDavid E. O'Brien patbeg = (char *) p; 5322a55deb1SDavid E. O'Brien patlen = -1; 5332a55deb1SDavid E. O'Brien do { 5342a55deb1SDavid E. O'Brien q = p; 5352a55deb1SDavid E. O'Brien do { 5362a55deb1SDavid E. O'Brien if (f->out[s]) /* final state */ 5372a55deb1SDavid E. O'Brien patlen = q-p; 538addad6afSRong-En Fan /* assert(*q < NCHARS); */ 5392a55deb1SDavid E. O'Brien if ((ns = f->gototab[s][*q]) != 0) 5402a55deb1SDavid E. O'Brien s = ns; 5412a55deb1SDavid E. O'Brien else 5422a55deb1SDavid E. O'Brien s = cgoto(f, s, *q); 5432a55deb1SDavid E. O'Brien if (s == 1) { /* no transition */ 5442a55deb1SDavid E. O'Brien if (patlen >= 0) { 5452a55deb1SDavid E. O'Brien patbeg = (char *) p; 5462a55deb1SDavid E. O'Brien return(1); 5472a55deb1SDavid E. O'Brien } 5482a55deb1SDavid E. O'Brien else 5492a55deb1SDavid E. O'Brien goto nextin; /* no match */ 5502a55deb1SDavid E. O'Brien } 5512a55deb1SDavid E. O'Brien } while (*q++ != 0); 5522a55deb1SDavid E. O'Brien if (f->out[s]) 5532a55deb1SDavid E. O'Brien patlen = q-p-1; /* don't count $ */ 5542a55deb1SDavid E. O'Brien if (patlen >= 0) { 5552a55deb1SDavid E. O'Brien patbeg = (char *) p; 5562a55deb1SDavid E. O'Brien return(1); 5572a55deb1SDavid E. O'Brien } 5582a55deb1SDavid E. O'Brien nextin: 5592a55deb1SDavid E. O'Brien s = 2; 5602a55deb1SDavid E. O'Brien if (f->reset) { 5612a55deb1SDavid E. O'Brien for (i = 2; i <= f->curstat; i++) 5622a55deb1SDavid E. O'Brien xfree(f->posns[i]); 5632a55deb1SDavid E. O'Brien k = *f->posns[0]; 564*b5253557SWarner Losh if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 5652a55deb1SDavid E. O'Brien overflo("out of space in pmatch"); 5662a55deb1SDavid E. O'Brien for (i = 0; i <= k; i++) 5672a55deb1SDavid E. O'Brien (f->posns[2])[i] = (f->posns[0])[i]; 5682a55deb1SDavid E. O'Brien f->initstat = f->curstat = 2; 5692a55deb1SDavid E. O'Brien f->out[2] = f->out[0]; 5702a55deb1SDavid E. O'Brien for (i = 0; i < NCHARS; i++) 5712a55deb1SDavid E. O'Brien f->gototab[2][i] = 0; 5722a55deb1SDavid E. O'Brien } 5732a55deb1SDavid E. O'Brien } while (*p++ != 0); 5742a55deb1SDavid E. O'Brien return (0); 5752a55deb1SDavid E. O'Brien } 5762a55deb1SDavid E. O'Brien 577813da98dSDavid E. O'Brien int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 5782a55deb1SDavid E. O'Brien { 5792a55deb1SDavid E. O'Brien int s, ns; 5802a55deb1SDavid E. O'Brien uschar *p = (uschar *) p0; 5812a55deb1SDavid E. O'Brien uschar *q; 5822a55deb1SDavid E. O'Brien int i, k; 5832a55deb1SDavid E. O'Brien 58462ebc626SRuslan Ermilov /* s = f->reset ? makeinit(f,1) : f->initstat; */ 58562ebc626SRuslan Ermilov if (f->reset) { 58662ebc626SRuslan Ermilov f->initstat = s = makeinit(f,1); 58762ebc626SRuslan Ermilov } else { 58862ebc626SRuslan Ermilov s = f->initstat; 58962ebc626SRuslan Ermilov } 5902a55deb1SDavid E. O'Brien patlen = -1; 5912a55deb1SDavid E. O'Brien while (*p) { 5922a55deb1SDavid E. O'Brien q = p; 5932a55deb1SDavid E. O'Brien do { 5942a55deb1SDavid E. O'Brien if (f->out[s]) /* final state */ 5952a55deb1SDavid E. O'Brien patlen = q-p; 596addad6afSRong-En Fan /* assert(*q < NCHARS); */ 5972a55deb1SDavid E. O'Brien if ((ns = f->gototab[s][*q]) != 0) 5982a55deb1SDavid E. O'Brien s = ns; 5992a55deb1SDavid E. O'Brien else 6002a55deb1SDavid E. O'Brien s = cgoto(f, s, *q); 6012a55deb1SDavid E. O'Brien if (s == 1) { /* no transition */ 6022a55deb1SDavid E. O'Brien if (patlen > 0) { 6032a55deb1SDavid E. O'Brien patbeg = (char *) p; 6042a55deb1SDavid E. O'Brien return(1); 6052a55deb1SDavid E. O'Brien } else 6062a55deb1SDavid E. O'Brien goto nnextin; /* no nonempty match */ 6072a55deb1SDavid E. O'Brien } 6082a55deb1SDavid E. O'Brien } while (*q++ != 0); 6092a55deb1SDavid E. O'Brien if (f->out[s]) 6102a55deb1SDavid E. O'Brien patlen = q-p-1; /* don't count $ */ 6112a55deb1SDavid E. O'Brien if (patlen > 0 ) { 6122a55deb1SDavid E. O'Brien patbeg = (char *) p; 6132a55deb1SDavid E. O'Brien return(1); 6142a55deb1SDavid E. O'Brien } 6152a55deb1SDavid E. O'Brien nnextin: 6162a55deb1SDavid E. O'Brien s = 2; 6172a55deb1SDavid E. O'Brien if (f->reset) { 6182a55deb1SDavid E. O'Brien for (i = 2; i <= f->curstat; i++) 6192a55deb1SDavid E. O'Brien xfree(f->posns[i]); 6202a55deb1SDavid E. O'Brien k = *f->posns[0]; 621*b5253557SWarner Losh if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 6222a55deb1SDavid E. O'Brien overflo("out of state space"); 6232a55deb1SDavid E. O'Brien for (i = 0; i <= k; i++) 6242a55deb1SDavid E. O'Brien (f->posns[2])[i] = (f->posns[0])[i]; 6252a55deb1SDavid E. O'Brien f->initstat = f->curstat = 2; 6262a55deb1SDavid E. O'Brien f->out[2] = f->out[0]; 6272a55deb1SDavid E. O'Brien for (i = 0; i < NCHARS; i++) 6282a55deb1SDavid E. O'Brien f->gototab[2][i] = 0; 6292a55deb1SDavid E. O'Brien } 6302a55deb1SDavid E. O'Brien p++; 6312a55deb1SDavid E. O'Brien } 6322a55deb1SDavid E. O'Brien return (0); 6332a55deb1SDavid E. O'Brien } 6342a55deb1SDavid E. O'Brien 635813da98dSDavid E. O'Brien Node *reparse(const char *p) /* parses regular expression pointed to by p */ 6362a55deb1SDavid E. O'Brien { /* uses relex() to scan regular expression */ 6372a55deb1SDavid E. O'Brien Node *np; 6382a55deb1SDavid E. O'Brien 6392a55deb1SDavid E. O'Brien dprintf( ("reparse <%s>\n", p) ); 6402a55deb1SDavid E. O'Brien lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */ 6412a55deb1SDavid E. O'Brien rtok = relex(); 642813da98dSDavid E. O'Brien /* GNU compatibility: an empty regexp matches anything */ 643addad6afSRong-En Fan if (rtok == '\0') { 644813da98dSDavid E. O'Brien /* FATAL("empty regular expression"); previous */ 645addad6afSRong-En Fan return(op2(EMPTYRE, NIL, NIL)); 646addad6afSRong-En Fan } 6472a55deb1SDavid E. O'Brien np = regexp(); 6482a55deb1SDavid E. O'Brien if (rtok != '\0') 6492a55deb1SDavid E. O'Brien FATAL("syntax error in regular expression %s at %s", lastre, prestr); 6502a55deb1SDavid E. O'Brien return(np); 6512a55deb1SDavid E. O'Brien } 6522a55deb1SDavid E. O'Brien 6532a55deb1SDavid E. O'Brien Node *regexp(void) /* top-level parse of reg expr */ 6542a55deb1SDavid E. O'Brien { 6552a55deb1SDavid E. O'Brien return (alt(concat(primary()))); 6562a55deb1SDavid E. O'Brien } 6572a55deb1SDavid E. O'Brien 6582a55deb1SDavid E. O'Brien Node *primary(void) 6592a55deb1SDavid E. O'Brien { 6602a55deb1SDavid E. O'Brien Node *np; 661*b5253557SWarner Losh int savelastatom; 6622a55deb1SDavid E. O'Brien 6632a55deb1SDavid E. O'Brien switch (rtok) { 6642a55deb1SDavid E. O'Brien case CHAR: 665*b5253557SWarner Losh lastatom = starttok; 6662a55deb1SDavid E. O'Brien np = op2(CHAR, NIL, itonp(rlxval)); 6672a55deb1SDavid E. O'Brien rtok = relex(); 6682a55deb1SDavid E. O'Brien return (unary(np)); 6692a55deb1SDavid E. O'Brien case ALL: 6702a55deb1SDavid E. O'Brien rtok = relex(); 6712a55deb1SDavid E. O'Brien return (unary(op2(ALL, NIL, NIL))); 672addad6afSRong-En Fan case EMPTYRE: 673addad6afSRong-En Fan rtok = relex(); 674*b5253557SWarner Losh return (unary(op2(EMPTYRE, NIL, NIL))); 6752a55deb1SDavid E. O'Brien case DOT: 676*b5253557SWarner Losh lastatom = starttok; 6772a55deb1SDavid E. O'Brien rtok = relex(); 6782a55deb1SDavid E. O'Brien return (unary(op2(DOT, NIL, NIL))); 6792a55deb1SDavid E. O'Brien case CCL: 6802a55deb1SDavid E. O'Brien np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); 681*b5253557SWarner Losh lastatom = starttok; 6822a55deb1SDavid E. O'Brien rtok = relex(); 6832a55deb1SDavid E. O'Brien return (unary(np)); 6842a55deb1SDavid E. O'Brien case NCCL: 6852a55deb1SDavid E. O'Brien np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); 686*b5253557SWarner Losh lastatom = starttok; 6872a55deb1SDavid E. O'Brien rtok = relex(); 6882a55deb1SDavid E. O'Brien return (unary(np)); 6892a55deb1SDavid E. O'Brien case '^': 6902a55deb1SDavid E. O'Brien rtok = relex(); 6912a55deb1SDavid E. O'Brien return (unary(op2(CHAR, NIL, itonp(HAT)))); 6922a55deb1SDavid E. O'Brien case '$': 6932a55deb1SDavid E. O'Brien rtok = relex(); 6942a55deb1SDavid E. O'Brien return (unary(op2(CHAR, NIL, NIL))); 6952a55deb1SDavid E. O'Brien case '(': 696*b5253557SWarner Losh lastatom = starttok; 697*b5253557SWarner Losh savelastatom = starttok - basestr; /* Retain over recursion */ 6982a55deb1SDavid E. O'Brien rtok = relex(); 6992a55deb1SDavid E. O'Brien if (rtok == ')') { /* special pleading for () */ 7002a55deb1SDavid E. O'Brien rtok = relex(); 7012a55deb1SDavid E. O'Brien return unary(op2(CCL, NIL, (Node *) tostring(""))); 7022a55deb1SDavid E. O'Brien } 7032a55deb1SDavid E. O'Brien np = regexp(); 7042a55deb1SDavid E. O'Brien if (rtok == ')') { 705*b5253557SWarner Losh lastatom = basestr + savelastatom; /* Restore */ 7062a55deb1SDavid E. O'Brien rtok = relex(); 7072a55deb1SDavid E. O'Brien return (unary(np)); 7082a55deb1SDavid E. O'Brien } 7092a55deb1SDavid E. O'Brien else 7102a55deb1SDavid E. O'Brien FATAL("syntax error in regular expression %s at %s", lastre, prestr); 7112a55deb1SDavid E. O'Brien default: 7122a55deb1SDavid E. O'Brien FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 7132a55deb1SDavid E. O'Brien } 7142a55deb1SDavid E. O'Brien return 0; /*NOTREACHED*/ 7152a55deb1SDavid E. O'Brien } 7162a55deb1SDavid E. O'Brien 7172a55deb1SDavid E. O'Brien Node *concat(Node *np) 7182a55deb1SDavid E. O'Brien { 7192a55deb1SDavid E. O'Brien switch (rtok) { 720*b5253557SWarner Losh case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 7212a55deb1SDavid E. O'Brien return (concat(op2(CAT, np, primary()))); 722*b5253557SWarner Losh case EMPTYRE: 723*b5253557SWarner Losh rtok = relex(); 724*b5253557SWarner Losh return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")), 725*b5253557SWarner Losh primary()))); 7262a55deb1SDavid E. O'Brien } 7272a55deb1SDavid E. O'Brien return (np); 7282a55deb1SDavid E. O'Brien } 7292a55deb1SDavid E. O'Brien 7302a55deb1SDavid E. O'Brien Node *alt(Node *np) 7312a55deb1SDavid E. O'Brien { 7322a55deb1SDavid E. O'Brien if (rtok == OR) { 7332a55deb1SDavid E. O'Brien rtok = relex(); 7342a55deb1SDavid E. O'Brien return (alt(op2(OR, np, concat(primary())))); 7352a55deb1SDavid E. O'Brien } 7362a55deb1SDavid E. O'Brien return (np); 7372a55deb1SDavid E. O'Brien } 7382a55deb1SDavid E. O'Brien 7392a55deb1SDavid E. O'Brien Node *unary(Node *np) 7402a55deb1SDavid E. O'Brien { 7412a55deb1SDavid E. O'Brien switch (rtok) { 7422a55deb1SDavid E. O'Brien case STAR: 7432a55deb1SDavid E. O'Brien rtok = relex(); 7442a55deb1SDavid E. O'Brien return (unary(op2(STAR, np, NIL))); 7452a55deb1SDavid E. O'Brien case PLUS: 7462a55deb1SDavid E. O'Brien rtok = relex(); 7472a55deb1SDavid E. O'Brien return (unary(op2(PLUS, np, NIL))); 7482a55deb1SDavid E. O'Brien case QUEST: 7492a55deb1SDavid E. O'Brien rtok = relex(); 7502a55deb1SDavid E. O'Brien return (unary(op2(QUEST, np, NIL))); 7512a55deb1SDavid E. O'Brien default: 7522a55deb1SDavid E. O'Brien return (np); 7532a55deb1SDavid E. O'Brien } 7542a55deb1SDavid E. O'Brien } 7552a55deb1SDavid E. O'Brien 756007c6572SDag-Erling Smørgrav /* 757007c6572SDag-Erling Smørgrav * Character class definitions conformant to the POSIX locale as 758007c6572SDag-Erling Smørgrav * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 759007c6572SDag-Erling Smørgrav * and operating character sets are both ASCII (ISO646) or supersets 760007c6572SDag-Erling Smørgrav * thereof. 761007c6572SDag-Erling Smørgrav * 762007c6572SDag-Erling Smørgrav * Note that to avoid overflowing the temporary buffer used in 763007c6572SDag-Erling Smørgrav * relex(), the expanded character class (prior to range expansion) 764007c6572SDag-Erling Smørgrav * must be less than twice the size of their full name. 765007c6572SDag-Erling Smørgrav */ 766fc6b1dfeSDavid E. O'Brien 767fc6b1dfeSDavid E. O'Brien /* Because isblank doesn't show up in any of the header files on any 768fc6b1dfeSDavid E. O'Brien * system i use, it's defined here. if some other locale has a richer 769fc6b1dfeSDavid E. O'Brien * definition of "blank", define HAS_ISBLANK and provide your own 770fc6b1dfeSDavid E. O'Brien * version. 77188b8d487SRuslan Ermilov * the parentheses here are an attempt to find a path through the maze 77288b8d487SRuslan Ermilov * of macro definition and/or function and/or version provided. thanks 77388b8d487SRuslan Ermilov * to nelson beebe for the suggestion; let's see if it works everywhere. 774fc6b1dfeSDavid E. O'Brien */ 775fc6b1dfeSDavid E. O'Brien 77691217c1cSRuslan Ermilov /* #define HAS_ISBLANK */ 777fc6b1dfeSDavid E. O'Brien #ifndef HAS_ISBLANK 778fc6b1dfeSDavid E. O'Brien 7791b11b783SRuslan Ermilov int (xisblank)(int c) 780fc6b1dfeSDavid E. O'Brien { 781fc6b1dfeSDavid E. O'Brien return c==' ' || c=='\t'; 782fc6b1dfeSDavid E. O'Brien } 783fc6b1dfeSDavid E. O'Brien 784fc6b1dfeSDavid E. O'Brien #endif 785fc6b1dfeSDavid E. O'Brien 786007c6572SDag-Erling Smørgrav struct charclass { 787007c6572SDag-Erling Smørgrav const char *cc_name; 788007c6572SDag-Erling Smørgrav int cc_namelen; 789fc6b1dfeSDavid E. O'Brien int (*cc_func)(int); 790007c6572SDag-Erling Smørgrav } charclasses[] = { 791fc6b1dfeSDavid E. O'Brien { "alnum", 5, isalnum }, 792fc6b1dfeSDavid E. O'Brien { "alpha", 5, isalpha }, 7931b11b783SRuslan Ermilov #ifndef HAS_ISBLANK 794*b5253557SWarner Losh { "blank", 5, xisblank }, 7951b11b783SRuslan Ermilov #else 796fc6b1dfeSDavid E. O'Brien { "blank", 5, isblank }, 7971b11b783SRuslan Ermilov #endif 798fc6b1dfeSDavid E. O'Brien { "cntrl", 5, iscntrl }, 799fc6b1dfeSDavid E. O'Brien { "digit", 5, isdigit }, 800fc6b1dfeSDavid E. O'Brien { "graph", 5, isgraph }, 801fc6b1dfeSDavid E. O'Brien { "lower", 5, islower }, 802fc6b1dfeSDavid E. O'Brien { "print", 5, isprint }, 803fc6b1dfeSDavid E. O'Brien { "punct", 5, ispunct }, 804fc6b1dfeSDavid E. O'Brien { "space", 5, isspace }, 805fc6b1dfeSDavid E. O'Brien { "upper", 5, isupper }, 806fc6b1dfeSDavid E. O'Brien { "xdigit", 6, isxdigit }, 807007c6572SDag-Erling Smørgrav { NULL, 0, NULL }, 808007c6572SDag-Erling Smørgrav }; 809007c6572SDag-Erling Smørgrav 810*b5253557SWarner Losh #define REPEAT_SIMPLE 0 811*b5253557SWarner Losh #define REPEAT_PLUS_APPENDED 1 812*b5253557SWarner Losh #define REPEAT_WITH_Q 2 813*b5253557SWarner Losh #define REPEAT_ZERO 3 814*b5253557SWarner Losh 815*b5253557SWarner Losh static int 816*b5253557SWarner Losh replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 817*b5253557SWarner Losh int atomlen, int firstnum, int secondnum, int special_case) 818*b5253557SWarner Losh { 819*b5253557SWarner Losh int i, j; 820*b5253557SWarner Losh uschar *buf = 0; 821*b5253557SWarner Losh int ret = 1; 822*b5253557SWarner Losh int init_q = (firstnum==0); /* first added char will be ? */ 823*b5253557SWarner Losh int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 824*b5253557SWarner Losh int prefix_length = reptok - basestr; /* prefix includes first rep */ 825*b5253557SWarner Losh int suffix_length = strlen((char *) reptok) - reptoklen; /* string after rep specifier */ 826*b5253557SWarner Losh int size = prefix_length + suffix_length; 827*b5253557SWarner Losh 828*b5253557SWarner Losh if (firstnum > 1) { /* add room for reps 2 through firstnum */ 829*b5253557SWarner Losh size += atomlen*(firstnum-1); 830*b5253557SWarner Losh } 831*b5253557SWarner Losh 832*b5253557SWarner Losh /* Adjust size of buffer for special cases */ 833*b5253557SWarner Losh if (special_case == REPEAT_PLUS_APPENDED) { 834*b5253557SWarner Losh size++; /* for the final + */ 835*b5253557SWarner Losh } else if (special_case == REPEAT_WITH_Q) { 836*b5253557SWarner Losh size += init_q + (atomlen+1)* n_q_reps; 837*b5253557SWarner Losh } else if (special_case == REPEAT_ZERO) { 838*b5253557SWarner Losh size += 2; /* just a null ERE: () */ 839*b5253557SWarner Losh } 840*b5253557SWarner Losh if ((buf = (uschar *) malloc(size+1)) == NULL) 841*b5253557SWarner Losh FATAL("out of space in reg expr %.10s..", lastre); 842*b5253557SWarner Losh memcpy(buf, basestr, prefix_length); /* copy prefix */ 843*b5253557SWarner Losh j = prefix_length; 844*b5253557SWarner Losh if (special_case == REPEAT_ZERO) { 845*b5253557SWarner Losh j -= atomlen; 846*b5253557SWarner Losh buf[j++] = '('; 847*b5253557SWarner Losh buf[j++] = ')'; 848*b5253557SWarner Losh } 849*b5253557SWarner Losh for (i=1; i < firstnum; i++) { /* copy x reps */ 850*b5253557SWarner Losh memcpy(&buf[j], atom, atomlen); 851*b5253557SWarner Losh j += atomlen; 852*b5253557SWarner Losh } 853*b5253557SWarner Losh if (special_case == REPEAT_PLUS_APPENDED) { 854*b5253557SWarner Losh buf[j++] = '+'; 855*b5253557SWarner Losh } else if (special_case == REPEAT_WITH_Q) { 856*b5253557SWarner Losh if (init_q) buf[j++] = '?'; 857*b5253557SWarner Losh for (i=0; i < n_q_reps; i++) { /* copy x? reps */ 858*b5253557SWarner Losh memcpy(&buf[j], atom, atomlen); 859*b5253557SWarner Losh j += atomlen; 860*b5253557SWarner Losh buf[j++] = '?'; 861*b5253557SWarner Losh } 862*b5253557SWarner Losh } 863*b5253557SWarner Losh memcpy(&buf[j], reptok+reptoklen, suffix_length); 864*b5253557SWarner Losh if (special_case == REPEAT_ZERO) { 865*b5253557SWarner Losh buf[j+suffix_length] = '\0'; 866*b5253557SWarner Losh } else { 867*b5253557SWarner Losh buf[size] = '\0'; 868*b5253557SWarner Losh } 869*b5253557SWarner Losh /* free old basestr */ 870*b5253557SWarner Losh if (firstbasestr != basestr) { 871*b5253557SWarner Losh if (basestr) 872*b5253557SWarner Losh xfree(basestr); 873*b5253557SWarner Losh } 874*b5253557SWarner Losh basestr = buf; 875*b5253557SWarner Losh prestr = buf + prefix_length; 876*b5253557SWarner Losh if (special_case == REPEAT_ZERO) { 877*b5253557SWarner Losh prestr -= atomlen; 878*b5253557SWarner Losh ret++; 879*b5253557SWarner Losh } 880*b5253557SWarner Losh return ret; 881*b5253557SWarner Losh } 882*b5253557SWarner Losh 883*b5253557SWarner Losh static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 884*b5253557SWarner Losh int atomlen, int firstnum, int secondnum) 885*b5253557SWarner Losh { 886*b5253557SWarner Losh /* 887*b5253557SWarner Losh In general, the repetition specifier or "bound" is replaced here 888*b5253557SWarner Losh by an equivalent ERE string, repeating the immediately previous atom 889*b5253557SWarner Losh and appending ? and + as needed. Note that the first copy of the 890*b5253557SWarner Losh atom is left in place, except in the special_case of a zero-repeat 891*b5253557SWarner Losh (i.e., {0}). 892*b5253557SWarner Losh */ 893*b5253557SWarner Losh if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 894*b5253557SWarner Losh if (firstnum < 2) { 895*b5253557SWarner Losh /* 0 or 1: should be handled before you get here */ 896*b5253557SWarner Losh FATAL("internal error"); 897*b5253557SWarner Losh } else { 898*b5253557SWarner Losh return replace_repeat(reptok, reptoklen, atom, atomlen, 899*b5253557SWarner Losh firstnum, secondnum, REPEAT_PLUS_APPENDED); 900*b5253557SWarner Losh } 901*b5253557SWarner Losh } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 902*b5253557SWarner Losh if (firstnum == 0) { /* {0} or {0,0} */ 903*b5253557SWarner Losh /* This case is unusual because the resulting 904*b5253557SWarner Losh replacement string might actually be SMALLER than 905*b5253557SWarner Losh the original ERE */ 906*b5253557SWarner Losh return replace_repeat(reptok, reptoklen, atom, atomlen, 907*b5253557SWarner Losh firstnum, secondnum, REPEAT_ZERO); 908*b5253557SWarner Losh } else { /* (firstnum >= 1) */ 909*b5253557SWarner Losh return replace_repeat(reptok, reptoklen, atom, atomlen, 910*b5253557SWarner Losh firstnum, secondnum, REPEAT_SIMPLE); 911*b5253557SWarner Losh } 912*b5253557SWarner Losh } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 913*b5253557SWarner Losh /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 914*b5253557SWarner Losh return replace_repeat(reptok, reptoklen, atom, atomlen, 915*b5253557SWarner Losh firstnum, secondnum, REPEAT_WITH_Q); 916*b5253557SWarner Losh } else { /* Error - shouldn't be here (n>m) */ 917*b5253557SWarner Losh FATAL("internal error"); 918*b5253557SWarner Losh } 919*b5253557SWarner Losh return 0; 920*b5253557SWarner Losh } 921007c6572SDag-Erling Smørgrav 9222a55deb1SDavid E. O'Brien int relex(void) /* lexical analyzer for reparse */ 9232a55deb1SDavid E. O'Brien { 9242a55deb1SDavid E. O'Brien int c, n; 9252a55deb1SDavid E. O'Brien int cflag; 926*b5253557SWarner Losh static uschar *buf = 0; 9272a55deb1SDavid E. O'Brien static int bufsz = 100; 9282a55deb1SDavid E. O'Brien uschar *bp; 929007c6572SDag-Erling Smørgrav struct charclass *cc; 930fc6b1dfeSDavid E. O'Brien int i; 931*b5253557SWarner Losh int num, m, commafound, digitfound; 932*b5253557SWarner Losh const uschar *startreptok; 933*b5253557SWarner Losh 934*b5253557SWarner Losh rescan: 935*b5253557SWarner Losh starttok = prestr; 9362a55deb1SDavid E. O'Brien 9372a55deb1SDavid E. O'Brien switch (c = *prestr++) { 9382a55deb1SDavid E. O'Brien case '|': return OR; 9392a55deb1SDavid E. O'Brien case '*': return STAR; 9402a55deb1SDavid E. O'Brien case '+': return PLUS; 9412a55deb1SDavid E. O'Brien case '?': return QUEST; 9422a55deb1SDavid E. O'Brien case '.': return DOT; 9432a55deb1SDavid E. O'Brien case '\0': prestr--; return '\0'; 9442a55deb1SDavid E. O'Brien case '^': 9452a55deb1SDavid E. O'Brien case '$': 9462a55deb1SDavid E. O'Brien case '(': 9472a55deb1SDavid E. O'Brien case ')': 9482a55deb1SDavid E. O'Brien return c; 9492a55deb1SDavid E. O'Brien case '\\': 950d86a0988SRuslan Ermilov rlxval = quoted(&prestr); 9512a55deb1SDavid E. O'Brien return CHAR; 9522a55deb1SDavid E. O'Brien default: 9532a55deb1SDavid E. O'Brien rlxval = c; 9542a55deb1SDavid E. O'Brien return CHAR; 9552a55deb1SDavid E. O'Brien case '[': 956*b5253557SWarner Losh if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL) 9572a55deb1SDavid E. O'Brien FATAL("out of space in reg expr %.10s..", lastre); 9582a55deb1SDavid E. O'Brien bp = buf; 9592a55deb1SDavid E. O'Brien if (*prestr == '^') { 9602a55deb1SDavid E. O'Brien cflag = 1; 9612a55deb1SDavid E. O'Brien prestr++; 9622a55deb1SDavid E. O'Brien } 9632a55deb1SDavid E. O'Brien else 9642a55deb1SDavid E. O'Brien cflag = 0; 965007c6572SDag-Erling Smørgrav n = 2 * strlen((const char *) prestr)+1; 966addad6afSRong-En Fan if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 9672a55deb1SDavid E. O'Brien FATAL("out of space for reg expr %.10s...", lastre); 9682a55deb1SDavid E. O'Brien for (; ; ) { 9692a55deb1SDavid E. O'Brien if ((c = *prestr++) == '\\') { 9702a55deb1SDavid E. O'Brien *bp++ = '\\'; 9712a55deb1SDavid E. O'Brien if ((c = *prestr++) == '\0') 9722a55deb1SDavid E. O'Brien FATAL("nonterminated character class %.20s...", lastre); 9732a55deb1SDavid E. O'Brien *bp++ = c; 9742a55deb1SDavid E. O'Brien /* } else if (c == '\n') { */ 9752a55deb1SDavid E. O'Brien /* FATAL("newline in character class %.20s...", lastre); */ 976007c6572SDag-Erling Smørgrav } else if (c == '[' && *prestr == ':') { 977007c6572SDag-Erling Smørgrav /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 978007c6572SDag-Erling Smørgrav for (cc = charclasses; cc->cc_name; cc++) 979007c6572SDag-Erling Smørgrav if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 980007c6572SDag-Erling Smørgrav break; 981007c6572SDag-Erling Smørgrav if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 982007c6572SDag-Erling Smørgrav prestr[2 + cc->cc_namelen] == ']') { 983007c6572SDag-Erling Smørgrav prestr += cc->cc_namelen + 3; 984*b5253557SWarner Losh /* 985*b5253557SWarner Losh * BUG: We begin at 1, instead of 0, since we 986*b5253557SWarner Losh * would otherwise prematurely terminate the 987*b5253557SWarner Losh * string for classes like [[:cntrl:]]. This 988*b5253557SWarner Losh * means that we can't match the NUL character, 989*b5253557SWarner Losh * not without first adapting the entire 990*b5253557SWarner Losh * program to track each string's length. 991*b5253557SWarner Losh */ 992*b5253557SWarner Losh for (i = 1; i <= UCHAR_MAX; i++) { 993addad6afSRong-En Fan if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) 994fc6b1dfeSDavid E. O'Brien FATAL("out of space for reg expr %.10s...", lastre); 995fc6b1dfeSDavid E. O'Brien if (cc->cc_func(i)) { 996fc6b1dfeSDavid E. O'Brien *bp++ = i; 997fc6b1dfeSDavid E. O'Brien n++; 998fc6b1dfeSDavid E. O'Brien } 999fc6b1dfeSDavid E. O'Brien } 1000007c6572SDag-Erling Smørgrav } else 1001007c6572SDag-Erling Smørgrav *bp++ = c; 1002*b5253557SWarner Losh } else if (c == '[' && *prestr == '.') { 1003*b5253557SWarner Losh char collate_char; 1004*b5253557SWarner Losh prestr++; 1005*b5253557SWarner Losh collate_char = *prestr++; 1006*b5253557SWarner Losh if (*prestr == '.' && prestr[1] == ']') { 1007*b5253557SWarner Losh prestr += 2; 1008*b5253557SWarner Losh /* Found it: map via locale TBD: for 1009*b5253557SWarner Losh now, simply return this char. This 1010*b5253557SWarner Losh is sufficient to pass conformance 1011*b5253557SWarner Losh test awk.ex 156 1012*b5253557SWarner Losh */ 1013*b5253557SWarner Losh if (*prestr == ']') { 1014*b5253557SWarner Losh prestr++; 1015*b5253557SWarner Losh rlxval = collate_char; 1016*b5253557SWarner Losh return CHAR; 1017*b5253557SWarner Losh } 1018*b5253557SWarner Losh } 1019*b5253557SWarner Losh } else if (c == '[' && *prestr == '=') { 1020*b5253557SWarner Losh char equiv_char; 1021*b5253557SWarner Losh prestr++; 1022*b5253557SWarner Losh equiv_char = *prestr++; 1023*b5253557SWarner Losh if (*prestr == '=' && prestr[1] == ']') { 1024*b5253557SWarner Losh prestr += 2; 1025*b5253557SWarner Losh /* Found it: map via locale TBD: for now 1026*b5253557SWarner Losh simply return this char. This is 1027*b5253557SWarner Losh sufficient to pass conformance test 1028*b5253557SWarner Losh awk.ex 156 1029*b5253557SWarner Losh */ 1030*b5253557SWarner Losh if (*prestr == ']') { 1031*b5253557SWarner Losh prestr++; 1032*b5253557SWarner Losh rlxval = equiv_char; 1033*b5253557SWarner Losh return CHAR; 1034*b5253557SWarner Losh } 1035*b5253557SWarner Losh } 10362a55deb1SDavid E. O'Brien } else if (c == '\0') { 10372a55deb1SDavid E. O'Brien FATAL("nonterminated character class %.20s", lastre); 10382a55deb1SDavid E. O'Brien } else if (bp == buf) { /* 1st char is special */ 10392a55deb1SDavid E. O'Brien *bp++ = c; 10402a55deb1SDavid E. O'Brien } else if (c == ']') { 10412a55deb1SDavid E. O'Brien *bp++ = 0; 10422a55deb1SDavid E. O'Brien rlxstr = (uschar *) tostring((char *) buf); 10432a55deb1SDavid E. O'Brien if (cflag == 0) 10442a55deb1SDavid E. O'Brien return CCL; 10452a55deb1SDavid E. O'Brien else 10462a55deb1SDavid E. O'Brien return NCCL; 10472a55deb1SDavid E. O'Brien } else 10482a55deb1SDavid E. O'Brien *bp++ = c; 10492a55deb1SDavid E. O'Brien } 1050*b5253557SWarner Losh break; 1051*b5253557SWarner Losh case '{': 1052*b5253557SWarner Losh if (isdigit(*(prestr))) { 1053*b5253557SWarner Losh num = 0; /* Process as a repetition */ 1054*b5253557SWarner Losh n = -1; m = -1; 1055*b5253557SWarner Losh commafound = 0; 1056*b5253557SWarner Losh digitfound = 0; 1057*b5253557SWarner Losh startreptok = prestr-1; 1058*b5253557SWarner Losh /* Remember start of previous atom here ? */ 1059*b5253557SWarner Losh } else { /* just a { char, not a repetition */ 1060*b5253557SWarner Losh rlxval = c; 1061*b5253557SWarner Losh return CHAR; 1062*b5253557SWarner Losh } 1063*b5253557SWarner Losh for (; ; ) { 1064*b5253557SWarner Losh if ((c = *prestr++) == '}') { 1065*b5253557SWarner Losh if (commafound) { 1066*b5253557SWarner Losh if (digitfound) { /* {n,m} */ 1067*b5253557SWarner Losh m = num; 1068*b5253557SWarner Losh if (m<n) 1069*b5253557SWarner Losh FATAL("illegal repetition expression: class %.20s", 1070*b5253557SWarner Losh lastre); 1071*b5253557SWarner Losh if ((n==0) && (m==1)) { 1072*b5253557SWarner Losh return QUEST; 1073*b5253557SWarner Losh } 1074*b5253557SWarner Losh } else { /* {n,} */ 1075*b5253557SWarner Losh if (n==0) return STAR; 1076*b5253557SWarner Losh if (n==1) return PLUS; 1077*b5253557SWarner Losh } 1078*b5253557SWarner Losh } else { 1079*b5253557SWarner Losh if (digitfound) { /* {n} same as {n,n} */ 1080*b5253557SWarner Losh n = num; 1081*b5253557SWarner Losh m = num; 1082*b5253557SWarner Losh } else { /* {} */ 1083*b5253557SWarner Losh FATAL("illegal repetition expression: class %.20s", 1084*b5253557SWarner Losh lastre); 1085*b5253557SWarner Losh } 1086*b5253557SWarner Losh } 1087*b5253557SWarner Losh if (repeat(starttok, prestr-starttok, lastatom, 1088*b5253557SWarner Losh startreptok - lastatom, n, m) > 0) { 1089*b5253557SWarner Losh if ((n==0) && (m==0)) { 1090*b5253557SWarner Losh return EMPTYRE; 1091*b5253557SWarner Losh } 1092*b5253557SWarner Losh /* must rescan input for next token */ 1093*b5253557SWarner Losh goto rescan; 1094*b5253557SWarner Losh } 1095*b5253557SWarner Losh /* Failed to replace: eat up {...} characters 1096*b5253557SWarner Losh and treat like just PLUS */ 1097*b5253557SWarner Losh return PLUS; 1098*b5253557SWarner Losh } else if (c == '\0') { 1099*b5253557SWarner Losh FATAL("nonterminated character class %.20s", 1100*b5253557SWarner Losh lastre); 1101*b5253557SWarner Losh } else if (isdigit(c)) { 1102*b5253557SWarner Losh num = 10 * num + c - '0'; 1103*b5253557SWarner Losh digitfound = 1; 1104*b5253557SWarner Losh } else if (c == ',') { 1105*b5253557SWarner Losh if (commafound) 1106*b5253557SWarner Losh FATAL("illegal repetition expression: class %.20s", 1107*b5253557SWarner Losh lastre); 1108*b5253557SWarner Losh /* looking for {n,} or {n,m} */ 1109*b5253557SWarner Losh commafound = 1; 1110*b5253557SWarner Losh n = num; 1111*b5253557SWarner Losh digitfound = 0; /* reset */ 1112*b5253557SWarner Losh num = 0; 1113*b5253557SWarner Losh } else { 1114*b5253557SWarner Losh FATAL("illegal repetition expression: class %.20s", 1115*b5253557SWarner Losh lastre); 1116*b5253557SWarner Losh } 1117*b5253557SWarner Losh } 1118*b5253557SWarner Losh break; 11192a55deb1SDavid E. O'Brien } 11202a55deb1SDavid E. O'Brien } 11212a55deb1SDavid E. O'Brien 11222a55deb1SDavid E. O'Brien int cgoto(fa *f, int s, int c) 11232a55deb1SDavid E. O'Brien { 11242a55deb1SDavid E. O'Brien int i, j, k; 11252a55deb1SDavid E. O'Brien int *p, *q; 11262a55deb1SDavid E. O'Brien 1127c263f9bfSRuslan Ermilov assert(c == HAT || c < NCHARS); 11282a55deb1SDavid E. O'Brien while (f->accept >= maxsetvec) { /* guessing here! */ 11292a55deb1SDavid E. O'Brien maxsetvec *= 4; 11302a55deb1SDavid E. O'Brien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 11312a55deb1SDavid E. O'Brien tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 1132*b5253557SWarner Losh if (setvec == 0 || tmpset == 0) 11332a55deb1SDavid E. O'Brien overflo("out of space in cgoto()"); 11342a55deb1SDavid E. O'Brien } 11352a55deb1SDavid E. O'Brien for (i = 0; i <= f->accept; i++) 11362a55deb1SDavid E. O'Brien setvec[i] = 0; 11372a55deb1SDavid E. O'Brien setcnt = 0; 11382a55deb1SDavid E. O'Brien /* compute positions of gototab[s,c] into setvec */ 11392a55deb1SDavid E. O'Brien p = f->posns[s]; 11402a55deb1SDavid E. O'Brien for (i = 1; i <= *p; i++) { 11412a55deb1SDavid E. O'Brien if ((k = f->re[p[i]].ltype) != FINAL) { 11422a55deb1SDavid E. O'Brien if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 11432a55deb1SDavid E. O'Brien || (k == DOT && c != 0 && c != HAT) 11442a55deb1SDavid E. O'Brien || (k == ALL && c != 0) 1145addad6afSRong-En Fan || (k == EMPTYRE && c != 0) 11462a55deb1SDavid E. O'Brien || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 11472a55deb1SDavid E. O'Brien || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 11482a55deb1SDavid E. O'Brien q = f->re[p[i]].lfollow; 11492a55deb1SDavid E. O'Brien for (j = 1; j <= *q; j++) { 11502a55deb1SDavid E. O'Brien if (q[j] >= maxsetvec) { 11512a55deb1SDavid E. O'Brien maxsetvec *= 4; 11522a55deb1SDavid E. O'Brien setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 115391217c1cSRuslan Ermilov tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 1154*b5253557SWarner Losh if (setvec == 0 || tmpset == 0) 11552a55deb1SDavid E. O'Brien overflo("cgoto overflow"); 11562a55deb1SDavid E. O'Brien } 11572a55deb1SDavid E. O'Brien if (setvec[q[j]] == 0) { 11582a55deb1SDavid E. O'Brien setcnt++; 11592a55deb1SDavid E. O'Brien setvec[q[j]] = 1; 11602a55deb1SDavid E. O'Brien } 11612a55deb1SDavid E. O'Brien } 11622a55deb1SDavid E. O'Brien } 11632a55deb1SDavid E. O'Brien } 11642a55deb1SDavid E. O'Brien } 11652a55deb1SDavid E. O'Brien /* determine if setvec is a previous state */ 11662a55deb1SDavid E. O'Brien tmpset[0] = setcnt; 11672a55deb1SDavid E. O'Brien j = 1; 11682a55deb1SDavid E. O'Brien for (i = f->accept; i >= 0; i--) 11692a55deb1SDavid E. O'Brien if (setvec[i]) { 11702a55deb1SDavid E. O'Brien tmpset[j++] = i; 11712a55deb1SDavid E. O'Brien } 11722a55deb1SDavid E. O'Brien /* tmpset == previous state? */ 11732a55deb1SDavid E. O'Brien for (i = 1; i <= f->curstat; i++) { 11742a55deb1SDavid E. O'Brien p = f->posns[i]; 11752a55deb1SDavid E. O'Brien if ((k = tmpset[0]) != p[0]) 11762a55deb1SDavid E. O'Brien goto different; 11772a55deb1SDavid E. O'Brien for (j = 1; j <= k; j++) 11782a55deb1SDavid E. O'Brien if (tmpset[j] != p[j]) 11792a55deb1SDavid E. O'Brien goto different; 11802a55deb1SDavid E. O'Brien /* setvec is state i */ 11812a55deb1SDavid E. O'Brien f->gototab[s][c] = i; 11822a55deb1SDavid E. O'Brien return i; 11832a55deb1SDavid E. O'Brien different:; 11842a55deb1SDavid E. O'Brien } 11852a55deb1SDavid E. O'Brien 11862a55deb1SDavid E. O'Brien /* add tmpset to current set of states */ 11872a55deb1SDavid E. O'Brien if (f->curstat >= NSTATES-1) { 11882a55deb1SDavid E. O'Brien f->curstat = 2; 11892a55deb1SDavid E. O'Brien f->reset = 1; 11902a55deb1SDavid E. O'Brien for (i = 2; i < NSTATES; i++) 11912a55deb1SDavid E. O'Brien xfree(f->posns[i]); 11922a55deb1SDavid E. O'Brien } else 11932a55deb1SDavid E. O'Brien ++(f->curstat); 11942a55deb1SDavid E. O'Brien for (i = 0; i < NCHARS; i++) 11952a55deb1SDavid E. O'Brien f->gototab[f->curstat][i] = 0; 11962a55deb1SDavid E. O'Brien xfree(f->posns[f->curstat]); 1197*b5253557SWarner Losh if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) 11982a55deb1SDavid E. O'Brien overflo("out of space in cgoto"); 11992a55deb1SDavid E. O'Brien 12002a55deb1SDavid E. O'Brien f->posns[f->curstat] = p; 12012a55deb1SDavid E. O'Brien f->gototab[s][c] = f->curstat; 12022a55deb1SDavid E. O'Brien for (i = 0; i <= setcnt; i++) 12032a55deb1SDavid E. O'Brien p[i] = tmpset[i]; 12042a55deb1SDavid E. O'Brien if (setvec[f->accept]) 12052a55deb1SDavid E. O'Brien f->out[f->curstat] = 1; 12062a55deb1SDavid E. O'Brien else 12072a55deb1SDavid E. O'Brien f->out[f->curstat] = 0; 12082a55deb1SDavid E. O'Brien return f->curstat; 12092a55deb1SDavid E. O'Brien } 12102a55deb1SDavid E. O'Brien 12112a55deb1SDavid E. O'Brien 12122a55deb1SDavid E. O'Brien void freefa(fa *f) /* free a finite automaton */ 12132a55deb1SDavid E. O'Brien { 12142a55deb1SDavid E. O'Brien int i; 12152a55deb1SDavid E. O'Brien 12162a55deb1SDavid E. O'Brien if (f == NULL) 12172a55deb1SDavid E. O'Brien return; 12182a55deb1SDavid E. O'Brien for (i = 0; i <= f->curstat; i++) 12192a55deb1SDavid E. O'Brien xfree(f->posns[i]); 12202a55deb1SDavid E. O'Brien for (i = 0; i <= f->accept; i++) { 12212a55deb1SDavid E. O'Brien xfree(f->re[i].lfollow); 12222a55deb1SDavid E. O'Brien if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 12232a55deb1SDavid E. O'Brien xfree((f->re[i].lval.np)); 12242a55deb1SDavid E. O'Brien } 12252a55deb1SDavid E. O'Brien xfree(f->restr); 12262a55deb1SDavid E. O'Brien xfree(f); 12272a55deb1SDavid E. O'Brien } 1228