14297a3b0SGarrett D'Amore /* 2*2d08521bSGarrett D'Amore * Copyright 2013 Garrett D'Amore <garrett@damore.org> 36b5e5868SGarrett D'Amore * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 433f5ff17SMilan Jurik * Copyright 2012 Milan Jurik. All rights reserved. 54297a3b0SGarrett D'Amore * Copyright (c) 1992, 1993, 1994 Henry Spencer. 64297a3b0SGarrett D'Amore * Copyright (c) 1992, 1993, 1994 74297a3b0SGarrett D'Amore * The Regents of the University of California. All rights reserved. 84297a3b0SGarrett D'Amore * 94297a3b0SGarrett D'Amore * This code is derived from software contributed to Berkeley by 104297a3b0SGarrett D'Amore * Henry Spencer. 114297a3b0SGarrett D'Amore * 124297a3b0SGarrett D'Amore * Redistribution and use in source and binary forms, with or without 134297a3b0SGarrett D'Amore * modification, are permitted provided that the following conditions 144297a3b0SGarrett D'Amore * are met: 154297a3b0SGarrett D'Amore * 1. Redistributions of source code must retain the above copyright 164297a3b0SGarrett D'Amore * notice, this list of conditions and the following disclaimer. 174297a3b0SGarrett D'Amore * 2. Redistributions in binary form must reproduce the above copyright 184297a3b0SGarrett D'Amore * notice, this list of conditions and the following disclaimer in the 194297a3b0SGarrett D'Amore * documentation and/or other materials provided with the distribution. 204297a3b0SGarrett D'Amore * 4. Neither the name of the University nor the names of its contributors 214297a3b0SGarrett D'Amore * may be used to endorse or promote products derived from this software 224297a3b0SGarrett D'Amore * without specific prior written permission. 234297a3b0SGarrett D'Amore * 244297a3b0SGarrett D'Amore * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 254297a3b0SGarrett D'Amore * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 264297a3b0SGarrett D'Amore * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 274297a3b0SGarrett D'Amore * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 284297a3b0SGarrett D'Amore * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 294297a3b0SGarrett D'Amore * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 304297a3b0SGarrett D'Amore * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 314297a3b0SGarrett D'Amore * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 324297a3b0SGarrett D'Amore * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 334297a3b0SGarrett D'Amore * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 344297a3b0SGarrett D'Amore * SUCH DAMAGE. 354297a3b0SGarrett D'Amore */ 364297a3b0SGarrett D'Amore 374297a3b0SGarrett D'Amore #include "lint.h" 384297a3b0SGarrett D'Amore #include "file64.h" 394297a3b0SGarrett D'Amore #include <sys/types.h> 404297a3b0SGarrett D'Amore #include <stdio.h> 414297a3b0SGarrett D'Amore #include <string.h> 424297a3b0SGarrett D'Amore #include <ctype.h> 434297a3b0SGarrett D'Amore #include <limits.h> 444297a3b0SGarrett D'Amore #include <stdlib.h> 454297a3b0SGarrett D'Amore #include <regex.h> 464297a3b0SGarrett D'Amore #include <wchar.h> 474297a3b0SGarrett D'Amore #include <wctype.h> 484297a3b0SGarrett D'Amore 494297a3b0SGarrett D'Amore #include "runetype.h" 504297a3b0SGarrett D'Amore #include "collate.h" 514297a3b0SGarrett D'Amore 524297a3b0SGarrett D'Amore #include "utils.h" 534297a3b0SGarrett D'Amore #include "regex2.h" 544297a3b0SGarrett D'Amore 554297a3b0SGarrett D'Amore #include "cname.h" 564297a3b0SGarrett D'Amore #include "mblocal.h" 574297a3b0SGarrett D'Amore 584297a3b0SGarrett D'Amore /* 594297a3b0SGarrett D'Amore * parse structure, passed up and down to avoid global variables and 604297a3b0SGarrett D'Amore * other clumsinesses 614297a3b0SGarrett D'Amore */ 624297a3b0SGarrett D'Amore struct parse { 634297a3b0SGarrett D'Amore char *next; /* next character in RE */ 644297a3b0SGarrett D'Amore char *end; /* end of string (-> NUL normally) */ 654297a3b0SGarrett D'Amore int error; /* has an error been seen? */ 664297a3b0SGarrett D'Amore sop *strip; /* malloced strip */ 674297a3b0SGarrett D'Amore sopno ssize; /* malloced strip size (allocated) */ 684297a3b0SGarrett D'Amore sopno slen; /* malloced strip length (used) */ 694297a3b0SGarrett D'Amore int ncsalloc; /* number of csets allocated */ 704297a3b0SGarrett D'Amore struct re_guts *g; 714297a3b0SGarrett D'Amore #define NPAREN 10 /* we need to remember () 1-9 for back refs */ 724297a3b0SGarrett D'Amore sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 734297a3b0SGarrett D'Amore sopno pend[NPAREN]; /* -> ) ([0] unused) */ 744297a3b0SGarrett D'Amore }; 754297a3b0SGarrett D'Amore 764297a3b0SGarrett D'Amore /* ========= begin header generated by ./mkh ========= */ 774297a3b0SGarrett D'Amore #ifdef __cplusplus 784297a3b0SGarrett D'Amore extern "C" { 794297a3b0SGarrett D'Amore #endif 804297a3b0SGarrett D'Amore 814297a3b0SGarrett D'Amore /* === regcomp.c === */ 824297a3b0SGarrett D'Amore static void p_ere(struct parse *p, wint_t stop); 834297a3b0SGarrett D'Amore static void p_ere_exp(struct parse *p); 844297a3b0SGarrett D'Amore static void p_str(struct parse *p); 854297a3b0SGarrett D'Amore static void p_bre(struct parse *p, wint_t end1, wint_t end2); 864297a3b0SGarrett D'Amore static int p_simp_re(struct parse *p, int starordinary); 874297a3b0SGarrett D'Amore static int p_count(struct parse *p); 884297a3b0SGarrett D'Amore static void p_bracket(struct parse *p); 894297a3b0SGarrett D'Amore static void p_b_term(struct parse *p, cset *cs); 904297a3b0SGarrett D'Amore static void p_b_cclass(struct parse *p, cset *cs); 914297a3b0SGarrett D'Amore static void p_b_eclass(struct parse *p, cset *cs); 924297a3b0SGarrett D'Amore static wint_t p_b_symbol(struct parse *p); 934297a3b0SGarrett D'Amore static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 944297a3b0SGarrett D'Amore static wint_t othercase(wint_t ch); 954297a3b0SGarrett D'Amore static void bothcases(struct parse *p, wint_t ch); 964297a3b0SGarrett D'Amore static void ordinary(struct parse *p, wint_t ch); 974297a3b0SGarrett D'Amore static void nonnewline(struct parse *p); 984297a3b0SGarrett D'Amore static void repeat(struct parse *p, sopno start, int from, int to); 994297a3b0SGarrett D'Amore static int seterr(struct parse *p, int e); 1004297a3b0SGarrett D'Amore static cset *allocset(struct parse *p); 1014297a3b0SGarrett D'Amore static void freeset(struct parse *p, cset *cs); 1024297a3b0SGarrett D'Amore static void CHadd(struct parse *p, cset *cs, wint_t ch); 1034297a3b0SGarrett D'Amore static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 1044297a3b0SGarrett D'Amore static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 1054297a3b0SGarrett D'Amore static wint_t singleton(cset *cs); 1064297a3b0SGarrett D'Amore static sopno dupl(struct parse *p, sopno start, sopno finish); 1074297a3b0SGarrett D'Amore static void doemit(struct parse *p, sop op, size_t opnd); 1084297a3b0SGarrett D'Amore static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1094297a3b0SGarrett D'Amore static void dofwd(struct parse *p, sopno pos, sop value); 1104297a3b0SGarrett D'Amore static void enlarge(struct parse *p, sopno size); 1114297a3b0SGarrett D'Amore static void stripsnug(struct parse *p, struct re_guts *g); 1124297a3b0SGarrett D'Amore static void findmust(struct parse *p, struct re_guts *g); 1134297a3b0SGarrett D'Amore static int altoffset(sop *scan, int offset); 1144297a3b0SGarrett D'Amore static void computejumps(struct parse *p, struct re_guts *g); 1154297a3b0SGarrett D'Amore static void computematchjumps(struct parse *p, struct re_guts *g); 1164297a3b0SGarrett D'Amore static sopno pluscount(struct parse *p, struct re_guts *g); 1174297a3b0SGarrett D'Amore static wint_t wgetnext(struct parse *p); 1184297a3b0SGarrett D'Amore 1194297a3b0SGarrett D'Amore #ifdef __cplusplus 1204297a3b0SGarrett D'Amore } 1214297a3b0SGarrett D'Amore #endif 1224297a3b0SGarrett D'Amore /* ========= end header generated by ./mkh ========= */ 1234297a3b0SGarrett D'Amore 1244297a3b0SGarrett D'Amore static char nuls[10]; /* place to point scanner in event of error */ 1254297a3b0SGarrett D'Amore 1264297a3b0SGarrett D'Amore /* 1274297a3b0SGarrett D'Amore * macros for use with parse structure 1284297a3b0SGarrett D'Amore * BEWARE: these know that the parse structure is named `p' !!! 1294297a3b0SGarrett D'Amore */ 1304297a3b0SGarrett D'Amore #define PEEK() (*p->next) 1314297a3b0SGarrett D'Amore #define PEEK2() (*(p->next+1)) 1324297a3b0SGarrett D'Amore #define MORE() (p->next < p->end) 1334297a3b0SGarrett D'Amore #define MORE2() (p->next+1 < p->end) 1344297a3b0SGarrett D'Amore #define SEE(c) (MORE() && PEEK() == (c)) 1354297a3b0SGarrett D'Amore #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 1364297a3b0SGarrett D'Amore #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 1374297a3b0SGarrett D'Amore #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 1384297a3b0SGarrett D'Amore #define NEXT() (p->next++) 1394297a3b0SGarrett D'Amore #define NEXT2() (p->next += 2) 1404297a3b0SGarrett D'Amore #define NEXTn(n) (p->next += (n)) 1414297a3b0SGarrett D'Amore #define GETNEXT() (*p->next++) 1424297a3b0SGarrett D'Amore #define WGETNEXT() wgetnext(p) 1434297a3b0SGarrett D'Amore #define SETERROR(e) ((void)seterr(p, (e))) 1444297a3b0SGarrett D'Amore #define REQUIRE(co, e) ((co) || seterr(p, e)) 1454297a3b0SGarrett D'Amore #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 1464297a3b0SGarrett D'Amore #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 1474297a3b0SGarrett D'Amore #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 1484297a3b0SGarrett D'Amore #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 1494297a3b0SGarrett D'Amore #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 1504297a3b0SGarrett D'Amore #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 1514297a3b0SGarrett D'Amore #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 1524297a3b0SGarrett D'Amore #define HERE() (p->slen) 1534297a3b0SGarrett D'Amore #define THERE() (p->slen - 1) 1544297a3b0SGarrett D'Amore #define THERETHERE() (p->slen - 2) 1554297a3b0SGarrett D'Amore #define DROP(n) (p->slen -= (n)) 1564297a3b0SGarrett D'Amore 1574297a3b0SGarrett D'Amore #ifndef NDEBUG 1584297a3b0SGarrett D'Amore static int never = 0; /* for use in asserts; shuts lint up */ 1594297a3b0SGarrett D'Amore #else 1604297a3b0SGarrett D'Amore #define never 0 /* some <assert.h>s have bugs too */ 1614297a3b0SGarrett D'Amore #endif 1624297a3b0SGarrett D'Amore 1634297a3b0SGarrett D'Amore /* 1644297a3b0SGarrett D'Amore * regcomp - interface for parser and compilation 1654297a3b0SGarrett D'Amore */ 1664297a3b0SGarrett D'Amore int /* 0 success, otherwise REG_something */ 1674297a3b0SGarrett D'Amore regcomp(regex_t *_RESTRICT_KYWD preg, 1684297a3b0SGarrett D'Amore const char *_RESTRICT_KYWD pattern, 1694297a3b0SGarrett D'Amore int cflags) 1704297a3b0SGarrett D'Amore { 1714297a3b0SGarrett D'Amore struct parse pa; 1724297a3b0SGarrett D'Amore struct re_guts *g; 1734297a3b0SGarrett D'Amore struct parse *p = &pa; 1744297a3b0SGarrett D'Amore int i; 1754297a3b0SGarrett D'Amore size_t len; 1764297a3b0SGarrett D'Amore #ifdef REDEBUG 1774297a3b0SGarrett D'Amore #define GOODFLAGS(f) (f) 1784297a3b0SGarrett D'Amore #else 1794297a3b0SGarrett D'Amore #define GOODFLAGS(f) ((f)&~REG_DUMP) 1804297a3b0SGarrett D'Amore #endif 1814297a3b0SGarrett D'Amore 1824297a3b0SGarrett D'Amore /* We had REG_INVARG, but we don't have that on Solaris. */ 1834297a3b0SGarrett D'Amore cflags = GOODFLAGS(cflags); 1844297a3b0SGarrett D'Amore if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 1854297a3b0SGarrett D'Amore return (REG_EFATAL); 1864297a3b0SGarrett D'Amore 1874297a3b0SGarrett D'Amore if (cflags®_PEND) { 1884297a3b0SGarrett D'Amore if (preg->re_endp < pattern) 1894297a3b0SGarrett D'Amore return (REG_EFATAL); 1904297a3b0SGarrett D'Amore len = preg->re_endp - pattern; 1914297a3b0SGarrett D'Amore } else 1924297a3b0SGarrett D'Amore len = strlen((char *)pattern); 1934297a3b0SGarrett D'Amore 1944297a3b0SGarrett D'Amore /* do the mallocs early so failure handling is easy */ 1954297a3b0SGarrett D'Amore g = (struct re_guts *)malloc(sizeof (struct re_guts)); 1964297a3b0SGarrett D'Amore if (g == NULL) 1974297a3b0SGarrett D'Amore return (REG_ESPACE); 1984297a3b0SGarrett D'Amore p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 1994297a3b0SGarrett D'Amore p->strip = (sop *)malloc(p->ssize * sizeof (sop)); 2004297a3b0SGarrett D'Amore p->slen = 0; 2014297a3b0SGarrett D'Amore if (p->strip == NULL) { 2024297a3b0SGarrett D'Amore free((char *)g); 2034297a3b0SGarrett D'Amore return (REG_ESPACE); 2044297a3b0SGarrett D'Amore } 2054297a3b0SGarrett D'Amore 2064297a3b0SGarrett D'Amore /* set things up */ 2074297a3b0SGarrett D'Amore p->g = g; 2084297a3b0SGarrett D'Amore p->next = (char *)pattern; /* convenience; we do not modify it */ 2094297a3b0SGarrett D'Amore p->end = p->next + len; 2104297a3b0SGarrett D'Amore p->error = 0; 2114297a3b0SGarrett D'Amore p->ncsalloc = 0; 2124297a3b0SGarrett D'Amore for (i = 0; i < NPAREN; i++) { 2134297a3b0SGarrett D'Amore p->pbegin[i] = 0; 2144297a3b0SGarrett D'Amore p->pend[i] = 0; 2154297a3b0SGarrett D'Amore } 2164297a3b0SGarrett D'Amore g->sets = NULL; 2174297a3b0SGarrett D'Amore g->ncsets = 0; 2184297a3b0SGarrett D'Amore g->cflags = cflags; 2194297a3b0SGarrett D'Amore g->iflags = 0; 2204297a3b0SGarrett D'Amore g->nbol = 0; 2214297a3b0SGarrett D'Amore g->neol = 0; 2224297a3b0SGarrett D'Amore g->must = NULL; 2234297a3b0SGarrett D'Amore g->moffset = -1; 2244297a3b0SGarrett D'Amore g->charjump = NULL; 2254297a3b0SGarrett D'Amore g->matchjump = NULL; 2264297a3b0SGarrett D'Amore g->mlen = 0; 2274297a3b0SGarrett D'Amore g->nsub = 0; 2284297a3b0SGarrett D'Amore g->backrefs = 0; 2294297a3b0SGarrett D'Amore 2304297a3b0SGarrett D'Amore /* do it */ 2314297a3b0SGarrett D'Amore EMIT(OEND, 0); 2324297a3b0SGarrett D'Amore g->firststate = THERE(); 2334297a3b0SGarrett D'Amore if (cflags®_EXTENDED) 2344297a3b0SGarrett D'Amore p_ere(p, OUT); 2354297a3b0SGarrett D'Amore else if (cflags®_NOSPEC) 2364297a3b0SGarrett D'Amore p_str(p); 2374297a3b0SGarrett D'Amore else 2384297a3b0SGarrett D'Amore p_bre(p, OUT, OUT); 2394297a3b0SGarrett D'Amore EMIT(OEND, 0); 2404297a3b0SGarrett D'Amore g->laststate = THERE(); 2414297a3b0SGarrett D'Amore 2424297a3b0SGarrett D'Amore /* tidy up loose ends and fill things in */ 2434297a3b0SGarrett D'Amore stripsnug(p, g); 2444297a3b0SGarrett D'Amore findmust(p, g); 2454297a3b0SGarrett D'Amore /* 2464297a3b0SGarrett D'Amore * only use Boyer-Moore algorithm if the pattern is bigger 2474297a3b0SGarrett D'Amore * than three characters 2484297a3b0SGarrett D'Amore */ 2494297a3b0SGarrett D'Amore if (g->mlen > 3) { 2504297a3b0SGarrett D'Amore computejumps(p, g); 2514297a3b0SGarrett D'Amore computematchjumps(p, g); 2524297a3b0SGarrett D'Amore if (g->matchjump == NULL && g->charjump != NULL) { 2534297a3b0SGarrett D'Amore free(g->charjump); 2544297a3b0SGarrett D'Amore g->charjump = NULL; 2554297a3b0SGarrett D'Amore } 2564297a3b0SGarrett D'Amore } 2574297a3b0SGarrett D'Amore g->nplus = pluscount(p, g); 2584297a3b0SGarrett D'Amore g->magic = MAGIC2; 2594297a3b0SGarrett D'Amore preg->re_nsub = g->nsub; 2604297a3b0SGarrett D'Amore preg->re_g = g; 2614297a3b0SGarrett D'Amore preg->re_magic = MAGIC1; 2624297a3b0SGarrett D'Amore #ifndef REDEBUG 2634297a3b0SGarrett D'Amore /* not debugging, so can't rely on the assert() in regexec() */ 2644297a3b0SGarrett D'Amore if (g->iflags&BAD) 2654297a3b0SGarrett D'Amore SETERROR(REG_EFATAL); 2664297a3b0SGarrett D'Amore #endif 2674297a3b0SGarrett D'Amore 2684297a3b0SGarrett D'Amore /* win or lose, we're done */ 2694297a3b0SGarrett D'Amore if (p->error != 0) /* lose */ 2704297a3b0SGarrett D'Amore regfree(preg); 2714297a3b0SGarrett D'Amore return (p->error); 2724297a3b0SGarrett D'Amore } 2734297a3b0SGarrett D'Amore 2744297a3b0SGarrett D'Amore /* 2754297a3b0SGarrett D'Amore * p_ere - ERE parser top level, concatenation and alternation 2764297a3b0SGarrett D'Amore */ 2774297a3b0SGarrett D'Amore static void 2784297a3b0SGarrett D'Amore p_ere(struct parse *p, 2794297a3b0SGarrett D'Amore wint_t stop) /* character this ERE should end at */ 2804297a3b0SGarrett D'Amore { 2814297a3b0SGarrett D'Amore char c; 2824297a3b0SGarrett D'Amore sopno prevback; 2834297a3b0SGarrett D'Amore sopno prevfwd; 2844297a3b0SGarrett D'Amore sopno conc; 2854297a3b0SGarrett D'Amore int first = 1; /* is this the first alternative? */ 2864297a3b0SGarrett D'Amore 2874297a3b0SGarrett D'Amore for (;;) { 2884297a3b0SGarrett D'Amore /* do a bunch of concatenated expressions */ 2894297a3b0SGarrett D'Amore conc = HERE(); 2904297a3b0SGarrett D'Amore while (MORE() && (c = PEEK()) != '|' && c != stop) 2914297a3b0SGarrett D'Amore p_ere_exp(p); 2924297a3b0SGarrett D'Amore /* require nonempty */ 2934297a3b0SGarrett D'Amore (void) REQUIRE(HERE() != conc, REG_BADPAT); 2944297a3b0SGarrett D'Amore 2954297a3b0SGarrett D'Amore if (!EAT('|')) 2964297a3b0SGarrett D'Amore break; /* NOTE BREAK OUT */ 2974297a3b0SGarrett D'Amore 2984297a3b0SGarrett D'Amore if (first) { 2994297a3b0SGarrett D'Amore INSERT(OCH_, conc); /* offset is wrong */ 3004297a3b0SGarrett D'Amore prevfwd = conc; 3014297a3b0SGarrett D'Amore prevback = conc; 3024297a3b0SGarrett D'Amore first = 0; 3034297a3b0SGarrett D'Amore } 3044297a3b0SGarrett D'Amore ASTERN(OOR1, prevback); 3054297a3b0SGarrett D'Amore prevback = THERE(); 3064297a3b0SGarrett D'Amore AHEAD(prevfwd); /* fix previous offset */ 3074297a3b0SGarrett D'Amore prevfwd = HERE(); 3084297a3b0SGarrett D'Amore EMIT(OOR2, 0); /* offset is very wrong */ 3094297a3b0SGarrett D'Amore } 3104297a3b0SGarrett D'Amore 3114297a3b0SGarrett D'Amore if (!first) { /* tail-end fixups */ 3124297a3b0SGarrett D'Amore AHEAD(prevfwd); 3134297a3b0SGarrett D'Amore ASTERN(O_CH, prevback); 3144297a3b0SGarrett D'Amore } 3154297a3b0SGarrett D'Amore 3164297a3b0SGarrett D'Amore assert(!MORE() || SEE(stop)); 3174297a3b0SGarrett D'Amore } 3184297a3b0SGarrett D'Amore 3194297a3b0SGarrett D'Amore /* 3204297a3b0SGarrett D'Amore * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 3214297a3b0SGarrett D'Amore */ 3224297a3b0SGarrett D'Amore static void 3234297a3b0SGarrett D'Amore p_ere_exp(struct parse *p) 3244297a3b0SGarrett D'Amore { 3254297a3b0SGarrett D'Amore char c; 3264297a3b0SGarrett D'Amore wint_t wc; 3274297a3b0SGarrett D'Amore sopno pos; 3284297a3b0SGarrett D'Amore int count; 3294297a3b0SGarrett D'Amore int count2; 3304297a3b0SGarrett D'Amore sopno subno; 3314297a3b0SGarrett D'Amore int wascaret = 0; 3324297a3b0SGarrett D'Amore 3334297a3b0SGarrett D'Amore assert(MORE()); /* caller should have ensured this */ 3344297a3b0SGarrett D'Amore c = GETNEXT(); 3354297a3b0SGarrett D'Amore 3364297a3b0SGarrett D'Amore pos = HERE(); 3374297a3b0SGarrett D'Amore switch (c) { 3384297a3b0SGarrett D'Amore case '(': 3394297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EPAREN); 3404297a3b0SGarrett D'Amore p->g->nsub++; 3414297a3b0SGarrett D'Amore subno = p->g->nsub; 3424297a3b0SGarrett D'Amore if (subno < NPAREN) 3434297a3b0SGarrett D'Amore p->pbegin[subno] = HERE(); 3444297a3b0SGarrett D'Amore EMIT(OLPAREN, subno); 3454297a3b0SGarrett D'Amore if (!SEE(')')) 3464297a3b0SGarrett D'Amore p_ere(p, ')'); 3474297a3b0SGarrett D'Amore if (subno < NPAREN) { 3484297a3b0SGarrett D'Amore p->pend[subno] = HERE(); 3494297a3b0SGarrett D'Amore assert(p->pend[subno] != 0); 3504297a3b0SGarrett D'Amore } 3514297a3b0SGarrett D'Amore EMIT(ORPAREN, subno); 3524297a3b0SGarrett D'Amore (void) MUSTEAT(')', REG_EPAREN); 3534297a3b0SGarrett D'Amore break; 3544297a3b0SGarrett D'Amore #ifndef POSIX_MISTAKE 3554297a3b0SGarrett D'Amore case ')': /* happens only if no current unmatched ( */ 3564297a3b0SGarrett D'Amore /* 3574297a3b0SGarrett D'Amore * You may ask, why the ifndef? Because I didn't notice 3584297a3b0SGarrett D'Amore * this until slightly too late for 1003.2, and none of the 3594297a3b0SGarrett D'Amore * other 1003.2 regular-expression reviewers noticed it at 3604297a3b0SGarrett D'Amore * all. So an unmatched ) is legal POSIX, at least until 3614297a3b0SGarrett D'Amore * we can get it fixed. 3624297a3b0SGarrett D'Amore */ 3634297a3b0SGarrett D'Amore SETERROR(REG_EPAREN); 3644297a3b0SGarrett D'Amore break; 3654297a3b0SGarrett D'Amore #endif 3664297a3b0SGarrett D'Amore case '^': 3674297a3b0SGarrett D'Amore EMIT(OBOL, 0); 3684297a3b0SGarrett D'Amore p->g->iflags |= USEBOL; 3694297a3b0SGarrett D'Amore p->g->nbol++; 3704297a3b0SGarrett D'Amore wascaret = 1; 3714297a3b0SGarrett D'Amore break; 3724297a3b0SGarrett D'Amore case '$': 3734297a3b0SGarrett D'Amore EMIT(OEOL, 0); 3744297a3b0SGarrett D'Amore p->g->iflags |= USEEOL; 3754297a3b0SGarrett D'Amore p->g->neol++; 3764297a3b0SGarrett D'Amore break; 3774297a3b0SGarrett D'Amore case '|': 3784297a3b0SGarrett D'Amore SETERROR(REG_BADPAT); 3794297a3b0SGarrett D'Amore break; 3804297a3b0SGarrett D'Amore case '*': 3814297a3b0SGarrett D'Amore case '+': 3824297a3b0SGarrett D'Amore case '?': 3834297a3b0SGarrett D'Amore SETERROR(REG_BADRPT); 3844297a3b0SGarrett D'Amore break; 3854297a3b0SGarrett D'Amore case '.': 3864297a3b0SGarrett D'Amore if (p->g->cflags®_NEWLINE) 3874297a3b0SGarrett D'Amore nonnewline(p); 3884297a3b0SGarrett D'Amore else 3894297a3b0SGarrett D'Amore EMIT(OANY, 0); 3904297a3b0SGarrett D'Amore break; 3914297a3b0SGarrett D'Amore case '[': 3924297a3b0SGarrett D'Amore p_bracket(p); 3934297a3b0SGarrett D'Amore break; 3944297a3b0SGarrett D'Amore case '\\': 3954297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EESCAPE); 3964297a3b0SGarrett D'Amore wc = WGETNEXT(); 39784441f85SGarrett D'Amore switch (wc) { 39884441f85SGarrett D'Amore case '<': 39984441f85SGarrett D'Amore EMIT(OBOW, 0); 40084441f85SGarrett D'Amore break; 40184441f85SGarrett D'Amore case '>': 40284441f85SGarrett D'Amore EMIT(OEOW, 0); 40384441f85SGarrett D'Amore break; 40484441f85SGarrett D'Amore default: 4054297a3b0SGarrett D'Amore ordinary(p, wc); 4064297a3b0SGarrett D'Amore break; 40784441f85SGarrett D'Amore } 40884441f85SGarrett D'Amore break; 4094297a3b0SGarrett D'Amore case '{': /* okay as ordinary except if digit follows */ 4104297a3b0SGarrett D'Amore (void) REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 4114297a3b0SGarrett D'Amore /* FALLTHROUGH */ 4124297a3b0SGarrett D'Amore default: 4134297a3b0SGarrett D'Amore p->next--; 4144297a3b0SGarrett D'Amore wc = WGETNEXT(); 4154297a3b0SGarrett D'Amore ordinary(p, wc); 4164297a3b0SGarrett D'Amore break; 4174297a3b0SGarrett D'Amore } 4184297a3b0SGarrett D'Amore 4194297a3b0SGarrett D'Amore if (!MORE()) 4204297a3b0SGarrett D'Amore return; 4214297a3b0SGarrett D'Amore c = PEEK(); 4224297a3b0SGarrett D'Amore /* we call { a repetition if followed by a digit */ 4234297a3b0SGarrett D'Amore if (!(c == '*' || c == '+' || c == '?' || 4244297a3b0SGarrett D'Amore (c == '{' && MORE2() && isdigit((uch)PEEK2())))) 4254297a3b0SGarrett D'Amore return; /* no repetition, we're done */ 4264297a3b0SGarrett D'Amore NEXT(); 4274297a3b0SGarrett D'Amore 4284297a3b0SGarrett D'Amore (void) REQUIRE(!wascaret, REG_BADRPT); 4294297a3b0SGarrett D'Amore switch (c) { 4304297a3b0SGarrett D'Amore case '*': /* implemented as +? */ 4314297a3b0SGarrett D'Amore /* this case does not require the (y|) trick, noKLUDGE */ 4324297a3b0SGarrett D'Amore INSERT(OPLUS_, pos); 4334297a3b0SGarrett D'Amore ASTERN(O_PLUS, pos); 4344297a3b0SGarrett D'Amore INSERT(OQUEST_, pos); 4354297a3b0SGarrett D'Amore ASTERN(O_QUEST, pos); 4364297a3b0SGarrett D'Amore break; 4374297a3b0SGarrett D'Amore case '+': 4384297a3b0SGarrett D'Amore INSERT(OPLUS_, pos); 4394297a3b0SGarrett D'Amore ASTERN(O_PLUS, pos); 4404297a3b0SGarrett D'Amore break; 4414297a3b0SGarrett D'Amore case '?': 4424297a3b0SGarrett D'Amore /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 4434297a3b0SGarrett D'Amore INSERT(OCH_, pos); /* offset slightly wrong */ 4444297a3b0SGarrett D'Amore ASTERN(OOR1, pos); /* this one's right */ 4454297a3b0SGarrett D'Amore AHEAD(pos); /* fix the OCH_ */ 4464297a3b0SGarrett D'Amore EMIT(OOR2, 0); /* offset very wrong... */ 4474297a3b0SGarrett D'Amore AHEAD(THERE()); /* ...so fix it */ 4484297a3b0SGarrett D'Amore ASTERN(O_CH, THERETHERE()); 4494297a3b0SGarrett D'Amore break; 4504297a3b0SGarrett D'Amore case '{': 4514297a3b0SGarrett D'Amore count = p_count(p); 4524297a3b0SGarrett D'Amore if (EAT(',')) { 4534297a3b0SGarrett D'Amore if (isdigit((uch)PEEK())) { 4544297a3b0SGarrett D'Amore count2 = p_count(p); 4554297a3b0SGarrett D'Amore (void) REQUIRE(count <= count2, REG_BADBR); 4564297a3b0SGarrett D'Amore } else /* single number with comma */ 4574297a3b0SGarrett D'Amore count2 = INFINITY; 4584297a3b0SGarrett D'Amore } else /* just a single number */ 4594297a3b0SGarrett D'Amore count2 = count; 4604297a3b0SGarrett D'Amore repeat(p, pos, count, count2); 4614297a3b0SGarrett D'Amore if (!EAT('}')) { /* error heuristics */ 4624297a3b0SGarrett D'Amore while (MORE() && PEEK() != '}') 4634297a3b0SGarrett D'Amore NEXT(); 4644297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EBRACE); 4654297a3b0SGarrett D'Amore SETERROR(REG_BADBR); 4664297a3b0SGarrett D'Amore } 4674297a3b0SGarrett D'Amore break; 4684297a3b0SGarrett D'Amore } 4694297a3b0SGarrett D'Amore 4704297a3b0SGarrett D'Amore if (!MORE()) 4714297a3b0SGarrett D'Amore return; 4724297a3b0SGarrett D'Amore c = PEEK(); 4734297a3b0SGarrett D'Amore if (!(c == '*' || c == '+' || c == '?' || 4744297a3b0SGarrett D'Amore (c == '{' && MORE2() && isdigit((uch)PEEK2())))) 4754297a3b0SGarrett D'Amore return; 4764297a3b0SGarrett D'Amore SETERROR(REG_BADRPT); 4774297a3b0SGarrett D'Amore } 4784297a3b0SGarrett D'Amore 4794297a3b0SGarrett D'Amore /* 4804297a3b0SGarrett D'Amore * p_str - string (no metacharacters) "parser" 4814297a3b0SGarrett D'Amore */ 4824297a3b0SGarrett D'Amore static void 4834297a3b0SGarrett D'Amore p_str(struct parse *p) 4844297a3b0SGarrett D'Amore { 4854297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_BADPAT); 4864297a3b0SGarrett D'Amore while (MORE()) 4874297a3b0SGarrett D'Amore ordinary(p, WGETNEXT()); 4884297a3b0SGarrett D'Amore } 4894297a3b0SGarrett D'Amore 4904297a3b0SGarrett D'Amore /* 4914297a3b0SGarrett D'Amore * p_bre - BRE parser top level, anchoring and concatenation 4924297a3b0SGarrett D'Amore * Giving end1 as OUT essentially eliminates the end1/end2 check. 4934297a3b0SGarrett D'Amore * 4944297a3b0SGarrett D'Amore * This implementation is a bit of a kludge, in that a trailing $ is first 4954297a3b0SGarrett D'Amore * taken as an ordinary character and then revised to be an anchor. 4964297a3b0SGarrett D'Amore * The amount of lookahead needed to avoid this kludge is excessive. 4974297a3b0SGarrett D'Amore */ 4984297a3b0SGarrett D'Amore static void 4994297a3b0SGarrett D'Amore p_bre(struct parse *p, 5004297a3b0SGarrett D'Amore wint_t end1, /* first terminating character */ 5014297a3b0SGarrett D'Amore wint_t end2) /* second terminating character */ 5024297a3b0SGarrett D'Amore { 5034297a3b0SGarrett D'Amore sopno start = HERE(); 5044297a3b0SGarrett D'Amore int first = 1; /* first subexpression? */ 5054297a3b0SGarrett D'Amore int wasdollar = 0; 5064297a3b0SGarrett D'Amore 5074297a3b0SGarrett D'Amore if (EAT('^')) { 5084297a3b0SGarrett D'Amore EMIT(OBOL, 0); 5094297a3b0SGarrett D'Amore p->g->iflags |= USEBOL; 5104297a3b0SGarrett D'Amore p->g->nbol++; 5114297a3b0SGarrett D'Amore } 5124297a3b0SGarrett D'Amore while (MORE() && !SEETWO(end1, end2)) { 5134297a3b0SGarrett D'Amore wasdollar = p_simp_re(p, first); 5144297a3b0SGarrett D'Amore first = 0; 5154297a3b0SGarrett D'Amore } 5164297a3b0SGarrett D'Amore if (wasdollar) { /* oops, that was a trailing anchor */ 5174297a3b0SGarrett D'Amore DROP(1); 5184297a3b0SGarrett D'Amore EMIT(OEOL, 0); 5194297a3b0SGarrett D'Amore p->g->iflags |= USEEOL; 5204297a3b0SGarrett D'Amore p->g->neol++; 5214297a3b0SGarrett D'Amore } 5224297a3b0SGarrett D'Amore 5234297a3b0SGarrett D'Amore (void) REQUIRE(HERE() != start, REG_BADPAT); /* require nonempty */ 5244297a3b0SGarrett D'Amore } 5254297a3b0SGarrett D'Amore 5264297a3b0SGarrett D'Amore /* 5274297a3b0SGarrett D'Amore * p_simp_re - parse a simple RE, an atom possibly followed by a repetition 5284297a3b0SGarrett D'Amore */ 5294297a3b0SGarrett D'Amore static int /* was the simple RE an unbackslashed $? */ 5304297a3b0SGarrett D'Amore p_simp_re(struct parse *p, 5314297a3b0SGarrett D'Amore int starordinary) /* is a leading * an ordinary character? */ 5324297a3b0SGarrett D'Amore { 5334297a3b0SGarrett D'Amore int c; 5344297a3b0SGarrett D'Amore int count; 5354297a3b0SGarrett D'Amore int count2; 5364297a3b0SGarrett D'Amore sopno pos; 5374297a3b0SGarrett D'Amore int i; 5384297a3b0SGarrett D'Amore wint_t wc; 5394297a3b0SGarrett D'Amore sopno subno; 5404297a3b0SGarrett D'Amore #define BACKSL (1<<CHAR_BIT) 5414297a3b0SGarrett D'Amore 5424297a3b0SGarrett D'Amore pos = HERE(); /* repetion op, if any, covers from here */ 5434297a3b0SGarrett D'Amore 5444297a3b0SGarrett D'Amore assert(MORE()); /* caller should have ensured this */ 5454297a3b0SGarrett D'Amore c = GETNEXT(); 5464297a3b0SGarrett D'Amore if (c == '\\') { 5474297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EESCAPE); 5484297a3b0SGarrett D'Amore c = BACKSL | GETNEXT(); 5494297a3b0SGarrett D'Amore } 5504297a3b0SGarrett D'Amore switch (c) { 5514297a3b0SGarrett D'Amore case '.': 5524297a3b0SGarrett D'Amore if (p->g->cflags®_NEWLINE) 5534297a3b0SGarrett D'Amore nonnewline(p); 5544297a3b0SGarrett D'Amore else 5554297a3b0SGarrett D'Amore EMIT(OANY, 0); 5564297a3b0SGarrett D'Amore break; 5574297a3b0SGarrett D'Amore case '[': 5584297a3b0SGarrett D'Amore p_bracket(p); 5594297a3b0SGarrett D'Amore break; 56084441f85SGarrett D'Amore case BACKSL|'<': 56184441f85SGarrett D'Amore EMIT(OBOW, 0); 56284441f85SGarrett D'Amore break; 56384441f85SGarrett D'Amore case BACKSL|'>': 56484441f85SGarrett D'Amore EMIT(OEOW, 0); 56584441f85SGarrett D'Amore break; 5664297a3b0SGarrett D'Amore case BACKSL|'{': 5674297a3b0SGarrett D'Amore SETERROR(REG_BADRPT); 5684297a3b0SGarrett D'Amore break; 5694297a3b0SGarrett D'Amore case BACKSL|'(': 5704297a3b0SGarrett D'Amore p->g->nsub++; 5714297a3b0SGarrett D'Amore subno = p->g->nsub; 5724297a3b0SGarrett D'Amore if (subno < NPAREN) 5734297a3b0SGarrett D'Amore p->pbegin[subno] = HERE(); 5744297a3b0SGarrett D'Amore EMIT(OLPAREN, subno); 5754297a3b0SGarrett D'Amore /* the MORE here is an error heuristic */ 5764297a3b0SGarrett D'Amore if (MORE() && !SEETWO('\\', ')')) 5774297a3b0SGarrett D'Amore p_bre(p, '\\', ')'); 5784297a3b0SGarrett D'Amore if (subno < NPAREN) { 5794297a3b0SGarrett D'Amore p->pend[subno] = HERE(); 5804297a3b0SGarrett D'Amore assert(p->pend[subno] != 0); 5814297a3b0SGarrett D'Amore } 5824297a3b0SGarrett D'Amore EMIT(ORPAREN, subno); 5834297a3b0SGarrett D'Amore (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 5844297a3b0SGarrett D'Amore break; 5854297a3b0SGarrett D'Amore case BACKSL|')': /* should not get here -- must be user */ 5864297a3b0SGarrett D'Amore case BACKSL|'}': 5874297a3b0SGarrett D'Amore SETERROR(REG_EPAREN); 5884297a3b0SGarrett D'Amore break; 5894297a3b0SGarrett D'Amore case BACKSL|'1': 5904297a3b0SGarrett D'Amore case BACKSL|'2': 5914297a3b0SGarrett D'Amore case BACKSL|'3': 5924297a3b0SGarrett D'Amore case BACKSL|'4': 5934297a3b0SGarrett D'Amore case BACKSL|'5': 5944297a3b0SGarrett D'Amore case BACKSL|'6': 5954297a3b0SGarrett D'Amore case BACKSL|'7': 5964297a3b0SGarrett D'Amore case BACKSL|'8': 5974297a3b0SGarrett D'Amore case BACKSL|'9': 5984297a3b0SGarrett D'Amore i = (c&~BACKSL) - '0'; 5994297a3b0SGarrett D'Amore assert(i < NPAREN); 6004297a3b0SGarrett D'Amore if (p->pend[i] != 0) { 6014297a3b0SGarrett D'Amore assert(i <= p->g->nsub); 6024297a3b0SGarrett D'Amore EMIT(OBACK_, i); 6034297a3b0SGarrett D'Amore assert(p->pbegin[i] != 0); 6044297a3b0SGarrett D'Amore assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 6054297a3b0SGarrett D'Amore assert(OP(p->strip[p->pend[i]]) == ORPAREN); 6064297a3b0SGarrett D'Amore (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 6074297a3b0SGarrett D'Amore EMIT(O_BACK, i); 6084297a3b0SGarrett D'Amore } else 6094297a3b0SGarrett D'Amore SETERROR(REG_ESUBREG); 6104297a3b0SGarrett D'Amore p->g->backrefs = 1; 6114297a3b0SGarrett D'Amore break; 6124297a3b0SGarrett D'Amore case '*': 6134297a3b0SGarrett D'Amore (void) REQUIRE(starordinary, REG_BADRPT); 6144297a3b0SGarrett D'Amore /* FALLTHROUGH */ 6154297a3b0SGarrett D'Amore default: 6164297a3b0SGarrett D'Amore p->next--; 6174297a3b0SGarrett D'Amore wc = WGETNEXT(); 6184297a3b0SGarrett D'Amore ordinary(p, wc); 6194297a3b0SGarrett D'Amore break; 6204297a3b0SGarrett D'Amore } 6214297a3b0SGarrett D'Amore 6224297a3b0SGarrett D'Amore if (EAT('*')) { /* implemented as +? */ 6234297a3b0SGarrett D'Amore /* this case does not require the (y|) trick, noKLUDGE */ 6244297a3b0SGarrett D'Amore INSERT(OPLUS_, pos); 6254297a3b0SGarrett D'Amore ASTERN(O_PLUS, pos); 6264297a3b0SGarrett D'Amore INSERT(OQUEST_, pos); 6274297a3b0SGarrett D'Amore ASTERN(O_QUEST, pos); 6284297a3b0SGarrett D'Amore } else if (EATTWO('\\', '{')) { 6294297a3b0SGarrett D'Amore count = p_count(p); 6304297a3b0SGarrett D'Amore if (EAT(',')) { 6314297a3b0SGarrett D'Amore if (MORE() && isdigit((uch)PEEK())) { 6324297a3b0SGarrett D'Amore count2 = p_count(p); 6334297a3b0SGarrett D'Amore (void) REQUIRE(count <= count2, REG_BADBR); 6344297a3b0SGarrett D'Amore } else /* single number with comma */ 6354297a3b0SGarrett D'Amore count2 = INFINITY; 6364297a3b0SGarrett D'Amore } else /* just a single number */ 6374297a3b0SGarrett D'Amore count2 = count; 6384297a3b0SGarrett D'Amore repeat(p, pos, count, count2); 6394297a3b0SGarrett D'Amore if (!EATTWO('\\', '}')) { /* error heuristics */ 6404297a3b0SGarrett D'Amore while (MORE() && !SEETWO('\\', '}')) 6414297a3b0SGarrett D'Amore NEXT(); 6424297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EBRACE); 6434297a3b0SGarrett D'Amore SETERROR(REG_BADBR); 6444297a3b0SGarrett D'Amore } 6454297a3b0SGarrett D'Amore } else if (c == '$') /* $ (but not \$) ends it */ 6464297a3b0SGarrett D'Amore return (1); 6474297a3b0SGarrett D'Amore 6484297a3b0SGarrett D'Amore return (0); 6494297a3b0SGarrett D'Amore } 6504297a3b0SGarrett D'Amore 6514297a3b0SGarrett D'Amore /* 6524297a3b0SGarrett D'Amore * p_count - parse a repetition count 6534297a3b0SGarrett D'Amore */ 6544297a3b0SGarrett D'Amore static int /* the value */ 6554297a3b0SGarrett D'Amore p_count(struct parse *p) 6564297a3b0SGarrett D'Amore { 6574297a3b0SGarrett D'Amore int count = 0; 6584297a3b0SGarrett D'Amore int ndigits = 0; 6594297a3b0SGarrett D'Amore 6604297a3b0SGarrett D'Amore while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 6614297a3b0SGarrett D'Amore count = count*10 + (GETNEXT() - '0'); 6624297a3b0SGarrett D'Amore ndigits++; 6634297a3b0SGarrett D'Amore } 6644297a3b0SGarrett D'Amore 6654297a3b0SGarrett D'Amore (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 6664297a3b0SGarrett D'Amore return (count); 6674297a3b0SGarrett D'Amore } 6684297a3b0SGarrett D'Amore 6694297a3b0SGarrett D'Amore /* 6704297a3b0SGarrett D'Amore * p_bracket - parse a bracketed character list 6714297a3b0SGarrett D'Amore */ 6724297a3b0SGarrett D'Amore static void 6734297a3b0SGarrett D'Amore p_bracket(struct parse *p) 6744297a3b0SGarrett D'Amore { 6754297a3b0SGarrett D'Amore cset *cs; 6764297a3b0SGarrett D'Amore wint_t ch; 6774297a3b0SGarrett D'Amore 6784297a3b0SGarrett D'Amore /* Dept of Truly Sickening Special-Case Kludges */ 6794297a3b0SGarrett D'Amore if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 6804297a3b0SGarrett D'Amore EMIT(OBOW, 0); 6814297a3b0SGarrett D'Amore NEXTn(6); 6824297a3b0SGarrett D'Amore return; 6834297a3b0SGarrett D'Amore } 6844297a3b0SGarrett D'Amore if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 6854297a3b0SGarrett D'Amore EMIT(OEOW, 0); 6864297a3b0SGarrett D'Amore NEXTn(6); 6874297a3b0SGarrett D'Amore return; 6884297a3b0SGarrett D'Amore } 6894297a3b0SGarrett D'Amore 6904297a3b0SGarrett D'Amore if ((cs = allocset(p)) == NULL) 6914297a3b0SGarrett D'Amore return; 6924297a3b0SGarrett D'Amore 6934297a3b0SGarrett D'Amore if (p->g->cflags®_ICASE) 6944297a3b0SGarrett D'Amore cs->icase = 1; 6954297a3b0SGarrett D'Amore if (EAT('^')) 6964297a3b0SGarrett D'Amore cs->invert = 1; 6974297a3b0SGarrett D'Amore if (EAT(']')) 6984297a3b0SGarrett D'Amore CHadd(p, cs, ']'); 6994297a3b0SGarrett D'Amore else if (EAT('-')) 7004297a3b0SGarrett D'Amore CHadd(p, cs, '-'); 7014297a3b0SGarrett D'Amore while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 7024297a3b0SGarrett D'Amore p_b_term(p, cs); 7034297a3b0SGarrett D'Amore if (EAT('-')) 7044297a3b0SGarrett D'Amore CHadd(p, cs, '-'); 7054297a3b0SGarrett D'Amore (void) MUSTEAT(']', REG_EBRACK); 7064297a3b0SGarrett D'Amore 7074297a3b0SGarrett D'Amore if (p->error != 0) /* don't mess things up further */ 7084297a3b0SGarrett D'Amore return; 7094297a3b0SGarrett D'Amore 7104297a3b0SGarrett D'Amore if (cs->invert && p->g->cflags®_NEWLINE) 7114297a3b0SGarrett D'Amore cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 7124297a3b0SGarrett D'Amore 7134297a3b0SGarrett D'Amore if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 7144297a3b0SGarrett D'Amore ordinary(p, ch); 7154297a3b0SGarrett D'Amore freeset(p, cs); 7164297a3b0SGarrett D'Amore } else 7174297a3b0SGarrett D'Amore EMIT(OANYOF, (int)(cs - p->g->sets)); 7184297a3b0SGarrett D'Amore } 7194297a3b0SGarrett D'Amore 7204297a3b0SGarrett D'Amore /* 7214297a3b0SGarrett D'Amore * p_b_term - parse one term of a bracketed character list 7224297a3b0SGarrett D'Amore */ 7234297a3b0SGarrett D'Amore static void 7244297a3b0SGarrett D'Amore p_b_term(struct parse *p, cset *cs) 7254297a3b0SGarrett D'Amore { 7264297a3b0SGarrett D'Amore char c; 7274297a3b0SGarrett D'Amore wint_t start, finish; 7284297a3b0SGarrett D'Amore wint_t i; 729*2d08521bSGarrett D'Amore locale_t loc = uselocale(NULL); 7304297a3b0SGarrett D'Amore 7314297a3b0SGarrett D'Amore /* classify what we've got */ 7324297a3b0SGarrett D'Amore switch ((MORE()) ? PEEK() : '\0') { 7334297a3b0SGarrett D'Amore case '[': 7344297a3b0SGarrett D'Amore c = (MORE2()) ? PEEK2() : '\0'; 7354297a3b0SGarrett D'Amore break; 7364297a3b0SGarrett D'Amore case '-': 7374297a3b0SGarrett D'Amore SETERROR(REG_ERANGE); 7384297a3b0SGarrett D'Amore return; /* NOTE RETURN */ 7394297a3b0SGarrett D'Amore default: 7404297a3b0SGarrett D'Amore c = '\0'; 7414297a3b0SGarrett D'Amore break; 7424297a3b0SGarrett D'Amore } 7434297a3b0SGarrett D'Amore 7444297a3b0SGarrett D'Amore switch (c) { 7454297a3b0SGarrett D'Amore case ':': /* character class */ 7464297a3b0SGarrett D'Amore NEXT2(); 7474297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EBRACK); 7484297a3b0SGarrett D'Amore c = PEEK(); 7494297a3b0SGarrett D'Amore (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE); 7504297a3b0SGarrett D'Amore p_b_cclass(p, cs); 7514297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EBRACK); 7524297a3b0SGarrett D'Amore (void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 7534297a3b0SGarrett D'Amore break; 7544297a3b0SGarrett D'Amore case '=': /* equivalence class */ 7554297a3b0SGarrett D'Amore NEXT2(); 7564297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EBRACK); 7574297a3b0SGarrett D'Amore c = PEEK(); 7584297a3b0SGarrett D'Amore (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 7594297a3b0SGarrett D'Amore p_b_eclass(p, cs); 7604297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EBRACK); 7614297a3b0SGarrett D'Amore (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 7624297a3b0SGarrett D'Amore break; 7634297a3b0SGarrett D'Amore default: /* symbol, ordinary character, or range */ 7644297a3b0SGarrett D'Amore start = p_b_symbol(p); 7654297a3b0SGarrett D'Amore if (SEE('-') && MORE2() && PEEK2() != ']') { 7664297a3b0SGarrett D'Amore /* range */ 7674297a3b0SGarrett D'Amore NEXT(); 7684297a3b0SGarrett D'Amore if (EAT('-')) 7694297a3b0SGarrett D'Amore finish = '-'; 7704297a3b0SGarrett D'Amore else 7714297a3b0SGarrett D'Amore finish = p_b_symbol(p); 7724297a3b0SGarrett D'Amore } else 7734297a3b0SGarrett D'Amore finish = start; 7744297a3b0SGarrett D'Amore if (start == finish) 7754297a3b0SGarrett D'Amore CHadd(p, cs, start); 7764297a3b0SGarrett D'Amore else { 777*2d08521bSGarrett D'Amore if (loc->collate->lc_is_posix) { 7784297a3b0SGarrett D'Amore (void) REQUIRE((uch)start <= (uch)finish, 7794297a3b0SGarrett D'Amore REG_ERANGE); 7804297a3b0SGarrett D'Amore CHaddrange(p, cs, start, finish); 7814297a3b0SGarrett D'Amore } else { 7826b5e5868SGarrett D'Amore (void) REQUIRE(_collate_range_cmp(start, 783*2d08521bSGarrett D'Amore finish, loc) <= 0, REG_ERANGE); 7844297a3b0SGarrett D'Amore for (i = 0; i <= UCHAR_MAX; i++) { 785*2d08521bSGarrett D'Amore if (_collate_range_cmp(start, i, loc) 786*2d08521bSGarrett D'Amore <= 0 && 787*2d08521bSGarrett D'Amore _collate_range_cmp(i, finish, loc) 788*2d08521bSGarrett D'Amore <= 0) 7894297a3b0SGarrett D'Amore CHadd(p, cs, i); 7904297a3b0SGarrett D'Amore } 7914297a3b0SGarrett D'Amore } 7924297a3b0SGarrett D'Amore } 7934297a3b0SGarrett D'Amore break; 7944297a3b0SGarrett D'Amore } 7954297a3b0SGarrett D'Amore } 7964297a3b0SGarrett D'Amore 7974297a3b0SGarrett D'Amore /* 7984297a3b0SGarrett D'Amore * p_b_cclass - parse a character-class name and deal with it 7994297a3b0SGarrett D'Amore */ 8004297a3b0SGarrett D'Amore static void 8014297a3b0SGarrett D'Amore p_b_cclass(struct parse *p, cset *cs) 8024297a3b0SGarrett D'Amore { 8034297a3b0SGarrett D'Amore char *sp = p->next; 8044297a3b0SGarrett D'Amore size_t len; 8054297a3b0SGarrett D'Amore wctype_t wct; 8064297a3b0SGarrett D'Amore char clname[16]; 8074297a3b0SGarrett D'Amore 8084297a3b0SGarrett D'Amore while (MORE() && isalpha((uch)PEEK())) 8094297a3b0SGarrett D'Amore NEXT(); 8104297a3b0SGarrett D'Amore len = p->next - sp; 8114297a3b0SGarrett D'Amore if (len >= sizeof (clname) - 1) { 8124297a3b0SGarrett D'Amore SETERROR(REG_ECTYPE); 8134297a3b0SGarrett D'Amore return; 8144297a3b0SGarrett D'Amore } 8154297a3b0SGarrett D'Amore (void) memcpy(clname, sp, len); 8164297a3b0SGarrett D'Amore clname[len] = '\0'; 8174297a3b0SGarrett D'Amore if ((wct = wctype(clname)) == 0) { 8184297a3b0SGarrett D'Amore SETERROR(REG_ECTYPE); 8194297a3b0SGarrett D'Amore return; 8204297a3b0SGarrett D'Amore } 8214297a3b0SGarrett D'Amore CHaddtype(p, cs, wct); 8224297a3b0SGarrett D'Amore } 8234297a3b0SGarrett D'Amore 8244297a3b0SGarrett D'Amore /* 8254297a3b0SGarrett D'Amore * p_b_eclass - parse an equivalence-class name and deal with it 8264297a3b0SGarrett D'Amore * 8274297a3b0SGarrett D'Amore * This implementation is incomplete. xxx 8284297a3b0SGarrett D'Amore */ 8294297a3b0SGarrett D'Amore static void 8304297a3b0SGarrett D'Amore p_b_eclass(struct parse *p, cset *cs) 8314297a3b0SGarrett D'Amore { 8324297a3b0SGarrett D'Amore wint_t c; 8334297a3b0SGarrett D'Amore 8344297a3b0SGarrett D'Amore c = p_b_coll_elem(p, '='); 8354297a3b0SGarrett D'Amore CHadd(p, cs, c); 8364297a3b0SGarrett D'Amore } 8374297a3b0SGarrett D'Amore 8384297a3b0SGarrett D'Amore /* 8394297a3b0SGarrett D'Amore * p_b_symbol - parse a character or [..]ed multicharacter collating symbol 8404297a3b0SGarrett D'Amore */ 8414297a3b0SGarrett D'Amore static wint_t /* value of symbol */ 8424297a3b0SGarrett D'Amore p_b_symbol(struct parse *p) 8434297a3b0SGarrett D'Amore { 8444297a3b0SGarrett D'Amore wint_t value; 8454297a3b0SGarrett D'Amore 8464297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EBRACK); 8474297a3b0SGarrett D'Amore if (!EATTWO('[', '.')) 8484297a3b0SGarrett D'Amore return (WGETNEXT()); 8494297a3b0SGarrett D'Amore 8504297a3b0SGarrett D'Amore /* collating symbol */ 8514297a3b0SGarrett D'Amore value = p_b_coll_elem(p, '.'); 8524297a3b0SGarrett D'Amore (void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 8534297a3b0SGarrett D'Amore return (value); 8544297a3b0SGarrett D'Amore } 8554297a3b0SGarrett D'Amore 8564297a3b0SGarrett D'Amore /* 8574297a3b0SGarrett D'Amore * p_b_coll_elem - parse a collating-element name and look it up 8584297a3b0SGarrett D'Amore */ 8594297a3b0SGarrett D'Amore static wint_t /* value of collating element */ 8604297a3b0SGarrett D'Amore p_b_coll_elem(struct parse *p, 8614297a3b0SGarrett D'Amore wint_t endc) /* name ended by endc,']' */ 8624297a3b0SGarrett D'Amore { 8634297a3b0SGarrett D'Amore char *sp = p->next; 8644297a3b0SGarrett D'Amore struct cname *cp; 8654297a3b0SGarrett D'Amore int len; 8664297a3b0SGarrett D'Amore mbstate_t mbs; 8674297a3b0SGarrett D'Amore wchar_t wc; 8684297a3b0SGarrett D'Amore size_t clen; 8694297a3b0SGarrett D'Amore 8704297a3b0SGarrett D'Amore while (MORE() && !SEETWO(endc, ']')) 8714297a3b0SGarrett D'Amore NEXT(); 8724297a3b0SGarrett D'Amore if (!MORE()) { 8734297a3b0SGarrett D'Amore SETERROR(REG_EBRACK); 8744297a3b0SGarrett D'Amore return (0); 8754297a3b0SGarrett D'Amore } 8764297a3b0SGarrett D'Amore len = p->next - sp; 8774297a3b0SGarrett D'Amore for (cp = cnames; cp->name != NULL; cp++) 8784297a3b0SGarrett D'Amore if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 8794297a3b0SGarrett D'Amore return (cp->code); /* known name */ 8804297a3b0SGarrett D'Amore (void) memset(&mbs, 0, sizeof (mbs)); 8814297a3b0SGarrett D'Amore if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 8824297a3b0SGarrett D'Amore return (wc); /* single character */ 8834297a3b0SGarrett D'Amore else if (clen == (size_t)-1 || clen == (size_t)-2) 8844297a3b0SGarrett D'Amore SETERROR(REG_ECHAR); 8854297a3b0SGarrett D'Amore else 8864297a3b0SGarrett D'Amore SETERROR(REG_ECOLLATE); /* neither */ 8874297a3b0SGarrett D'Amore return (0); 8884297a3b0SGarrett D'Amore } 8894297a3b0SGarrett D'Amore 8904297a3b0SGarrett D'Amore /* 8914297a3b0SGarrett D'Amore * othercase - return the case counterpart of an alphabetic 8924297a3b0SGarrett D'Amore */ 8934297a3b0SGarrett D'Amore static wint_t /* if no counterpart, return ch */ 8944297a3b0SGarrett D'Amore othercase(wint_t ch) 8954297a3b0SGarrett D'Amore { 8964297a3b0SGarrett D'Amore assert(iswalpha(ch)); 8974297a3b0SGarrett D'Amore if (iswupper(ch)) 8984297a3b0SGarrett D'Amore return (towlower(ch)); 8994297a3b0SGarrett D'Amore else if (iswlower(ch)) 9004297a3b0SGarrett D'Amore return (towupper(ch)); 9014297a3b0SGarrett D'Amore else /* peculiar, but could happen */ 9024297a3b0SGarrett D'Amore return (ch); 9034297a3b0SGarrett D'Amore } 9044297a3b0SGarrett D'Amore 9054297a3b0SGarrett D'Amore /* 9064297a3b0SGarrett D'Amore * bothcases - emit a dualcase version of a two-case character 9074297a3b0SGarrett D'Amore * 9084297a3b0SGarrett D'Amore * Boy, is this implementation ever a kludge... 9094297a3b0SGarrett D'Amore */ 9104297a3b0SGarrett D'Amore static void 9114297a3b0SGarrett D'Amore bothcases(struct parse *p, wint_t ch) 9124297a3b0SGarrett D'Amore { 9134297a3b0SGarrett D'Amore char *oldnext = p->next; 9144297a3b0SGarrett D'Amore char *oldend = p->end; 9154297a3b0SGarrett D'Amore char bracket[3 + MB_LEN_MAX]; 9164297a3b0SGarrett D'Amore size_t n; 9174297a3b0SGarrett D'Amore mbstate_t mbs; 9184297a3b0SGarrett D'Amore 9194297a3b0SGarrett D'Amore assert(othercase(ch) != ch); /* p_bracket() would recurse */ 9204297a3b0SGarrett D'Amore p->next = bracket; 9214297a3b0SGarrett D'Amore (void) memset(&mbs, 0, sizeof (mbs)); 9224297a3b0SGarrett D'Amore n = wcrtomb(bracket, ch, &mbs); 9234297a3b0SGarrett D'Amore assert(n != (size_t)-1); 9244297a3b0SGarrett D'Amore bracket[n] = ']'; 9254297a3b0SGarrett D'Amore bracket[n + 1] = '\0'; 9264297a3b0SGarrett D'Amore p->end = bracket+n+1; 9274297a3b0SGarrett D'Amore p_bracket(p); 9284297a3b0SGarrett D'Amore assert(p->next == p->end); 9294297a3b0SGarrett D'Amore p->next = oldnext; 9304297a3b0SGarrett D'Amore p->end = oldend; 9314297a3b0SGarrett D'Amore } 9324297a3b0SGarrett D'Amore 9334297a3b0SGarrett D'Amore /* 9344297a3b0SGarrett D'Amore * ordinary - emit an ordinary character 9354297a3b0SGarrett D'Amore */ 9364297a3b0SGarrett D'Amore static void 9374297a3b0SGarrett D'Amore ordinary(struct parse *p, wint_t ch) 9384297a3b0SGarrett D'Amore { 9394297a3b0SGarrett D'Amore cset *cs; 9404297a3b0SGarrett D'Amore 9414297a3b0SGarrett D'Amore if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 9424297a3b0SGarrett D'Amore bothcases(p, ch); 9434297a3b0SGarrett D'Amore else if ((ch & OPDMASK) == ch) 9444297a3b0SGarrett D'Amore EMIT(OCHAR, ch); 9454297a3b0SGarrett D'Amore else { 9464297a3b0SGarrett D'Amore /* 9474297a3b0SGarrett D'Amore * Kludge: character is too big to fit into an OCHAR operand. 9484297a3b0SGarrett D'Amore * Emit a singleton set. 9494297a3b0SGarrett D'Amore */ 9504297a3b0SGarrett D'Amore if ((cs = allocset(p)) == NULL) 9514297a3b0SGarrett D'Amore return; 9524297a3b0SGarrett D'Amore CHadd(p, cs, ch); 9534297a3b0SGarrett D'Amore EMIT(OANYOF, (int)(cs - p->g->sets)); 9544297a3b0SGarrett D'Amore } 9554297a3b0SGarrett D'Amore } 9564297a3b0SGarrett D'Amore 9574297a3b0SGarrett D'Amore /* 9584297a3b0SGarrett D'Amore * nonnewline - emit REG_NEWLINE version of OANY 9594297a3b0SGarrett D'Amore * 9604297a3b0SGarrett D'Amore * Boy, is this implementation ever a kludge... 9614297a3b0SGarrett D'Amore */ 9624297a3b0SGarrett D'Amore static void 9634297a3b0SGarrett D'Amore nonnewline(struct parse *p) 9644297a3b0SGarrett D'Amore { 9654297a3b0SGarrett D'Amore char *oldnext = p->next; 9664297a3b0SGarrett D'Amore char *oldend = p->end; 9674297a3b0SGarrett D'Amore char bracket[4]; 9684297a3b0SGarrett D'Amore 9694297a3b0SGarrett D'Amore p->next = bracket; 9704297a3b0SGarrett D'Amore p->end = bracket+3; 9714297a3b0SGarrett D'Amore bracket[0] = '^'; 9724297a3b0SGarrett D'Amore bracket[1] = '\n'; 9734297a3b0SGarrett D'Amore bracket[2] = ']'; 9744297a3b0SGarrett D'Amore bracket[3] = '\0'; 9754297a3b0SGarrett D'Amore p_bracket(p); 9764297a3b0SGarrett D'Amore assert(p->next == bracket+3); 9774297a3b0SGarrett D'Amore p->next = oldnext; 9784297a3b0SGarrett D'Amore p->end = oldend; 9794297a3b0SGarrett D'Amore } 9804297a3b0SGarrett D'Amore 9814297a3b0SGarrett D'Amore /* 9824297a3b0SGarrett D'Amore * repeat - generate code for a bounded repetition, recursively if needed 9834297a3b0SGarrett D'Amore */ 9844297a3b0SGarrett D'Amore static void 9854297a3b0SGarrett D'Amore repeat(struct parse *p, 9864297a3b0SGarrett D'Amore sopno start, /* operand from here to end of strip */ 9874297a3b0SGarrett D'Amore int from, /* repeated from this number */ 9884297a3b0SGarrett D'Amore int to) /* to this number of times (maybe INFINITY) */ 9894297a3b0SGarrett D'Amore { 9904297a3b0SGarrett D'Amore sopno finish = HERE(); 9914297a3b0SGarrett D'Amore #define N 2 9924297a3b0SGarrett D'Amore #define INF 3 9934297a3b0SGarrett D'Amore #define REP(f, t) ((f)*8 + (t)) 9944297a3b0SGarrett D'Amore #define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 9954297a3b0SGarrett D'Amore sopno copy; 9964297a3b0SGarrett D'Amore 9974297a3b0SGarrett D'Amore if (p->error != 0) /* head off possible runaway recursion */ 9984297a3b0SGarrett D'Amore return; 9994297a3b0SGarrett D'Amore 10004297a3b0SGarrett D'Amore assert(from <= to); 10014297a3b0SGarrett D'Amore 10024297a3b0SGarrett D'Amore switch (REP(MAP(from), MAP(to))) { 10034297a3b0SGarrett D'Amore case REP(0, 0): /* must be user doing this */ 10044297a3b0SGarrett D'Amore DROP(finish-start); /* drop the operand */ 10054297a3b0SGarrett D'Amore break; 10064297a3b0SGarrett D'Amore case REP(0, 1): /* as x{1,1}? */ 10074297a3b0SGarrett D'Amore case REP(0, N): /* as x{1,n}? */ 10084297a3b0SGarrett D'Amore case REP(0, INF): /* as x{1,}? */ 10094297a3b0SGarrett D'Amore /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 10104297a3b0SGarrett D'Amore INSERT(OCH_, start); /* offset is wrong... */ 10114297a3b0SGarrett D'Amore repeat(p, start+1, 1, to); 10124297a3b0SGarrett D'Amore ASTERN(OOR1, start); 10134297a3b0SGarrett D'Amore AHEAD(start); /* ... fix it */ 10144297a3b0SGarrett D'Amore EMIT(OOR2, 0); 10154297a3b0SGarrett D'Amore AHEAD(THERE()); 10164297a3b0SGarrett D'Amore ASTERN(O_CH, THERETHERE()); 10174297a3b0SGarrett D'Amore break; 10184297a3b0SGarrett D'Amore case REP(1, 1): /* trivial case */ 10194297a3b0SGarrett D'Amore /* done */ 10204297a3b0SGarrett D'Amore break; 10214297a3b0SGarrett D'Amore case REP(1, N): /* as x?x{1,n-1} */ 10224297a3b0SGarrett D'Amore /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 10234297a3b0SGarrett D'Amore INSERT(OCH_, start); 10244297a3b0SGarrett D'Amore ASTERN(OOR1, start); 10254297a3b0SGarrett D'Amore AHEAD(start); 10264297a3b0SGarrett D'Amore EMIT(OOR2, 0); /* offset very wrong... */ 10274297a3b0SGarrett D'Amore AHEAD(THERE()); /* ...so fix it */ 10284297a3b0SGarrett D'Amore ASTERN(O_CH, THERETHERE()); 10294297a3b0SGarrett D'Amore copy = dupl(p, start+1, finish+1); 10304297a3b0SGarrett D'Amore assert(copy == finish+4); 10314297a3b0SGarrett D'Amore repeat(p, copy, 1, to-1); 10324297a3b0SGarrett D'Amore break; 10334297a3b0SGarrett D'Amore case REP(1, INF): /* as x+ */ 10344297a3b0SGarrett D'Amore INSERT(OPLUS_, start); 10354297a3b0SGarrett D'Amore ASTERN(O_PLUS, start); 10364297a3b0SGarrett D'Amore break; 10374297a3b0SGarrett D'Amore case REP(N, N): /* as xx{m-1,n-1} */ 10384297a3b0SGarrett D'Amore copy = dupl(p, start, finish); 10394297a3b0SGarrett D'Amore repeat(p, copy, from-1, to-1); 10404297a3b0SGarrett D'Amore break; 10414297a3b0SGarrett D'Amore case REP(N, INF): /* as xx{n-1,INF} */ 10424297a3b0SGarrett D'Amore copy = dupl(p, start, finish); 10434297a3b0SGarrett D'Amore repeat(p, copy, from-1, to); 10444297a3b0SGarrett D'Amore break; 10454297a3b0SGarrett D'Amore default: /* "can't happen" */ 10464297a3b0SGarrett D'Amore SETERROR(REG_EFATAL); /* just in case */ 10474297a3b0SGarrett D'Amore break; 10484297a3b0SGarrett D'Amore } 10494297a3b0SGarrett D'Amore } 10504297a3b0SGarrett D'Amore 10514297a3b0SGarrett D'Amore /* 10524297a3b0SGarrett D'Amore * wgetnext - helper function for WGETNEXT() macro. Gets the next wide 10534297a3b0SGarrett D'Amore * character from the parse struct, signals a REG_ILLSEQ error if the 10544297a3b0SGarrett D'Amore * character can't be converted. Returns the number of bytes consumed. 10554297a3b0SGarrett D'Amore */ 10564297a3b0SGarrett D'Amore static wint_t 10574297a3b0SGarrett D'Amore wgetnext(struct parse *p) 10584297a3b0SGarrett D'Amore { 10594297a3b0SGarrett D'Amore mbstate_t mbs; 10604297a3b0SGarrett D'Amore wchar_t wc; 10614297a3b0SGarrett D'Amore size_t n; 10624297a3b0SGarrett D'Amore 10634297a3b0SGarrett D'Amore (void) memset(&mbs, 0, sizeof (mbs)); 10644297a3b0SGarrett D'Amore n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); 10654297a3b0SGarrett D'Amore if (n == (size_t)-1 || n == (size_t)-2) { 10664297a3b0SGarrett D'Amore SETERROR(REG_ECHAR); 10674297a3b0SGarrett D'Amore return (0); 10684297a3b0SGarrett D'Amore } 10694297a3b0SGarrett D'Amore if (n == 0) 10704297a3b0SGarrett D'Amore n = 1; 10714297a3b0SGarrett D'Amore p->next += n; 10724297a3b0SGarrett D'Amore return (wc); 10734297a3b0SGarrett D'Amore } 10744297a3b0SGarrett D'Amore 10754297a3b0SGarrett D'Amore /* 10764297a3b0SGarrett D'Amore * seterr - set an error condition 10774297a3b0SGarrett D'Amore */ 10784297a3b0SGarrett D'Amore static int /* useless but makes type checking happy */ 10794297a3b0SGarrett D'Amore seterr(struct parse *p, int e) 10804297a3b0SGarrett D'Amore { 10814297a3b0SGarrett D'Amore if (p->error == 0) /* keep earliest error condition */ 10824297a3b0SGarrett D'Amore p->error = e; 10834297a3b0SGarrett D'Amore p->next = nuls; /* try to bring things to a halt */ 10844297a3b0SGarrett D'Amore p->end = nuls; 10854297a3b0SGarrett D'Amore return (0); /* make the return value well-defined */ 10864297a3b0SGarrett D'Amore } 10874297a3b0SGarrett D'Amore 10884297a3b0SGarrett D'Amore /* 10894297a3b0SGarrett D'Amore * allocset - allocate a set of characters for [] 10904297a3b0SGarrett D'Amore */ 10914297a3b0SGarrett D'Amore static cset * 10924297a3b0SGarrett D'Amore allocset(struct parse *p) 10934297a3b0SGarrett D'Amore { 10944297a3b0SGarrett D'Amore cset *cs, *ncs; 10954297a3b0SGarrett D'Amore 10964297a3b0SGarrett D'Amore ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs)); 10974297a3b0SGarrett D'Amore if (ncs == NULL) { 10984297a3b0SGarrett D'Amore SETERROR(REG_ESPACE); 10994297a3b0SGarrett D'Amore return (NULL); 11004297a3b0SGarrett D'Amore } 11014297a3b0SGarrett D'Amore p->g->sets = ncs; 11024297a3b0SGarrett D'Amore cs = &p->g->sets[p->g->ncsets++]; 11034297a3b0SGarrett D'Amore (void) memset(cs, 0, sizeof (*cs)); 11044297a3b0SGarrett D'Amore 11054297a3b0SGarrett D'Amore return (cs); 11064297a3b0SGarrett D'Amore } 11074297a3b0SGarrett D'Amore 11084297a3b0SGarrett D'Amore /* 11094297a3b0SGarrett D'Amore * freeset - free a now-unused set 11104297a3b0SGarrett D'Amore */ 11114297a3b0SGarrett D'Amore static void 11124297a3b0SGarrett D'Amore freeset(struct parse *p, cset *cs) 11134297a3b0SGarrett D'Amore { 11144297a3b0SGarrett D'Amore cset *top = &p->g->sets[p->g->ncsets]; 11154297a3b0SGarrett D'Amore 11164297a3b0SGarrett D'Amore free(cs->wides); 11174297a3b0SGarrett D'Amore free(cs->ranges); 11184297a3b0SGarrett D'Amore free(cs->types); 11194297a3b0SGarrett D'Amore (void) memset(cs, 0, sizeof (*cs)); 11204297a3b0SGarrett D'Amore if (cs == top-1) /* recover only the easy case */ 11214297a3b0SGarrett D'Amore p->g->ncsets--; 11224297a3b0SGarrett D'Amore } 11234297a3b0SGarrett D'Amore 11244297a3b0SGarrett D'Amore /* 11254297a3b0SGarrett D'Amore * singleton - Determine whether a set contains only one character, 11264297a3b0SGarrett D'Amore * returning it if so, otherwise returning OUT. 11274297a3b0SGarrett D'Amore */ 11284297a3b0SGarrett D'Amore static wint_t 11294297a3b0SGarrett D'Amore singleton(cset *cs) 11304297a3b0SGarrett D'Amore { 11314297a3b0SGarrett D'Amore wint_t i, s, n; 11324297a3b0SGarrett D'Amore 11334297a3b0SGarrett D'Amore for (i = n = 0; i < NC; i++) 11344297a3b0SGarrett D'Amore if (CHIN(cs, i)) { 11354297a3b0SGarrett D'Amore n++; 11364297a3b0SGarrett D'Amore s = i; 11374297a3b0SGarrett D'Amore } 11384297a3b0SGarrett D'Amore if (n == 1) 11394297a3b0SGarrett D'Amore return (s); 11404297a3b0SGarrett D'Amore if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 11414297a3b0SGarrett D'Amore cs->icase == 0) 11424297a3b0SGarrett D'Amore return (cs->wides[0]); 11434297a3b0SGarrett D'Amore /* Don't bother handling the other cases. */ 11444297a3b0SGarrett D'Amore return (OUT); 11454297a3b0SGarrett D'Amore } 11464297a3b0SGarrett D'Amore 11474297a3b0SGarrett D'Amore /* 11484297a3b0SGarrett D'Amore * CHadd - add character to character set. 11494297a3b0SGarrett D'Amore */ 11504297a3b0SGarrett D'Amore static void 11514297a3b0SGarrett D'Amore CHadd(struct parse *p, cset *cs, wint_t ch) 11524297a3b0SGarrett D'Amore { 11534297a3b0SGarrett D'Amore wint_t nch, *newwides; 11544297a3b0SGarrett D'Amore assert(ch >= 0); 11554297a3b0SGarrett D'Amore if (ch < NC) 11564297a3b0SGarrett D'Amore cs->bmp[ch >> 3] |= 1 << (ch & 7); 11574297a3b0SGarrett D'Amore else { 11584297a3b0SGarrett D'Amore newwides = realloc(cs->wides, (cs->nwides + 1) * 11594297a3b0SGarrett D'Amore sizeof (*cs->wides)); 11604297a3b0SGarrett D'Amore if (newwides == NULL) { 11614297a3b0SGarrett D'Amore SETERROR(REG_ESPACE); 11624297a3b0SGarrett D'Amore return; 11634297a3b0SGarrett D'Amore } 11644297a3b0SGarrett D'Amore cs->wides = newwides; 11654297a3b0SGarrett D'Amore cs->wides[cs->nwides++] = ch; 11664297a3b0SGarrett D'Amore } 11674297a3b0SGarrett D'Amore if (cs->icase) { 11684297a3b0SGarrett D'Amore if ((nch = towlower(ch)) < NC) 11694297a3b0SGarrett D'Amore cs->bmp[nch >> 3] |= 1 << (nch & 7); 11704297a3b0SGarrett D'Amore if ((nch = towupper(ch)) < NC) 11714297a3b0SGarrett D'Amore cs->bmp[nch >> 3] |= 1 << (nch & 7); 11724297a3b0SGarrett D'Amore } 11734297a3b0SGarrett D'Amore } 11744297a3b0SGarrett D'Amore 11754297a3b0SGarrett D'Amore /* 11764297a3b0SGarrett D'Amore * CHaddrange - add all characters in the range [min,max] to a character set. 11774297a3b0SGarrett D'Amore */ 11784297a3b0SGarrett D'Amore static void 11794297a3b0SGarrett D'Amore CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 11804297a3b0SGarrett D'Amore { 11814297a3b0SGarrett D'Amore crange *newranges; 11824297a3b0SGarrett D'Amore 11834297a3b0SGarrett D'Amore for (; min < NC && min <= max; min++) 11844297a3b0SGarrett D'Amore CHadd(p, cs, min); 11854297a3b0SGarrett D'Amore if (min >= max) 11864297a3b0SGarrett D'Amore return; 11874297a3b0SGarrett D'Amore newranges = realloc(cs->ranges, (cs->nranges + 1) * 11884297a3b0SGarrett D'Amore sizeof (*cs->ranges)); 11894297a3b0SGarrett D'Amore if (newranges == NULL) { 11904297a3b0SGarrett D'Amore SETERROR(REG_ESPACE); 11914297a3b0SGarrett D'Amore return; 11924297a3b0SGarrett D'Amore } 11934297a3b0SGarrett D'Amore cs->ranges = newranges; 11944297a3b0SGarrett D'Amore cs->ranges[cs->nranges].min = min; 11954297a3b0SGarrett D'Amore cs->ranges[cs->nranges].min = max; 11964297a3b0SGarrett D'Amore cs->nranges++; 11974297a3b0SGarrett D'Amore } 11984297a3b0SGarrett D'Amore 11994297a3b0SGarrett D'Amore /* 12004297a3b0SGarrett D'Amore * CHaddtype - add all characters of a certain type to a character set. 12014297a3b0SGarrett D'Amore */ 12024297a3b0SGarrett D'Amore static void 12034297a3b0SGarrett D'Amore CHaddtype(struct parse *p, cset *cs, wctype_t wct) 12044297a3b0SGarrett D'Amore { 12054297a3b0SGarrett D'Amore wint_t i; 12064297a3b0SGarrett D'Amore wctype_t *newtypes; 12074297a3b0SGarrett D'Amore 12084297a3b0SGarrett D'Amore for (i = 0; i < NC; i++) 12094297a3b0SGarrett D'Amore if (iswctype(i, wct)) 12104297a3b0SGarrett D'Amore CHadd(p, cs, i); 12114297a3b0SGarrett D'Amore newtypes = realloc(cs->types, (cs->ntypes + 1) * 12124297a3b0SGarrett D'Amore sizeof (*cs->types)); 12134297a3b0SGarrett D'Amore if (newtypes == NULL) { 12144297a3b0SGarrett D'Amore SETERROR(REG_ESPACE); 12154297a3b0SGarrett D'Amore return; 12164297a3b0SGarrett D'Amore } 12174297a3b0SGarrett D'Amore cs->types = newtypes; 12184297a3b0SGarrett D'Amore cs->types[cs->ntypes++] = wct; 12194297a3b0SGarrett D'Amore } 12204297a3b0SGarrett D'Amore 12214297a3b0SGarrett D'Amore /* 12224297a3b0SGarrett D'Amore * dupl - emit a duplicate of a bunch of sops 12234297a3b0SGarrett D'Amore */ 12244297a3b0SGarrett D'Amore static sopno /* start of duplicate */ 12254297a3b0SGarrett D'Amore dupl(struct parse *p, 12264297a3b0SGarrett D'Amore sopno start, /* from here */ 12274297a3b0SGarrett D'Amore sopno finish) /* to this less one */ 12284297a3b0SGarrett D'Amore { 12294297a3b0SGarrett D'Amore sopno ret = HERE(); 12304297a3b0SGarrett D'Amore sopno len = finish - start; 12314297a3b0SGarrett D'Amore 12324297a3b0SGarrett D'Amore assert(finish >= start); 12334297a3b0SGarrett D'Amore if (len == 0) 12344297a3b0SGarrett D'Amore return (ret); 12354297a3b0SGarrett D'Amore enlarge(p, p->ssize + len); /* this many unexpected additions */ 12364297a3b0SGarrett D'Amore assert(p->ssize >= p->slen + len); 12374297a3b0SGarrett D'Amore (void) memcpy((char *)(p->strip + p->slen), 12384297a3b0SGarrett D'Amore (char *)(p->strip + start), (size_t)len*sizeof (sop)); 12394297a3b0SGarrett D'Amore p->slen += len; 12404297a3b0SGarrett D'Amore return (ret); 12414297a3b0SGarrett D'Amore } 12424297a3b0SGarrett D'Amore 12434297a3b0SGarrett D'Amore /* 12444297a3b0SGarrett D'Amore * doemit - emit a strip operator 12454297a3b0SGarrett D'Amore * 12464297a3b0SGarrett D'Amore * It might seem better to implement this as a macro with a function as 12474297a3b0SGarrett D'Amore * hard-case backup, but it's just too big and messy unless there are 12484297a3b0SGarrett D'Amore * some changes to the data structures. Maybe later. 12494297a3b0SGarrett D'Amore */ 12504297a3b0SGarrett D'Amore static void 12514297a3b0SGarrett D'Amore doemit(struct parse *p, sop op, size_t opnd) 12524297a3b0SGarrett D'Amore { 12534297a3b0SGarrett D'Amore /* avoid making error situations worse */ 12544297a3b0SGarrett D'Amore if (p->error != 0) 12554297a3b0SGarrett D'Amore return; 12564297a3b0SGarrett D'Amore 12574297a3b0SGarrett D'Amore /* deal with oversize operands ("can't happen", more or less) */ 12584297a3b0SGarrett D'Amore assert(opnd < 1<<OPSHIFT); 12594297a3b0SGarrett D'Amore 12604297a3b0SGarrett D'Amore /* deal with undersized strip */ 12614297a3b0SGarrett D'Amore if (p->slen >= p->ssize) 12624297a3b0SGarrett D'Amore enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 12634297a3b0SGarrett D'Amore assert(p->slen < p->ssize); 12644297a3b0SGarrett D'Amore 12654297a3b0SGarrett D'Amore /* finally, it's all reduced to the easy case */ 12664297a3b0SGarrett D'Amore p->strip[p->slen++] = SOP(op, opnd); 12674297a3b0SGarrett D'Amore } 12684297a3b0SGarrett D'Amore 12694297a3b0SGarrett D'Amore /* 12704297a3b0SGarrett D'Amore * doinsert - insert a sop into the strip 12714297a3b0SGarrett D'Amore */ 12724297a3b0SGarrett D'Amore static void 12734297a3b0SGarrett D'Amore doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 12744297a3b0SGarrett D'Amore { 12754297a3b0SGarrett D'Amore sopno sn; 12764297a3b0SGarrett D'Amore sop s; 12774297a3b0SGarrett D'Amore int i; 12784297a3b0SGarrett D'Amore 12794297a3b0SGarrett D'Amore /* avoid making error situations worse */ 12804297a3b0SGarrett D'Amore if (p->error != 0) 12814297a3b0SGarrett D'Amore return; 12824297a3b0SGarrett D'Amore 12834297a3b0SGarrett D'Amore sn = HERE(); 12844297a3b0SGarrett D'Amore EMIT(op, opnd); /* do checks, ensure space */ 12854297a3b0SGarrett D'Amore assert(HERE() == sn+1); 12864297a3b0SGarrett D'Amore s = p->strip[sn]; 12874297a3b0SGarrett D'Amore 12884297a3b0SGarrett D'Amore /* adjust paren pointers */ 12894297a3b0SGarrett D'Amore assert(pos > 0); 12904297a3b0SGarrett D'Amore for (i = 1; i < NPAREN; i++) { 12914297a3b0SGarrett D'Amore if (p->pbegin[i] >= pos) { 12924297a3b0SGarrett D'Amore p->pbegin[i]++; 12934297a3b0SGarrett D'Amore } 12944297a3b0SGarrett D'Amore if (p->pend[i] >= pos) { 12954297a3b0SGarrett D'Amore p->pend[i]++; 12964297a3b0SGarrett D'Amore } 12974297a3b0SGarrett D'Amore } 12984297a3b0SGarrett D'Amore 1299eda71b4aSGarrett D'Amore (void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 13004297a3b0SGarrett D'Amore (HERE()-pos-1)*sizeof (sop)); 13014297a3b0SGarrett D'Amore p->strip[pos] = s; 13024297a3b0SGarrett D'Amore } 13034297a3b0SGarrett D'Amore 13044297a3b0SGarrett D'Amore /* 13054297a3b0SGarrett D'Amore * dofwd - complete a forward reference 13064297a3b0SGarrett D'Amore */ 13074297a3b0SGarrett D'Amore static void 13084297a3b0SGarrett D'Amore dofwd(struct parse *p, sopno pos, sop value) 13094297a3b0SGarrett D'Amore { 13104297a3b0SGarrett D'Amore /* avoid making error situations worse */ 13114297a3b0SGarrett D'Amore if (p->error != 0) 13124297a3b0SGarrett D'Amore return; 13134297a3b0SGarrett D'Amore 13144297a3b0SGarrett D'Amore assert(value < 1<<OPSHIFT); 13154297a3b0SGarrett D'Amore p->strip[pos] = OP(p->strip[pos]) | value; 13164297a3b0SGarrett D'Amore } 13174297a3b0SGarrett D'Amore 13184297a3b0SGarrett D'Amore /* 13194297a3b0SGarrett D'Amore * enlarge - enlarge the strip 13204297a3b0SGarrett D'Amore */ 13214297a3b0SGarrett D'Amore static void 13224297a3b0SGarrett D'Amore enlarge(struct parse *p, sopno size) 13234297a3b0SGarrett D'Amore { 13244297a3b0SGarrett D'Amore sop *sp; 13254297a3b0SGarrett D'Amore 13264297a3b0SGarrett D'Amore if (p->ssize >= size) 13274297a3b0SGarrett D'Amore return; 13284297a3b0SGarrett D'Amore 13294297a3b0SGarrett D'Amore sp = (sop *)realloc(p->strip, size*sizeof (sop)); 13304297a3b0SGarrett D'Amore if (sp == NULL) { 13314297a3b0SGarrett D'Amore SETERROR(REG_ESPACE); 13324297a3b0SGarrett D'Amore return; 13334297a3b0SGarrett D'Amore } 13344297a3b0SGarrett D'Amore p->strip = sp; 13354297a3b0SGarrett D'Amore p->ssize = size; 13364297a3b0SGarrett D'Amore } 13374297a3b0SGarrett D'Amore 13384297a3b0SGarrett D'Amore /* 13394297a3b0SGarrett D'Amore * stripsnug - compact the strip 13404297a3b0SGarrett D'Amore */ 13414297a3b0SGarrett D'Amore static void 13424297a3b0SGarrett D'Amore stripsnug(struct parse *p, struct re_guts *g) 13434297a3b0SGarrett D'Amore { 13444297a3b0SGarrett D'Amore g->nstates = p->slen; 13454297a3b0SGarrett D'Amore g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop)); 13464297a3b0SGarrett D'Amore if (g->strip == NULL) { 13474297a3b0SGarrett D'Amore SETERROR(REG_ESPACE); 13484297a3b0SGarrett D'Amore g->strip = p->strip; 13494297a3b0SGarrett D'Amore } 13504297a3b0SGarrett D'Amore } 13514297a3b0SGarrett D'Amore 13524297a3b0SGarrett D'Amore /* 13534297a3b0SGarrett D'Amore * findmust - fill in must and mlen with longest mandatory literal string 13544297a3b0SGarrett D'Amore * 13554297a3b0SGarrett D'Amore * This algorithm could do fancy things like analyzing the operands of | 13564297a3b0SGarrett D'Amore * for common subsequences. Someday. This code is simple and finds most 13574297a3b0SGarrett D'Amore * of the interesting cases. 13584297a3b0SGarrett D'Amore * 13594297a3b0SGarrett D'Amore * Note that must and mlen got initialized during setup. 13604297a3b0SGarrett D'Amore */ 13614297a3b0SGarrett D'Amore static void 13624297a3b0SGarrett D'Amore findmust(struct parse *p, struct re_guts *g) 13634297a3b0SGarrett D'Amore { 13644297a3b0SGarrett D'Amore sop *scan; 13654297a3b0SGarrett D'Amore sop *start; 13664297a3b0SGarrett D'Amore sop *newstart; 13674297a3b0SGarrett D'Amore sopno newlen; 13684297a3b0SGarrett D'Amore sop s; 13694297a3b0SGarrett D'Amore char *cp; 13704297a3b0SGarrett D'Amore int offset; 13714297a3b0SGarrett D'Amore char buf[MB_LEN_MAX]; 13724297a3b0SGarrett D'Amore size_t clen; 13734297a3b0SGarrett D'Amore mbstate_t mbs; 1374*2d08521bSGarrett D'Amore locale_t loc = uselocale(NULL); 13754297a3b0SGarrett D'Amore 13764297a3b0SGarrett D'Amore /* avoid making error situations worse */ 13774297a3b0SGarrett D'Amore if (p->error != 0) 13784297a3b0SGarrett D'Amore return; 13794297a3b0SGarrett D'Amore 13804297a3b0SGarrett D'Amore /* 13814297a3b0SGarrett D'Amore * It's not generally safe to do a ``char'' substring search on 13824297a3b0SGarrett D'Amore * multibyte character strings, but it's safe for at least 13834297a3b0SGarrett D'Amore * UTF-8 (see RFC 3629). 13844297a3b0SGarrett D'Amore */ 13854297a3b0SGarrett D'Amore if (MB_CUR_MAX > 1 && 1386*2d08521bSGarrett D'Amore strcmp(loc->runelocale->__encoding, "UTF-8") != 0) 13874297a3b0SGarrett D'Amore return; 13884297a3b0SGarrett D'Amore 13894297a3b0SGarrett D'Amore /* find the longest OCHAR sequence in strip */ 13904297a3b0SGarrett D'Amore newlen = 0; 13914297a3b0SGarrett D'Amore offset = 0; 13924297a3b0SGarrett D'Amore g->moffset = 0; 13934297a3b0SGarrett D'Amore scan = g->strip + 1; 13944297a3b0SGarrett D'Amore do { 13954297a3b0SGarrett D'Amore s = *scan++; 13964297a3b0SGarrett D'Amore switch (OP(s)) { 13974297a3b0SGarrett D'Amore case OCHAR: /* sequence member */ 13984297a3b0SGarrett D'Amore if (newlen == 0) { /* new sequence */ 13994297a3b0SGarrett D'Amore (void) memset(&mbs, 0, sizeof (mbs)); 14004297a3b0SGarrett D'Amore newstart = scan - 1; 14014297a3b0SGarrett D'Amore } 14024297a3b0SGarrett D'Amore clen = wcrtomb(buf, OPND(s), &mbs); 14034297a3b0SGarrett D'Amore if (clen == (size_t)-1) 14044297a3b0SGarrett D'Amore goto toohard; 14054297a3b0SGarrett D'Amore newlen += clen; 14064297a3b0SGarrett D'Amore break; 14074297a3b0SGarrett D'Amore case OPLUS_: /* things that don't break one */ 14084297a3b0SGarrett D'Amore case OLPAREN: 14094297a3b0SGarrett D'Amore case ORPAREN: 14104297a3b0SGarrett D'Amore break; 14114297a3b0SGarrett D'Amore case OQUEST_: /* things that must be skipped */ 14124297a3b0SGarrett D'Amore case OCH_: 14134297a3b0SGarrett D'Amore offset = altoffset(scan, offset); 14144297a3b0SGarrett D'Amore scan--; 14154297a3b0SGarrett D'Amore do { 14164297a3b0SGarrett D'Amore scan += OPND(s); 14174297a3b0SGarrett D'Amore s = *scan; 14184297a3b0SGarrett D'Amore /* assert() interferes w debug printouts */ 14194297a3b0SGarrett D'Amore if (OP(s) != O_QUEST && OP(s) != O_CH && 14204297a3b0SGarrett D'Amore OP(s) != OOR2) { 14214297a3b0SGarrett D'Amore g->iflags |= BAD; 14224297a3b0SGarrett D'Amore return; 14234297a3b0SGarrett D'Amore } 14244297a3b0SGarrett D'Amore } while (OP(s) != O_QUEST && OP(s) != O_CH); 14254297a3b0SGarrett D'Amore /* FALLTHROUGH */ 14264297a3b0SGarrett D'Amore case OBOW: /* things that break a sequence */ 14274297a3b0SGarrett D'Amore case OEOW: 14284297a3b0SGarrett D'Amore case OBOL: 14294297a3b0SGarrett D'Amore case OEOL: 14304297a3b0SGarrett D'Amore case O_QUEST: 14314297a3b0SGarrett D'Amore case O_CH: 14324297a3b0SGarrett D'Amore case OEND: 14334297a3b0SGarrett D'Amore if (newlen > g->mlen) { /* ends one */ 14344297a3b0SGarrett D'Amore start = newstart; 14354297a3b0SGarrett D'Amore g->mlen = newlen; 14364297a3b0SGarrett D'Amore if (offset > -1) { 14374297a3b0SGarrett D'Amore g->moffset += offset; 14384297a3b0SGarrett D'Amore offset = newlen; 14394297a3b0SGarrett D'Amore } else 14404297a3b0SGarrett D'Amore g->moffset = offset; 14414297a3b0SGarrett D'Amore } else { 14424297a3b0SGarrett D'Amore if (offset > -1) 14434297a3b0SGarrett D'Amore offset += newlen; 14444297a3b0SGarrett D'Amore } 14454297a3b0SGarrett D'Amore newlen = 0; 14464297a3b0SGarrett D'Amore break; 14474297a3b0SGarrett D'Amore case OANY: 14484297a3b0SGarrett D'Amore if (newlen > g->mlen) { /* ends one */ 14494297a3b0SGarrett D'Amore start = newstart; 14504297a3b0SGarrett D'Amore g->mlen = newlen; 14514297a3b0SGarrett D'Amore if (offset > -1) { 14524297a3b0SGarrett D'Amore g->moffset += offset; 14534297a3b0SGarrett D'Amore offset = newlen; 14544297a3b0SGarrett D'Amore } else 14554297a3b0SGarrett D'Amore g->moffset = offset; 14564297a3b0SGarrett D'Amore } else { 14574297a3b0SGarrett D'Amore if (offset > -1) 14584297a3b0SGarrett D'Amore offset += newlen; 14594297a3b0SGarrett D'Amore } 14604297a3b0SGarrett D'Amore if (offset > -1) 14614297a3b0SGarrett D'Amore offset++; 14624297a3b0SGarrett D'Amore newlen = 0; 14634297a3b0SGarrett D'Amore break; 14644297a3b0SGarrett D'Amore case OANYOF: /* may or may not invalidate offset */ 14654297a3b0SGarrett D'Amore /* First, everything as OANY */ 14664297a3b0SGarrett D'Amore if (newlen > g->mlen) { /* ends one */ 14674297a3b0SGarrett D'Amore start = newstart; 14684297a3b0SGarrett D'Amore g->mlen = newlen; 14694297a3b0SGarrett D'Amore if (offset > -1) { 14704297a3b0SGarrett D'Amore g->moffset += offset; 14714297a3b0SGarrett D'Amore offset = newlen; 14724297a3b0SGarrett D'Amore } else 14734297a3b0SGarrett D'Amore g->moffset = offset; 14744297a3b0SGarrett D'Amore } else { 14754297a3b0SGarrett D'Amore if (offset > -1) 14764297a3b0SGarrett D'Amore offset += newlen; 14774297a3b0SGarrett D'Amore } 14784297a3b0SGarrett D'Amore if (offset > -1) 14794297a3b0SGarrett D'Amore offset++; 14804297a3b0SGarrett D'Amore newlen = 0; 14814297a3b0SGarrett D'Amore break; 14824297a3b0SGarrett D'Amore toohard: 14834297a3b0SGarrett D'Amore default: 14844297a3b0SGarrett D'Amore /* 14854297a3b0SGarrett D'Amore * Anything here makes it impossible or too hard 14864297a3b0SGarrett D'Amore * to calculate the offset -- so we give up; 14874297a3b0SGarrett D'Amore * save the last known good offset, in case the 14884297a3b0SGarrett D'Amore * must sequence doesn't occur later. 14894297a3b0SGarrett D'Amore */ 14904297a3b0SGarrett D'Amore if (newlen > g->mlen) { /* ends one */ 14914297a3b0SGarrett D'Amore start = newstart; 14924297a3b0SGarrett D'Amore g->mlen = newlen; 14934297a3b0SGarrett D'Amore if (offset > -1) 14944297a3b0SGarrett D'Amore g->moffset += offset; 14954297a3b0SGarrett D'Amore else 14964297a3b0SGarrett D'Amore g->moffset = offset; 14974297a3b0SGarrett D'Amore } 14984297a3b0SGarrett D'Amore offset = -1; 14994297a3b0SGarrett D'Amore newlen = 0; 15004297a3b0SGarrett D'Amore break; 15014297a3b0SGarrett D'Amore } 15024297a3b0SGarrett D'Amore } while (OP(s) != OEND); 15034297a3b0SGarrett D'Amore 15044297a3b0SGarrett D'Amore if (g->mlen == 0) { /* there isn't one */ 15054297a3b0SGarrett D'Amore g->moffset = -1; 15064297a3b0SGarrett D'Amore return; 15074297a3b0SGarrett D'Amore } 15084297a3b0SGarrett D'Amore 15094297a3b0SGarrett D'Amore /* turn it into a character string */ 15104297a3b0SGarrett D'Amore g->must = malloc((size_t)g->mlen + 1); 15114297a3b0SGarrett D'Amore if (g->must == NULL) { /* argh; just forget it */ 15124297a3b0SGarrett D'Amore g->mlen = 0; 15134297a3b0SGarrett D'Amore g->moffset = -1; 15144297a3b0SGarrett D'Amore return; 15154297a3b0SGarrett D'Amore } 15164297a3b0SGarrett D'Amore cp = g->must; 15174297a3b0SGarrett D'Amore scan = start; 15184297a3b0SGarrett D'Amore (void) memset(&mbs, 0, sizeof (mbs)); 15194297a3b0SGarrett D'Amore while (cp < g->must + g->mlen) { 15204297a3b0SGarrett D'Amore while (OP(s = *scan++) != OCHAR) 15214297a3b0SGarrett D'Amore continue; 15224297a3b0SGarrett D'Amore clen = wcrtomb(cp, OPND(s), &mbs); 15234297a3b0SGarrett D'Amore assert(clen != (size_t)-1); 15244297a3b0SGarrett D'Amore cp += clen; 15254297a3b0SGarrett D'Amore } 15264297a3b0SGarrett D'Amore assert(cp == g->must + g->mlen); 15274297a3b0SGarrett D'Amore *cp++ = '\0'; /* just on general principles */ 15284297a3b0SGarrett D'Amore } 15294297a3b0SGarrett D'Amore 15304297a3b0SGarrett D'Amore /* 15314297a3b0SGarrett D'Amore * altoffset - choose biggest offset among multiple choices 15324297a3b0SGarrett D'Amore * 15334297a3b0SGarrett D'Amore * Compute, recursively if necessary, the largest offset among multiple 15344297a3b0SGarrett D'Amore * re paths. 15354297a3b0SGarrett D'Amore */ 15364297a3b0SGarrett D'Amore static int 15374297a3b0SGarrett D'Amore altoffset(sop *scan, int offset) 15384297a3b0SGarrett D'Amore { 15394297a3b0SGarrett D'Amore int largest; 15404297a3b0SGarrett D'Amore int try; 15414297a3b0SGarrett D'Amore sop s; 15424297a3b0SGarrett D'Amore 15434297a3b0SGarrett D'Amore /* If we gave up already on offsets, return */ 15444297a3b0SGarrett D'Amore if (offset == -1) 15454297a3b0SGarrett D'Amore return (-1); 15464297a3b0SGarrett D'Amore 15474297a3b0SGarrett D'Amore largest = 0; 15484297a3b0SGarrett D'Amore try = 0; 15494297a3b0SGarrett D'Amore s = *scan++; 15504297a3b0SGarrett D'Amore while (OP(s) != O_QUEST && OP(s) != O_CH) { 15514297a3b0SGarrett D'Amore switch (OP(s)) { 15524297a3b0SGarrett D'Amore case OOR1: 15534297a3b0SGarrett D'Amore if (try > largest) 15544297a3b0SGarrett D'Amore largest = try; 15554297a3b0SGarrett D'Amore try = 0; 15564297a3b0SGarrett D'Amore break; 15574297a3b0SGarrett D'Amore case OQUEST_: 15584297a3b0SGarrett D'Amore case OCH_: 15594297a3b0SGarrett D'Amore try = altoffset(scan, try); 15604297a3b0SGarrett D'Amore if (try == -1) 15614297a3b0SGarrett D'Amore return (-1); 15624297a3b0SGarrett D'Amore scan--; 15634297a3b0SGarrett D'Amore do { 15644297a3b0SGarrett D'Amore scan += OPND(s); 15654297a3b0SGarrett D'Amore s = *scan; 15664297a3b0SGarrett D'Amore if (OP(s) != O_QUEST && OP(s) != O_CH && 15674297a3b0SGarrett D'Amore OP(s) != OOR2) 15684297a3b0SGarrett D'Amore return (-1); 15694297a3b0SGarrett D'Amore } while (OP(s) != O_QUEST && OP(s) != O_CH); 15704297a3b0SGarrett D'Amore /* 15714297a3b0SGarrett D'Amore * We must skip to the next position, or we'll 15724297a3b0SGarrett D'Amore * leave altoffset() too early. 15734297a3b0SGarrett D'Amore */ 15744297a3b0SGarrett D'Amore scan++; 15754297a3b0SGarrett D'Amore break; 15764297a3b0SGarrett D'Amore case OANYOF: 15774297a3b0SGarrett D'Amore case OCHAR: 15784297a3b0SGarrett D'Amore case OANY: 15794297a3b0SGarrett D'Amore try++; 15804297a3b0SGarrett D'Amore /*FALLTHRU*/ 15814297a3b0SGarrett D'Amore case OBOW: 15824297a3b0SGarrett D'Amore case OEOW: 15834297a3b0SGarrett D'Amore case OLPAREN: 15844297a3b0SGarrett D'Amore case ORPAREN: 15854297a3b0SGarrett D'Amore case OOR2: 15864297a3b0SGarrett D'Amore break; 15874297a3b0SGarrett D'Amore default: 15884297a3b0SGarrett D'Amore try = -1; 15894297a3b0SGarrett D'Amore break; 15904297a3b0SGarrett D'Amore } 15914297a3b0SGarrett D'Amore if (try == -1) 15924297a3b0SGarrett D'Amore return (-1); 15934297a3b0SGarrett D'Amore s = *scan++; 15944297a3b0SGarrett D'Amore } 15954297a3b0SGarrett D'Amore 15964297a3b0SGarrett D'Amore if (try > largest) 15974297a3b0SGarrett D'Amore largest = try; 15984297a3b0SGarrett D'Amore 15994297a3b0SGarrett D'Amore return (largest+offset); 16004297a3b0SGarrett D'Amore } 16014297a3b0SGarrett D'Amore 16024297a3b0SGarrett D'Amore /* 16034297a3b0SGarrett D'Amore * computejumps - compute char jumps for BM scan 16044297a3b0SGarrett D'Amore * 16054297a3b0SGarrett D'Amore * This algorithm assumes g->must exists and is has size greater than 16064297a3b0SGarrett D'Amore * zero. It's based on the algorithm found on Computer Algorithms by 16074297a3b0SGarrett D'Amore * Sara Baase. 16084297a3b0SGarrett D'Amore * 16094297a3b0SGarrett D'Amore * A char jump is the number of characters one needs to jump based on 16104297a3b0SGarrett D'Amore * the value of the character from the text that was mismatched. 16114297a3b0SGarrett D'Amore */ 16124297a3b0SGarrett D'Amore static void 16134297a3b0SGarrett D'Amore computejumps(struct parse *p, struct re_guts *g) 16144297a3b0SGarrett D'Amore { 16154297a3b0SGarrett D'Amore int ch; 16164297a3b0SGarrett D'Amore int mindex; 16174297a3b0SGarrett D'Amore 16184297a3b0SGarrett D'Amore /* Avoid making errors worse */ 16194297a3b0SGarrett D'Amore if (p->error != 0) 16204297a3b0SGarrett D'Amore return; 16214297a3b0SGarrett D'Amore 16224297a3b0SGarrett D'Amore g->charjump = (int *)malloc((NC + 1) * sizeof (int)); 16234297a3b0SGarrett D'Amore if (g->charjump == NULL) /* Not a fatal error */ 16244297a3b0SGarrett D'Amore return; 16254297a3b0SGarrett D'Amore /* Adjust for signed chars, if necessary */ 16264297a3b0SGarrett D'Amore g->charjump = &g->charjump[-(CHAR_MIN)]; 16274297a3b0SGarrett D'Amore 16284297a3b0SGarrett D'Amore /* 16294297a3b0SGarrett D'Amore * If the character does not exist in the pattern, the jump 16304297a3b0SGarrett D'Amore * is equal to the number of characters in the pattern. 16314297a3b0SGarrett D'Amore */ 16324297a3b0SGarrett D'Amore for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 16334297a3b0SGarrett D'Amore g->charjump[ch] = g->mlen; 16344297a3b0SGarrett D'Amore 16354297a3b0SGarrett D'Amore /* 16364297a3b0SGarrett D'Amore * If the character does exist, compute the jump that would 16374297a3b0SGarrett D'Amore * take us to the last character in the pattern equal to it 16384297a3b0SGarrett D'Amore * (notice that we match right to left, so that last character 16394297a3b0SGarrett D'Amore * is the first one that would be matched). 16404297a3b0SGarrett D'Amore */ 16414297a3b0SGarrett D'Amore for (mindex = 0; mindex < g->mlen; mindex++) 16424297a3b0SGarrett D'Amore g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 16434297a3b0SGarrett D'Amore } 16444297a3b0SGarrett D'Amore 16454297a3b0SGarrett D'Amore /* 16464297a3b0SGarrett D'Amore * computematchjumps - compute match jumps for BM scan 16474297a3b0SGarrett D'Amore * 16484297a3b0SGarrett D'Amore * This algorithm assumes g->must exists and is has size greater than 16494297a3b0SGarrett D'Amore * zero. It's based on the algorithm found on Computer Algorithms by 16504297a3b0SGarrett D'Amore * Sara Baase. 16514297a3b0SGarrett D'Amore * 16524297a3b0SGarrett D'Amore * A match jump is the number of characters one needs to advance based 16534297a3b0SGarrett D'Amore * on the already-matched suffix. 16544297a3b0SGarrett D'Amore * Notice that all values here are minus (g->mlen-1), because of the way 16554297a3b0SGarrett D'Amore * the search algorithm works. 16564297a3b0SGarrett D'Amore */ 16574297a3b0SGarrett D'Amore static void 16584297a3b0SGarrett D'Amore computematchjumps(struct parse *p, struct re_guts *g) 16594297a3b0SGarrett D'Amore { 16604297a3b0SGarrett D'Amore int mindex; /* General "must" iterator */ 16614297a3b0SGarrett D'Amore int suffix; /* Keeps track of matching suffix */ 16624297a3b0SGarrett D'Amore int ssuffix; /* Keeps track of suffixes' suffix */ 16634297a3b0SGarrett D'Amore int *pmatches; 16644297a3b0SGarrett D'Amore /* 16654297a3b0SGarrett D'Amore * pmatches[k] points to the next i 16664297a3b0SGarrett D'Amore * such that i+1...mlen is a substring 16674297a3b0SGarrett D'Amore * of k+1...k+mlen-i-1 16684297a3b0SGarrett D'Amore */ 16694297a3b0SGarrett D'Amore 16704297a3b0SGarrett D'Amore /* Avoid making errors worse */ 16714297a3b0SGarrett D'Amore if (p->error != 0) 16724297a3b0SGarrett D'Amore return; 16734297a3b0SGarrett D'Amore 16744297a3b0SGarrett D'Amore pmatches = (int *)malloc(g->mlen * sizeof (unsigned int)); 16754297a3b0SGarrett D'Amore if (pmatches == NULL) { 16764297a3b0SGarrett D'Amore g->matchjump = NULL; 16774297a3b0SGarrett D'Amore return; 16784297a3b0SGarrett D'Amore } 16794297a3b0SGarrett D'Amore 16804297a3b0SGarrett D'Amore g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int)); 16814297a3b0SGarrett D'Amore if (g->matchjump == NULL) /* Not a fatal error */ 16824297a3b0SGarrett D'Amore return; 16834297a3b0SGarrett D'Amore 16844297a3b0SGarrett D'Amore /* Set maximum possible jump for each character in the pattern */ 16854297a3b0SGarrett D'Amore for (mindex = 0; mindex < g->mlen; mindex++) 16864297a3b0SGarrett D'Amore g->matchjump[mindex] = 2*g->mlen - mindex - 1; 16874297a3b0SGarrett D'Amore 16884297a3b0SGarrett D'Amore /* Compute pmatches[] */ 16894297a3b0SGarrett D'Amore for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; 16904297a3b0SGarrett D'Amore mindex--, suffix--) { 16914297a3b0SGarrett D'Amore pmatches[mindex] = suffix; 16924297a3b0SGarrett D'Amore 16934297a3b0SGarrett D'Amore /* 16944297a3b0SGarrett D'Amore * If a mismatch is found, interrupting the substring, 16954297a3b0SGarrett D'Amore * compute the matchjump for that position. If no 16964297a3b0SGarrett D'Amore * mismatch is found, then a text substring mismatched 16974297a3b0SGarrett D'Amore * against the suffix will also mismatch against the 16984297a3b0SGarrett D'Amore * substring. 16994297a3b0SGarrett D'Amore */ 17004297a3b0SGarrett D'Amore while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) { 17014297a3b0SGarrett D'Amore g->matchjump[suffix] = MIN(g->matchjump[suffix], 17024297a3b0SGarrett D'Amore g->mlen - mindex - 1); 17034297a3b0SGarrett D'Amore suffix = pmatches[suffix]; 17044297a3b0SGarrett D'Amore } 17054297a3b0SGarrett D'Amore } 17064297a3b0SGarrett D'Amore 17074297a3b0SGarrett D'Amore /* 17084297a3b0SGarrett D'Amore * Compute the matchjump up to the last substring found to jump 17094297a3b0SGarrett D'Amore * to the beginning of the largest must pattern prefix matching 17104297a3b0SGarrett D'Amore * it's own suffix. 17114297a3b0SGarrett D'Amore */ 17124297a3b0SGarrett D'Amore for (mindex = 0; mindex <= suffix; mindex++) 17134297a3b0SGarrett D'Amore g->matchjump[mindex] = MIN(g->matchjump[mindex], 17144297a3b0SGarrett D'Amore g->mlen + suffix - mindex); 17154297a3b0SGarrett D'Amore 17164297a3b0SGarrett D'Amore ssuffix = pmatches[suffix]; 17174297a3b0SGarrett D'Amore while (suffix < g->mlen) { 17184297a3b0SGarrett D'Amore while (suffix <= ssuffix && suffix < g->mlen) { 17194297a3b0SGarrett D'Amore g->matchjump[suffix] = MIN(g->matchjump[suffix], 17204297a3b0SGarrett D'Amore g->mlen + ssuffix - suffix); 17214297a3b0SGarrett D'Amore suffix++; 17224297a3b0SGarrett D'Amore } 17234297a3b0SGarrett D'Amore if (suffix < g->mlen) 17244297a3b0SGarrett D'Amore ssuffix = pmatches[ssuffix]; 17254297a3b0SGarrett D'Amore } 17264297a3b0SGarrett D'Amore 17274297a3b0SGarrett D'Amore free(pmatches); 17284297a3b0SGarrett D'Amore } 17294297a3b0SGarrett D'Amore 17304297a3b0SGarrett D'Amore /* 17314297a3b0SGarrett D'Amore * pluscount - count + nesting 17324297a3b0SGarrett D'Amore */ 17334297a3b0SGarrett D'Amore static sopno /* nesting depth */ 17344297a3b0SGarrett D'Amore pluscount(struct parse *p, struct re_guts *g) 17354297a3b0SGarrett D'Amore { 17364297a3b0SGarrett D'Amore sop *scan; 17374297a3b0SGarrett D'Amore sop s; 17384297a3b0SGarrett D'Amore sopno plusnest = 0; 17394297a3b0SGarrett D'Amore sopno maxnest = 0; 17404297a3b0SGarrett D'Amore 17414297a3b0SGarrett D'Amore if (p->error != 0) 17424297a3b0SGarrett D'Amore return (0); /* there may not be an OEND */ 17434297a3b0SGarrett D'Amore 17444297a3b0SGarrett D'Amore scan = g->strip + 1; 17454297a3b0SGarrett D'Amore do { 17464297a3b0SGarrett D'Amore s = *scan++; 17474297a3b0SGarrett D'Amore switch (OP(s)) { 17484297a3b0SGarrett D'Amore case OPLUS_: 17494297a3b0SGarrett D'Amore plusnest++; 17504297a3b0SGarrett D'Amore break; 17514297a3b0SGarrett D'Amore case O_PLUS: 17524297a3b0SGarrett D'Amore if (plusnest > maxnest) 17534297a3b0SGarrett D'Amore maxnest = plusnest; 17544297a3b0SGarrett D'Amore plusnest--; 17554297a3b0SGarrett D'Amore break; 17564297a3b0SGarrett D'Amore } 17574297a3b0SGarrett D'Amore } while (OP(s) != OEND); 17584297a3b0SGarrett D'Amore if (plusnest != 0) 17594297a3b0SGarrett D'Amore g->iflags |= BAD; 17604297a3b0SGarrett D'Amore return (maxnest); 17614297a3b0SGarrett D'Amore } 1762