14297a3b0SGarrett D'Amore /*
2*62c3776aSGarrett D'Amore * Copyright 2013 Garrett D'Amore <garrett@damore.org>
36b5e5868SGarrett D'Amore * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
433f5ff17SMilan Jurik * Copyright 2012 Milan Jurik. All rights reserved.
54297a3b0SGarrett D'Amore * Copyright (c) 1992, 1993, 1994 Henry Spencer.
64297a3b0SGarrett D'Amore * Copyright (c) 1992, 1993, 1994
74297a3b0SGarrett D'Amore * The Regents of the University of California. All rights reserved.
84297a3b0SGarrett D'Amore *
94297a3b0SGarrett D'Amore * This code is derived from software contributed to Berkeley by
104297a3b0SGarrett D'Amore * Henry Spencer.
114297a3b0SGarrett D'Amore *
124297a3b0SGarrett D'Amore * Redistribution and use in source and binary forms, with or without
134297a3b0SGarrett D'Amore * modification, are permitted provided that the following conditions
144297a3b0SGarrett D'Amore * are met:
154297a3b0SGarrett D'Amore * 1. Redistributions of source code must retain the above copyright
164297a3b0SGarrett D'Amore * notice, this list of conditions and the following disclaimer.
174297a3b0SGarrett D'Amore * 2. Redistributions in binary form must reproduce the above copyright
184297a3b0SGarrett D'Amore * notice, this list of conditions and the following disclaimer in the
194297a3b0SGarrett D'Amore * documentation and/or other materials provided with the distribution.
204297a3b0SGarrett D'Amore * 4. Neither the name of the University nor the names of its contributors
214297a3b0SGarrett D'Amore * may be used to endorse or promote products derived from this software
224297a3b0SGarrett D'Amore * without specific prior written permission.
234297a3b0SGarrett D'Amore *
244297a3b0SGarrett D'Amore * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
254297a3b0SGarrett D'Amore * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
264297a3b0SGarrett D'Amore * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
274297a3b0SGarrett D'Amore * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
284297a3b0SGarrett D'Amore * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
294297a3b0SGarrett D'Amore * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
304297a3b0SGarrett D'Amore * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
314297a3b0SGarrett D'Amore * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
324297a3b0SGarrett D'Amore * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
334297a3b0SGarrett D'Amore * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
344297a3b0SGarrett D'Amore * SUCH DAMAGE.
354297a3b0SGarrett D'Amore */
364297a3b0SGarrett D'Amore
374297a3b0SGarrett D'Amore #include "lint.h"
384297a3b0SGarrett D'Amore #include "file64.h"
394297a3b0SGarrett D'Amore #include <sys/types.h>
404297a3b0SGarrett D'Amore #include <stdio.h>
414297a3b0SGarrett D'Amore #include <string.h>
424297a3b0SGarrett D'Amore #include <ctype.h>
434297a3b0SGarrett D'Amore #include <limits.h>
444297a3b0SGarrett D'Amore #include <stdlib.h>
454297a3b0SGarrett D'Amore #include <regex.h>
464297a3b0SGarrett D'Amore #include <wchar.h>
474297a3b0SGarrett D'Amore #include <wctype.h>
484297a3b0SGarrett D'Amore
494297a3b0SGarrett D'Amore #include "runetype.h"
504297a3b0SGarrett D'Amore #include "collate.h"
514297a3b0SGarrett D'Amore
524297a3b0SGarrett D'Amore #include "utils.h"
534297a3b0SGarrett D'Amore #include "regex2.h"
544297a3b0SGarrett D'Amore
554297a3b0SGarrett D'Amore #include "cname.h"
564297a3b0SGarrett D'Amore #include "mblocal.h"
574297a3b0SGarrett D'Amore
584297a3b0SGarrett D'Amore /*
594297a3b0SGarrett D'Amore * parse structure, passed up and down to avoid global variables and
604297a3b0SGarrett D'Amore * other clumsinesses
614297a3b0SGarrett D'Amore */
624297a3b0SGarrett D'Amore struct parse {
634297a3b0SGarrett D'Amore char *next; /* next character in RE */
644297a3b0SGarrett D'Amore char *end; /* end of string (-> NUL normally) */
654297a3b0SGarrett D'Amore int error; /* has an error been seen? */
664297a3b0SGarrett D'Amore sop *strip; /* malloced strip */
674297a3b0SGarrett D'Amore sopno ssize; /* malloced strip size (allocated) */
684297a3b0SGarrett D'Amore sopno slen; /* malloced strip length (used) */
694297a3b0SGarrett D'Amore int ncsalloc; /* number of csets allocated */
704297a3b0SGarrett D'Amore struct re_guts *g;
714297a3b0SGarrett D'Amore #define NPAREN 10 /* we need to remember () 1-9 for back refs */
724297a3b0SGarrett D'Amore sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
734297a3b0SGarrett D'Amore sopno pend[NPAREN]; /* -> ) ([0] unused) */
744297a3b0SGarrett D'Amore };
754297a3b0SGarrett D'Amore
764297a3b0SGarrett D'Amore /* ========= begin header generated by ./mkh ========= */
774297a3b0SGarrett D'Amore #ifdef __cplusplus
784297a3b0SGarrett D'Amore extern "C" {
794297a3b0SGarrett D'Amore #endif
804297a3b0SGarrett D'Amore
814297a3b0SGarrett D'Amore /* === regcomp.c === */
824297a3b0SGarrett D'Amore static void p_ere(struct parse *p, wint_t stop);
834297a3b0SGarrett D'Amore static void p_ere_exp(struct parse *p);
844297a3b0SGarrett D'Amore static void p_str(struct parse *p);
854297a3b0SGarrett D'Amore static void p_bre(struct parse *p, wint_t end1, wint_t end2);
864297a3b0SGarrett D'Amore static int p_simp_re(struct parse *p, int starordinary);
874297a3b0SGarrett D'Amore static int p_count(struct parse *p);
884297a3b0SGarrett D'Amore static void p_bracket(struct parse *p);
894297a3b0SGarrett D'Amore static void p_b_term(struct parse *p, cset *cs);
904297a3b0SGarrett D'Amore static void p_b_cclass(struct parse *p, cset *cs);
914297a3b0SGarrett D'Amore static void p_b_eclass(struct parse *p, cset *cs);
924297a3b0SGarrett D'Amore static wint_t p_b_symbol(struct parse *p);
934297a3b0SGarrett D'Amore static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
944297a3b0SGarrett D'Amore static wint_t othercase(wint_t ch);
954297a3b0SGarrett D'Amore static void bothcases(struct parse *p, wint_t ch);
964297a3b0SGarrett D'Amore static void ordinary(struct parse *p, wint_t ch);
974297a3b0SGarrett D'Amore static void nonnewline(struct parse *p);
984297a3b0SGarrett D'Amore static void repeat(struct parse *p, sopno start, int from, int to);
994297a3b0SGarrett D'Amore static int seterr(struct parse *p, int e);
1004297a3b0SGarrett D'Amore static cset *allocset(struct parse *p);
1014297a3b0SGarrett D'Amore static void freeset(struct parse *p, cset *cs);
1024297a3b0SGarrett D'Amore static void CHadd(struct parse *p, cset *cs, wint_t ch);
1034297a3b0SGarrett D'Amore static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
1044297a3b0SGarrett D'Amore static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
1054297a3b0SGarrett D'Amore static wint_t singleton(cset *cs);
1064297a3b0SGarrett D'Amore static sopno dupl(struct parse *p, sopno start, sopno finish);
1074297a3b0SGarrett D'Amore static void doemit(struct parse *p, sop op, size_t opnd);
1084297a3b0SGarrett D'Amore static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1094297a3b0SGarrett D'Amore static void dofwd(struct parse *p, sopno pos, sop value);
1104297a3b0SGarrett D'Amore static void enlarge(struct parse *p, sopno size);
1114297a3b0SGarrett D'Amore static void stripsnug(struct parse *p, struct re_guts *g);
1124297a3b0SGarrett D'Amore static void findmust(struct parse *p, struct re_guts *g);
1134297a3b0SGarrett D'Amore static int altoffset(sop *scan, int offset);
1144297a3b0SGarrett D'Amore static void computejumps(struct parse *p, struct re_guts *g);
1154297a3b0SGarrett D'Amore static void computematchjumps(struct parse *p, struct re_guts *g);
1164297a3b0SGarrett D'Amore static sopno pluscount(struct parse *p, struct re_guts *g);
1174297a3b0SGarrett D'Amore static wint_t wgetnext(struct parse *p);
1184297a3b0SGarrett D'Amore
1194297a3b0SGarrett D'Amore #ifdef __cplusplus
1204297a3b0SGarrett D'Amore }
1214297a3b0SGarrett D'Amore #endif
1224297a3b0SGarrett D'Amore /* ========= end header generated by ./mkh ========= */
1234297a3b0SGarrett D'Amore
1244297a3b0SGarrett D'Amore static char nuls[10]; /* place to point scanner in event of error */
1254297a3b0SGarrett D'Amore
1264297a3b0SGarrett D'Amore /*
1274297a3b0SGarrett D'Amore * macros for use with parse structure
1284297a3b0SGarrett D'Amore * BEWARE: these know that the parse structure is named `p' !!!
1294297a3b0SGarrett D'Amore */
1304297a3b0SGarrett D'Amore #define PEEK() (*p->next)
1314297a3b0SGarrett D'Amore #define PEEK2() (*(p->next+1))
1324297a3b0SGarrett D'Amore #define MORE() (p->next < p->end)
1334297a3b0SGarrett D'Amore #define MORE2() (p->next+1 < p->end)
1344297a3b0SGarrett D'Amore #define SEE(c) (MORE() && PEEK() == (c))
1354297a3b0SGarrett D'Amore #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
1364297a3b0SGarrett D'Amore #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
1374297a3b0SGarrett D'Amore #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
1384297a3b0SGarrett D'Amore #define NEXT() (p->next++)
1394297a3b0SGarrett D'Amore #define NEXT2() (p->next += 2)
1404297a3b0SGarrett D'Amore #define NEXTn(n) (p->next += (n))
1414297a3b0SGarrett D'Amore #define GETNEXT() (*p->next++)
1424297a3b0SGarrett D'Amore #define WGETNEXT() wgetnext(p)
1434297a3b0SGarrett D'Amore #define SETERROR(e) ((void)seterr(p, (e)))
1444297a3b0SGarrett D'Amore #define REQUIRE(co, e) ((co) || seterr(p, e))
1454297a3b0SGarrett D'Amore #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
1464297a3b0SGarrett D'Amore #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
1474297a3b0SGarrett D'Amore #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
1484297a3b0SGarrett D'Amore #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
1494297a3b0SGarrett D'Amore #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
1504297a3b0SGarrett D'Amore #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
1514297a3b0SGarrett D'Amore #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
1524297a3b0SGarrett D'Amore #define HERE() (p->slen)
1534297a3b0SGarrett D'Amore #define THERE() (p->slen - 1)
1544297a3b0SGarrett D'Amore #define THERETHERE() (p->slen - 2)
1554297a3b0SGarrett D'Amore #define DROP(n) (p->slen -= (n))
1564297a3b0SGarrett D'Amore
1574297a3b0SGarrett D'Amore #ifndef NDEBUG
1584297a3b0SGarrett D'Amore static int never = 0; /* for use in asserts; shuts lint up */
1594297a3b0SGarrett D'Amore #else
1604297a3b0SGarrett D'Amore #define never 0 /* some <assert.h>s have bugs too */
1614297a3b0SGarrett D'Amore #endif
1624297a3b0SGarrett D'Amore
1634297a3b0SGarrett D'Amore /*
1644297a3b0SGarrett D'Amore * regcomp - interface for parser and compilation
1654297a3b0SGarrett D'Amore */
1664297a3b0SGarrett D'Amore int /* 0 success, otherwise REG_something */
regcomp(regex_t * _RESTRICT_KYWD preg,const char * _RESTRICT_KYWD pattern,int cflags)1674297a3b0SGarrett D'Amore regcomp(regex_t *_RESTRICT_KYWD preg,
1684297a3b0SGarrett D'Amore const char *_RESTRICT_KYWD pattern,
1694297a3b0SGarrett D'Amore int cflags)
1704297a3b0SGarrett D'Amore {
1714297a3b0SGarrett D'Amore struct parse pa;
1724297a3b0SGarrett D'Amore struct re_guts *g;
1734297a3b0SGarrett D'Amore struct parse *p = &pa;
1744297a3b0SGarrett D'Amore int i;
1754297a3b0SGarrett D'Amore size_t len;
1764297a3b0SGarrett D'Amore #ifdef REDEBUG
1774297a3b0SGarrett D'Amore #define GOODFLAGS(f) (f)
1784297a3b0SGarrett D'Amore #else
1794297a3b0SGarrett D'Amore #define GOODFLAGS(f) ((f)&~REG_DUMP)
1804297a3b0SGarrett D'Amore #endif
1814297a3b0SGarrett D'Amore
1824297a3b0SGarrett D'Amore /* We had REG_INVARG, but we don't have that on Solaris. */
1834297a3b0SGarrett D'Amore cflags = GOODFLAGS(cflags);
1844297a3b0SGarrett D'Amore if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
1854297a3b0SGarrett D'Amore return (REG_EFATAL);
1864297a3b0SGarrett D'Amore
1874297a3b0SGarrett D'Amore if (cflags®_PEND) {
1884297a3b0SGarrett D'Amore if (preg->re_endp < pattern)
1894297a3b0SGarrett D'Amore return (REG_EFATAL);
1904297a3b0SGarrett D'Amore len = preg->re_endp - pattern;
1914297a3b0SGarrett D'Amore } else
1924297a3b0SGarrett D'Amore len = strlen((char *)pattern);
1934297a3b0SGarrett D'Amore
1944297a3b0SGarrett D'Amore /* do the mallocs early so failure handling is easy */
1954297a3b0SGarrett D'Amore g = (struct re_guts *)malloc(sizeof (struct re_guts));
1964297a3b0SGarrett D'Amore if (g == NULL)
1974297a3b0SGarrett D'Amore return (REG_ESPACE);
1984297a3b0SGarrett D'Amore p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
1994297a3b0SGarrett D'Amore p->strip = (sop *)malloc(p->ssize * sizeof (sop));
2004297a3b0SGarrett D'Amore p->slen = 0;
2014297a3b0SGarrett D'Amore if (p->strip == NULL) {
2024297a3b0SGarrett D'Amore free((char *)g);
2034297a3b0SGarrett D'Amore return (REG_ESPACE);
2044297a3b0SGarrett D'Amore }
2054297a3b0SGarrett D'Amore
2064297a3b0SGarrett D'Amore /* set things up */
2074297a3b0SGarrett D'Amore p->g = g;
2084297a3b0SGarrett D'Amore p->next = (char *)pattern; /* convenience; we do not modify it */
2094297a3b0SGarrett D'Amore p->end = p->next + len;
2104297a3b0SGarrett D'Amore p->error = 0;
2114297a3b0SGarrett D'Amore p->ncsalloc = 0;
2124297a3b0SGarrett D'Amore for (i = 0; i < NPAREN; i++) {
2134297a3b0SGarrett D'Amore p->pbegin[i] = 0;
2144297a3b0SGarrett D'Amore p->pend[i] = 0;
2154297a3b0SGarrett D'Amore }
2164297a3b0SGarrett D'Amore g->sets = NULL;
2174297a3b0SGarrett D'Amore g->ncsets = 0;
2184297a3b0SGarrett D'Amore g->cflags = cflags;
2194297a3b0SGarrett D'Amore g->iflags = 0;
2204297a3b0SGarrett D'Amore g->nbol = 0;
2214297a3b0SGarrett D'Amore g->neol = 0;
2224297a3b0SGarrett D'Amore g->must = NULL;
2234297a3b0SGarrett D'Amore g->moffset = -1;
2244297a3b0SGarrett D'Amore g->charjump = NULL;
2254297a3b0SGarrett D'Amore g->matchjump = NULL;
2264297a3b0SGarrett D'Amore g->mlen = 0;
2274297a3b0SGarrett D'Amore g->nsub = 0;
2284297a3b0SGarrett D'Amore g->backrefs = 0;
2294297a3b0SGarrett D'Amore
2304297a3b0SGarrett D'Amore /* do it */
2314297a3b0SGarrett D'Amore EMIT(OEND, 0);
2324297a3b0SGarrett D'Amore g->firststate = THERE();
2334297a3b0SGarrett D'Amore if (cflags®_EXTENDED)
2344297a3b0SGarrett D'Amore p_ere(p, OUT);
2354297a3b0SGarrett D'Amore else if (cflags®_NOSPEC)
2364297a3b0SGarrett D'Amore p_str(p);
2374297a3b0SGarrett D'Amore else
2384297a3b0SGarrett D'Amore p_bre(p, OUT, OUT);
2394297a3b0SGarrett D'Amore EMIT(OEND, 0);
2404297a3b0SGarrett D'Amore g->laststate = THERE();
2414297a3b0SGarrett D'Amore
2424297a3b0SGarrett D'Amore /* tidy up loose ends and fill things in */
2434297a3b0SGarrett D'Amore stripsnug(p, g);
2444297a3b0SGarrett D'Amore findmust(p, g);
2454297a3b0SGarrett D'Amore /*
2464297a3b0SGarrett D'Amore * only use Boyer-Moore algorithm if the pattern is bigger
2474297a3b0SGarrett D'Amore * than three characters
2484297a3b0SGarrett D'Amore */
2494297a3b0SGarrett D'Amore if (g->mlen > 3) {
2504297a3b0SGarrett D'Amore computejumps(p, g);
2514297a3b0SGarrett D'Amore computematchjumps(p, g);
2524297a3b0SGarrett D'Amore if (g->matchjump == NULL && g->charjump != NULL) {
2534297a3b0SGarrett D'Amore free(g->charjump);
2544297a3b0SGarrett D'Amore g->charjump = NULL;
2554297a3b0SGarrett D'Amore }
2564297a3b0SGarrett D'Amore }
2574297a3b0SGarrett D'Amore g->nplus = pluscount(p, g);
2584297a3b0SGarrett D'Amore g->magic = MAGIC2;
2594297a3b0SGarrett D'Amore preg->re_nsub = g->nsub;
2604297a3b0SGarrett D'Amore preg->re_g = g;
2614297a3b0SGarrett D'Amore preg->re_magic = MAGIC1;
2624297a3b0SGarrett D'Amore #ifndef REDEBUG
2634297a3b0SGarrett D'Amore /* not debugging, so can't rely on the assert() in regexec() */
2644297a3b0SGarrett D'Amore if (g->iflags&BAD)
2654297a3b0SGarrett D'Amore SETERROR(REG_EFATAL);
2664297a3b0SGarrett D'Amore #endif
2674297a3b0SGarrett D'Amore
2684297a3b0SGarrett D'Amore /* win or lose, we're done */
2694297a3b0SGarrett D'Amore if (p->error != 0) /* lose */
2704297a3b0SGarrett D'Amore regfree(preg);
2714297a3b0SGarrett D'Amore return (p->error);
2724297a3b0SGarrett D'Amore }
2734297a3b0SGarrett D'Amore
2744297a3b0SGarrett D'Amore /*
2754297a3b0SGarrett D'Amore * p_ere - ERE parser top level, concatenation and alternation
2764297a3b0SGarrett D'Amore */
2774297a3b0SGarrett D'Amore static void
p_ere(struct parse * p,wint_t stop)2784297a3b0SGarrett D'Amore p_ere(struct parse *p,
2794297a3b0SGarrett D'Amore wint_t stop) /* character this ERE should end at */
2804297a3b0SGarrett D'Amore {
2814297a3b0SGarrett D'Amore char c;
2824297a3b0SGarrett D'Amore sopno prevback;
2834297a3b0SGarrett D'Amore sopno prevfwd;
2844297a3b0SGarrett D'Amore sopno conc;
2854297a3b0SGarrett D'Amore int first = 1; /* is this the first alternative? */
2864297a3b0SGarrett D'Amore
2874297a3b0SGarrett D'Amore for (;;) {
2884297a3b0SGarrett D'Amore /* do a bunch of concatenated expressions */
2894297a3b0SGarrett D'Amore conc = HERE();
2904297a3b0SGarrett D'Amore while (MORE() && (c = PEEK()) != '|' && c != stop)
2914297a3b0SGarrett D'Amore p_ere_exp(p);
2924297a3b0SGarrett D'Amore /* require nonempty */
2934297a3b0SGarrett D'Amore (void) REQUIRE(HERE() != conc, REG_BADPAT);
2944297a3b0SGarrett D'Amore
2954297a3b0SGarrett D'Amore if (!EAT('|'))
2964297a3b0SGarrett D'Amore break; /* NOTE BREAK OUT */
2974297a3b0SGarrett D'Amore
2984297a3b0SGarrett D'Amore if (first) {
2994297a3b0SGarrett D'Amore INSERT(OCH_, conc); /* offset is wrong */
3004297a3b0SGarrett D'Amore prevfwd = conc;
3014297a3b0SGarrett D'Amore prevback = conc;
3024297a3b0SGarrett D'Amore first = 0;
3034297a3b0SGarrett D'Amore }
3044297a3b0SGarrett D'Amore ASTERN(OOR1, prevback);
3054297a3b0SGarrett D'Amore prevback = THERE();
3064297a3b0SGarrett D'Amore AHEAD(prevfwd); /* fix previous offset */
3074297a3b0SGarrett D'Amore prevfwd = HERE();
3084297a3b0SGarrett D'Amore EMIT(OOR2, 0); /* offset is very wrong */
3094297a3b0SGarrett D'Amore }
3104297a3b0SGarrett D'Amore
3114297a3b0SGarrett D'Amore if (!first) { /* tail-end fixups */
3124297a3b0SGarrett D'Amore AHEAD(prevfwd);
3134297a3b0SGarrett D'Amore ASTERN(O_CH, prevback);
3144297a3b0SGarrett D'Amore }
3154297a3b0SGarrett D'Amore
3164297a3b0SGarrett D'Amore assert(!MORE() || SEE(stop));
3174297a3b0SGarrett D'Amore }
3184297a3b0SGarrett D'Amore
3194297a3b0SGarrett D'Amore /*
3204297a3b0SGarrett D'Amore * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
3214297a3b0SGarrett D'Amore */
3224297a3b0SGarrett D'Amore static void
p_ere_exp(struct parse * p)3234297a3b0SGarrett D'Amore p_ere_exp(struct parse *p)
3244297a3b0SGarrett D'Amore {
3254297a3b0SGarrett D'Amore char c;
3264297a3b0SGarrett D'Amore wint_t wc;
3274297a3b0SGarrett D'Amore sopno pos;
3284297a3b0SGarrett D'Amore int count;
3294297a3b0SGarrett D'Amore int count2;
3304297a3b0SGarrett D'Amore sopno subno;
3314297a3b0SGarrett D'Amore int wascaret = 0;
3324297a3b0SGarrett D'Amore
3334297a3b0SGarrett D'Amore assert(MORE()); /* caller should have ensured this */
3344297a3b0SGarrett D'Amore c = GETNEXT();
3354297a3b0SGarrett D'Amore
3364297a3b0SGarrett D'Amore pos = HERE();
3374297a3b0SGarrett D'Amore switch (c) {
3384297a3b0SGarrett D'Amore case '(':
3394297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EPAREN);
3404297a3b0SGarrett D'Amore p->g->nsub++;
3414297a3b0SGarrett D'Amore subno = p->g->nsub;
3424297a3b0SGarrett D'Amore if (subno < NPAREN)
3434297a3b0SGarrett D'Amore p->pbegin[subno] = HERE();
3444297a3b0SGarrett D'Amore EMIT(OLPAREN, subno);
3454297a3b0SGarrett D'Amore if (!SEE(')'))
3464297a3b0SGarrett D'Amore p_ere(p, ')');
3474297a3b0SGarrett D'Amore if (subno < NPAREN) {
3484297a3b0SGarrett D'Amore p->pend[subno] = HERE();
3494297a3b0SGarrett D'Amore assert(p->pend[subno] != 0);
3504297a3b0SGarrett D'Amore }
3514297a3b0SGarrett D'Amore EMIT(ORPAREN, subno);
3524297a3b0SGarrett D'Amore (void) MUSTEAT(')', REG_EPAREN);
3534297a3b0SGarrett D'Amore break;
3544297a3b0SGarrett D'Amore #ifndef POSIX_MISTAKE
3554297a3b0SGarrett D'Amore case ')': /* happens only if no current unmatched ( */
3564297a3b0SGarrett D'Amore /*
3574297a3b0SGarrett D'Amore * You may ask, why the ifndef? Because I didn't notice
3584297a3b0SGarrett D'Amore * this until slightly too late for 1003.2, and none of the
3594297a3b0SGarrett D'Amore * other 1003.2 regular-expression reviewers noticed it at
3604297a3b0SGarrett D'Amore * all. So an unmatched ) is legal POSIX, at least until
3614297a3b0SGarrett D'Amore * we can get it fixed.
3624297a3b0SGarrett D'Amore */
3634297a3b0SGarrett D'Amore SETERROR(REG_EPAREN);
3644297a3b0SGarrett D'Amore break;
3654297a3b0SGarrett D'Amore #endif
3664297a3b0SGarrett D'Amore case '^':
3674297a3b0SGarrett D'Amore EMIT(OBOL, 0);
3684297a3b0SGarrett D'Amore p->g->iflags |= USEBOL;
3694297a3b0SGarrett D'Amore p->g->nbol++;
3704297a3b0SGarrett D'Amore wascaret = 1;
3714297a3b0SGarrett D'Amore break;
3724297a3b0SGarrett D'Amore case '$':
3734297a3b0SGarrett D'Amore EMIT(OEOL, 0);
3744297a3b0SGarrett D'Amore p->g->iflags |= USEEOL;
3754297a3b0SGarrett D'Amore p->g->neol++;
3764297a3b0SGarrett D'Amore break;
3774297a3b0SGarrett D'Amore case '|':
3784297a3b0SGarrett D'Amore SETERROR(REG_BADPAT);
3794297a3b0SGarrett D'Amore break;
3804297a3b0SGarrett D'Amore case '*':
3814297a3b0SGarrett D'Amore case '+':
3824297a3b0SGarrett D'Amore case '?':
3834297a3b0SGarrett D'Amore SETERROR(REG_BADRPT);
3844297a3b0SGarrett D'Amore break;
3854297a3b0SGarrett D'Amore case '.':
3864297a3b0SGarrett D'Amore if (p->g->cflags®_NEWLINE)
3874297a3b0SGarrett D'Amore nonnewline(p);
3884297a3b0SGarrett D'Amore else
3894297a3b0SGarrett D'Amore EMIT(OANY, 0);
3904297a3b0SGarrett D'Amore break;
3914297a3b0SGarrett D'Amore case '[':
3924297a3b0SGarrett D'Amore p_bracket(p);
3934297a3b0SGarrett D'Amore break;
3944297a3b0SGarrett D'Amore case '\\':
3954297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EESCAPE);
3964297a3b0SGarrett D'Amore wc = WGETNEXT();
39784441f85SGarrett D'Amore switch (wc) {
39884441f85SGarrett D'Amore case '<':
39984441f85SGarrett D'Amore EMIT(OBOW, 0);
40084441f85SGarrett D'Amore break;
40184441f85SGarrett D'Amore case '>':
40284441f85SGarrett D'Amore EMIT(OEOW, 0);
40384441f85SGarrett D'Amore break;
40484441f85SGarrett D'Amore default:
4054297a3b0SGarrett D'Amore ordinary(p, wc);
4064297a3b0SGarrett D'Amore break;
40784441f85SGarrett D'Amore }
40884441f85SGarrett D'Amore break;
4094297a3b0SGarrett D'Amore case '{': /* okay as ordinary except if digit follows */
4104297a3b0SGarrett D'Amore (void) REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
4114297a3b0SGarrett D'Amore /* FALLTHROUGH */
4124297a3b0SGarrett D'Amore default:
4134297a3b0SGarrett D'Amore p->next--;
4144297a3b0SGarrett D'Amore wc = WGETNEXT();
4154297a3b0SGarrett D'Amore ordinary(p, wc);
4164297a3b0SGarrett D'Amore break;
4174297a3b0SGarrett D'Amore }
4184297a3b0SGarrett D'Amore
4194297a3b0SGarrett D'Amore if (!MORE())
4204297a3b0SGarrett D'Amore return;
4214297a3b0SGarrett D'Amore c = PEEK();
4224297a3b0SGarrett D'Amore /* we call { a repetition if followed by a digit */
4234297a3b0SGarrett D'Amore if (!(c == '*' || c == '+' || c == '?' ||
4244297a3b0SGarrett D'Amore (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
4254297a3b0SGarrett D'Amore return; /* no repetition, we're done */
4264297a3b0SGarrett D'Amore NEXT();
4274297a3b0SGarrett D'Amore
4284297a3b0SGarrett D'Amore (void) REQUIRE(!wascaret, REG_BADRPT);
4294297a3b0SGarrett D'Amore switch (c) {
4304297a3b0SGarrett D'Amore case '*': /* implemented as +? */
4314297a3b0SGarrett D'Amore /* this case does not require the (y|) trick, noKLUDGE */
4324297a3b0SGarrett D'Amore INSERT(OPLUS_, pos);
4334297a3b0SGarrett D'Amore ASTERN(O_PLUS, pos);
4344297a3b0SGarrett D'Amore INSERT(OQUEST_, pos);
4354297a3b0SGarrett D'Amore ASTERN(O_QUEST, pos);
4364297a3b0SGarrett D'Amore break;
4374297a3b0SGarrett D'Amore case '+':
4384297a3b0SGarrett D'Amore INSERT(OPLUS_, pos);
4394297a3b0SGarrett D'Amore ASTERN(O_PLUS, pos);
4404297a3b0SGarrett D'Amore break;
4414297a3b0SGarrett D'Amore case '?':
4424297a3b0SGarrett D'Amore /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
4434297a3b0SGarrett D'Amore INSERT(OCH_, pos); /* offset slightly wrong */
4444297a3b0SGarrett D'Amore ASTERN(OOR1, pos); /* this one's right */
4454297a3b0SGarrett D'Amore AHEAD(pos); /* fix the OCH_ */
4464297a3b0SGarrett D'Amore EMIT(OOR2, 0); /* offset very wrong... */
4474297a3b0SGarrett D'Amore AHEAD(THERE()); /* ...so fix it */
4484297a3b0SGarrett D'Amore ASTERN(O_CH, THERETHERE());
4494297a3b0SGarrett D'Amore break;
4504297a3b0SGarrett D'Amore case '{':
4514297a3b0SGarrett D'Amore count = p_count(p);
4524297a3b0SGarrett D'Amore if (EAT(',')) {
4534297a3b0SGarrett D'Amore if (isdigit((uch)PEEK())) {
4544297a3b0SGarrett D'Amore count2 = p_count(p);
4554297a3b0SGarrett D'Amore (void) REQUIRE(count <= count2, REG_BADBR);
4564297a3b0SGarrett D'Amore } else /* single number with comma */
4574297a3b0SGarrett D'Amore count2 = INFINITY;
4584297a3b0SGarrett D'Amore } else /* just a single number */
4594297a3b0SGarrett D'Amore count2 = count;
4604297a3b0SGarrett D'Amore repeat(p, pos, count, count2);
4614297a3b0SGarrett D'Amore if (!EAT('}')) { /* error heuristics */
4624297a3b0SGarrett D'Amore while (MORE() && PEEK() != '}')
4634297a3b0SGarrett D'Amore NEXT();
4644297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EBRACE);
4654297a3b0SGarrett D'Amore SETERROR(REG_BADBR);
4664297a3b0SGarrett D'Amore }
4674297a3b0SGarrett D'Amore break;
4684297a3b0SGarrett D'Amore }
4694297a3b0SGarrett D'Amore
4704297a3b0SGarrett D'Amore if (!MORE())
4714297a3b0SGarrett D'Amore return;
4724297a3b0SGarrett D'Amore c = PEEK();
4734297a3b0SGarrett D'Amore if (!(c == '*' || c == '+' || c == '?' ||
4744297a3b0SGarrett D'Amore (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
4754297a3b0SGarrett D'Amore return;
4764297a3b0SGarrett D'Amore SETERROR(REG_BADRPT);
4774297a3b0SGarrett D'Amore }
4784297a3b0SGarrett D'Amore
4794297a3b0SGarrett D'Amore /*
4804297a3b0SGarrett D'Amore * p_str - string (no metacharacters) "parser"
4814297a3b0SGarrett D'Amore */
4824297a3b0SGarrett D'Amore static void
p_str(struct parse * p)4834297a3b0SGarrett D'Amore p_str(struct parse *p)
4844297a3b0SGarrett D'Amore {
4854297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_BADPAT);
4864297a3b0SGarrett D'Amore while (MORE())
4874297a3b0SGarrett D'Amore ordinary(p, WGETNEXT());
4884297a3b0SGarrett D'Amore }
4894297a3b0SGarrett D'Amore
4904297a3b0SGarrett D'Amore /*
4914297a3b0SGarrett D'Amore * p_bre - BRE parser top level, anchoring and concatenation
4924297a3b0SGarrett D'Amore * Giving end1 as OUT essentially eliminates the end1/end2 check.
4934297a3b0SGarrett D'Amore *
4944297a3b0SGarrett D'Amore * This implementation is a bit of a kludge, in that a trailing $ is first
4954297a3b0SGarrett D'Amore * taken as an ordinary character and then revised to be an anchor.
4964297a3b0SGarrett D'Amore * The amount of lookahead needed to avoid this kludge is excessive.
4974297a3b0SGarrett D'Amore */
4984297a3b0SGarrett D'Amore static void
p_bre(struct parse * p,wint_t end1,wint_t end2)4994297a3b0SGarrett D'Amore p_bre(struct parse *p,
5004297a3b0SGarrett D'Amore wint_t end1, /* first terminating character */
5014297a3b0SGarrett D'Amore wint_t end2) /* second terminating character */
5024297a3b0SGarrett D'Amore {
5034297a3b0SGarrett D'Amore sopno start = HERE();
5044297a3b0SGarrett D'Amore int first = 1; /* first subexpression? */
5054297a3b0SGarrett D'Amore int wasdollar = 0;
5064297a3b0SGarrett D'Amore
5074297a3b0SGarrett D'Amore if (EAT('^')) {
5084297a3b0SGarrett D'Amore EMIT(OBOL, 0);
5094297a3b0SGarrett D'Amore p->g->iflags |= USEBOL;
5104297a3b0SGarrett D'Amore p->g->nbol++;
5114297a3b0SGarrett D'Amore }
5124297a3b0SGarrett D'Amore while (MORE() && !SEETWO(end1, end2)) {
5134297a3b0SGarrett D'Amore wasdollar = p_simp_re(p, first);
5144297a3b0SGarrett D'Amore first = 0;
5154297a3b0SGarrett D'Amore }
5164297a3b0SGarrett D'Amore if (wasdollar) { /* oops, that was a trailing anchor */
5174297a3b0SGarrett D'Amore DROP(1);
5184297a3b0SGarrett D'Amore EMIT(OEOL, 0);
5194297a3b0SGarrett D'Amore p->g->iflags |= USEEOL;
5204297a3b0SGarrett D'Amore p->g->neol++;
5214297a3b0SGarrett D'Amore }
5224297a3b0SGarrett D'Amore
5234297a3b0SGarrett D'Amore (void) REQUIRE(HERE() != start, REG_BADPAT); /* require nonempty */
5244297a3b0SGarrett D'Amore }
5254297a3b0SGarrett D'Amore
5264297a3b0SGarrett D'Amore /*
5274297a3b0SGarrett D'Amore * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
5284297a3b0SGarrett D'Amore */
5294297a3b0SGarrett D'Amore static int /* was the simple RE an unbackslashed $? */
p_simp_re(struct parse * p,int starordinary)5304297a3b0SGarrett D'Amore p_simp_re(struct parse *p,
5314297a3b0SGarrett D'Amore int starordinary) /* is a leading * an ordinary character? */
5324297a3b0SGarrett D'Amore {
5334297a3b0SGarrett D'Amore int c;
5344297a3b0SGarrett D'Amore int count;
5354297a3b0SGarrett D'Amore int count2;
5364297a3b0SGarrett D'Amore sopno pos;
5374297a3b0SGarrett D'Amore int i;
5384297a3b0SGarrett D'Amore wint_t wc;
5394297a3b0SGarrett D'Amore sopno subno;
5404297a3b0SGarrett D'Amore #define BACKSL (1<<CHAR_BIT)
5414297a3b0SGarrett D'Amore
5424297a3b0SGarrett D'Amore pos = HERE(); /* repetion op, if any, covers from here */
5434297a3b0SGarrett D'Amore
5444297a3b0SGarrett D'Amore assert(MORE()); /* caller should have ensured this */
5454297a3b0SGarrett D'Amore c = GETNEXT();
5464297a3b0SGarrett D'Amore if (c == '\\') {
5474297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EESCAPE);
5484297a3b0SGarrett D'Amore c = BACKSL | GETNEXT();
5494297a3b0SGarrett D'Amore }
5504297a3b0SGarrett D'Amore switch (c) {
5514297a3b0SGarrett D'Amore case '.':
5524297a3b0SGarrett D'Amore if (p->g->cflags®_NEWLINE)
5534297a3b0SGarrett D'Amore nonnewline(p);
5544297a3b0SGarrett D'Amore else
5554297a3b0SGarrett D'Amore EMIT(OANY, 0);
5564297a3b0SGarrett D'Amore break;
5574297a3b0SGarrett D'Amore case '[':
5584297a3b0SGarrett D'Amore p_bracket(p);
5594297a3b0SGarrett D'Amore break;
56084441f85SGarrett D'Amore case BACKSL|'<':
56184441f85SGarrett D'Amore EMIT(OBOW, 0);
56284441f85SGarrett D'Amore break;
56384441f85SGarrett D'Amore case BACKSL|'>':
56484441f85SGarrett D'Amore EMIT(OEOW, 0);
56584441f85SGarrett D'Amore break;
5664297a3b0SGarrett D'Amore case BACKSL|'{':
5674297a3b0SGarrett D'Amore SETERROR(REG_BADRPT);
5684297a3b0SGarrett D'Amore break;
5694297a3b0SGarrett D'Amore case BACKSL|'(':
5704297a3b0SGarrett D'Amore p->g->nsub++;
5714297a3b0SGarrett D'Amore subno = p->g->nsub;
5724297a3b0SGarrett D'Amore if (subno < NPAREN)
5734297a3b0SGarrett D'Amore p->pbegin[subno] = HERE();
5744297a3b0SGarrett D'Amore EMIT(OLPAREN, subno);
5754297a3b0SGarrett D'Amore /* the MORE here is an error heuristic */
5764297a3b0SGarrett D'Amore if (MORE() && !SEETWO('\\', ')'))
5774297a3b0SGarrett D'Amore p_bre(p, '\\', ')');
5784297a3b0SGarrett D'Amore if (subno < NPAREN) {
5794297a3b0SGarrett D'Amore p->pend[subno] = HERE();
5804297a3b0SGarrett D'Amore assert(p->pend[subno] != 0);
5814297a3b0SGarrett D'Amore }
5824297a3b0SGarrett D'Amore EMIT(ORPAREN, subno);
5834297a3b0SGarrett D'Amore (void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
5844297a3b0SGarrett D'Amore break;
5854297a3b0SGarrett D'Amore case BACKSL|')': /* should not get here -- must be user */
5864297a3b0SGarrett D'Amore case BACKSL|'}':
5874297a3b0SGarrett D'Amore SETERROR(REG_EPAREN);
5884297a3b0SGarrett D'Amore break;
5894297a3b0SGarrett D'Amore case BACKSL|'1':
5904297a3b0SGarrett D'Amore case BACKSL|'2':
5914297a3b0SGarrett D'Amore case BACKSL|'3':
5924297a3b0SGarrett D'Amore case BACKSL|'4':
5934297a3b0SGarrett D'Amore case BACKSL|'5':
5944297a3b0SGarrett D'Amore case BACKSL|'6':
5954297a3b0SGarrett D'Amore case BACKSL|'7':
5964297a3b0SGarrett D'Amore case BACKSL|'8':
5974297a3b0SGarrett D'Amore case BACKSL|'9':
5984297a3b0SGarrett D'Amore i = (c&~BACKSL) - '0';
5994297a3b0SGarrett D'Amore assert(i < NPAREN);
6004297a3b0SGarrett D'Amore if (p->pend[i] != 0) {
6014297a3b0SGarrett D'Amore assert(i <= p->g->nsub);
6024297a3b0SGarrett D'Amore EMIT(OBACK_, i);
6034297a3b0SGarrett D'Amore assert(p->pbegin[i] != 0);
6044297a3b0SGarrett D'Amore assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
6054297a3b0SGarrett D'Amore assert(OP(p->strip[p->pend[i]]) == ORPAREN);
6064297a3b0SGarrett D'Amore (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
6074297a3b0SGarrett D'Amore EMIT(O_BACK, i);
6084297a3b0SGarrett D'Amore } else
6094297a3b0SGarrett D'Amore SETERROR(REG_ESUBREG);
6104297a3b0SGarrett D'Amore p->g->backrefs = 1;
6114297a3b0SGarrett D'Amore break;
6124297a3b0SGarrett D'Amore case '*':
6134297a3b0SGarrett D'Amore (void) REQUIRE(starordinary, REG_BADRPT);
6144297a3b0SGarrett D'Amore /* FALLTHROUGH */
6154297a3b0SGarrett D'Amore default:
6164297a3b0SGarrett D'Amore p->next--;
6174297a3b0SGarrett D'Amore wc = WGETNEXT();
6184297a3b0SGarrett D'Amore ordinary(p, wc);
6194297a3b0SGarrett D'Amore break;
6204297a3b0SGarrett D'Amore }
6214297a3b0SGarrett D'Amore
6224297a3b0SGarrett D'Amore if (EAT('*')) { /* implemented as +? */
6234297a3b0SGarrett D'Amore /* this case does not require the (y|) trick, noKLUDGE */
6244297a3b0SGarrett D'Amore INSERT(OPLUS_, pos);
6254297a3b0SGarrett D'Amore ASTERN(O_PLUS, pos);
6264297a3b0SGarrett D'Amore INSERT(OQUEST_, pos);
6274297a3b0SGarrett D'Amore ASTERN(O_QUEST, pos);
6284297a3b0SGarrett D'Amore } else if (EATTWO('\\', '{')) {
6294297a3b0SGarrett D'Amore count = p_count(p);
6304297a3b0SGarrett D'Amore if (EAT(',')) {
6314297a3b0SGarrett D'Amore if (MORE() && isdigit((uch)PEEK())) {
6324297a3b0SGarrett D'Amore count2 = p_count(p);
6334297a3b0SGarrett D'Amore (void) REQUIRE(count <= count2, REG_BADBR);
6344297a3b0SGarrett D'Amore } else /* single number with comma */
6354297a3b0SGarrett D'Amore count2 = INFINITY;
6364297a3b0SGarrett D'Amore } else /* just a single number */
6374297a3b0SGarrett D'Amore count2 = count;
6384297a3b0SGarrett D'Amore repeat(p, pos, count, count2);
6394297a3b0SGarrett D'Amore if (!EATTWO('\\', '}')) { /* error heuristics */
6404297a3b0SGarrett D'Amore while (MORE() && !SEETWO('\\', '}'))
6414297a3b0SGarrett D'Amore NEXT();
6424297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EBRACE);
6434297a3b0SGarrett D'Amore SETERROR(REG_BADBR);
6444297a3b0SGarrett D'Amore }
6454297a3b0SGarrett D'Amore } else if (c == '$') /* $ (but not \$) ends it */
6464297a3b0SGarrett D'Amore return (1);
6474297a3b0SGarrett D'Amore
6484297a3b0SGarrett D'Amore return (0);
6494297a3b0SGarrett D'Amore }
6504297a3b0SGarrett D'Amore
6514297a3b0SGarrett D'Amore /*
6524297a3b0SGarrett D'Amore * p_count - parse a repetition count
6534297a3b0SGarrett D'Amore */
6544297a3b0SGarrett D'Amore static int /* the value */
p_count(struct parse * p)6554297a3b0SGarrett D'Amore p_count(struct parse *p)
6564297a3b0SGarrett D'Amore {
6574297a3b0SGarrett D'Amore int count = 0;
6584297a3b0SGarrett D'Amore int ndigits = 0;
6594297a3b0SGarrett D'Amore
6604297a3b0SGarrett D'Amore while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
6614297a3b0SGarrett D'Amore count = count*10 + (GETNEXT() - '0');
6624297a3b0SGarrett D'Amore ndigits++;
6634297a3b0SGarrett D'Amore }
6644297a3b0SGarrett D'Amore
6654297a3b0SGarrett D'Amore (void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
6664297a3b0SGarrett D'Amore return (count);
6674297a3b0SGarrett D'Amore }
6684297a3b0SGarrett D'Amore
6694297a3b0SGarrett D'Amore /*
6704297a3b0SGarrett D'Amore * p_bracket - parse a bracketed character list
6714297a3b0SGarrett D'Amore */
6724297a3b0SGarrett D'Amore static void
p_bracket(struct parse * p)6734297a3b0SGarrett D'Amore p_bracket(struct parse *p)
6744297a3b0SGarrett D'Amore {
6754297a3b0SGarrett D'Amore cset *cs;
6764297a3b0SGarrett D'Amore wint_t ch;
6774297a3b0SGarrett D'Amore
6784297a3b0SGarrett D'Amore /* Dept of Truly Sickening Special-Case Kludges */
6794297a3b0SGarrett D'Amore if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
6804297a3b0SGarrett D'Amore EMIT(OBOW, 0);
6814297a3b0SGarrett D'Amore NEXTn(6);
6824297a3b0SGarrett D'Amore return;
6834297a3b0SGarrett D'Amore }
6844297a3b0SGarrett D'Amore if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
6854297a3b0SGarrett D'Amore EMIT(OEOW, 0);
6864297a3b0SGarrett D'Amore NEXTn(6);
6874297a3b0SGarrett D'Amore return;
6884297a3b0SGarrett D'Amore }
6894297a3b0SGarrett D'Amore
6904297a3b0SGarrett D'Amore if ((cs = allocset(p)) == NULL)
6914297a3b0SGarrett D'Amore return;
6924297a3b0SGarrett D'Amore
6934297a3b0SGarrett D'Amore if (p->g->cflags®_ICASE)
6944297a3b0SGarrett D'Amore cs->icase = 1;
6954297a3b0SGarrett D'Amore if (EAT('^'))
6964297a3b0SGarrett D'Amore cs->invert = 1;
6974297a3b0SGarrett D'Amore if (EAT(']'))
6984297a3b0SGarrett D'Amore CHadd(p, cs, ']');
6994297a3b0SGarrett D'Amore else if (EAT('-'))
7004297a3b0SGarrett D'Amore CHadd(p, cs, '-');
7014297a3b0SGarrett D'Amore while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
7024297a3b0SGarrett D'Amore p_b_term(p, cs);
7034297a3b0SGarrett D'Amore if (EAT('-'))
7044297a3b0SGarrett D'Amore CHadd(p, cs, '-');
7054297a3b0SGarrett D'Amore (void) MUSTEAT(']', REG_EBRACK);
7064297a3b0SGarrett D'Amore
7074297a3b0SGarrett D'Amore if (p->error != 0) /* don't mess things up further */
7084297a3b0SGarrett D'Amore return;
7094297a3b0SGarrett D'Amore
7104297a3b0SGarrett D'Amore if (cs->invert && p->g->cflags®_NEWLINE)
7114297a3b0SGarrett D'Amore cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
7124297a3b0SGarrett D'Amore
7134297a3b0SGarrett D'Amore if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
7144297a3b0SGarrett D'Amore ordinary(p, ch);
7154297a3b0SGarrett D'Amore freeset(p, cs);
7164297a3b0SGarrett D'Amore } else
7174297a3b0SGarrett D'Amore EMIT(OANYOF, (int)(cs - p->g->sets));
7184297a3b0SGarrett D'Amore }
7194297a3b0SGarrett D'Amore
7204297a3b0SGarrett D'Amore /*
7214297a3b0SGarrett D'Amore * p_b_term - parse one term of a bracketed character list
7224297a3b0SGarrett D'Amore */
7234297a3b0SGarrett D'Amore static void
p_b_term(struct parse * p,cset * cs)7244297a3b0SGarrett D'Amore p_b_term(struct parse *p, cset *cs)
7254297a3b0SGarrett D'Amore {
7264297a3b0SGarrett D'Amore char c;
7274297a3b0SGarrett D'Amore wint_t start, finish;
7284297a3b0SGarrett D'Amore wint_t i;
729*62c3776aSGarrett D'Amore locale_t loc = uselocale(NULL);
7304297a3b0SGarrett D'Amore
7314297a3b0SGarrett D'Amore /* classify what we've got */
7324297a3b0SGarrett D'Amore switch ((MORE()) ? PEEK() : '\0') {
7334297a3b0SGarrett D'Amore case '[':
7344297a3b0SGarrett D'Amore c = (MORE2()) ? PEEK2() : '\0';
7354297a3b0SGarrett D'Amore break;
7364297a3b0SGarrett D'Amore case '-':
7374297a3b0SGarrett D'Amore SETERROR(REG_ERANGE);
7384297a3b0SGarrett D'Amore return; /* NOTE RETURN */
7394297a3b0SGarrett D'Amore default:
7404297a3b0SGarrett D'Amore c = '\0';
7414297a3b0SGarrett D'Amore break;
7424297a3b0SGarrett D'Amore }
7434297a3b0SGarrett D'Amore
7444297a3b0SGarrett D'Amore switch (c) {
7454297a3b0SGarrett D'Amore case ':': /* character class */
7464297a3b0SGarrett D'Amore NEXT2();
7474297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EBRACK);
7484297a3b0SGarrett D'Amore c = PEEK();
7494297a3b0SGarrett D'Amore (void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
7504297a3b0SGarrett D'Amore p_b_cclass(p, cs);
7514297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EBRACK);
7524297a3b0SGarrett D'Amore (void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
7534297a3b0SGarrett D'Amore break;
7544297a3b0SGarrett D'Amore case '=': /* equivalence class */
7554297a3b0SGarrett D'Amore NEXT2();
7564297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EBRACK);
7574297a3b0SGarrett D'Amore c = PEEK();
7584297a3b0SGarrett D'Amore (void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
7594297a3b0SGarrett D'Amore p_b_eclass(p, cs);
7604297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EBRACK);
7614297a3b0SGarrett D'Amore (void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
7624297a3b0SGarrett D'Amore break;
7634297a3b0SGarrett D'Amore default: /* symbol, ordinary character, or range */
7644297a3b0SGarrett D'Amore start = p_b_symbol(p);
7654297a3b0SGarrett D'Amore if (SEE('-') && MORE2() && PEEK2() != ']') {
7664297a3b0SGarrett D'Amore /* range */
7674297a3b0SGarrett D'Amore NEXT();
7684297a3b0SGarrett D'Amore if (EAT('-'))
7694297a3b0SGarrett D'Amore finish = '-';
7704297a3b0SGarrett D'Amore else
7714297a3b0SGarrett D'Amore finish = p_b_symbol(p);
7724297a3b0SGarrett D'Amore } else
7734297a3b0SGarrett D'Amore finish = start;
7744297a3b0SGarrett D'Amore if (start == finish)
7754297a3b0SGarrett D'Amore CHadd(p, cs, start);
7764297a3b0SGarrett D'Amore else {
777*62c3776aSGarrett D'Amore if (loc->collate->lc_is_posix) {
7784297a3b0SGarrett D'Amore (void) REQUIRE((uch)start <= (uch)finish,
7794297a3b0SGarrett D'Amore REG_ERANGE);
7804297a3b0SGarrett D'Amore CHaddrange(p, cs, start, finish);
7814297a3b0SGarrett D'Amore } else {
7826b5e5868SGarrett D'Amore (void) REQUIRE(_collate_range_cmp(start,
783*62c3776aSGarrett D'Amore finish, loc) <= 0, REG_ERANGE);
7844297a3b0SGarrett D'Amore for (i = 0; i <= UCHAR_MAX; i++) {
785*62c3776aSGarrett D'Amore if (_collate_range_cmp(start, i, loc)
786*62c3776aSGarrett D'Amore <= 0 &&
787*62c3776aSGarrett D'Amore _collate_range_cmp(i, finish, loc)
788*62c3776aSGarrett D'Amore <= 0)
7894297a3b0SGarrett D'Amore CHadd(p, cs, i);
7904297a3b0SGarrett D'Amore }
7914297a3b0SGarrett D'Amore }
7924297a3b0SGarrett D'Amore }
7934297a3b0SGarrett D'Amore break;
7944297a3b0SGarrett D'Amore }
7954297a3b0SGarrett D'Amore }
7964297a3b0SGarrett D'Amore
7974297a3b0SGarrett D'Amore /*
7984297a3b0SGarrett D'Amore * p_b_cclass - parse a character-class name and deal with it
7994297a3b0SGarrett D'Amore */
8004297a3b0SGarrett D'Amore static void
p_b_cclass(struct parse * p,cset * cs)8014297a3b0SGarrett D'Amore p_b_cclass(struct parse *p, cset *cs)
8024297a3b0SGarrett D'Amore {
8034297a3b0SGarrett D'Amore char *sp = p->next;
8044297a3b0SGarrett D'Amore size_t len;
8054297a3b0SGarrett D'Amore wctype_t wct;
8064297a3b0SGarrett D'Amore char clname[16];
8074297a3b0SGarrett D'Amore
8084297a3b0SGarrett D'Amore while (MORE() && isalpha((uch)PEEK()))
8094297a3b0SGarrett D'Amore NEXT();
8104297a3b0SGarrett D'Amore len = p->next - sp;
8114297a3b0SGarrett D'Amore if (len >= sizeof (clname) - 1) {
8124297a3b0SGarrett D'Amore SETERROR(REG_ECTYPE);
8134297a3b0SGarrett D'Amore return;
8144297a3b0SGarrett D'Amore }
8154297a3b0SGarrett D'Amore (void) memcpy(clname, sp, len);
8164297a3b0SGarrett D'Amore clname[len] = '\0';
8174297a3b0SGarrett D'Amore if ((wct = wctype(clname)) == 0) {
8184297a3b0SGarrett D'Amore SETERROR(REG_ECTYPE);
8194297a3b0SGarrett D'Amore return;
8204297a3b0SGarrett D'Amore }
8214297a3b0SGarrett D'Amore CHaddtype(p, cs, wct);
8224297a3b0SGarrett D'Amore }
8234297a3b0SGarrett D'Amore
8244297a3b0SGarrett D'Amore /*
8254297a3b0SGarrett D'Amore * p_b_eclass - parse an equivalence-class name and deal with it
8264297a3b0SGarrett D'Amore *
8274297a3b0SGarrett D'Amore * This implementation is incomplete. xxx
8284297a3b0SGarrett D'Amore */
8294297a3b0SGarrett D'Amore static void
p_b_eclass(struct parse * p,cset * cs)8304297a3b0SGarrett D'Amore p_b_eclass(struct parse *p, cset *cs)
8314297a3b0SGarrett D'Amore {
8324297a3b0SGarrett D'Amore wint_t c;
8334297a3b0SGarrett D'Amore
8344297a3b0SGarrett D'Amore c = p_b_coll_elem(p, '=');
8354297a3b0SGarrett D'Amore CHadd(p, cs, c);
8364297a3b0SGarrett D'Amore }
8374297a3b0SGarrett D'Amore
8384297a3b0SGarrett D'Amore /*
8394297a3b0SGarrett D'Amore * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
8404297a3b0SGarrett D'Amore */
8414297a3b0SGarrett D'Amore static wint_t /* value of symbol */
p_b_symbol(struct parse * p)8424297a3b0SGarrett D'Amore p_b_symbol(struct parse *p)
8434297a3b0SGarrett D'Amore {
8444297a3b0SGarrett D'Amore wint_t value;
8454297a3b0SGarrett D'Amore
8464297a3b0SGarrett D'Amore (void) REQUIRE(MORE(), REG_EBRACK);
8474297a3b0SGarrett D'Amore if (!EATTWO('[', '.'))
8484297a3b0SGarrett D'Amore return (WGETNEXT());
8494297a3b0SGarrett D'Amore
8504297a3b0SGarrett D'Amore /* collating symbol */
8514297a3b0SGarrett D'Amore value = p_b_coll_elem(p, '.');
8524297a3b0SGarrett D'Amore (void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
8534297a3b0SGarrett D'Amore return (value);
8544297a3b0SGarrett D'Amore }
8554297a3b0SGarrett D'Amore
8564297a3b0SGarrett D'Amore /*
8574297a3b0SGarrett D'Amore * p_b_coll_elem - parse a collating-element name and look it up
8584297a3b0SGarrett D'Amore */
8594297a3b0SGarrett D'Amore static wint_t /* value of collating element */
p_b_coll_elem(struct parse * p,wint_t endc)8604297a3b0SGarrett D'Amore p_b_coll_elem(struct parse *p,
8614297a3b0SGarrett D'Amore wint_t endc) /* name ended by endc,']' */
8624297a3b0SGarrett D'Amore {
8634297a3b0SGarrett D'Amore char *sp = p->next;
8644297a3b0SGarrett D'Amore struct cname *cp;
8654297a3b0SGarrett D'Amore int len;
8664297a3b0SGarrett D'Amore mbstate_t mbs;
8674297a3b0SGarrett D'Amore wchar_t wc;
8684297a3b0SGarrett D'Amore size_t clen;
8694297a3b0SGarrett D'Amore
8704297a3b0SGarrett D'Amore while (MORE() && !SEETWO(endc, ']'))
8714297a3b0SGarrett D'Amore NEXT();
8724297a3b0SGarrett D'Amore if (!MORE()) {
8734297a3b0SGarrett D'Amore SETERROR(REG_EBRACK);
8744297a3b0SGarrett D'Amore return (0);
8754297a3b0SGarrett D'Amore }
8764297a3b0SGarrett D'Amore len = p->next - sp;
8774297a3b0SGarrett D'Amore for (cp = cnames; cp->name != NULL; cp++)
8784297a3b0SGarrett D'Amore if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
8794297a3b0SGarrett D'Amore return (cp->code); /* known name */
8804297a3b0SGarrett D'Amore (void) memset(&mbs, 0, sizeof (mbs));
8814297a3b0SGarrett D'Amore if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
8824297a3b0SGarrett D'Amore return (wc); /* single character */
8834297a3b0SGarrett D'Amore else if (clen == (size_t)-1 || clen == (size_t)-2)
8844297a3b0SGarrett D'Amore SETERROR(REG_ECHAR);
8854297a3b0SGarrett D'Amore else
8864297a3b0SGarrett D'Amore SETERROR(REG_ECOLLATE); /* neither */
8874297a3b0SGarrett D'Amore return (0);
8884297a3b0SGarrett D'Amore }
8894297a3b0SGarrett D'Amore
8904297a3b0SGarrett D'Amore /*
8914297a3b0SGarrett D'Amore * othercase - return the case counterpart of an alphabetic
8924297a3b0SGarrett D'Amore */
8934297a3b0SGarrett D'Amore static wint_t /* if no counterpart, return ch */
othercase(wint_t ch)8944297a3b0SGarrett D'Amore othercase(wint_t ch)
8954297a3b0SGarrett D'Amore {
8964297a3b0SGarrett D'Amore assert(iswalpha(ch));
8974297a3b0SGarrett D'Amore if (iswupper(ch))
8984297a3b0SGarrett D'Amore return (towlower(ch));
8994297a3b0SGarrett D'Amore else if (iswlower(ch))
9004297a3b0SGarrett D'Amore return (towupper(ch));
9014297a3b0SGarrett D'Amore else /* peculiar, but could happen */
9024297a3b0SGarrett D'Amore return (ch);
9034297a3b0SGarrett D'Amore }
9044297a3b0SGarrett D'Amore
9054297a3b0SGarrett D'Amore /*
9064297a3b0SGarrett D'Amore * bothcases - emit a dualcase version of a two-case character
9074297a3b0SGarrett D'Amore *
9084297a3b0SGarrett D'Amore * Boy, is this implementation ever a kludge...
9094297a3b0SGarrett D'Amore */
9104297a3b0SGarrett D'Amore static void
bothcases(struct parse * p,wint_t ch)9114297a3b0SGarrett D'Amore bothcases(struct parse *p, wint_t ch)
9124297a3b0SGarrett D'Amore {
9134297a3b0SGarrett D'Amore char *oldnext = p->next;
9144297a3b0SGarrett D'Amore char *oldend = p->end;
9154297a3b0SGarrett D'Amore char bracket[3 + MB_LEN_MAX];
9164297a3b0SGarrett D'Amore size_t n;
9174297a3b0SGarrett D'Amore mbstate_t mbs;
9184297a3b0SGarrett D'Amore
9194297a3b0SGarrett D'Amore assert(othercase(ch) != ch); /* p_bracket() would recurse */
9204297a3b0SGarrett D'Amore p->next = bracket;
9214297a3b0SGarrett D'Amore (void) memset(&mbs, 0, sizeof (mbs));
9224297a3b0SGarrett D'Amore n = wcrtomb(bracket, ch, &mbs);
9234297a3b0SGarrett D'Amore assert(n != (size_t)-1);
9244297a3b0SGarrett D'Amore bracket[n] = ']';
9254297a3b0SGarrett D'Amore bracket[n + 1] = '\0';
9264297a3b0SGarrett D'Amore p->end = bracket+n+1;
9274297a3b0SGarrett D'Amore p_bracket(p);
9284297a3b0SGarrett D'Amore assert(p->next == p->end);
9294297a3b0SGarrett D'Amore p->next = oldnext;
9304297a3b0SGarrett D'Amore p->end = oldend;
9314297a3b0SGarrett D'Amore }
9324297a3b0SGarrett D'Amore
9334297a3b0SGarrett D'Amore /*
9344297a3b0SGarrett D'Amore * ordinary - emit an ordinary character
9354297a3b0SGarrett D'Amore */
9364297a3b0SGarrett D'Amore static void
ordinary(struct parse * p,wint_t ch)9374297a3b0SGarrett D'Amore ordinary(struct parse *p, wint_t ch)
9384297a3b0SGarrett D'Amore {
9394297a3b0SGarrett D'Amore cset *cs;
9404297a3b0SGarrett D'Amore
9414297a3b0SGarrett D'Amore if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch)
9424297a3b0SGarrett D'Amore bothcases(p, ch);
9434297a3b0SGarrett D'Amore else if ((ch & OPDMASK) == ch)
9444297a3b0SGarrett D'Amore EMIT(OCHAR, ch);
9454297a3b0SGarrett D'Amore else {
9464297a3b0SGarrett D'Amore /*
9474297a3b0SGarrett D'Amore * Kludge: character is too big to fit into an OCHAR operand.
9484297a3b0SGarrett D'Amore * Emit a singleton set.
9494297a3b0SGarrett D'Amore */
9504297a3b0SGarrett D'Amore if ((cs = allocset(p)) == NULL)
9514297a3b0SGarrett D'Amore return;
9524297a3b0SGarrett D'Amore CHadd(p, cs, ch);
9534297a3b0SGarrett D'Amore EMIT(OANYOF, (int)(cs - p->g->sets));
9544297a3b0SGarrett D'Amore }
9554297a3b0SGarrett D'Amore }
9564297a3b0SGarrett D'Amore
9574297a3b0SGarrett D'Amore /*
9584297a3b0SGarrett D'Amore * nonnewline - emit REG_NEWLINE version of OANY
9594297a3b0SGarrett D'Amore *
9604297a3b0SGarrett D'Amore * Boy, is this implementation ever a kludge...
9614297a3b0SGarrett D'Amore */
9624297a3b0SGarrett D'Amore static void
nonnewline(struct parse * p)9634297a3b0SGarrett D'Amore nonnewline(struct parse *p)
9644297a3b0SGarrett D'Amore {
9654297a3b0SGarrett D'Amore char *oldnext = p->next;
9664297a3b0SGarrett D'Amore char *oldend = p->end;
9674297a3b0SGarrett D'Amore char bracket[4];
9684297a3b0SGarrett D'Amore
9694297a3b0SGarrett D'Amore p->next = bracket;
9704297a3b0SGarrett D'Amore p->end = bracket+3;
9714297a3b0SGarrett D'Amore bracket[0] = '^';
9724297a3b0SGarrett D'Amore bracket[1] = '\n';
9734297a3b0SGarrett D'Amore bracket[2] = ']';
9744297a3b0SGarrett D'Amore bracket[3] = '\0';
9754297a3b0SGarrett D'Amore p_bracket(p);
9764297a3b0SGarrett D'Amore assert(p->next == bracket+3);
9774297a3b0SGarrett D'Amore p->next = oldnext;
9784297a3b0SGarrett D'Amore p->end = oldend;
9794297a3b0SGarrett D'Amore }
9804297a3b0SGarrett D'Amore
9814297a3b0SGarrett D'Amore /*
9824297a3b0SGarrett D'Amore * repeat - generate code for a bounded repetition, recursively if needed
9834297a3b0SGarrett D'Amore */
9844297a3b0SGarrett D'Amore static void
repeat(struct parse * p,sopno start,int from,int to)9854297a3b0SGarrett D'Amore repeat(struct parse *p,
9864297a3b0SGarrett D'Amore sopno start, /* operand from here to end of strip */
9874297a3b0SGarrett D'Amore int from, /* repeated from this number */
9884297a3b0SGarrett D'Amore int to) /* to this number of times (maybe INFINITY) */
9894297a3b0SGarrett D'Amore {
9904297a3b0SGarrett D'Amore sopno finish = HERE();
9914297a3b0SGarrett D'Amore #define N 2
9924297a3b0SGarrett D'Amore #define INF 3
9934297a3b0SGarrett D'Amore #define REP(f, t) ((f)*8 + (t))
9944297a3b0SGarrett D'Amore #define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
9954297a3b0SGarrett D'Amore sopno copy;
9964297a3b0SGarrett D'Amore
9974297a3b0SGarrett D'Amore if (p->error != 0) /* head off possible runaway recursion */
9984297a3b0SGarrett D'Amore return;
9994297a3b0SGarrett D'Amore
10004297a3b0SGarrett D'Amore assert(from <= to);
10014297a3b0SGarrett D'Amore
10024297a3b0SGarrett D'Amore switch (REP(MAP(from), MAP(to))) {
10034297a3b0SGarrett D'Amore case REP(0, 0): /* must be user doing this */
10044297a3b0SGarrett D'Amore DROP(finish-start); /* drop the operand */
10054297a3b0SGarrett D'Amore break;
10064297a3b0SGarrett D'Amore case REP(0, 1): /* as x{1,1}? */
10074297a3b0SGarrett D'Amore case REP(0, N): /* as x{1,n}? */
10084297a3b0SGarrett D'Amore case REP(0, INF): /* as x{1,}? */
10094297a3b0SGarrett D'Amore /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
10104297a3b0SGarrett D'Amore INSERT(OCH_, start); /* offset is wrong... */
10114297a3b0SGarrett D'Amore repeat(p, start+1, 1, to);
10124297a3b0SGarrett D'Amore ASTERN(OOR1, start);
10134297a3b0SGarrett D'Amore AHEAD(start); /* ... fix it */
10144297a3b0SGarrett D'Amore EMIT(OOR2, 0);
10154297a3b0SGarrett D'Amore AHEAD(THERE());
10164297a3b0SGarrett D'Amore ASTERN(O_CH, THERETHERE());
10174297a3b0SGarrett D'Amore break;
10184297a3b0SGarrett D'Amore case REP(1, 1): /* trivial case */
10194297a3b0SGarrett D'Amore /* done */
10204297a3b0SGarrett D'Amore break;
10214297a3b0SGarrett D'Amore case REP(1, N): /* as x?x{1,n-1} */
10224297a3b0SGarrett D'Amore /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
10234297a3b0SGarrett D'Amore INSERT(OCH_, start);
10244297a3b0SGarrett D'Amore ASTERN(OOR1, start);
10254297a3b0SGarrett D'Amore AHEAD(start);
10264297a3b0SGarrett D'Amore EMIT(OOR2, 0); /* offset very wrong... */
10274297a3b0SGarrett D'Amore AHEAD(THERE()); /* ...so fix it */
10284297a3b0SGarrett D'Amore ASTERN(O_CH, THERETHERE());
10294297a3b0SGarrett D'Amore copy = dupl(p, start+1, finish+1);
10304297a3b0SGarrett D'Amore assert(copy == finish+4);
10314297a3b0SGarrett D'Amore repeat(p, copy, 1, to-1);
10324297a3b0SGarrett D'Amore break;
10334297a3b0SGarrett D'Amore case REP(1, INF): /* as x+ */
10344297a3b0SGarrett D'Amore INSERT(OPLUS_, start);
10354297a3b0SGarrett D'Amore ASTERN(O_PLUS, start);
10364297a3b0SGarrett D'Amore break;
10374297a3b0SGarrett D'Amore case REP(N, N): /* as xx{m-1,n-1} */
10384297a3b0SGarrett D'Amore copy = dupl(p, start, finish);
10394297a3b0SGarrett D'Amore repeat(p, copy, from-1, to-1);
10404297a3b0SGarrett D'Amore break;
10414297a3b0SGarrett D'Amore case REP(N, INF): /* as xx{n-1,INF} */
10424297a3b0SGarrett D'Amore copy = dupl(p, start, finish);
10434297a3b0SGarrett D'Amore repeat(p, copy, from-1, to);
10444297a3b0SGarrett D'Amore break;
10454297a3b0SGarrett D'Amore default: /* "can't happen" */
10464297a3b0SGarrett D'Amore SETERROR(REG_EFATAL); /* just in case */
10474297a3b0SGarrett D'Amore break;
10484297a3b0SGarrett D'Amore }
10494297a3b0SGarrett D'Amore }
10504297a3b0SGarrett D'Amore
10514297a3b0SGarrett D'Amore /*
10524297a3b0SGarrett D'Amore * wgetnext - helper function for WGETNEXT() macro. Gets the next wide
10534297a3b0SGarrett D'Amore * character from the parse struct, signals a REG_ILLSEQ error if the
10544297a3b0SGarrett D'Amore * character can't be converted. Returns the number of bytes consumed.
10554297a3b0SGarrett D'Amore */
10564297a3b0SGarrett D'Amore static wint_t
wgetnext(struct parse * p)10574297a3b0SGarrett D'Amore wgetnext(struct parse *p)
10584297a3b0SGarrett D'Amore {
10594297a3b0SGarrett D'Amore mbstate_t mbs;
10604297a3b0SGarrett D'Amore wchar_t wc;
10614297a3b0SGarrett D'Amore size_t n;
10624297a3b0SGarrett D'Amore
10634297a3b0SGarrett D'Amore (void) memset(&mbs, 0, sizeof (mbs));
10644297a3b0SGarrett D'Amore n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
10654297a3b0SGarrett D'Amore if (n == (size_t)-1 || n == (size_t)-2) {
10664297a3b0SGarrett D'Amore SETERROR(REG_ECHAR);
10674297a3b0SGarrett D'Amore return (0);
10684297a3b0SGarrett D'Amore }
10694297a3b0SGarrett D'Amore if (n == 0)
10704297a3b0SGarrett D'Amore n = 1;
10714297a3b0SGarrett D'Amore p->next += n;
10724297a3b0SGarrett D'Amore return (wc);
10734297a3b0SGarrett D'Amore }
10744297a3b0SGarrett D'Amore
10754297a3b0SGarrett D'Amore /*
10764297a3b0SGarrett D'Amore * seterr - set an error condition
10774297a3b0SGarrett D'Amore */
10784297a3b0SGarrett D'Amore static int /* useless but makes type checking happy */
seterr(struct parse * p,int e)10794297a3b0SGarrett D'Amore seterr(struct parse *p, int e)
10804297a3b0SGarrett D'Amore {
10814297a3b0SGarrett D'Amore if (p->error == 0) /* keep earliest error condition */
10824297a3b0SGarrett D'Amore p->error = e;
10834297a3b0SGarrett D'Amore p->next = nuls; /* try to bring things to a halt */
10844297a3b0SGarrett D'Amore p->end = nuls;
10854297a3b0SGarrett D'Amore return (0); /* make the return value well-defined */
10864297a3b0SGarrett D'Amore }
10874297a3b0SGarrett D'Amore
10884297a3b0SGarrett D'Amore /*
10894297a3b0SGarrett D'Amore * allocset - allocate a set of characters for []
10904297a3b0SGarrett D'Amore */
10914297a3b0SGarrett D'Amore static cset *
allocset(struct parse * p)10924297a3b0SGarrett D'Amore allocset(struct parse *p)
10934297a3b0SGarrett D'Amore {
10944297a3b0SGarrett D'Amore cset *cs, *ncs;
10954297a3b0SGarrett D'Amore
10964297a3b0SGarrett D'Amore ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs));
10974297a3b0SGarrett D'Amore if (ncs == NULL) {
10984297a3b0SGarrett D'Amore SETERROR(REG_ESPACE);
10994297a3b0SGarrett D'Amore return (NULL);
11004297a3b0SGarrett D'Amore }
11014297a3b0SGarrett D'Amore p->g->sets = ncs;
11024297a3b0SGarrett D'Amore cs = &p->g->sets[p->g->ncsets++];
11034297a3b0SGarrett D'Amore (void) memset(cs, 0, sizeof (*cs));
11044297a3b0SGarrett D'Amore
11054297a3b0SGarrett D'Amore return (cs);
11064297a3b0SGarrett D'Amore }
11074297a3b0SGarrett D'Amore
11084297a3b0SGarrett D'Amore /*
11094297a3b0SGarrett D'Amore * freeset - free a now-unused set
11104297a3b0SGarrett D'Amore */
11114297a3b0SGarrett D'Amore static void
freeset(struct parse * p,cset * cs)11124297a3b0SGarrett D'Amore freeset(struct parse *p, cset *cs)
11134297a3b0SGarrett D'Amore {
11144297a3b0SGarrett D'Amore cset *top = &p->g->sets[p->g->ncsets];
11154297a3b0SGarrett D'Amore
11164297a3b0SGarrett D'Amore free(cs->wides);
11174297a3b0SGarrett D'Amore free(cs->ranges);
11184297a3b0SGarrett D'Amore free(cs->types);
11194297a3b0SGarrett D'Amore (void) memset(cs, 0, sizeof (*cs));
11204297a3b0SGarrett D'Amore if (cs == top-1) /* recover only the easy case */
11214297a3b0SGarrett D'Amore p->g->ncsets--;
11224297a3b0SGarrett D'Amore }
11234297a3b0SGarrett D'Amore
11244297a3b0SGarrett D'Amore /*
11254297a3b0SGarrett D'Amore * singleton - Determine whether a set contains only one character,
11264297a3b0SGarrett D'Amore * returning it if so, otherwise returning OUT.
11274297a3b0SGarrett D'Amore */
11284297a3b0SGarrett D'Amore static wint_t
singleton(cset * cs)11294297a3b0SGarrett D'Amore singleton(cset *cs)
11304297a3b0SGarrett D'Amore {
11314297a3b0SGarrett D'Amore wint_t i, s, n;
11324297a3b0SGarrett D'Amore
11334297a3b0SGarrett D'Amore for (i = n = 0; i < NC; i++)
11344297a3b0SGarrett D'Amore if (CHIN(cs, i)) {
11354297a3b0SGarrett D'Amore n++;
11364297a3b0SGarrett D'Amore s = i;
11374297a3b0SGarrett D'Amore }
11384297a3b0SGarrett D'Amore if (n == 1)
11394297a3b0SGarrett D'Amore return (s);
11404297a3b0SGarrett D'Amore if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
11414297a3b0SGarrett D'Amore cs->icase == 0)
11424297a3b0SGarrett D'Amore return (cs->wides[0]);
11434297a3b0SGarrett D'Amore /* Don't bother handling the other cases. */
11444297a3b0SGarrett D'Amore return (OUT);
11454297a3b0SGarrett D'Amore }
11464297a3b0SGarrett D'Amore
11474297a3b0SGarrett D'Amore /*
11484297a3b0SGarrett D'Amore * CHadd - add character to character set.
11494297a3b0SGarrett D'Amore */
11504297a3b0SGarrett D'Amore static void
CHadd(struct parse * p,cset * cs,wint_t ch)11514297a3b0SGarrett D'Amore CHadd(struct parse *p, cset *cs, wint_t ch)
11524297a3b0SGarrett D'Amore {
11534297a3b0SGarrett D'Amore wint_t nch, *newwides;
11544297a3b0SGarrett D'Amore assert(ch >= 0);
11554297a3b0SGarrett D'Amore if (ch < NC)
11564297a3b0SGarrett D'Amore cs->bmp[ch >> 3] |= 1 << (ch & 7);
11574297a3b0SGarrett D'Amore else {
11584297a3b0SGarrett D'Amore newwides = realloc(cs->wides, (cs->nwides + 1) *
11594297a3b0SGarrett D'Amore sizeof (*cs->wides));
11604297a3b0SGarrett D'Amore if (newwides == NULL) {
11614297a3b0SGarrett D'Amore SETERROR(REG_ESPACE);
11624297a3b0SGarrett D'Amore return;
11634297a3b0SGarrett D'Amore }
11644297a3b0SGarrett D'Amore cs->wides = newwides;
11654297a3b0SGarrett D'Amore cs->wides[cs->nwides++] = ch;
11664297a3b0SGarrett D'Amore }
11674297a3b0SGarrett D'Amore if (cs->icase) {
11684297a3b0SGarrett D'Amore if ((nch = towlower(ch)) < NC)
11694297a3b0SGarrett D'Amore cs->bmp[nch >> 3] |= 1 << (nch & 7);
11704297a3b0SGarrett D'Amore if ((nch = towupper(ch)) < NC)
11714297a3b0SGarrett D'Amore cs->bmp[nch >> 3] |= 1 << (nch & 7);
11724297a3b0SGarrett D'Amore }
11734297a3b0SGarrett D'Amore }
11744297a3b0SGarrett D'Amore
11754297a3b0SGarrett D'Amore /*
11764297a3b0SGarrett D'Amore * CHaddrange - add all characters in the range [min,max] to a character set.
11774297a3b0SGarrett D'Amore */
11784297a3b0SGarrett D'Amore static void
CHaddrange(struct parse * p,cset * cs,wint_t min,wint_t max)11794297a3b0SGarrett D'Amore CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
11804297a3b0SGarrett D'Amore {
11814297a3b0SGarrett D'Amore crange *newranges;
11824297a3b0SGarrett D'Amore
11834297a3b0SGarrett D'Amore for (; min < NC && min <= max; min++)
11844297a3b0SGarrett D'Amore CHadd(p, cs, min);
11854297a3b0SGarrett D'Amore if (min >= max)
11864297a3b0SGarrett D'Amore return;
11874297a3b0SGarrett D'Amore newranges = realloc(cs->ranges, (cs->nranges + 1) *
11884297a3b0SGarrett D'Amore sizeof (*cs->ranges));
11894297a3b0SGarrett D'Amore if (newranges == NULL) {
11904297a3b0SGarrett D'Amore SETERROR(REG_ESPACE);
11914297a3b0SGarrett D'Amore return;
11924297a3b0SGarrett D'Amore }
11934297a3b0SGarrett D'Amore cs->ranges = newranges;
11944297a3b0SGarrett D'Amore cs->ranges[cs->nranges].min = min;
11954297a3b0SGarrett D'Amore cs->ranges[cs->nranges].min = max;
11964297a3b0SGarrett D'Amore cs->nranges++;
11974297a3b0SGarrett D'Amore }
11984297a3b0SGarrett D'Amore
11994297a3b0SGarrett D'Amore /*
12004297a3b0SGarrett D'Amore * CHaddtype - add all characters of a certain type to a character set.
12014297a3b0SGarrett D'Amore */
12024297a3b0SGarrett D'Amore static void
CHaddtype(struct parse * p,cset * cs,wctype_t wct)12034297a3b0SGarrett D'Amore CHaddtype(struct parse *p, cset *cs, wctype_t wct)
12044297a3b0SGarrett D'Amore {
12054297a3b0SGarrett D'Amore wint_t i;
12064297a3b0SGarrett D'Amore wctype_t *newtypes;
12074297a3b0SGarrett D'Amore
12084297a3b0SGarrett D'Amore for (i = 0; i < NC; i++)
12094297a3b0SGarrett D'Amore if (iswctype(i, wct))
12104297a3b0SGarrett D'Amore CHadd(p, cs, i);
12114297a3b0SGarrett D'Amore newtypes = realloc(cs->types, (cs->ntypes + 1) *
12124297a3b0SGarrett D'Amore sizeof (*cs->types));
12134297a3b0SGarrett D'Amore if (newtypes == NULL) {
12144297a3b0SGarrett D'Amore SETERROR(REG_ESPACE);
12154297a3b0SGarrett D'Amore return;
12164297a3b0SGarrett D'Amore }
12174297a3b0SGarrett D'Amore cs->types = newtypes;
12184297a3b0SGarrett D'Amore cs->types[cs->ntypes++] = wct;
12194297a3b0SGarrett D'Amore }
12204297a3b0SGarrett D'Amore
12214297a3b0SGarrett D'Amore /*
12224297a3b0SGarrett D'Amore * dupl - emit a duplicate of a bunch of sops
12234297a3b0SGarrett D'Amore */
12244297a3b0SGarrett D'Amore static sopno /* start of duplicate */
dupl(struct parse * p,sopno start,sopno finish)12254297a3b0SGarrett D'Amore dupl(struct parse *p,
12264297a3b0SGarrett D'Amore sopno start, /* from here */
12274297a3b0SGarrett D'Amore sopno finish) /* to this less one */
12284297a3b0SGarrett D'Amore {
12294297a3b0SGarrett D'Amore sopno ret = HERE();
12304297a3b0SGarrett D'Amore sopno len = finish - start;
12314297a3b0SGarrett D'Amore
12324297a3b0SGarrett D'Amore assert(finish >= start);
12334297a3b0SGarrett D'Amore if (len == 0)
12344297a3b0SGarrett D'Amore return (ret);
12354297a3b0SGarrett D'Amore enlarge(p, p->ssize + len); /* this many unexpected additions */
12364297a3b0SGarrett D'Amore assert(p->ssize >= p->slen + len);
12374297a3b0SGarrett D'Amore (void) memcpy((char *)(p->strip + p->slen),
12384297a3b0SGarrett D'Amore (char *)(p->strip + start), (size_t)len*sizeof (sop));
12394297a3b0SGarrett D'Amore p->slen += len;
12404297a3b0SGarrett D'Amore return (ret);
12414297a3b0SGarrett D'Amore }
12424297a3b0SGarrett D'Amore
12434297a3b0SGarrett D'Amore /*
12444297a3b0SGarrett D'Amore * doemit - emit a strip operator
12454297a3b0SGarrett D'Amore *
12464297a3b0SGarrett D'Amore * It might seem better to implement this as a macro with a function as
12474297a3b0SGarrett D'Amore * hard-case backup, but it's just too big and messy unless there are
12484297a3b0SGarrett D'Amore * some changes to the data structures. Maybe later.
12494297a3b0SGarrett D'Amore */
12504297a3b0SGarrett D'Amore static void
doemit(struct parse * p,sop op,size_t opnd)12514297a3b0SGarrett D'Amore doemit(struct parse *p, sop op, size_t opnd)
12524297a3b0SGarrett D'Amore {
12534297a3b0SGarrett D'Amore /* avoid making error situations worse */
12544297a3b0SGarrett D'Amore if (p->error != 0)
12554297a3b0SGarrett D'Amore return;
12564297a3b0SGarrett D'Amore
12574297a3b0SGarrett D'Amore /* deal with oversize operands ("can't happen", more or less) */
12584297a3b0SGarrett D'Amore assert(opnd < 1<<OPSHIFT);
12594297a3b0SGarrett D'Amore
12604297a3b0SGarrett D'Amore /* deal with undersized strip */
12614297a3b0SGarrett D'Amore if (p->slen >= p->ssize)
12624297a3b0SGarrett D'Amore enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
12634297a3b0SGarrett D'Amore assert(p->slen < p->ssize);
12644297a3b0SGarrett D'Amore
12654297a3b0SGarrett D'Amore /* finally, it's all reduced to the easy case */
12664297a3b0SGarrett D'Amore p->strip[p->slen++] = SOP(op, opnd);
12674297a3b0SGarrett D'Amore }
12684297a3b0SGarrett D'Amore
12694297a3b0SGarrett D'Amore /*
12704297a3b0SGarrett D'Amore * doinsert - insert a sop into the strip
12714297a3b0SGarrett D'Amore */
12724297a3b0SGarrett D'Amore static void
doinsert(struct parse * p,sop op,size_t opnd,sopno pos)12734297a3b0SGarrett D'Amore doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
12744297a3b0SGarrett D'Amore {
12754297a3b0SGarrett D'Amore sopno sn;
12764297a3b0SGarrett D'Amore sop s;
12774297a3b0SGarrett D'Amore int i;
12784297a3b0SGarrett D'Amore
12794297a3b0SGarrett D'Amore /* avoid making error situations worse */
12804297a3b0SGarrett D'Amore if (p->error != 0)
12814297a3b0SGarrett D'Amore return;
12824297a3b0SGarrett D'Amore
12834297a3b0SGarrett D'Amore sn = HERE();
12844297a3b0SGarrett D'Amore EMIT(op, opnd); /* do checks, ensure space */
12854297a3b0SGarrett D'Amore assert(HERE() == sn+1);
12864297a3b0SGarrett D'Amore s = p->strip[sn];
12874297a3b0SGarrett D'Amore
12884297a3b0SGarrett D'Amore /* adjust paren pointers */
12894297a3b0SGarrett D'Amore assert(pos > 0);
12904297a3b0SGarrett D'Amore for (i = 1; i < NPAREN; i++) {
12914297a3b0SGarrett D'Amore if (p->pbegin[i] >= pos) {
12924297a3b0SGarrett D'Amore p->pbegin[i]++;
12934297a3b0SGarrett D'Amore }
12944297a3b0SGarrett D'Amore if (p->pend[i] >= pos) {
12954297a3b0SGarrett D'Amore p->pend[i]++;
12964297a3b0SGarrett D'Amore }
12974297a3b0SGarrett D'Amore }
12984297a3b0SGarrett D'Amore
1299eda71b4aSGarrett D'Amore (void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
13004297a3b0SGarrett D'Amore (HERE()-pos-1)*sizeof (sop));
13014297a3b0SGarrett D'Amore p->strip[pos] = s;
13024297a3b0SGarrett D'Amore }
13034297a3b0SGarrett D'Amore
13044297a3b0SGarrett D'Amore /*
13054297a3b0SGarrett D'Amore * dofwd - complete a forward reference
13064297a3b0SGarrett D'Amore */
13074297a3b0SGarrett D'Amore static void
dofwd(struct parse * p,sopno pos,sop value)13084297a3b0SGarrett D'Amore dofwd(struct parse *p, sopno pos, sop value)
13094297a3b0SGarrett D'Amore {
13104297a3b0SGarrett D'Amore /* avoid making error situations worse */
13114297a3b0SGarrett D'Amore if (p->error != 0)
13124297a3b0SGarrett D'Amore return;
13134297a3b0SGarrett D'Amore
13144297a3b0SGarrett D'Amore assert(value < 1<<OPSHIFT);
13154297a3b0SGarrett D'Amore p->strip[pos] = OP(p->strip[pos]) | value;
13164297a3b0SGarrett D'Amore }
13174297a3b0SGarrett D'Amore
13184297a3b0SGarrett D'Amore /*
13194297a3b0SGarrett D'Amore * enlarge - enlarge the strip
13204297a3b0SGarrett D'Amore */
13214297a3b0SGarrett D'Amore static void
enlarge(struct parse * p,sopno size)13224297a3b0SGarrett D'Amore enlarge(struct parse *p, sopno size)
13234297a3b0SGarrett D'Amore {
13244297a3b0SGarrett D'Amore sop *sp;
13254297a3b0SGarrett D'Amore
13264297a3b0SGarrett D'Amore if (p->ssize >= size)
13274297a3b0SGarrett D'Amore return;
13284297a3b0SGarrett D'Amore
13294297a3b0SGarrett D'Amore sp = (sop *)realloc(p->strip, size*sizeof (sop));
13304297a3b0SGarrett D'Amore if (sp == NULL) {
13314297a3b0SGarrett D'Amore SETERROR(REG_ESPACE);
13324297a3b0SGarrett D'Amore return;
13334297a3b0SGarrett D'Amore }
13344297a3b0SGarrett D'Amore p->strip = sp;
13354297a3b0SGarrett D'Amore p->ssize = size;
13364297a3b0SGarrett D'Amore }
13374297a3b0SGarrett D'Amore
13384297a3b0SGarrett D'Amore /*
13394297a3b0SGarrett D'Amore * stripsnug - compact the strip
13404297a3b0SGarrett D'Amore */
13414297a3b0SGarrett D'Amore static void
stripsnug(struct parse * p,struct re_guts * g)13424297a3b0SGarrett D'Amore stripsnug(struct parse *p, struct re_guts *g)
13434297a3b0SGarrett D'Amore {
13444297a3b0SGarrett D'Amore g->nstates = p->slen;
13454297a3b0SGarrett D'Amore g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop));
13464297a3b0SGarrett D'Amore if (g->strip == NULL) {
13474297a3b0SGarrett D'Amore SETERROR(REG_ESPACE);
13484297a3b0SGarrett D'Amore g->strip = p->strip;
13494297a3b0SGarrett D'Amore }
13504297a3b0SGarrett D'Amore }
13514297a3b0SGarrett D'Amore
13524297a3b0SGarrett D'Amore /*
13534297a3b0SGarrett D'Amore * findmust - fill in must and mlen with longest mandatory literal string
13544297a3b0SGarrett D'Amore *
13554297a3b0SGarrett D'Amore * This algorithm could do fancy things like analyzing the operands of |
13564297a3b0SGarrett D'Amore * for common subsequences. Someday. This code is simple and finds most
13574297a3b0SGarrett D'Amore * of the interesting cases.
13584297a3b0SGarrett D'Amore *
13594297a3b0SGarrett D'Amore * Note that must and mlen got initialized during setup.
13604297a3b0SGarrett D'Amore */
13614297a3b0SGarrett D'Amore static void
findmust(struct parse * p,struct re_guts * g)13624297a3b0SGarrett D'Amore findmust(struct parse *p, struct re_guts *g)
13634297a3b0SGarrett D'Amore {
13644297a3b0SGarrett D'Amore sop *scan;
13654297a3b0SGarrett D'Amore sop *start;
13664297a3b0SGarrett D'Amore sop *newstart;
13674297a3b0SGarrett D'Amore sopno newlen;
13684297a3b0SGarrett D'Amore sop s;
13694297a3b0SGarrett D'Amore char *cp;
13704297a3b0SGarrett D'Amore int offset;
13714297a3b0SGarrett D'Amore char buf[MB_LEN_MAX];
13724297a3b0SGarrett D'Amore size_t clen;
13734297a3b0SGarrett D'Amore mbstate_t mbs;
1374*62c3776aSGarrett D'Amore locale_t loc = uselocale(NULL);
13754297a3b0SGarrett D'Amore
13764297a3b0SGarrett D'Amore /* avoid making error situations worse */
13774297a3b0SGarrett D'Amore if (p->error != 0)
13784297a3b0SGarrett D'Amore return;
13794297a3b0SGarrett D'Amore
13804297a3b0SGarrett D'Amore /*
13814297a3b0SGarrett D'Amore * It's not generally safe to do a ``char'' substring search on
13824297a3b0SGarrett D'Amore * multibyte character strings, but it's safe for at least
13834297a3b0SGarrett D'Amore * UTF-8 (see RFC 3629).
13844297a3b0SGarrett D'Amore */
13854297a3b0SGarrett D'Amore if (MB_CUR_MAX > 1 &&
1386*62c3776aSGarrett D'Amore strcmp(loc->runelocale->__encoding, "UTF-8") != 0)
13874297a3b0SGarrett D'Amore return;
13884297a3b0SGarrett D'Amore
13894297a3b0SGarrett D'Amore /* find the longest OCHAR sequence in strip */
13904297a3b0SGarrett D'Amore newlen = 0;
13914297a3b0SGarrett D'Amore offset = 0;
13924297a3b0SGarrett D'Amore g->moffset = 0;
13934297a3b0SGarrett D'Amore scan = g->strip + 1;
13944297a3b0SGarrett D'Amore do {
13954297a3b0SGarrett D'Amore s = *scan++;
13964297a3b0SGarrett D'Amore switch (OP(s)) {
13974297a3b0SGarrett D'Amore case OCHAR: /* sequence member */
13984297a3b0SGarrett D'Amore if (newlen == 0) { /* new sequence */
13994297a3b0SGarrett D'Amore (void) memset(&mbs, 0, sizeof (mbs));
14004297a3b0SGarrett D'Amore newstart = scan - 1;
14014297a3b0SGarrett D'Amore }
14024297a3b0SGarrett D'Amore clen = wcrtomb(buf, OPND(s), &mbs);
14034297a3b0SGarrett D'Amore if (clen == (size_t)-1)
14044297a3b0SGarrett D'Amore goto toohard;
14054297a3b0SGarrett D'Amore newlen += clen;
14064297a3b0SGarrett D'Amore break;
14074297a3b0SGarrett D'Amore case OPLUS_: /* things that don't break one */
14084297a3b0SGarrett D'Amore case OLPAREN:
14094297a3b0SGarrett D'Amore case ORPAREN:
14104297a3b0SGarrett D'Amore break;
14114297a3b0SGarrett D'Amore case OQUEST_: /* things that must be skipped */
14124297a3b0SGarrett D'Amore case OCH_:
14134297a3b0SGarrett D'Amore offset = altoffset(scan, offset);
14144297a3b0SGarrett D'Amore scan--;
14154297a3b0SGarrett D'Amore do {
14164297a3b0SGarrett D'Amore scan += OPND(s);
14174297a3b0SGarrett D'Amore s = *scan;
14184297a3b0SGarrett D'Amore /* assert() interferes w debug printouts */
14194297a3b0SGarrett D'Amore if (OP(s) != O_QUEST && OP(s) != O_CH &&
14204297a3b0SGarrett D'Amore OP(s) != OOR2) {
14214297a3b0SGarrett D'Amore g->iflags |= BAD;
14224297a3b0SGarrett D'Amore return;
14234297a3b0SGarrett D'Amore }
14244297a3b0SGarrett D'Amore } while (OP(s) != O_QUEST && OP(s) != O_CH);
14254297a3b0SGarrett D'Amore /* FALLTHROUGH */
14264297a3b0SGarrett D'Amore case OBOW: /* things that break a sequence */
14274297a3b0SGarrett D'Amore case OEOW:
14284297a3b0SGarrett D'Amore case OBOL:
14294297a3b0SGarrett D'Amore case OEOL:
14304297a3b0SGarrett D'Amore case O_QUEST:
14314297a3b0SGarrett D'Amore case O_CH:
14324297a3b0SGarrett D'Amore case OEND:
14334297a3b0SGarrett D'Amore if (newlen > g->mlen) { /* ends one */
14344297a3b0SGarrett D'Amore start = newstart;
14354297a3b0SGarrett D'Amore g->mlen = newlen;
14364297a3b0SGarrett D'Amore if (offset > -1) {
14374297a3b0SGarrett D'Amore g->moffset += offset;
14384297a3b0SGarrett D'Amore offset = newlen;
14394297a3b0SGarrett D'Amore } else
14404297a3b0SGarrett D'Amore g->moffset = offset;
14414297a3b0SGarrett D'Amore } else {
14424297a3b0SGarrett D'Amore if (offset > -1)
14434297a3b0SGarrett D'Amore offset += newlen;
14444297a3b0SGarrett D'Amore }
14454297a3b0SGarrett D'Amore newlen = 0;
14464297a3b0SGarrett D'Amore break;
14474297a3b0SGarrett D'Amore case OANY:
14484297a3b0SGarrett D'Amore if (newlen > g->mlen) { /* ends one */
14494297a3b0SGarrett D'Amore start = newstart;
14504297a3b0SGarrett D'Amore g->mlen = newlen;
14514297a3b0SGarrett D'Amore if (offset > -1) {
14524297a3b0SGarrett D'Amore g->moffset += offset;
14534297a3b0SGarrett D'Amore offset = newlen;
14544297a3b0SGarrett D'Amore } else
14554297a3b0SGarrett D'Amore g->moffset = offset;
14564297a3b0SGarrett D'Amore } else {
14574297a3b0SGarrett D'Amore if (offset > -1)
14584297a3b0SGarrett D'Amore offset += newlen;
14594297a3b0SGarrett D'Amore }
14604297a3b0SGarrett D'Amore if (offset > -1)
14614297a3b0SGarrett D'Amore offset++;
14624297a3b0SGarrett D'Amore newlen = 0;
14634297a3b0SGarrett D'Amore break;
14644297a3b0SGarrett D'Amore case OANYOF: /* may or may not invalidate offset */
14654297a3b0SGarrett D'Amore /* First, everything as OANY */
14664297a3b0SGarrett D'Amore if (newlen > g->mlen) { /* ends one */
14674297a3b0SGarrett D'Amore start = newstart;
14684297a3b0SGarrett D'Amore g->mlen = newlen;
14694297a3b0SGarrett D'Amore if (offset > -1) {
14704297a3b0SGarrett D'Amore g->moffset += offset;
14714297a3b0SGarrett D'Amore offset = newlen;
14724297a3b0SGarrett D'Amore } else
14734297a3b0SGarrett D'Amore g->moffset = offset;
14744297a3b0SGarrett D'Amore } else {
14754297a3b0SGarrett D'Amore if (offset > -1)
14764297a3b0SGarrett D'Amore offset += newlen;
14774297a3b0SGarrett D'Amore }
14784297a3b0SGarrett D'Amore if (offset > -1)
14794297a3b0SGarrett D'Amore offset++;
14804297a3b0SGarrett D'Amore newlen = 0;
14814297a3b0SGarrett D'Amore break;
14824297a3b0SGarrett D'Amore toohard:
14834297a3b0SGarrett D'Amore default:
14844297a3b0SGarrett D'Amore /*
14854297a3b0SGarrett D'Amore * Anything here makes it impossible or too hard
14864297a3b0SGarrett D'Amore * to calculate the offset -- so we give up;
14874297a3b0SGarrett D'Amore * save the last known good offset, in case the
14884297a3b0SGarrett D'Amore * must sequence doesn't occur later.
14894297a3b0SGarrett D'Amore */
14904297a3b0SGarrett D'Amore if (newlen > g->mlen) { /* ends one */
14914297a3b0SGarrett D'Amore start = newstart;
14924297a3b0SGarrett D'Amore g->mlen = newlen;
14934297a3b0SGarrett D'Amore if (offset > -1)
14944297a3b0SGarrett D'Amore g->moffset += offset;
14954297a3b0SGarrett D'Amore else
14964297a3b0SGarrett D'Amore g->moffset = offset;
14974297a3b0SGarrett D'Amore }
14984297a3b0SGarrett D'Amore offset = -1;
14994297a3b0SGarrett D'Amore newlen = 0;
15004297a3b0SGarrett D'Amore break;
15014297a3b0SGarrett D'Amore }
15024297a3b0SGarrett D'Amore } while (OP(s) != OEND);
15034297a3b0SGarrett D'Amore
15044297a3b0SGarrett D'Amore if (g->mlen == 0) { /* there isn't one */
15054297a3b0SGarrett D'Amore g->moffset = -1;
15064297a3b0SGarrett D'Amore return;
15074297a3b0SGarrett D'Amore }
15084297a3b0SGarrett D'Amore
15094297a3b0SGarrett D'Amore /* turn it into a character string */
15104297a3b0SGarrett D'Amore g->must = malloc((size_t)g->mlen + 1);
15114297a3b0SGarrett D'Amore if (g->must == NULL) { /* argh; just forget it */
15124297a3b0SGarrett D'Amore g->mlen = 0;
15134297a3b0SGarrett D'Amore g->moffset = -1;
15144297a3b0SGarrett D'Amore return;
15154297a3b0SGarrett D'Amore }
15164297a3b0SGarrett D'Amore cp = g->must;
15174297a3b0SGarrett D'Amore scan = start;
15184297a3b0SGarrett D'Amore (void) memset(&mbs, 0, sizeof (mbs));
15194297a3b0SGarrett D'Amore while (cp < g->must + g->mlen) {
15204297a3b0SGarrett D'Amore while (OP(s = *scan++) != OCHAR)
15214297a3b0SGarrett D'Amore continue;
15224297a3b0SGarrett D'Amore clen = wcrtomb(cp, OPND(s), &mbs);
15234297a3b0SGarrett D'Amore assert(clen != (size_t)-1);
15244297a3b0SGarrett D'Amore cp += clen;
15254297a3b0SGarrett D'Amore }
15264297a3b0SGarrett D'Amore assert(cp == g->must + g->mlen);
15274297a3b0SGarrett D'Amore *cp++ = '\0'; /* just on general principles */
15284297a3b0SGarrett D'Amore }
15294297a3b0SGarrett D'Amore
15304297a3b0SGarrett D'Amore /*
15314297a3b0SGarrett D'Amore * altoffset - choose biggest offset among multiple choices
15324297a3b0SGarrett D'Amore *
15334297a3b0SGarrett D'Amore * Compute, recursively if necessary, the largest offset among multiple
15344297a3b0SGarrett D'Amore * re paths.
15354297a3b0SGarrett D'Amore */
15364297a3b0SGarrett D'Amore static int
altoffset(sop * scan,int offset)15374297a3b0SGarrett D'Amore altoffset(sop *scan, int offset)
15384297a3b0SGarrett D'Amore {
15394297a3b0SGarrett D'Amore int largest;
15404297a3b0SGarrett D'Amore int try;
15414297a3b0SGarrett D'Amore sop s;
15424297a3b0SGarrett D'Amore
15434297a3b0SGarrett D'Amore /* If we gave up already on offsets, return */
15444297a3b0SGarrett D'Amore if (offset == -1)
15454297a3b0SGarrett D'Amore return (-1);
15464297a3b0SGarrett D'Amore
15474297a3b0SGarrett D'Amore largest = 0;
15484297a3b0SGarrett D'Amore try = 0;
15494297a3b0SGarrett D'Amore s = *scan++;
15504297a3b0SGarrett D'Amore while (OP(s) != O_QUEST && OP(s) != O_CH) {
15514297a3b0SGarrett D'Amore switch (OP(s)) {
15524297a3b0SGarrett D'Amore case OOR1:
15534297a3b0SGarrett D'Amore if (try > largest)
15544297a3b0SGarrett D'Amore largest = try;
15554297a3b0SGarrett D'Amore try = 0;
15564297a3b0SGarrett D'Amore break;
15574297a3b0SGarrett D'Amore case OQUEST_:
15584297a3b0SGarrett D'Amore case OCH_:
15594297a3b0SGarrett D'Amore try = altoffset(scan, try);
15604297a3b0SGarrett D'Amore if (try == -1)
15614297a3b0SGarrett D'Amore return (-1);
15624297a3b0SGarrett D'Amore scan--;
15634297a3b0SGarrett D'Amore do {
15644297a3b0SGarrett D'Amore scan += OPND(s);
15654297a3b0SGarrett D'Amore s = *scan;
15664297a3b0SGarrett D'Amore if (OP(s) != O_QUEST && OP(s) != O_CH &&
15674297a3b0SGarrett D'Amore OP(s) != OOR2)
15684297a3b0SGarrett D'Amore return (-1);
15694297a3b0SGarrett D'Amore } while (OP(s) != O_QUEST && OP(s) != O_CH);
15704297a3b0SGarrett D'Amore /*
15714297a3b0SGarrett D'Amore * We must skip to the next position, or we'll
15724297a3b0SGarrett D'Amore * leave altoffset() too early.
15734297a3b0SGarrett D'Amore */
15744297a3b0SGarrett D'Amore scan++;
15754297a3b0SGarrett D'Amore break;
15764297a3b0SGarrett D'Amore case OANYOF:
15774297a3b0SGarrett D'Amore case OCHAR:
15784297a3b0SGarrett D'Amore case OANY:
15794297a3b0SGarrett D'Amore try++;
15804297a3b0SGarrett D'Amore /*FALLTHRU*/
15814297a3b0SGarrett D'Amore case OBOW:
15824297a3b0SGarrett D'Amore case OEOW:
15834297a3b0SGarrett D'Amore case OLPAREN:
15844297a3b0SGarrett D'Amore case ORPAREN:
15854297a3b0SGarrett D'Amore case OOR2:
15864297a3b0SGarrett D'Amore break;
15874297a3b0SGarrett D'Amore default:
15884297a3b0SGarrett D'Amore try = -1;
15894297a3b0SGarrett D'Amore break;
15904297a3b0SGarrett D'Amore }
15914297a3b0SGarrett D'Amore if (try == -1)
15924297a3b0SGarrett D'Amore return (-1);
15934297a3b0SGarrett D'Amore s = *scan++;
15944297a3b0SGarrett D'Amore }
15954297a3b0SGarrett D'Amore
15964297a3b0SGarrett D'Amore if (try > largest)
15974297a3b0SGarrett D'Amore largest = try;
15984297a3b0SGarrett D'Amore
15994297a3b0SGarrett D'Amore return (largest+offset);
16004297a3b0SGarrett D'Amore }
16014297a3b0SGarrett D'Amore
16024297a3b0SGarrett D'Amore /*
16034297a3b0SGarrett D'Amore * computejumps - compute char jumps for BM scan
16044297a3b0SGarrett D'Amore *
16054297a3b0SGarrett D'Amore * This algorithm assumes g->must exists and is has size greater than
16064297a3b0SGarrett D'Amore * zero. It's based on the algorithm found on Computer Algorithms by
16074297a3b0SGarrett D'Amore * Sara Baase.
16084297a3b0SGarrett D'Amore *
16094297a3b0SGarrett D'Amore * A char jump is the number of characters one needs to jump based on
16104297a3b0SGarrett D'Amore * the value of the character from the text that was mismatched.
16114297a3b0SGarrett D'Amore */
16124297a3b0SGarrett D'Amore static void
computejumps(struct parse * p,struct re_guts * g)16134297a3b0SGarrett D'Amore computejumps(struct parse *p, struct re_guts *g)
16144297a3b0SGarrett D'Amore {
16154297a3b0SGarrett D'Amore int ch;
16164297a3b0SGarrett D'Amore int mindex;
16174297a3b0SGarrett D'Amore
16184297a3b0SGarrett D'Amore /* Avoid making errors worse */
16194297a3b0SGarrett D'Amore if (p->error != 0)
16204297a3b0SGarrett D'Amore return;
16214297a3b0SGarrett D'Amore
16224297a3b0SGarrett D'Amore g->charjump = (int *)malloc((NC + 1) * sizeof (int));
16234297a3b0SGarrett D'Amore if (g->charjump == NULL) /* Not a fatal error */
16244297a3b0SGarrett D'Amore return;
16254297a3b0SGarrett D'Amore /* Adjust for signed chars, if necessary */
16264297a3b0SGarrett D'Amore g->charjump = &g->charjump[-(CHAR_MIN)];
16274297a3b0SGarrett D'Amore
16284297a3b0SGarrett D'Amore /*
16294297a3b0SGarrett D'Amore * If the character does not exist in the pattern, the jump
16304297a3b0SGarrett D'Amore * is equal to the number of characters in the pattern.
16314297a3b0SGarrett D'Amore */
16324297a3b0SGarrett D'Amore for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
16334297a3b0SGarrett D'Amore g->charjump[ch] = g->mlen;
16344297a3b0SGarrett D'Amore
16354297a3b0SGarrett D'Amore /*
16364297a3b0SGarrett D'Amore * If the character does exist, compute the jump that would
16374297a3b0SGarrett D'Amore * take us to the last character in the pattern equal to it
16384297a3b0SGarrett D'Amore * (notice that we match right to left, so that last character
16394297a3b0SGarrett D'Amore * is the first one that would be matched).
16404297a3b0SGarrett D'Amore */
16414297a3b0SGarrett D'Amore for (mindex = 0; mindex < g->mlen; mindex++)
16424297a3b0SGarrett D'Amore g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
16434297a3b0SGarrett D'Amore }
16444297a3b0SGarrett D'Amore
16454297a3b0SGarrett D'Amore /*
16464297a3b0SGarrett D'Amore * computematchjumps - compute match jumps for BM scan
16474297a3b0SGarrett D'Amore *
16484297a3b0SGarrett D'Amore * This algorithm assumes g->must exists and is has size greater than
16494297a3b0SGarrett D'Amore * zero. It's based on the algorithm found on Computer Algorithms by
16504297a3b0SGarrett D'Amore * Sara Baase.
16514297a3b0SGarrett D'Amore *
16524297a3b0SGarrett D'Amore * A match jump is the number of characters one needs to advance based
16534297a3b0SGarrett D'Amore * on the already-matched suffix.
16544297a3b0SGarrett D'Amore * Notice that all values here are minus (g->mlen-1), because of the way
16554297a3b0SGarrett D'Amore * the search algorithm works.
16564297a3b0SGarrett D'Amore */
16574297a3b0SGarrett D'Amore static void
computematchjumps(struct parse * p,struct re_guts * g)16584297a3b0SGarrett D'Amore computematchjumps(struct parse *p, struct re_guts *g)
16594297a3b0SGarrett D'Amore {
16604297a3b0SGarrett D'Amore int mindex; /* General "must" iterator */
16614297a3b0SGarrett D'Amore int suffix; /* Keeps track of matching suffix */
16624297a3b0SGarrett D'Amore int ssuffix; /* Keeps track of suffixes' suffix */
16634297a3b0SGarrett D'Amore int *pmatches;
16644297a3b0SGarrett D'Amore /*
16654297a3b0SGarrett D'Amore * pmatches[k] points to the next i
16664297a3b0SGarrett D'Amore * such that i+1...mlen is a substring
16674297a3b0SGarrett D'Amore * of k+1...k+mlen-i-1
16684297a3b0SGarrett D'Amore */
16694297a3b0SGarrett D'Amore
16704297a3b0SGarrett D'Amore /* Avoid making errors worse */
16714297a3b0SGarrett D'Amore if (p->error != 0)
16724297a3b0SGarrett D'Amore return;
16734297a3b0SGarrett D'Amore
16744297a3b0SGarrett D'Amore pmatches = (int *)malloc(g->mlen * sizeof (unsigned int));
16754297a3b0SGarrett D'Amore if (pmatches == NULL) {
16764297a3b0SGarrett D'Amore g->matchjump = NULL;
16774297a3b0SGarrett D'Amore return;
16784297a3b0SGarrett D'Amore }
16794297a3b0SGarrett D'Amore
16804297a3b0SGarrett D'Amore g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int));
16814297a3b0SGarrett D'Amore if (g->matchjump == NULL) /* Not a fatal error */
16824297a3b0SGarrett D'Amore return;
16834297a3b0SGarrett D'Amore
16844297a3b0SGarrett D'Amore /* Set maximum possible jump for each character in the pattern */
16854297a3b0SGarrett D'Amore for (mindex = 0; mindex < g->mlen; mindex++)
16864297a3b0SGarrett D'Amore g->matchjump[mindex] = 2*g->mlen - mindex - 1;
16874297a3b0SGarrett D'Amore
16884297a3b0SGarrett D'Amore /* Compute pmatches[] */
16894297a3b0SGarrett D'Amore for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
16904297a3b0SGarrett D'Amore mindex--, suffix--) {
16914297a3b0SGarrett D'Amore pmatches[mindex] = suffix;
16924297a3b0SGarrett D'Amore
16934297a3b0SGarrett D'Amore /*
16944297a3b0SGarrett D'Amore * If a mismatch is found, interrupting the substring,
16954297a3b0SGarrett D'Amore * compute the matchjump for that position. If no
16964297a3b0SGarrett D'Amore * mismatch is found, then a text substring mismatched
16974297a3b0SGarrett D'Amore * against the suffix will also mismatch against the
16984297a3b0SGarrett D'Amore * substring.
16994297a3b0SGarrett D'Amore */
17004297a3b0SGarrett D'Amore while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) {
17014297a3b0SGarrett D'Amore g->matchjump[suffix] = MIN(g->matchjump[suffix],
17024297a3b0SGarrett D'Amore g->mlen - mindex - 1);
17034297a3b0SGarrett D'Amore suffix = pmatches[suffix];
17044297a3b0SGarrett D'Amore }
17054297a3b0SGarrett D'Amore }
17064297a3b0SGarrett D'Amore
17074297a3b0SGarrett D'Amore /*
17084297a3b0SGarrett D'Amore * Compute the matchjump up to the last substring found to jump
17094297a3b0SGarrett D'Amore * to the beginning of the largest must pattern prefix matching
17104297a3b0SGarrett D'Amore * it's own suffix.
17114297a3b0SGarrett D'Amore */
17124297a3b0SGarrett D'Amore for (mindex = 0; mindex <= suffix; mindex++)
17134297a3b0SGarrett D'Amore g->matchjump[mindex] = MIN(g->matchjump[mindex],
17144297a3b0SGarrett D'Amore g->mlen + suffix - mindex);
17154297a3b0SGarrett D'Amore
17164297a3b0SGarrett D'Amore ssuffix = pmatches[suffix];
17174297a3b0SGarrett D'Amore while (suffix < g->mlen) {
17184297a3b0SGarrett D'Amore while (suffix <= ssuffix && suffix < g->mlen) {
17194297a3b0SGarrett D'Amore g->matchjump[suffix] = MIN(g->matchjump[suffix],
17204297a3b0SGarrett D'Amore g->mlen + ssuffix - suffix);
17214297a3b0SGarrett D'Amore suffix++;
17224297a3b0SGarrett D'Amore }
17234297a3b0SGarrett D'Amore if (suffix < g->mlen)
17244297a3b0SGarrett D'Amore ssuffix = pmatches[ssuffix];
17254297a3b0SGarrett D'Amore }
17264297a3b0SGarrett D'Amore
17274297a3b0SGarrett D'Amore free(pmatches);
17284297a3b0SGarrett D'Amore }
17294297a3b0SGarrett D'Amore
17304297a3b0SGarrett D'Amore /*
17314297a3b0SGarrett D'Amore * pluscount - count + nesting
17324297a3b0SGarrett D'Amore */
17334297a3b0SGarrett D'Amore static sopno /* nesting depth */
pluscount(struct parse * p,struct re_guts * g)17344297a3b0SGarrett D'Amore pluscount(struct parse *p, struct re_guts *g)
17354297a3b0SGarrett D'Amore {
17364297a3b0SGarrett D'Amore sop *scan;
17374297a3b0SGarrett D'Amore sop s;
17384297a3b0SGarrett D'Amore sopno plusnest = 0;
17394297a3b0SGarrett D'Amore sopno maxnest = 0;
17404297a3b0SGarrett D'Amore
17414297a3b0SGarrett D'Amore if (p->error != 0)
17424297a3b0SGarrett D'Amore return (0); /* there may not be an OEND */
17434297a3b0SGarrett D'Amore
17444297a3b0SGarrett D'Amore scan = g->strip + 1;
17454297a3b0SGarrett D'Amore do {
17464297a3b0SGarrett D'Amore s = *scan++;
17474297a3b0SGarrett D'Amore switch (OP(s)) {
17484297a3b0SGarrett D'Amore case OPLUS_:
17494297a3b0SGarrett D'Amore plusnest++;
17504297a3b0SGarrett D'Amore break;
17514297a3b0SGarrett D'Amore case O_PLUS:
17524297a3b0SGarrett D'Amore if (plusnest > maxnest)
17534297a3b0SGarrett D'Amore maxnest = plusnest;
17544297a3b0SGarrett D'Amore plusnest--;
17554297a3b0SGarrett D'Amore break;
17564297a3b0SGarrett D'Amore }
17574297a3b0SGarrett D'Amore } while (OP(s) != OEND);
17584297a3b0SGarrett D'Amore if (plusnest != 0)
17594297a3b0SGarrett D'Amore g->iflags |= BAD;
17604297a3b0SGarrett D'Amore return (maxnest);
17614297a3b0SGarrett D'Amore }
1762