14297a3b0SGarrett D'Amore /* 24297a3b0SGarrett D'Amore * Copyright (c) 1992, 1993, 1994 Henry Spencer. 34297a3b0SGarrett D'Amore * Copyright (c) 1992, 1993, 1994 44297a3b0SGarrett D'Amore * The Regents of the University of California. All rights reserved. 54297a3b0SGarrett D'Amore * 64297a3b0SGarrett D'Amore * This code is derived from software contributed to Berkeley by 74297a3b0SGarrett D'Amore * Henry Spencer. 84297a3b0SGarrett D'Amore * 94297a3b0SGarrett D'Amore * Redistribution and use in source and binary forms, with or without 104297a3b0SGarrett D'Amore * modification, are permitted provided that the following conditions 114297a3b0SGarrett D'Amore * are met: 124297a3b0SGarrett D'Amore * 1. Redistributions of source code must retain the above copyright 134297a3b0SGarrett D'Amore * notice, this list of conditions and the following disclaimer. 144297a3b0SGarrett D'Amore * 2. Redistributions in binary form must reproduce the above copyright 154297a3b0SGarrett D'Amore * notice, this list of conditions and the following disclaimer in the 164297a3b0SGarrett D'Amore * documentation and/or other materials provided with the distribution. 174297a3b0SGarrett D'Amore * 4. Neither the name of the University nor the names of its contributors 184297a3b0SGarrett D'Amore * may be used to endorse or promote products derived from this software 194297a3b0SGarrett D'Amore * without specific prior written permission. 204297a3b0SGarrett D'Amore * 214297a3b0SGarrett D'Amore * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 224297a3b0SGarrett D'Amore * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 234297a3b0SGarrett D'Amore * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 244297a3b0SGarrett D'Amore * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 254297a3b0SGarrett D'Amore * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 264297a3b0SGarrett D'Amore * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 274297a3b0SGarrett D'Amore * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 284297a3b0SGarrett D'Amore * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 294297a3b0SGarrett D'Amore * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 304297a3b0SGarrett D'Amore * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 314297a3b0SGarrett D'Amore * SUCH DAMAGE. 324297a3b0SGarrett D'Amore * 334297a3b0SGarrett D'Amore * @(#)regex2.h 8.4 (Berkeley) 3/20/94 344297a3b0SGarrett D'Amore * $FreeBSD: src/lib/libc/regex/regex2.h,v 1.11 2007/01/09 00:28:04 imp Exp $ 354297a3b0SGarrett D'Amore */ 364297a3b0SGarrett D'Amore 374297a3b0SGarrett D'Amore /* 384297a3b0SGarrett D'Amore * First, the stuff that ends up in the outside-world include file 394297a3b0SGarrett D'Amore * typedef off_t regoff_t; 404297a3b0SGarrett D'Amore * typedef struct { 414297a3b0SGarrett D'Amore * int re_magic; 424297a3b0SGarrett D'Amore * size_t re_nsub; // number of parenthesized subexpressions 434297a3b0SGarrett D'Amore * const char *re_endp; // end pointer for REG_PEND 444297a3b0SGarrett D'Amore * struct re_guts *re_g; // none of your business :-) 454297a3b0SGarrett D'Amore * } regex_t; 464297a3b0SGarrett D'Amore * typedef struct { 474297a3b0SGarrett D'Amore * regoff_t rm_so; // start of match 484297a3b0SGarrett D'Amore * regoff_t rm_eo; // end of match 494297a3b0SGarrett D'Amore * } regmatch_t; 504297a3b0SGarrett D'Amore */ 514297a3b0SGarrett D'Amore /* 524297a3b0SGarrett D'Amore * internals of regex_t 534297a3b0SGarrett D'Amore */ 544297a3b0SGarrett D'Amore #define MAGIC1 ((('r'^0200)<<8) | 'e') 554297a3b0SGarrett D'Amore 564297a3b0SGarrett D'Amore /* 574297a3b0SGarrett D'Amore * The internal representation is a *strip*, a sequence of 584297a3b0SGarrett D'Amore * operators ending with an endmarker. (Some terminology etc. is a 594297a3b0SGarrett D'Amore * historical relic of earlier versions which used multiple strips.) 604297a3b0SGarrett D'Amore * Certain oddities in the representation are there to permit running 614297a3b0SGarrett D'Amore * the machinery backwards; in particular, any deviation from sequential 624297a3b0SGarrett D'Amore * flow must be marked at both its source and its destination. Some 634297a3b0SGarrett D'Amore * fine points: 644297a3b0SGarrett D'Amore * 654297a3b0SGarrett D'Amore * - OPLUS_ and O_PLUS are *inside* the loop they create. 664297a3b0SGarrett D'Amore * - OQUEST_ and O_QUEST are *outside* the bypass they create. 674297a3b0SGarrett D'Amore * - OCH_ and O_CH are *outside* the multi-way branch they create, while 684297a3b0SGarrett D'Amore * OOR1 and OOR2 are respectively the end and the beginning of one of 694297a3b0SGarrett D'Amore * the branches. Note that there is an implicit OOR2 following OCH_ 704297a3b0SGarrett D'Amore * and an implicit OOR1 preceding O_CH. 714297a3b0SGarrett D'Amore * 724297a3b0SGarrett D'Amore * In state representations, an operator's bit is on to signify a state 734297a3b0SGarrett D'Amore * immediately *preceding* "execution" of that operator. 744297a3b0SGarrett D'Amore */ 75*eda71b4aSGarrett D'Amore typedef unsigned int sop; /* strip operator */ 76*eda71b4aSGarrett D'Amore typedef int sopno; 774297a3b0SGarrett D'Amore #define OPRMASK 0xf8000000U 784297a3b0SGarrett D'Amore #define OPDMASK 0x07ffffffU 794297a3b0SGarrett D'Amore #define OPSHIFT ((unsigned)27) 804297a3b0SGarrett D'Amore #define OP(n) ((n)&OPRMASK) 814297a3b0SGarrett D'Amore #define OPND(n) ((n)&OPDMASK) 824297a3b0SGarrett D'Amore #define SOP(op, opnd) ((op)|(opnd)) 834297a3b0SGarrett D'Amore /* operators meaning operand */ 844297a3b0SGarrett D'Amore /* (back, fwd are offsets) */ 854297a3b0SGarrett D'Amore #define OEND (1U<<OPSHIFT) /* endmarker - */ 864297a3b0SGarrett D'Amore #define OCHAR (2U<<OPSHIFT) /* character wide character */ 874297a3b0SGarrett D'Amore #define OBOL (3U<<OPSHIFT) /* left anchor - */ 884297a3b0SGarrett D'Amore #define OEOL (4U<<OPSHIFT) /* right anchor - */ 894297a3b0SGarrett D'Amore #define OANY (5U<<OPSHIFT) /* . - */ 904297a3b0SGarrett D'Amore #define OANYOF (6U<<OPSHIFT) /* [...] set number */ 914297a3b0SGarrett D'Amore #define OBACK_ (7U<<OPSHIFT) /* begin \d paren number */ 924297a3b0SGarrett D'Amore #define O_BACK (8U<<OPSHIFT) /* end \d paren number */ 934297a3b0SGarrett D'Amore #define OPLUS_ (9U<<OPSHIFT) /* + prefix fwd to suffix */ 944297a3b0SGarrett D'Amore #define O_PLUS (10U<<OPSHIFT) /* + suffix back to prefix */ 954297a3b0SGarrett D'Amore #define OQUEST_ (11U<<OPSHIFT) /* ? prefix fwd to suffix */ 964297a3b0SGarrett D'Amore #define O_QUEST (12U<<OPSHIFT) /* ? suffix back to prefix */ 974297a3b0SGarrett D'Amore #define OLPAREN (13U<<OPSHIFT) /* ( fwd to ) */ 984297a3b0SGarrett D'Amore #define ORPAREN (14U<<OPSHIFT) /* ) back to ( */ 994297a3b0SGarrett D'Amore #define OCH_ (15U<<OPSHIFT) /* begin choice fwd to OOR2 */ 1004297a3b0SGarrett D'Amore #define OOR1 (16U<<OPSHIFT) /* | pt. 1 back to OOR1 or OCH_ */ 1014297a3b0SGarrett D'Amore #define OOR2 (17U<<OPSHIFT) /* | pt. 2 fwd to OOR2 or O_CH */ 1024297a3b0SGarrett D'Amore #define O_CH (18U<<OPSHIFT) /* end choice back to OOR1 */ 1034297a3b0SGarrett D'Amore #define OBOW (19U<<OPSHIFT) /* begin word - */ 1044297a3b0SGarrett D'Amore #define OEOW (20U<<OPSHIFT) /* end word - */ 1054297a3b0SGarrett D'Amore 1064297a3b0SGarrett D'Amore /* 1074297a3b0SGarrett D'Amore * Structures for [] character-set representation. 1084297a3b0SGarrett D'Amore */ 1094297a3b0SGarrett D'Amore typedef struct { 1104297a3b0SGarrett D'Amore wint_t min; 1114297a3b0SGarrett D'Amore wint_t max; 1124297a3b0SGarrett D'Amore } crange; 1134297a3b0SGarrett D'Amore typedef struct { 1144297a3b0SGarrett D'Amore unsigned char bmp[NC / 8]; 1154297a3b0SGarrett D'Amore wctype_t *types; 1164297a3b0SGarrett D'Amore int ntypes; 1174297a3b0SGarrett D'Amore wint_t *wides; 1184297a3b0SGarrett D'Amore int nwides; 1194297a3b0SGarrett D'Amore crange *ranges; 1204297a3b0SGarrett D'Amore int nranges; 1214297a3b0SGarrett D'Amore int invert; 1224297a3b0SGarrett D'Amore int icase; 1234297a3b0SGarrett D'Amore } cset; 1244297a3b0SGarrett D'Amore 1254297a3b0SGarrett D'Amore static int 1264297a3b0SGarrett D'Amore CHIN1(cset *cs, wint_t ch) 1274297a3b0SGarrett D'Amore { 1284297a3b0SGarrett D'Amore int i; 1294297a3b0SGarrett D'Amore 1304297a3b0SGarrett D'Amore assert(ch >= 0); 1314297a3b0SGarrett D'Amore if (ch < NC) 1324297a3b0SGarrett D'Amore return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^ 1334297a3b0SGarrett D'Amore cs->invert); 1344297a3b0SGarrett D'Amore for (i = 0; i < cs->nwides; i++) 1354297a3b0SGarrett D'Amore if (ch == cs->wides[i]) 1364297a3b0SGarrett D'Amore return (!cs->invert); 1374297a3b0SGarrett D'Amore for (i = 0; i < cs->nranges; i++) 1384297a3b0SGarrett D'Amore if (cs->ranges[i].min <= ch && ch <= cs->ranges[i].max) 1394297a3b0SGarrett D'Amore return (!cs->invert); 1404297a3b0SGarrett D'Amore for (i = 0; i < cs->ntypes; i++) 1414297a3b0SGarrett D'Amore if (iswctype(ch, cs->types[i])) 1424297a3b0SGarrett D'Amore return (!cs->invert); 1434297a3b0SGarrett D'Amore return (cs->invert); 1444297a3b0SGarrett D'Amore } 1454297a3b0SGarrett D'Amore 1464297a3b0SGarrett D'Amore static int 1474297a3b0SGarrett D'Amore CHIN(cset *cs, wint_t ch) 1484297a3b0SGarrett D'Amore { 1494297a3b0SGarrett D'Amore 1504297a3b0SGarrett D'Amore assert(ch >= 0); 1514297a3b0SGarrett D'Amore if (ch < NC) 1524297a3b0SGarrett D'Amore return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^ 1534297a3b0SGarrett D'Amore cs->invert); 1544297a3b0SGarrett D'Amore else if (cs->icase) 1554297a3b0SGarrett D'Amore return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) || 1564297a3b0SGarrett D'Amore CHIN1(cs, towupper(ch))); 1574297a3b0SGarrett D'Amore else 1584297a3b0SGarrett D'Amore return (CHIN1(cs, ch)); 1594297a3b0SGarrett D'Amore } 1604297a3b0SGarrett D'Amore 1614297a3b0SGarrett D'Amore /* 1624297a3b0SGarrett D'Amore * main compiled-expression structure 1634297a3b0SGarrett D'Amore */ 1644297a3b0SGarrett D'Amore struct re_guts { 1654297a3b0SGarrett D'Amore int magic; 1664297a3b0SGarrett D'Amore #define MAGIC2 ((('R'^0200)<<8)|'E') 1674297a3b0SGarrett D'Amore sop *strip; /* malloced area for strip */ 1684297a3b0SGarrett D'Amore int ncsets; /* number of csets in use */ 1694297a3b0SGarrett D'Amore cset *sets; /* -> cset [ncsets] */ 1704297a3b0SGarrett D'Amore int cflags; /* copy of regcomp() cflags argument */ 1714297a3b0SGarrett D'Amore sopno nstates; /* = number of sops */ 1724297a3b0SGarrett D'Amore sopno firststate; /* the initial OEND (normally 0) */ 1734297a3b0SGarrett D'Amore sopno laststate; /* the final OEND */ 1744297a3b0SGarrett D'Amore int iflags; /* internal flags */ 1754297a3b0SGarrett D'Amore #define USEBOL 01 /* used ^ */ 1764297a3b0SGarrett D'Amore #define USEEOL 02 /* used $ */ 1774297a3b0SGarrett D'Amore #define BAD 04 /* something wrong */ 1784297a3b0SGarrett D'Amore int nbol; /* number of ^ used */ 1794297a3b0SGarrett D'Amore int neol; /* number of $ used */ 1804297a3b0SGarrett D'Amore char *must; /* match must contain this string */ 1814297a3b0SGarrett D'Amore int moffset; /* latest point at which must may be located */ 1824297a3b0SGarrett D'Amore int *charjump; /* Boyer-Moore char jump table */ 1834297a3b0SGarrett D'Amore int *matchjump; /* Boyer-Moore match jump table */ 1844297a3b0SGarrett D'Amore int mlen; /* length of must */ 1854297a3b0SGarrett D'Amore size_t nsub; /* copy of re_nsub */ 1864297a3b0SGarrett D'Amore int backrefs; /* does it use back references? */ 1874297a3b0SGarrett D'Amore sopno nplus; /* how deep does it nest +s? */ 1884297a3b0SGarrett D'Amore }; 1894297a3b0SGarrett D'Amore 1904297a3b0SGarrett D'Amore /* misc utilities */ 1914297a3b0SGarrett D'Amore #define OUT (CHAR_MIN - 1) /* a non-character value */ 1924297a3b0SGarrett D'Amore #define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_') 193