1da2e3ebdSchin /*********************************************************************** 2da2e3ebdSchin * * 3da2e3ebdSchin * This software is part of the ast package * 4*3e14f97fSRoger A. Faulkner * Copyright (c) 1985-2010 AT&T Intellectual Property * 5da2e3ebdSchin * and is licensed under the * 6da2e3ebdSchin * Common Public License, Version 1.0 * 77c2fbfb3SApril Chin * by AT&T Intellectual Property * 8da2e3ebdSchin * * 9da2e3ebdSchin * A copy of the License is available at * 10da2e3ebdSchin * http://www.opensource.org/licenses/cpl1.0.txt * 11da2e3ebdSchin * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12da2e3ebdSchin * * 13da2e3ebdSchin * Information and Software Systems Research * 14da2e3ebdSchin * AT&T Research * 15da2e3ebdSchin * Florham Park NJ * 16da2e3ebdSchin * * 17da2e3ebdSchin * Glenn Fowler <gsf@research.att.com> * 18da2e3ebdSchin * David Korn <dgk@research.att.com> * 19da2e3ebdSchin * Phong Vo <kpv@research.att.com> * 20da2e3ebdSchin * * 21da2e3ebdSchin ***********************************************************************/ 22da2e3ebdSchin #pragma prototyped 23da2e3ebdSchin 24da2e3ebdSchin /* 25da2e3ebdSchin * posix regex implementation 26da2e3ebdSchin * 27da2e3ebdSchin * based on Doug McIlroy's C++ implementation 28da2e3ebdSchin * Knuth-Morris-Pratt adapted from Corman-Leiserson-Rivest 29da2e3ebdSchin * Boyer-Moore from conversations with David Korn, Phong Vo, Andrew Hume 30da2e3ebdSchin */ 31da2e3ebdSchin 32da2e3ebdSchin #ifndef _REGLIB_H 33da2e3ebdSchin #define _REGLIB_H 34da2e3ebdSchin 35da2e3ebdSchin #define REG_VERSION_EXEC 20020509L 36da2e3ebdSchin #define REG_VERSION_MAP 20030916L /* regdisc_t.re_map */ 37da2e3ebdSchin 38da2e3ebdSchin #define re_info env 39da2e3ebdSchin 40da2e3ebdSchin #define alloc _reg_alloc 41da2e3ebdSchin #define classfun _reg_classfun 42da2e3ebdSchin #define drop _reg_drop 43da2e3ebdSchin #define fatal _reg_fatal 44da2e3ebdSchin #define state _reg_state 45da2e3ebdSchin 46da2e3ebdSchin typedef struct regsubop_s 47da2e3ebdSchin { 48da2e3ebdSchin int op; /* REG_SUB_LOWER,REG_SUB_UPPER */ 49da2e3ebdSchin int off; /* re_rhs or match[] offset */ 50da2e3ebdSchin int len; /* re_rhs len or len==0 match[] */ 51da2e3ebdSchin } regsubop_t; 52da2e3ebdSchin 53da2e3ebdSchin #define _REG_SUB_PRIVATE_ \ 54da2e3ebdSchin char* re_cur; /* re_buf cursor */ \ 55da2e3ebdSchin char* re_end; /* re_buf end */ \ 56da2e3ebdSchin regsubop_t* re_ops; /* rhs ops */ \ 57da2e3ebdSchin char re_rhs[1]; /* substitution rhs */ 58da2e3ebdSchin 59da2e3ebdSchin #include <ast.h> 60da2e3ebdSchin #include <cdt.h> 61da2e3ebdSchin #include <stk.h> 62da2e3ebdSchin 63da2e3ebdSchin #include "regex.h" 64da2e3ebdSchin 65da2e3ebdSchin #include <ctype.h> 66da2e3ebdSchin #include <errno.h> 67da2e3ebdSchin 6834f9b3eeSRoland Mainz #if _BLD_DEBUG && !defined(_AST_REGEX_DEBUG) 6934f9b3eeSRoland Mainz #define _AST_REGEX_DEBUG 1 7034f9b3eeSRoland Mainz #endif 7134f9b3eeSRoland Mainz 72da2e3ebdSchin #define MBSIZE(p) ((ast.tmp_int=mbsize(p))>0?ast.tmp_int:1) 73da2e3ebdSchin 74da2e3ebdSchin #undef RE_DUP_MAX /* posix puts this in limits.h! */ 75da2e3ebdSchin #define RE_DUP_MAX (INT_MAX/2-1) /* 2*RE_DUP_MAX won't overflow */ 76da2e3ebdSchin #define RE_DUP_INF (RE_DUP_MAX+1) /* infinity, for * */ 77da2e3ebdSchin #define BACK_REF_MAX 9 78da2e3ebdSchin 797c2fbfb3SApril Chin #define REG_COMP (REG_DELIMITED|REG_ESCAPE|REG_EXTENDED|REG_FIRST|REG_ICASE|REG_NOSUB|REG_NEWLINE|REG_SHELL|REG_AUGMENTED|REG_LEFT|REG_LITERAL|REG_MINIMAL|REG_MULTIREF|REG_NULL|REG_RIGHT|REG_LENIENT|REG_MUSTDELIM) 80da2e3ebdSchin #define REG_EXEC (REG_ADVANCE|REG_INVERT|REG_NOTBOL|REG_NOTEOL|REG_STARTEND) 81da2e3ebdSchin 82da2e3ebdSchin #define REX_NULL 0 /* null string (internal) */ 83da2e3ebdSchin #define REX_ALT 1 /* a|b */ 84da2e3ebdSchin #define REX_ALT_CATCH 2 /* REX_ALT catcher */ 85da2e3ebdSchin #define REX_BACK 3 /* \1, \2, etc */ 86da2e3ebdSchin #define REX_BEG 4 /* initial ^ */ 87da2e3ebdSchin #define REX_BEG_STR 5 /* initial ^ w/ no newline */ 88da2e3ebdSchin #define REX_BM 6 /* Boyer-Moore */ 89da2e3ebdSchin #define REX_CAT 7 /* catenation catcher */ 90da2e3ebdSchin #define REX_CLASS 8 /* [...] */ 91da2e3ebdSchin #define REX_COLL_CLASS 9 /* collation order [...] */ 92da2e3ebdSchin #define REX_CONJ 10 /* a&b */ 93da2e3ebdSchin #define REX_CONJ_LEFT 11 /* REX_CONJ left catcher */ 94da2e3ebdSchin #define REX_CONJ_RIGHT 12 /* REX_CONJ right catcher */ 95da2e3ebdSchin #define REX_DONE 13 /* completed match (internal) */ 96da2e3ebdSchin #define REX_DOT 14 /* . */ 97da2e3ebdSchin #define REX_END 15 /* final $ */ 98da2e3ebdSchin #define REX_END_STR 16 /* final $ before tail newline */ 99da2e3ebdSchin #define REX_EXEC 17 /* call re.re_exec() */ 100da2e3ebdSchin #define REX_FIN_STR 18 /* final $ w/ no newline */ 101da2e3ebdSchin #define REX_GROUP 19 /* \(...\) */ 102da2e3ebdSchin #define REX_GROUP_CATCH 20 /* REX_GROUP catcher */ 103da2e3ebdSchin #define REX_GROUP_AHEAD 21 /* 0-width lookahead */ 104da2e3ebdSchin #define REX_GROUP_AHEAD_CATCH 22 /* REX_GROUP_AHEAD catcher */ 105da2e3ebdSchin #define REX_GROUP_AHEAD_NOT 23 /* inverted 0-width lookahead */ 106da2e3ebdSchin #define REX_GROUP_BEHIND 24 /* 0-width lookbehind */ 107da2e3ebdSchin #define REX_GROUP_BEHIND_CATCH 25 /* REX_GROUP_BEHIND catcher */ 108da2e3ebdSchin #define REX_GROUP_BEHIND_NOT 26 /* inverted 0-width lookbehind */ 109da2e3ebdSchin #define REX_GROUP_BEHIND_NOT_CATCH 27 /* REX_GROUP_BEHIND_NOT catcher */ 110da2e3ebdSchin #define REX_GROUP_COND 28 /* conditional group */ 111da2e3ebdSchin #define REX_GROUP_COND_CATCH 29 /* conditional group catcher */ 112da2e3ebdSchin #define REX_GROUP_CUT 30 /* don't backtrack over this */ 113da2e3ebdSchin #define REX_GROUP_CUT_CATCH 31 /* REX_GROUP_CUT catcher */ 114da2e3ebdSchin #define REX_KMP 32 /* Knuth-Morris-Pratt */ 115da2e3ebdSchin #define REX_NEG 33 /* negation */ 116da2e3ebdSchin #define REX_NEG_CATCH 34 /* REX_NEG catcher */ 117da2e3ebdSchin #define REX_NEST 35 /* nested match */ 118da2e3ebdSchin #define REX_ONECHAR 36 /* a single-character literal */ 119da2e3ebdSchin #define REX_REP 37 /* Kleene closure */ 120da2e3ebdSchin #define REX_REP_CATCH 38 /* REX_REP catcher */ 121da2e3ebdSchin #define REX_STRING 39 /* some chars */ 122da2e3ebdSchin #define REX_TRIE 40 /* alternation of strings */ 123da2e3ebdSchin #define REX_WBEG 41 /* \< */ 124da2e3ebdSchin #define REX_WEND 42 /* \> */ 125da2e3ebdSchin #define REX_WORD 43 /* word boundary */ 126da2e3ebdSchin #define REX_WORD_NOT 44 /* not word boundary */ 127da2e3ebdSchin 128da2e3ebdSchin #define T_META ((int)UCHAR_MAX+1) 129da2e3ebdSchin #define T_STAR (T_META+0) 130da2e3ebdSchin #define T_PLUS (T_META+1) 131da2e3ebdSchin #define T_QUES (T_META+2) 132da2e3ebdSchin #define T_BANG (T_META+3) 133da2e3ebdSchin #define T_AT (T_META+4) 134da2e3ebdSchin #define T_TILDE (T_META+5) 135da2e3ebdSchin #define T_PERCENT (T_META+6) 136da2e3ebdSchin #define T_LEFT (T_META+7) 137da2e3ebdSchin #define T_OPEN (T_META+8) 138da2e3ebdSchin #define T_CLOSE (T_OPEN+1) 139da2e3ebdSchin #define T_RIGHT (T_OPEN+2) 140da2e3ebdSchin #define T_CFLX (T_OPEN+3) 141da2e3ebdSchin #define T_DOT (T_OPEN+4) 142da2e3ebdSchin #define T_DOTSTAR (T_OPEN+5) 143da2e3ebdSchin #define T_END (T_OPEN+6) 144da2e3ebdSchin #define T_BAD (T_OPEN+7) 145da2e3ebdSchin #define T_DOLL (T_OPEN+8) 146da2e3ebdSchin #define T_BRA (T_OPEN+9) 147da2e3ebdSchin #define T_BAR (T_OPEN+10) 148da2e3ebdSchin #define T_AND (T_OPEN+11) 149da2e3ebdSchin #define T_LT (T_OPEN+12) 150da2e3ebdSchin #define T_GT (T_OPEN+13) 151da2e3ebdSchin #define T_SLASHPLUS (T_OPEN+14) 152da2e3ebdSchin #define T_GROUP (T_OPEN+15) 153da2e3ebdSchin #define T_WORD (T_OPEN+16) 154da2e3ebdSchin #define T_WORD_NOT (T_WORD+1) 155da2e3ebdSchin #define T_BEG_STR (T_WORD+2) 156da2e3ebdSchin #define T_END_STR (T_WORD+3) 157da2e3ebdSchin #define T_FIN_STR (T_WORD+4) 158da2e3ebdSchin #define T_ESCAPE (T_WORD+5) 159da2e3ebdSchin #define T_ALNUM (T_WORD+6) 160da2e3ebdSchin #define T_ALNUM_NOT (T_ALNUM+1) 161da2e3ebdSchin #define T_DIGIT (T_ALNUM+2) 162da2e3ebdSchin #define T_DIGIT_NOT (T_ALNUM+3) 163da2e3ebdSchin #define T_SPACE (T_ALNUM+4) 164da2e3ebdSchin #define T_SPACE_NOT (T_ALNUM+5) 165da2e3ebdSchin #define T_BACK (T_ALNUM+6) 166da2e3ebdSchin 167da2e3ebdSchin #define BRE 0 168da2e3ebdSchin #define ERE 3 169da2e3ebdSchin #define ARE 6 170da2e3ebdSchin #define SRE 9 171da2e3ebdSchin #define KRE 12 172da2e3ebdSchin 173da2e3ebdSchin #define HIT SSIZE_MAX 174da2e3ebdSchin 175da2e3ebdSchin #define bitclr(p,c) ((p)[((c)>>3)&037]&=(~(1<<((c)&07)))) 176da2e3ebdSchin #define bitset(p,c) ((p)[((c)>>3)&037]|=(1<<((c)&07))) 177da2e3ebdSchin #define bittst(p,c) ((p)[((c)>>3)&037]&(1<<((c)&07))) 178da2e3ebdSchin 179da2e3ebdSchin #define setadd(p,c) bitset((p)->bits,c) 180da2e3ebdSchin #define setclr(p,c) bitclr((p)->bits,c) 181da2e3ebdSchin #define settst(p,c) bittst((p)->bits,c) 182da2e3ebdSchin 183da2e3ebdSchin #if _hdr_wchar && _lib_wctype && _lib_iswctype 184da2e3ebdSchin 185da2e3ebdSchin #include <stdio.h> /* because <wchar.h> includes it and we generate it */ 186da2e3ebdSchin #include <wchar.h> 187da2e3ebdSchin #if _hdr_wctype 188da2e3ebdSchin #include <wctype.h> 189da2e3ebdSchin #endif 190da2e3ebdSchin 191da2e3ebdSchin #if !defined(iswblank) && !_lib_iswblank 192da2e3ebdSchin #define _need_iswblank 1 193da2e3ebdSchin #define iswblank(x) _reg_iswblank(x) 194da2e3ebdSchin extern int _reg_iswblank(wint_t); 195da2e3ebdSchin #endif 196da2e3ebdSchin 197da2e3ebdSchin #if !defined(towupper) && !_lib_towupper 198da2e3ebdSchin #define towupper(x) toupper(x) 199da2e3ebdSchin #endif 200da2e3ebdSchin 201da2e3ebdSchin #if !defined(towlower) && !_lib_towlower 202da2e3ebdSchin #define towlower(x) tolower(x) 203da2e3ebdSchin #endif 204da2e3ebdSchin 205da2e3ebdSchin #else 206da2e3ebdSchin 207da2e3ebdSchin #undef _lib_wctype 208da2e3ebdSchin 209da2e3ebdSchin #ifndef iswalnum 210da2e3ebdSchin #define iswalnum(x) isalnum(x) 211da2e3ebdSchin #endif 212da2e3ebdSchin #ifndef iswalpha 213da2e3ebdSchin #define iswalpha(x) isalpha(x) 214da2e3ebdSchin #endif 215da2e3ebdSchin #ifndef iswcntrl 216da2e3ebdSchin #define iswcntrl(x) iscntrl(x) 217da2e3ebdSchin #endif 218da2e3ebdSchin #ifndef iswdigit 219da2e3ebdSchin #define iswdigit(x) isdigit(x) 220da2e3ebdSchin #endif 221da2e3ebdSchin #ifndef iswgraph 222da2e3ebdSchin #define iswgraph(x) isgraph(x) 223da2e3ebdSchin #endif 224da2e3ebdSchin #ifndef iswlower 225da2e3ebdSchin #define iswlower(x) islower(x) 226da2e3ebdSchin #endif 227da2e3ebdSchin #ifndef iswprint 228da2e3ebdSchin #define iswprint(x) isprint(x) 229da2e3ebdSchin #endif 230da2e3ebdSchin #ifndef iswpunct 231da2e3ebdSchin #define iswpunct(x) ispunct(x) 232da2e3ebdSchin #endif 233da2e3ebdSchin #ifndef iswspace 234da2e3ebdSchin #define iswspace(x) isspace(x) 235da2e3ebdSchin #endif 236da2e3ebdSchin #ifndef iswupper 237da2e3ebdSchin #define iswupper(x) isupper(x) 238da2e3ebdSchin #endif 239da2e3ebdSchin #ifndef iswxdigit 240da2e3ebdSchin #define iswxdigit(x) isxdigit(x) 241da2e3ebdSchin #endif 242da2e3ebdSchin 243da2e3ebdSchin #ifndef towlower 244da2e3ebdSchin #define towlower(x) tolower(x) 245da2e3ebdSchin #endif 246da2e3ebdSchin #ifndef towupper 247da2e3ebdSchin #define towupper(x) toupper(x) 248da2e3ebdSchin #endif 249da2e3ebdSchin 250da2e3ebdSchin #endif 251da2e3ebdSchin 252da2e3ebdSchin #ifndef iswblank 253da2e3ebdSchin #define iswblank(x) ((x)==' '||(x)=='\t') 254da2e3ebdSchin #endif 255da2e3ebdSchin 256da2e3ebdSchin #ifndef iswgraph 257da2e3ebdSchin #define iswgraph(x) (iswprint(x)&&!iswblank(x)) 258da2e3ebdSchin #endif 259da2e3ebdSchin 260da2e3ebdSchin #define isword(x) (isalnum(x)||(x)=='_') 261da2e3ebdSchin 262da2e3ebdSchin /* 263da2e3ebdSchin * collation element support 264da2e3ebdSchin */ 265da2e3ebdSchin 2667c2fbfb3SApril Chin #define COLL_KEY_MAX 32 267da2e3ebdSchin 268da2e3ebdSchin #if COLL_KEY_MAX < MB_LEN_MAX 269da2e3ebdSchin #undef COLL_KEY_MAX 270da2e3ebdSchin #define COLL_KEY_MAX MB_LEN_MAX 271da2e3ebdSchin #endif 272da2e3ebdSchin 273da2e3ebdSchin typedef unsigned char Ckey_t[COLL_KEY_MAX+1]; 274da2e3ebdSchin 275da2e3ebdSchin #define COLL_end 0 276da2e3ebdSchin #define COLL_call 1 277da2e3ebdSchin #define COLL_char 2 278da2e3ebdSchin #define COLL_range 3 279da2e3ebdSchin #define COLL_range_lc 4 280da2e3ebdSchin #define COLL_range_uc 5 281da2e3ebdSchin 282da2e3ebdSchin typedef struct Celt_s 283da2e3ebdSchin { 284da2e3ebdSchin short typ; 285da2e3ebdSchin short min; 286da2e3ebdSchin short max; 287da2e3ebdSchin regclass_t fun; 288da2e3ebdSchin Ckey_t beg; 289da2e3ebdSchin Ckey_t end; 290da2e3ebdSchin } Celt_t; 291da2e3ebdSchin 292da2e3ebdSchin /* 293da2e3ebdSchin * private stuff hanging off regex_t 294da2e3ebdSchin */ 295da2e3ebdSchin 296da2e3ebdSchin typedef struct Stk_pos_s 297da2e3ebdSchin { 298da2e3ebdSchin off_t offset; 299da2e3ebdSchin char* base; 300da2e3ebdSchin } Stk_pos_t; 301da2e3ebdSchin 302da2e3ebdSchin typedef struct Vector_s 303da2e3ebdSchin { 304da2e3ebdSchin Stk_t* stk; /* stack pointer */ 305da2e3ebdSchin char* vec; /* the data */ 306da2e3ebdSchin int inc; /* growth increment */ 307da2e3ebdSchin int siz; /* element size */ 308da2e3ebdSchin int max; /* max index */ 309da2e3ebdSchin int cur; /* current index -- user domain */ 310da2e3ebdSchin } Vector_t; 311da2e3ebdSchin 312da2e3ebdSchin /* 313da2e3ebdSchin * Rex_t subtypes 314da2e3ebdSchin */ 315da2e3ebdSchin 316da2e3ebdSchin typedef struct Cond_s 317da2e3ebdSchin { 318da2e3ebdSchin unsigned char* beg; /* beginning of next match */ 319da2e3ebdSchin struct Rex_s* next[2]; /* 0:no 1:yes next pattern */ 320da2e3ebdSchin struct Rex_s* cont; /* right catcher */ 321da2e3ebdSchin int yes; /* yes condition hit */ 322da2e3ebdSchin } Cond_t; 323da2e3ebdSchin 324da2e3ebdSchin typedef struct Conj_left_s 325da2e3ebdSchin { 326da2e3ebdSchin unsigned char* beg; /* beginning of left match */ 327da2e3ebdSchin struct Rex_s* right; /* right pattern */ 328da2e3ebdSchin struct Rex_s* cont; /* right catcher */ 329da2e3ebdSchin } Conj_left_t; 330da2e3ebdSchin 331da2e3ebdSchin typedef struct Conj_right_s 332da2e3ebdSchin { 333da2e3ebdSchin unsigned char* end; /* end of left match */ 334da2e3ebdSchin struct Rex_s* cont; /* ambient continuation */ 335da2e3ebdSchin } Conj_right_t; 336da2e3ebdSchin 337da2e3ebdSchin typedef unsigned int Bm_mask_t; 338da2e3ebdSchin 339da2e3ebdSchin typedef struct Bm_s 340da2e3ebdSchin { 341da2e3ebdSchin Bm_mask_t** mask; 342da2e3ebdSchin size_t* skip; 343da2e3ebdSchin size_t* fail; 344da2e3ebdSchin size_t size; 345da2e3ebdSchin ssize_t back; 346da2e3ebdSchin ssize_t left; 347da2e3ebdSchin ssize_t right; 348da2e3ebdSchin size_t complete; 349da2e3ebdSchin } Bm_t; 350da2e3ebdSchin 351da2e3ebdSchin typedef struct String_s 352da2e3ebdSchin { 353da2e3ebdSchin int* fail; 354da2e3ebdSchin unsigned char* base; 355da2e3ebdSchin size_t size; 356da2e3ebdSchin } String_t; 357da2e3ebdSchin 358da2e3ebdSchin typedef struct Set_s 359da2e3ebdSchin { 360da2e3ebdSchin unsigned char bits[(UCHAR_MAX+1)/CHAR_BIT]; 361da2e3ebdSchin } Set_t; 362da2e3ebdSchin 363da2e3ebdSchin typedef struct Collate_s 364da2e3ebdSchin { 365da2e3ebdSchin int invert; 366da2e3ebdSchin Celt_t* elements; 367da2e3ebdSchin } Collate_t; 368da2e3ebdSchin 369da2e3ebdSchin typedef struct Binary_s 370da2e3ebdSchin { 371da2e3ebdSchin struct Rex_s* left; 372da2e3ebdSchin struct Rex_s* right; 373da2e3ebdSchin int serial; 374da2e3ebdSchin } Binary_t; 375da2e3ebdSchin 376da2e3ebdSchin typedef struct Group_s 377da2e3ebdSchin { 378da2e3ebdSchin int number; /* group number */ 379da2e3ebdSchin int last; /* last contained group number */ 380da2e3ebdSchin int size; /* lookbehind size */ 381da2e3ebdSchin int back; /* backreferenced */ 382da2e3ebdSchin regflags_t flags; /* group flags */ 383da2e3ebdSchin union 384da2e3ebdSchin { 385da2e3ebdSchin Binary_t binary; 386da2e3ebdSchin struct Rex_s* rex; 387da2e3ebdSchin } expr; 388da2e3ebdSchin } Group_t; 389da2e3ebdSchin 390da2e3ebdSchin typedef struct Exec_s 391da2e3ebdSchin { 392da2e3ebdSchin void* data; 393da2e3ebdSchin const char* text; 394da2e3ebdSchin size_t size; 395da2e3ebdSchin } Exec_t; 396da2e3ebdSchin 397da2e3ebdSchin #define REX_NEST_open 0x01 398da2e3ebdSchin #define REX_NEST_close 0x02 399da2e3ebdSchin #define REX_NEST_escape 0x04 400da2e3ebdSchin #define REX_NEST_quote 0x08 401da2e3ebdSchin #define REX_NEST_literal 0x10 402da2e3ebdSchin #define REX_NEST_delimiter 0x20 403da2e3ebdSchin #define REX_NEST_terminator 0x40 404da2e3ebdSchin #define REX_NEST_separator 0x80 405da2e3ebdSchin 406da2e3ebdSchin #define REX_NEST_SHIFT 8 407da2e3ebdSchin 408da2e3ebdSchin typedef struct Nest_s 409da2e3ebdSchin { 410da2e3ebdSchin int primary; 411da2e3ebdSchin unsigned short none; /* for Nest_t.type[-1] */ 412da2e3ebdSchin unsigned short type[1]; 413da2e3ebdSchin } Nest_t; 414da2e3ebdSchin 415da2e3ebdSchin /* 416da2e3ebdSchin * REX_ALT catcher, solely to get control at the end of an 417da2e3ebdSchin * alternative to keep records for comparing matches. 418da2e3ebdSchin */ 419da2e3ebdSchin 420da2e3ebdSchin typedef struct Alt_catch_s 421da2e3ebdSchin { 422da2e3ebdSchin struct Rex_s* cont; 423da2e3ebdSchin } Alt_catch_t; 424da2e3ebdSchin 425da2e3ebdSchin typedef struct Group_catch_s 426da2e3ebdSchin { 427da2e3ebdSchin struct Rex_s* cont; 428da2e3ebdSchin regoff_t* eo; 429da2e3ebdSchin } Group_catch_t; 430da2e3ebdSchin 431da2e3ebdSchin typedef struct Behind_catch_s 432da2e3ebdSchin { 433da2e3ebdSchin struct Rex_s* cont; 434da2e3ebdSchin unsigned char* beg; 435da2e3ebdSchin unsigned char* end; 436da2e3ebdSchin } Behind_catch_t; 437da2e3ebdSchin 438da2e3ebdSchin /* 439da2e3ebdSchin * REX_NEG catcher determines what string lengths can be matched, 440da2e3ebdSchin * then Neg investigates continuations of other lengths. 441da2e3ebdSchin * This is inefficient. For !POSITIONS expressions, we can do better: 442da2e3ebdSchin * since matches to rex will be enumerated in decreasing order, 443da2e3ebdSchin * we can investigate continuations whenever a length is skipped. 444da2e3ebdSchin */ 445da2e3ebdSchin 446da2e3ebdSchin typedef struct Neg_catch_s 447da2e3ebdSchin { 448da2e3ebdSchin unsigned char* beg; 449da2e3ebdSchin unsigned char* index; 450da2e3ebdSchin } Neg_catch_t; 451da2e3ebdSchin 452da2e3ebdSchin /* 453da2e3ebdSchin * REX_REP catcher. One is created on the stack for 454da2e3ebdSchin * each iteration of a complex repetition. 455da2e3ebdSchin */ 456da2e3ebdSchin 457da2e3ebdSchin typedef struct Rep_catch_s 458da2e3ebdSchin { 459da2e3ebdSchin struct Rex_s* cont; 460da2e3ebdSchin struct Rex_s* ref; 461da2e3ebdSchin unsigned char* beg; 462da2e3ebdSchin int n; 463da2e3ebdSchin } Rep_catch_t; 464da2e3ebdSchin 465da2e3ebdSchin /* 466da2e3ebdSchin * data structure for an alternation of pure strings 467da2e3ebdSchin * son points to a subtree of all strings with a common 468da2e3ebdSchin * prefix ending in character c. sib links alternate 469da2e3ebdSchin * letters in the same position of a word. end=1 if 470da2e3ebdSchin * some word ends with c. the order of strings is 471da2e3ebdSchin * irrelevant, except long words must be investigated 472da2e3ebdSchin * before short ones. 473da2e3ebdSchin */ 474da2e3ebdSchin 475da2e3ebdSchin typedef struct Trie_node_s 476da2e3ebdSchin { 477da2e3ebdSchin unsigned char c; 478da2e3ebdSchin unsigned char end; 479da2e3ebdSchin struct Trie_node_s* son; 480da2e3ebdSchin struct Trie_node_s* sib; 481da2e3ebdSchin } Trie_node_t; 482da2e3ebdSchin 483da2e3ebdSchin typedef struct Trie_s 484da2e3ebdSchin { 485da2e3ebdSchin Trie_node_t** root; 486da2e3ebdSchin int min; 487da2e3ebdSchin int max; 488da2e3ebdSchin } Trie_t; 489da2e3ebdSchin 490da2e3ebdSchin /* 491da2e3ebdSchin * Rex_t is a node in a regular expression 492da2e3ebdSchin */ 493da2e3ebdSchin 494da2e3ebdSchin typedef struct Rex_s 495da2e3ebdSchin { 496da2e3ebdSchin unsigned char type; /* node type */ 497da2e3ebdSchin unsigned char marked; /* already marked */ 498da2e3ebdSchin short serial; /* subpattern number */ 499da2e3ebdSchin regflags_t flags; /* scoped flags */ 500da2e3ebdSchin int explicit; /* scoped explicit match*/ 501da2e3ebdSchin struct Rex_s* next; /* remaining parts */ 502da2e3ebdSchin int lo; /* lo dup count */ 503da2e3ebdSchin int hi; /* hi dup count */ 504da2e3ebdSchin unsigned char* map; /* fold and/or ccode map*/ 505da2e3ebdSchin union 506da2e3ebdSchin { 507da2e3ebdSchin Alt_catch_t alt_catch; /* alt catcher */ 508da2e3ebdSchin Bm_t bm; /* bm */ 509da2e3ebdSchin Behind_catch_t behind_catch; /* behind catcher */ 510da2e3ebdSchin Set_t* charclass; /* char class */ 511da2e3ebdSchin Collate_t collate; /* collation class */ 512da2e3ebdSchin Cond_t cond_catch; /* cond catcher */ 513da2e3ebdSchin Conj_left_t conj_left; /* conj left catcher */ 514da2e3ebdSchin Conj_right_t conj_right; /* conj right catcher */ 515da2e3ebdSchin void* data; /* data after Rex_t */ 516da2e3ebdSchin Exec_t exec; /* re.re_exec() args */ 517da2e3ebdSchin Group_t group; /* a|b or rep */ 518da2e3ebdSchin Group_catch_t group_catch; /* group catcher */ 519da2e3ebdSchin Neg_catch_t neg_catch; /* neg catcher */ 520da2e3ebdSchin Nest_t nest; /* nested match */ 521da2e3ebdSchin unsigned char onechar; /* single char */ 522da2e3ebdSchin Rep_catch_t rep_catch; /* rep catcher */ 523da2e3ebdSchin String_t string; /* string/kmp */ 524da2e3ebdSchin Trie_t trie; /* trie */ 525da2e3ebdSchin } re; 526da2e3ebdSchin } Rex_t; 527da2e3ebdSchin 528da2e3ebdSchin typedef struct reglib_s /* library private regex_t info */ 529da2e3ebdSchin { 530da2e3ebdSchin struct Rex_s* rex; /* compiled expression */ 531da2e3ebdSchin regdisc_t* disc; /* REG_DISCIPLINE discipline */ 532da2e3ebdSchin const regex_t* regex; /* from regexec */ 533da2e3ebdSchin unsigned char* beg; /* beginning of string */ 534da2e3ebdSchin unsigned char* end; /* end of string */ 535da2e3ebdSchin Vector_t* pos; /* posns of certain subpatterns */ 536da2e3ebdSchin Vector_t* bestpos; /* ditto for best match */ 537da2e3ebdSchin regmatch_t* match; /* subexrs in current match */ 538da2e3ebdSchin regmatch_t* best; /* ditto in best match yet */ 539da2e3ebdSchin Stk_pos_t stk; /* exec stack pos */ 540da2e3ebdSchin size_t min; /* minimum match length */ 541da2e3ebdSchin size_t nsub; /* internal re_nsub */ 542da2e3ebdSchin regflags_t flags; /* flags from regcomp() */ 543da2e3ebdSchin int error; /* last error */ 544da2e3ebdSchin int explicit; /* explicit match on this char */ 545da2e3ebdSchin int leading; /* leading match on this char */ 546da2e3ebdSchin int refs; /* regcomp()+regdup() references*/ 547da2e3ebdSchin Rex_t done; /* the last continuation */ 548da2e3ebdSchin regstat_t stats; /* for regstat() */ 549da2e3ebdSchin unsigned char fold[UCHAR_MAX+1]; /* REG_ICASE map */ 550da2e3ebdSchin unsigned char hard; /* hard comp */ 551da2e3ebdSchin unsigned char once; /* if 1st parse fails, quit */ 552da2e3ebdSchin unsigned char separate; /* cannot combine */ 553da2e3ebdSchin unsigned char stack; /* hard comp or exec */ 554da2e3ebdSchin unsigned char sub; /* re_sub is valid */ 555da2e3ebdSchin unsigned char test; /* debug/test bitmask */ 556da2e3ebdSchin } Env_t; 557da2e3ebdSchin 558da2e3ebdSchin typedef struct State_s /* shared state */ 559da2e3ebdSchin { 560da2e3ebdSchin regmatch_t nomatch; 561da2e3ebdSchin struct 562da2e3ebdSchin { 563da2e3ebdSchin unsigned char key; 564da2e3ebdSchin short val[15]; 565da2e3ebdSchin } escape[52]; 566da2e3ebdSchin short* magic[UCHAR_MAX+1]; 567da2e3ebdSchin regdisc_t disc; 568da2e3ebdSchin int fatal; 569da2e3ebdSchin int initialized; 570da2e3ebdSchin Dt_t* attrs; 571da2e3ebdSchin Dt_t* names; 572da2e3ebdSchin Dtdisc_t dtdisc; 573da2e3ebdSchin } State_t; 574da2e3ebdSchin 575da2e3ebdSchin extern State_t state; 576da2e3ebdSchin 577da2e3ebdSchin extern void* alloc(regdisc_t*, void*, size_t); 578da2e3ebdSchin extern regclass_t classfun(int); 579da2e3ebdSchin extern void drop(regdisc_t*, Rex_t*); 580da2e3ebdSchin extern int fatal(regdisc_t*, int, const char*); 581da2e3ebdSchin 582da2e3ebdSchin #endif 583