1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1985-2009 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * Phong Vo <kpv@research.att.com> * 20 * * 21 ***********************************************************************/ 22 #pragma prototyped 23 24 /* 25 * posix regex implementation 26 * 27 * based on Doug McIlroy's C++ implementation 28 * Knuth-Morris-Pratt adapted from Corman-Leiserson-Rivest 29 * Boyer-Moore from conversations with David Korn, Phong Vo, Andrew Hume 30 */ 31 32 #ifndef _REGLIB_H 33 #define _REGLIB_H 34 35 #define REG_VERSION_EXEC 20020509L 36 #define REG_VERSION_MAP 20030916L /* regdisc_t.re_map */ 37 38 #define re_info env 39 40 #define alloc _reg_alloc 41 #define classfun _reg_classfun 42 #define drop _reg_drop 43 #define fatal _reg_fatal 44 #define state _reg_state 45 46 typedef struct regsubop_s 47 { 48 int op; /* REG_SUB_LOWER,REG_SUB_UPPER */ 49 int off; /* re_rhs or match[] offset */ 50 int len; /* re_rhs len or len==0 match[] */ 51 } regsubop_t; 52 53 #define _REG_SUB_PRIVATE_ \ 54 char* re_cur; /* re_buf cursor */ \ 55 char* re_end; /* re_buf end */ \ 56 regsubop_t* re_ops; /* rhs ops */ \ 57 char re_rhs[1]; /* substitution rhs */ 58 59 #include <ast.h> 60 #include <cdt.h> 61 #include <stk.h> 62 63 #include "regex.h" 64 65 #include <ctype.h> 66 #include <errno.h> 67 68 #if _BLD_DEBUG && !defined(_AST_REGEX_DEBUG) 69 #define _AST_REGEX_DEBUG 1 70 #endif 71 72 #define MBSIZE(p) ((ast.tmp_int=mbsize(p))>0?ast.tmp_int:1) 73 74 #undef RE_DUP_MAX /* posix puts this in limits.h! */ 75 #define RE_DUP_MAX (INT_MAX/2-1) /* 2*RE_DUP_MAX won't overflow */ 76 #define RE_DUP_INF (RE_DUP_MAX+1) /* infinity, for * */ 77 #define BACK_REF_MAX 9 78 79 #define REG_COMP (REG_DELIMITED|REG_ESCAPE|REG_EXTENDED|REG_FIRST|REG_ICASE|REG_NOSUB|REG_NEWLINE|REG_SHELL|REG_AUGMENTED|REG_LEFT|REG_LITERAL|REG_MINIMAL|REG_MULTIREF|REG_NULL|REG_RIGHT|REG_LENIENT|REG_MUSTDELIM) 80 #define REG_EXEC (REG_ADVANCE|REG_INVERT|REG_NOTBOL|REG_NOTEOL|REG_STARTEND) 81 82 #define REX_NULL 0 /* null string (internal) */ 83 #define REX_ALT 1 /* a|b */ 84 #define REX_ALT_CATCH 2 /* REX_ALT catcher */ 85 #define REX_BACK 3 /* \1, \2, etc */ 86 #define REX_BEG 4 /* initial ^ */ 87 #define REX_BEG_STR 5 /* initial ^ w/ no newline */ 88 #define REX_BM 6 /* Boyer-Moore */ 89 #define REX_CAT 7 /* catenation catcher */ 90 #define REX_CLASS 8 /* [...] */ 91 #define REX_COLL_CLASS 9 /* collation order [...] */ 92 #define REX_CONJ 10 /* a&b */ 93 #define REX_CONJ_LEFT 11 /* REX_CONJ left catcher */ 94 #define REX_CONJ_RIGHT 12 /* REX_CONJ right catcher */ 95 #define REX_DONE 13 /* completed match (internal) */ 96 #define REX_DOT 14 /* . */ 97 #define REX_END 15 /* final $ */ 98 #define REX_END_STR 16 /* final $ before tail newline */ 99 #define REX_EXEC 17 /* call re.re_exec() */ 100 #define REX_FIN_STR 18 /* final $ w/ no newline */ 101 #define REX_GROUP 19 /* \(...\) */ 102 #define REX_GROUP_CATCH 20 /* REX_GROUP catcher */ 103 #define REX_GROUP_AHEAD 21 /* 0-width lookahead */ 104 #define REX_GROUP_AHEAD_CATCH 22 /* REX_GROUP_AHEAD catcher */ 105 #define REX_GROUP_AHEAD_NOT 23 /* inverted 0-width lookahead */ 106 #define REX_GROUP_BEHIND 24 /* 0-width lookbehind */ 107 #define REX_GROUP_BEHIND_CATCH 25 /* REX_GROUP_BEHIND catcher */ 108 #define REX_GROUP_BEHIND_NOT 26 /* inverted 0-width lookbehind */ 109 #define REX_GROUP_BEHIND_NOT_CATCH 27 /* REX_GROUP_BEHIND_NOT catcher */ 110 #define REX_GROUP_COND 28 /* conditional group */ 111 #define REX_GROUP_COND_CATCH 29 /* conditional group catcher */ 112 #define REX_GROUP_CUT 30 /* don't backtrack over this */ 113 #define REX_GROUP_CUT_CATCH 31 /* REX_GROUP_CUT catcher */ 114 #define REX_KMP 32 /* Knuth-Morris-Pratt */ 115 #define REX_NEG 33 /* negation */ 116 #define REX_NEG_CATCH 34 /* REX_NEG catcher */ 117 #define REX_NEST 35 /* nested match */ 118 #define REX_ONECHAR 36 /* a single-character literal */ 119 #define REX_REP 37 /* Kleene closure */ 120 #define REX_REP_CATCH 38 /* REX_REP catcher */ 121 #define REX_STRING 39 /* some chars */ 122 #define REX_TRIE 40 /* alternation of strings */ 123 #define REX_WBEG 41 /* \< */ 124 #define REX_WEND 42 /* \> */ 125 #define REX_WORD 43 /* word boundary */ 126 #define REX_WORD_NOT 44 /* not word boundary */ 127 128 #define T_META ((int)UCHAR_MAX+1) 129 #define T_STAR (T_META+0) 130 #define T_PLUS (T_META+1) 131 #define T_QUES (T_META+2) 132 #define T_BANG (T_META+3) 133 #define T_AT (T_META+4) 134 #define T_TILDE (T_META+5) 135 #define T_PERCENT (T_META+6) 136 #define T_LEFT (T_META+7) 137 #define T_OPEN (T_META+8) 138 #define T_CLOSE (T_OPEN+1) 139 #define T_RIGHT (T_OPEN+2) 140 #define T_CFLX (T_OPEN+3) 141 #define T_DOT (T_OPEN+4) 142 #define T_DOTSTAR (T_OPEN+5) 143 #define T_END (T_OPEN+6) 144 #define T_BAD (T_OPEN+7) 145 #define T_DOLL (T_OPEN+8) 146 #define T_BRA (T_OPEN+9) 147 #define T_BAR (T_OPEN+10) 148 #define T_AND (T_OPEN+11) 149 #define T_LT (T_OPEN+12) 150 #define T_GT (T_OPEN+13) 151 #define T_SLASHPLUS (T_OPEN+14) 152 #define T_GROUP (T_OPEN+15) 153 #define T_WORD (T_OPEN+16) 154 #define T_WORD_NOT (T_WORD+1) 155 #define T_BEG_STR (T_WORD+2) 156 #define T_END_STR (T_WORD+3) 157 #define T_FIN_STR (T_WORD+4) 158 #define T_ESCAPE (T_WORD+5) 159 #define T_ALNUM (T_WORD+6) 160 #define T_ALNUM_NOT (T_ALNUM+1) 161 #define T_DIGIT (T_ALNUM+2) 162 #define T_DIGIT_NOT (T_ALNUM+3) 163 #define T_SPACE (T_ALNUM+4) 164 #define T_SPACE_NOT (T_ALNUM+5) 165 #define T_BACK (T_ALNUM+6) 166 167 #define BRE 0 168 #define ERE 3 169 #define ARE 6 170 #define SRE 9 171 #define KRE 12 172 173 #define HIT SSIZE_MAX 174 175 #define bitclr(p,c) ((p)[((c)>>3)&037]&=(~(1<<((c)&07)))) 176 #define bitset(p,c) ((p)[((c)>>3)&037]|=(1<<((c)&07))) 177 #define bittst(p,c) ((p)[((c)>>3)&037]&(1<<((c)&07))) 178 179 #define setadd(p,c) bitset((p)->bits,c) 180 #define setclr(p,c) bitclr((p)->bits,c) 181 #define settst(p,c) bittst((p)->bits,c) 182 183 #if _hdr_wchar && _lib_wctype && _lib_iswctype 184 185 #include <stdio.h> /* because <wchar.h> includes it and we generate it */ 186 #include <wchar.h> 187 #if _hdr_wctype 188 #include <wctype.h> 189 #endif 190 191 #if !defined(iswblank) && !_lib_iswblank 192 #define _need_iswblank 1 193 #define iswblank(x) _reg_iswblank(x) 194 extern int _reg_iswblank(wint_t); 195 #endif 196 197 #if !defined(towupper) && !_lib_towupper 198 #define towupper(x) toupper(x) 199 #endif 200 201 #if !defined(towlower) && !_lib_towlower 202 #define towlower(x) tolower(x) 203 #endif 204 205 #else 206 207 #undef _lib_wctype 208 209 #ifndef iswalnum 210 #define iswalnum(x) isalnum(x) 211 #endif 212 #ifndef iswalpha 213 #define iswalpha(x) isalpha(x) 214 #endif 215 #ifndef iswcntrl 216 #define iswcntrl(x) iscntrl(x) 217 #endif 218 #ifndef iswdigit 219 #define iswdigit(x) isdigit(x) 220 #endif 221 #ifndef iswgraph 222 #define iswgraph(x) isgraph(x) 223 #endif 224 #ifndef iswlower 225 #define iswlower(x) islower(x) 226 #endif 227 #ifndef iswprint 228 #define iswprint(x) isprint(x) 229 #endif 230 #ifndef iswpunct 231 #define iswpunct(x) ispunct(x) 232 #endif 233 #ifndef iswspace 234 #define iswspace(x) isspace(x) 235 #endif 236 #ifndef iswupper 237 #define iswupper(x) isupper(x) 238 #endif 239 #ifndef iswxdigit 240 #define iswxdigit(x) isxdigit(x) 241 #endif 242 243 #ifndef towlower 244 #define towlower(x) tolower(x) 245 #endif 246 #ifndef towupper 247 #define towupper(x) toupper(x) 248 #endif 249 250 #endif 251 252 #ifndef iswblank 253 #define iswblank(x) ((x)==' '||(x)=='\t') 254 #endif 255 256 #ifndef iswgraph 257 #define iswgraph(x) (iswprint(x)&&!iswblank(x)) 258 #endif 259 260 #define isword(x) (isalnum(x)||(x)=='_') 261 262 /* 263 * collation element support 264 */ 265 266 #define COLL_KEY_MAX 32 267 268 #if COLL_KEY_MAX < MB_LEN_MAX 269 #undef COLL_KEY_MAX 270 #define COLL_KEY_MAX MB_LEN_MAX 271 #endif 272 273 typedef unsigned char Ckey_t[COLL_KEY_MAX+1]; 274 275 #define COLL_end 0 276 #define COLL_call 1 277 #define COLL_char 2 278 #define COLL_range 3 279 #define COLL_range_lc 4 280 #define COLL_range_uc 5 281 282 typedef struct Celt_s 283 { 284 short typ; 285 short min; 286 short max; 287 regclass_t fun; 288 Ckey_t beg; 289 Ckey_t end; 290 } Celt_t; 291 292 /* 293 * private stuff hanging off regex_t 294 */ 295 296 typedef struct Stk_pos_s 297 { 298 off_t offset; 299 char* base; 300 } Stk_pos_t; 301 302 typedef struct Vector_s 303 { 304 Stk_t* stk; /* stack pointer */ 305 char* vec; /* the data */ 306 int inc; /* growth increment */ 307 int siz; /* element size */ 308 int max; /* max index */ 309 int cur; /* current index -- user domain */ 310 } Vector_t; 311 312 /* 313 * Rex_t subtypes 314 */ 315 316 typedef struct Cond_s 317 { 318 unsigned char* beg; /* beginning of next match */ 319 struct Rex_s* next[2]; /* 0:no 1:yes next pattern */ 320 struct Rex_s* cont; /* right catcher */ 321 int yes; /* yes condition hit */ 322 } Cond_t; 323 324 typedef struct Conj_left_s 325 { 326 unsigned char* beg; /* beginning of left match */ 327 struct Rex_s* right; /* right pattern */ 328 struct Rex_s* cont; /* right catcher */ 329 } Conj_left_t; 330 331 typedef struct Conj_right_s 332 { 333 unsigned char* end; /* end of left match */ 334 struct Rex_s* cont; /* ambient continuation */ 335 } Conj_right_t; 336 337 typedef unsigned int Bm_mask_t; 338 339 typedef struct Bm_s 340 { 341 Bm_mask_t** mask; 342 size_t* skip; 343 size_t* fail; 344 size_t size; 345 ssize_t back; 346 ssize_t left; 347 ssize_t right; 348 size_t complete; 349 } Bm_t; 350 351 typedef struct String_s 352 { 353 int* fail; 354 unsigned char* base; 355 size_t size; 356 } String_t; 357 358 typedef struct Set_s 359 { 360 unsigned char bits[(UCHAR_MAX+1)/CHAR_BIT]; 361 } Set_t; 362 363 typedef struct Collate_s 364 { 365 int invert; 366 Celt_t* elements; 367 } Collate_t; 368 369 typedef struct Binary_s 370 { 371 struct Rex_s* left; 372 struct Rex_s* right; 373 int serial; 374 } Binary_t; 375 376 typedef struct Group_s 377 { 378 int number; /* group number */ 379 int last; /* last contained group number */ 380 int size; /* lookbehind size */ 381 int back; /* backreferenced */ 382 regflags_t flags; /* group flags */ 383 union 384 { 385 Binary_t binary; 386 struct Rex_s* rex; 387 } expr; 388 } Group_t; 389 390 typedef struct Exec_s 391 { 392 void* data; 393 const char* text; 394 size_t size; 395 } Exec_t; 396 397 #define REX_NEST_open 0x01 398 #define REX_NEST_close 0x02 399 #define REX_NEST_escape 0x04 400 #define REX_NEST_quote 0x08 401 #define REX_NEST_literal 0x10 402 #define REX_NEST_delimiter 0x20 403 #define REX_NEST_terminator 0x40 404 #define REX_NEST_separator 0x80 405 406 #define REX_NEST_SHIFT 8 407 408 typedef struct Nest_s 409 { 410 int primary; 411 unsigned short none; /* for Nest_t.type[-1] */ 412 unsigned short type[1]; 413 } Nest_t; 414 415 /* 416 * REX_ALT catcher, solely to get control at the end of an 417 * alternative to keep records for comparing matches. 418 */ 419 420 typedef struct Alt_catch_s 421 { 422 struct Rex_s* cont; 423 } Alt_catch_t; 424 425 typedef struct Group_catch_s 426 { 427 struct Rex_s* cont; 428 regoff_t* eo; 429 } Group_catch_t; 430 431 typedef struct Behind_catch_s 432 { 433 struct Rex_s* cont; 434 unsigned char* beg; 435 unsigned char* end; 436 } Behind_catch_t; 437 438 /* 439 * REX_NEG catcher determines what string lengths can be matched, 440 * then Neg investigates continuations of other lengths. 441 * This is inefficient. For !POSITIONS expressions, we can do better: 442 * since matches to rex will be enumerated in decreasing order, 443 * we can investigate continuations whenever a length is skipped. 444 */ 445 446 typedef struct Neg_catch_s 447 { 448 unsigned char* beg; 449 unsigned char* index; 450 } Neg_catch_t; 451 452 /* 453 * REX_REP catcher. One is created on the stack for 454 * each iteration of a complex repetition. 455 */ 456 457 typedef struct Rep_catch_s 458 { 459 struct Rex_s* cont; 460 struct Rex_s* ref; 461 unsigned char* beg; 462 int n; 463 } Rep_catch_t; 464 465 /* 466 * data structure for an alternation of pure strings 467 * son points to a subtree of all strings with a common 468 * prefix ending in character c. sib links alternate 469 * letters in the same position of a word. end=1 if 470 * some word ends with c. the order of strings is 471 * irrelevant, except long words must be investigated 472 * before short ones. 473 */ 474 475 typedef struct Trie_node_s 476 { 477 unsigned char c; 478 unsigned char end; 479 struct Trie_node_s* son; 480 struct Trie_node_s* sib; 481 } Trie_node_t; 482 483 typedef struct Trie_s 484 { 485 Trie_node_t** root; 486 int min; 487 int max; 488 } Trie_t; 489 490 /* 491 * Rex_t is a node in a regular expression 492 */ 493 494 typedef struct Rex_s 495 { 496 unsigned char type; /* node type */ 497 unsigned char marked; /* already marked */ 498 short serial; /* subpattern number */ 499 regflags_t flags; /* scoped flags */ 500 int explicit; /* scoped explicit match*/ 501 struct Rex_s* next; /* remaining parts */ 502 int lo; /* lo dup count */ 503 int hi; /* hi dup count */ 504 unsigned char* map; /* fold and/or ccode map*/ 505 union 506 { 507 Alt_catch_t alt_catch; /* alt catcher */ 508 Bm_t bm; /* bm */ 509 Behind_catch_t behind_catch; /* behind catcher */ 510 Set_t* charclass; /* char class */ 511 Collate_t collate; /* collation class */ 512 Cond_t cond_catch; /* cond catcher */ 513 Conj_left_t conj_left; /* conj left catcher */ 514 Conj_right_t conj_right; /* conj right catcher */ 515 void* data; /* data after Rex_t */ 516 Exec_t exec; /* re.re_exec() args */ 517 Group_t group; /* a|b or rep */ 518 Group_catch_t group_catch; /* group catcher */ 519 Neg_catch_t neg_catch; /* neg catcher */ 520 Nest_t nest; /* nested match */ 521 unsigned char onechar; /* single char */ 522 Rep_catch_t rep_catch; /* rep catcher */ 523 String_t string; /* string/kmp */ 524 Trie_t trie; /* trie */ 525 } re; 526 } Rex_t; 527 528 typedef struct reglib_s /* library private regex_t info */ 529 { 530 struct Rex_s* rex; /* compiled expression */ 531 regdisc_t* disc; /* REG_DISCIPLINE discipline */ 532 const regex_t* regex; /* from regexec */ 533 unsigned char* beg; /* beginning of string */ 534 unsigned char* end; /* end of string */ 535 Vector_t* pos; /* posns of certain subpatterns */ 536 Vector_t* bestpos; /* ditto for best match */ 537 regmatch_t* match; /* subexrs in current match */ 538 regmatch_t* best; /* ditto in best match yet */ 539 Stk_pos_t stk; /* exec stack pos */ 540 size_t min; /* minimum match length */ 541 size_t nsub; /* internal re_nsub */ 542 regflags_t flags; /* flags from regcomp() */ 543 int error; /* last error */ 544 int explicit; /* explicit match on this char */ 545 int leading; /* leading match on this char */ 546 int refs; /* regcomp()+regdup() references*/ 547 Rex_t done; /* the last continuation */ 548 regstat_t stats; /* for regstat() */ 549 unsigned char fold[UCHAR_MAX+1]; /* REG_ICASE map */ 550 unsigned char hard; /* hard comp */ 551 unsigned char once; /* if 1st parse fails, quit */ 552 unsigned char separate; /* cannot combine */ 553 unsigned char stack; /* hard comp or exec */ 554 unsigned char sub; /* re_sub is valid */ 555 unsigned char test; /* debug/test bitmask */ 556 } Env_t; 557 558 typedef struct State_s /* shared state */ 559 { 560 regmatch_t nomatch; 561 struct 562 { 563 unsigned char key; 564 short val[15]; 565 } escape[52]; 566 short* magic[UCHAR_MAX+1]; 567 regdisc_t disc; 568 int fatal; 569 int initialized; 570 Dt_t* attrs; 571 Dt_t* names; 572 Dtdisc_t dtdisc; 573 } State_t; 574 575 extern State_t state; 576 577 extern void* alloc(regdisc_t*, void*, size_t); 578 extern regclass_t classfun(int); 579 extern void drop(regdisc_t*, Rex_t*); 580 extern int fatal(regdisc_t*, int, const char*); 581 582 #endif 583