1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1985-2008 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * Phong Vo <kpv@research.att.com> * 20 * * 21 ***********************************************************************/ 22 #pragma prototyped 23 24 /* 25 * posix regex implementation 26 * 27 * based on Doug McIlroy's C++ implementation 28 * Knuth-Morris-Pratt adapted from Corman-Leiserson-Rivest 29 * Boyer-Moore from conversations with David Korn, Phong Vo, Andrew Hume 30 */ 31 32 #ifndef _REGLIB_H 33 #define _REGLIB_H 34 35 #define REG_VERSION_EXEC 20020509L 36 #define REG_VERSION_MAP 20030916L /* regdisc_t.re_map */ 37 38 #define re_info env 39 40 #define alloc _reg_alloc 41 #define classfun _reg_classfun 42 #define drop _reg_drop 43 #define fatal _reg_fatal 44 #define state _reg_state 45 46 typedef struct regsubop_s 47 { 48 int op; /* REG_SUB_LOWER,REG_SUB_UPPER */ 49 int off; /* re_rhs or match[] offset */ 50 int len; /* re_rhs len or len==0 match[] */ 51 } regsubop_t; 52 53 #define _REG_SUB_PRIVATE_ \ 54 char* re_cur; /* re_buf cursor */ \ 55 char* re_end; /* re_buf end */ \ 56 regsubop_t* re_ops; /* rhs ops */ \ 57 char re_rhs[1]; /* substitution rhs */ 58 59 #include <ast.h> 60 #include <cdt.h> 61 #include <stk.h> 62 63 #include "regex.h" 64 65 #include <ctype.h> 66 #include <errno.h> 67 68 #define MBSIZE(p) ((ast.tmp_int=mbsize(p))>0?ast.tmp_int:1) 69 70 #undef RE_DUP_MAX /* posix puts this in limits.h! */ 71 #define RE_DUP_MAX (INT_MAX/2-1) /* 2*RE_DUP_MAX won't overflow */ 72 #define RE_DUP_INF (RE_DUP_MAX+1) /* infinity, for * */ 73 #define BACK_REF_MAX 9 74 75 #define REG_COMP (REG_DELIMITED|REG_ESCAPE|REG_EXTENDED|REG_FIRST|REG_ICASE|REG_NOSUB|REG_NEWLINE|REG_SHELL|REG_AUGMENTED|REG_LEFT|REG_LITERAL|REG_MINIMAL|REG_MULTIREF|REG_NULL|REG_RIGHT|REG_LENIENT|REG_MUSTDELIM) 76 #define REG_EXEC (REG_ADVANCE|REG_INVERT|REG_NOTBOL|REG_NOTEOL|REG_STARTEND) 77 78 #define REX_NULL 0 /* null string (internal) */ 79 #define REX_ALT 1 /* a|b */ 80 #define REX_ALT_CATCH 2 /* REX_ALT catcher */ 81 #define REX_BACK 3 /* \1, \2, etc */ 82 #define REX_BEG 4 /* initial ^ */ 83 #define REX_BEG_STR 5 /* initial ^ w/ no newline */ 84 #define REX_BM 6 /* Boyer-Moore */ 85 #define REX_CAT 7 /* catenation catcher */ 86 #define REX_CLASS 8 /* [...] */ 87 #define REX_COLL_CLASS 9 /* collation order [...] */ 88 #define REX_CONJ 10 /* a&b */ 89 #define REX_CONJ_LEFT 11 /* REX_CONJ left catcher */ 90 #define REX_CONJ_RIGHT 12 /* REX_CONJ right catcher */ 91 #define REX_DONE 13 /* completed match (internal) */ 92 #define REX_DOT 14 /* . */ 93 #define REX_END 15 /* final $ */ 94 #define REX_END_STR 16 /* final $ before tail newline */ 95 #define REX_EXEC 17 /* call re.re_exec() */ 96 #define REX_FIN_STR 18 /* final $ w/ no newline */ 97 #define REX_GROUP 19 /* \(...\) */ 98 #define REX_GROUP_CATCH 20 /* REX_GROUP catcher */ 99 #define REX_GROUP_AHEAD 21 /* 0-width lookahead */ 100 #define REX_GROUP_AHEAD_CATCH 22 /* REX_GROUP_AHEAD catcher */ 101 #define REX_GROUP_AHEAD_NOT 23 /* inverted 0-width lookahead */ 102 #define REX_GROUP_BEHIND 24 /* 0-width lookbehind */ 103 #define REX_GROUP_BEHIND_CATCH 25 /* REX_GROUP_BEHIND catcher */ 104 #define REX_GROUP_BEHIND_NOT 26 /* inverted 0-width lookbehind */ 105 #define REX_GROUP_BEHIND_NOT_CATCH 27 /* REX_GROUP_BEHIND_NOT catcher */ 106 #define REX_GROUP_COND 28 /* conditional group */ 107 #define REX_GROUP_COND_CATCH 29 /* conditional group catcher */ 108 #define REX_GROUP_CUT 30 /* don't backtrack over this */ 109 #define REX_GROUP_CUT_CATCH 31 /* REX_GROUP_CUT catcher */ 110 #define REX_KMP 32 /* Knuth-Morris-Pratt */ 111 #define REX_NEG 33 /* negation */ 112 #define REX_NEG_CATCH 34 /* REX_NEG catcher */ 113 #define REX_NEST 35 /* nested match */ 114 #define REX_ONECHAR 36 /* a single-character literal */ 115 #define REX_REP 37 /* Kleene closure */ 116 #define REX_REP_CATCH 38 /* REX_REP catcher */ 117 #define REX_STRING 39 /* some chars */ 118 #define REX_TRIE 40 /* alternation of strings */ 119 #define REX_WBEG 41 /* \< */ 120 #define REX_WEND 42 /* \> */ 121 #define REX_WORD 43 /* word boundary */ 122 #define REX_WORD_NOT 44 /* not word boundary */ 123 124 #define T_META ((int)UCHAR_MAX+1) 125 #define T_STAR (T_META+0) 126 #define T_PLUS (T_META+1) 127 #define T_QUES (T_META+2) 128 #define T_BANG (T_META+3) 129 #define T_AT (T_META+4) 130 #define T_TILDE (T_META+5) 131 #define T_PERCENT (T_META+6) 132 #define T_LEFT (T_META+7) 133 #define T_OPEN (T_META+8) 134 #define T_CLOSE (T_OPEN+1) 135 #define T_RIGHT (T_OPEN+2) 136 #define T_CFLX (T_OPEN+3) 137 #define T_DOT (T_OPEN+4) 138 #define T_DOTSTAR (T_OPEN+5) 139 #define T_END (T_OPEN+6) 140 #define T_BAD (T_OPEN+7) 141 #define T_DOLL (T_OPEN+8) 142 #define T_BRA (T_OPEN+9) 143 #define T_BAR (T_OPEN+10) 144 #define T_AND (T_OPEN+11) 145 #define T_LT (T_OPEN+12) 146 #define T_GT (T_OPEN+13) 147 #define T_SLASHPLUS (T_OPEN+14) 148 #define T_GROUP (T_OPEN+15) 149 #define T_WORD (T_OPEN+16) 150 #define T_WORD_NOT (T_WORD+1) 151 #define T_BEG_STR (T_WORD+2) 152 #define T_END_STR (T_WORD+3) 153 #define T_FIN_STR (T_WORD+4) 154 #define T_ESCAPE (T_WORD+5) 155 #define T_ALNUM (T_WORD+6) 156 #define T_ALNUM_NOT (T_ALNUM+1) 157 #define T_DIGIT (T_ALNUM+2) 158 #define T_DIGIT_NOT (T_ALNUM+3) 159 #define T_SPACE (T_ALNUM+4) 160 #define T_SPACE_NOT (T_ALNUM+5) 161 #define T_BACK (T_ALNUM+6) 162 163 #define BRE 0 164 #define ERE 3 165 #define ARE 6 166 #define SRE 9 167 #define KRE 12 168 169 #define HIT SSIZE_MAX 170 171 #define bitclr(p,c) ((p)[((c)>>3)&037]&=(~(1<<((c)&07)))) 172 #define bitset(p,c) ((p)[((c)>>3)&037]|=(1<<((c)&07))) 173 #define bittst(p,c) ((p)[((c)>>3)&037]&(1<<((c)&07))) 174 175 #define setadd(p,c) bitset((p)->bits,c) 176 #define setclr(p,c) bitclr((p)->bits,c) 177 #define settst(p,c) bittst((p)->bits,c) 178 179 #if _hdr_wchar && _lib_wctype && _lib_iswctype 180 181 #include <stdio.h> /* because <wchar.h> includes it and we generate it */ 182 #include <wchar.h> 183 #if _hdr_wctype 184 #include <wctype.h> 185 #endif 186 187 #if !defined(iswblank) && !_lib_iswblank 188 #define _need_iswblank 1 189 #define iswblank(x) _reg_iswblank(x) 190 extern int _reg_iswblank(wint_t); 191 #endif 192 193 #if !defined(towupper) && !_lib_towupper 194 #define towupper(x) toupper(x) 195 #endif 196 197 #if !defined(towlower) && !_lib_towlower 198 #define towlower(x) tolower(x) 199 #endif 200 201 #else 202 203 #undef _lib_wctype 204 205 #ifndef iswalnum 206 #define iswalnum(x) isalnum(x) 207 #endif 208 #ifndef iswalpha 209 #define iswalpha(x) isalpha(x) 210 #endif 211 #ifndef iswcntrl 212 #define iswcntrl(x) iscntrl(x) 213 #endif 214 #ifndef iswdigit 215 #define iswdigit(x) isdigit(x) 216 #endif 217 #ifndef iswgraph 218 #define iswgraph(x) isgraph(x) 219 #endif 220 #ifndef iswlower 221 #define iswlower(x) islower(x) 222 #endif 223 #ifndef iswprint 224 #define iswprint(x) isprint(x) 225 #endif 226 #ifndef iswpunct 227 #define iswpunct(x) ispunct(x) 228 #endif 229 #ifndef iswspace 230 #define iswspace(x) isspace(x) 231 #endif 232 #ifndef iswupper 233 #define iswupper(x) isupper(x) 234 #endif 235 #ifndef iswxdigit 236 #define iswxdigit(x) isxdigit(x) 237 #endif 238 239 #ifndef towlower 240 #define towlower(x) tolower(x) 241 #endif 242 #ifndef towupper 243 #define towupper(x) toupper(x) 244 #endif 245 246 #endif 247 248 #ifndef iswblank 249 #define iswblank(x) ((x)==' '||(x)=='\t') 250 #endif 251 252 #ifndef iswgraph 253 #define iswgraph(x) (iswprint(x)&&!iswblank(x)) 254 #endif 255 256 #define isword(x) (isalnum(x)||(x)=='_') 257 258 /* 259 * collation element support 260 */ 261 262 #define COLL_KEY_MAX 32 263 264 #if COLL_KEY_MAX < MB_LEN_MAX 265 #undef COLL_KEY_MAX 266 #define COLL_KEY_MAX MB_LEN_MAX 267 #endif 268 269 typedef unsigned char Ckey_t[COLL_KEY_MAX+1]; 270 271 #define COLL_end 0 272 #define COLL_call 1 273 #define COLL_char 2 274 #define COLL_range 3 275 #define COLL_range_lc 4 276 #define COLL_range_uc 5 277 278 typedef struct Celt_s 279 { 280 short typ; 281 short min; 282 short max; 283 regclass_t fun; 284 Ckey_t beg; 285 Ckey_t end; 286 } Celt_t; 287 288 /* 289 * private stuff hanging off regex_t 290 */ 291 292 typedef struct Stk_pos_s 293 { 294 off_t offset; 295 char* base; 296 } Stk_pos_t; 297 298 typedef struct Vector_s 299 { 300 Stk_t* stk; /* stack pointer */ 301 char* vec; /* the data */ 302 int inc; /* growth increment */ 303 int siz; /* element size */ 304 int max; /* max index */ 305 int cur; /* current index -- user domain */ 306 } Vector_t; 307 308 /* 309 * Rex_t subtypes 310 */ 311 312 typedef struct Cond_s 313 { 314 unsigned char* beg; /* beginning of next match */ 315 struct Rex_s* next[2]; /* 0:no 1:yes next pattern */ 316 struct Rex_s* cont; /* right catcher */ 317 int yes; /* yes condition hit */ 318 } Cond_t; 319 320 typedef struct Conj_left_s 321 { 322 unsigned char* beg; /* beginning of left match */ 323 struct Rex_s* right; /* right pattern */ 324 struct Rex_s* cont; /* right catcher */ 325 } Conj_left_t; 326 327 typedef struct Conj_right_s 328 { 329 unsigned char* end; /* end of left match */ 330 struct Rex_s* cont; /* ambient continuation */ 331 } Conj_right_t; 332 333 typedef unsigned int Bm_mask_t; 334 335 typedef struct Bm_s 336 { 337 Bm_mask_t** mask; 338 size_t* skip; 339 size_t* fail; 340 size_t size; 341 ssize_t back; 342 ssize_t left; 343 ssize_t right; 344 size_t complete; 345 } Bm_t; 346 347 typedef struct String_s 348 { 349 int* fail; 350 unsigned char* base; 351 size_t size; 352 } String_t; 353 354 typedef struct Set_s 355 { 356 unsigned char bits[(UCHAR_MAX+1)/CHAR_BIT]; 357 } Set_t; 358 359 typedef struct Collate_s 360 { 361 int invert; 362 Celt_t* elements; 363 } Collate_t; 364 365 typedef struct Binary_s 366 { 367 struct Rex_s* left; 368 struct Rex_s* right; 369 int serial; 370 } Binary_t; 371 372 typedef struct Group_s 373 { 374 int number; /* group number */ 375 int last; /* last contained group number */ 376 int size; /* lookbehind size */ 377 int back; /* backreferenced */ 378 regflags_t flags; /* group flags */ 379 union 380 { 381 Binary_t binary; 382 struct Rex_s* rex; 383 } expr; 384 } Group_t; 385 386 typedef struct Exec_s 387 { 388 void* data; 389 const char* text; 390 size_t size; 391 } Exec_t; 392 393 #define REX_NEST_open 0x01 394 #define REX_NEST_close 0x02 395 #define REX_NEST_escape 0x04 396 #define REX_NEST_quote 0x08 397 #define REX_NEST_literal 0x10 398 #define REX_NEST_delimiter 0x20 399 #define REX_NEST_terminator 0x40 400 #define REX_NEST_separator 0x80 401 402 #define REX_NEST_SHIFT 8 403 404 typedef struct Nest_s 405 { 406 int primary; 407 unsigned short none; /* for Nest_t.type[-1] */ 408 unsigned short type[1]; 409 } Nest_t; 410 411 /* 412 * REX_ALT catcher, solely to get control at the end of an 413 * alternative to keep records for comparing matches. 414 */ 415 416 typedef struct Alt_catch_s 417 { 418 struct Rex_s* cont; 419 } Alt_catch_t; 420 421 typedef struct Group_catch_s 422 { 423 struct Rex_s* cont; 424 regoff_t* eo; 425 } Group_catch_t; 426 427 typedef struct Behind_catch_s 428 { 429 struct Rex_s* cont; 430 unsigned char* beg; 431 unsigned char* end; 432 } Behind_catch_t; 433 434 /* 435 * REX_NEG catcher determines what string lengths can be matched, 436 * then Neg investigates continuations of other lengths. 437 * This is inefficient. For !POSITIONS expressions, we can do better: 438 * since matches to rex will be enumerated in decreasing order, 439 * we can investigate continuations whenever a length is skipped. 440 */ 441 442 typedef struct Neg_catch_s 443 { 444 unsigned char* beg; 445 unsigned char* index; 446 } Neg_catch_t; 447 448 /* 449 * REX_REP catcher. One is created on the stack for 450 * each iteration of a complex repetition. 451 */ 452 453 typedef struct Rep_catch_s 454 { 455 struct Rex_s* cont; 456 struct Rex_s* ref; 457 unsigned char* beg; 458 int n; 459 } Rep_catch_t; 460 461 /* 462 * data structure for an alternation of pure strings 463 * son points to a subtree of all strings with a common 464 * prefix ending in character c. sib links alternate 465 * letters in the same position of a word. end=1 if 466 * some word ends with c. the order of strings is 467 * irrelevant, except long words must be investigated 468 * before short ones. 469 */ 470 471 typedef struct Trie_node_s 472 { 473 unsigned char c; 474 unsigned char end; 475 struct Trie_node_s* son; 476 struct Trie_node_s* sib; 477 } Trie_node_t; 478 479 typedef struct Trie_s 480 { 481 Trie_node_t** root; 482 int min; 483 int max; 484 } Trie_t; 485 486 /* 487 * Rex_t is a node in a regular expression 488 */ 489 490 typedef struct Rex_s 491 { 492 unsigned char type; /* node type */ 493 unsigned char marked; /* already marked */ 494 short serial; /* subpattern number */ 495 regflags_t flags; /* scoped flags */ 496 int explicit; /* scoped explicit match*/ 497 struct Rex_s* next; /* remaining parts */ 498 int lo; /* lo dup count */ 499 int hi; /* hi dup count */ 500 unsigned char* map; /* fold and/or ccode map*/ 501 union 502 { 503 Alt_catch_t alt_catch; /* alt catcher */ 504 Bm_t bm; /* bm */ 505 Behind_catch_t behind_catch; /* behind catcher */ 506 Set_t* charclass; /* char class */ 507 Collate_t collate; /* collation class */ 508 Cond_t cond_catch; /* cond catcher */ 509 Conj_left_t conj_left; /* conj left catcher */ 510 Conj_right_t conj_right; /* conj right catcher */ 511 void* data; /* data after Rex_t */ 512 Exec_t exec; /* re.re_exec() args */ 513 Group_t group; /* a|b or rep */ 514 Group_catch_t group_catch; /* group catcher */ 515 Neg_catch_t neg_catch; /* neg catcher */ 516 Nest_t nest; /* nested match */ 517 unsigned char onechar; /* single char */ 518 Rep_catch_t rep_catch; /* rep catcher */ 519 String_t string; /* string/kmp */ 520 Trie_t trie; /* trie */ 521 } re; 522 } Rex_t; 523 524 typedef struct reglib_s /* library private regex_t info */ 525 { 526 struct Rex_s* rex; /* compiled expression */ 527 regdisc_t* disc; /* REG_DISCIPLINE discipline */ 528 const regex_t* regex; /* from regexec */ 529 unsigned char* beg; /* beginning of string */ 530 unsigned char* end; /* end of string */ 531 Vector_t* pos; /* posns of certain subpatterns */ 532 Vector_t* bestpos; /* ditto for best match */ 533 regmatch_t* match; /* subexrs in current match */ 534 regmatch_t* best; /* ditto in best match yet */ 535 Stk_pos_t stk; /* exec stack pos */ 536 size_t min; /* minimum match length */ 537 size_t nsub; /* internal re_nsub */ 538 regflags_t flags; /* flags from regcomp() */ 539 int error; /* last error */ 540 int explicit; /* explicit match on this char */ 541 int leading; /* leading match on this char */ 542 int refs; /* regcomp()+regdup() references*/ 543 Rex_t done; /* the last continuation */ 544 regstat_t stats; /* for regstat() */ 545 unsigned char fold[UCHAR_MAX+1]; /* REG_ICASE map */ 546 unsigned char hard; /* hard comp */ 547 unsigned char once; /* if 1st parse fails, quit */ 548 unsigned char separate; /* cannot combine */ 549 unsigned char stack; /* hard comp or exec */ 550 unsigned char sub; /* re_sub is valid */ 551 unsigned char test; /* debug/test bitmask */ 552 } Env_t; 553 554 typedef struct State_s /* shared state */ 555 { 556 regmatch_t nomatch; 557 struct 558 { 559 unsigned char key; 560 short val[15]; 561 } escape[52]; 562 short* magic[UCHAR_MAX+1]; 563 regdisc_t disc; 564 int fatal; 565 int initialized; 566 Dt_t* attrs; 567 Dt_t* names; 568 Dtdisc_t dtdisc; 569 } State_t; 570 571 extern State_t state; 572 573 extern void* alloc(regdisc_t*, void*, size_t); 574 extern regclass_t classfun(int); 575 extern void drop(regdisc_t*, Rex_t*); 576 extern int fatal(regdisc_t*, int, const char*); 577 578 #endif 579