1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1985-2007 AT&T Knowledge Ventures * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Knowledge Ventures * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * Phong Vo <kpv@research.att.com> * 20 * * 21 ***********************************************************************/ 22 #pragma prototyped 23 24 /* 25 * posix regex executor 26 * single sized-record interface 27 */ 28 29 #include "reglib.h" 30 31 #if _AST_REGEX_DEBUG 32 33 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) 34 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) 35 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_exec_debug")) debug |= strtoul(t, NiL, 0); } } while (0) 36 37 static unsigned long debug; 38 static unsigned long debug_flag; 39 40 static const char* rexnames[] = 41 { 42 "REX_NULL", 43 "REX_ALT", 44 "REX_ALT_CATCH", 45 "REX_BACK", 46 "REX_BEG", 47 "REX_BEG_STR", 48 "REX_BM", 49 "REX_CAT", 50 "REX_CLASS", 51 "REX_COLL_CLASS", 52 "REX_CONJ", 53 "REX_CONJ_LEFT", 54 "REX_CONJ_RIGHT", 55 "REX_DONE", 56 "REX_DOT", 57 "REX_END", 58 "REX_END_STR", 59 "REX_EXEC", 60 "REX_FIN_STR", 61 "REX_GROUP", 62 "REX_GROUP_CATCH", 63 "REX_GROUP_AHEAD", 64 "REX_GROUP_AHEAD_CATCH", 65 "REX_GROUP_AHEAD_NOT", 66 "REX_GROUP_BEHIND", 67 "REX_GROUP_BEHIND_CATCH", 68 "REX_GROUP_BEHIND_NOT", 69 "REX_GROUP_BEHIND_NOT_CATCH", 70 "REX_GROUP_COND", 71 "REX_GROUP_COND_CATCH", 72 "REX_GROUP_CUT", 73 "REX_GROUP_CUT_CATCH", 74 "REX_KMP", 75 "REX_NEG", 76 "REX_NEG_CATCH", 77 "REX_NEST", 78 "REX_ONECHAR", 79 "REX_REP", 80 "REX_REP_CATCH", 81 "REX_STRING", 82 "REX_TRIE", 83 "REX_WBEG", 84 "REX_WEND", 85 "REX_WORD", 86 "REX_WORD_NOT" 87 }; 88 89 static const char* rexname(Rex_t* rex) 90 { 91 if (!rex) 92 return "NIL"; 93 if (rex->type >= elementsof(rexnames)) 94 return "ERROR"; 95 return rexnames[rex->type]; 96 } 97 98 #else 99 100 #define DEBUG_INIT() 101 #define DEBUG_TEST(f,y,n) (n) 102 #define DEBUG_CODE(f,y,n) do {n} while(0) 103 104 #endif 105 106 #define BEG_ALT 1 /* beginning of an alt */ 107 #define BEG_ONE 2 /* beginning of one iteration of a rep */ 108 #define BEG_REP 3 /* beginning of a repetition */ 109 #define BEG_SUB 4 /* beginning of a subexpression */ 110 #define END_ANY 5 /* end of any of above */ 111 112 /* 113 * returns from parse() 114 */ 115 116 #define NONE 0 /* no parse found */ 117 #define GOOD 1 /* some parse was found */ 118 #define CUT 2 /* no match and no backtrack */ 119 #define BEST 3 /* an unbeatable parse was found */ 120 #define BAD 4 /* error ocurred */ 121 122 /* 123 * REG_SHELL_DOT test 124 */ 125 126 #define LEADING(e,r,s) (*(s)==(e)->leading&&((s)==(e)->beg||*((s)-1)==(r)->explicit)) 127 128 /* 129 * Pos_t is for comparing parses. An entry is made in the 130 * array at the beginning and at the end of each Group_t, 131 * each iteration in a Group_t, and each Binary_t. 132 */ 133 134 typedef struct 135 { 136 unsigned char* p; /* where in string */ 137 size_t length; /* length in string */ 138 short serial; /* preorder subpattern number */ 139 short be; /* which end of pair */ 140 } Pos_t; 141 142 /* ===== begin library support ===== */ 143 144 #define vector(t,v,i) (((i)<(v)->max)?(t*)((v)->vec+(i)*(v)->siz):(t*)vecseek(&(v),i)) 145 146 static Vector_t* 147 vecopen(int inc, int siz) 148 { 149 Vector_t* v; 150 Stk_t* sp; 151 152 if (inc <= 0) 153 inc = 16; 154 if (!(sp = stkopen(STK_SMALL|STK_NULL))) 155 return 0; 156 if (!(v = (Vector_t*)stkseek(sp, sizeof(Vector_t) + inc * siz))) 157 { 158 stkclose(sp); 159 return 0; 160 } 161 v->stk = sp; 162 v->vec = (char*)v + sizeof(Vector_t); 163 v->max = v->inc = inc; 164 v->siz = siz; 165 v->cur = 0; 166 return v; 167 } 168 169 static void* 170 vecseek(Vector_t** p, int index) 171 { 172 Vector_t* v = *p; 173 174 if (index >= v->max) 175 { 176 while ((v->max += v->inc) <= index); 177 if (!(v = (Vector_t*)stkseek(v->stk, sizeof(Vector_t) + v->max * v->siz))) 178 return 0; 179 *p = v; 180 v->vec = (char*)v + sizeof(Vector_t); 181 } 182 return v->vec + index * v->siz; 183 } 184 185 static void 186 vecclose(Vector_t* v) 187 { 188 if (v) 189 stkclose(v->stk); 190 } 191 192 typedef struct 193 { 194 Stk_pos_t pos; 195 char data[1]; 196 } Stk_frame_t; 197 198 #define stknew(s,p) ((p)->offset=stktell(s),(p)->base=stkfreeze(s,0)) 199 #define stkold(s,p) stkset(s,(p)->base,(p)->offset) 200 201 #define stkframe(s) (*((Stk_frame_t**)stktop(s)-1)) 202 #define stkdata(s,t) ((t*)stkframe(s)->data) 203 #define stkpop(s) stkold(s,&(stkframe(s)->pos)) 204 205 static void* 206 stkpush(Stk_t* sp, size_t size) 207 { 208 Stk_frame_t* f; 209 Stk_pos_t p; 210 211 stknew(sp, &p); 212 size = sizeof(Stk_frame_t) + sizeof(size_t) + size - 1; 213 if (!(f = (Stk_frame_t*)stkalloc(sp, sizeof(Stk_frame_t) + sizeof(Stk_frame_t*) + size - 1))) 214 return 0; 215 f->pos = p; 216 stkframe(sp) = f; 217 return f->data; 218 } 219 220 /* ===== end library support ===== */ 221 222 /* 223 * Match_frame_t is for saving and restoring match records 224 * around alternate attempts, so that fossils will not be 225 * left in the match array. These are the only entries in 226 * the match array that are not otherwise guaranteed to 227 * have current data in them when they get used. 228 */ 229 230 typedef struct 231 { 232 size_t size; 233 regmatch_t* match; 234 regmatch_t save[1]; 235 } Match_frame_t; 236 237 #define matchframe stkdata(stkstd,Match_frame_t) 238 #define matchpush(e,x) ((x)->re.group.number?_matchpush(e,x):0) 239 #define matchcopy(e,x) ((x)->re.group.number?memcpy(matchframe->match,matchframe->save,matchframe->size):(void*)0) 240 #define matchpop(e,x) ((x)->re.group.number?memcpy(matchframe->match,matchframe->save,matchframe->size),stkpop(stkstd):(void*)0) 241 242 #define pospop(e) (--(e)->pos->cur) 243 244 /* 245 * allocate a frame and push a match onto the stack 246 */ 247 248 static int 249 _matchpush(Env_t* env, Rex_t* rex) 250 { 251 Match_frame_t* f; 252 regmatch_t* m; 253 regmatch_t* e; 254 regmatch_t* s; 255 int num; 256 257 if (rex->re.group.number <= 0 || (num = rex->re.group.last - rex->re.group.number + 1) <= 0) 258 num = 0; 259 if (!(f = (Match_frame_t*)stkpush(stkstd, sizeof(Match_frame_t) + (num - 1) * sizeof(regmatch_t)))) 260 { 261 env->error = REG_ESPACE; 262 return 1; 263 } 264 f->size = num * sizeof(regmatch_t); 265 f->match = m = env->match + rex->re.group.number; 266 e = m + num; 267 s = f->save; 268 while (m < e) 269 { 270 *s++ = *m; 271 *m++ = state.nomatch; 272 } 273 return 0; 274 } 275 276 /* 277 * allocate a frame and push a pos onto the stack 278 */ 279 280 static int 281 pospush(Env_t* env, Rex_t* rex, unsigned char* p, int be) 282 { 283 Pos_t* pos; 284 285 if (!(pos = vector(Pos_t, env->pos, env->pos->cur))) 286 { 287 env->error = REG_ESPACE; 288 return 1; 289 } 290 pos->serial = rex->serial; 291 pos->p = p; 292 pos->be = be; 293 env->pos->cur++; 294 return 0; 295 } 296 297 /* 298 * two matches are known to have the same length 299 * os is start of old pos array, ns is start of new, 300 * oend and nend are end+1 pointers to ends of arrays. 301 * oe and ne are ends (not end+1) of subarrays. 302 * returns 1 if new is better, -1 if old, else 0. 303 */ 304 305 #if _AST_REGEX_DEBUG 306 307 static void 308 showmatch(regmatch_t* p) 309 { 310 sfputc(sfstdout, '('); 311 if (p->rm_so < 0) 312 sfputc(sfstdout, '?'); 313 else 314 sfprintf(sfstdout, "%d", p->rm_so); 315 sfputc(sfstdout, ','); 316 if (p->rm_eo < 0) 317 sfputc(sfstdout, '?'); 318 else 319 sfprintf(sfstdout, "%d", p->rm_eo); 320 sfputc(sfstdout, ')'); 321 } 322 323 static int 324 better(Env_t* env, Pos_t* os, Pos_t* ns, Pos_t* oend, Pos_t* nend, int level) 325 #define better _better 326 { 327 int i; 328 329 DEBUG_CODE(0x0040,{sfprintf(sfstdout, "AHA better old ");for (i = 0; i <= env->nsub; i++)showmatch(&env->best[i]);sfprintf(sfstdout, "\n new ");for (i = 0; i <= env->nsub; i++)showmatch(&env->match[i]);sfprintf(sfstdout, "\n");},{0;}); 330 i = better(env, os, ns, oend, nend, 0); 331 DEBUG_TEST(0x0040,(sfprintf(sfstdout, " %s\n", i <= 0 ? "OLD" : "NEW")),(0)); 332 return i; 333 } 334 335 #endif 336 337 static int 338 better(Env_t* env, Pos_t* os, Pos_t* ns, Pos_t* oend, Pos_t* nend, int level) 339 { 340 Pos_t* oe; 341 Pos_t* ne; 342 int k; 343 int n; 344 345 if (env->error) 346 return -1; 347 for (;;) 348 { 349 DEBUG_CODE(0x0080,{sfprintf(sfstdout, " %-*.*sold ", (level + 3) * 4, (level + 3) * 4, "");for (oe = os; oe < oend; oe++)sfprintf(sfstdout, "<%d,%d,%d>", oe->p - env->beg, oe->serial, oe->be);sfprintf(sfstdout, "\n %-*.*snew ", (level + 3) * 4, (level + 3) * 4, "");for (oe = ns; oe < nend; oe++)sfprintf(sfstdout, "<%d,%d,%d>", oe->p - env->beg, oe->serial, oe->be);sfprintf(sfstdout, "\n");},{0;}); 350 if (ns >= nend) 351 return DEBUG_TEST(0x8000,(os < oend),(0)); 352 if (os >= oend) 353 return DEBUG_TEST(0x8000,(-1),(1)); 354 n = os->serial; 355 if (ns->serial > n) 356 return -1; 357 if (n > ns->serial) 358 { 359 env->error = REG_PANIC; 360 return -1; 361 } 362 if (ns->p > os->p) 363 return 1; 364 if (os->p > ns->p) 365 return -1; 366 oe = os; 367 k = 0; 368 for (;;) 369 if ((++oe)->serial == n) 370 { 371 if (oe->be != END_ANY) 372 k++; 373 else if (k-- <= 0) 374 break; 375 } 376 ne = ns; 377 k = 0; 378 for (;;) 379 if ((++ne)->serial == n) 380 { 381 if (ne->be != END_ANY) 382 k++; 383 else if (k-- <= 0) 384 break; 385 } 386 if (ne->p > oe->p) 387 return 1; 388 if (oe->p > ne->p) 389 return -1; 390 if (k = better(env, os + 1, ns + 1, oe, ne, level + 1)) 391 return k; 392 os = oe + 1; 393 ns = ne + 1; 394 } 395 } 396 397 #undef better 398 399 #define follow(e,r,c,s) ((r)->next?parse(e,(r)->next,c,s):(c)?parse(e,c,0,s):BEST) 400 401 static int parse(Env_t*, Rex_t*, Rex_t*, unsigned char*); 402 403 static int 404 parserep(Env_t* env, Rex_t* rex, Rex_t* cont, unsigned char* s, int n) 405 { 406 int i; 407 int r = NONE; 408 Rex_t catcher; 409 410 DEBUG_TEST(0x0010,(sfprintf(sfstdout, "AHA#%04d 0x%04x parserep %s %d %d %d `%-.*s'\n", __LINE__, debug_flag, rexname(rex->re.group.expr.rex), rex->lo, n, rex->hi, env->end - s, s)),(0)); 411 if ((rex->flags & REG_MINIMAL) && n >= rex->lo && n < rex->hi) 412 { 413 if (env->stack && pospush(env, rex, s, END_ANY)) 414 return BAD; 415 i = follow(env, rex, cont, s); 416 if (env->stack) 417 pospop(env); 418 switch (i) 419 { 420 case BAD: 421 return BAD; 422 case CUT: 423 return CUT; 424 case BEST: 425 case GOOD: 426 return BEST; 427 } 428 } 429 if (n < rex->hi) 430 { 431 catcher.type = REX_REP_CATCH; 432 catcher.serial = rex->serial; 433 catcher.re.rep_catch.ref = rex; 434 catcher.re.rep_catch.cont = cont; 435 catcher.re.rep_catch.beg = s; 436 catcher.re.rep_catch.n = n + 1; 437 catcher.next = rex->next; 438 if (n == 0) 439 rex->re.rep_catch.beg = s; 440 if (env->stack) 441 { 442 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x PUSH (%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rex->re.group.number, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 443 if (matchpush(env, rex)) 444 return BAD; 445 if (pospush(env, rex, s, BEG_ONE)) 446 return BAD; 447 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x PUSH (%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rex->re.group.number, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 448 } 449 r = parse(env, rex->re.group.expr.rex, &catcher, s); 450 DEBUG_TEST(0x0010,(sfprintf(sfstdout, "AHA#%04d 0x%04x parserep parse %d `%-.*s'\n", __LINE__, debug_flag, r, env->end - s, s)),(0)); 451 if (env->stack) 452 { 453 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x POP (%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rex->re.group.number, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 454 pospop(env); 455 matchpop(env, rex); 456 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x POP (%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rex->re.group.number, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 457 } 458 switch (r) 459 { 460 case BAD: 461 return BAD; 462 case BEST: 463 return BEST; 464 case CUT: 465 r = NONE; 466 break; 467 case GOOD: 468 if (rex->flags & REG_MINIMAL) 469 return BEST; 470 r = GOOD; 471 break; 472 } 473 } 474 if (n < rex->lo) 475 return r; 476 if (!(rex->flags & REG_MINIMAL) || n >= rex->hi) 477 { 478 if (env->stack && pospush(env, rex, s, END_ANY)) 479 return BAD; 480 i = follow(env, rex, cont, s); 481 if (env->stack) 482 pospop(env); 483 switch (i) 484 { 485 case BAD: 486 r = BAD; 487 break; 488 case CUT: 489 r = CUT; 490 break; 491 case BEST: 492 r = BEST; 493 break; 494 case GOOD: 495 r = (rex->flags & REG_MINIMAL) ? BEST : GOOD; 496 break; 497 } 498 } 499 return r; 500 } 501 502 static int 503 parsetrie(Env_t* env, Trie_node_t* x, Rex_t* rex, Rex_t* cont, unsigned char* s) 504 { 505 unsigned char* p; 506 int r; 507 508 if (p = rex->map) 509 { 510 for (;;) 511 { 512 if (s >= env->end) 513 return NONE; 514 while (x->c != p[*s]) 515 if (!(x = x->sib)) 516 return NONE; 517 if (x->end) 518 break; 519 x = x->son; 520 s++; 521 } 522 } 523 else 524 { 525 for (;;) 526 { 527 if (s >= env->end) 528 return NONE; 529 while (x->c != *s) 530 if (!(x = x->sib)) 531 return NONE; 532 if (x->end) 533 break; 534 x = x->son; 535 s++; 536 } 537 } 538 s++; 539 if (rex->flags & REG_MINIMAL) 540 switch (follow(env, rex, cont, s)) 541 { 542 case BAD: 543 return BAD; 544 case CUT: 545 return CUT; 546 case BEST: 547 case GOOD: 548 return BEST; 549 } 550 if (x->son) 551 switch (parsetrie(env, x->son, rex, cont, s)) 552 { 553 case BAD: 554 return BAD; 555 case CUT: 556 return CUT; 557 case BEST: 558 return BEST; 559 case GOOD: 560 if (rex->flags & REG_MINIMAL) 561 return BEST; 562 r = GOOD; 563 break; 564 default: 565 r = NONE; 566 break; 567 } 568 else 569 r = NONE; 570 if (!(rex->flags & REG_MINIMAL)) 571 switch (follow(env, rex, cont, s)) 572 { 573 case BAD: 574 return BAD; 575 case CUT: 576 return CUT; 577 case BEST: 578 return BEST; 579 case GOOD: 580 return GOOD; 581 } 582 return r; 583 } 584 585 static int 586 collelt(register Celt_t* ce, char* key, int c, int x) 587 { 588 Ckey_t elt; 589 590 mbxfrm(elt, key, COLL_KEY_MAX); 591 for (;; ce++) 592 { 593 switch (ce->typ) 594 { 595 case COLL_call: 596 if (!x && (*ce->fun)(c)) 597 return 1; 598 continue; 599 case COLL_char: 600 if (!strcmp((char*)ce->beg, (char*)elt)) 601 return 1; 602 continue; 603 case COLL_range: 604 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max) 605 return 1; 606 continue; 607 case COLL_range_lc: 608 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max && (iswlower(c) || !iswupper(c))) 609 return 1; 610 continue; 611 case COLL_range_uc: 612 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max && (iswupper(c) || !iswlower(c))) 613 return 1; 614 continue; 615 } 616 break; 617 } 618 return 0; 619 } 620 621 static int 622 collic(register Celt_t* ce, char* key, register char* nxt, int c, int x) 623 { 624 if (!x) 625 { 626 if (collelt(ce, key, c, x)) 627 return 1; 628 if (iswlower(c)) 629 c = towupper(c); 630 else if (iswupper(c)) 631 c = towlower(c); 632 else 633 return 0; 634 x = wctomb(key, c); 635 key[x] = 0; 636 return collelt(ce, key, c, 0); 637 } 638 while (*nxt) 639 { 640 if (collic(ce, key, nxt + 1, c, x)) 641 return 1; 642 if (islower(*nxt)) 643 *nxt = toupper(*nxt); 644 else if (isupper(*nxt)) 645 *nxt = tolower(*nxt); 646 else 647 return 0; 648 nxt++; 649 } 650 return collelt(ce, key, c, x); 651 } 652 653 static int 654 collmatch(Rex_t* rex, unsigned char* s, unsigned char* e, unsigned char** p) 655 { 656 unsigned char* t; 657 wchar_t c; 658 int w; 659 int r; 660 int x; 661 int ic; 662 Ckey_t key; 663 Ckey_t elt; 664 665 ic = (rex->flags & REG_ICASE); 666 if ((w = MBSIZE(s)) > 1) 667 { 668 memcpy((char*)key, (char*)s, w); 669 key[w] = 0; 670 t = s; 671 c = mbchar(t); 672 #if !_lib_wctype 673 c &= 0xff; 674 #endif 675 x = 0; 676 } 677 else 678 { 679 c = s[0]; 680 if (ic && isupper(c)) 681 c = tolower(c); 682 key[0] = c; 683 key[1] = 0; 684 if (isalpha(c)) 685 { 686 x = e - s; 687 if (x > COLL_KEY_MAX) 688 x = COLL_KEY_MAX; 689 while (w < x) 690 { 691 c = s[w]; 692 if (!isalpha(c)) 693 break; 694 r = mbxfrm(elt, key, COLL_KEY_MAX); 695 if (ic && isupper(c)) 696 c = tolower(c); 697 key[w] = c; 698 key[w + 1] = 0; 699 if (mbxfrm(elt, key, COLL_KEY_MAX) != r) 700 break; 701 w++; 702 } 703 } 704 key[w] = 0; 705 c = key[0]; 706 x = w - 1; 707 } 708 r = 1; 709 for (;;) 710 { 711 if (ic ? collic(rex->re.collate.elements, (char*)key, (char*)key, c, x) : collelt(rex->re.collate.elements, (char*)key, c, x)) 712 break; 713 if (!x) 714 { 715 r = 0; 716 break; 717 } 718 w = x--; 719 key[w] = 0; 720 } 721 *p = s + w; 722 return rex->re.collate.invert ? !r : r; 723 } 724 725 static unsigned char* 726 nestmatch(register unsigned char* s, register unsigned char* e, const unsigned short* type, register int co) 727 { 728 register int c; 729 register int cc; 730 unsigned int n; 731 int oc; 732 733 if (type[co] & (REX_NEST_literal|REX_NEST_quote)) 734 { 735 n = (type[co] & REX_NEST_literal) ? REX_NEST_terminator : (REX_NEST_escape|REX_NEST_terminator); 736 while (s < e) 737 { 738 c = *s++; 739 if (c == co) 740 return s; 741 else if (type[c] & n) 742 { 743 if (s >= e || (type[c] & REX_NEST_terminator)) 744 break; 745 s++; 746 } 747 } 748 } 749 else 750 { 751 cc = type[co] >> REX_NEST_SHIFT; 752 oc = type[co] & (REX_NEST_open|REX_NEST_close); 753 n = 1; 754 while (s < e) 755 { 756 c = *s++; 757 switch (type[c] & (REX_NEST_escape|REX_NEST_open|REX_NEST_close|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)) 758 { 759 case REX_NEST_delimiter: 760 case REX_NEST_terminator: 761 return oc ? 0 : s; 762 case REX_NEST_separator: 763 if (!oc) 764 return s; 765 break; 766 case REX_NEST_escape: 767 if (s >= e) 768 return 0; 769 s++; 770 break; 771 case REX_NEST_open|REX_NEST_close: 772 if (c == cc) 773 { 774 if (!--n) 775 return s; 776 } 777 /*FALLTHROUGH*/ 778 case REX_NEST_open: 779 if (c == co) 780 { 781 if (!++n) 782 return 0; 783 } 784 else if (!(s = nestmatch(s, e, type, c))) 785 return 0; 786 break; 787 case REX_NEST_close: 788 if (c != cc) 789 return 0; 790 if (!--n) 791 return s; 792 break; 793 } 794 } 795 return (oc || !(type[UCHAR_MAX+1] & REX_NEST_terminator)) ? 0 : s; 796 } 797 return 0; 798 } 799 800 static int 801 parse(Env_t* env, Rex_t* rex, Rex_t* cont, unsigned char* s) 802 { 803 int c; 804 int d; 805 int i; 806 int m; 807 int n; 808 int r; 809 int* f; 810 unsigned char* p; 811 unsigned char* t; 812 unsigned char* b; 813 unsigned char* e; 814 char* u; 815 regmatch_t* o; 816 Trie_node_t* x; 817 Rex_t* q; 818 Rex_t catcher; 819 Rex_t next; 820 821 for (;;) 822 { 823 DEBUG_TEST(0x0008,(sfprintf(sfstdout, "AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), env->end - s, s)),(0)); 824 switch (rex->type) 825 { 826 case REX_ALT: 827 if (env->stack) 828 { 829 if (matchpush(env, rex)) 830 return BAD; 831 if (pospush(env, rex, s, BEG_ALT)) 832 return BAD; 833 catcher.type = REX_ALT_CATCH; 834 catcher.serial = rex->serial; 835 catcher.re.alt_catch.cont = cont; 836 catcher.next = rex->next; 837 r = parse(env, rex->re.group.expr.binary.left, &catcher, s); 838 if (r < BEST || (rex->flags & REG_MINIMAL)) 839 { 840 matchcopy(env, rex); 841 ((Pos_t*)env->pos->vec + env->pos->cur - 1)->serial = catcher.serial = rex->re.group.expr.binary.serial; 842 n = parse(env, rex->re.group.expr.binary.right, &catcher, s); 843 if (n != NONE) 844 r = n; 845 } 846 pospop(env); 847 matchpop(env, rex); 848 } 849 else 850 { 851 if ((r = parse(env, rex->re.group.expr.binary.left, cont, s)) == NONE) 852 r = parse(env, rex->re.group.expr.binary.right, cont, s); 853 if (r == GOOD) 854 r = BEST; 855 } 856 return r; 857 case REX_ALT_CATCH: 858 if (pospush(env, rex, s, END_ANY)) 859 return BAD; 860 r = follow(env, rex, rex->re.alt_catch.cont, s); 861 pospop(env); 862 return r; 863 case REX_BACK: 864 o = &env->match[rex->lo]; 865 if (o->rm_so < 0) 866 return NONE; 867 i = o->rm_eo - o->rm_so; 868 e = s + i; 869 if (e > env->end) 870 return NONE; 871 t = env->beg + o->rm_so; 872 if (!(p = rex->map)) 873 { 874 while (s < e) 875 if (*s++ != *t++) 876 return NONE; 877 } 878 else if (!mbwide()) 879 { 880 while (s < e) 881 if (p[*s++] != p[*t++]) 882 return NONE; 883 } 884 else 885 { 886 while (s < e) 887 { 888 c = mbchar(s); 889 d = mbchar(t); 890 if (towupper(c) != towupper(d)) 891 return NONE; 892 } 893 } 894 break; 895 case REX_BEG: 896 if ((!(rex->flags & REG_NEWLINE) || s <= env->beg || *(s - 1) != '\n') && ((env->flags & REG_NOTBOL) || s != env->beg)) 897 return NONE; 898 break; 899 case REX_CLASS: 900 if (LEADING(env, rex, s)) 901 return NONE; 902 n = rex->hi; 903 if (n > env->end - s) 904 n = env->end - s; 905 m = rex->lo; 906 if (m > n) 907 return NONE; 908 r = NONE; 909 if (!(rex->flags & REG_MINIMAL)) 910 { 911 for (i = 0; i < n; i++) 912 if (!settst(rex->re.charclass, s[i])) 913 { 914 n = i; 915 break; 916 } 917 for (s += n; n-- >= m; s--) 918 switch (follow(env, rex, cont, s)) 919 { 920 case BAD: 921 return BAD; 922 case CUT: 923 return CUT; 924 case BEST: 925 return BEST; 926 case GOOD: 927 r = GOOD; 928 break; 929 } 930 } 931 else 932 { 933 for (e = s + m; s < e; s++) 934 if (!settst(rex->re.charclass, *s)) 935 return r; 936 e += n - m; 937 for (;;) 938 { 939 switch (follow(env, rex, cont, s)) 940 { 941 case BAD: 942 return BAD; 943 case CUT: 944 return CUT; 945 case BEST: 946 case GOOD: 947 return BEST; 948 } 949 if (s >= e || !settst(rex->re.charclass, *s)) 950 break; 951 s++; 952 } 953 } 954 return r; 955 case REX_COLL_CLASS: 956 if (LEADING(env, rex, s)) 957 return NONE; 958 n = rex->hi; 959 if (n > env->end - s) 960 n = env->end - s; 961 m = rex->lo; 962 if (m > n) 963 return NONE; 964 r = NONE; 965 e = env->end; 966 if (!(rex->flags & REG_MINIMAL)) 967 { 968 if (!(b = (unsigned char*)stkpush(stkstd, n))) 969 { 970 env->error = REG_ESPACE; 971 return BAD; 972 } 973 for (i = 0; s < e && i < n && collmatch(rex, s, e, &t); i++) 974 { 975 b[i] = t - s; 976 s = t; 977 } 978 for (; i-- >= rex->lo; s -= b[i]) 979 switch (follow(env, rex, cont, s)) 980 { 981 case BAD: 982 stkpop(stkstd); 983 return BAD; 984 case CUT: 985 stkpop(stkstd); 986 return CUT; 987 case BEST: 988 stkpop(stkstd); 989 return BEST; 990 case GOOD: 991 r = GOOD; 992 break; 993 } 994 stkpop(stkstd); 995 } 996 else 997 { 998 for (i = 0; i < m && s < e; i++, s = t) 999 if (!collmatch(rex, s, e, &t)) 1000 return r; 1001 while (i++ <= n) 1002 { 1003 switch (follow(env, rex, cont, s)) 1004 { 1005 case BAD: 1006 return BAD; 1007 case CUT: 1008 return CUT; 1009 case BEST: 1010 case GOOD: 1011 return BEST; 1012 } 1013 if (s >= e || !collmatch(rex, s, e, &s)) 1014 break; 1015 } 1016 } 1017 return r; 1018 case REX_CONJ: 1019 next.type = REX_CONJ_RIGHT; 1020 next.re.conj_right.cont = cont; 1021 next.next = rex->next; 1022 catcher.type = REX_CONJ_LEFT; 1023 catcher.re.conj_left.right = rex->re.group.expr.binary.right; 1024 catcher.re.conj_left.cont = &next; 1025 catcher.re.conj_left.beg = s; 1026 catcher.next = 0; 1027 return parse(env, rex->re.group.expr.binary.left, &catcher, s); 1028 case REX_CONJ_LEFT: 1029 rex->re.conj_left.cont->re.conj_right.end = s; 1030 cont = rex->re.conj_left.cont; 1031 s = rex->re.conj_left.beg; 1032 rex = rex->re.conj_left.right; 1033 continue; 1034 case REX_CONJ_RIGHT: 1035 if (rex->re.conj_right.end != s) 1036 return NONE; 1037 cont = rex->re.conj_right.cont; 1038 break; 1039 case REX_DONE: 1040 if (!env->stack) 1041 return BEST; 1042 n = s - env->beg; 1043 r = env->nsub; 1044 DEBUG_TEST(0x0100,(sfprintf(sfstdout,"AHA#%04d 0x%04x %s (%d,%d)(%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rexname(rex), env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->best[3].rm_so, env->best[3].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 1045 if ((i = env->best[0].rm_eo) >= 0) 1046 { 1047 if (rex->flags & REG_MINIMAL) 1048 { 1049 if (n > i) 1050 return GOOD; 1051 } 1052 else 1053 { 1054 if (n < i) 1055 return GOOD; 1056 } 1057 if (n == i && better(env, 1058 (Pos_t*)env->bestpos->vec, 1059 (Pos_t*)env->pos->vec, 1060 (Pos_t*)env->bestpos->vec+env->bestpos->cur, 1061 (Pos_t*)env->pos->vec+env->pos->cur, 1062 0) <= 0) 1063 return GOOD; 1064 } 1065 env->best[0].rm_eo = n; 1066 memcpy(&env->best[1], &env->match[1], r * sizeof(regmatch_t)); 1067 n = env->pos->cur; 1068 if (!vector(Pos_t, env->bestpos, n)) 1069 { 1070 env->error = REG_ESPACE; 1071 return BAD; 1072 } 1073 env->bestpos->cur = n; 1074 memcpy(env->bestpos->vec, env->pos->vec, n * sizeof(Pos_t)); 1075 DEBUG_TEST(0x0100,(sfprintf(sfstdout,"AHA#%04d 0x%04x %s (%d,%d)(%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rexname(rex), env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->best[3].rm_so, env->best[3].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 1076 return GOOD; 1077 case REX_DOT: 1078 if (LEADING(env, rex, s)) 1079 return NONE; 1080 n = rex->hi; 1081 if (n > env->end - s) 1082 n = env->end - s; 1083 m = rex->lo; 1084 if (m > n) 1085 return NONE; 1086 if ((c = rex->explicit) >= 0 && !mbwide()) 1087 for (i = 0; i < n; i++) 1088 if (s[i] == c) 1089 { 1090 n = i; 1091 break; 1092 } 1093 r = NONE; 1094 if (!(rex->flags & REG_MINIMAL)) 1095 { 1096 if (!mbwide()) 1097 { 1098 for (s += n; n-- >= m; s--) 1099 switch (follow(env, rex, cont, s)) 1100 { 1101 case BAD: 1102 return BAD; 1103 case CUT: 1104 return CUT; 1105 case BEST: 1106 return BEST; 1107 case GOOD: 1108 r = GOOD; 1109 break; 1110 } 1111 } 1112 else 1113 { 1114 if (!(b = (unsigned char*)stkpush(stkstd, n))) 1115 { 1116 env->error = REG_ESPACE; 1117 return BAD; 1118 } 1119 e = env->end; 1120 for (i = 0; s < e && i < n && *s != c; i++) 1121 s += b[i] = MBSIZE(s); 1122 for (; i-- >= m; s -= b[i]) 1123 switch (follow(env, rex, cont, s)) 1124 { 1125 case BAD: 1126 stkpop(stkstd); 1127 return BAD; 1128 case CUT: 1129 stkpop(stkstd); 1130 return CUT; 1131 case BEST: 1132 stkpop(stkstd); 1133 return BEST; 1134 case GOOD: 1135 r = GOOD; 1136 break; 1137 } 1138 stkpop(stkstd); 1139 } 1140 } 1141 else 1142 { 1143 if (!mbwide()) 1144 { 1145 e = s + n; 1146 for (s += m; s <= e; s++) 1147 switch (follow(env, rex, cont, s)) 1148 { 1149 case BAD: 1150 return BAD; 1151 case CUT: 1152 return CUT; 1153 case BEST: 1154 case GOOD: 1155 return BEST; 1156 } 1157 } 1158 else 1159 { 1160 e = env->end; 1161 for (i = 0; s < e && i < m && *s != c; i++) 1162 s += MBSIZE(s); 1163 if (i >= m) 1164 for (; s <= e && i <= n; s += MBSIZE(s), i++) 1165 switch (follow(env, rex, cont, s)) 1166 { 1167 case BAD: 1168 return BAD; 1169 case CUT: 1170 return CUT; 1171 case BEST: 1172 case GOOD: 1173 return BEST; 1174 } 1175 } 1176 } 1177 return r; 1178 case REX_END: 1179 if ((!(rex->flags & REG_NEWLINE) || *s != '\n') && ((env->flags & REG_NOTEOL) || s < env->end)) 1180 return NONE; 1181 break; 1182 case REX_GROUP: 1183 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), env->end - s, s)),(0)); 1184 if (env->stack) 1185 { 1186 if (rex->re.group.number) 1187 env->match[rex->re.group.number].rm_so = s - env->beg; 1188 if (pospush(env, rex, s, BEG_SUB)) 1189 return BAD; 1190 catcher.re.group_catch.eo = rex->re.group.number ? &env->match[rex->re.group.number].rm_eo : (regoff_t*)0; 1191 } 1192 catcher.type = REX_GROUP_CATCH; 1193 catcher.serial = rex->serial; 1194 catcher.re.group_catch.cont = cont; 1195 catcher.next = rex->next; 1196 r = parse(env, rex->re.group.expr.rex, &catcher, s); 1197 if (env->stack) 1198 { 1199 pospop(env); 1200 if (rex->re.group.number) 1201 env->match[rex->re.group.number].rm_so = -1; 1202 } 1203 return r; 1204 case REX_GROUP_CATCH: 1205 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s=>%s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rexname(rex->re.group_catch.cont), env->end - s, s)),(0)); 1206 if (env->stack) 1207 { 1208 if (rex->re.group_catch.eo) 1209 *rex->re.group_catch.eo = s - env->beg; 1210 if (pospush(env, rex, s, END_ANY)) 1211 return BAD; 1212 } 1213 r = follow(env, rex, rex->re.group_catch.cont, s); 1214 if (env->stack) 1215 { 1216 pospop(env); 1217 if (rex->re.group_catch.eo) 1218 *rex->re.group_catch.eo = -1; 1219 } 1220 return r; 1221 case REX_GROUP_AHEAD: 1222 catcher.type = REX_GROUP_AHEAD_CATCH; 1223 catcher.flags = rex->flags; 1224 catcher.serial = rex->serial; 1225 catcher.re.rep_catch.beg = s; 1226 catcher.re.rep_catch.cont = cont; 1227 catcher.next = rex->next; 1228 return parse(env, rex->re.group.expr.rex, &catcher, s); 1229 case REX_GROUP_AHEAD_CATCH: 1230 return follow(env, rex, rex->re.rep_catch.cont, rex->re.rep_catch.beg); 1231 case REX_GROUP_AHEAD_NOT: 1232 r = parse(env, rex->re.group.expr.rex, NiL, s); 1233 if (r == NONE) 1234 r = follow(env, rex, cont, s); 1235 else if (r != BAD) 1236 r = NONE; 1237 return r; 1238 case REX_GROUP_BEHIND: 1239 if ((s - env->beg) < rex->re.group.size) 1240 return NONE; 1241 catcher.type = REX_GROUP_BEHIND_CATCH; 1242 catcher.flags = rex->flags; 1243 catcher.serial = rex->serial; 1244 catcher.re.behind_catch.beg = s; 1245 catcher.re.behind_catch.end = e = env->end; 1246 catcher.re.behind_catch.cont = cont; 1247 catcher.next = rex->next; 1248 for (t = s - rex->re.group.size; t >= env->beg; t--) 1249 { 1250 env->end = s; 1251 r = parse(env, rex->re.group.expr.rex, &catcher, t); 1252 env->end = e; 1253 if (r != NONE) 1254 return r; 1255 } 1256 return NONE; 1257 case REX_GROUP_BEHIND_CATCH: 1258 if (s != rex->re.behind_catch.beg) 1259 return NONE; 1260 env->end = rex->re.behind_catch.end; 1261 return follow(env, rex, rex->re.behind_catch.cont, rex->re.behind_catch.beg); 1262 case REX_GROUP_BEHIND_NOT: 1263 if ((s - env->beg) < rex->re.group.size) 1264 r = NONE; 1265 else 1266 { 1267 catcher.type = REX_GROUP_BEHIND_NOT_CATCH; 1268 catcher.re.neg_catch.beg = s; 1269 catcher.next = 0; 1270 e = env->end; 1271 env->end = s; 1272 for (t = s - rex->re.group.size; t >= env->beg; t--) 1273 { 1274 r = parse(env, rex->re.group.expr.rex, &catcher, t); 1275 if (r != NONE) 1276 break; 1277 } 1278 env->end = e; 1279 } 1280 if (r == NONE) 1281 r = follow(env, rex, cont, s); 1282 else if (r != BAD) 1283 r = NONE; 1284 return r; 1285 case REX_GROUP_BEHIND_NOT_CATCH: 1286 return s == rex->re.neg_catch.beg ? GOOD : NONE; 1287 case REX_GROUP_COND: 1288 if (q = rex->re.group.expr.binary.right) 1289 { 1290 catcher.re.cond_catch.next[0] = q->re.group.expr.binary.right; 1291 catcher.re.cond_catch.next[1] = q->re.group.expr.binary.left; 1292 } 1293 else 1294 catcher.re.cond_catch.next[0] = catcher.re.cond_catch.next[1] = 0; 1295 if (q = rex->re.group.expr.binary.left) 1296 { 1297 catcher.type = REX_GROUP_COND_CATCH; 1298 catcher.flags = rex->flags; 1299 catcher.serial = rex->serial; 1300 catcher.re.cond_catch.yes = 0; 1301 catcher.re.cond_catch.beg = s; 1302 catcher.re.cond_catch.cont = cont; 1303 catcher.next = rex->next; 1304 r = parse(env, q, &catcher, s); 1305 if (r == BAD || catcher.re.cond_catch.yes) 1306 return r; 1307 } 1308 else if (!rex->re.group.size || rex->re.group.size > 0 && env->match[rex->re.group.size].rm_so >= 0) 1309 r = GOOD; 1310 else 1311 r = NONE; 1312 if (q = catcher.re.cond_catch.next[r != NONE]) 1313 { 1314 catcher.type = REX_CAT; 1315 catcher.flags = q->flags; 1316 catcher.serial = q->serial; 1317 catcher.re.group_catch.cont = cont; 1318 catcher.next = rex->next; 1319 return parse(env, q, &catcher, s); 1320 } 1321 return follow(env, rex, cont, s); 1322 case REX_GROUP_COND_CATCH: 1323 rex->re.cond_catch.yes = 1; 1324 catcher.type = REX_CAT; 1325 catcher.flags = rex->flags; 1326 catcher.serial = rex->serial; 1327 catcher.re.group_catch.cont = rex->re.cond_catch.cont; 1328 catcher.next = rex->next; 1329 return parse(env, rex->re.cond_catch.next[1], &catcher, rex->re.cond_catch.beg); 1330 case REX_CAT: 1331 return follow(env, rex, rex->re.group_catch.cont, s); 1332 case REX_GROUP_CUT: 1333 catcher.type = REX_GROUP_CUT_CATCH; 1334 catcher.flags = rex->flags; 1335 catcher.serial = rex->serial; 1336 catcher.re.group_catch.cont = cont; 1337 catcher.next = rex->next; 1338 return parse(env, rex->re.group.expr.rex, &catcher, s); 1339 case REX_GROUP_CUT_CATCH: 1340 switch (r = follow(env, rex, rex->re.group_catch.cont, s)) 1341 { 1342 case GOOD: 1343 r = BEST; 1344 break; 1345 case NONE: 1346 r = CUT; 1347 break; 1348 } 1349 return r; 1350 case REX_KMP: 1351 f = rex->re.string.fail; 1352 b = rex->re.string.base; 1353 n = rex->re.string.size; 1354 t = s; 1355 e = env->end; 1356 if (p = rex->map) 1357 { 1358 while (t + n <= e) 1359 { 1360 for (i = -1; t < e; t++) 1361 { 1362 while (i >= 0 && b[i+1] != p[*t]) 1363 i = f[i]; 1364 if (b[i+1] == p[*t]) 1365 i++; 1366 if (i + 1 == n) 1367 { 1368 t++; 1369 if (env->stack) 1370 env->best[0].rm_so = t - s - n; 1371 switch (follow(env, rex, cont, t)) 1372 { 1373 case BAD: 1374 return BAD; 1375 case CUT: 1376 return CUT; 1377 case BEST: 1378 case GOOD: 1379 return BEST; 1380 } 1381 t -= n - 1; 1382 break; 1383 } 1384 } 1385 } 1386 } 1387 else 1388 { 1389 while (t + n <= e) 1390 { 1391 for (i = -1; t < e; t++) 1392 { 1393 while (i >= 0 && b[i+1] != *t) 1394 i = f[i]; 1395 if (b[i+1] == *t) 1396 i++; 1397 if (i + 1 == n) 1398 { 1399 t++; 1400 if (env->stack) 1401 env->best[0].rm_so = t - s - n; 1402 switch (follow(env, rex, cont, t)) 1403 { 1404 case BAD: 1405 return BAD; 1406 case CUT: 1407 return CUT; 1408 case BEST: 1409 case GOOD: 1410 return BEST; 1411 } 1412 t -= n - 1; 1413 break; 1414 } 1415 } 1416 } 1417 } 1418 return NONE; 1419 case REX_NEG: 1420 if (LEADING(env, rex, s)) 1421 return NONE; 1422 i = env->end - s; 1423 n = ((i + 7) >> 3) + 1; 1424 catcher.type = REX_NEG_CATCH; 1425 catcher.re.neg_catch.beg = s; 1426 if (!(p = (unsigned char*)stkpush(stkstd, n))) 1427 return BAD; 1428 memset(catcher.re.neg_catch.index = p, 0, n); 1429 catcher.next = rex->next; 1430 if (parse(env, rex->re.group.expr.rex, &catcher, s) == BAD) 1431 r = BAD; 1432 else 1433 { 1434 r = NONE; 1435 for (; i >= 0; i--) 1436 if (!bittst(p, i)) 1437 { 1438 switch (follow(env, rex, cont, s + i)) 1439 { 1440 case BAD: 1441 r = BAD; 1442 break; 1443 case BEST: 1444 r = BEST; 1445 break; 1446 case CUT: 1447 r = CUT; 1448 break; 1449 case GOOD: 1450 r = GOOD; 1451 /*FALLTHROUGH*/ 1452 default: 1453 continue; 1454 } 1455 break; 1456 } 1457 } 1458 stkpop(stkstd); 1459 return r; 1460 case REX_NEG_CATCH: 1461 bitset(rex->re.neg_catch.index, s - rex->re.neg_catch.beg); 1462 return NONE; 1463 case REX_NEST: 1464 do 1465 { 1466 if ((c = *s++) == rex->re.nest.primary) 1467 { 1468 if (s >= env->end || !(s = nestmatch(s, env->end, rex->re.nest.type, c))) 1469 return NONE; 1470 break; 1471 } 1472 if (rex->re.nest.primary >= 0) 1473 return NONE; 1474 if (rex->re.nest.type[c] & (REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)) 1475 break; 1476 if (!(s = nestmatch(s, env->end, rex->re.nest.type, c))) 1477 return NONE; 1478 } while (s < env->end && !(rex->re.nest.type[*(s-1)] & (REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))); 1479 break; 1480 case REX_NULL: 1481 break; 1482 case REX_ONECHAR: 1483 n = rex->hi; 1484 if (n > env->end - s) 1485 n = env->end - s; 1486 m = rex->lo; 1487 if (m > n) 1488 return NONE; 1489 r = NONE; 1490 c = rex->re.onechar; 1491 if (!(rex->flags & REG_MINIMAL)) 1492 { 1493 if (!mbwide()) 1494 { 1495 if (p = rex->map) 1496 { 1497 for (i = 0; i < n; i++, s++) 1498 if (p[*s] != c) 1499 break; 1500 } 1501 else 1502 { 1503 for (i = 0; i < n; i++, s++) 1504 if (*s != c) 1505 break; 1506 } 1507 for (; i-- >= m; s--) 1508 switch (follow(env, rex, cont, s)) 1509 { 1510 case BAD: 1511 return BAD; 1512 case BEST: 1513 return BEST; 1514 case CUT: 1515 return CUT; 1516 case GOOD: 1517 r = GOOD; 1518 break; 1519 } 1520 } 1521 else 1522 { 1523 if (!(b = (unsigned char*)stkpush(stkstd, n))) 1524 { 1525 env->error = REG_ESPACE; 1526 return BAD; 1527 } 1528 e = env->end; 1529 if (!(rex->flags & REG_ICASE)) 1530 { 1531 for (i = 0; s < e && i < n; i++, s = t) 1532 { 1533 t = s; 1534 if (mbchar(t) != c) 1535 break; 1536 b[i] = t - s; 1537 } 1538 } 1539 else 1540 { 1541 for (i = 0; s < e && i < n; i++, s = t) 1542 { 1543 t = s; 1544 if (towupper(mbchar(t)) != c) 1545 break; 1546 b[i] = t - s; 1547 } 1548 } 1549 for (; i-- >= m; s -= b[i]) 1550 switch (follow(env, rex, cont, s)) 1551 { 1552 case BAD: 1553 stkpop(stkstd); 1554 return BAD; 1555 case BEST: 1556 stkpop(stkstd); 1557 return BEST; 1558 case CUT: 1559 stkpop(stkstd); 1560 return CUT; 1561 case GOOD: 1562 r = GOOD; 1563 break; 1564 } 1565 stkpop(stkstd); 1566 } 1567 } 1568 else 1569 { 1570 if (!mbwide()) 1571 { 1572 e = s + m; 1573 if (p = rex->map) 1574 { 1575 for (; s < e; s++) 1576 if (p[*s] != c) 1577 return r; 1578 e += n - m; 1579 for (;;) 1580 { 1581 switch (follow(env, rex, cont, s)) 1582 { 1583 case BAD: 1584 return BAD; 1585 case CUT: 1586 return CUT; 1587 case BEST: 1588 case GOOD: 1589 return BEST; 1590 } 1591 if (s >= e || p[*s++] != c) 1592 break; 1593 } 1594 } 1595 else 1596 { 1597 for (; s < e; s++) 1598 if (*s != c) 1599 return r; 1600 e += n - m; 1601 for (;;) 1602 { 1603 switch (follow(env, rex, cont, s)) 1604 { 1605 case BAD: 1606 return BAD; 1607 case CUT: 1608 return CUT; 1609 case BEST: 1610 case GOOD: 1611 return BEST; 1612 } 1613 if (s >= e || *s++ != c) 1614 break; 1615 } 1616 } 1617 } 1618 else 1619 { 1620 if (!(rex->flags & REG_ICASE)) 1621 { 1622 for (i = 0; i < m && s < e; i++, s = t) 1623 { 1624 t = s; 1625 if (mbchar(t) != c) 1626 return r; 1627 } 1628 while (i++ <= n) 1629 { 1630 switch (follow(env, rex, cont, s)) 1631 { 1632 case BAD: 1633 return BAD; 1634 case CUT: 1635 return CUT; 1636 case BEST: 1637 case GOOD: 1638 return BEST; 1639 } 1640 if (s >= e || mbchar(s) != c) 1641 break; 1642 } 1643 } 1644 else 1645 { 1646 for (i = 0; i < m && s < e; i++, s = t) 1647 { 1648 t = s; 1649 if (towupper(mbchar(t)) != c) 1650 return r; 1651 } 1652 while (i++ <= n) 1653 { 1654 switch (follow(env, rex, cont, s)) 1655 { 1656 case BAD: 1657 return BAD; 1658 case CUT: 1659 return CUT; 1660 case BEST: 1661 case GOOD: 1662 return BEST; 1663 } 1664 if (s >= e || towupper(mbchar(s)) != c) 1665 break; 1666 } 1667 } 1668 } 1669 } 1670 return r; 1671 case REX_REP: 1672 if (env->stack && pospush(env, rex, s, BEG_REP)) 1673 return BAD; 1674 r = parserep(env, rex, cont, s, 0); 1675 if (env->stack) 1676 pospop(env); 1677 return r; 1678 case REX_REP_CATCH: 1679 DEBUG_TEST(0x0020,(sfprintf(sfstdout, "AHA#%04d 0x%04x %s n %d len %d s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rex->re.rep_catch.n, s - rex->re.rep_catch.beg, env->end - s, s)),(0)); 1680 if (env->stack && pospush(env, rex, s, END_ANY)) 1681 return BAD; 1682 if (s == rex->re.rep_catch.beg && rex->re.rep_catch.n > rex->re.rep_catch.ref->lo) 1683 { 1684 /* 1685 * optional empty iteration 1686 */ 1687 1688 DEBUG_TEST(0x0002,(sfprintf(sfstdout, "AHA#%04d %p re.group.back=%d re.group.expr.rex=%s\n", __LINE__, rex->re.rep_catch.ref->re.group.expr.rex, rex->re.rep_catch.ref->re.group.expr.rex->re.group.back, rexname(rex->re.rep_catch.ref->re.group.expr.rex))),(0)); 1689 if (!env->stack || s != rex->re.rep_catch.ref->re.rep_catch.beg && !rex->re.rep_catch.ref->re.group.expr.rex->re.group.back) 1690 r = NONE; 1691 else if (pospush(env, rex, s, END_ANY)) 1692 r = BAD; 1693 else 1694 { 1695 r = follow(env, rex, rex->re.rep_catch.cont, s); 1696 pospop(env); 1697 } 1698 } 1699 else 1700 r = parserep(env, rex->re.rep_catch.ref, rex->re.rep_catch.cont, s, rex->re.rep_catch.n); 1701 if (env->stack) 1702 pospop(env); 1703 return r; 1704 case REX_STRING: 1705 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s \"%-.*s\" `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rex->re.string.size, rex->re.string.base, env->end - s, s)),(0)); 1706 if (rex->re.string.size > (env->end - s)) 1707 return NONE; 1708 t = rex->re.string.base; 1709 e = t + rex->re.string.size; 1710 if (!(p = rex->map)) 1711 { 1712 while (t < e) 1713 if (*s++ != *t++) 1714 return NONE; 1715 } 1716 else if (!mbwide()) 1717 { 1718 while (t < e) 1719 if (p[*s++] != *t++) 1720 return NONE; 1721 } 1722 else 1723 { 1724 while (t < e) 1725 { 1726 c = mbchar(s); 1727 d = mbchar(t); 1728 if (towupper(c) != d) 1729 return NONE; 1730 } 1731 } 1732 break; 1733 case REX_TRIE: 1734 if (((s + rex->re.trie.min) > env->end) || !(x = rex->re.trie.root[rex->map ? rex->map[*s] : *s])) 1735 return NONE; 1736 return parsetrie(env, x, rex, cont, s); 1737 case REX_EXEC: 1738 u = 0; 1739 r = (*env->disc->re_execf)(env->regex, rex->re.exec.data, rex->re.exec.text, rex->re.exec.size, (const char*)s, env->end - s, &u, env->disc); 1740 e = (unsigned char*)u; 1741 if (e >= s && e <= env->end) 1742 s = e; 1743 switch (r) 1744 { 1745 case 0: 1746 break; 1747 case REG_NOMATCH: 1748 return NONE; 1749 default: 1750 env->error = r; 1751 return BAD; 1752 } 1753 break; 1754 case REX_WBEG: 1755 if (!isword(*s) || s > env->beg && isword(*(s - 1))) 1756 return NONE; 1757 break; 1758 case REX_WEND: 1759 if (isword(*s) || s > env->beg && !isword(*(s - 1))) 1760 return NONE; 1761 break; 1762 case REX_WORD: 1763 if (s > env->beg && isword(*(s - 1)) == isword(*s)) 1764 return NONE; 1765 break; 1766 case REX_WORD_NOT: 1767 if (s == env->beg || isword(*(s - 1)) != isword(*s)) 1768 return NONE; 1769 break; 1770 case REX_BEG_STR: 1771 if (s != env->beg) 1772 return NONE; 1773 break; 1774 case REX_END_STR: 1775 for (t = s; t < env->end && *t == '\n'; t++); 1776 if (t < env->end) 1777 return NONE; 1778 break; 1779 case REX_FIN_STR: 1780 if (s < env->end) 1781 return NONE; 1782 break; 1783 } 1784 if (!(rex = rex->next)) 1785 { 1786 if (!(rex = cont)) 1787 break; 1788 cont = 0; 1789 } 1790 } 1791 return GOOD; 1792 } 1793 1794 #if _AST_REGEX_DEBUG 1795 1796 static void 1797 listnode(Rex_t* e, int level) 1798 { 1799 int i; 1800 1801 if (e) 1802 { 1803 do 1804 { 1805 for (i = 0; i < level; i++) 1806 sfprintf(sfstderr, " "); 1807 sfprintf(sfstderr, "%s\n", rexname(e)); 1808 switch (e->type) 1809 { 1810 case REX_ALT: 1811 case REX_CONJ: 1812 listnode(e->re.group.expr.binary.left, level + 1); 1813 listnode(e->re.group.expr.binary.right, level + 1); 1814 break; 1815 case REX_GROUP: 1816 case REX_GROUP_AHEAD: 1817 case REX_GROUP_AHEAD_NOT: 1818 case REX_GROUP_BEHIND: 1819 case REX_GROUP_BEHIND_NOT: 1820 case REX_GROUP_CUT: 1821 case REX_NEG: 1822 case REX_REP: 1823 listnode(e->re.group.expr.rex, level + 1); 1824 break; 1825 } 1826 } while (e = e->next); 1827 } 1828 } 1829 1830 static int 1831 list(Env_t* env, Rex_t* rex) 1832 { 1833 sfprintf(sfstderr, "AHA regex hard=%d stack=%p\n", env->hard, env->stack); 1834 if (rex) 1835 listnode(rex, 1); 1836 return 0; 1837 } 1838 1839 #endif 1840 1841 /* 1842 * returning REG_BADPAT or REG_ESPACE is not explicitly 1843 * countenanced by the standard 1844 */ 1845 1846 int 1847 regnexec(const regex_t* p, const char* s, size_t len, size_t nmatch, regmatch_t* match, regflags_t flags) 1848 { 1849 register int n; 1850 register int i; 1851 int j; 1852 int k; 1853 int m; 1854 int advance; 1855 Env_t* env; 1856 Rex_t* e; 1857 1858 DEBUG_INIT(); 1859 DEBUG_TEST(0x0001,(sfprintf(sfstdout, "AHA#%04d 0x%04x regnexec %d 0x%08x `%-.*s'\n", __LINE__, debug_flag, nmatch, flags, len, s)),(0)); 1860 if (!p || !(env = p->env)) 1861 return REG_BADPAT; 1862 if (!s) 1863 return fatal(env->disc, REG_BADPAT, NiL); 1864 if (len < env->min) 1865 { 1866 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, len, env->min)),(0)); 1867 return REG_NOMATCH; 1868 } 1869 env->regex = p; 1870 env->beg = (unsigned char*)s; 1871 env->end = env->beg + len; 1872 stknew(stkstd, &env->stk); 1873 env->flags &= ~REG_EXEC; 1874 env->flags |= (flags & REG_EXEC); 1875 advance = 0; 1876 if (env->stack = env->hard || !(env->flags & REG_NOSUB) && nmatch) 1877 { 1878 n = env->nsub; 1879 if (!(env->match = (regmatch_t*)stkpush(stkstd, 2 * (n + 1) * sizeof(regmatch_t))) || 1880 !env->pos && !(env->pos = vecopen(16, sizeof(Pos_t))) || 1881 !env->bestpos && !(env->bestpos = vecopen(16, sizeof(Pos_t)))) 1882 { 1883 k = REG_ESPACE; 1884 goto done; 1885 } 1886 env->pos->cur = env->bestpos->cur = 0; 1887 env->best = &env->match[n + 1]; 1888 env->best[0].rm_so = 0; 1889 env->best[0].rm_eo = -1; 1890 for (i = 0; i <= n; i++) 1891 env->match[i] = state.nomatch; 1892 if (flags & REG_ADVANCE) 1893 advance = 1; 1894 } 1895 DEBUG_TEST(0x1000,(list(env,env->rex)),(0)); 1896 k = REG_NOMATCH; 1897 if ((e = env->rex)->type == REX_BM) 1898 { 1899 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REX_BM\n", __LINE__)),(0)); 1900 if (len < e->re.bm.right) 1901 { 1902 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, len, e->re.bm.right)),(0)); 1903 goto done; 1904 } 1905 else if (!(flags & REG_LEFT)) 1906 { 1907 register unsigned char* buf = (unsigned char*)s; 1908 register size_t index = e->re.bm.left + e->re.bm.size; 1909 register size_t mid = len - e->re.bm.right; 1910 register size_t* skip = e->re.bm.skip; 1911 register size_t* fail = e->re.bm.fail; 1912 register Bm_mask_t** mask = e->re.bm.mask; 1913 Bm_mask_t m; 1914 size_t x; 1915 1916 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REX_BM len=%d right=%d left=%d size=%d %d %d\n", __LINE__, len, e->re.bm.right, e->re.bm.left, e->re.bm.size, index, mid)),(0)); 1917 for (;;) 1918 { 1919 while ((index += skip[buf[index]]) < mid); 1920 if (index < HIT) 1921 { 1922 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, index, HIT)),(0)); 1923 goto done; 1924 } 1925 index -= HIT; 1926 m = mask[n = e->re.bm.size - 1][buf[index]]; 1927 do 1928 { 1929 if (!n--) 1930 { 1931 if (e->re.bm.back < 0) 1932 goto possible; 1933 if (advance) 1934 { 1935 i = index - e->re.bm.back; 1936 s += i; 1937 if (env->stack) 1938 env->best[0].rm_so += i; 1939 goto possible; 1940 } 1941 x = index; 1942 if (index < e->re.bm.back) 1943 index = 0; 1944 else 1945 index -= e->re.bm.back; 1946 while (index <= x) 1947 { 1948 if ((i = parse(env, e->next, &env->done, buf + index)) != NONE) 1949 { 1950 if (env->stack) 1951 env->best[0].rm_so = index; 1952 n = env->nsub; 1953 goto hit; 1954 } 1955 index++; 1956 } 1957 index += e->re.bm.size; 1958 break; 1959 } 1960 } while (m &= mask[n][buf[--index]]); 1961 if ((index += fail[n + 1]) >= len) 1962 goto done; 1963 } 1964 } 1965 possible: 1966 n = env->nsub; 1967 e = e->next; 1968 } 1969 j = env->once || (flags & REG_LEFT); 1970 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d parse once=%d\n", __LINE__, j)),(0)); 1971 while ((i = parse(env, e, &env->done, (unsigned char*)s)) == NONE || advance && !env->best[0].rm_eo && !(advance = 0)) 1972 { 1973 if (j) 1974 goto done; 1975 i = MBSIZE(s); 1976 s += i; 1977 if ((unsigned char*)s > env->end - env->min) 1978 goto done; 1979 if (env->stack) 1980 env->best[0].rm_so += i; 1981 } 1982 if ((flags & REG_LEFT) && env->stack && env->best[0].rm_so) 1983 goto done; 1984 hit: 1985 if (k = env->error) 1986 goto done; 1987 if (i == CUT) 1988 { 1989 k = env->error = REG_NOMATCH; 1990 goto done; 1991 } 1992 if (!(env->flags & REG_NOSUB)) 1993 { 1994 k = (env->flags & (REG_SHELL|REG_AUGMENTED)) == (REG_SHELL|REG_AUGMENTED); 1995 for (i = j = m = 0; j < nmatch; i++) 1996 if (!i || !k || (i & 1)) 1997 { 1998 if (i > n) 1999 match[j] = state.nomatch; 2000 else 2001 match[m = j] = env->best[i]; 2002 j++; 2003 } 2004 if (k) 2005 { 2006 while (m > 0 && match[m].rm_so == -1 && match[m].rm_eo == -1) 2007 m--; 2008 ((regex_t*)p)->re_nsub = m; 2009 } 2010 } 2011 k = 0; 2012 done: 2013 stkold(stkstd, &env->stk); 2014 env->stk.base = 0; 2015 if (k > REG_NOMATCH) 2016 fatal(p->env->disc, k, NiL); 2017 return k; 2018 } 2019 2020 void 2021 regfree(regex_t* p) 2022 { 2023 Env_t* env; 2024 2025 if (p && (env = p->env)) 2026 { 2027 #if _REG_subcomp 2028 if (env->sub) 2029 { 2030 regsubfree(p); 2031 p->re_sub = 0; 2032 } 2033 #endif 2034 p->env = 0; 2035 if (--env->refs <= 0 && !(env->disc->re_flags & REG_NOFREE)) 2036 { 2037 drop(env->disc, env->rex); 2038 if (env->pos) 2039 vecclose(env->pos); 2040 if (env->bestpos) 2041 vecclose(env->bestpos); 2042 if (env->stk.base) 2043 stkold(stkstd, &env->stk); 2044 alloc(env->disc, env, 0); 2045 } 2046 } 2047 } 2048