1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1985-2009 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * Phong Vo <kpv@research.att.com> * 20 * * 21 ***********************************************************************/ 22 #pragma prototyped 23 24 /* 25 * posix regex executor 26 * single sized-record interface 27 */ 28 29 #include "reglib.h" 30 31 #if _AST_REGEX_DEBUG 32 33 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) 34 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) 35 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_exec_debug")) debug |= strtoul(t, NiL, 0); } } while (0) 36 37 static unsigned long debug; 38 static unsigned long debug_flag; 39 40 static const char* rexnames[] = 41 { 42 "REX_NULL", 43 "REX_ALT", 44 "REX_ALT_CATCH", 45 "REX_BACK", 46 "REX_BEG", 47 "REX_BEG_STR", 48 "REX_BM", 49 "REX_CAT", 50 "REX_CLASS", 51 "REX_COLL_CLASS", 52 "REX_CONJ", 53 "REX_CONJ_LEFT", 54 "REX_CONJ_RIGHT", 55 "REX_DONE", 56 "REX_DOT", 57 "REX_END", 58 "REX_END_STR", 59 "REX_EXEC", 60 "REX_FIN_STR", 61 "REX_GROUP", 62 "REX_GROUP_CATCH", 63 "REX_GROUP_AHEAD", 64 "REX_GROUP_AHEAD_CATCH", 65 "REX_GROUP_AHEAD_NOT", 66 "REX_GROUP_BEHIND", 67 "REX_GROUP_BEHIND_CATCH", 68 "REX_GROUP_BEHIND_NOT", 69 "REX_GROUP_BEHIND_NOT_CATCH", 70 "REX_GROUP_COND", 71 "REX_GROUP_COND_CATCH", 72 "REX_GROUP_CUT", 73 "REX_GROUP_CUT_CATCH", 74 "REX_KMP", 75 "REX_NEG", 76 "REX_NEG_CATCH", 77 "REX_NEST", 78 "REX_ONECHAR", 79 "REX_REP", 80 "REX_REP_CATCH", 81 "REX_STRING", 82 "REX_TRIE", 83 "REX_WBEG", 84 "REX_WEND", 85 "REX_WORD", 86 "REX_WORD_NOT" 87 }; 88 89 static const char* rexname(Rex_t* rex) 90 { 91 if (!rex) 92 return "NIL"; 93 if (rex->type >= elementsof(rexnames)) 94 return "ERROR"; 95 return rexnames[rex->type]; 96 } 97 98 #else 99 100 #define DEBUG_INIT() 101 #define DEBUG_TEST(f,y,n) (n) 102 #define DEBUG_CODE(f,y,n) do {n} while(0) 103 104 #endif 105 106 #define BEG_ALT 1 /* beginning of an alt */ 107 #define BEG_ONE 2 /* beginning of one iteration of a rep */ 108 #define BEG_REP 3 /* beginning of a repetition */ 109 #define BEG_SUB 4 /* beginning of a subexpression */ 110 #define END_ANY 5 /* end of any of above */ 111 112 /* 113 * returns from parse() 114 */ 115 116 #define NONE 0 /* no parse found */ 117 #define GOOD 1 /* some parse was found */ 118 #define CUT 2 /* no match and no backtrack */ 119 #define BEST 3 /* an unbeatable parse was found */ 120 #define BAD 4 /* error ocurred */ 121 122 /* 123 * REG_SHELL_DOT test 124 */ 125 126 #define LEADING(e,r,s) (*(s)==(e)->leading&&((s)==(e)->beg||*((s)-1)==(r)->explicit)) 127 128 /* 129 * Pos_t is for comparing parses. An entry is made in the 130 * array at the beginning and at the end of each Group_t, 131 * each iteration in a Group_t, and each Binary_t. 132 */ 133 134 typedef struct 135 { 136 unsigned char* p; /* where in string */ 137 size_t length; /* length in string */ 138 short serial; /* preorder subpattern number */ 139 short be; /* which end of pair */ 140 } Pos_t; 141 142 /* ===== begin library support ===== */ 143 144 #define vector(t,v,i) (((i)<(v)->max)?(t*)((v)->vec+(i)*(v)->siz):(t*)vecseek(&(v),i)) 145 146 static Vector_t* 147 vecopen(int inc, int siz) 148 { 149 Vector_t* v; 150 Stk_t* sp; 151 152 if (inc <= 0) 153 inc = 16; 154 if (!(sp = stkopen(STK_SMALL|STK_NULL))) 155 return 0; 156 if (!(v = (Vector_t*)stkseek(sp, sizeof(Vector_t) + inc * siz))) 157 { 158 stkclose(sp); 159 return 0; 160 } 161 v->stk = sp; 162 v->vec = (char*)v + sizeof(Vector_t); 163 v->max = v->inc = inc; 164 v->siz = siz; 165 v->cur = 0; 166 return v; 167 } 168 169 static void* 170 vecseek(Vector_t** p, int index) 171 { 172 Vector_t* v = *p; 173 174 if (index >= v->max) 175 { 176 while ((v->max += v->inc) <= index); 177 if (!(v = (Vector_t*)stkseek(v->stk, sizeof(Vector_t) + v->max * v->siz))) 178 return 0; 179 *p = v; 180 v->vec = (char*)v + sizeof(Vector_t); 181 } 182 return v->vec + index * v->siz; 183 } 184 185 static void 186 vecclose(Vector_t* v) 187 { 188 if (v) 189 stkclose(v->stk); 190 } 191 192 typedef struct 193 { 194 Stk_pos_t pos; 195 char data[1]; 196 } Stk_frame_t; 197 198 #define stknew(s,p) ((p)->offset=stktell(s),(p)->base=stkfreeze(s,0)) 199 #define stkold(s,p) stkset(s,(p)->base,(p)->offset) 200 201 #define stkframe(s) (*((Stk_frame_t**)stktop(s)-1)) 202 #define stkdata(s,t) ((t*)stkframe(s)->data) 203 #define stkpop(s) stkold(s,&(stkframe(s)->pos)) 204 205 static void* 206 stkpush(Stk_t* sp, size_t size) 207 { 208 Stk_frame_t* f; 209 Stk_pos_t p; 210 211 stknew(sp, &p); 212 size = sizeof(Stk_frame_t) + sizeof(size_t) + size - 1; 213 if (!(f = (Stk_frame_t*)stkalloc(sp, sizeof(Stk_frame_t) + sizeof(Stk_frame_t*) + size - 1))) 214 return 0; 215 f->pos = p; 216 stkframe(sp) = f; 217 return f->data; 218 } 219 220 /* ===== end library support ===== */ 221 222 /* 223 * Match_frame_t is for saving and restoring match records 224 * around alternate attempts, so that fossils will not be 225 * left in the match array. These are the only entries in 226 * the match array that are not otherwise guaranteed to 227 * have current data in them when they get used. 228 */ 229 230 typedef struct 231 { 232 size_t size; 233 regmatch_t* match; 234 regmatch_t save[1]; 235 } Match_frame_t; 236 237 #define matchframe stkdata(stkstd,Match_frame_t) 238 #define matchpush(e,x) ((x)->re.group.number?_matchpush(e,x):0) 239 #define matchcopy(e,x) ((x)->re.group.number?memcpy(matchframe->match,matchframe->save,matchframe->size):(void*)0) 240 #define matchpop(e,x) ((x)->re.group.number?memcpy(matchframe->match,matchframe->save,matchframe->size),stkpop(stkstd):(void*)0) 241 242 #define pospop(e) (--(e)->pos->cur) 243 244 /* 245 * allocate a frame and push a match onto the stack 246 */ 247 248 static int 249 _matchpush(Env_t* env, Rex_t* rex) 250 { 251 Match_frame_t* f; 252 regmatch_t* m; 253 regmatch_t* e; 254 regmatch_t* s; 255 int num; 256 257 if (rex->re.group.number <= 0 || (num = rex->re.group.last - rex->re.group.number + 1) <= 0) 258 num = 0; 259 if (!(f = (Match_frame_t*)stkpush(stkstd, sizeof(Match_frame_t) + (num - 1) * sizeof(regmatch_t)))) 260 { 261 env->error = REG_ESPACE; 262 return 1; 263 } 264 f->size = num * sizeof(regmatch_t); 265 f->match = m = env->match + rex->re.group.number; 266 e = m + num; 267 s = f->save; 268 while (m < e) 269 { 270 *s++ = *m; 271 *m++ = state.nomatch; 272 } 273 return 0; 274 } 275 276 /* 277 * allocate a frame and push a pos onto the stack 278 */ 279 280 static int 281 pospush(Env_t* env, Rex_t* rex, unsigned char* p, int be) 282 { 283 Pos_t* pos; 284 285 if (!(pos = vector(Pos_t, env->pos, env->pos->cur))) 286 { 287 env->error = REG_ESPACE; 288 return 1; 289 } 290 pos->serial = rex->serial; 291 pos->p = p; 292 pos->be = be; 293 env->pos->cur++; 294 return 0; 295 } 296 297 /* 298 * two matches are known to have the same length 299 * os is start of old pos array, ns is start of new, 300 * oend and nend are end+1 pointers to ends of arrays. 301 * oe and ne are ends (not end+1) of subarrays. 302 * returns 1 if new is better, -1 if old, else 0. 303 */ 304 305 static int 306 better(Env_t* env, Pos_t* os, Pos_t* ns, Pos_t* oend, Pos_t* nend, int level) 307 { 308 Pos_t* oe; 309 Pos_t* ne; 310 int k; 311 int n; 312 313 if (env->error) 314 return -1; 315 for (;;) 316 { 317 DEBUG_CODE(0x0080,{sfprintf(sfstdout, " %-*.*sold ", (level + 3) * 4, (level + 3) * 4, "");for (oe = os; oe < oend; oe++)sfprintf(sfstdout, "<%d,%d,%d>", oe->p - env->beg, oe->serial, oe->be);sfprintf(sfstdout, "\n %-*.*snew ", (level + 3) * 4, (level + 3) * 4, "");for (oe = ns; oe < nend; oe++)sfprintf(sfstdout, "<%d,%d,%d>", oe->p - env->beg, oe->serial, oe->be);sfprintf(sfstdout, "\n");},{0;}); 318 if (ns >= nend) 319 return DEBUG_TEST(0x8000,(os < oend),(0)); 320 if (os >= oend) 321 return DEBUG_TEST(0x8000,(-1),(1)); 322 n = os->serial; 323 if (ns->serial > n) 324 return -1; 325 if (n > ns->serial) 326 { 327 env->error = REG_PANIC; 328 return -1; 329 } 330 if (ns->p > os->p) 331 return 1; 332 if (os->p > ns->p) 333 return -1; 334 oe = os; 335 k = 0; 336 for (;;) 337 if ((++oe)->serial == n) 338 { 339 if (oe->be != END_ANY) 340 k++; 341 else if (k-- <= 0) 342 break; 343 } 344 ne = ns; 345 k = 0; 346 for (;;) 347 if ((++ne)->serial == n) 348 { 349 if (ne->be != END_ANY) 350 k++; 351 else if (k-- <= 0) 352 break; 353 } 354 if (ne->p > oe->p) 355 return 1; 356 if (oe->p > ne->p) 357 return -1; 358 if (k = better(env, os + 1, ns + 1, oe, ne, level + 1)) 359 return k; 360 os = oe + 1; 361 ns = ne + 1; 362 } 363 } 364 365 #if _AST_REGEX_DEBUG 366 367 static void 368 showmatch(regmatch_t* p) 369 { 370 sfputc(sfstdout, '('); 371 if (p->rm_so < 0) 372 sfputc(sfstdout, '?'); 373 else 374 sfprintf(sfstdout, "%d", p->rm_so); 375 sfputc(sfstdout, ','); 376 if (p->rm_eo < 0) 377 sfputc(sfstdout, '?'); 378 else 379 sfprintf(sfstdout, "%d", p->rm_eo); 380 sfputc(sfstdout, ')'); 381 } 382 383 static int 384 _better(Env_t* env, Pos_t* os, Pos_t* ns, Pos_t* oend, Pos_t* nend, int level) 385 { 386 int i; 387 388 DEBUG_CODE(0x0040,{sfprintf(sfstdout, "AHA better old ");for (i = 0; i <= env->nsub; i++)showmatch(&env->best[i]);sfprintf(sfstdout, "\n new ");for (i = 0; i <= env->nsub; i++)showmatch(&env->match[i]);sfprintf(sfstdout, "\n");},{0;}); 389 i = better(env, os, ns, oend, nend, 0); 390 DEBUG_TEST(0x0040,(sfprintf(sfstdout, " %s\n", i <= 0 ? "OLD" : "NEW")),(0)); 391 return i; 392 } 393 394 #define better _better 395 396 #endif 397 398 #define follow(e,r,c,s) ((r)->next?parse(e,(r)->next,c,s):(c)?parse(e,c,0,s):BEST) 399 400 static int parse(Env_t*, Rex_t*, Rex_t*, unsigned char*); 401 402 static int 403 parserep(Env_t* env, Rex_t* rex, Rex_t* cont, unsigned char* s, int n) 404 { 405 int i; 406 int r = NONE; 407 Rex_t catcher; 408 409 DEBUG_TEST(0x0010,(sfprintf(sfstdout, "AHA#%04d 0x%04x parserep %s %d %d %d %d `%-.*s'\n", __LINE__, debug_flag, rexname(rex->re.group.expr.rex), rex->re.group.number, rex->lo, n, rex->hi, env->end - s, s)),(0)); 410 if ((rex->flags & REG_MINIMAL) && n >= rex->lo && n < rex->hi) 411 { 412 if (env->stack && pospush(env, rex, s, END_ANY)) 413 return BAD; 414 i = follow(env, rex, cont, s); 415 if (env->stack) 416 pospop(env); 417 switch (i) 418 { 419 case BAD: 420 return BAD; 421 case CUT: 422 return CUT; 423 case BEST: 424 case GOOD: 425 return BEST; 426 } 427 } 428 if (n < rex->hi) 429 { 430 catcher.type = REX_REP_CATCH; 431 catcher.serial = rex->serial; 432 catcher.re.rep_catch.ref = rex; 433 catcher.re.rep_catch.cont = cont; 434 catcher.re.rep_catch.beg = s; 435 catcher.re.rep_catch.n = n + 1; 436 catcher.next = rex->next; 437 if (n == 0) 438 rex->re.rep_catch.beg = s; 439 if (env->stack) 440 { 441 if (matchpush(env, rex)) 442 return BAD; 443 if (pospush(env, rex, s, BEG_ONE)) 444 return BAD; 445 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x PUSH %d (%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)(%d,%d)\n", __LINE__, debug_flag, rex->re.group.number, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo, env->match[2].rm_so, env->match[2].rm_eo)),(0)); 446 } 447 r = parse(env, rex->re.group.expr.rex, &catcher, s); 448 DEBUG_TEST(0x0010,(sfprintf(sfstdout, "AHA#%04d 0x%04x parserep parse %d %d `%-.*s'\n", __LINE__, debug_flag, rex->re.group.number, r, env->end - s, s)),(0)); 449 if (env->stack) 450 { 451 pospop(env); 452 matchpop(env, rex); 453 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x POP %d %d (%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)(%d,%d)\n", __LINE__, debug_flag, rex->re.group.number, r, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo, env->match[2].rm_so, env->match[2].rm_eo)),(0)); 454 } 455 switch (r) 456 { 457 case BAD: 458 return BAD; 459 case BEST: 460 return BEST; 461 case CUT: 462 r = NONE; 463 break; 464 case GOOD: 465 if (rex->flags & REG_MINIMAL) 466 return BEST; 467 r = GOOD; 468 break; 469 } 470 } 471 if (n < rex->lo) 472 return r; 473 if (!(rex->flags & REG_MINIMAL) || n >= rex->hi) 474 { 475 if (env->stack && pospush(env, rex, s, END_ANY)) 476 return BAD; 477 i = follow(env, rex, cont, s); 478 if (env->stack) 479 pospop(env); 480 switch (i) 481 { 482 case BAD: 483 r = BAD; 484 break; 485 case CUT: 486 r = CUT; 487 break; 488 case BEST: 489 r = BEST; 490 break; 491 case GOOD: 492 r = (rex->flags & REG_MINIMAL) ? BEST : GOOD; 493 break; 494 } 495 } 496 return r; 497 } 498 499 static int 500 parsetrie(Env_t* env, Trie_node_t* x, Rex_t* rex, Rex_t* cont, unsigned char* s) 501 { 502 unsigned char* p; 503 int r; 504 505 if (p = rex->map) 506 { 507 for (;;) 508 { 509 if (s >= env->end) 510 return NONE; 511 while (x->c != p[*s]) 512 if (!(x = x->sib)) 513 return NONE; 514 if (x->end) 515 break; 516 x = x->son; 517 s++; 518 } 519 } 520 else 521 { 522 for (;;) 523 { 524 if (s >= env->end) 525 return NONE; 526 while (x->c != *s) 527 if (!(x = x->sib)) 528 return NONE; 529 if (x->end) 530 break; 531 x = x->son; 532 s++; 533 } 534 } 535 s++; 536 if (rex->flags & REG_MINIMAL) 537 switch (follow(env, rex, cont, s)) 538 { 539 case BAD: 540 return BAD; 541 case CUT: 542 return CUT; 543 case BEST: 544 case GOOD: 545 return BEST; 546 } 547 if (x->son) 548 switch (parsetrie(env, x->son, rex, cont, s)) 549 { 550 case BAD: 551 return BAD; 552 case CUT: 553 return CUT; 554 case BEST: 555 return BEST; 556 case GOOD: 557 if (rex->flags & REG_MINIMAL) 558 return BEST; 559 r = GOOD; 560 break; 561 default: 562 r = NONE; 563 break; 564 } 565 else 566 r = NONE; 567 if (!(rex->flags & REG_MINIMAL)) 568 switch (follow(env, rex, cont, s)) 569 { 570 case BAD: 571 return BAD; 572 case CUT: 573 return CUT; 574 case BEST: 575 return BEST; 576 case GOOD: 577 return GOOD; 578 } 579 return r; 580 } 581 582 static int 583 collelt(register Celt_t* ce, char* key, int c, int x) 584 { 585 Ckey_t elt; 586 587 mbxfrm(elt, key, COLL_KEY_MAX); 588 for (;; ce++) 589 { 590 switch (ce->typ) 591 { 592 case COLL_call: 593 if (!x && (*ce->fun)(c)) 594 return 1; 595 continue; 596 case COLL_char: 597 if (!strcmp((char*)ce->beg, (char*)elt)) 598 return 1; 599 continue; 600 case COLL_range: 601 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max) 602 return 1; 603 continue; 604 case COLL_range_lc: 605 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max && (iswlower(c) || !iswupper(c))) 606 return 1; 607 continue; 608 case COLL_range_uc: 609 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max && (iswupper(c) || !iswlower(c))) 610 return 1; 611 continue; 612 } 613 break; 614 } 615 return 0; 616 } 617 618 static int 619 collic(register Celt_t* ce, char* key, register char* nxt, int c, int x) 620 { 621 if (!x) 622 { 623 if (collelt(ce, key, c, x)) 624 return 1; 625 if (iswlower(c)) 626 c = towupper(c); 627 else if (iswupper(c)) 628 c = towlower(c); 629 else 630 return 0; 631 x = wctomb(key, c); 632 key[x] = 0; 633 return collelt(ce, key, c, 0); 634 } 635 while (*nxt) 636 { 637 if (collic(ce, key, nxt + 1, c, x)) 638 return 1; 639 if (islower(*nxt)) 640 *nxt = toupper(*nxt); 641 else if (isupper(*nxt)) 642 *nxt = tolower(*nxt); 643 else 644 return 0; 645 nxt++; 646 } 647 return collelt(ce, key, c, x); 648 } 649 650 static int 651 collmatch(Rex_t* rex, unsigned char* s, unsigned char* e, unsigned char** p) 652 { 653 unsigned char* t; 654 wchar_t c; 655 int w; 656 int r; 657 int x; 658 int ic; 659 Ckey_t key; 660 Ckey_t elt; 661 662 ic = (rex->flags & REG_ICASE); 663 if ((w = MBSIZE(s)) > 1) 664 { 665 memcpy((char*)key, (char*)s, w); 666 key[w] = 0; 667 t = s; 668 c = mbchar(t); 669 #if !_lib_wctype 670 c &= 0xff; 671 #endif 672 x = 0; 673 } 674 else 675 { 676 c = s[0]; 677 if (ic && isupper(c)) 678 c = tolower(c); 679 key[0] = c; 680 key[1] = 0; 681 if (isalpha(c)) 682 { 683 x = e - s; 684 if (x > COLL_KEY_MAX) 685 x = COLL_KEY_MAX; 686 while (w < x) 687 { 688 c = s[w]; 689 if (!isalpha(c)) 690 break; 691 r = mbxfrm(elt, key, COLL_KEY_MAX); 692 if (ic && isupper(c)) 693 c = tolower(c); 694 key[w] = c; 695 key[w + 1] = 0; 696 if (mbxfrm(elt, key, COLL_KEY_MAX) != r) 697 break; 698 w++; 699 } 700 } 701 key[w] = 0; 702 c = key[0]; 703 x = w - 1; 704 } 705 r = 1; 706 for (;;) 707 { 708 if (ic ? collic(rex->re.collate.elements, (char*)key, (char*)key, c, x) : collelt(rex->re.collate.elements, (char*)key, c, x)) 709 break; 710 if (!x) 711 { 712 r = 0; 713 break; 714 } 715 w = x--; 716 key[w] = 0; 717 } 718 *p = s + w; 719 return rex->re.collate.invert ? !r : r; 720 } 721 722 static unsigned char* 723 nestmatch(register unsigned char* s, register unsigned char* e, const unsigned short* type, register int co) 724 { 725 register int c; 726 register int cc; 727 unsigned int n; 728 int oc; 729 730 if (type[co] & (REX_NEST_literal|REX_NEST_quote)) 731 { 732 n = (type[co] & REX_NEST_literal) ? REX_NEST_terminator : (REX_NEST_escape|REX_NEST_terminator); 733 while (s < e) 734 { 735 c = *s++; 736 if (c == co) 737 return s; 738 else if (type[c] & n) 739 { 740 if (s >= e || (type[c] & REX_NEST_terminator)) 741 break; 742 s++; 743 } 744 } 745 } 746 else 747 { 748 cc = type[co] >> REX_NEST_SHIFT; 749 oc = type[co] & (REX_NEST_open|REX_NEST_close); 750 n = 1; 751 while (s < e) 752 { 753 c = *s++; 754 switch (type[c] & (REX_NEST_escape|REX_NEST_open|REX_NEST_close|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)) 755 { 756 case REX_NEST_delimiter: 757 case REX_NEST_terminator: 758 return oc ? 0 : s; 759 case REX_NEST_separator: 760 if (!oc) 761 return s; 762 break; 763 case REX_NEST_escape: 764 if (s >= e) 765 return 0; 766 s++; 767 break; 768 case REX_NEST_open|REX_NEST_close: 769 if (c == cc) 770 { 771 if (!--n) 772 return s; 773 } 774 /*FALLTHROUGH*/ 775 case REX_NEST_open: 776 if (c == co) 777 { 778 if (!++n) 779 return 0; 780 } 781 else if (!(s = nestmatch(s, e, type, c))) 782 return 0; 783 break; 784 case REX_NEST_close: 785 if (c != cc) 786 return 0; 787 if (!--n) 788 return s; 789 break; 790 } 791 } 792 return (oc || !(type[UCHAR_MAX+1] & REX_NEST_terminator)) ? 0 : s; 793 } 794 return 0; 795 } 796 797 static int 798 parse(Env_t* env, Rex_t* rex, Rex_t* cont, unsigned char* s) 799 { 800 int c; 801 int d; 802 int i; 803 int m; 804 int n; 805 int r; 806 int* f; 807 unsigned char* p; 808 unsigned char* t; 809 unsigned char* b; 810 unsigned char* e; 811 char* u; 812 regmatch_t* o; 813 Trie_node_t* x; 814 Rex_t* q; 815 Rex_t catcher; 816 Rex_t next; 817 818 for (;;) 819 { 820 DEBUG_TEST(0x0008,(sfprintf(sfstdout, "AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), env->end - s, s)),(0)); 821 switch (rex->type) 822 { 823 case REX_ALT: 824 if (env->stack) 825 { 826 if (matchpush(env, rex)) 827 return BAD; 828 if (pospush(env, rex, s, BEG_ALT)) 829 return BAD; 830 catcher.type = REX_ALT_CATCH; 831 catcher.serial = rex->serial; 832 catcher.re.alt_catch.cont = cont; 833 catcher.next = rex->next; 834 r = parse(env, rex->re.group.expr.binary.left, &catcher, s); 835 if (r < BEST || (rex->flags & REG_MINIMAL)) 836 { 837 matchcopy(env, rex); 838 ((Pos_t*)env->pos->vec + env->pos->cur - 1)->serial = catcher.serial = rex->re.group.expr.binary.serial; 839 n = parse(env, rex->re.group.expr.binary.right, &catcher, s); 840 if (n != NONE) 841 r = n; 842 } 843 pospop(env); 844 matchpop(env, rex); 845 } 846 else 847 { 848 if ((r = parse(env, rex->re.group.expr.binary.left, cont, s)) == NONE) 849 r = parse(env, rex->re.group.expr.binary.right, cont, s); 850 if (r == GOOD) 851 r = BEST; 852 } 853 return r; 854 case REX_ALT_CATCH: 855 if (pospush(env, rex, s, END_ANY)) 856 return BAD; 857 r = follow(env, rex, rex->re.alt_catch.cont, s); 858 pospop(env); 859 return r; 860 case REX_BACK: 861 o = &env->match[rex->lo]; 862 if (o->rm_so < 0) 863 return NONE; 864 i = o->rm_eo - o->rm_so; 865 e = s + i; 866 if (e > env->end) 867 return NONE; 868 t = env->beg + o->rm_so; 869 if (!(p = rex->map)) 870 { 871 while (s < e) 872 if (*s++ != *t++) 873 return NONE; 874 } 875 else if (!mbwide()) 876 { 877 while (s < e) 878 if (p[*s++] != p[*t++]) 879 return NONE; 880 } 881 else 882 { 883 while (s < e) 884 { 885 c = mbchar(s); 886 d = mbchar(t); 887 if (towupper(c) != towupper(d)) 888 return NONE; 889 } 890 } 891 break; 892 case REX_BEG: 893 if ((!(rex->flags & REG_NEWLINE) || s <= env->beg || *(s - 1) != '\n') && ((env->flags & REG_NOTBOL) || s != env->beg)) 894 return NONE; 895 break; 896 case REX_CLASS: 897 if (LEADING(env, rex, s)) 898 return NONE; 899 n = rex->hi; 900 if (n > env->end - s) 901 n = env->end - s; 902 m = rex->lo; 903 if (m > n) 904 return NONE; 905 r = NONE; 906 if (!(rex->flags & REG_MINIMAL)) 907 { 908 for (i = 0; i < n; i++) 909 if (!settst(rex->re.charclass, s[i])) 910 { 911 n = i; 912 break; 913 } 914 for (s += n; n-- >= m; s--) 915 switch (follow(env, rex, cont, s)) 916 { 917 case BAD: 918 return BAD; 919 case CUT: 920 return CUT; 921 case BEST: 922 return BEST; 923 case GOOD: 924 r = GOOD; 925 break; 926 } 927 } 928 else 929 { 930 for (e = s + m; s < e; s++) 931 if (!settst(rex->re.charclass, *s)) 932 return r; 933 e += n - m; 934 for (;;) 935 { 936 switch (follow(env, rex, cont, s)) 937 { 938 case BAD: 939 return BAD; 940 case CUT: 941 return CUT; 942 case BEST: 943 case GOOD: 944 return BEST; 945 } 946 if (s >= e || !settst(rex->re.charclass, *s)) 947 break; 948 s++; 949 } 950 } 951 return r; 952 case REX_COLL_CLASS: 953 if (LEADING(env, rex, s)) 954 return NONE; 955 n = rex->hi; 956 if (n > env->end - s) 957 n = env->end - s; 958 m = rex->lo; 959 if (m > n) 960 return NONE; 961 r = NONE; 962 e = env->end; 963 if (!(rex->flags & REG_MINIMAL)) 964 { 965 if (!(b = (unsigned char*)stkpush(stkstd, n))) 966 { 967 env->error = REG_ESPACE; 968 return BAD; 969 } 970 for (i = 0; s < e && i < n && collmatch(rex, s, e, &t); i++) 971 { 972 b[i] = t - s; 973 s = t; 974 } 975 for (; i-- >= rex->lo; s -= b[i]) 976 switch (follow(env, rex, cont, s)) 977 { 978 case BAD: 979 stkpop(stkstd); 980 return BAD; 981 case CUT: 982 stkpop(stkstd); 983 return CUT; 984 case BEST: 985 stkpop(stkstd); 986 return BEST; 987 case GOOD: 988 r = GOOD; 989 break; 990 } 991 stkpop(stkstd); 992 } 993 else 994 { 995 for (i = 0; i < m && s < e; i++, s = t) 996 if (!collmatch(rex, s, e, &t)) 997 return r; 998 while (i++ <= n) 999 { 1000 switch (follow(env, rex, cont, s)) 1001 { 1002 case BAD: 1003 return BAD; 1004 case CUT: 1005 return CUT; 1006 case BEST: 1007 case GOOD: 1008 return BEST; 1009 } 1010 if (s >= e || !collmatch(rex, s, e, &s)) 1011 break; 1012 } 1013 } 1014 return r; 1015 case REX_CONJ: 1016 next.type = REX_CONJ_RIGHT; 1017 next.re.conj_right.cont = cont; 1018 next.next = rex->next; 1019 catcher.type = REX_CONJ_LEFT; 1020 catcher.re.conj_left.right = rex->re.group.expr.binary.right; 1021 catcher.re.conj_left.cont = &next; 1022 catcher.re.conj_left.beg = s; 1023 catcher.next = 0; 1024 return parse(env, rex->re.group.expr.binary.left, &catcher, s); 1025 case REX_CONJ_LEFT: 1026 rex->re.conj_left.cont->re.conj_right.end = s; 1027 cont = rex->re.conj_left.cont; 1028 s = rex->re.conj_left.beg; 1029 rex = rex->re.conj_left.right; 1030 continue; 1031 case REX_CONJ_RIGHT: 1032 if (rex->re.conj_right.end != s) 1033 return NONE; 1034 cont = rex->re.conj_right.cont; 1035 break; 1036 case REX_DONE: 1037 if (!env->stack) 1038 return BEST; 1039 n = s - env->beg; 1040 r = env->nsub; 1041 DEBUG_TEST(0x0100,(sfprintf(sfstdout,"AHA#%04d 0x%04x %s (%d,%d)(%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rexname(rex), env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->best[3].rm_so, env->best[3].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 1042 if ((i = env->best[0].rm_eo) >= 0) 1043 { 1044 if (rex->flags & REG_MINIMAL) 1045 { 1046 if (n > i) 1047 return GOOD; 1048 } 1049 else 1050 { 1051 if (n < i) 1052 return GOOD; 1053 } 1054 if (n == i && better(env, 1055 (Pos_t*)env->bestpos->vec, 1056 (Pos_t*)env->pos->vec, 1057 (Pos_t*)env->bestpos->vec+env->bestpos->cur, 1058 (Pos_t*)env->pos->vec+env->pos->cur, 1059 0) <= 0) 1060 return GOOD; 1061 } 1062 env->best[0].rm_eo = n; 1063 memcpy(&env->best[1], &env->match[1], r * sizeof(regmatch_t)); 1064 n = env->pos->cur; 1065 if (!vector(Pos_t, env->bestpos, n)) 1066 { 1067 env->error = REG_ESPACE; 1068 return BAD; 1069 } 1070 env->bestpos->cur = n; 1071 memcpy(env->bestpos->vec, env->pos->vec, n * sizeof(Pos_t)); 1072 DEBUG_TEST(0x0100,(sfprintf(sfstdout,"AHA#%04d 0x%04x %s (%d,%d)(%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rexname(rex), env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->best[3].rm_so, env->best[3].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 1073 return GOOD; 1074 case REX_DOT: 1075 if (LEADING(env, rex, s)) 1076 return NONE; 1077 n = rex->hi; 1078 if (n > env->end - s) 1079 n = env->end - s; 1080 m = rex->lo; 1081 if (m > n) 1082 return NONE; 1083 if ((c = rex->explicit) >= 0 && !mbwide()) 1084 for (i = 0; i < n; i++) 1085 if (s[i] == c) 1086 { 1087 n = i; 1088 break; 1089 } 1090 r = NONE; 1091 if (!(rex->flags & REG_MINIMAL)) 1092 { 1093 if (!mbwide()) 1094 { 1095 for (s += n; n-- >= m; s--) 1096 switch (follow(env, rex, cont, s)) 1097 { 1098 case BAD: 1099 return BAD; 1100 case CUT: 1101 return CUT; 1102 case BEST: 1103 return BEST; 1104 case GOOD: 1105 r = GOOD; 1106 break; 1107 } 1108 } 1109 else 1110 { 1111 if (!(b = (unsigned char*)stkpush(stkstd, n))) 1112 { 1113 env->error = REG_ESPACE; 1114 return BAD; 1115 } 1116 e = env->end; 1117 for (i = 0; s < e && i < n && *s != c; i++) 1118 s += b[i] = MBSIZE(s); 1119 for (; i-- >= m; s -= b[i]) 1120 switch (follow(env, rex, cont, s)) 1121 { 1122 case BAD: 1123 stkpop(stkstd); 1124 return BAD; 1125 case CUT: 1126 stkpop(stkstd); 1127 return CUT; 1128 case BEST: 1129 stkpop(stkstd); 1130 return BEST; 1131 case GOOD: 1132 r = GOOD; 1133 break; 1134 } 1135 stkpop(stkstd); 1136 } 1137 } 1138 else 1139 { 1140 if (!mbwide()) 1141 { 1142 e = s + n; 1143 for (s += m; s <= e; s++) 1144 switch (follow(env, rex, cont, s)) 1145 { 1146 case BAD: 1147 return BAD; 1148 case CUT: 1149 return CUT; 1150 case BEST: 1151 case GOOD: 1152 return BEST; 1153 } 1154 } 1155 else 1156 { 1157 e = env->end; 1158 for (i = 0; s < e && i < m && *s != c; i++) 1159 s += MBSIZE(s); 1160 if (i >= m) 1161 for (; s <= e && i <= n; s += MBSIZE(s), i++) 1162 switch (follow(env, rex, cont, s)) 1163 { 1164 case BAD: 1165 return BAD; 1166 case CUT: 1167 return CUT; 1168 case BEST: 1169 case GOOD: 1170 return BEST; 1171 } 1172 } 1173 } 1174 return r; 1175 case REX_END: 1176 if ((!(rex->flags & REG_NEWLINE) || *s != '\n') && ((env->flags & REG_NOTEOL) || s < env->end)) 1177 return NONE; 1178 break; 1179 case REX_GROUP: 1180 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), env->end - s, s)),(0)); 1181 if (env->stack) 1182 { 1183 if (rex->re.group.number) 1184 env->match[rex->re.group.number].rm_so = s - env->beg; 1185 if (pospush(env, rex, s, BEG_SUB)) 1186 return BAD; 1187 catcher.re.group_catch.eo = rex->re.group.number ? &env->match[rex->re.group.number].rm_eo : (regoff_t*)0; 1188 } 1189 catcher.type = REX_GROUP_CATCH; 1190 catcher.serial = rex->serial; 1191 catcher.re.group_catch.cont = cont; 1192 catcher.next = rex->next; 1193 r = parse(env, rex->re.group.expr.rex, &catcher, s); 1194 if (env->stack) 1195 { 1196 pospop(env); 1197 if (rex->re.group.number) 1198 env->match[rex->re.group.number].rm_so = -1; 1199 } 1200 return r; 1201 case REX_GROUP_CATCH: 1202 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s=>%s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rexname(rex->re.group_catch.cont), env->end - s, s)),(0)); 1203 if (env->stack) 1204 { 1205 if (rex->re.group_catch.eo) 1206 *rex->re.group_catch.eo = s - env->beg; 1207 if (pospush(env, rex, s, END_ANY)) 1208 return BAD; 1209 } 1210 r = follow(env, rex, rex->re.group_catch.cont, s); 1211 if (env->stack) 1212 { 1213 pospop(env); 1214 if (rex->re.group_catch.eo) 1215 *rex->re.group_catch.eo = -1; 1216 } 1217 return r; 1218 case REX_GROUP_AHEAD: 1219 catcher.type = REX_GROUP_AHEAD_CATCH; 1220 catcher.flags = rex->flags; 1221 catcher.serial = rex->serial; 1222 catcher.re.rep_catch.beg = s; 1223 catcher.re.rep_catch.cont = cont; 1224 catcher.next = rex->next; 1225 return parse(env, rex->re.group.expr.rex, &catcher, s); 1226 case REX_GROUP_AHEAD_CATCH: 1227 return follow(env, rex, rex->re.rep_catch.cont, rex->re.rep_catch.beg); 1228 case REX_GROUP_AHEAD_NOT: 1229 r = parse(env, rex->re.group.expr.rex, NiL, s); 1230 if (r == NONE) 1231 r = follow(env, rex, cont, s); 1232 else if (r != BAD) 1233 r = NONE; 1234 return r; 1235 case REX_GROUP_BEHIND: 1236 if ((s - env->beg) < rex->re.group.size) 1237 return NONE; 1238 catcher.type = REX_GROUP_BEHIND_CATCH; 1239 catcher.flags = rex->flags; 1240 catcher.serial = rex->serial; 1241 catcher.re.behind_catch.beg = s; 1242 catcher.re.behind_catch.end = e = env->end; 1243 catcher.re.behind_catch.cont = cont; 1244 catcher.next = rex->next; 1245 for (t = s - rex->re.group.size; t >= env->beg; t--) 1246 { 1247 env->end = s; 1248 r = parse(env, rex->re.group.expr.rex, &catcher, t); 1249 env->end = e; 1250 if (r != NONE) 1251 return r; 1252 } 1253 return NONE; 1254 case REX_GROUP_BEHIND_CATCH: 1255 if (s != rex->re.behind_catch.beg) 1256 return NONE; 1257 env->end = rex->re.behind_catch.end; 1258 return follow(env, rex, rex->re.behind_catch.cont, rex->re.behind_catch.beg); 1259 case REX_GROUP_BEHIND_NOT: 1260 if ((s - env->beg) < rex->re.group.size) 1261 r = NONE; 1262 else 1263 { 1264 catcher.type = REX_GROUP_BEHIND_NOT_CATCH; 1265 catcher.re.neg_catch.beg = s; 1266 catcher.next = 0; 1267 e = env->end; 1268 env->end = s; 1269 for (t = s - rex->re.group.size; t >= env->beg; t--) 1270 { 1271 r = parse(env, rex->re.group.expr.rex, &catcher, t); 1272 if (r != NONE) 1273 break; 1274 } 1275 env->end = e; 1276 } 1277 if (r == NONE) 1278 r = follow(env, rex, cont, s); 1279 else if (r != BAD) 1280 r = NONE; 1281 return r; 1282 case REX_GROUP_BEHIND_NOT_CATCH: 1283 return s == rex->re.neg_catch.beg ? GOOD : NONE; 1284 case REX_GROUP_COND: 1285 if (q = rex->re.group.expr.binary.right) 1286 { 1287 catcher.re.cond_catch.next[0] = q->re.group.expr.binary.right; 1288 catcher.re.cond_catch.next[1] = q->re.group.expr.binary.left; 1289 } 1290 else 1291 catcher.re.cond_catch.next[0] = catcher.re.cond_catch.next[1] = 0; 1292 if (q = rex->re.group.expr.binary.left) 1293 { 1294 catcher.type = REX_GROUP_COND_CATCH; 1295 catcher.flags = rex->flags; 1296 catcher.serial = rex->serial; 1297 catcher.re.cond_catch.yes = 0; 1298 catcher.re.cond_catch.beg = s; 1299 catcher.re.cond_catch.cont = cont; 1300 catcher.next = rex->next; 1301 r = parse(env, q, &catcher, s); 1302 if (r == BAD || catcher.re.cond_catch.yes) 1303 return r; 1304 } 1305 else if (!rex->re.group.size || rex->re.group.size > 0 && env->match[rex->re.group.size].rm_so >= 0) 1306 r = GOOD; 1307 else 1308 r = NONE; 1309 if (q = catcher.re.cond_catch.next[r != NONE]) 1310 { 1311 catcher.type = REX_CAT; 1312 catcher.flags = q->flags; 1313 catcher.serial = q->serial; 1314 catcher.re.group_catch.cont = cont; 1315 catcher.next = rex->next; 1316 return parse(env, q, &catcher, s); 1317 } 1318 return follow(env, rex, cont, s); 1319 case REX_GROUP_COND_CATCH: 1320 rex->re.cond_catch.yes = 1; 1321 catcher.type = REX_CAT; 1322 catcher.flags = rex->flags; 1323 catcher.serial = rex->serial; 1324 catcher.re.group_catch.cont = rex->re.cond_catch.cont; 1325 catcher.next = rex->next; 1326 return parse(env, rex->re.cond_catch.next[1], &catcher, rex->re.cond_catch.beg); 1327 case REX_CAT: 1328 return follow(env, rex, rex->re.group_catch.cont, s); 1329 case REX_GROUP_CUT: 1330 catcher.type = REX_GROUP_CUT_CATCH; 1331 catcher.flags = rex->flags; 1332 catcher.serial = rex->serial; 1333 catcher.re.group_catch.cont = cont; 1334 catcher.next = rex->next; 1335 return parse(env, rex->re.group.expr.rex, &catcher, s); 1336 case REX_GROUP_CUT_CATCH: 1337 switch (r = follow(env, rex, rex->re.group_catch.cont, s)) 1338 { 1339 case GOOD: 1340 r = BEST; 1341 break; 1342 case NONE: 1343 r = CUT; 1344 break; 1345 } 1346 return r; 1347 case REX_KMP: 1348 f = rex->re.string.fail; 1349 b = rex->re.string.base; 1350 n = rex->re.string.size; 1351 t = s; 1352 e = env->end; 1353 if (p = rex->map) 1354 { 1355 while (t + n <= e) 1356 { 1357 for (i = -1; t < e; t++) 1358 { 1359 while (i >= 0 && b[i+1] != p[*t]) 1360 i = f[i]; 1361 if (b[i+1] == p[*t]) 1362 i++; 1363 if (i + 1 == n) 1364 { 1365 t++; 1366 if (env->stack) 1367 env->best[0].rm_so = t - s - n; 1368 switch (follow(env, rex, cont, t)) 1369 { 1370 case BAD: 1371 return BAD; 1372 case CUT: 1373 return CUT; 1374 case BEST: 1375 case GOOD: 1376 return BEST; 1377 } 1378 t -= n - 1; 1379 break; 1380 } 1381 } 1382 } 1383 } 1384 else 1385 { 1386 while (t + n <= e) 1387 { 1388 for (i = -1; t < e; t++) 1389 { 1390 while (i >= 0 && b[i+1] != *t) 1391 i = f[i]; 1392 if (b[i+1] == *t) 1393 i++; 1394 if (i + 1 == n) 1395 { 1396 t++; 1397 if (env->stack) 1398 env->best[0].rm_so = t - s - n; 1399 switch (follow(env, rex, cont, t)) 1400 { 1401 case BAD: 1402 return BAD; 1403 case CUT: 1404 return CUT; 1405 case BEST: 1406 case GOOD: 1407 return BEST; 1408 } 1409 t -= n - 1; 1410 break; 1411 } 1412 } 1413 } 1414 } 1415 return NONE; 1416 case REX_NEG: 1417 if (LEADING(env, rex, s)) 1418 return NONE; 1419 i = env->end - s; 1420 n = ((i + 7) >> 3) + 1; 1421 catcher.type = REX_NEG_CATCH; 1422 catcher.re.neg_catch.beg = s; 1423 if (!(p = (unsigned char*)stkpush(stkstd, n))) 1424 return BAD; 1425 memset(catcher.re.neg_catch.index = p, 0, n); 1426 catcher.next = rex->next; 1427 if (parse(env, rex->re.group.expr.rex, &catcher, s) == BAD) 1428 r = BAD; 1429 else 1430 { 1431 r = NONE; 1432 for (; i >= 0; i--) 1433 if (!bittst(p, i)) 1434 { 1435 switch (follow(env, rex, cont, s + i)) 1436 { 1437 case BAD: 1438 r = BAD; 1439 break; 1440 case BEST: 1441 r = BEST; 1442 break; 1443 case CUT: 1444 r = CUT; 1445 break; 1446 case GOOD: 1447 r = GOOD; 1448 /*FALLTHROUGH*/ 1449 default: 1450 continue; 1451 } 1452 break; 1453 } 1454 } 1455 stkpop(stkstd); 1456 return r; 1457 case REX_NEG_CATCH: 1458 bitset(rex->re.neg_catch.index, s - rex->re.neg_catch.beg); 1459 return NONE; 1460 case REX_NEST: 1461 if (s >= env->end) 1462 return NONE; 1463 do 1464 { 1465 if ((c = *s++) == rex->re.nest.primary) 1466 { 1467 if (s >= env->end || !(s = nestmatch(s, env->end, rex->re.nest.type, c))) 1468 return NONE; 1469 break; 1470 } 1471 if (rex->re.nest.primary >= 0) 1472 return NONE; 1473 if (rex->re.nest.type[c] & (REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)) 1474 break; 1475 if (!(s = nestmatch(s, env->end, rex->re.nest.type, c))) 1476 return NONE; 1477 } while (s < env->end && !(rex->re.nest.type[*(s-1)] & (REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))); 1478 break; 1479 case REX_NULL: 1480 break; 1481 case REX_ONECHAR: 1482 n = rex->hi; 1483 if (n > env->end - s) 1484 n = env->end - s; 1485 m = rex->lo; 1486 if (m > n) 1487 return NONE; 1488 r = NONE; 1489 c = rex->re.onechar; 1490 if (!(rex->flags & REG_MINIMAL)) 1491 { 1492 if (!mbwide()) 1493 { 1494 if (p = rex->map) 1495 { 1496 for (i = 0; i < n; i++, s++) 1497 if (p[*s] != c) 1498 break; 1499 } 1500 else 1501 { 1502 for (i = 0; i < n; i++, s++) 1503 if (*s != c) 1504 break; 1505 } 1506 for (; i-- >= m; s--) 1507 switch (follow(env, rex, cont, s)) 1508 { 1509 case BAD: 1510 return BAD; 1511 case BEST: 1512 return BEST; 1513 case CUT: 1514 return CUT; 1515 case GOOD: 1516 r = GOOD; 1517 break; 1518 } 1519 } 1520 else 1521 { 1522 if (!(b = (unsigned char*)stkpush(stkstd, n))) 1523 { 1524 env->error = REG_ESPACE; 1525 return BAD; 1526 } 1527 e = env->end; 1528 if (!(rex->flags & REG_ICASE)) 1529 { 1530 for (i = 0; s < e && i < n; i++, s = t) 1531 { 1532 t = s; 1533 if (mbchar(t) != c) 1534 break; 1535 b[i] = t - s; 1536 } 1537 } 1538 else 1539 { 1540 for (i = 0; s < e && i < n; i++, s = t) 1541 { 1542 t = s; 1543 if (towupper(mbchar(t)) != c) 1544 break; 1545 b[i] = t - s; 1546 } 1547 } 1548 for (; i-- >= m; s -= b[i]) 1549 switch (follow(env, rex, cont, s)) 1550 { 1551 case BAD: 1552 stkpop(stkstd); 1553 return BAD; 1554 case BEST: 1555 stkpop(stkstd); 1556 return BEST; 1557 case CUT: 1558 stkpop(stkstd); 1559 return CUT; 1560 case GOOD: 1561 r = GOOD; 1562 break; 1563 } 1564 stkpop(stkstd); 1565 } 1566 } 1567 else 1568 { 1569 if (!mbwide()) 1570 { 1571 e = s + m; 1572 if (p = rex->map) 1573 { 1574 for (; s < e; s++) 1575 if (p[*s] != c) 1576 return r; 1577 e += n - m; 1578 for (;;) 1579 { 1580 switch (follow(env, rex, cont, s)) 1581 { 1582 case BAD: 1583 return BAD; 1584 case CUT: 1585 return CUT; 1586 case BEST: 1587 case GOOD: 1588 return BEST; 1589 } 1590 if (s >= e || p[*s++] != c) 1591 break; 1592 } 1593 } 1594 else 1595 { 1596 for (; s < e; s++) 1597 if (*s != c) 1598 return r; 1599 e += n - m; 1600 for (;;) 1601 { 1602 switch (follow(env, rex, cont, s)) 1603 { 1604 case BAD: 1605 return BAD; 1606 case CUT: 1607 return CUT; 1608 case BEST: 1609 case GOOD: 1610 return BEST; 1611 } 1612 if (s >= e || *s++ != c) 1613 break; 1614 } 1615 } 1616 } 1617 else 1618 { 1619 if (!(rex->flags & REG_ICASE)) 1620 { 1621 for (i = 0; i < m && s < e; i++, s = t) 1622 { 1623 t = s; 1624 if (mbchar(t) != c) 1625 return r; 1626 } 1627 while (i++ <= n) 1628 { 1629 switch (follow(env, rex, cont, s)) 1630 { 1631 case BAD: 1632 return BAD; 1633 case CUT: 1634 return CUT; 1635 case BEST: 1636 case GOOD: 1637 return BEST; 1638 } 1639 if (s >= e || mbchar(s) != c) 1640 break; 1641 } 1642 } 1643 else 1644 { 1645 for (i = 0; i < m && s < e; i++, s = t) 1646 { 1647 t = s; 1648 if (towupper(mbchar(t)) != c) 1649 return r; 1650 } 1651 while (i++ <= n) 1652 { 1653 switch (follow(env, rex, cont, s)) 1654 { 1655 case BAD: 1656 return BAD; 1657 case CUT: 1658 return CUT; 1659 case BEST: 1660 case GOOD: 1661 return BEST; 1662 } 1663 if (s >= e || towupper(mbchar(s)) != c) 1664 break; 1665 } 1666 } 1667 } 1668 } 1669 return r; 1670 case REX_REP: 1671 if (env->stack && pospush(env, rex, s, BEG_REP)) 1672 return BAD; 1673 r = parserep(env, rex, cont, s, 0); 1674 if (env->stack) 1675 pospop(env); 1676 return r; 1677 case REX_REP_CATCH: 1678 DEBUG_TEST(0x0020,(sfprintf(sfstdout, "AHA#%04d 0x%04x %s n %d len %d s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rex->re.rep_catch.n, s - rex->re.rep_catch.beg, env->end - s, s)),(0)); 1679 if (env->stack && pospush(env, rex, s, END_ANY)) 1680 return BAD; 1681 if (s == rex->re.rep_catch.beg && rex->re.rep_catch.n > rex->re.rep_catch.ref->lo) 1682 { 1683 /* 1684 * optional empty iteration 1685 */ 1686 1687 DEBUG_TEST(0x0002,(sfprintf(sfstdout, "AHA#%04d %p re.group.back=%d re.group.expr.rex=%s\n", __LINE__, rex->re.rep_catch.ref->re.group.expr.rex, rex->re.rep_catch.ref->re.group.expr.rex->re.group.back, rexname(rex->re.rep_catch.ref->re.group.expr.rex))),(0)); 1688 if (!env->stack || s != rex->re.rep_catch.ref->re.rep_catch.beg && !rex->re.rep_catch.ref->re.group.expr.rex->re.group.back) 1689 r = NONE; 1690 else if (pospush(env, rex, s, END_ANY)) 1691 r = BAD; 1692 else 1693 { 1694 r = follow(env, rex, rex->re.rep_catch.cont, s); 1695 pospop(env); 1696 } 1697 } 1698 else 1699 r = parserep(env, rex->re.rep_catch.ref, rex->re.rep_catch.cont, s, rex->re.rep_catch.n); 1700 if (env->stack) 1701 pospop(env); 1702 return r; 1703 case REX_STRING: 1704 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s \"%-.*s\" `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rex->re.string.size, rex->re.string.base, env->end - s, s)),(0)); 1705 if (rex->re.string.size > (env->end - s)) 1706 return NONE; 1707 t = rex->re.string.base; 1708 e = t + rex->re.string.size; 1709 if (!(p = rex->map)) 1710 { 1711 while (t < e) 1712 if (*s++ != *t++) 1713 return NONE; 1714 } 1715 else if (!mbwide()) 1716 { 1717 while (t < e) 1718 if (p[*s++] != *t++) 1719 return NONE; 1720 } 1721 else 1722 { 1723 while (t < e) 1724 { 1725 c = mbchar(s); 1726 d = mbchar(t); 1727 if (towupper(c) != d) 1728 return NONE; 1729 } 1730 } 1731 break; 1732 case REX_TRIE: 1733 if (((s + rex->re.trie.min) > env->end) || !(x = rex->re.trie.root[rex->map ? rex->map[*s] : *s])) 1734 return NONE; 1735 return parsetrie(env, x, rex, cont, s); 1736 case REX_EXEC: 1737 u = 0; 1738 r = (*env->disc->re_execf)(env->regex, rex->re.exec.data, rex->re.exec.text, rex->re.exec.size, (const char*)s, env->end - s, &u, env->disc); 1739 e = (unsigned char*)u; 1740 if (e >= s && e <= env->end) 1741 s = e; 1742 switch (r) 1743 { 1744 case 0: 1745 break; 1746 case REG_NOMATCH: 1747 return NONE; 1748 default: 1749 env->error = r; 1750 return BAD; 1751 } 1752 break; 1753 case REX_WBEG: 1754 if (!isword(*s) || s > env->beg && isword(*(s - 1))) 1755 return NONE; 1756 break; 1757 case REX_WEND: 1758 if (isword(*s) || s > env->beg && !isword(*(s - 1))) 1759 return NONE; 1760 break; 1761 case REX_WORD: 1762 if (s > env->beg && isword(*(s - 1)) == isword(*s)) 1763 return NONE; 1764 break; 1765 case REX_WORD_NOT: 1766 if (s == env->beg || isword(*(s - 1)) != isword(*s)) 1767 return NONE; 1768 break; 1769 case REX_BEG_STR: 1770 if (s != env->beg) 1771 return NONE; 1772 break; 1773 case REX_END_STR: 1774 for (t = s; t < env->end && *t == '\n'; t++); 1775 if (t < env->end) 1776 return NONE; 1777 break; 1778 case REX_FIN_STR: 1779 if (s < env->end) 1780 return NONE; 1781 break; 1782 } 1783 if (!(rex = rex->next)) 1784 { 1785 if (!(rex = cont)) 1786 break; 1787 cont = 0; 1788 } 1789 } 1790 return GOOD; 1791 } 1792 1793 #if _AST_REGEX_DEBUG 1794 1795 static void 1796 listnode(Rex_t* e, int level) 1797 { 1798 int i; 1799 1800 if (e) 1801 { 1802 do 1803 { 1804 for (i = 0; i < level; i++) 1805 sfprintf(sfstderr, " "); 1806 sfprintf(sfstderr, "%s\n", rexname(e)); 1807 switch (e->type) 1808 { 1809 case REX_ALT: 1810 case REX_CONJ: 1811 listnode(e->re.group.expr.binary.left, level + 1); 1812 listnode(e->re.group.expr.binary.right, level + 1); 1813 break; 1814 case REX_GROUP: 1815 case REX_GROUP_AHEAD: 1816 case REX_GROUP_AHEAD_NOT: 1817 case REX_GROUP_BEHIND: 1818 case REX_GROUP_BEHIND_NOT: 1819 case REX_GROUP_CUT: 1820 case REX_NEG: 1821 case REX_REP: 1822 listnode(e->re.group.expr.rex, level + 1); 1823 break; 1824 } 1825 } while (e = e->next); 1826 } 1827 } 1828 1829 static int 1830 list(Env_t* env, Rex_t* rex) 1831 { 1832 sfprintf(sfstderr, "AHA regex hard=%d stack=%p\n", env->hard, env->stack); 1833 if (rex) 1834 listnode(rex, 1); 1835 return 0; 1836 } 1837 1838 #endif 1839 1840 /* 1841 * returning REG_BADPAT or REG_ESPACE is not explicitly 1842 * countenanced by the standard 1843 */ 1844 1845 int 1846 regnexec(const regex_t* p, const char* s, size_t len, size_t nmatch, regmatch_t* match, regflags_t flags) 1847 { 1848 register int n; 1849 register int i; 1850 int j; 1851 int k; 1852 int m; 1853 int advance; 1854 Env_t* env; 1855 Rex_t* e; 1856 1857 DEBUG_INIT(); 1858 DEBUG_TEST(0x0001,(sfprintf(sfstdout, "AHA#%04d 0x%04x regnexec %d 0x%08x `%-.*s'\n", __LINE__, debug_flag, nmatch, flags, len, s)),(0)); 1859 if (!p || !(env = p->env)) 1860 return REG_BADPAT; 1861 if (!s) 1862 return fatal(env->disc, REG_BADPAT, NiL); 1863 if (len < env->min) 1864 { 1865 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, len, env->min)),(0)); 1866 return REG_NOMATCH; 1867 } 1868 env->regex = p; 1869 env->beg = (unsigned char*)s; 1870 env->end = env->beg + len; 1871 stknew(stkstd, &env->stk); 1872 env->flags &= ~REG_EXEC; 1873 env->flags |= (flags & REG_EXEC); 1874 advance = 0; 1875 if (env->stack = env->hard || !(env->flags & REG_NOSUB) && nmatch) 1876 { 1877 n = env->nsub; 1878 if (!(env->match = (regmatch_t*)stkpush(stkstd, 2 * (n + 1) * sizeof(regmatch_t))) || 1879 !env->pos && !(env->pos = vecopen(16, sizeof(Pos_t))) || 1880 !env->bestpos && !(env->bestpos = vecopen(16, sizeof(Pos_t)))) 1881 { 1882 k = REG_ESPACE; 1883 goto done; 1884 } 1885 env->pos->cur = env->bestpos->cur = 0; 1886 env->best = &env->match[n + 1]; 1887 env->best[0].rm_so = 0; 1888 env->best[0].rm_eo = -1; 1889 for (i = 0; i <= n; i++) 1890 env->match[i] = state.nomatch; 1891 if (flags & REG_ADVANCE) 1892 advance = 1; 1893 } 1894 DEBUG_TEST(0x1000,(list(env,env->rex)),(0)); 1895 k = REG_NOMATCH; 1896 if ((e = env->rex)->type == REX_BM) 1897 { 1898 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REX_BM\n", __LINE__)),(0)); 1899 if (len < e->re.bm.right) 1900 { 1901 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, len, e->re.bm.right)),(0)); 1902 goto done; 1903 } 1904 else if (!(flags & REG_LEFT)) 1905 { 1906 register unsigned char* buf = (unsigned char*)s; 1907 register size_t index = e->re.bm.left + e->re.bm.size; 1908 register size_t mid = len - e->re.bm.right; 1909 register size_t* skip = e->re.bm.skip; 1910 register size_t* fail = e->re.bm.fail; 1911 register Bm_mask_t** mask = e->re.bm.mask; 1912 Bm_mask_t m; 1913 size_t x; 1914 1915 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REX_BM len=%d right=%d left=%d size=%d %d %d\n", __LINE__, len, e->re.bm.right, e->re.bm.left, e->re.bm.size, index, mid)),(0)); 1916 for (;;) 1917 { 1918 while ((index += skip[buf[index]]) < mid); 1919 if (index < HIT) 1920 { 1921 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, index, HIT)),(0)); 1922 goto done; 1923 } 1924 index -= HIT; 1925 m = mask[n = e->re.bm.size - 1][buf[index]]; 1926 do 1927 { 1928 if (!n--) 1929 { 1930 if (e->re.bm.back < 0) 1931 goto possible; 1932 if (advance) 1933 { 1934 i = index - e->re.bm.back; 1935 s += i; 1936 if (env->stack) 1937 env->best[0].rm_so += i; 1938 goto possible; 1939 } 1940 x = index; 1941 if (index < e->re.bm.back) 1942 index = 0; 1943 else 1944 index -= e->re.bm.back; 1945 while (index <= x) 1946 { 1947 if ((i = parse(env, e->next, &env->done, buf + index)) != NONE) 1948 { 1949 if (env->stack) 1950 env->best[0].rm_so = index; 1951 n = env->nsub; 1952 goto hit; 1953 } 1954 index++; 1955 } 1956 index += e->re.bm.size; 1957 break; 1958 } 1959 } while (m &= mask[n][buf[--index]]); 1960 if ((index += fail[n + 1]) >= len) 1961 goto done; 1962 } 1963 } 1964 possible: 1965 n = env->nsub; 1966 e = e->next; 1967 } 1968 j = env->once || (flags & REG_LEFT); 1969 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d parse once=%d\n", __LINE__, j)),(0)); 1970 while ((i = parse(env, e, &env->done, (unsigned char*)s)) == NONE || advance && !env->best[0].rm_eo && !(advance = 0)) 1971 { 1972 if (j) 1973 goto done; 1974 i = MBSIZE(s); 1975 s += i; 1976 if ((unsigned char*)s > env->end - env->min) 1977 goto done; 1978 if (env->stack) 1979 env->best[0].rm_so += i; 1980 } 1981 if ((flags & REG_LEFT) && env->stack && env->best[0].rm_so) 1982 goto done; 1983 hit: 1984 if (k = env->error) 1985 goto done; 1986 if (i == CUT) 1987 { 1988 k = env->error = REG_NOMATCH; 1989 goto done; 1990 } 1991 if (!(env->flags & REG_NOSUB)) 1992 { 1993 k = (env->flags & (REG_SHELL|REG_AUGMENTED)) == (REG_SHELL|REG_AUGMENTED); 1994 for (i = j = m = 0; j < nmatch; i++) 1995 if (!i || !k || (i & 1)) 1996 { 1997 if (i > n) 1998 match[j] = state.nomatch; 1999 else 2000 match[m = j] = env->best[i]; 2001 j++; 2002 } 2003 if (k) 2004 { 2005 while (m > 0 && match[m].rm_so == -1 && match[m].rm_eo == -1) 2006 m--; 2007 ((regex_t*)p)->re_nsub = m; 2008 } 2009 } 2010 k = 0; 2011 done: 2012 stkold(stkstd, &env->stk); 2013 env->stk.base = 0; 2014 if (k > REG_NOMATCH) 2015 fatal(p->env->disc, k, NiL); 2016 return k; 2017 } 2018 2019 void 2020 regfree(regex_t* p) 2021 { 2022 Env_t* env; 2023 2024 if (p && (env = p->env)) 2025 { 2026 #if _REG_subcomp 2027 if (env->sub) 2028 { 2029 regsubfree(p); 2030 p->re_sub = 0; 2031 } 2032 #endif 2033 p->env = 0; 2034 if (--env->refs <= 0 && !(env->disc->re_flags & REG_NOFREE)) 2035 { 2036 drop(env->disc, env->rex); 2037 if (env->pos) 2038 vecclose(env->pos); 2039 if (env->bestpos) 2040 vecclose(env->bestpos); 2041 if (env->stk.base) 2042 stkold(stkstd, &env->stk); 2043 alloc(env->disc, env, 0); 2044 } 2045 } 2046 } 2047