1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1985-2008 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Common Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.opensource.org/licenses/cpl1.0.txt * 11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * Phong Vo <kpv@research.att.com> * 20 * * 21 ***********************************************************************/ 22 #pragma prototyped 23 24 /* 25 * posix regex executor 26 * single sized-record interface 27 */ 28 29 #include "reglib.h" 30 31 #if _AST_REGEX_DEBUG 32 33 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) 34 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) 35 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_exec_debug")) debug |= strtoul(t, NiL, 0); } } while (0) 36 37 static unsigned long debug; 38 static unsigned long debug_flag; 39 40 static const char* rexnames[] = 41 { 42 "REX_NULL", 43 "REX_ALT", 44 "REX_ALT_CATCH", 45 "REX_BACK", 46 "REX_BEG", 47 "REX_BEG_STR", 48 "REX_BM", 49 "REX_CAT", 50 "REX_CLASS", 51 "REX_COLL_CLASS", 52 "REX_CONJ", 53 "REX_CONJ_LEFT", 54 "REX_CONJ_RIGHT", 55 "REX_DONE", 56 "REX_DOT", 57 "REX_END", 58 "REX_END_STR", 59 "REX_EXEC", 60 "REX_FIN_STR", 61 "REX_GROUP", 62 "REX_GROUP_CATCH", 63 "REX_GROUP_AHEAD", 64 "REX_GROUP_AHEAD_CATCH", 65 "REX_GROUP_AHEAD_NOT", 66 "REX_GROUP_BEHIND", 67 "REX_GROUP_BEHIND_CATCH", 68 "REX_GROUP_BEHIND_NOT", 69 "REX_GROUP_BEHIND_NOT_CATCH", 70 "REX_GROUP_COND", 71 "REX_GROUP_COND_CATCH", 72 "REX_GROUP_CUT", 73 "REX_GROUP_CUT_CATCH", 74 "REX_KMP", 75 "REX_NEG", 76 "REX_NEG_CATCH", 77 "REX_NEST", 78 "REX_ONECHAR", 79 "REX_REP", 80 "REX_REP_CATCH", 81 "REX_STRING", 82 "REX_TRIE", 83 "REX_WBEG", 84 "REX_WEND", 85 "REX_WORD", 86 "REX_WORD_NOT" 87 }; 88 89 static const char* rexname(Rex_t* rex) 90 { 91 if (!rex) 92 return "NIL"; 93 if (rex->type >= elementsof(rexnames)) 94 return "ERROR"; 95 return rexnames[rex->type]; 96 } 97 98 #else 99 100 #define DEBUG_INIT() 101 #define DEBUG_TEST(f,y,n) (n) 102 #define DEBUG_CODE(f,y,n) do {n} while(0) 103 104 #endif 105 106 #define BEG_ALT 1 /* beginning of an alt */ 107 #define BEG_ONE 2 /* beginning of one iteration of a rep */ 108 #define BEG_REP 3 /* beginning of a repetition */ 109 #define BEG_SUB 4 /* beginning of a subexpression */ 110 #define END_ANY 5 /* end of any of above */ 111 112 /* 113 * returns from parse() 114 */ 115 116 #define NONE 0 /* no parse found */ 117 #define GOOD 1 /* some parse was found */ 118 #define CUT 2 /* no match and no backtrack */ 119 #define BEST 3 /* an unbeatable parse was found */ 120 #define BAD 4 /* error ocurred */ 121 122 /* 123 * REG_SHELL_DOT test 124 */ 125 126 #define LEADING(e,r,s) (*(s)==(e)->leading&&((s)==(e)->beg||*((s)-1)==(r)->explicit)) 127 128 /* 129 * Pos_t is for comparing parses. An entry is made in the 130 * array at the beginning and at the end of each Group_t, 131 * each iteration in a Group_t, and each Binary_t. 132 */ 133 134 typedef struct 135 { 136 unsigned char* p; /* where in string */ 137 size_t length; /* length in string */ 138 short serial; /* preorder subpattern number */ 139 short be; /* which end of pair */ 140 } Pos_t; 141 142 /* ===== begin library support ===== */ 143 144 #define vector(t,v,i) (((i)<(v)->max)?(t*)((v)->vec+(i)*(v)->siz):(t*)vecseek(&(v),i)) 145 146 static Vector_t* 147 vecopen(int inc, int siz) 148 { 149 Vector_t* v; 150 Stk_t* sp; 151 152 if (inc <= 0) 153 inc = 16; 154 if (!(sp = stkopen(STK_SMALL|STK_NULL))) 155 return 0; 156 if (!(v = (Vector_t*)stkseek(sp, sizeof(Vector_t) + inc * siz))) 157 { 158 stkclose(sp); 159 return 0; 160 } 161 v->stk = sp; 162 v->vec = (char*)v + sizeof(Vector_t); 163 v->max = v->inc = inc; 164 v->siz = siz; 165 v->cur = 0; 166 return v; 167 } 168 169 static void* 170 vecseek(Vector_t** p, int index) 171 { 172 Vector_t* v = *p; 173 174 if (index >= v->max) 175 { 176 while ((v->max += v->inc) <= index); 177 if (!(v = (Vector_t*)stkseek(v->stk, sizeof(Vector_t) + v->max * v->siz))) 178 return 0; 179 *p = v; 180 v->vec = (char*)v + sizeof(Vector_t); 181 } 182 return v->vec + index * v->siz; 183 } 184 185 static void 186 vecclose(Vector_t* v) 187 { 188 if (v) 189 stkclose(v->stk); 190 } 191 192 typedef struct 193 { 194 Stk_pos_t pos; 195 char data[1]; 196 } Stk_frame_t; 197 198 #define stknew(s,p) ((p)->offset=stktell(s),(p)->base=stkfreeze(s,0)) 199 #define stkold(s,p) stkset(s,(p)->base,(p)->offset) 200 201 #define stkframe(s) (*((Stk_frame_t**)stktop(s)-1)) 202 #define stkdata(s,t) ((t*)stkframe(s)->data) 203 #define stkpop(s) stkold(s,&(stkframe(s)->pos)) 204 205 static void* 206 stkpush(Stk_t* sp, size_t size) 207 { 208 Stk_frame_t* f; 209 Stk_pos_t p; 210 211 stknew(sp, &p); 212 size = sizeof(Stk_frame_t) + sizeof(size_t) + size - 1; 213 if (!(f = (Stk_frame_t*)stkalloc(sp, sizeof(Stk_frame_t) + sizeof(Stk_frame_t*) + size - 1))) 214 return 0; 215 f->pos = p; 216 stkframe(sp) = f; 217 return f->data; 218 } 219 220 /* ===== end library support ===== */ 221 222 /* 223 * Match_frame_t is for saving and restoring match records 224 * around alternate attempts, so that fossils will not be 225 * left in the match array. These are the only entries in 226 * the match array that are not otherwise guaranteed to 227 * have current data in them when they get used. 228 */ 229 230 typedef struct 231 { 232 size_t size; 233 regmatch_t* match; 234 regmatch_t save[1]; 235 } Match_frame_t; 236 237 #define matchframe stkdata(stkstd,Match_frame_t) 238 #define matchpush(e,x) ((x)->re.group.number?_matchpush(e,x):0) 239 #define matchcopy(e,x) ((x)->re.group.number?memcpy(matchframe->match,matchframe->save,matchframe->size):(void*)0) 240 #define matchpop(e,x) ((x)->re.group.number?memcpy(matchframe->match,matchframe->save,matchframe->size),stkpop(stkstd):(void*)0) 241 242 #define pospop(e) (--(e)->pos->cur) 243 244 /* 245 * allocate a frame and push a match onto the stack 246 */ 247 248 static int 249 _matchpush(Env_t* env, Rex_t* rex) 250 { 251 Match_frame_t* f; 252 regmatch_t* m; 253 regmatch_t* e; 254 regmatch_t* s; 255 int num; 256 257 if (rex->re.group.number <= 0 || (num = rex->re.group.last - rex->re.group.number + 1) <= 0) 258 num = 0; 259 if (!(f = (Match_frame_t*)stkpush(stkstd, sizeof(Match_frame_t) + (num - 1) * sizeof(regmatch_t)))) 260 { 261 env->error = REG_ESPACE; 262 return 1; 263 } 264 f->size = num * sizeof(regmatch_t); 265 f->match = m = env->match + rex->re.group.number; 266 e = m + num; 267 s = f->save; 268 while (m < e) 269 { 270 *s++ = *m; 271 *m++ = state.nomatch; 272 } 273 return 0; 274 } 275 276 /* 277 * allocate a frame and push a pos onto the stack 278 */ 279 280 static int 281 pospush(Env_t* env, Rex_t* rex, unsigned char* p, int be) 282 { 283 Pos_t* pos; 284 285 if (!(pos = vector(Pos_t, env->pos, env->pos->cur))) 286 { 287 env->error = REG_ESPACE; 288 return 1; 289 } 290 pos->serial = rex->serial; 291 pos->p = p; 292 pos->be = be; 293 env->pos->cur++; 294 return 0; 295 } 296 297 /* 298 * two matches are known to have the same length 299 * os is start of old pos array, ns is start of new, 300 * oend and nend are end+1 pointers to ends of arrays. 301 * oe and ne are ends (not end+1) of subarrays. 302 * returns 1 if new is better, -1 if old, else 0. 303 */ 304 305 static int 306 better(Env_t* env, Pos_t* os, Pos_t* ns, Pos_t* oend, Pos_t* nend, int level) 307 { 308 Pos_t* oe; 309 Pos_t* ne; 310 int k; 311 int n; 312 313 if (env->error) 314 return -1; 315 for (;;) 316 { 317 DEBUG_CODE(0x0080,{sfprintf(sfstdout, " %-*.*sold ", (level + 3) * 4, (level + 3) * 4, "");for (oe = os; oe < oend; oe++)sfprintf(sfstdout, "<%d,%d,%d>", oe->p - env->beg, oe->serial, oe->be);sfprintf(sfstdout, "\n %-*.*snew ", (level + 3) * 4, (level + 3) * 4, "");for (oe = ns; oe < nend; oe++)sfprintf(sfstdout, "<%d,%d,%d>", oe->p - env->beg, oe->serial, oe->be);sfprintf(sfstdout, "\n");},{0;}); 318 if (ns >= nend) 319 return DEBUG_TEST(0x8000,(os < oend),(0)); 320 if (os >= oend) 321 return DEBUG_TEST(0x8000,(-1),(1)); 322 n = os->serial; 323 if (ns->serial > n) 324 return -1; 325 if (n > ns->serial) 326 { 327 env->error = REG_PANIC; 328 return -1; 329 } 330 if (ns->p > os->p) 331 return 1; 332 if (os->p > ns->p) 333 return -1; 334 oe = os; 335 k = 0; 336 for (;;) 337 if ((++oe)->serial == n) 338 { 339 if (oe->be != END_ANY) 340 k++; 341 else if (k-- <= 0) 342 break; 343 } 344 ne = ns; 345 k = 0; 346 for (;;) 347 if ((++ne)->serial == n) 348 { 349 if (ne->be != END_ANY) 350 k++; 351 else if (k-- <= 0) 352 break; 353 } 354 if (ne->p > oe->p) 355 return 1; 356 if (oe->p > ne->p) 357 return -1; 358 if (k = better(env, os + 1, ns + 1, oe, ne, level + 1)) 359 return k; 360 os = oe + 1; 361 ns = ne + 1; 362 } 363 } 364 365 #if _AST_REGEX_DEBUG 366 367 static void 368 showmatch(regmatch_t* p) 369 { 370 sfputc(sfstdout, '('); 371 if (p->rm_so < 0) 372 sfputc(sfstdout, '?'); 373 else 374 sfprintf(sfstdout, "%d", p->rm_so); 375 sfputc(sfstdout, ','); 376 if (p->rm_eo < 0) 377 sfputc(sfstdout, '?'); 378 else 379 sfprintf(sfstdout, "%d", p->rm_eo); 380 sfputc(sfstdout, ')'); 381 } 382 383 static int 384 _better(Env_t* env, Pos_t* os, Pos_t* ns, Pos_t* oend, Pos_t* nend, int level) 385 { 386 int i; 387 388 DEBUG_CODE(0x0040,{sfprintf(sfstdout, "AHA better old ");for (i = 0; i <= env->nsub; i++)showmatch(&env->best[i]);sfprintf(sfstdout, "\n new ");for (i = 0; i <= env->nsub; i++)showmatch(&env->match[i]);sfprintf(sfstdout, "\n");},{0;}); 389 i = better(env, os, ns, oend, nend, 0); 390 DEBUG_TEST(0x0040,(sfprintf(sfstdout, " %s\n", i <= 0 ? "OLD" : "NEW")),(0)); 391 return i; 392 } 393 394 #define better _better 395 396 #endif 397 398 #define follow(e,r,c,s) ((r)->next?parse(e,(r)->next,c,s):(c)?parse(e,c,0,s):BEST) 399 400 static int parse(Env_t*, Rex_t*, Rex_t*, unsigned char*); 401 402 static int 403 parserep(Env_t* env, Rex_t* rex, Rex_t* cont, unsigned char* s, int n) 404 { 405 int i; 406 int r = NONE; 407 Rex_t catcher; 408 409 DEBUG_TEST(0x0010,(sfprintf(sfstdout, "AHA#%04d 0x%04x parserep %s %d %d %d `%-.*s'\n", __LINE__, debug_flag, rexname(rex->re.group.expr.rex), rex->lo, n, rex->hi, env->end - s, s)),(0)); 410 if ((rex->flags & REG_MINIMAL) && n >= rex->lo && n < rex->hi) 411 { 412 if (env->stack && pospush(env, rex, s, END_ANY)) 413 return BAD; 414 i = follow(env, rex, cont, s); 415 if (env->stack) 416 pospop(env); 417 switch (i) 418 { 419 case BAD: 420 return BAD; 421 case CUT: 422 return CUT; 423 case BEST: 424 case GOOD: 425 return BEST; 426 } 427 } 428 if (n < rex->hi) 429 { 430 catcher.type = REX_REP_CATCH; 431 catcher.serial = rex->serial; 432 catcher.re.rep_catch.ref = rex; 433 catcher.re.rep_catch.cont = cont; 434 catcher.re.rep_catch.beg = s; 435 catcher.re.rep_catch.n = n + 1; 436 catcher.next = rex->next; 437 if (n == 0) 438 rex->re.rep_catch.beg = s; 439 if (env->stack) 440 { 441 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x PUSH (%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rex->re.group.number, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 442 if (matchpush(env, rex)) 443 return BAD; 444 if (pospush(env, rex, s, BEG_ONE)) 445 return BAD; 446 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x PUSH (%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rex->re.group.number, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 447 } 448 r = parse(env, rex->re.group.expr.rex, &catcher, s); 449 DEBUG_TEST(0x0010,(sfprintf(sfstdout, "AHA#%04d 0x%04x parserep parse %d `%-.*s'\n", __LINE__, debug_flag, r, env->end - s, s)),(0)); 450 if (env->stack) 451 { 452 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x POP (%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rex->re.group.number, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 453 pospop(env); 454 matchpop(env, rex); 455 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x POP (%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rex->re.group.number, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 456 } 457 switch (r) 458 { 459 case BAD: 460 return BAD; 461 case BEST: 462 return BEST; 463 case CUT: 464 r = NONE; 465 break; 466 case GOOD: 467 if (rex->flags & REG_MINIMAL) 468 return BEST; 469 r = GOOD; 470 break; 471 } 472 } 473 if (n < rex->lo) 474 return r; 475 if (!(rex->flags & REG_MINIMAL) || n >= rex->hi) 476 { 477 if (env->stack && pospush(env, rex, s, END_ANY)) 478 return BAD; 479 i = follow(env, rex, cont, s); 480 if (env->stack) 481 pospop(env); 482 switch (i) 483 { 484 case BAD: 485 r = BAD; 486 break; 487 case CUT: 488 r = CUT; 489 break; 490 case BEST: 491 r = BEST; 492 break; 493 case GOOD: 494 r = (rex->flags & REG_MINIMAL) ? BEST : GOOD; 495 break; 496 } 497 } 498 return r; 499 } 500 501 static int 502 parsetrie(Env_t* env, Trie_node_t* x, Rex_t* rex, Rex_t* cont, unsigned char* s) 503 { 504 unsigned char* p; 505 int r; 506 507 if (p = rex->map) 508 { 509 for (;;) 510 { 511 if (s >= env->end) 512 return NONE; 513 while (x->c != p[*s]) 514 if (!(x = x->sib)) 515 return NONE; 516 if (x->end) 517 break; 518 x = x->son; 519 s++; 520 } 521 } 522 else 523 { 524 for (;;) 525 { 526 if (s >= env->end) 527 return NONE; 528 while (x->c != *s) 529 if (!(x = x->sib)) 530 return NONE; 531 if (x->end) 532 break; 533 x = x->son; 534 s++; 535 } 536 } 537 s++; 538 if (rex->flags & REG_MINIMAL) 539 switch (follow(env, rex, cont, s)) 540 { 541 case BAD: 542 return BAD; 543 case CUT: 544 return CUT; 545 case BEST: 546 case GOOD: 547 return BEST; 548 } 549 if (x->son) 550 switch (parsetrie(env, x->son, rex, cont, s)) 551 { 552 case BAD: 553 return BAD; 554 case CUT: 555 return CUT; 556 case BEST: 557 return BEST; 558 case GOOD: 559 if (rex->flags & REG_MINIMAL) 560 return BEST; 561 r = GOOD; 562 break; 563 default: 564 r = NONE; 565 break; 566 } 567 else 568 r = NONE; 569 if (!(rex->flags & REG_MINIMAL)) 570 switch (follow(env, rex, cont, s)) 571 { 572 case BAD: 573 return BAD; 574 case CUT: 575 return CUT; 576 case BEST: 577 return BEST; 578 case GOOD: 579 return GOOD; 580 } 581 return r; 582 } 583 584 static int 585 collelt(register Celt_t* ce, char* key, int c, int x) 586 { 587 Ckey_t elt; 588 589 mbxfrm(elt, key, COLL_KEY_MAX); 590 for (;; ce++) 591 { 592 switch (ce->typ) 593 { 594 case COLL_call: 595 if (!x && (*ce->fun)(c)) 596 return 1; 597 continue; 598 case COLL_char: 599 if (!strcmp((char*)ce->beg, (char*)elt)) 600 return 1; 601 continue; 602 case COLL_range: 603 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max) 604 return 1; 605 continue; 606 case COLL_range_lc: 607 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max && (iswlower(c) || !iswupper(c))) 608 return 1; 609 continue; 610 case COLL_range_uc: 611 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max && (iswupper(c) || !iswlower(c))) 612 return 1; 613 continue; 614 } 615 break; 616 } 617 return 0; 618 } 619 620 static int 621 collic(register Celt_t* ce, char* key, register char* nxt, int c, int x) 622 { 623 if (!x) 624 { 625 if (collelt(ce, key, c, x)) 626 return 1; 627 if (iswlower(c)) 628 c = towupper(c); 629 else if (iswupper(c)) 630 c = towlower(c); 631 else 632 return 0; 633 x = wctomb(key, c); 634 key[x] = 0; 635 return collelt(ce, key, c, 0); 636 } 637 while (*nxt) 638 { 639 if (collic(ce, key, nxt + 1, c, x)) 640 return 1; 641 if (islower(*nxt)) 642 *nxt = toupper(*nxt); 643 else if (isupper(*nxt)) 644 *nxt = tolower(*nxt); 645 else 646 return 0; 647 nxt++; 648 } 649 return collelt(ce, key, c, x); 650 } 651 652 static int 653 collmatch(Rex_t* rex, unsigned char* s, unsigned char* e, unsigned char** p) 654 { 655 unsigned char* t; 656 wchar_t c; 657 int w; 658 int r; 659 int x; 660 int ic; 661 Ckey_t key; 662 Ckey_t elt; 663 664 ic = (rex->flags & REG_ICASE); 665 if ((w = MBSIZE(s)) > 1) 666 { 667 memcpy((char*)key, (char*)s, w); 668 key[w] = 0; 669 t = s; 670 c = mbchar(t); 671 #if !_lib_wctype 672 c &= 0xff; 673 #endif 674 x = 0; 675 } 676 else 677 { 678 c = s[0]; 679 if (ic && isupper(c)) 680 c = tolower(c); 681 key[0] = c; 682 key[1] = 0; 683 if (isalpha(c)) 684 { 685 x = e - s; 686 if (x > COLL_KEY_MAX) 687 x = COLL_KEY_MAX; 688 while (w < x) 689 { 690 c = s[w]; 691 if (!isalpha(c)) 692 break; 693 r = mbxfrm(elt, key, COLL_KEY_MAX); 694 if (ic && isupper(c)) 695 c = tolower(c); 696 key[w] = c; 697 key[w + 1] = 0; 698 if (mbxfrm(elt, key, COLL_KEY_MAX) != r) 699 break; 700 w++; 701 } 702 } 703 key[w] = 0; 704 c = key[0]; 705 x = w - 1; 706 } 707 r = 1; 708 for (;;) 709 { 710 if (ic ? collic(rex->re.collate.elements, (char*)key, (char*)key, c, x) : collelt(rex->re.collate.elements, (char*)key, c, x)) 711 break; 712 if (!x) 713 { 714 r = 0; 715 break; 716 } 717 w = x--; 718 key[w] = 0; 719 } 720 *p = s + w; 721 return rex->re.collate.invert ? !r : r; 722 } 723 724 static unsigned char* 725 nestmatch(register unsigned char* s, register unsigned char* e, const unsigned short* type, register int co) 726 { 727 register int c; 728 register int cc; 729 unsigned int n; 730 int oc; 731 732 if (type[co] & (REX_NEST_literal|REX_NEST_quote)) 733 { 734 n = (type[co] & REX_NEST_literal) ? REX_NEST_terminator : (REX_NEST_escape|REX_NEST_terminator); 735 while (s < e) 736 { 737 c = *s++; 738 if (c == co) 739 return s; 740 else if (type[c] & n) 741 { 742 if (s >= e || (type[c] & REX_NEST_terminator)) 743 break; 744 s++; 745 } 746 } 747 } 748 else 749 { 750 cc = type[co] >> REX_NEST_SHIFT; 751 oc = type[co] & (REX_NEST_open|REX_NEST_close); 752 n = 1; 753 while (s < e) 754 { 755 c = *s++; 756 switch (type[c] & (REX_NEST_escape|REX_NEST_open|REX_NEST_close|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)) 757 { 758 case REX_NEST_delimiter: 759 case REX_NEST_terminator: 760 return oc ? 0 : s; 761 case REX_NEST_separator: 762 if (!oc) 763 return s; 764 break; 765 case REX_NEST_escape: 766 if (s >= e) 767 return 0; 768 s++; 769 break; 770 case REX_NEST_open|REX_NEST_close: 771 if (c == cc) 772 { 773 if (!--n) 774 return s; 775 } 776 /*FALLTHROUGH*/ 777 case REX_NEST_open: 778 if (c == co) 779 { 780 if (!++n) 781 return 0; 782 } 783 else if (!(s = nestmatch(s, e, type, c))) 784 return 0; 785 break; 786 case REX_NEST_close: 787 if (c != cc) 788 return 0; 789 if (!--n) 790 return s; 791 break; 792 } 793 } 794 return (oc || !(type[UCHAR_MAX+1] & REX_NEST_terminator)) ? 0 : s; 795 } 796 return 0; 797 } 798 799 static int 800 parse(Env_t* env, Rex_t* rex, Rex_t* cont, unsigned char* s) 801 { 802 int c; 803 int d; 804 int i; 805 int m; 806 int n; 807 int r; 808 int* f; 809 unsigned char* p; 810 unsigned char* t; 811 unsigned char* b; 812 unsigned char* e; 813 char* u; 814 regmatch_t* o; 815 Trie_node_t* x; 816 Rex_t* q; 817 Rex_t catcher; 818 Rex_t next; 819 820 for (;;) 821 { 822 DEBUG_TEST(0x0008,(sfprintf(sfstdout, "AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), env->end - s, s)),(0)); 823 switch (rex->type) 824 { 825 case REX_ALT: 826 if (env->stack) 827 { 828 if (matchpush(env, rex)) 829 return BAD; 830 if (pospush(env, rex, s, BEG_ALT)) 831 return BAD; 832 catcher.type = REX_ALT_CATCH; 833 catcher.serial = rex->serial; 834 catcher.re.alt_catch.cont = cont; 835 catcher.next = rex->next; 836 r = parse(env, rex->re.group.expr.binary.left, &catcher, s); 837 if (r < BEST || (rex->flags & REG_MINIMAL)) 838 { 839 matchcopy(env, rex); 840 ((Pos_t*)env->pos->vec + env->pos->cur - 1)->serial = catcher.serial = rex->re.group.expr.binary.serial; 841 n = parse(env, rex->re.group.expr.binary.right, &catcher, s); 842 if (n != NONE) 843 r = n; 844 } 845 pospop(env); 846 matchpop(env, rex); 847 } 848 else 849 { 850 if ((r = parse(env, rex->re.group.expr.binary.left, cont, s)) == NONE) 851 r = parse(env, rex->re.group.expr.binary.right, cont, s); 852 if (r == GOOD) 853 r = BEST; 854 } 855 return r; 856 case REX_ALT_CATCH: 857 if (pospush(env, rex, s, END_ANY)) 858 return BAD; 859 r = follow(env, rex, rex->re.alt_catch.cont, s); 860 pospop(env); 861 return r; 862 case REX_BACK: 863 o = &env->match[rex->lo]; 864 if (o->rm_so < 0) 865 return NONE; 866 i = o->rm_eo - o->rm_so; 867 e = s + i; 868 if (e > env->end) 869 return NONE; 870 t = env->beg + o->rm_so; 871 if (!(p = rex->map)) 872 { 873 while (s < e) 874 if (*s++ != *t++) 875 return NONE; 876 } 877 else if (!mbwide()) 878 { 879 while (s < e) 880 if (p[*s++] != p[*t++]) 881 return NONE; 882 } 883 else 884 { 885 while (s < e) 886 { 887 c = mbchar(s); 888 d = mbchar(t); 889 if (towupper(c) != towupper(d)) 890 return NONE; 891 } 892 } 893 break; 894 case REX_BEG: 895 if ((!(rex->flags & REG_NEWLINE) || s <= env->beg || *(s - 1) != '\n') && ((env->flags & REG_NOTBOL) || s != env->beg)) 896 return NONE; 897 break; 898 case REX_CLASS: 899 if (LEADING(env, rex, s)) 900 return NONE; 901 n = rex->hi; 902 if (n > env->end - s) 903 n = env->end - s; 904 m = rex->lo; 905 if (m > n) 906 return NONE; 907 r = NONE; 908 if (!(rex->flags & REG_MINIMAL)) 909 { 910 for (i = 0; i < n; i++) 911 if (!settst(rex->re.charclass, s[i])) 912 { 913 n = i; 914 break; 915 } 916 for (s += n; n-- >= m; s--) 917 switch (follow(env, rex, cont, s)) 918 { 919 case BAD: 920 return BAD; 921 case CUT: 922 return CUT; 923 case BEST: 924 return BEST; 925 case GOOD: 926 r = GOOD; 927 break; 928 } 929 } 930 else 931 { 932 for (e = s + m; s < e; s++) 933 if (!settst(rex->re.charclass, *s)) 934 return r; 935 e += n - m; 936 for (;;) 937 { 938 switch (follow(env, rex, cont, s)) 939 { 940 case BAD: 941 return BAD; 942 case CUT: 943 return CUT; 944 case BEST: 945 case GOOD: 946 return BEST; 947 } 948 if (s >= e || !settst(rex->re.charclass, *s)) 949 break; 950 s++; 951 } 952 } 953 return r; 954 case REX_COLL_CLASS: 955 if (LEADING(env, rex, s)) 956 return NONE; 957 n = rex->hi; 958 if (n > env->end - s) 959 n = env->end - s; 960 m = rex->lo; 961 if (m > n) 962 return NONE; 963 r = NONE; 964 e = env->end; 965 if (!(rex->flags & REG_MINIMAL)) 966 { 967 if (!(b = (unsigned char*)stkpush(stkstd, n))) 968 { 969 env->error = REG_ESPACE; 970 return BAD; 971 } 972 for (i = 0; s < e && i < n && collmatch(rex, s, e, &t); i++) 973 { 974 b[i] = t - s; 975 s = t; 976 } 977 for (; i-- >= rex->lo; s -= b[i]) 978 switch (follow(env, rex, cont, s)) 979 { 980 case BAD: 981 stkpop(stkstd); 982 return BAD; 983 case CUT: 984 stkpop(stkstd); 985 return CUT; 986 case BEST: 987 stkpop(stkstd); 988 return BEST; 989 case GOOD: 990 r = GOOD; 991 break; 992 } 993 stkpop(stkstd); 994 } 995 else 996 { 997 for (i = 0; i < m && s < e; i++, s = t) 998 if (!collmatch(rex, s, e, &t)) 999 return r; 1000 while (i++ <= n) 1001 { 1002 switch (follow(env, rex, cont, s)) 1003 { 1004 case BAD: 1005 return BAD; 1006 case CUT: 1007 return CUT; 1008 case BEST: 1009 case GOOD: 1010 return BEST; 1011 } 1012 if (s >= e || !collmatch(rex, s, e, &s)) 1013 break; 1014 } 1015 } 1016 return r; 1017 case REX_CONJ: 1018 next.type = REX_CONJ_RIGHT; 1019 next.re.conj_right.cont = cont; 1020 next.next = rex->next; 1021 catcher.type = REX_CONJ_LEFT; 1022 catcher.re.conj_left.right = rex->re.group.expr.binary.right; 1023 catcher.re.conj_left.cont = &next; 1024 catcher.re.conj_left.beg = s; 1025 catcher.next = 0; 1026 return parse(env, rex->re.group.expr.binary.left, &catcher, s); 1027 case REX_CONJ_LEFT: 1028 rex->re.conj_left.cont->re.conj_right.end = s; 1029 cont = rex->re.conj_left.cont; 1030 s = rex->re.conj_left.beg; 1031 rex = rex->re.conj_left.right; 1032 continue; 1033 case REX_CONJ_RIGHT: 1034 if (rex->re.conj_right.end != s) 1035 return NONE; 1036 cont = rex->re.conj_right.cont; 1037 break; 1038 case REX_DONE: 1039 if (!env->stack) 1040 return BEST; 1041 n = s - env->beg; 1042 r = env->nsub; 1043 DEBUG_TEST(0x0100,(sfprintf(sfstdout,"AHA#%04d 0x%04x %s (%d,%d)(%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rexname(rex), env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->best[3].rm_so, env->best[3].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 1044 if ((i = env->best[0].rm_eo) >= 0) 1045 { 1046 if (rex->flags & REG_MINIMAL) 1047 { 1048 if (n > i) 1049 return GOOD; 1050 } 1051 else 1052 { 1053 if (n < i) 1054 return GOOD; 1055 } 1056 if (n == i && better(env, 1057 (Pos_t*)env->bestpos->vec, 1058 (Pos_t*)env->pos->vec, 1059 (Pos_t*)env->bestpos->vec+env->bestpos->cur, 1060 (Pos_t*)env->pos->vec+env->pos->cur, 1061 0) <= 0) 1062 return GOOD; 1063 } 1064 env->best[0].rm_eo = n; 1065 memcpy(&env->best[1], &env->match[1], r * sizeof(regmatch_t)); 1066 n = env->pos->cur; 1067 if (!vector(Pos_t, env->bestpos, n)) 1068 { 1069 env->error = REG_ESPACE; 1070 return BAD; 1071 } 1072 env->bestpos->cur = n; 1073 memcpy(env->bestpos->vec, env->pos->vec, n * sizeof(Pos_t)); 1074 DEBUG_TEST(0x0100,(sfprintf(sfstdout,"AHA#%04d 0x%04x %s (%d,%d)(%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__, debug_flag, rexname(rex), env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->best[3].rm_so, env->best[3].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 1075 return GOOD; 1076 case REX_DOT: 1077 if (LEADING(env, rex, s)) 1078 return NONE; 1079 n = rex->hi; 1080 if (n > env->end - s) 1081 n = env->end - s; 1082 m = rex->lo; 1083 if (m > n) 1084 return NONE; 1085 if ((c = rex->explicit) >= 0 && !mbwide()) 1086 for (i = 0; i < n; i++) 1087 if (s[i] == c) 1088 { 1089 n = i; 1090 break; 1091 } 1092 r = NONE; 1093 if (!(rex->flags & REG_MINIMAL)) 1094 { 1095 if (!mbwide()) 1096 { 1097 for (s += n; n-- >= m; s--) 1098 switch (follow(env, rex, cont, s)) 1099 { 1100 case BAD: 1101 return BAD; 1102 case CUT: 1103 return CUT; 1104 case BEST: 1105 return BEST; 1106 case GOOD: 1107 r = GOOD; 1108 break; 1109 } 1110 } 1111 else 1112 { 1113 if (!(b = (unsigned char*)stkpush(stkstd, n))) 1114 { 1115 env->error = REG_ESPACE; 1116 return BAD; 1117 } 1118 e = env->end; 1119 for (i = 0; s < e && i < n && *s != c; i++) 1120 s += b[i] = MBSIZE(s); 1121 for (; i-- >= m; s -= b[i]) 1122 switch (follow(env, rex, cont, s)) 1123 { 1124 case BAD: 1125 stkpop(stkstd); 1126 return BAD; 1127 case CUT: 1128 stkpop(stkstd); 1129 return CUT; 1130 case BEST: 1131 stkpop(stkstd); 1132 return BEST; 1133 case GOOD: 1134 r = GOOD; 1135 break; 1136 } 1137 stkpop(stkstd); 1138 } 1139 } 1140 else 1141 { 1142 if (!mbwide()) 1143 { 1144 e = s + n; 1145 for (s += m; s <= e; s++) 1146 switch (follow(env, rex, cont, s)) 1147 { 1148 case BAD: 1149 return BAD; 1150 case CUT: 1151 return CUT; 1152 case BEST: 1153 case GOOD: 1154 return BEST; 1155 } 1156 } 1157 else 1158 { 1159 e = env->end; 1160 for (i = 0; s < e && i < m && *s != c; i++) 1161 s += MBSIZE(s); 1162 if (i >= m) 1163 for (; s <= e && i <= n; s += MBSIZE(s), i++) 1164 switch (follow(env, rex, cont, s)) 1165 { 1166 case BAD: 1167 return BAD; 1168 case CUT: 1169 return CUT; 1170 case BEST: 1171 case GOOD: 1172 return BEST; 1173 } 1174 } 1175 } 1176 return r; 1177 case REX_END: 1178 if ((!(rex->flags & REG_NEWLINE) || *s != '\n') && ((env->flags & REG_NOTEOL) || s < env->end)) 1179 return NONE; 1180 break; 1181 case REX_GROUP: 1182 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), env->end - s, s)),(0)); 1183 if (env->stack) 1184 { 1185 if (rex->re.group.number) 1186 env->match[rex->re.group.number].rm_so = s - env->beg; 1187 if (pospush(env, rex, s, BEG_SUB)) 1188 return BAD; 1189 catcher.re.group_catch.eo = rex->re.group.number ? &env->match[rex->re.group.number].rm_eo : (regoff_t*)0; 1190 } 1191 catcher.type = REX_GROUP_CATCH; 1192 catcher.serial = rex->serial; 1193 catcher.re.group_catch.cont = cont; 1194 catcher.next = rex->next; 1195 r = parse(env, rex->re.group.expr.rex, &catcher, s); 1196 if (env->stack) 1197 { 1198 pospop(env); 1199 if (rex->re.group.number) 1200 env->match[rex->re.group.number].rm_so = -1; 1201 } 1202 return r; 1203 case REX_GROUP_CATCH: 1204 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s=>%s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rexname(rex->re.group_catch.cont), env->end - s, s)),(0)); 1205 if (env->stack) 1206 { 1207 if (rex->re.group_catch.eo) 1208 *rex->re.group_catch.eo = s - env->beg; 1209 if (pospush(env, rex, s, END_ANY)) 1210 return BAD; 1211 } 1212 r = follow(env, rex, rex->re.group_catch.cont, s); 1213 if (env->stack) 1214 { 1215 pospop(env); 1216 if (rex->re.group_catch.eo) 1217 *rex->re.group_catch.eo = -1; 1218 } 1219 return r; 1220 case REX_GROUP_AHEAD: 1221 catcher.type = REX_GROUP_AHEAD_CATCH; 1222 catcher.flags = rex->flags; 1223 catcher.serial = rex->serial; 1224 catcher.re.rep_catch.beg = s; 1225 catcher.re.rep_catch.cont = cont; 1226 catcher.next = rex->next; 1227 return parse(env, rex->re.group.expr.rex, &catcher, s); 1228 case REX_GROUP_AHEAD_CATCH: 1229 return follow(env, rex, rex->re.rep_catch.cont, rex->re.rep_catch.beg); 1230 case REX_GROUP_AHEAD_NOT: 1231 r = parse(env, rex->re.group.expr.rex, NiL, s); 1232 if (r == NONE) 1233 r = follow(env, rex, cont, s); 1234 else if (r != BAD) 1235 r = NONE; 1236 return r; 1237 case REX_GROUP_BEHIND: 1238 if ((s - env->beg) < rex->re.group.size) 1239 return NONE; 1240 catcher.type = REX_GROUP_BEHIND_CATCH; 1241 catcher.flags = rex->flags; 1242 catcher.serial = rex->serial; 1243 catcher.re.behind_catch.beg = s; 1244 catcher.re.behind_catch.end = e = env->end; 1245 catcher.re.behind_catch.cont = cont; 1246 catcher.next = rex->next; 1247 for (t = s - rex->re.group.size; t >= env->beg; t--) 1248 { 1249 env->end = s; 1250 r = parse(env, rex->re.group.expr.rex, &catcher, t); 1251 env->end = e; 1252 if (r != NONE) 1253 return r; 1254 } 1255 return NONE; 1256 case REX_GROUP_BEHIND_CATCH: 1257 if (s != rex->re.behind_catch.beg) 1258 return NONE; 1259 env->end = rex->re.behind_catch.end; 1260 return follow(env, rex, rex->re.behind_catch.cont, rex->re.behind_catch.beg); 1261 case REX_GROUP_BEHIND_NOT: 1262 if ((s - env->beg) < rex->re.group.size) 1263 r = NONE; 1264 else 1265 { 1266 catcher.type = REX_GROUP_BEHIND_NOT_CATCH; 1267 catcher.re.neg_catch.beg = s; 1268 catcher.next = 0; 1269 e = env->end; 1270 env->end = s; 1271 for (t = s - rex->re.group.size; t >= env->beg; t--) 1272 { 1273 r = parse(env, rex->re.group.expr.rex, &catcher, t); 1274 if (r != NONE) 1275 break; 1276 } 1277 env->end = e; 1278 } 1279 if (r == NONE) 1280 r = follow(env, rex, cont, s); 1281 else if (r != BAD) 1282 r = NONE; 1283 return r; 1284 case REX_GROUP_BEHIND_NOT_CATCH: 1285 return s == rex->re.neg_catch.beg ? GOOD : NONE; 1286 case REX_GROUP_COND: 1287 if (q = rex->re.group.expr.binary.right) 1288 { 1289 catcher.re.cond_catch.next[0] = q->re.group.expr.binary.right; 1290 catcher.re.cond_catch.next[1] = q->re.group.expr.binary.left; 1291 } 1292 else 1293 catcher.re.cond_catch.next[0] = catcher.re.cond_catch.next[1] = 0; 1294 if (q = rex->re.group.expr.binary.left) 1295 { 1296 catcher.type = REX_GROUP_COND_CATCH; 1297 catcher.flags = rex->flags; 1298 catcher.serial = rex->serial; 1299 catcher.re.cond_catch.yes = 0; 1300 catcher.re.cond_catch.beg = s; 1301 catcher.re.cond_catch.cont = cont; 1302 catcher.next = rex->next; 1303 r = parse(env, q, &catcher, s); 1304 if (r == BAD || catcher.re.cond_catch.yes) 1305 return r; 1306 } 1307 else if (!rex->re.group.size || rex->re.group.size > 0 && env->match[rex->re.group.size].rm_so >= 0) 1308 r = GOOD; 1309 else 1310 r = NONE; 1311 if (q = catcher.re.cond_catch.next[r != NONE]) 1312 { 1313 catcher.type = REX_CAT; 1314 catcher.flags = q->flags; 1315 catcher.serial = q->serial; 1316 catcher.re.group_catch.cont = cont; 1317 catcher.next = rex->next; 1318 return parse(env, q, &catcher, s); 1319 } 1320 return follow(env, rex, cont, s); 1321 case REX_GROUP_COND_CATCH: 1322 rex->re.cond_catch.yes = 1; 1323 catcher.type = REX_CAT; 1324 catcher.flags = rex->flags; 1325 catcher.serial = rex->serial; 1326 catcher.re.group_catch.cont = rex->re.cond_catch.cont; 1327 catcher.next = rex->next; 1328 return parse(env, rex->re.cond_catch.next[1], &catcher, rex->re.cond_catch.beg); 1329 case REX_CAT: 1330 return follow(env, rex, rex->re.group_catch.cont, s); 1331 case REX_GROUP_CUT: 1332 catcher.type = REX_GROUP_CUT_CATCH; 1333 catcher.flags = rex->flags; 1334 catcher.serial = rex->serial; 1335 catcher.re.group_catch.cont = cont; 1336 catcher.next = rex->next; 1337 return parse(env, rex->re.group.expr.rex, &catcher, s); 1338 case REX_GROUP_CUT_CATCH: 1339 switch (r = follow(env, rex, rex->re.group_catch.cont, s)) 1340 { 1341 case GOOD: 1342 r = BEST; 1343 break; 1344 case NONE: 1345 r = CUT; 1346 break; 1347 } 1348 return r; 1349 case REX_KMP: 1350 f = rex->re.string.fail; 1351 b = rex->re.string.base; 1352 n = rex->re.string.size; 1353 t = s; 1354 e = env->end; 1355 if (p = rex->map) 1356 { 1357 while (t + n <= e) 1358 { 1359 for (i = -1; t < e; t++) 1360 { 1361 while (i >= 0 && b[i+1] != p[*t]) 1362 i = f[i]; 1363 if (b[i+1] == p[*t]) 1364 i++; 1365 if (i + 1 == n) 1366 { 1367 t++; 1368 if (env->stack) 1369 env->best[0].rm_so = t - s - n; 1370 switch (follow(env, rex, cont, t)) 1371 { 1372 case BAD: 1373 return BAD; 1374 case CUT: 1375 return CUT; 1376 case BEST: 1377 case GOOD: 1378 return BEST; 1379 } 1380 t -= n - 1; 1381 break; 1382 } 1383 } 1384 } 1385 } 1386 else 1387 { 1388 while (t + n <= e) 1389 { 1390 for (i = -1; t < e; t++) 1391 { 1392 while (i >= 0 && b[i+1] != *t) 1393 i = f[i]; 1394 if (b[i+1] == *t) 1395 i++; 1396 if (i + 1 == n) 1397 { 1398 t++; 1399 if (env->stack) 1400 env->best[0].rm_so = t - s - n; 1401 switch (follow(env, rex, cont, t)) 1402 { 1403 case BAD: 1404 return BAD; 1405 case CUT: 1406 return CUT; 1407 case BEST: 1408 case GOOD: 1409 return BEST; 1410 } 1411 t -= n - 1; 1412 break; 1413 } 1414 } 1415 } 1416 } 1417 return NONE; 1418 case REX_NEG: 1419 if (LEADING(env, rex, s)) 1420 return NONE; 1421 i = env->end - s; 1422 n = ((i + 7) >> 3) + 1; 1423 catcher.type = REX_NEG_CATCH; 1424 catcher.re.neg_catch.beg = s; 1425 if (!(p = (unsigned char*)stkpush(stkstd, n))) 1426 return BAD; 1427 memset(catcher.re.neg_catch.index = p, 0, n); 1428 catcher.next = rex->next; 1429 if (parse(env, rex->re.group.expr.rex, &catcher, s) == BAD) 1430 r = BAD; 1431 else 1432 { 1433 r = NONE; 1434 for (; i >= 0; i--) 1435 if (!bittst(p, i)) 1436 { 1437 switch (follow(env, rex, cont, s + i)) 1438 { 1439 case BAD: 1440 r = BAD; 1441 break; 1442 case BEST: 1443 r = BEST; 1444 break; 1445 case CUT: 1446 r = CUT; 1447 break; 1448 case GOOD: 1449 r = GOOD; 1450 /*FALLTHROUGH*/ 1451 default: 1452 continue; 1453 } 1454 break; 1455 } 1456 } 1457 stkpop(stkstd); 1458 return r; 1459 case REX_NEG_CATCH: 1460 bitset(rex->re.neg_catch.index, s - rex->re.neg_catch.beg); 1461 return NONE; 1462 case REX_NEST: 1463 if (s >= env->end) 1464 return NONE; 1465 do 1466 { 1467 if ((c = *s++) == rex->re.nest.primary) 1468 { 1469 if (s >= env->end || !(s = nestmatch(s, env->end, rex->re.nest.type, c))) 1470 return NONE; 1471 break; 1472 } 1473 if (rex->re.nest.primary >= 0) 1474 return NONE; 1475 if (rex->re.nest.type[c] & (REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)) 1476 break; 1477 if (!(s = nestmatch(s, env->end, rex->re.nest.type, c))) 1478 return NONE; 1479 } while (s < env->end && !(rex->re.nest.type[*(s-1)] & (REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))); 1480 break; 1481 case REX_NULL: 1482 break; 1483 case REX_ONECHAR: 1484 n = rex->hi; 1485 if (n > env->end - s) 1486 n = env->end - s; 1487 m = rex->lo; 1488 if (m > n) 1489 return NONE; 1490 r = NONE; 1491 c = rex->re.onechar; 1492 if (!(rex->flags & REG_MINIMAL)) 1493 { 1494 if (!mbwide()) 1495 { 1496 if (p = rex->map) 1497 { 1498 for (i = 0; i < n; i++, s++) 1499 if (p[*s] != c) 1500 break; 1501 } 1502 else 1503 { 1504 for (i = 0; i < n; i++, s++) 1505 if (*s != c) 1506 break; 1507 } 1508 for (; i-- >= m; s--) 1509 switch (follow(env, rex, cont, s)) 1510 { 1511 case BAD: 1512 return BAD; 1513 case BEST: 1514 return BEST; 1515 case CUT: 1516 return CUT; 1517 case GOOD: 1518 r = GOOD; 1519 break; 1520 } 1521 } 1522 else 1523 { 1524 if (!(b = (unsigned char*)stkpush(stkstd, n))) 1525 { 1526 env->error = REG_ESPACE; 1527 return BAD; 1528 } 1529 e = env->end; 1530 if (!(rex->flags & REG_ICASE)) 1531 { 1532 for (i = 0; s < e && i < n; i++, s = t) 1533 { 1534 t = s; 1535 if (mbchar(t) != c) 1536 break; 1537 b[i] = t - s; 1538 } 1539 } 1540 else 1541 { 1542 for (i = 0; s < e && i < n; i++, s = t) 1543 { 1544 t = s; 1545 if (towupper(mbchar(t)) != c) 1546 break; 1547 b[i] = t - s; 1548 } 1549 } 1550 for (; i-- >= m; s -= b[i]) 1551 switch (follow(env, rex, cont, s)) 1552 { 1553 case BAD: 1554 stkpop(stkstd); 1555 return BAD; 1556 case BEST: 1557 stkpop(stkstd); 1558 return BEST; 1559 case CUT: 1560 stkpop(stkstd); 1561 return CUT; 1562 case GOOD: 1563 r = GOOD; 1564 break; 1565 } 1566 stkpop(stkstd); 1567 } 1568 } 1569 else 1570 { 1571 if (!mbwide()) 1572 { 1573 e = s + m; 1574 if (p = rex->map) 1575 { 1576 for (; s < e; s++) 1577 if (p[*s] != c) 1578 return r; 1579 e += n - m; 1580 for (;;) 1581 { 1582 switch (follow(env, rex, cont, s)) 1583 { 1584 case BAD: 1585 return BAD; 1586 case CUT: 1587 return CUT; 1588 case BEST: 1589 case GOOD: 1590 return BEST; 1591 } 1592 if (s >= e || p[*s++] != c) 1593 break; 1594 } 1595 } 1596 else 1597 { 1598 for (; s < e; s++) 1599 if (*s != c) 1600 return r; 1601 e += n - m; 1602 for (;;) 1603 { 1604 switch (follow(env, rex, cont, s)) 1605 { 1606 case BAD: 1607 return BAD; 1608 case CUT: 1609 return CUT; 1610 case BEST: 1611 case GOOD: 1612 return BEST; 1613 } 1614 if (s >= e || *s++ != c) 1615 break; 1616 } 1617 } 1618 } 1619 else 1620 { 1621 if (!(rex->flags & REG_ICASE)) 1622 { 1623 for (i = 0; i < m && s < e; i++, s = t) 1624 { 1625 t = s; 1626 if (mbchar(t) != c) 1627 return r; 1628 } 1629 while (i++ <= n) 1630 { 1631 switch (follow(env, rex, cont, s)) 1632 { 1633 case BAD: 1634 return BAD; 1635 case CUT: 1636 return CUT; 1637 case BEST: 1638 case GOOD: 1639 return BEST; 1640 } 1641 if (s >= e || mbchar(s) != c) 1642 break; 1643 } 1644 } 1645 else 1646 { 1647 for (i = 0; i < m && s < e; i++, s = t) 1648 { 1649 t = s; 1650 if (towupper(mbchar(t)) != c) 1651 return r; 1652 } 1653 while (i++ <= n) 1654 { 1655 switch (follow(env, rex, cont, s)) 1656 { 1657 case BAD: 1658 return BAD; 1659 case CUT: 1660 return CUT; 1661 case BEST: 1662 case GOOD: 1663 return BEST; 1664 } 1665 if (s >= e || towupper(mbchar(s)) != c) 1666 break; 1667 } 1668 } 1669 } 1670 } 1671 return r; 1672 case REX_REP: 1673 if (env->stack && pospush(env, rex, s, BEG_REP)) 1674 return BAD; 1675 r = parserep(env, rex, cont, s, 0); 1676 if (env->stack) 1677 pospop(env); 1678 return r; 1679 case REX_REP_CATCH: 1680 DEBUG_TEST(0x0020,(sfprintf(sfstdout, "AHA#%04d 0x%04x %s n %d len %d s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rex->re.rep_catch.n, s - rex->re.rep_catch.beg, env->end - s, s)),(0)); 1681 if (env->stack && pospush(env, rex, s, END_ANY)) 1682 return BAD; 1683 if (s == rex->re.rep_catch.beg && rex->re.rep_catch.n > rex->re.rep_catch.ref->lo) 1684 { 1685 /* 1686 * optional empty iteration 1687 */ 1688 1689 DEBUG_TEST(0x0002,(sfprintf(sfstdout, "AHA#%04d %p re.group.back=%d re.group.expr.rex=%s\n", __LINE__, rex->re.rep_catch.ref->re.group.expr.rex, rex->re.rep_catch.ref->re.group.expr.rex->re.group.back, rexname(rex->re.rep_catch.ref->re.group.expr.rex))),(0)); 1690 if (!env->stack || s != rex->re.rep_catch.ref->re.rep_catch.beg && !rex->re.rep_catch.ref->re.group.expr.rex->re.group.back) 1691 r = NONE; 1692 else if (pospush(env, rex, s, END_ANY)) 1693 r = BAD; 1694 else 1695 { 1696 r = follow(env, rex, rex->re.rep_catch.cont, s); 1697 pospop(env); 1698 } 1699 } 1700 else 1701 r = parserep(env, rex->re.rep_catch.ref, rex->re.rep_catch.cont, s, rex->re.rep_catch.n); 1702 if (env->stack) 1703 pospop(env); 1704 return r; 1705 case REX_STRING: 1706 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s \"%-.*s\" `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rex->re.string.size, rex->re.string.base, env->end - s, s)),(0)); 1707 if (rex->re.string.size > (env->end - s)) 1708 return NONE; 1709 t = rex->re.string.base; 1710 e = t + rex->re.string.size; 1711 if (!(p = rex->map)) 1712 { 1713 while (t < e) 1714 if (*s++ != *t++) 1715 return NONE; 1716 } 1717 else if (!mbwide()) 1718 { 1719 while (t < e) 1720 if (p[*s++] != *t++) 1721 return NONE; 1722 } 1723 else 1724 { 1725 while (t < e) 1726 { 1727 c = mbchar(s); 1728 d = mbchar(t); 1729 if (towupper(c) != d) 1730 return NONE; 1731 } 1732 } 1733 break; 1734 case REX_TRIE: 1735 if (((s + rex->re.trie.min) > env->end) || !(x = rex->re.trie.root[rex->map ? rex->map[*s] : *s])) 1736 return NONE; 1737 return parsetrie(env, x, rex, cont, s); 1738 case REX_EXEC: 1739 u = 0; 1740 r = (*env->disc->re_execf)(env->regex, rex->re.exec.data, rex->re.exec.text, rex->re.exec.size, (const char*)s, env->end - s, &u, env->disc); 1741 e = (unsigned char*)u; 1742 if (e >= s && e <= env->end) 1743 s = e; 1744 switch (r) 1745 { 1746 case 0: 1747 break; 1748 case REG_NOMATCH: 1749 return NONE; 1750 default: 1751 env->error = r; 1752 return BAD; 1753 } 1754 break; 1755 case REX_WBEG: 1756 if (!isword(*s) || s > env->beg && isword(*(s - 1))) 1757 return NONE; 1758 break; 1759 case REX_WEND: 1760 if (isword(*s) || s > env->beg && !isword(*(s - 1))) 1761 return NONE; 1762 break; 1763 case REX_WORD: 1764 if (s > env->beg && isword(*(s - 1)) == isword(*s)) 1765 return NONE; 1766 break; 1767 case REX_WORD_NOT: 1768 if (s == env->beg || isword(*(s - 1)) != isword(*s)) 1769 return NONE; 1770 break; 1771 case REX_BEG_STR: 1772 if (s != env->beg) 1773 return NONE; 1774 break; 1775 case REX_END_STR: 1776 for (t = s; t < env->end && *t == '\n'; t++); 1777 if (t < env->end) 1778 return NONE; 1779 break; 1780 case REX_FIN_STR: 1781 if (s < env->end) 1782 return NONE; 1783 break; 1784 } 1785 if (!(rex = rex->next)) 1786 { 1787 if (!(rex = cont)) 1788 break; 1789 cont = 0; 1790 } 1791 } 1792 return GOOD; 1793 } 1794 1795 #if _AST_REGEX_DEBUG 1796 1797 static void 1798 listnode(Rex_t* e, int level) 1799 { 1800 int i; 1801 1802 if (e) 1803 { 1804 do 1805 { 1806 for (i = 0; i < level; i++) 1807 sfprintf(sfstderr, " "); 1808 sfprintf(sfstderr, "%s\n", rexname(e)); 1809 switch (e->type) 1810 { 1811 case REX_ALT: 1812 case REX_CONJ: 1813 listnode(e->re.group.expr.binary.left, level + 1); 1814 listnode(e->re.group.expr.binary.right, level + 1); 1815 break; 1816 case REX_GROUP: 1817 case REX_GROUP_AHEAD: 1818 case REX_GROUP_AHEAD_NOT: 1819 case REX_GROUP_BEHIND: 1820 case REX_GROUP_BEHIND_NOT: 1821 case REX_GROUP_CUT: 1822 case REX_NEG: 1823 case REX_REP: 1824 listnode(e->re.group.expr.rex, level + 1); 1825 break; 1826 } 1827 } while (e = e->next); 1828 } 1829 } 1830 1831 static int 1832 list(Env_t* env, Rex_t* rex) 1833 { 1834 sfprintf(sfstderr, "AHA regex hard=%d stack=%p\n", env->hard, env->stack); 1835 if (rex) 1836 listnode(rex, 1); 1837 return 0; 1838 } 1839 1840 #endif 1841 1842 /* 1843 * returning REG_BADPAT or REG_ESPACE is not explicitly 1844 * countenanced by the standard 1845 */ 1846 1847 int 1848 regnexec(const regex_t* p, const char* s, size_t len, size_t nmatch, regmatch_t* match, regflags_t flags) 1849 { 1850 register int n; 1851 register int i; 1852 int j; 1853 int k; 1854 int m; 1855 int advance; 1856 Env_t* env; 1857 Rex_t* e; 1858 1859 DEBUG_INIT(); 1860 DEBUG_TEST(0x0001,(sfprintf(sfstdout, "AHA#%04d 0x%04x regnexec %d 0x%08x `%-.*s'\n", __LINE__, debug_flag, nmatch, flags, len, s)),(0)); 1861 if (!p || !(env = p->env)) 1862 return REG_BADPAT; 1863 if (!s) 1864 return fatal(env->disc, REG_BADPAT, NiL); 1865 if (len < env->min) 1866 { 1867 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, len, env->min)),(0)); 1868 return REG_NOMATCH; 1869 } 1870 env->regex = p; 1871 env->beg = (unsigned char*)s; 1872 env->end = env->beg + len; 1873 stknew(stkstd, &env->stk); 1874 env->flags &= ~REG_EXEC; 1875 env->flags |= (flags & REG_EXEC); 1876 advance = 0; 1877 if (env->stack = env->hard || !(env->flags & REG_NOSUB) && nmatch) 1878 { 1879 n = env->nsub; 1880 if (!(env->match = (regmatch_t*)stkpush(stkstd, 2 * (n + 1) * sizeof(regmatch_t))) || 1881 !env->pos && !(env->pos = vecopen(16, sizeof(Pos_t))) || 1882 !env->bestpos && !(env->bestpos = vecopen(16, sizeof(Pos_t)))) 1883 { 1884 k = REG_ESPACE; 1885 goto done; 1886 } 1887 env->pos->cur = env->bestpos->cur = 0; 1888 env->best = &env->match[n + 1]; 1889 env->best[0].rm_so = 0; 1890 env->best[0].rm_eo = -1; 1891 for (i = 0; i <= n; i++) 1892 env->match[i] = state.nomatch; 1893 if (flags & REG_ADVANCE) 1894 advance = 1; 1895 } 1896 DEBUG_TEST(0x1000,(list(env,env->rex)),(0)); 1897 k = REG_NOMATCH; 1898 if ((e = env->rex)->type == REX_BM) 1899 { 1900 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REX_BM\n", __LINE__)),(0)); 1901 if (len < e->re.bm.right) 1902 { 1903 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, len, e->re.bm.right)),(0)); 1904 goto done; 1905 } 1906 else if (!(flags & REG_LEFT)) 1907 { 1908 register unsigned char* buf = (unsigned char*)s; 1909 register size_t index = e->re.bm.left + e->re.bm.size; 1910 register size_t mid = len - e->re.bm.right; 1911 register size_t* skip = e->re.bm.skip; 1912 register size_t* fail = e->re.bm.fail; 1913 register Bm_mask_t** mask = e->re.bm.mask; 1914 Bm_mask_t m; 1915 size_t x; 1916 1917 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REX_BM len=%d right=%d left=%d size=%d %d %d\n", __LINE__, len, e->re.bm.right, e->re.bm.left, e->re.bm.size, index, mid)),(0)); 1918 for (;;) 1919 { 1920 while ((index += skip[buf[index]]) < mid); 1921 if (index < HIT) 1922 { 1923 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, index, HIT)),(0)); 1924 goto done; 1925 } 1926 index -= HIT; 1927 m = mask[n = e->re.bm.size - 1][buf[index]]; 1928 do 1929 { 1930 if (!n--) 1931 { 1932 if (e->re.bm.back < 0) 1933 goto possible; 1934 if (advance) 1935 { 1936 i = index - e->re.bm.back; 1937 s += i; 1938 if (env->stack) 1939 env->best[0].rm_so += i; 1940 goto possible; 1941 } 1942 x = index; 1943 if (index < e->re.bm.back) 1944 index = 0; 1945 else 1946 index -= e->re.bm.back; 1947 while (index <= x) 1948 { 1949 if ((i = parse(env, e->next, &env->done, buf + index)) != NONE) 1950 { 1951 if (env->stack) 1952 env->best[0].rm_so = index; 1953 n = env->nsub; 1954 goto hit; 1955 } 1956 index++; 1957 } 1958 index += e->re.bm.size; 1959 break; 1960 } 1961 } while (m &= mask[n][buf[--index]]); 1962 if ((index += fail[n + 1]) >= len) 1963 goto done; 1964 } 1965 } 1966 possible: 1967 n = env->nsub; 1968 e = e->next; 1969 } 1970 j = env->once || (flags & REG_LEFT); 1971 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d parse once=%d\n", __LINE__, j)),(0)); 1972 while ((i = parse(env, e, &env->done, (unsigned char*)s)) == NONE || advance && !env->best[0].rm_eo && !(advance = 0)) 1973 { 1974 if (j) 1975 goto done; 1976 i = MBSIZE(s); 1977 s += i; 1978 if ((unsigned char*)s > env->end - env->min) 1979 goto done; 1980 if (env->stack) 1981 env->best[0].rm_so += i; 1982 } 1983 if ((flags & REG_LEFT) && env->stack && env->best[0].rm_so) 1984 goto done; 1985 hit: 1986 if (k = env->error) 1987 goto done; 1988 if (i == CUT) 1989 { 1990 k = env->error = REG_NOMATCH; 1991 goto done; 1992 } 1993 if (!(env->flags & REG_NOSUB)) 1994 { 1995 k = (env->flags & (REG_SHELL|REG_AUGMENTED)) == (REG_SHELL|REG_AUGMENTED); 1996 for (i = j = m = 0; j < nmatch; i++) 1997 if (!i || !k || (i & 1)) 1998 { 1999 if (i > n) 2000 match[j] = state.nomatch; 2001 else 2002 match[m = j] = env->best[i]; 2003 j++; 2004 } 2005 if (k) 2006 { 2007 while (m > 0 && match[m].rm_so == -1 && match[m].rm_eo == -1) 2008 m--; 2009 ((regex_t*)p)->re_nsub = m; 2010 } 2011 } 2012 k = 0; 2013 done: 2014 stkold(stkstd, &env->stk); 2015 env->stk.base = 0; 2016 if (k > REG_NOMATCH) 2017 fatal(p->env->disc, k, NiL); 2018 return k; 2019 } 2020 2021 void 2022 regfree(regex_t* p) 2023 { 2024 Env_t* env; 2025 2026 if (p && (env = p->env)) 2027 { 2028 #if _REG_subcomp 2029 if (env->sub) 2030 { 2031 regsubfree(p); 2032 p->re_sub = 0; 2033 } 2034 #endif 2035 p->env = 0; 2036 if (--env->refs <= 0 && !(env->disc->re_flags & REG_NOFREE)) 2037 { 2038 drop(env->disc, env->rex); 2039 if (env->pos) 2040 vecclose(env->pos); 2041 if (env->bestpos) 2042 vecclose(env->bestpos); 2043 if (env->stk.base) 2044 stkold(stkstd, &env->stk); 2045 alloc(env->disc, env, 0); 2046 } 2047 } 2048 } 2049