1 /*********************************************************************** 2 * * 3 * This software is part of the ast package * 4 * Copyright (c) 1985-2012 AT&T Intellectual Property * 5 * and is licensed under the * 6 * Eclipse Public License, Version 1.0 * 7 * by AT&T Intellectual Property * 8 * * 9 * A copy of the License is available at * 10 * http://www.eclipse.org/org/documents/epl-v10.html * 11 * (with md5 checksum b35adb5213ca9657e911e9befb180842) * 12 * * 13 * Information and Software Systems Research * 14 * AT&T Research * 15 * Florham Park NJ * 16 * * 17 * Glenn Fowler <gsf@research.att.com> * 18 * David Korn <dgk@research.att.com> * 19 * Phong Vo <kpv@research.att.com> * 20 * * 21 ***********************************************************************/ 22 #pragma prototyped 23 24 /* 25 * posix regex executor 26 * single sized-record interface 27 */ 28 29 #include "reglib.h" 30 31 #if _AST_REGEX_DEBUG 32 33 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n)) 34 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0) 35 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_exec_debug")) debug |= strtoul(t, NiL, 0); } } while (0) 36 37 static unsigned long debug; 38 static unsigned long debug_flag; 39 40 static const char* rexnames[] = 41 { 42 "REX_NULL", 43 "REX_ALT", 44 "REX_ALT_CATCH", 45 "REX_BACK", 46 "REX_BEG", 47 "REX_BEG_STR", 48 "REX_BM", 49 "REX_CAT", 50 "REX_CLASS", 51 "REX_COLL_CLASS", 52 "REX_CONJ", 53 "REX_CONJ_LEFT", 54 "REX_CONJ_RIGHT", 55 "REX_DONE", 56 "REX_DOT", 57 "REX_END", 58 "REX_END_STR", 59 "REX_EXEC", 60 "REX_FIN_STR", 61 "REX_GROUP", 62 "REX_GROUP_CATCH", 63 "REX_GROUP_AHEAD", 64 "REX_GROUP_AHEAD_CATCH", 65 "REX_GROUP_AHEAD_NOT", 66 "REX_GROUP_BEHIND", 67 "REX_GROUP_BEHIND_CATCH", 68 "REX_GROUP_BEHIND_NOT", 69 "REX_GROUP_BEHIND_NOT_CATCH", 70 "REX_GROUP_COND", 71 "REX_GROUP_COND_CATCH", 72 "REX_GROUP_CUT", 73 "REX_GROUP_CUT_CATCH", 74 "REX_KMP", 75 "REX_NEG", 76 "REX_NEG_CATCH", 77 "REX_NEST", 78 "REX_ONECHAR", 79 "REX_REP", 80 "REX_REP_CATCH", 81 "REX_STRING", 82 "REX_TRIE", 83 "REX_WBEG", 84 "REX_WEND", 85 "REX_WORD", 86 "REX_WORD_NOT" 87 }; 88 89 static const char* rexname(Rex_t* rex) 90 { 91 if (!rex) 92 return "NIL"; 93 if (rex->type >= elementsof(rexnames)) 94 return "ERROR"; 95 return rexnames[rex->type]; 96 } 97 98 #else 99 100 #define DEBUG_INIT() 101 #define DEBUG_TEST(f,y,n) (n) 102 #define DEBUG_CODE(f,y,n) do {n} while(0) 103 104 #endif 105 106 #define BEG_ALT 1 /* beginning of an alt */ 107 #define BEG_ONE 2 /* beginning of one iteration of a rep */ 108 #define BEG_REP 3 /* beginning of a repetition */ 109 #define BEG_SUB 4 /* beginning of a subexpression */ 110 #define END_ANY 5 /* end of any of above */ 111 112 /* 113 * returns from parse() 114 */ 115 116 #define NONE 0 /* no parse found */ 117 #define GOOD 1 /* some parse was found */ 118 #define CUT 2 /* no match and no backtrack */ 119 #define BEST 3 /* an unbeatable parse was found */ 120 #define BAD 4 /* error ocurred */ 121 122 /* 123 * REG_SHELL_DOT test 124 */ 125 126 #define LEADING(e,r,s) (*(s)==(e)->leading&&((s)==(e)->beg||*((s)-1)==(r)->explicit)) 127 128 /* 129 * Pos_t is for comparing parses. An entry is made in the 130 * array at the beginning and at the end of each Group_t, 131 * each iteration in a Group_t, and each Binary_t. 132 */ 133 134 typedef struct 135 { 136 unsigned char* p; /* where in string */ 137 size_t length; /* length in string */ 138 short serial; /* preorder subpattern number */ 139 short be; /* which end of pair */ 140 } Pos_t; 141 142 /* ===== begin library support ===== */ 143 144 #define vector(t,v,i) (((i)<(v)->max)?(t*)((v)->vec+(i)*(v)->siz):(t*)vecseek(&(v),i)) 145 146 static Vector_t* 147 vecopen(int inc, int siz) 148 { 149 Vector_t* v; 150 Stk_t* sp; 151 152 if (inc <= 0) 153 inc = 16; 154 if (!(sp = stkopen(STK_SMALL|STK_NULL))) 155 return 0; 156 if (!(v = (Vector_t*)stkseek(sp, sizeof(Vector_t) + inc * siz))) 157 { 158 stkclose(sp); 159 return 0; 160 } 161 v->stk = sp; 162 v->vec = (char*)v + sizeof(Vector_t); 163 v->max = v->inc = inc; 164 v->siz = siz; 165 v->cur = 0; 166 return v; 167 } 168 169 static void* 170 vecseek(Vector_t** p, int index) 171 { 172 Vector_t* v = *p; 173 174 if (index >= v->max) 175 { 176 while ((v->max += v->inc) <= index); 177 if (!(v = (Vector_t*)stkseek(v->stk, sizeof(Vector_t) + v->max * v->siz))) 178 return 0; 179 *p = v; 180 v->vec = (char*)v + sizeof(Vector_t); 181 } 182 return v->vec + index * v->siz; 183 } 184 185 static void 186 vecclose(Vector_t* v) 187 { 188 if (v) 189 stkclose(v->stk); 190 } 191 192 typedef struct 193 { 194 Stk_pos_t pos; 195 char data[1]; 196 } Stk_frame_t; 197 198 #define stknew(s,p) ((p)->offset=stktell(s),(p)->base=stkfreeze(s,0)) 199 #define stkold(s,p) stkset(s,(p)->base,(p)->offset) 200 201 #define stkframe(s) (*((Stk_frame_t**)stktop(s)-1)) 202 #define stkdata(s,t) ((t*)stkframe(s)->data) 203 #define stkpop(s) stkold(s,&(stkframe(s)->pos)) 204 205 static void* 206 stkpush(Stk_t* sp, size_t size) 207 { 208 Stk_frame_t* f; 209 Stk_pos_t p; 210 211 stknew(sp, &p); 212 size = sizeof(Stk_frame_t) + sizeof(size_t) + size - 1; 213 if (!(f = (Stk_frame_t*)stkalloc(sp, sizeof(Stk_frame_t) + sizeof(Stk_frame_t*) + size - 1))) 214 return 0; 215 f->pos = p; 216 stkframe(sp) = f; 217 return f->data; 218 } 219 220 /* ===== end library support ===== */ 221 222 /* 223 * Match_frame_t is for saving and restoring match records 224 * around alternate attempts, so that fossils will not be 225 * left in the match array. These are the only entries in 226 * the match array that are not otherwise guaranteed to 227 * have current data in them when they get used. 228 */ 229 230 typedef struct 231 { 232 size_t size; 233 regmatch_t* match; 234 regmatch_t save[1]; 235 } Match_frame_t; 236 237 #define matchpush(e,x) ((x)->re.group.number?_matchpush(e,x):0) 238 #define matchcopy(e,x) do if ((x)->re.group.number) { Match_frame_t* fp = (void*)stkframe(stkstd)->data; memcpy(fp->match, fp->save, fp->size); } while (0) 239 #define matchpop(e,x) do if ((x)->re.group.number) { Match_frame_t* fp = (void*)stkframe(stkstd)->data; memcpy(fp->match, fp->save, fp->size); stkpop(stkstd); } while (0) 240 241 #define pospop(e) (--(e)->pos->cur) 242 243 /* 244 * allocate a frame and push a match onto the stack 245 */ 246 247 static int 248 _matchpush(Env_t* env, Rex_t* rex) 249 { 250 Match_frame_t* f; 251 regmatch_t* m; 252 regmatch_t* e; 253 regmatch_t* s; 254 int num; 255 256 if (rex->re.group.number <= 0 || (num = rex->re.group.last - rex->re.group.number + 1) <= 0) 257 num = 0; 258 if (!(f = (Match_frame_t*)stkpush(stkstd, sizeof(Match_frame_t) + (num - 1) * sizeof(regmatch_t)))) 259 { 260 env->error = REG_ESPACE; 261 return 1; 262 } 263 f->size = num * sizeof(regmatch_t); 264 f->match = m = env->match + rex->re.group.number; 265 e = m + num; 266 s = f->save; 267 while (m < e) 268 { 269 *s++ = *m; 270 *m++ = state.nomatch; 271 } 272 return 0; 273 } 274 275 /* 276 * allocate a frame and push a pos onto the stack 277 */ 278 279 static int 280 pospush(Env_t* env, Rex_t* rex, unsigned char* p, int be) 281 { 282 Pos_t* pos; 283 284 if (!(pos = vector(Pos_t, env->pos, env->pos->cur))) 285 { 286 env->error = REG_ESPACE; 287 return 1; 288 } 289 pos->serial = rex->serial; 290 pos->p = p; 291 pos->be = be; 292 env->pos->cur++; 293 return 0; 294 } 295 296 /* 297 * two matches are known to have the same length 298 * os is start of old pos array, ns is start of new, 299 * oend and nend are end+1 pointers to ends of arrays. 300 * oe and ne are ends (not end+1) of subarrays. 301 * returns 1 if new is better, -1 if old, else 0. 302 */ 303 304 static int 305 better(Env_t* env, Pos_t* os, Pos_t* ns, Pos_t* oend, Pos_t* nend, int level) 306 { 307 Pos_t* oe; 308 Pos_t* ne; 309 int k; 310 int n; 311 312 if (env->error) 313 return -1; 314 for (;;) 315 { 316 DEBUG_CODE(0x0080,{sfprintf(sfstdout, " %-*.*sold ", (level + 3) * 4, (level + 3) * 4, "");for (oe = os; oe < oend; oe++)sfprintf(sfstdout, "<%d,%d,%d>", oe->p - env->beg, oe->serial, oe->be);sfprintf(sfstdout, "\n %-*.*snew ", (level + 3) * 4, (level + 3) * 4, "");for (oe = ns; oe < nend; oe++)sfprintf(sfstdout, "<%d,%d,%d>", oe->p - env->beg, oe->serial, oe->be);sfprintf(sfstdout, "\n");},{0;}); 317 if (ns >= nend) 318 return DEBUG_TEST(0x8000,(os < oend),(0)); 319 if (os >= oend) 320 return DEBUG_TEST(0x8000,(-1),(1)); 321 n = os->serial; 322 if (ns->serial > n) 323 return -1; 324 if (n > ns->serial) 325 { 326 env->error = REG_PANIC; 327 return -1; 328 } 329 if (ns->p > os->p) 330 return 1; 331 if (os->p > ns->p) 332 return -1; 333 oe = os; 334 k = 0; 335 for (;;) 336 if ((++oe)->serial == n) 337 { 338 if (oe->be != END_ANY) 339 k++; 340 else if (k-- <= 0) 341 break; 342 } 343 ne = ns; 344 k = 0; 345 for (;;) 346 if ((++ne)->serial == n) 347 { 348 if (ne->be != END_ANY) 349 k++; 350 else if (k-- <= 0) 351 break; 352 } 353 if (ne->p > oe->p) 354 return 1; 355 if (oe->p > ne->p) 356 return -1; 357 if (k = better(env, os + 1, ns + 1, oe, ne, level + 1)) 358 return k; 359 os = oe + 1; 360 ns = ne + 1; 361 } 362 } 363 364 #if _AST_REGEX_DEBUG 365 366 static void 367 showmatch(regmatch_t* p) 368 { 369 sfputc(sfstdout, '('); 370 if (p->rm_so < 0) 371 sfputc(sfstdout, '?'); 372 else 373 sfprintf(sfstdout, "%z", p->rm_so); 374 sfputc(sfstdout, ','); 375 if (p->rm_eo < 0) 376 sfputc(sfstdout, '?'); 377 else 378 sfprintf(sfstdout, "%z", p->rm_eo); 379 sfputc(sfstdout, ')'); 380 } 381 382 static int 383 _better(Env_t* env, Pos_t* os, Pos_t* ns, Pos_t* oend, Pos_t* nend, int level) 384 { 385 int i; 386 387 DEBUG_CODE(0x0040,{sfprintf(sfstdout, "AHA better old ");for (i = 0; i <= env->nsub; i++)showmatch(&env->best[i]);sfprintf(sfstdout, "\n new ");for (i = 0; i <= env->nsub; i++)showmatch(&env->match[i]);sfprintf(sfstdout, "\n");},{0;}); 388 i = better(env, os, ns, oend, nend, 0); 389 DEBUG_TEST(0x0040,(sfprintf(sfstdout, " %s\n", i <= 0 ? "OLD" : "NEW")),(0)); 390 return i; 391 } 392 393 #define better _better 394 395 #endif 396 397 #define follow(e,r,c,s) ((r)->next?parse(e,(r)->next,c,s):(c)?parse(e,c,0,s):BEST) 398 399 static int parse(Env_t*, Rex_t*, Rex_t*, unsigned char*); 400 401 static int 402 parserep(Env_t* env, Rex_t* rex, Rex_t* cont, unsigned char* s, int n) 403 { 404 int i; 405 int r = NONE; 406 Rex_t catcher; 407 408 DEBUG_TEST(0x0010,(sfprintf(sfstdout, "AHA#%04d 0x%04x parserep %s %d %d %d %d `%-.*s'\n", __LINE__, debug_flag, rexname(rex->re.group.expr.rex), rex->re.group.number, rex->lo, n, rex->hi, env->end - s, s)),(0)); 409 if ((rex->flags & REG_MINIMAL) && n >= rex->lo && n < rex->hi) 410 { 411 if (env->stack && pospush(env, rex, s, END_ANY)) 412 return BAD; 413 i = follow(env, rex, cont, s); 414 if (env->stack) 415 pospop(env); 416 switch (i) 417 { 418 case BAD: 419 return BAD; 420 case CUT: 421 return CUT; 422 case BEST: 423 case GOOD: 424 return BEST; 425 } 426 } 427 if (n < rex->hi) 428 { 429 catcher.type = REX_REP_CATCH; 430 catcher.serial = rex->serial; 431 catcher.re.rep_catch.ref = rex; 432 catcher.re.rep_catch.cont = cont; 433 catcher.re.rep_catch.beg = s; 434 catcher.re.rep_catch.n = n + 1; 435 catcher.next = rex->next; 436 if (n == 0) 437 rex->re.rep_catch.beg = s; 438 if (env->stack) 439 { 440 if (matchpush(env, rex)) 441 return BAD; 442 if (pospush(env, rex, s, BEG_ONE)) 443 return BAD; 444 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x PUSH %d (%z,%z)(%z,%z)(%z,%z) (%z,%z)(%z,%z)(%z,%z)\n", __LINE__, debug_flag, rex->re.group.number, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo, env->match[2].rm_so, env->match[2].rm_eo)),(0)); 445 } 446 r = parse(env, rex->re.group.expr.rex, &catcher, s); 447 DEBUG_TEST(0x0010,(sfprintf(sfstdout, "AHA#%04d 0x%04x parserep parse %d %d `%-.*s'\n", __LINE__, debug_flag, rex->re.group.number, r, env->end - s, s)),(0)); 448 if (env->stack) 449 { 450 pospop(env); 451 matchpop(env, rex); 452 DEBUG_TEST(0x0004,(sfprintf(sfstdout,"AHA#%04d 0x%04x POP %d %d (%z,%z)(%z,%z)(%z,%z) (%z,%z)(%z,%z)(%z,%z)\n", __LINE__, debug_flag, rex->re.group.number, r, env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo, env->match[2].rm_so, env->match[2].rm_eo)),(0)); 453 } 454 switch (r) 455 { 456 case BAD: 457 return BAD; 458 case BEST: 459 return BEST; 460 case CUT: 461 r = NONE; 462 break; 463 case GOOD: 464 if (rex->flags & REG_MINIMAL) 465 return BEST; 466 r = GOOD; 467 break; 468 } 469 } 470 if (n < rex->lo) 471 return r; 472 if (!(rex->flags & REG_MINIMAL) || n >= rex->hi) 473 { 474 if (env->stack && pospush(env, rex, s, END_ANY)) 475 return BAD; 476 i = follow(env, rex, cont, s); 477 if (env->stack) 478 pospop(env); 479 switch (i) 480 { 481 case BAD: 482 r = BAD; 483 break; 484 case CUT: 485 r = CUT; 486 break; 487 case BEST: 488 r = BEST; 489 break; 490 case GOOD: 491 r = (rex->flags & REG_MINIMAL) ? BEST : GOOD; 492 break; 493 } 494 } 495 return r; 496 } 497 498 static int 499 parsetrie(Env_t* env, Trie_node_t* x, Rex_t* rex, Rex_t* cont, unsigned char* s) 500 { 501 unsigned char* p; 502 int r; 503 504 if (p = rex->map) 505 { 506 for (;;) 507 { 508 if (s >= env->end) 509 return NONE; 510 while (x->c != p[*s]) 511 if (!(x = x->sib)) 512 return NONE; 513 if (x->end) 514 break; 515 x = x->son; 516 s++; 517 } 518 } 519 else 520 { 521 for (;;) 522 { 523 if (s >= env->end) 524 return NONE; 525 while (x->c != *s) 526 if (!(x = x->sib)) 527 return NONE; 528 if (x->end) 529 break; 530 x = x->son; 531 s++; 532 } 533 } 534 s++; 535 if (rex->flags & REG_MINIMAL) 536 switch (follow(env, rex, cont, s)) 537 { 538 case BAD: 539 return BAD; 540 case CUT: 541 return CUT; 542 case BEST: 543 case GOOD: 544 return BEST; 545 } 546 if (x->son) 547 switch (parsetrie(env, x->son, rex, cont, s)) 548 { 549 case BAD: 550 return BAD; 551 case CUT: 552 return CUT; 553 case BEST: 554 return BEST; 555 case GOOD: 556 if (rex->flags & REG_MINIMAL) 557 return BEST; 558 r = GOOD; 559 break; 560 default: 561 r = NONE; 562 break; 563 } 564 else 565 r = NONE; 566 if (!(rex->flags & REG_MINIMAL)) 567 switch (follow(env, rex, cont, s)) 568 { 569 case BAD: 570 return BAD; 571 case CUT: 572 return CUT; 573 case BEST: 574 return BEST; 575 case GOOD: 576 return GOOD; 577 } 578 return r; 579 } 580 581 static int 582 collelt(register Celt_t* ce, char* key, int c, int x) 583 { 584 Ckey_t elt; 585 586 mbxfrm(elt, key, COLL_KEY_MAX); 587 for (;; ce++) 588 { 589 switch (ce->typ) 590 { 591 case COLL_call: 592 if (!x && (*ce->fun)(c)) 593 return 1; 594 continue; 595 case COLL_char: 596 if (!strcmp((char*)ce->beg, (char*)elt)) 597 return 1; 598 continue; 599 case COLL_range: 600 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max) 601 return 1; 602 continue; 603 case COLL_range_lc: 604 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max && (iswlower(c) || !iswupper(c))) 605 return 1; 606 continue; 607 case COLL_range_uc: 608 if (strcmp((char*)ce->beg, (char*)elt) <= ce->min && strcmp((char*)elt, (char*)ce->end) <= ce->max && (iswupper(c) || !iswlower(c))) 609 return 1; 610 continue; 611 } 612 break; 613 } 614 return 0; 615 } 616 617 static int 618 collic(register Celt_t* ce, char* key, register char* nxt, int c, int x) 619 { 620 if (!x) 621 { 622 if (collelt(ce, key, c, x)) 623 return 1; 624 if (iswlower(c)) 625 c = towupper(c); 626 else if (iswupper(c)) 627 c = towlower(c); 628 else 629 return 0; 630 x = mbconv(key, c); 631 key[x] = 0; 632 return collelt(ce, key, c, 0); 633 } 634 while (*nxt) 635 { 636 if (collic(ce, key, nxt + 1, c, x)) 637 return 1; 638 if (islower(*nxt)) 639 *nxt = toupper(*nxt); 640 else if (isupper(*nxt)) 641 *nxt = tolower(*nxt); 642 else 643 return 0; 644 nxt++; 645 } 646 return collelt(ce, key, c, x); 647 } 648 649 static int 650 collmatch(Rex_t* rex, unsigned char* s, unsigned char* e, unsigned char** p) 651 { 652 unsigned char* t; 653 wchar_t c; 654 int w; 655 int r; 656 int x; 657 int ic; 658 Ckey_t key; 659 Ckey_t elt; 660 661 ic = (rex->flags & REG_ICASE); 662 if ((w = MBSIZE(s)) > 1) 663 { 664 memcpy((char*)key, (char*)s, w); 665 key[w] = 0; 666 t = s; 667 c = mbchar(t); 668 #if !_lib_wctype 669 c &= 0xff; 670 #endif 671 x = 0; 672 } 673 else 674 { 675 c = s[0]; 676 if (ic && isupper(c)) 677 c = tolower(c); 678 key[0] = c; 679 key[1] = 0; 680 if (isalpha(c)) 681 { 682 x = e - s; 683 if (x > COLL_KEY_MAX) 684 x = COLL_KEY_MAX; 685 while (w < x) 686 { 687 c = s[w]; 688 if (!isalpha(c)) 689 break; 690 r = mbxfrm(elt, key, COLL_KEY_MAX); 691 if (ic && isupper(c)) 692 c = tolower(c); 693 key[w] = c; 694 key[w + 1] = 0; 695 if (mbxfrm(elt, key, COLL_KEY_MAX) != r) 696 break; 697 w++; 698 } 699 } 700 key[w] = 0; 701 c = key[0]; 702 x = w - 1; 703 } 704 r = 1; 705 for (;;) 706 { 707 if (ic ? collic(rex->re.collate.elements, (char*)key, (char*)key, c, x) : collelt(rex->re.collate.elements, (char*)key, c, x)) 708 break; 709 if (!x) 710 { 711 r = 0; 712 break; 713 } 714 w = x--; 715 key[w] = 0; 716 } 717 *p = s + w; 718 return rex->re.collate.invert ? !r : r; 719 } 720 721 static unsigned char* 722 nestmatch(register unsigned char* s, register unsigned char* e, const unsigned short* type, register int co) 723 { 724 register int c; 725 register int cc; 726 unsigned int n; 727 int oc; 728 729 if (type[co] & (REX_NEST_literal|REX_NEST_quote)) 730 { 731 n = (type[co] & REX_NEST_literal) ? REX_NEST_terminator : (REX_NEST_escape|REX_NEST_terminator); 732 while (s < e) 733 { 734 c = *s++; 735 if (c == co) 736 return s; 737 else if (type[c] & n) 738 { 739 if (s >= e || (type[c] & REX_NEST_terminator)) 740 break; 741 s++; 742 } 743 } 744 } 745 else 746 { 747 cc = type[co] >> REX_NEST_SHIFT; 748 oc = type[co] & (REX_NEST_open|REX_NEST_close); 749 n = 1; 750 while (s < e) 751 { 752 c = *s++; 753 switch (type[c] & (REX_NEST_escape|REX_NEST_open|REX_NEST_close|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)) 754 { 755 case REX_NEST_delimiter: 756 case REX_NEST_terminator: 757 return oc ? 0 : s; 758 case REX_NEST_separator: 759 if (!oc) 760 return s; 761 break; 762 case REX_NEST_escape: 763 if (s >= e) 764 return 0; 765 s++; 766 break; 767 case REX_NEST_open|REX_NEST_close: 768 if (c == cc) 769 { 770 if (!--n) 771 return s; 772 } 773 /*FALLTHROUGH*/ 774 case REX_NEST_open: 775 if (c == co) 776 { 777 if (!++n) 778 return 0; 779 } 780 else if (!(s = nestmatch(s, e, type, c))) 781 return 0; 782 break; 783 case REX_NEST_close: 784 if (c != cc) 785 return 0; 786 if (!--n) 787 return s; 788 break; 789 } 790 } 791 return (oc || !(type[UCHAR_MAX+1] & REX_NEST_terminator)) ? 0 : s; 792 } 793 return 0; 794 } 795 796 static int 797 parse(Env_t* env, Rex_t* rex, Rex_t* cont, unsigned char* s) 798 { 799 int c; 800 int d; 801 int m; 802 int r; 803 ssize_t i; 804 ssize_t n; 805 int* f; 806 unsigned char* p; 807 unsigned char* t; 808 unsigned char* b; 809 unsigned char* e; 810 char* u; 811 regmatch_t* o; 812 Trie_node_t* x; 813 Rex_t* q; 814 Rex_t catcher; 815 Rex_t next; 816 817 for (;;) 818 { 819 DEBUG_TEST(0x0008,(sfprintf(sfstdout, "AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), env->end - s, s)),(0)); 820 switch (rex->type) 821 { 822 case REX_ALT: 823 if (env->stack) 824 { 825 if (matchpush(env, rex)) 826 return BAD; 827 if (pospush(env, rex, s, BEG_ALT)) 828 return BAD; 829 catcher.type = REX_ALT_CATCH; 830 catcher.serial = rex->serial; 831 catcher.re.alt_catch.cont = cont; 832 catcher.next = rex->next; 833 r = parse(env, rex->re.group.expr.binary.left, &catcher, s); 834 if (r < BEST || (rex->flags & REG_MINIMAL)) 835 { 836 matchcopy(env, rex); 837 ((Pos_t*)env->pos->vec + env->pos->cur - 1)->serial = catcher.serial = rex->re.group.expr.binary.serial; 838 n = parse(env, rex->re.group.expr.binary.right, &catcher, s); 839 if (n != NONE) 840 r = n; 841 } 842 pospop(env); 843 matchpop(env, rex); 844 } 845 else 846 { 847 if ((r = parse(env, rex->re.group.expr.binary.left, cont, s)) == NONE) 848 r = parse(env, rex->re.group.expr.binary.right, cont, s); 849 if (r == GOOD) 850 r = BEST; 851 } 852 return r; 853 case REX_ALT_CATCH: 854 if (pospush(env, rex, s, END_ANY)) 855 return BAD; 856 r = follow(env, rex, rex->re.alt_catch.cont, s); 857 pospop(env); 858 return r; 859 case REX_BACK: 860 o = &env->match[rex->lo]; 861 if (o->rm_so < 0) 862 return NONE; 863 i = o->rm_eo - o->rm_so; 864 e = s + i; 865 if (e > env->end) 866 return NONE; 867 t = env->beg + o->rm_so; 868 if (!(p = rex->map)) 869 { 870 while (s < e) 871 if (*s++ != *t++) 872 return NONE; 873 } 874 else if (!mbwide()) 875 { 876 while (s < e) 877 if (p[*s++] != p[*t++]) 878 return NONE; 879 } 880 else 881 { 882 while (s < e) 883 { 884 c = mbchar(s); 885 d = mbchar(t); 886 if (towupper(c) != towupper(d)) 887 return NONE; 888 } 889 } 890 break; 891 case REX_BEG: 892 if ((!(rex->flags & REG_NEWLINE) || s <= env->beg || *(s - 1) != '\n') && ((env->flags & REG_NOTBOL) || s != env->beg)) 893 return NONE; 894 break; 895 case REX_CLASS: 896 if (LEADING(env, rex, s)) 897 return NONE; 898 n = rex->hi; 899 if (n > env->end - s) 900 n = env->end - s; 901 m = rex->lo; 902 if (m > n) 903 return NONE; 904 r = NONE; 905 if (!(rex->flags & REG_MINIMAL)) 906 { 907 for (i = 0; i < n; i++) 908 if (!settst(rex->re.charclass, s[i])) 909 { 910 n = i; 911 break; 912 } 913 for (s += n; n-- >= m; s--) 914 switch (follow(env, rex, cont, s)) 915 { 916 case BAD: 917 return BAD; 918 case CUT: 919 return CUT; 920 case BEST: 921 return BEST; 922 case GOOD: 923 r = GOOD; 924 break; 925 } 926 } 927 else 928 { 929 for (e = s + m; s < e; s++) 930 if (!settst(rex->re.charclass, *s)) 931 return r; 932 e += n - m; 933 for (;;) 934 { 935 switch (follow(env, rex, cont, s)) 936 { 937 case BAD: 938 return BAD; 939 case CUT: 940 return CUT; 941 case BEST: 942 case GOOD: 943 return BEST; 944 } 945 if (s >= e || !settst(rex->re.charclass, *s)) 946 break; 947 s++; 948 } 949 } 950 return r; 951 case REX_COLL_CLASS: 952 if (LEADING(env, rex, s)) 953 return NONE; 954 n = rex->hi; 955 if (n > env->end - s) 956 n = env->end - s; 957 m = rex->lo; 958 if (m > n) 959 return NONE; 960 r = NONE; 961 e = env->end; 962 if (!(rex->flags & REG_MINIMAL)) 963 { 964 if (!(b = (unsigned char*)stkpush(stkstd, n))) 965 { 966 env->error = REG_ESPACE; 967 return BAD; 968 } 969 for (i = 0; s < e && i < n && collmatch(rex, s, e, &t); i++) 970 { 971 b[i] = t - s; 972 s = t; 973 } 974 for (; i-- >= rex->lo; s -= b[i]) 975 switch (follow(env, rex, cont, s)) 976 { 977 case BAD: 978 stkpop(stkstd); 979 return BAD; 980 case CUT: 981 stkpop(stkstd); 982 return CUT; 983 case BEST: 984 stkpop(stkstd); 985 return BEST; 986 case GOOD: 987 r = GOOD; 988 break; 989 } 990 stkpop(stkstd); 991 } 992 else 993 { 994 for (i = 0; i < m && s < e; i++, s = t) 995 if (!collmatch(rex, s, e, &t)) 996 return r; 997 while (i++ <= n) 998 { 999 switch (follow(env, rex, cont, s)) 1000 { 1001 case BAD: 1002 return BAD; 1003 case CUT: 1004 return CUT; 1005 case BEST: 1006 case GOOD: 1007 return BEST; 1008 } 1009 if (s >= e || !collmatch(rex, s, e, &s)) 1010 break; 1011 } 1012 } 1013 return r; 1014 case REX_CONJ: 1015 next.type = REX_CONJ_RIGHT; 1016 next.re.conj_right.cont = cont; 1017 next.next = rex->next; 1018 catcher.type = REX_CONJ_LEFT; 1019 catcher.re.conj_left.right = rex->re.group.expr.binary.right; 1020 catcher.re.conj_left.cont = &next; 1021 catcher.re.conj_left.beg = s; 1022 catcher.next = 0; 1023 return parse(env, rex->re.group.expr.binary.left, &catcher, s); 1024 case REX_CONJ_LEFT: 1025 rex->re.conj_left.cont->re.conj_right.end = s; 1026 cont = rex->re.conj_left.cont; 1027 s = rex->re.conj_left.beg; 1028 rex = rex->re.conj_left.right; 1029 continue; 1030 case REX_CONJ_RIGHT: 1031 if (rex->re.conj_right.end != s) 1032 return NONE; 1033 cont = rex->re.conj_right.cont; 1034 break; 1035 case REX_DONE: 1036 if (!env->stack) 1037 return BEST; 1038 n = s - env->beg; 1039 r = env->nsub; 1040 DEBUG_TEST(0x0100,(sfprintf(sfstdout,"AHA#%04d 0x%04x %s (%z,%z)(%z,%z)(%z,%z)(%z,%z) (%z,%z)(%z,%z)\n", __LINE__, debug_flag, rexname(rex), env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->best[3].rm_so, env->best[3].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 1041 if ((i = env->best[0].rm_eo) >= 0) 1042 { 1043 if (rex->flags & REG_MINIMAL) 1044 { 1045 if (n > i) 1046 return GOOD; 1047 } 1048 else 1049 { 1050 if (n < i) 1051 return GOOD; 1052 } 1053 if (n == i && better(env, 1054 (Pos_t*)env->bestpos->vec, 1055 (Pos_t*)env->pos->vec, 1056 (Pos_t*)env->bestpos->vec+env->bestpos->cur, 1057 (Pos_t*)env->pos->vec+env->pos->cur, 1058 0) <= 0) 1059 return GOOD; 1060 } 1061 env->best[0].rm_eo = n; 1062 memcpy(&env->best[1], &env->match[1], r * sizeof(regmatch_t)); 1063 n = env->pos->cur; 1064 if (!vector(Pos_t, env->bestpos, n)) 1065 { 1066 env->error = REG_ESPACE; 1067 return BAD; 1068 } 1069 env->bestpos->cur = n; 1070 memcpy(env->bestpos->vec, env->pos->vec, n * sizeof(Pos_t)); 1071 DEBUG_TEST(0x0100,(sfprintf(sfstdout,"AHA#%04d 0x%04x %s (%z,%z)(%z,%z)(%z,%z)(%z,%z) (%z,%z)(%z,%z)\n", __LINE__, debug_flag, rexname(rex), env->best[0].rm_so, env->best[0].rm_eo, env->best[1].rm_so, env->best[1].rm_eo, env->best[2].rm_so, env->best[2].rm_eo, env->best[3].rm_so, env->best[3].rm_eo, env->match[0].rm_so, env->match[0].rm_eo, env->match[1].rm_so, env->match[1].rm_eo)),(0)); 1072 return GOOD; 1073 case REX_DOT: 1074 if (LEADING(env, rex, s)) 1075 return NONE; 1076 n = rex->hi; 1077 if (n > env->end - s) 1078 n = env->end - s; 1079 m = rex->lo; 1080 if (m > n) 1081 return NONE; 1082 if ((c = rex->explicit) >= 0 && !mbwide()) 1083 for (i = 0; i < n; i++) 1084 if (s[i] == c) 1085 { 1086 n = i; 1087 break; 1088 } 1089 r = NONE; 1090 if (!(rex->flags & REG_MINIMAL)) 1091 { 1092 if (!mbwide()) 1093 { 1094 for (s += n; n-- >= m; s--) 1095 switch (follow(env, rex, cont, s)) 1096 { 1097 case BAD: 1098 return BAD; 1099 case CUT: 1100 return CUT; 1101 case BEST: 1102 return BEST; 1103 case GOOD: 1104 r = GOOD; 1105 break; 1106 } 1107 } 1108 else 1109 { 1110 if (!(b = (unsigned char*)stkpush(stkstd, n))) 1111 { 1112 env->error = REG_ESPACE; 1113 return BAD; 1114 } 1115 e = env->end; 1116 for (i = 0; s < e && i < n && *s != c; i++) 1117 s += b[i] = MBSIZE(s); 1118 for (; i-- >= m; s -= b[i]) 1119 switch (follow(env, rex, cont, s)) 1120 { 1121 case BAD: 1122 stkpop(stkstd); 1123 return BAD; 1124 case CUT: 1125 stkpop(stkstd); 1126 return CUT; 1127 case BEST: 1128 stkpop(stkstd); 1129 return BEST; 1130 case GOOD: 1131 r = GOOD; 1132 break; 1133 } 1134 stkpop(stkstd); 1135 } 1136 } 1137 else 1138 { 1139 if (!mbwide()) 1140 { 1141 e = s + n; 1142 for (s += m; s <= e; s++) 1143 switch (follow(env, rex, cont, s)) 1144 { 1145 case BAD: 1146 return BAD; 1147 case CUT: 1148 return CUT; 1149 case BEST: 1150 case GOOD: 1151 return BEST; 1152 } 1153 } 1154 else 1155 { 1156 e = env->end; 1157 for (i = 0; s < e && i < m && *s != c; i++) 1158 s += MBSIZE(s); 1159 if (i >= m) 1160 for (; s <= e && i <= n; s += MBSIZE(s), i++) 1161 switch (follow(env, rex, cont, s)) 1162 { 1163 case BAD: 1164 return BAD; 1165 case CUT: 1166 return CUT; 1167 case BEST: 1168 case GOOD: 1169 return BEST; 1170 } 1171 } 1172 } 1173 return r; 1174 case REX_END: 1175 if ((!(rex->flags & REG_NEWLINE) || *s != '\n') && ((env->flags & REG_NOTEOL) || s < env->end)) 1176 return NONE; 1177 break; 1178 case REX_GROUP: 1179 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), env->end - s, s)),(0)); 1180 if (env->stack) 1181 { 1182 if (rex->re.group.number) 1183 env->match[rex->re.group.number].rm_so = s - env->beg; 1184 if (pospush(env, rex, s, BEG_SUB)) 1185 return BAD; 1186 catcher.re.group_catch.eo = rex->re.group.number ? &env->match[rex->re.group.number].rm_eo : (regoff_t*)0; 1187 } 1188 catcher.type = REX_GROUP_CATCH; 1189 catcher.serial = rex->serial; 1190 catcher.re.group_catch.cont = cont; 1191 catcher.next = rex->next; 1192 r = parse(env, rex->re.group.expr.rex, &catcher, s); 1193 if (env->stack) 1194 { 1195 pospop(env); 1196 if (rex->re.group.number) 1197 env->match[rex->re.group.number].rm_so = -1; 1198 } 1199 return r; 1200 case REX_GROUP_CATCH: 1201 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s=>%s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rexname(rex->re.group_catch.cont), env->end - s, s)),(0)); 1202 if (env->stack) 1203 { 1204 if (rex->re.group_catch.eo) 1205 *rex->re.group_catch.eo = s - env->beg; 1206 if (pospush(env, rex, s, END_ANY)) 1207 return BAD; 1208 } 1209 r = follow(env, rex, rex->re.group_catch.cont, s); 1210 if (env->stack) 1211 { 1212 pospop(env); 1213 if (rex->re.group_catch.eo) 1214 *rex->re.group_catch.eo = -1; 1215 } 1216 return r; 1217 case REX_GROUP_AHEAD: 1218 catcher.type = REX_GROUP_AHEAD_CATCH; 1219 catcher.flags = rex->flags; 1220 catcher.serial = rex->serial; 1221 catcher.re.rep_catch.beg = s; 1222 catcher.re.rep_catch.cont = cont; 1223 catcher.next = rex->next; 1224 return parse(env, rex->re.group.expr.rex, &catcher, s); 1225 case REX_GROUP_AHEAD_CATCH: 1226 return follow(env, rex, rex->re.rep_catch.cont, rex->re.rep_catch.beg); 1227 case REX_GROUP_AHEAD_NOT: 1228 r = parse(env, rex->re.group.expr.rex, NiL, s); 1229 if (r == NONE) 1230 r = follow(env, rex, cont, s); 1231 else if (r != BAD) 1232 r = NONE; 1233 return r; 1234 case REX_GROUP_BEHIND: 1235 if ((s - env->beg) < rex->re.group.size) 1236 return NONE; 1237 catcher.type = REX_GROUP_BEHIND_CATCH; 1238 catcher.flags = rex->flags; 1239 catcher.serial = rex->serial; 1240 catcher.re.behind_catch.beg = s; 1241 catcher.re.behind_catch.end = e = env->end; 1242 catcher.re.behind_catch.cont = cont; 1243 catcher.next = rex->next; 1244 for (t = s - rex->re.group.size; t >= env->beg; t--) 1245 { 1246 env->end = s; 1247 r = parse(env, rex->re.group.expr.rex, &catcher, t); 1248 env->end = e; 1249 if (r != NONE) 1250 return r; 1251 } 1252 return NONE; 1253 case REX_GROUP_BEHIND_CATCH: 1254 if (s != rex->re.behind_catch.beg) 1255 return NONE; 1256 env->end = rex->re.behind_catch.end; 1257 return follow(env, rex, rex->re.behind_catch.cont, rex->re.behind_catch.beg); 1258 case REX_GROUP_BEHIND_NOT: 1259 if ((s - env->beg) < rex->re.group.size) 1260 r = NONE; 1261 else 1262 { 1263 catcher.type = REX_GROUP_BEHIND_NOT_CATCH; 1264 catcher.re.neg_catch.beg = s; 1265 catcher.next = 0; 1266 e = env->end; 1267 env->end = s; 1268 for (t = s - rex->re.group.size; t >= env->beg; t--) 1269 { 1270 r = parse(env, rex->re.group.expr.rex, &catcher, t); 1271 if (r != NONE) 1272 break; 1273 } 1274 env->end = e; 1275 } 1276 if (r == NONE) 1277 r = follow(env, rex, cont, s); 1278 else if (r != BAD) 1279 r = NONE; 1280 return r; 1281 case REX_GROUP_BEHIND_NOT_CATCH: 1282 return s == rex->re.neg_catch.beg ? GOOD : NONE; 1283 case REX_GROUP_COND: 1284 if (q = rex->re.group.expr.binary.right) 1285 { 1286 catcher.re.cond_catch.next[0] = q->re.group.expr.binary.right; 1287 catcher.re.cond_catch.next[1] = q->re.group.expr.binary.left; 1288 } 1289 else 1290 catcher.re.cond_catch.next[0] = catcher.re.cond_catch.next[1] = 0; 1291 if (q = rex->re.group.expr.binary.left) 1292 { 1293 catcher.type = REX_GROUP_COND_CATCH; 1294 catcher.flags = rex->flags; 1295 catcher.serial = rex->serial; 1296 catcher.re.cond_catch.yes = 0; 1297 catcher.re.cond_catch.beg = s; 1298 catcher.re.cond_catch.cont = cont; 1299 catcher.next = rex->next; 1300 r = parse(env, q, &catcher, s); 1301 if (r == BAD || catcher.re.cond_catch.yes) 1302 return r; 1303 } 1304 else if (!rex->re.group.size || rex->re.group.size > 0 && env->match[rex->re.group.size].rm_so >= 0) 1305 r = GOOD; 1306 else 1307 r = NONE; 1308 if (q = catcher.re.cond_catch.next[r != NONE]) 1309 { 1310 catcher.type = REX_CAT; 1311 catcher.flags = q->flags; 1312 catcher.serial = q->serial; 1313 catcher.re.group_catch.cont = cont; 1314 catcher.next = rex->next; 1315 return parse(env, q, &catcher, s); 1316 } 1317 return follow(env, rex, cont, s); 1318 case REX_GROUP_COND_CATCH: 1319 rex->re.cond_catch.yes = 1; 1320 catcher.type = REX_CAT; 1321 catcher.flags = rex->flags; 1322 catcher.serial = rex->serial; 1323 catcher.re.group_catch.cont = rex->re.cond_catch.cont; 1324 catcher.next = rex->next; 1325 return parse(env, rex->re.cond_catch.next[1], &catcher, rex->re.cond_catch.beg); 1326 case REX_CAT: 1327 return follow(env, rex, rex->re.group_catch.cont, s); 1328 case REX_GROUP_CUT: 1329 catcher.type = REX_GROUP_CUT_CATCH; 1330 catcher.flags = rex->flags; 1331 catcher.serial = rex->serial; 1332 catcher.re.group_catch.cont = cont; 1333 catcher.next = rex->next; 1334 return parse(env, rex->re.group.expr.rex, &catcher, s); 1335 case REX_GROUP_CUT_CATCH: 1336 switch (r = follow(env, rex, rex->re.group_catch.cont, s)) 1337 { 1338 case GOOD: 1339 r = BEST; 1340 break; 1341 case NONE: 1342 r = CUT; 1343 break; 1344 } 1345 return r; 1346 case REX_KMP: 1347 f = rex->re.string.fail; 1348 b = rex->re.string.base; 1349 n = rex->re.string.size; 1350 t = s; 1351 e = env->end; 1352 if (p = rex->map) 1353 { 1354 while (t + n <= e) 1355 { 1356 for (i = -1; t < e; t++) 1357 { 1358 while (i >= 0 && b[i+1] != p[*t]) 1359 i = f[i]; 1360 if (b[i+1] == p[*t]) 1361 i++; 1362 if (i + 1 == n) 1363 { 1364 t++; 1365 if (env->stack) 1366 env->best[0].rm_so = t - s - n; 1367 switch (follow(env, rex, cont, t)) 1368 { 1369 case BAD: 1370 return BAD; 1371 case CUT: 1372 return CUT; 1373 case BEST: 1374 case GOOD: 1375 return BEST; 1376 } 1377 t -= n - 1; 1378 break; 1379 } 1380 } 1381 } 1382 } 1383 else 1384 { 1385 while (t + n <= e) 1386 { 1387 for (i = -1; t < e; t++) 1388 { 1389 while (i >= 0 && b[i+1] != *t) 1390 i = f[i]; 1391 if (b[i+1] == *t) 1392 i++; 1393 if (i + 1 == n) 1394 { 1395 t++; 1396 if (env->stack) 1397 env->best[0].rm_so = t - s - n; 1398 switch (follow(env, rex, cont, t)) 1399 { 1400 case BAD: 1401 return BAD; 1402 case CUT: 1403 return CUT; 1404 case BEST: 1405 case GOOD: 1406 return BEST; 1407 } 1408 t -= n - 1; 1409 break; 1410 } 1411 } 1412 } 1413 } 1414 return NONE; 1415 case REX_NEG: 1416 if (LEADING(env, rex, s)) 1417 return NONE; 1418 i = env->end - s; 1419 n = ((i + 7) >> 3) + 1; 1420 catcher.type = REX_NEG_CATCH; 1421 catcher.re.neg_catch.beg = s; 1422 if (!(p = (unsigned char*)stkpush(stkstd, n))) 1423 return BAD; 1424 memset(catcher.re.neg_catch.index = p, 0, n); 1425 catcher.next = rex->next; 1426 if (parse(env, rex->re.group.expr.rex, &catcher, s) == BAD) 1427 r = BAD; 1428 else 1429 { 1430 r = NONE; 1431 for (; i >= 0; i--) 1432 if (!bittst(p, i)) 1433 { 1434 switch (follow(env, rex, cont, s + i)) 1435 { 1436 case BAD: 1437 r = BAD; 1438 break; 1439 case BEST: 1440 r = BEST; 1441 break; 1442 case CUT: 1443 r = CUT; 1444 break; 1445 case GOOD: 1446 r = GOOD; 1447 /*FALLTHROUGH*/ 1448 default: 1449 continue; 1450 } 1451 break; 1452 } 1453 } 1454 stkpop(stkstd); 1455 return r; 1456 case REX_NEG_CATCH: 1457 bitset(rex->re.neg_catch.index, s - rex->re.neg_catch.beg); 1458 return NONE; 1459 case REX_NEST: 1460 if (s >= env->end) 1461 return NONE; 1462 do 1463 { 1464 if ((c = *s++) == rex->re.nest.primary) 1465 { 1466 if (s >= env->end || !(s = nestmatch(s, env->end, rex->re.nest.type, c))) 1467 return NONE; 1468 break; 1469 } 1470 if (rex->re.nest.primary >= 0) 1471 return NONE; 1472 if (rex->re.nest.type[c] & (REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)) 1473 break; 1474 if (!(s = nestmatch(s, env->end, rex->re.nest.type, c))) 1475 return NONE; 1476 } while (s < env->end && !(rex->re.nest.type[*(s-1)] & (REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator))); 1477 break; 1478 case REX_NULL: 1479 break; 1480 case REX_ONECHAR: 1481 n = rex->hi; 1482 if (n > env->end - s) 1483 n = env->end - s; 1484 m = rex->lo; 1485 if (m > n) 1486 return NONE; 1487 r = NONE; 1488 c = rex->re.onechar; 1489 if (!(rex->flags & REG_MINIMAL)) 1490 { 1491 if (!mbwide()) 1492 { 1493 if (p = rex->map) 1494 { 1495 for (i = 0; i < n; i++, s++) 1496 if (p[*s] != c) 1497 break; 1498 } 1499 else 1500 { 1501 for (i = 0; i < n; i++, s++) 1502 if (*s != c) 1503 break; 1504 } 1505 for (; i-- >= m; s--) 1506 switch (follow(env, rex, cont, s)) 1507 { 1508 case BAD: 1509 return BAD; 1510 case BEST: 1511 return BEST; 1512 case CUT: 1513 return CUT; 1514 case GOOD: 1515 r = GOOD; 1516 break; 1517 } 1518 } 1519 else 1520 { 1521 if (!(b = (unsigned char*)stkpush(stkstd, n))) 1522 { 1523 env->error = REG_ESPACE; 1524 return BAD; 1525 } 1526 e = env->end; 1527 if (!(rex->flags & REG_ICASE)) 1528 { 1529 for (i = 0; s < e && i < n; i++, s = t) 1530 { 1531 t = s; 1532 if (mbchar(t) != c) 1533 break; 1534 b[i] = t - s; 1535 } 1536 } 1537 else 1538 { 1539 for (i = 0; s < e && i < n; i++, s = t) 1540 { 1541 t = s; 1542 if (towupper(mbchar(t)) != c) 1543 break; 1544 b[i] = t - s; 1545 } 1546 } 1547 for (; i-- >= m; s -= b[i]) 1548 switch (follow(env, rex, cont, s)) 1549 { 1550 case BAD: 1551 stkpop(stkstd); 1552 return BAD; 1553 case BEST: 1554 stkpop(stkstd); 1555 return BEST; 1556 case CUT: 1557 stkpop(stkstd); 1558 return CUT; 1559 case GOOD: 1560 r = GOOD; 1561 break; 1562 } 1563 stkpop(stkstd); 1564 } 1565 } 1566 else 1567 { 1568 if (!mbwide()) 1569 { 1570 e = s + m; 1571 if (p = rex->map) 1572 { 1573 for (; s < e; s++) 1574 if (p[*s] != c) 1575 return r; 1576 e += n - m; 1577 for (;;) 1578 { 1579 switch (follow(env, rex, cont, s)) 1580 { 1581 case BAD: 1582 return BAD; 1583 case CUT: 1584 return CUT; 1585 case BEST: 1586 case GOOD: 1587 return BEST; 1588 } 1589 if (s >= e || p[*s++] != c) 1590 break; 1591 } 1592 } 1593 else 1594 { 1595 for (; s < e; s++) 1596 if (*s != c) 1597 return r; 1598 e += n - m; 1599 for (;;) 1600 { 1601 switch (follow(env, rex, cont, s)) 1602 { 1603 case BAD: 1604 return BAD; 1605 case CUT: 1606 return CUT; 1607 case BEST: 1608 case GOOD: 1609 return BEST; 1610 } 1611 if (s >= e || *s++ != c) 1612 break; 1613 } 1614 } 1615 } 1616 else 1617 { 1618 e = env->end; 1619 if (!(rex->flags & REG_ICASE)) 1620 { 1621 for (i = 0; i < m && s < e; i++, s = t) 1622 { 1623 t = s; 1624 if (mbchar(t) != c) 1625 return r; 1626 } 1627 while (i++ <= n) 1628 { 1629 switch (follow(env, rex, cont, s)) 1630 { 1631 case BAD: 1632 return BAD; 1633 case CUT: 1634 return CUT; 1635 case BEST: 1636 case GOOD: 1637 return BEST; 1638 } 1639 if (s >= e || mbchar(s) != c) 1640 break; 1641 } 1642 } 1643 else 1644 { 1645 for (i = 0; i < m && s < e; i++, s = t) 1646 { 1647 t = s; 1648 if (towupper(mbchar(t)) != c) 1649 return r; 1650 } 1651 while (i++ <= n) 1652 { 1653 switch (follow(env, rex, cont, s)) 1654 { 1655 case BAD: 1656 return BAD; 1657 case CUT: 1658 return CUT; 1659 case BEST: 1660 case GOOD: 1661 return BEST; 1662 } 1663 if (s >= e || towupper(mbchar(s)) != c) 1664 break; 1665 } 1666 } 1667 } 1668 } 1669 return r; 1670 case REX_REP: 1671 if (env->stack && pospush(env, rex, s, BEG_REP)) 1672 return BAD; 1673 r = parserep(env, rex, cont, s, 0); 1674 if (env->stack) 1675 pospop(env); 1676 return r; 1677 case REX_REP_CATCH: 1678 DEBUG_TEST(0x0020,(sfprintf(sfstdout, "AHA#%04d 0x%04x %s n %d len %d s `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rex->re.rep_catch.n, s - rex->re.rep_catch.beg, env->end - s, s)),(0)); 1679 if (env->stack && pospush(env, rex, s, END_ANY)) 1680 return BAD; 1681 if (s == rex->re.rep_catch.beg && rex->re.rep_catch.n > rex->re.rep_catch.ref->lo) 1682 { 1683 /* 1684 * optional empty iteration 1685 */ 1686 1687 DEBUG_TEST(0x0002,(sfprintf(sfstdout, "AHA#%04d %p re.group.back=%d re.group.expr.rex=%s\n", __LINE__, rex->re.rep_catch.ref->re.group.expr.rex, rex->re.rep_catch.ref->re.group.expr.rex->re.group.back, rexname(rex->re.rep_catch.ref->re.group.expr.rex))),(0)); 1688 if (!env->stack || s != rex->re.rep_catch.ref->re.rep_catch.beg && !rex->re.rep_catch.ref->re.group.expr.rex->re.group.back) 1689 r = NONE; 1690 else if (pospush(env, rex, s, END_ANY)) 1691 r = BAD; 1692 else 1693 { 1694 r = follow(env, rex, rex->re.rep_catch.cont, s); 1695 pospop(env); 1696 } 1697 } 1698 else 1699 r = parserep(env, rex->re.rep_catch.ref, rex->re.rep_catch.cont, s, rex->re.rep_catch.n); 1700 if (env->stack) 1701 pospop(env); 1702 return r; 1703 case REX_STRING: 1704 DEBUG_TEST(0x0200,(sfprintf(sfstdout,"AHA#%04d 0x%04x parse %s \"%-.*s\" `%-.*s'\n", __LINE__, debug_flag, rexname(rex), rex->re.string.size, rex->re.string.base, env->end - s, s)),(0)); 1705 if (rex->re.string.size > (env->end - s)) 1706 return NONE; 1707 t = rex->re.string.base; 1708 e = t + rex->re.string.size; 1709 if (!(p = rex->map)) 1710 { 1711 while (t < e) 1712 if (*s++ != *t++) 1713 return NONE; 1714 } 1715 else if (!mbwide()) 1716 { 1717 while (t < e) 1718 if (p[*s++] != *t++) 1719 return NONE; 1720 } 1721 else 1722 { 1723 while (t < e) 1724 { 1725 c = mbchar(s); 1726 d = mbchar(t); 1727 if (towupper(c) != d) 1728 return NONE; 1729 } 1730 } 1731 break; 1732 case REX_TRIE: 1733 if (((s + rex->re.trie.min) > env->end) || !(x = rex->re.trie.root[rex->map ? rex->map[*s] : *s])) 1734 return NONE; 1735 return parsetrie(env, x, rex, cont, s); 1736 case REX_EXEC: 1737 u = 0; 1738 r = (*env->disc->re_execf)(env->regex, rex->re.exec.data, rex->re.exec.text, rex->re.exec.size, (const char*)s, env->end - s, &u, env->disc); 1739 e = (unsigned char*)u; 1740 if (e >= s && e <= env->end) 1741 s = e; 1742 switch (r) 1743 { 1744 case 0: 1745 break; 1746 case REG_NOMATCH: 1747 return NONE; 1748 default: 1749 env->error = r; 1750 return BAD; 1751 } 1752 break; 1753 case REX_WBEG: 1754 if (!isword(*s) || s > env->beg && isword(*(s - 1))) 1755 return NONE; 1756 break; 1757 case REX_WEND: 1758 if (isword(*s) || s > env->beg && !isword(*(s - 1))) 1759 return NONE; 1760 break; 1761 case REX_WORD: 1762 if (s > env->beg && isword(*(s - 1)) == isword(*s)) 1763 return NONE; 1764 break; 1765 case REX_WORD_NOT: 1766 if (s == env->beg || isword(*(s - 1)) != isword(*s)) 1767 return NONE; 1768 break; 1769 case REX_BEG_STR: 1770 if (s != env->beg) 1771 return NONE; 1772 break; 1773 case REX_END_STR: 1774 for (t = s; t < env->end && *t == '\n'; t++); 1775 if (t < env->end) 1776 return NONE; 1777 break; 1778 case REX_FIN_STR: 1779 if (s < env->end) 1780 return NONE; 1781 break; 1782 } 1783 if (!(rex = rex->next)) 1784 { 1785 if (!(rex = cont)) 1786 break; 1787 cont = 0; 1788 } 1789 } 1790 return GOOD; 1791 } 1792 1793 #if _AST_REGEX_DEBUG 1794 1795 static void 1796 listnode(Rex_t* e, int level) 1797 { 1798 int i; 1799 1800 if (e) 1801 { 1802 do 1803 { 1804 for (i = 0; i < level; i++) 1805 sfprintf(sfstderr, " "); 1806 sfprintf(sfstderr, "%s\n", rexname(e)); 1807 switch (e->type) 1808 { 1809 case REX_ALT: 1810 case REX_CONJ: 1811 listnode(e->re.group.expr.binary.left, level + 1); 1812 listnode(e->re.group.expr.binary.right, level + 1); 1813 break; 1814 case REX_GROUP: 1815 case REX_GROUP_AHEAD: 1816 case REX_GROUP_AHEAD_NOT: 1817 case REX_GROUP_BEHIND: 1818 case REX_GROUP_BEHIND_NOT: 1819 case REX_GROUP_CUT: 1820 case REX_NEG: 1821 case REX_REP: 1822 listnode(e->re.group.expr.rex, level + 1); 1823 break; 1824 } 1825 } while (e = e->next); 1826 } 1827 } 1828 1829 static int 1830 list(Env_t* env, Rex_t* rex) 1831 { 1832 sfprintf(sfstderr, "AHA regex hard=%d stack=%p\n", env->hard, env->stack); 1833 if (rex) 1834 listnode(rex, 1); 1835 return 0; 1836 } 1837 1838 #endif 1839 1840 /* 1841 * returning REG_BADPAT or REG_ESPACE is not explicitly 1842 * countenanced by the standard 1843 */ 1844 1845 int 1846 regnexec(const regex_t* p, const char* s, size_t len, size_t nmatch, regmatch_t* match, regflags_t flags) 1847 { 1848 register ssize_t n; 1849 register int i; 1850 int j; 1851 int k; 1852 int m; 1853 int advance; 1854 Env_t* env; 1855 Rex_t* e; 1856 1857 DEBUG_INIT(); 1858 DEBUG_TEST(0x0001,(sfprintf(sfstdout, "AHA#%04d 0x%04x regnexec %d 0x%08x `%-.*s'\n", __LINE__, debug_flag, nmatch, flags, len, s)),(0)); 1859 if (!p || !(env = p->env)) 1860 return REG_BADPAT; 1861 if (!s) 1862 return fatal(env->disc, REG_BADPAT, NiL); 1863 if (len < env->min) 1864 { 1865 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, len, env->min)),(0)); 1866 return REG_NOMATCH; 1867 } 1868 env->regex = p; 1869 env->beg = (unsigned char*)s; 1870 env->end = env->beg + len; 1871 stknew(stkstd, &env->stk); 1872 env->flags &= ~REG_EXEC; 1873 env->flags |= (flags & REG_EXEC); 1874 advance = 0; 1875 if (env->stack = env->hard || !(env->flags & REG_NOSUB) && nmatch) 1876 { 1877 n = env->nsub; 1878 if (!(env->match = (regmatch_t*)stkpush(stkstd, 2 * (n + 1) * sizeof(regmatch_t))) || 1879 !env->pos && !(env->pos = vecopen(16, sizeof(Pos_t))) || 1880 !env->bestpos && !(env->bestpos = vecopen(16, sizeof(Pos_t)))) 1881 { 1882 k = REG_ESPACE; 1883 goto done; 1884 } 1885 env->pos->cur = env->bestpos->cur = 0; 1886 env->best = &env->match[n + 1]; 1887 env->best[0].rm_so = 0; 1888 env->best[0].rm_eo = -1; 1889 for (i = 0; i <= n; i++) 1890 env->match[i] = state.nomatch; 1891 if (flags & REG_ADVANCE) 1892 advance = 1; 1893 } 1894 DEBUG_TEST(0x1000,(list(env,env->rex)),(0)); 1895 k = REG_NOMATCH; 1896 if ((e = env->rex)->type == REX_BM) 1897 { 1898 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REX_BM\n", __LINE__)),(0)); 1899 if (len < e->re.bm.right) 1900 { 1901 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, len, e->re.bm.right)),(0)); 1902 goto done; 1903 } 1904 else if (!(flags & REG_LEFT)) 1905 { 1906 register unsigned char* buf = (unsigned char*)s; 1907 register size_t index = e->re.bm.left + e->re.bm.size; 1908 register size_t mid = len - e->re.bm.right; 1909 register size_t* skip = e->re.bm.skip; 1910 register size_t* fail = e->re.bm.fail; 1911 register Bm_mask_t** mask = e->re.bm.mask; 1912 Bm_mask_t m; 1913 size_t x; 1914 1915 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REX_BM len=%d right=%d left=%d size=%d %d %d\n", __LINE__, len, e->re.bm.right, e->re.bm.left, e->re.bm.size, index, mid)),(0)); 1916 for (;;) 1917 { 1918 while ((index += skip[buf[index]]) < mid); 1919 if (index < HIT) 1920 { 1921 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__, index, HIT)),(0)); 1922 goto done; 1923 } 1924 index -= HIT; 1925 m = mask[n = e->re.bm.size - 1][buf[index]]; 1926 do 1927 { 1928 if (!n--) 1929 { 1930 if (e->re.bm.back < 0) 1931 goto possible; 1932 if (advance) 1933 { 1934 i = index - e->re.bm.back; 1935 s += i; 1936 if (env->stack) 1937 env->best[0].rm_so += i; 1938 goto possible; 1939 } 1940 x = index; 1941 if (index < e->re.bm.back) 1942 index = 0; 1943 else 1944 index -= e->re.bm.back; 1945 while (index <= x) 1946 { 1947 if ((i = parse(env, e->next, &env->done, buf + index)) != NONE) 1948 { 1949 if (env->stack) 1950 env->best[0].rm_so = index; 1951 n = env->nsub; 1952 goto hit; 1953 } 1954 index++; 1955 } 1956 index += e->re.bm.size; 1957 break; 1958 } 1959 } while (m &= mask[n][buf[--index]]); 1960 if ((index += fail[n + 1]) >= len) 1961 goto done; 1962 } 1963 } 1964 possible: 1965 n = env->nsub; 1966 e = e->next; 1967 } 1968 j = env->once || (flags & REG_LEFT); 1969 DEBUG_TEST(0x0080,(sfprintf(sfstdout, "AHA#%04d parse once=%d\n", __LINE__, j)),(0)); 1970 while ((i = parse(env, e, &env->done, (unsigned char*)s)) == NONE || advance && !env->best[0].rm_eo && !(advance = 0)) 1971 { 1972 if (j) 1973 goto done; 1974 i = MBSIZE(s); 1975 s += i; 1976 if ((unsigned char*)s > env->end - env->min) 1977 goto done; 1978 if (env->stack) 1979 env->best[0].rm_so += i; 1980 } 1981 if ((flags & REG_LEFT) && env->stack && env->best[0].rm_so) 1982 goto done; 1983 hit: 1984 if (k = env->error) 1985 goto done; 1986 if (i == CUT) 1987 { 1988 k = env->error = REG_NOMATCH; 1989 goto done; 1990 } 1991 if (!(env->flags & REG_NOSUB)) 1992 { 1993 k = (env->flags & (REG_SHELL|REG_AUGMENTED)) == (REG_SHELL|REG_AUGMENTED); 1994 for (i = j = m = 0; j < nmatch; i++) 1995 if (!i || !k || (i & 1)) 1996 { 1997 if (i > n) 1998 match[j] = state.nomatch; 1999 else 2000 match[m = j] = env->best[i]; 2001 j++; 2002 } 2003 if (k) 2004 { 2005 while (m > 0 && match[m].rm_so == -1 && match[m].rm_eo == -1) 2006 m--; 2007 ((regex_t*)p)->re_nsub = m; 2008 } 2009 } 2010 k = 0; 2011 done: 2012 stkold(stkstd, &env->stk); 2013 env->stk.base = 0; 2014 if (k > REG_NOMATCH) 2015 fatal(p->env->disc, k, NiL); 2016 return k; 2017 } 2018 2019 void 2020 regfree(regex_t* p) 2021 { 2022 Env_t* env; 2023 2024 if (p && (env = p->env)) 2025 { 2026 #if _REG_subcomp 2027 if (env->sub) 2028 { 2029 regsubfree(p); 2030 p->re_sub = 0; 2031 } 2032 #endif 2033 p->env = 0; 2034 if (--env->refs <= 0 && !(env->disc->re_flags & REG_NOFREE)) 2035 { 2036 drop(env->disc, env->rex); 2037 if (env->pos) 2038 vecclose(env->pos); 2039 if (env->bestpos) 2040 vecclose(env->bestpos); 2041 if (env->stk.base) 2042 stkold(stkstd, &env->stk); 2043 alloc(env->disc, env, 0); 2044 } 2045 } 2046 } 2047 2048 /* 2049 * 20120528: regoff_t changed from int to ssize_t 2050 */ 2051 2052 #if defined(__EXPORT__) 2053 #define extern __EXPORT__ 2054 #endif 2055 2056 #undef regnexec 2057 #if _map_libc 2058 #define regnexec _ast_regnexec 2059 #endif 2060 2061 extern int 2062 regnexec(const regex_t* p, const char* s, size_t len, size_t nmatch, oldregmatch_t* oldmatch, regflags_t flags) 2063 { 2064 if (oldmatch) 2065 { 2066 regmatch_t* match; 2067 ssize_t i; 2068 int r; 2069 2070 if (!(match = oldof(0, regmatch_t, nmatch, 0))) 2071 return -1; 2072 if (!(r = regnexec_20120528(p, s, len, nmatch, match, flags))) 2073 for (i = 0; i < nmatch; i++) 2074 { 2075 oldmatch[i].rm_so = match[i].rm_so; 2076 oldmatch[i].rm_eo = match[i].rm_eo; 2077 } 2078 free(match); 2079 return r; 2080 } 2081 return regnexec_20120528(p, s, len, 0, NiL, flags); 2082 } 2083