1/*- 2 * This code is derived from OpenBSD's libc/regex, original license follows: 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38/* 39 * The matching engine and friends. This file is #included by regexec.c 40 * after suitable #defines of a variety of macros used herein, so that 41 * different state representations can be used without duplicating masses 42 * of code. 43 */ 44 45#ifdef SNAMES 46#define matcher smatcher 47#define fast sfast 48#define slow sslow 49#define dissect sdissect 50#define backref sbackref 51#define step sstep 52#define print sprint 53#define at sat 54#define match smat 55#define nope snope 56#define step_back sstep_back 57#endif 58#ifdef LNAMES 59#define matcher lmatcher 60#define fast lfast 61#define slow lslow 62#define dissect ldissect 63#define backref lbackref 64#define step lstep 65#define print lprint 66#define at lat 67#define match lmat 68#define nope lnope 69#define step_back lstep_back 70#endif 71 72/* another structure passed up and down to avoid zillions of parameters */ 73struct match { 74 struct re_guts *g; 75 int eflags; 76 llvm_regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 77 const char *offp; /* offsets work from here */ 78 const char *beginp; /* start of string -- virtual NUL precedes */ 79 const char *endp; /* end of string -- virtual NUL here */ 80 const char *coldp; /* can be no match starting before here */ 81 const char **lastpos; /* [nplus+1] */ 82 STATEVARS; 83 states st; /* current states */ 84 states fresh; /* states for a fresh start */ 85 states tmp; /* temporary */ 86 states empty; /* empty set of states */ 87}; 88 89static int matcher(struct re_guts *, const char *, size_t, 90 llvm_regmatch_t[], int); 91static const char *dissect(struct match *, const char *, const char *, sopno, 92 sopno); 93static const char *backref(struct match *, const char *, const char *, sopno, 94 sopno, sopno, int); 95static const char *fast(struct match *, const char *, const char *, sopno, sopno); 96static const char *slow(struct match *, const char *, const char *, sopno, sopno); 97static states step(struct re_guts *, sopno, sopno, states, int, states); 98#define MAX_RECURSION 100 99#define BOL (OUT+1) 100#define EOL (BOL+1) 101#define BOLEOL (BOL+2) 102#define NOTHING (BOL+3) 103#define BOW (BOL+4) 104#define EOW (BOL+5) 105#define CODEMAX (BOL+5) /* highest code used */ 106#define NONCHAR(c) ((c) > CHAR_MAX) 107#define NNONCHAR (CODEMAX-CHAR_MAX) 108#ifdef REDEBUG 109static void print(struct match *, const char *, states, int, FILE *); 110#endif 111#ifdef REDEBUG 112static void at( 113 struct match *, const char *, const char *, const char *, sopno, sopno); 114#endif 115#ifdef REDEBUG 116static char *pchar(int); 117#endif 118 119#ifdef REDEBUG 120#define SP(t, s, c) print(m, t, s, c, stdout) 121#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 122#define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 123static int nope = 0; 124#else 125#define SP(t, s, c) /* nothing */ 126#define AT(t, p1, p2, s1, s2) /* nothing */ 127#define NOTE(s) /* nothing */ 128#endif 129 130/* 131 - matcher - the actual matching engine 132 */ 133static int /* 0 success, REG_NOMATCH failure */ 134matcher(struct re_guts *g, const char *string, size_t nmatch, 135 llvm_regmatch_t pmatch[], 136 int eflags) 137{ 138 const char *endp; 139 size_t i; 140 struct match mv; 141 struct match *m = &mv; 142 const char *dp; 143 const sopno gf = g->firststate+1; /* +1 for OEND */ 144 const sopno gl = g->laststate; 145 const char *start; 146 const char *stop; 147 148 /* simplify the situation where possible */ 149 if (g->cflags®_NOSUB) 150 nmatch = 0; 151 if (eflags®_STARTEND) { 152 start = string + pmatch[0].rm_so; 153 stop = string + pmatch[0].rm_eo; 154 } else { 155 start = string; 156 stop = start + strlen(start); 157 } 158 if (stop < start) 159 return(REG_INVARG); 160 161 /* prescreening; this does wonders for this rather slow code */ 162 if (g->must != NULL) { 163 for (dp = start; dp < stop; dp++) 164 if (*dp == g->must[0] && stop - dp >= g->mlen && 165 memcmp(dp, g->must, (size_t)g->mlen) == 0) 166 break; 167 if (dp == stop) /* we didn't find g->must */ 168 return(REG_NOMATCH); 169 } 170 171 /* match struct setup */ 172 m->g = g; 173 m->eflags = eflags; 174 m->pmatch = NULL; 175 m->lastpos = NULL; 176 m->offp = string; 177 m->beginp = start; 178 m->endp = stop; 179 STATESETUP(m, 4); 180 SETUP(m->st); 181 SETUP(m->fresh); 182 SETUP(m->tmp); 183 SETUP(m->empty); 184 CLEAR(m->empty); 185 186 /* this loop does only one repetition except for backrefs */ 187 for (;;) { 188 endp = fast(m, start, stop, gf, gl); 189 if (endp == NULL) { /* a miss */ 190 free(m->pmatch); 191 free((void*)m->lastpos); 192 STATETEARDOWN(m); 193 return(REG_NOMATCH); 194 } 195 if (nmatch == 0 && !g->backrefs) 196 break; /* no further info needed */ 197 198 /* where? */ 199 assert(m->coldp != NULL); 200 for (;;) { 201 NOTE("finding start"); 202 endp = slow(m, m->coldp, stop, gf, gl); 203 if (endp != NULL) 204 break; 205 assert(m->coldp < m->endp); 206 m->coldp++; 207 } 208 if (nmatch == 1 && !g->backrefs) 209 break; /* no further info needed */ 210 211 /* oh my, they want the subexpressions... */ 212 if (m->pmatch == NULL) 213 m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) * 214 sizeof(llvm_regmatch_t)); 215 if (m->pmatch == NULL) { 216 STATETEARDOWN(m); 217 return(REG_ESPACE); 218 } 219 for (i = 1; i <= m->g->nsub; i++) 220 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 221 if (!g->backrefs && !(m->eflags®_BACKR)) { 222 NOTE("dissecting"); 223 dp = dissect(m, m->coldp, endp, gf, gl); 224 } else { 225 if (g->nplus > 0 && m->lastpos == NULL) 226 m->lastpos = (const char **)malloc((g->nplus+1) * 227 sizeof(char *)); 228 if (g->nplus > 0 && m->lastpos == NULL) { 229 free(m->pmatch); 230 STATETEARDOWN(m); 231 return(REG_ESPACE); 232 } 233 NOTE("backref dissect"); 234 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 235 } 236 if (dp != NULL) 237 break; 238 239 /* uh-oh... we couldn't find a subexpression-level match */ 240 assert(g->backrefs); /* must be back references doing it */ 241 assert(g->nplus == 0 || m->lastpos != NULL); 242 for (;;) { 243 if (dp != NULL || endp <= m->coldp) 244 break; /* defeat */ 245 NOTE("backoff"); 246 endp = slow(m, m->coldp, endp-1, gf, gl); 247 if (endp == NULL) 248 break; /* defeat */ 249 /* try it on a shorter possibility */ 250#ifndef NDEBUG 251 for (i = 1; i <= m->g->nsub; i++) { 252 assert(m->pmatch[i].rm_so == -1); 253 assert(m->pmatch[i].rm_eo == -1); 254 } 255#endif 256 NOTE("backoff dissect"); 257 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 258 } 259 assert(dp == NULL || dp == endp); 260 if (dp != NULL) /* found a shorter one */ 261 break; 262 263 /* despite initial appearances, there is no match here */ 264 NOTE("false alarm"); 265 if (m->coldp == stop) 266 break; 267 start = m->coldp + 1; /* recycle starting later */ 268 } 269 270 /* fill in the details if requested */ 271 if (nmatch > 0) { 272 pmatch[0].rm_so = m->coldp - m->offp; 273 pmatch[0].rm_eo = endp - m->offp; 274 } 275 if (nmatch > 1) { 276 assert(m->pmatch != NULL); 277 for (i = 1; i < nmatch; i++) 278 if (i <= m->g->nsub) 279 pmatch[i] = m->pmatch[i]; 280 else { 281 pmatch[i].rm_so = -1; 282 pmatch[i].rm_eo = -1; 283 } 284 } 285 286 if (m->pmatch != NULL) 287 free((char *)m->pmatch); 288 if (m->lastpos != NULL) 289 free((char *)m->lastpos); 290 STATETEARDOWN(m); 291 return(0); 292} 293 294/* Step back from "stop" to a position where the strip startst..stopst might 295 * match. This can always conservatively return "stop - 1", but may return an 296 * earlier position if matches at later positions are impossible. */ 297static const char * 298step_back(struct re_guts *g, const char *start, const char *stop, sopno startst, 299 sopno stopst) 300{ 301 /* Always step back at least one character. */ 302 assert(stop > start); 303 const char *res = stop - 1; 304 305 /* Check whether the strip startst..stropst starts with a fixed character, 306 * ignoring any closing parentheses. If not, return a conservative result. */ 307 for (;;) { 308 if (startst >= stopst) 309 return res; 310 if (OP(g->strip[startst]) != ORPAREN) 311 break; 312 startst++; 313 } 314 if (OP(g->strip[startst]) != OCHAR) 315 return res; 316 317 /* Find the character that starts the following match. */ 318 char ch = OPND(g->strip[startst]); 319 for (; res != start; --res) { 320 if (*res == ch) { 321 /* Try to check the next fixed character as well. */ 322 sopno nextst = startst + 1; 323 const char *next = res + 1; 324 if (nextst >= stopst || OP(g->strip[nextst]) != OCHAR || next >= stop || 325 *next == (char)OPND(g->strip[nextst])) 326 break; 327 } 328 } 329 return res; 330} 331 332/* 333 - dissect - figure out what matched what, no back references 334 */ 335static const char * /* == stop (success) always */ 336dissect(struct match *m, const char *start, const char *stop, sopno startst, 337 sopno stopst) 338{ 339 int i; 340 sopno ss; /* start sop of current subRE */ 341 sopno es; /* end sop of current subRE */ 342 const char *sp; /* start of string matched by it */ 343 const char *stp; /* string matched by it cannot pass here */ 344 const char *rest; /* start of rest of string */ 345 const char *tail; /* string unmatched by rest of RE */ 346 sopno ssub; /* start sop of subsubRE */ 347 sopno esub; /* end sop of subsubRE */ 348 const char *ssp; /* start of string matched by subsubRE */ 349 const char *sep; /* end of string matched by subsubRE */ 350 const char *oldssp; /* previous ssp */ 351 352 AT("diss", start, stop, startst, stopst); 353 sp = start; 354 for (ss = startst; ss < stopst; ss = es) { 355 /* identify end of subRE */ 356 es = ss; 357 switch (OP(m->g->strip[es])) { 358 case OPLUS_: 359 case OQUEST_: 360 es += OPND(m->g->strip[es]); 361 break; 362 case OCH_: 363 while (OP(m->g->strip[es]) != O_CH) 364 es += OPND(m->g->strip[es]); 365 break; 366 } 367 es++; 368 369 /* figure out what it matched */ 370 switch (OP(m->g->strip[ss])) { 371 case OEND: 372 assert(nope); 373 break; 374 case OCHAR: 375 sp++; 376 break; 377 case OBOL: 378 case OEOL: 379 case OBOW: 380 case OEOW: 381 break; 382 case OANY: 383 case OANYOF: 384 sp++; 385 break; 386 case OBACK_: 387 case O_BACK: 388 assert(nope); 389 break; 390 /* cases where length of match is hard to find */ 391 case OQUEST_: 392 stp = stop; 393 for (;;) { 394 /* how long could this one be? */ 395 rest = slow(m, sp, stp, ss, es); 396 assert(rest != NULL); /* it did match */ 397 /* could the rest match the rest? */ 398 tail = slow(m, rest, stop, es, stopst); 399 if (tail == stop) 400 break; /* yes! */ 401 /* no -- try a shorter match for this one */ 402 stp = step_back(m->g, sp, rest, es, stopst); 403 assert(stp >= sp); /* it did work */ 404 } 405 ssub = ss + 1; 406 esub = es - 1; 407 /* did innards match? */ 408 if (slow(m, sp, rest, ssub, esub) != NULL) { 409 const char *dp = dissect(m, sp, rest, ssub, esub); 410 (void)dp; /* avoid warning if assertions off */ 411 assert(dp == rest); 412 } else /* no */ 413 assert(sp == rest); 414 sp = rest; 415 break; 416 case OPLUS_: 417 stp = stop; 418 for (;;) { 419 /* how long could this one be? */ 420 rest = slow(m, sp, stp, ss, es); 421 assert(rest != NULL); /* it did match */ 422 /* could the rest match the rest? */ 423 tail = slow(m, rest, stop, es, stopst); 424 if (tail == stop) 425 break; /* yes! */ 426 /* no -- try a shorter match for this one */ 427 stp = step_back(m->g, sp, rest, es, stopst); 428 assert(stp >= sp); /* it did work */ 429 } 430 ssub = ss + 1; 431 esub = es - 1; 432 ssp = sp; 433 oldssp = ssp; 434 for (;;) { /* find last match of innards */ 435 sep = slow(m, ssp, rest, ssub, esub); 436 if (sep == NULL || sep == ssp) 437 break; /* failed or matched null */ 438 oldssp = ssp; /* on to next try */ 439 ssp = sep; 440 } 441 if (sep == NULL) { 442 /* last successful match */ 443 sep = ssp; 444 ssp = oldssp; 445 } 446 assert(sep == rest); /* must exhaust substring */ 447 assert(slow(m, ssp, sep, ssub, esub) == rest); 448 { 449 const char *dp = dissect(m, ssp, sep, ssub, esub); 450 (void)dp; /* avoid warning if assertions off */ 451 assert(dp == sep); 452 } 453 sp = rest; 454 break; 455 case OCH_: 456 stp = stop; 457 for (;;) { 458 /* how long could this one be? */ 459 rest = slow(m, sp, stp, ss, es); 460 assert(rest != NULL); /* it did match */ 461 /* could the rest match the rest? */ 462 tail = slow(m, rest, stop, es, stopst); 463 if (tail == stop) 464 break; /* yes! */ 465 /* no -- try a shorter match for this one */ 466 stp = rest - 1; 467 assert(stp >= sp); /* it did work */ 468 } 469 ssub = ss + 1; 470 esub = ss + OPND(m->g->strip[ss]) - 1; 471 assert(OP(m->g->strip[esub]) == OOR1); 472 for (;;) { /* find first matching branch */ 473 if (slow(m, sp, rest, ssub, esub) == rest) 474 break; /* it matched all of it */ 475 /* that one missed, try next one */ 476 assert(OP(m->g->strip[esub]) == OOR1); 477 esub++; 478 assert(OP(m->g->strip[esub]) == OOR2); 479 ssub = esub + 1; 480 esub += OPND(m->g->strip[esub]); 481 if (OP(m->g->strip[esub]) == OOR2) 482 esub--; 483 else 484 assert(OP(m->g->strip[esub]) == O_CH); 485 } 486 { 487 const char *dp = dissect(m, sp, rest, ssub, esub); 488 (void)dp; /* avoid warning if assertions off */ 489 assert(dp == rest); 490 } 491 sp = rest; 492 break; 493 case O_PLUS: 494 case O_QUEST: 495 case OOR1: 496 case OOR2: 497 case O_CH: 498 assert(nope); 499 break; 500 case OLPAREN: 501 i = OPND(m->g->strip[ss]); 502 assert(0 < i && i <= m->g->nsub); 503 m->pmatch[i].rm_so = sp - m->offp; 504 break; 505 case ORPAREN: 506 i = OPND(m->g->strip[ss]); 507 assert(0 < i && i <= m->g->nsub); 508 m->pmatch[i].rm_eo = sp - m->offp; 509 break; 510 default: /* uh oh */ 511 assert(nope); 512 break; 513 } 514 } 515 516 assert(sp == stop); 517 return(sp); 518} 519 520/* 521 - backref - figure out what matched what, figuring in back references 522 */ 523static const char * /* == stop (success) or NULL (failure) */ 524backref(struct match *m, const char *start, const char *stop, sopno startst, 525 sopno stopst, sopno lev, int rec) /* PLUS nesting level */ 526{ 527 int i; 528 sopno ss; /* start sop of current subRE */ 529 const char *sp; /* start of string matched by it */ 530 sopno ssub; /* start sop of subsubRE */ 531 sopno esub; /* end sop of subsubRE */ 532 const char *ssp; /* start of string matched by subsubRE */ 533 const char *dp; 534 size_t len; 535 int hard; 536 sop s; 537 llvm_regoff_t offsave; 538 cset *cs; 539 540 AT("back", start, stop, startst, stopst); 541 sp = start; 542 543 /* get as far as we can with easy stuff */ 544 hard = 0; 545 for (ss = startst; !hard && ss < stopst; ss++) 546 switch (OP(s = m->g->strip[ss])) { 547 case OCHAR: 548 if (sp == stop || *sp++ != (char)OPND(s)) 549 return(NULL); 550 break; 551 case OANY: 552 if (sp == stop) 553 return(NULL); 554 sp++; 555 break; 556 case OANYOF: 557 cs = &m->g->sets[OPND(s)]; 558 if (sp == stop || !CHIN(cs, *sp++)) 559 return(NULL); 560 break; 561 case OBOL: 562 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 563 (sp < m->endp && *(sp-1) == '\n' && 564 (m->g->cflags®_NEWLINE)) ) 565 { /* yes */ } 566 else 567 return(NULL); 568 break; 569 case OEOL: 570 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 571 (sp < m->endp && *sp == '\n' && 572 (m->g->cflags®_NEWLINE)) ) 573 { /* yes */ } 574 else 575 return(NULL); 576 break; 577 case OBOW: 578 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 579 (sp < m->endp && *(sp-1) == '\n' && 580 (m->g->cflags®_NEWLINE)) || 581 (sp > m->beginp && 582 !ISWORD(*(sp-1))) ) && 583 (sp < m->endp && ISWORD(*sp)) ) 584 { /* yes */ } 585 else 586 return(NULL); 587 break; 588 case OEOW: 589 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 590 (sp < m->endp && *sp == '\n' && 591 (m->g->cflags®_NEWLINE)) || 592 (sp < m->endp && !ISWORD(*sp)) ) && 593 (sp > m->beginp && ISWORD(*(sp-1))) ) 594 { /* yes */ } 595 else 596 return(NULL); 597 break; 598 case O_QUEST: 599 case O_CH: 600 break; 601 case OOR1: /* matches null but needs to skip */ 602 ss++; 603 s = m->g->strip[ss]; 604 do { 605 assert(OP(s) == OOR2); 606 ss += OPND(s); 607 } while (OP(s = m->g->strip[ss]) != O_CH); 608 /* note that the ss++ gets us past the O_CH */ 609 break; 610 default: /* have to make a choice */ 611 hard = 1; 612 break; 613 } 614 if (!hard) { /* that was it! */ 615 if (sp != stop) 616 return(NULL); 617 return(sp); 618 } 619 ss--; /* adjust for the for's final increment */ 620 621 /* the hard stuff */ 622 AT("hard", sp, stop, ss, stopst); 623 s = m->g->strip[ss]; 624 switch (OP(s)) { 625 case OBACK_: /* the vilest depths */ 626 i = OPND(s); 627 assert(0 < i && i <= m->g->nsub); 628 if (m->pmatch[i].rm_eo == -1) 629 return(NULL); 630 assert(m->pmatch[i].rm_so != -1); 631 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 632 if (len == 0 && rec++ > MAX_RECURSION) 633 return(NULL); 634 assert(stop - m->beginp >= len); 635 if (sp > stop - len) 636 return(NULL); /* not enough left to match */ 637 ssp = m->offp + m->pmatch[i].rm_so; 638 if (memcmp(sp, ssp, len) != 0) 639 return(NULL); 640 while (m->g->strip[ss] != SOP(O_BACK, i)) 641 ss++; 642 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 643 break; 644 case OQUEST_: /* to null or not */ 645 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 646 if (dp != NULL) 647 return(dp); /* not */ 648 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 649 break; 650 case OPLUS_: 651 assert(m->lastpos != NULL); 652 assert(lev+1 <= m->g->nplus); 653 m->lastpos[lev+1] = sp; 654 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 655 break; 656 case O_PLUS: 657 if (sp == m->lastpos[lev]) /* last pass matched null */ 658 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 659 /* try another pass */ 660 m->lastpos[lev] = sp; 661 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 662 if (dp == NULL) 663 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 664 else 665 return(dp); 666 break; 667 case OCH_: /* find the right one, if any */ 668 ssub = ss + 1; 669 esub = ss + OPND(s) - 1; 670 assert(OP(m->g->strip[esub]) == OOR1); 671 for (;;) { /* find first matching branch */ 672 dp = backref(m, sp, stop, ssub, stopst, lev, rec); 673 if (dp != NULL) 674 return(dp); 675 /* that one missed, try next one */ 676 if (OP(m->g->strip[esub]) == O_CH) 677 return(NULL); /* there is none */ 678 esub++; 679 assert(OP(m->g->strip[esub]) == OOR2); 680 ssub = esub + 1; 681 esub += OPND(m->g->strip[esub]); 682 if (OP(m->g->strip[esub]) == OOR2) 683 esub--; 684 else 685 assert(OP(m->g->strip[esub]) == O_CH); 686 } 687 break; 688 case OLPAREN: /* must undo assignment if rest fails */ 689 i = OPND(s); 690 assert(0 < i && i <= m->g->nsub); 691 offsave = m->pmatch[i].rm_so; 692 m->pmatch[i].rm_so = sp - m->offp; 693 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 694 if (dp != NULL) 695 return(dp); 696 m->pmatch[i].rm_so = offsave; 697 return(NULL); 698 break; 699 case ORPAREN: /* must undo assignment if rest fails */ 700 i = OPND(s); 701 assert(0 < i && i <= m->g->nsub); 702 offsave = m->pmatch[i].rm_eo; 703 m->pmatch[i].rm_eo = sp - m->offp; 704 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 705 if (dp != NULL) 706 return(dp); 707 m->pmatch[i].rm_eo = offsave; 708 return(NULL); 709 break; 710 default: /* uh oh */ 711 assert(nope); 712 break; 713 } 714 715 /* "can't happen" */ 716 assert(nope); 717 /* NOTREACHED */ 718 return NULL; 719} 720 721/* 722 - fast - step through the string at top speed 723 */ 724static const char * /* where tentative match ended, or NULL */ 725fast(struct match *m, const char *start, const char *stop, sopno startst, 726 sopno stopst) 727{ 728 states st = m->st; 729 states fresh = m->fresh; 730 states tmp = m->tmp; 731 const char *p = start; 732 int c = (start == m->beginp) ? OUT : *(start-1); 733 int lastc; /* previous c */ 734 int flagch; 735 int i; 736 const char *coldp; /* last p after which no match was underway */ 737 738 CLEAR(st); 739 SET1(st, startst); 740 st = step(m->g, startst, stopst, st, NOTHING, st); 741 ASSIGN(fresh, st); 742 SP("start", st, *p); 743 coldp = NULL; 744 for (;;) { 745 /* next character */ 746 lastc = c; 747 c = (p == m->endp) ? OUT : *p; 748 if (EQ(st, fresh)) 749 coldp = p; 750 751 /* is there an EOL and/or BOL between lastc and c? */ 752 flagch = '\0'; 753 i = 0; 754 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 755 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 756 flagch = BOL; 757 i = m->g->nbol; 758 } 759 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 760 (c == OUT && !(m->eflags®_NOTEOL)) ) { 761 flagch = (flagch == BOL) ? BOLEOL : EOL; 762 i += m->g->neol; 763 } 764 if (i != 0) { 765 for (; i > 0; i--) 766 st = step(m->g, startst, stopst, st, flagch, st); 767 SP("boleol", st, c); 768 } 769 770 /* how about a word boundary? */ 771 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 772 (c != OUT && ISWORD(c)) ) { 773 flagch = BOW; 774 } 775 if ( (lastc != OUT && ISWORD(lastc)) && 776 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 777 flagch = EOW; 778 } 779 if (flagch == BOW || flagch == EOW) { 780 st = step(m->g, startst, stopst, st, flagch, st); 781 SP("boweow", st, c); 782 } 783 784 /* are we done? */ 785 if (ISSET(st, stopst) || p == stop) 786 break; /* NOTE BREAK OUT */ 787 788 /* no, we must deal with this character */ 789 ASSIGN(tmp, st); 790 ASSIGN(st, fresh); 791 assert(c != OUT); 792 st = step(m->g, startst, stopst, tmp, c, st); 793 SP("aft", st, c); 794 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 795 p++; 796 } 797 798 assert(coldp != NULL); 799 m->coldp = coldp; 800 if (ISSET(st, stopst)) 801 return(p+1); 802 else 803 return(NULL); 804} 805 806/* 807 - slow - step through the string more deliberately 808 */ 809static const char * /* where it ended */ 810slow(struct match *m, const char *start, const char *stop, sopno startst, 811 sopno stopst) 812{ 813 /* Quickly skip over fixed character matches at the start. */ 814 const char *p = start; 815 for (; startst < stopst; ++startst) { 816 int hard = 0; 817 sop s = m->g->strip[startst]; 818 switch (OP(s)) { 819 case OLPAREN: 820 case ORPAREN: 821 /* Not relevant here. */ 822 break; 823 case OCHAR: 824 if (p == stop || *p != (char)OPND(s)) 825 return NULL; 826 ++p; 827 break; 828 default: 829 hard = 1; 830 break; 831 } 832 if (hard) 833 break; 834 } 835 836 states st = m->st; 837 states empty = m->empty; 838 states tmp = m->tmp; 839 int c = (p == m->beginp) ? OUT : *(p-1); 840 int lastc; /* previous c */ 841 int flagch; 842 int i; 843 const char *matchp; /* last p at which a match ended */ 844 845 AT("slow", start, stop, startst, stopst); 846 CLEAR(st); 847 SET1(st, startst); 848 SP("sstart", st, *p); 849 st = step(m->g, startst, stopst, st, NOTHING, st); 850 matchp = NULL; 851 for (;;) { 852 /* next character */ 853 lastc = c; 854 c = (p == m->endp) ? OUT : *p; 855 856 /* is there an EOL and/or BOL between lastc and c? */ 857 flagch = '\0'; 858 i = 0; 859 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 860 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 861 flagch = BOL; 862 i = m->g->nbol; 863 } 864 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 865 (c == OUT && !(m->eflags®_NOTEOL)) ) { 866 flagch = (flagch == BOL) ? BOLEOL : EOL; 867 i += m->g->neol; 868 } 869 if (i != 0) { 870 for (; i > 0; i--) 871 st = step(m->g, startst, stopst, st, flagch, st); 872 SP("sboleol", st, c); 873 } 874 875 /* how about a word boundary? */ 876 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 877 (c != OUT && ISWORD(c)) ) { 878 flagch = BOW; 879 } 880 if ( (lastc != OUT && ISWORD(lastc)) && 881 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 882 flagch = EOW; 883 } 884 if (flagch == BOW || flagch == EOW) { 885 st = step(m->g, startst, stopst, st, flagch, st); 886 SP("sboweow", st, c); 887 } 888 889 /* are we done? */ 890 if (ISSET(st, stopst)) 891 matchp = p; 892 if (EQ(st, empty) || p == stop) 893 break; /* NOTE BREAK OUT */ 894 895 /* no, we must deal with this character */ 896 ASSIGN(tmp, st); 897 ASSIGN(st, empty); 898 assert(c != OUT); 899 st = step(m->g, startst, stopst, tmp, c, st); 900 SP("saft", st, c); 901 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 902 p++; 903 } 904 905 return(matchp); 906} 907 908 909/* 910 - step - map set of states reachable before char to set reachable after 911 */ 912static states 913step(struct re_guts *g, 914 sopno start, /* start state within strip */ 915 sopno stop, /* state after stop state within strip */ 916 states bef, /* states reachable before */ 917 int ch, /* character or NONCHAR code */ 918 states aft) /* states already known reachable after */ 919{ 920 cset *cs; 921 sop s; 922 sopno pc; 923 onestate here; /* note, macros know this name */ 924 sopno look; 925 int i; 926 927 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 928 s = g->strip[pc]; 929 switch (OP(s)) { 930 case OEND: 931 assert(pc == stop-1); 932 break; 933 case OCHAR: 934 /* only characters can match */ 935 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 936 if (ch == (char)OPND(s)) 937 FWD(aft, bef, 1); 938 break; 939 case OBOL: 940 if (ch == BOL || ch == BOLEOL) 941 FWD(aft, bef, 1); 942 break; 943 case OEOL: 944 if (ch == EOL || ch == BOLEOL) 945 FWD(aft, bef, 1); 946 break; 947 case OBOW: 948 if (ch == BOW) 949 FWD(aft, bef, 1); 950 break; 951 case OEOW: 952 if (ch == EOW) 953 FWD(aft, bef, 1); 954 break; 955 case OANY: 956 if (!NONCHAR(ch)) 957 FWD(aft, bef, 1); 958 break; 959 case OANYOF: 960 cs = &g->sets[OPND(s)]; 961 if (!NONCHAR(ch) && CHIN(cs, ch)) 962 FWD(aft, bef, 1); 963 break; 964 case OBACK_: /* ignored here */ 965 case O_BACK: 966 FWD(aft, aft, 1); 967 break; 968 case OPLUS_: /* forward, this is just an empty */ 969 FWD(aft, aft, 1); 970 break; 971 case O_PLUS: /* both forward and back */ 972 FWD(aft, aft, 1); 973 i = ISSETBACK(aft, OPND(s)); 974 BACK(aft, aft, OPND(s)); 975 if (!i && ISSETBACK(aft, OPND(s))) { 976 /* oho, must reconsider loop body */ 977 pc -= OPND(s) + 1; 978 INIT(here, pc); 979 } 980 break; 981 case OQUEST_: /* two branches, both forward */ 982 FWD(aft, aft, 1); 983 FWD(aft, aft, OPND(s)); 984 break; 985 case O_QUEST: /* just an empty */ 986 FWD(aft, aft, 1); 987 break; 988 case OLPAREN: /* not significant here */ 989 case ORPAREN: 990 FWD(aft, aft, 1); 991 break; 992 case OCH_: /* mark the first two branches */ 993 FWD(aft, aft, 1); 994 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 995 FWD(aft, aft, OPND(s)); 996 break; 997 case OOR1: /* done a branch, find the O_CH */ 998 if (ISSTATEIN(aft, here)) { 999 for (look = 1; 1000 OP(s = g->strip[pc+look]) != O_CH; 1001 look += OPND(s)) 1002 assert(OP(s) == OOR2); 1003 FWD(aft, aft, look); 1004 } 1005 break; 1006 case OOR2: /* propagate OCH_'s marking */ 1007 FWD(aft, aft, 1); 1008 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1009 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1010 FWD(aft, aft, OPND(s)); 1011 } 1012 break; 1013 case O_CH: /* just empty */ 1014 FWD(aft, aft, 1); 1015 break; 1016 default: /* ooooops... */ 1017 assert(nope); 1018 break; 1019 } 1020 } 1021 1022 return(aft); 1023} 1024 1025#ifdef REDEBUG 1026/* 1027 - print - print a set of states 1028 */ 1029static void 1030print(struct match *m, const char *caption, states st, int ch, FILE *d) 1031{ 1032 struct re_guts *g = m->g; 1033 int i; 1034 int first = 1; 1035 1036 if (!(m->eflags®_TRACE)) 1037 return; 1038 1039 (void)fprintf(d, "%s", caption); 1040 if (ch != '\0') 1041 (void)fprintf(d, " %s", pchar(ch)); 1042 for (i = 0; i < g->nstates; i++) 1043 if (ISSET(st, i)) { 1044 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1045 first = 0; 1046 } 1047 (void)fprintf(d, "\n"); 1048} 1049 1050/* 1051 - at - print current situation 1052 */ 1053static void 1054at(struct match *m, const char *title, const char *start, const char *stop, 1055 sopno startst, sopno stopst) 1056{ 1057 if (!(m->eflags®_TRACE)) 1058 return; 1059 1060 (void)printf("%s %s-", title, pchar(*start)); 1061 (void)printf("%s ", pchar(*stop)); 1062 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 1063} 1064 1065#ifndef PCHARDONE 1066#define PCHARDONE /* never again */ 1067/* 1068 - pchar - make a character printable 1069 * 1070 * Is this identical to regchar() over in debug.c? Well, yes. But a 1071 * duplicate here avoids having a debugging-capable regexec.o tied to 1072 * a matching debug.o, and this is convenient. It all disappears in 1073 * the non-debug compilation anyway, so it doesn't matter much. 1074 */ 1075static char * /* -> representation */ 1076pchar(int ch) 1077{ 1078 static char pbuf[10]; 1079 1080 if (isprint(ch) || ch == ' ') 1081 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1082 else 1083 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1084 return(pbuf); 1085} 1086#endif 1087#endif 1088 1089#undef matcher 1090#undef fast 1091#undef slow 1092#undef dissect 1093#undef backref 1094#undef step 1095#undef print 1096#undef at 1097#undef match 1098#undef nope 1099#undef step_back 1100