1/*- 2 * This code is derived from OpenBSD's libc/regex, original license follows: 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38/* 39 * The matching engine and friends. This file is #included by regexec.c 40 * after suitable #defines of a variety of macros used herein, so that 41 * different state representations can be used without duplicating masses 42 * of code. 43 */ 44 45#ifdef SNAMES 46#define matcher smatcher 47#define fast sfast 48#define slow sslow 49#define dissect sdissect 50#define backref sbackref 51#define step sstep 52#define print sprint 53#define at sat 54#define match smat 55#define nope snope 56#define step_back sstep_back 57#endif 58#ifdef LNAMES 59#define matcher lmatcher 60#define fast lfast 61#define slow lslow 62#define dissect ldissect 63#define backref lbackref 64#define step lstep 65#define print lprint 66#define at lat 67#define match lmat 68#define nope lnope 69#define step_back lstep_back 70#endif 71 72/* another structure passed up and down to avoid zillions of parameters */ 73struct match { 74 struct re_guts *g; 75 int eflags; 76 llvm_regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 77 const char *offp; /* offsets work from here */ 78 const char *beginp; /* start of string -- virtual NUL precedes */ 79 const char *endp; /* end of string -- virtual NUL here */ 80 const char *coldp; /* can be no match starting before here */ 81 const char **lastpos; /* [nplus+1] */ 82 STATEVARS; 83 states st; /* current states */ 84 states fresh; /* states for a fresh start */ 85 states tmp; /* temporary */ 86 states empty; /* empty set of states */ 87}; 88 89static int matcher(struct re_guts *, const char *, size_t, 90 llvm_regmatch_t[], int); 91static const char *dissect(struct match *, const char *, const char *, sopno, 92 sopno); 93static const char *backref(struct match *, const char *, const char *, sopno, 94 sopno, sopno, int); 95static const char *fast(struct match *, const char *, const char *, sopno, sopno); 96static const char *slow(struct match *, const char *, const char *, sopno, sopno); 97static states step(struct re_guts *, sopno, sopno, states, int, states); 98#define MAX_RECURSION 100 99#define BOL (OUT+1) 100#define EOL (BOL+1) 101#define BOLEOL (BOL+2) 102#define NOTHING (BOL+3) 103#define BOW (BOL+4) 104#define EOW (BOL+5) 105#define CODEMAX (BOL+5) /* highest code used */ 106#define NONCHAR(c) ((c) > CHAR_MAX) 107#define NNONCHAR (CODEMAX-CHAR_MAX) 108#ifdef REDEBUG 109static void print(struct match *, char *, states, int, FILE *); 110#endif 111#ifdef REDEBUG 112static void at(struct match *, char *, char *, char *, sopno, sopno); 113#endif 114#ifdef REDEBUG 115static char *pchar(int); 116#endif 117 118#ifdef REDEBUG 119#define SP(t, s, c) print(m, t, s, c, stdout) 120#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 121#define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 122static int nope = 0; 123#else 124#define SP(t, s, c) /* nothing */ 125#define AT(t, p1, p2, s1, s2) /* nothing */ 126#define NOTE(s) /* nothing */ 127#endif 128 129/* 130 - matcher - the actual matching engine 131 */ 132static int /* 0 success, REG_NOMATCH failure */ 133matcher(struct re_guts *g, const char *string, size_t nmatch, 134 llvm_regmatch_t pmatch[], 135 int eflags) 136{ 137 const char *endp; 138 size_t i; 139 struct match mv; 140 struct match *m = &mv; 141 const char *dp; 142 const sopno gf = g->firststate+1; /* +1 for OEND */ 143 const sopno gl = g->laststate; 144 const char *start; 145 const char *stop; 146 147 /* simplify the situation where possible */ 148 if (g->cflags®_NOSUB) 149 nmatch = 0; 150 if (eflags®_STARTEND) { 151 start = string + pmatch[0].rm_so; 152 stop = string + pmatch[0].rm_eo; 153 } else { 154 start = string; 155 stop = start + strlen(start); 156 } 157 if (stop < start) 158 return(REG_INVARG); 159 160 /* prescreening; this does wonders for this rather slow code */ 161 if (g->must != NULL) { 162 for (dp = start; dp < stop; dp++) 163 if (*dp == g->must[0] && stop - dp >= g->mlen && 164 memcmp(dp, g->must, (size_t)g->mlen) == 0) 165 break; 166 if (dp == stop) /* we didn't find g->must */ 167 return(REG_NOMATCH); 168 } 169 170 /* match struct setup */ 171 m->g = g; 172 m->eflags = eflags; 173 m->pmatch = NULL; 174 m->lastpos = NULL; 175 m->offp = string; 176 m->beginp = start; 177 m->endp = stop; 178 STATESETUP(m, 4); 179 SETUP(m->st); 180 SETUP(m->fresh); 181 SETUP(m->tmp); 182 SETUP(m->empty); 183 CLEAR(m->empty); 184 185 /* this loop does only one repetition except for backrefs */ 186 for (;;) { 187 endp = fast(m, start, stop, gf, gl); 188 if (endp == NULL) { /* a miss */ 189 free(m->pmatch); 190 free((void*)m->lastpos); 191 STATETEARDOWN(m); 192 return(REG_NOMATCH); 193 } 194 if (nmatch == 0 && !g->backrefs) 195 break; /* no further info needed */ 196 197 /* where? */ 198 assert(m->coldp != NULL); 199 for (;;) { 200 NOTE("finding start"); 201 endp = slow(m, m->coldp, stop, gf, gl); 202 if (endp != NULL) 203 break; 204 assert(m->coldp < m->endp); 205 m->coldp++; 206 } 207 if (nmatch == 1 && !g->backrefs) 208 break; /* no further info needed */ 209 210 /* oh my, they want the subexpressions... */ 211 if (m->pmatch == NULL) 212 m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) * 213 sizeof(llvm_regmatch_t)); 214 if (m->pmatch == NULL) { 215 STATETEARDOWN(m); 216 return(REG_ESPACE); 217 } 218 for (i = 1; i <= m->g->nsub; i++) 219 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 220 if (!g->backrefs && !(m->eflags®_BACKR)) { 221 NOTE("dissecting"); 222 dp = dissect(m, m->coldp, endp, gf, gl); 223 } else { 224 if (g->nplus > 0 && m->lastpos == NULL) 225 m->lastpos = (const char **)malloc((g->nplus+1) * 226 sizeof(char *)); 227 if (g->nplus > 0 && m->lastpos == NULL) { 228 free(m->pmatch); 229 STATETEARDOWN(m); 230 return(REG_ESPACE); 231 } 232 NOTE("backref dissect"); 233 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 234 } 235 if (dp != NULL) 236 break; 237 238 /* uh-oh... we couldn't find a subexpression-level match */ 239 assert(g->backrefs); /* must be back references doing it */ 240 assert(g->nplus == 0 || m->lastpos != NULL); 241 for (;;) { 242 if (dp != NULL || endp <= m->coldp) 243 break; /* defeat */ 244 NOTE("backoff"); 245 endp = slow(m, m->coldp, endp-1, gf, gl); 246 if (endp == NULL) 247 break; /* defeat */ 248 /* try it on a shorter possibility */ 249#ifndef NDEBUG 250 for (i = 1; i <= m->g->nsub; i++) { 251 assert(m->pmatch[i].rm_so == -1); 252 assert(m->pmatch[i].rm_eo == -1); 253 } 254#endif 255 NOTE("backoff dissect"); 256 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 257 } 258 assert(dp == NULL || dp == endp); 259 if (dp != NULL) /* found a shorter one */ 260 break; 261 262 /* despite initial appearances, there is no match here */ 263 NOTE("false alarm"); 264 if (m->coldp == stop) 265 break; 266 start = m->coldp + 1; /* recycle starting later */ 267 } 268 269 /* fill in the details if requested */ 270 if (nmatch > 0) { 271 pmatch[0].rm_so = m->coldp - m->offp; 272 pmatch[0].rm_eo = endp - m->offp; 273 } 274 if (nmatch > 1) { 275 assert(m->pmatch != NULL); 276 for (i = 1; i < nmatch; i++) 277 if (i <= m->g->nsub) 278 pmatch[i] = m->pmatch[i]; 279 else { 280 pmatch[i].rm_so = -1; 281 pmatch[i].rm_eo = -1; 282 } 283 } 284 285 if (m->pmatch != NULL) 286 free((char *)m->pmatch); 287 if (m->lastpos != NULL) 288 free((char *)m->lastpos); 289 STATETEARDOWN(m); 290 return(0); 291} 292 293/* Step back from "stop" to a position where the strip startst..stopst might 294 * match. This can always conservatively return "stop - 1", but may return an 295 * earlier position if matches at later positions are impossible. */ 296static const char * 297step_back(struct re_guts *g, const char *start, const char *stop, sopno startst, 298 sopno stopst) 299{ 300 /* Always step back at least one character. */ 301 assert(stop > start); 302 const char *res = stop - 1; 303 304 /* Check whether the strip startst..stropst starts with a fixed character, 305 * ignoring any closing parentheses. If not, return a conservative result. */ 306 for (;;) { 307 if (startst >= stopst) 308 return res; 309 if (OP(g->strip[startst]) != ORPAREN) 310 break; 311 startst++; 312 } 313 if (OP(g->strip[startst]) != OCHAR) 314 return res; 315 316 /* Find the character that starts the following match. */ 317 char ch = OPND(g->strip[startst]); 318 for (; res != start; --res) { 319 if (*res == ch) 320 break; 321 } 322 return res; 323} 324 325/* 326 - dissect - figure out what matched what, no back references 327 */ 328static const char * /* == stop (success) always */ 329dissect(struct match *m, const char *start, const char *stop, sopno startst, 330 sopno stopst) 331{ 332 int i; 333 sopno ss; /* start sop of current subRE */ 334 sopno es; /* end sop of current subRE */ 335 const char *sp; /* start of string matched by it */ 336 const char *stp; /* string matched by it cannot pass here */ 337 const char *rest; /* start of rest of string */ 338 const char *tail; /* string unmatched by rest of RE */ 339 sopno ssub; /* start sop of subsubRE */ 340 sopno esub; /* end sop of subsubRE */ 341 const char *ssp; /* start of string matched by subsubRE */ 342 const char *sep; /* end of string matched by subsubRE */ 343 const char *oldssp; /* previous ssp */ 344 345 AT("diss", start, stop, startst, stopst); 346 sp = start; 347 for (ss = startst; ss < stopst; ss = es) { 348 /* identify end of subRE */ 349 es = ss; 350 switch (OP(m->g->strip[es])) { 351 case OPLUS_: 352 case OQUEST_: 353 es += OPND(m->g->strip[es]); 354 break; 355 case OCH_: 356 while (OP(m->g->strip[es]) != O_CH) 357 es += OPND(m->g->strip[es]); 358 break; 359 } 360 es++; 361 362 /* figure out what it matched */ 363 switch (OP(m->g->strip[ss])) { 364 case OEND: 365 assert(nope); 366 break; 367 case OCHAR: 368 sp++; 369 break; 370 case OBOL: 371 case OEOL: 372 case OBOW: 373 case OEOW: 374 break; 375 case OANY: 376 case OANYOF: 377 sp++; 378 break; 379 case OBACK_: 380 case O_BACK: 381 assert(nope); 382 break; 383 /* cases where length of match is hard to find */ 384 case OQUEST_: 385 stp = stop; 386 for (;;) { 387 /* how long could this one be? */ 388 rest = slow(m, sp, stp, ss, es); 389 assert(rest != NULL); /* it did match */ 390 /* could the rest match the rest? */ 391 tail = slow(m, rest, stop, es, stopst); 392 if (tail == stop) 393 break; /* yes! */ 394 /* no -- try a shorter match for this one */ 395 stp = step_back(m->g, sp, rest, es, stopst); 396 assert(stp >= sp); /* it did work */ 397 } 398 ssub = ss + 1; 399 esub = es - 1; 400 /* did innards match? */ 401 if (slow(m, sp, rest, ssub, esub) != NULL) { 402 const char *dp = dissect(m, sp, rest, ssub, esub); 403 (void)dp; /* avoid warning if assertions off */ 404 assert(dp == rest); 405 } else /* no */ 406 assert(sp == rest); 407 sp = rest; 408 break; 409 case OPLUS_: 410 stp = stop; 411 for (;;) { 412 /* how long could this one be? */ 413 rest = slow(m, sp, stp, ss, es); 414 assert(rest != NULL); /* it did match */ 415 /* could the rest match the rest? */ 416 tail = slow(m, rest, stop, es, stopst); 417 if (tail == stop) 418 break; /* yes! */ 419 /* no -- try a shorter match for this one */ 420 stp = step_back(m->g, sp, rest, es, stopst); 421 assert(stp >= sp); /* it did work */ 422 } 423 ssub = ss + 1; 424 esub = es - 1; 425 ssp = sp; 426 oldssp = ssp; 427 for (;;) { /* find last match of innards */ 428 sep = slow(m, ssp, rest, ssub, esub); 429 if (sep == NULL || sep == ssp) 430 break; /* failed or matched null */ 431 oldssp = ssp; /* on to next try */ 432 ssp = sep; 433 } 434 if (sep == NULL) { 435 /* last successful match */ 436 sep = ssp; 437 ssp = oldssp; 438 } 439 assert(sep == rest); /* must exhaust substring */ 440 assert(slow(m, ssp, sep, ssub, esub) == rest); 441 { 442 const char *dp = dissect(m, ssp, sep, ssub, esub); 443 (void)dp; /* avoid warning if assertions off */ 444 assert(dp == sep); 445 } 446 sp = rest; 447 break; 448 case OCH_: 449 stp = stop; 450 for (;;) { 451 /* how long could this one be? */ 452 rest = slow(m, sp, stp, ss, es); 453 assert(rest != NULL); /* it did match */ 454 /* could the rest match the rest? */ 455 tail = slow(m, rest, stop, es, stopst); 456 if (tail == stop) 457 break; /* yes! */ 458 /* no -- try a shorter match for this one */ 459 stp = rest - 1; 460 assert(stp >= sp); /* it did work */ 461 } 462 ssub = ss + 1; 463 esub = ss + OPND(m->g->strip[ss]) - 1; 464 assert(OP(m->g->strip[esub]) == OOR1); 465 for (;;) { /* find first matching branch */ 466 if (slow(m, sp, rest, ssub, esub) == rest) 467 break; /* it matched all of it */ 468 /* that one missed, try next one */ 469 assert(OP(m->g->strip[esub]) == OOR1); 470 esub++; 471 assert(OP(m->g->strip[esub]) == OOR2); 472 ssub = esub + 1; 473 esub += OPND(m->g->strip[esub]); 474 if (OP(m->g->strip[esub]) == OOR2) 475 esub--; 476 else 477 assert(OP(m->g->strip[esub]) == O_CH); 478 } 479 { 480 const char *dp = dissect(m, sp, rest, ssub, esub); 481 (void)dp; /* avoid warning if assertions off */ 482 assert(dp == rest); 483 } 484 sp = rest; 485 break; 486 case O_PLUS: 487 case O_QUEST: 488 case OOR1: 489 case OOR2: 490 case O_CH: 491 assert(nope); 492 break; 493 case OLPAREN: 494 i = OPND(m->g->strip[ss]); 495 assert(0 < i && i <= m->g->nsub); 496 m->pmatch[i].rm_so = sp - m->offp; 497 break; 498 case ORPAREN: 499 i = OPND(m->g->strip[ss]); 500 assert(0 < i && i <= m->g->nsub); 501 m->pmatch[i].rm_eo = sp - m->offp; 502 break; 503 default: /* uh oh */ 504 assert(nope); 505 break; 506 } 507 } 508 509 assert(sp == stop); 510 return(sp); 511} 512 513/* 514 - backref - figure out what matched what, figuring in back references 515 */ 516static const char * /* == stop (success) or NULL (failure) */ 517backref(struct match *m, const char *start, const char *stop, sopno startst, 518 sopno stopst, sopno lev, int rec) /* PLUS nesting level */ 519{ 520 int i; 521 sopno ss; /* start sop of current subRE */ 522 const char *sp; /* start of string matched by it */ 523 sopno ssub; /* start sop of subsubRE */ 524 sopno esub; /* end sop of subsubRE */ 525 const char *ssp; /* start of string matched by subsubRE */ 526 const char *dp; 527 size_t len; 528 int hard; 529 sop s; 530 llvm_regoff_t offsave; 531 cset *cs; 532 533 AT("back", start, stop, startst, stopst); 534 sp = start; 535 536 /* get as far as we can with easy stuff */ 537 hard = 0; 538 for (ss = startst; !hard && ss < stopst; ss++) 539 switch (OP(s = m->g->strip[ss])) { 540 case OCHAR: 541 if (sp == stop || *sp++ != (char)OPND(s)) 542 return(NULL); 543 break; 544 case OANY: 545 if (sp == stop) 546 return(NULL); 547 sp++; 548 break; 549 case OANYOF: 550 cs = &m->g->sets[OPND(s)]; 551 if (sp == stop || !CHIN(cs, *sp++)) 552 return(NULL); 553 break; 554 case OBOL: 555 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 556 (sp < m->endp && *(sp-1) == '\n' && 557 (m->g->cflags®_NEWLINE)) ) 558 { /* yes */ } 559 else 560 return(NULL); 561 break; 562 case OEOL: 563 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 564 (sp < m->endp && *sp == '\n' && 565 (m->g->cflags®_NEWLINE)) ) 566 { /* yes */ } 567 else 568 return(NULL); 569 break; 570 case OBOW: 571 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 572 (sp < m->endp && *(sp-1) == '\n' && 573 (m->g->cflags®_NEWLINE)) || 574 (sp > m->beginp && 575 !ISWORD(*(sp-1))) ) && 576 (sp < m->endp && ISWORD(*sp)) ) 577 { /* yes */ } 578 else 579 return(NULL); 580 break; 581 case OEOW: 582 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 583 (sp < m->endp && *sp == '\n' && 584 (m->g->cflags®_NEWLINE)) || 585 (sp < m->endp && !ISWORD(*sp)) ) && 586 (sp > m->beginp && ISWORD(*(sp-1))) ) 587 { /* yes */ } 588 else 589 return(NULL); 590 break; 591 case O_QUEST: 592 break; 593 case OOR1: /* matches null but needs to skip */ 594 ss++; 595 s = m->g->strip[ss]; 596 do { 597 assert(OP(s) == OOR2); 598 ss += OPND(s); 599 } while (OP(s = m->g->strip[ss]) != O_CH); 600 /* note that the ss++ gets us past the O_CH */ 601 break; 602 default: /* have to make a choice */ 603 hard = 1; 604 break; 605 } 606 if (!hard) { /* that was it! */ 607 if (sp != stop) 608 return(NULL); 609 return(sp); 610 } 611 ss--; /* adjust for the for's final increment */ 612 613 /* the hard stuff */ 614 AT("hard", sp, stop, ss, stopst); 615 s = m->g->strip[ss]; 616 switch (OP(s)) { 617 case OBACK_: /* the vilest depths */ 618 i = OPND(s); 619 assert(0 < i && i <= m->g->nsub); 620 if (m->pmatch[i].rm_eo == -1) 621 return(NULL); 622 assert(m->pmatch[i].rm_so != -1); 623 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 624 if (len == 0 && rec++ > MAX_RECURSION) 625 return(NULL); 626 assert(stop - m->beginp >= len); 627 if (sp > stop - len) 628 return(NULL); /* not enough left to match */ 629 ssp = m->offp + m->pmatch[i].rm_so; 630 if (memcmp(sp, ssp, len) != 0) 631 return(NULL); 632 while (m->g->strip[ss] != SOP(O_BACK, i)) 633 ss++; 634 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 635 break; 636 case OQUEST_: /* to null or not */ 637 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 638 if (dp != NULL) 639 return(dp); /* not */ 640 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 641 break; 642 case OPLUS_: 643 assert(m->lastpos != NULL); 644 assert(lev+1 <= m->g->nplus); 645 m->lastpos[lev+1] = sp; 646 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 647 break; 648 case O_PLUS: 649 if (sp == m->lastpos[lev]) /* last pass matched null */ 650 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 651 /* try another pass */ 652 m->lastpos[lev] = sp; 653 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 654 if (dp == NULL) 655 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 656 else 657 return(dp); 658 break; 659 case OCH_: /* find the right one, if any */ 660 ssub = ss + 1; 661 esub = ss + OPND(s) - 1; 662 assert(OP(m->g->strip[esub]) == OOR1); 663 for (;;) { /* find first matching branch */ 664 dp = backref(m, sp, stop, ssub, esub, lev, rec); 665 if (dp != NULL) 666 return(dp); 667 /* that one missed, try next one */ 668 if (OP(m->g->strip[esub]) == O_CH) 669 return(NULL); /* there is none */ 670 esub++; 671 assert(OP(m->g->strip[esub]) == OOR2); 672 ssub = esub + 1; 673 esub += OPND(m->g->strip[esub]); 674 if (OP(m->g->strip[esub]) == OOR2) 675 esub--; 676 else 677 assert(OP(m->g->strip[esub]) == O_CH); 678 } 679 break; 680 case OLPAREN: /* must undo assignment if rest fails */ 681 i = OPND(s); 682 assert(0 < i && i <= m->g->nsub); 683 offsave = m->pmatch[i].rm_so; 684 m->pmatch[i].rm_so = sp - m->offp; 685 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 686 if (dp != NULL) 687 return(dp); 688 m->pmatch[i].rm_so = offsave; 689 return(NULL); 690 break; 691 case ORPAREN: /* must undo assignment if rest fails */ 692 i = OPND(s); 693 assert(0 < i && i <= m->g->nsub); 694 offsave = m->pmatch[i].rm_eo; 695 m->pmatch[i].rm_eo = sp - m->offp; 696 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 697 if (dp != NULL) 698 return(dp); 699 m->pmatch[i].rm_eo = offsave; 700 return(NULL); 701 break; 702 default: /* uh oh */ 703 assert(nope); 704 break; 705 } 706 707 /* "can't happen" */ 708 assert(nope); 709 /* NOTREACHED */ 710 return NULL; 711} 712 713/* 714 - fast - step through the string at top speed 715 */ 716static const char * /* where tentative match ended, or NULL */ 717fast(struct match *m, const char *start, const char *stop, sopno startst, 718 sopno stopst) 719{ 720 states st = m->st; 721 states fresh = m->fresh; 722 states tmp = m->tmp; 723 const char *p = start; 724 int c = (start == m->beginp) ? OUT : *(start-1); 725 int lastc; /* previous c */ 726 int flagch; 727 int i; 728 const char *coldp; /* last p after which no match was underway */ 729 730 CLEAR(st); 731 SET1(st, startst); 732 st = step(m->g, startst, stopst, st, NOTHING, st); 733 ASSIGN(fresh, st); 734 SP("start", st, *p); 735 coldp = NULL; 736 for (;;) { 737 /* next character */ 738 lastc = c; 739 c = (p == m->endp) ? OUT : *p; 740 if (EQ(st, fresh)) 741 coldp = p; 742 743 /* is there an EOL and/or BOL between lastc and c? */ 744 flagch = '\0'; 745 i = 0; 746 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 747 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 748 flagch = BOL; 749 i = m->g->nbol; 750 } 751 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 752 (c == OUT && !(m->eflags®_NOTEOL)) ) { 753 flagch = (flagch == BOL) ? BOLEOL : EOL; 754 i += m->g->neol; 755 } 756 if (i != 0) { 757 for (; i > 0; i--) 758 st = step(m->g, startst, stopst, st, flagch, st); 759 SP("boleol", st, c); 760 } 761 762 /* how about a word boundary? */ 763 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 764 (c != OUT && ISWORD(c)) ) { 765 flagch = BOW; 766 } 767 if ( (lastc != OUT && ISWORD(lastc)) && 768 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 769 flagch = EOW; 770 } 771 if (flagch == BOW || flagch == EOW) { 772 st = step(m->g, startst, stopst, st, flagch, st); 773 SP("boweow", st, c); 774 } 775 776 /* are we done? */ 777 if (ISSET(st, stopst) || p == stop) 778 break; /* NOTE BREAK OUT */ 779 780 /* no, we must deal with this character */ 781 ASSIGN(tmp, st); 782 ASSIGN(st, fresh); 783 assert(c != OUT); 784 st = step(m->g, startst, stopst, tmp, c, st); 785 SP("aft", st, c); 786 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 787 p++; 788 } 789 790 assert(coldp != NULL); 791 m->coldp = coldp; 792 if (ISSET(st, stopst)) 793 return(p+1); 794 else 795 return(NULL); 796} 797 798/* 799 - slow - step through the string more deliberately 800 */ 801static const char * /* where it ended */ 802slow(struct match *m, const char *start, const char *stop, sopno startst, 803 sopno stopst) 804{ 805 states st = m->st; 806 states empty = m->empty; 807 states tmp = m->tmp; 808 const char *p = start; 809 int c = (start == m->beginp) ? OUT : *(start-1); 810 int lastc; /* previous c */ 811 int flagch; 812 int i; 813 const char *matchp; /* last p at which a match ended */ 814 815 AT("slow", start, stop, startst, stopst); 816 CLEAR(st); 817 SET1(st, startst); 818 SP("sstart", st, *p); 819 st = step(m->g, startst, stopst, st, NOTHING, st); 820 matchp = NULL; 821 for (;;) { 822 /* next character */ 823 lastc = c; 824 c = (p == m->endp) ? OUT : *p; 825 826 /* is there an EOL and/or BOL between lastc and c? */ 827 flagch = '\0'; 828 i = 0; 829 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 830 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 831 flagch = BOL; 832 i = m->g->nbol; 833 } 834 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 835 (c == OUT && !(m->eflags®_NOTEOL)) ) { 836 flagch = (flagch == BOL) ? BOLEOL : EOL; 837 i += m->g->neol; 838 } 839 if (i != 0) { 840 for (; i > 0; i--) 841 st = step(m->g, startst, stopst, st, flagch, st); 842 SP("sboleol", st, c); 843 } 844 845 /* how about a word boundary? */ 846 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 847 (c != OUT && ISWORD(c)) ) { 848 flagch = BOW; 849 } 850 if ( (lastc != OUT && ISWORD(lastc)) && 851 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 852 flagch = EOW; 853 } 854 if (flagch == BOW || flagch == EOW) { 855 st = step(m->g, startst, stopst, st, flagch, st); 856 SP("sboweow", st, c); 857 } 858 859 /* are we done? */ 860 if (ISSET(st, stopst)) 861 matchp = p; 862 if (EQ(st, empty) || p == stop) 863 break; /* NOTE BREAK OUT */ 864 865 /* no, we must deal with this character */ 866 ASSIGN(tmp, st); 867 ASSIGN(st, empty); 868 assert(c != OUT); 869 st = step(m->g, startst, stopst, tmp, c, st); 870 SP("saft", st, c); 871 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 872 p++; 873 } 874 875 return(matchp); 876} 877 878 879/* 880 - step - map set of states reachable before char to set reachable after 881 */ 882static states 883step(struct re_guts *g, 884 sopno start, /* start state within strip */ 885 sopno stop, /* state after stop state within strip */ 886 states bef, /* states reachable before */ 887 int ch, /* character or NONCHAR code */ 888 states aft) /* states already known reachable after */ 889{ 890 cset *cs; 891 sop s; 892 sopno pc; 893 onestate here; /* note, macros know this name */ 894 sopno look; 895 int i; 896 897 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 898 s = g->strip[pc]; 899 switch (OP(s)) { 900 case OEND: 901 assert(pc == stop-1); 902 break; 903 case OCHAR: 904 /* only characters can match */ 905 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 906 if (ch == (char)OPND(s)) 907 FWD(aft, bef, 1); 908 break; 909 case OBOL: 910 if (ch == BOL || ch == BOLEOL) 911 FWD(aft, bef, 1); 912 break; 913 case OEOL: 914 if (ch == EOL || ch == BOLEOL) 915 FWD(aft, bef, 1); 916 break; 917 case OBOW: 918 if (ch == BOW) 919 FWD(aft, bef, 1); 920 break; 921 case OEOW: 922 if (ch == EOW) 923 FWD(aft, bef, 1); 924 break; 925 case OANY: 926 if (!NONCHAR(ch)) 927 FWD(aft, bef, 1); 928 break; 929 case OANYOF: 930 cs = &g->sets[OPND(s)]; 931 if (!NONCHAR(ch) && CHIN(cs, ch)) 932 FWD(aft, bef, 1); 933 break; 934 case OBACK_: /* ignored here */ 935 case O_BACK: 936 FWD(aft, aft, 1); 937 break; 938 case OPLUS_: /* forward, this is just an empty */ 939 FWD(aft, aft, 1); 940 break; 941 case O_PLUS: /* both forward and back */ 942 FWD(aft, aft, 1); 943 i = ISSETBACK(aft, OPND(s)); 944 BACK(aft, aft, OPND(s)); 945 if (!i && ISSETBACK(aft, OPND(s))) { 946 /* oho, must reconsider loop body */ 947 pc -= OPND(s) + 1; 948 INIT(here, pc); 949 } 950 break; 951 case OQUEST_: /* two branches, both forward */ 952 FWD(aft, aft, 1); 953 FWD(aft, aft, OPND(s)); 954 break; 955 case O_QUEST: /* just an empty */ 956 FWD(aft, aft, 1); 957 break; 958 case OLPAREN: /* not significant here */ 959 case ORPAREN: 960 FWD(aft, aft, 1); 961 break; 962 case OCH_: /* mark the first two branches */ 963 FWD(aft, aft, 1); 964 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 965 FWD(aft, aft, OPND(s)); 966 break; 967 case OOR1: /* done a branch, find the O_CH */ 968 if (ISSTATEIN(aft, here)) { 969 for (look = 1; 970 OP(s = g->strip[pc+look]) != O_CH; 971 look += OPND(s)) 972 assert(OP(s) == OOR2); 973 FWD(aft, aft, look); 974 } 975 break; 976 case OOR2: /* propagate OCH_'s marking */ 977 FWD(aft, aft, 1); 978 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 979 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 980 FWD(aft, aft, OPND(s)); 981 } 982 break; 983 case O_CH: /* just empty */ 984 FWD(aft, aft, 1); 985 break; 986 default: /* ooooops... */ 987 assert(nope); 988 break; 989 } 990 } 991 992 return(aft); 993} 994 995#ifdef REDEBUG 996/* 997 - print - print a set of states 998 */ 999static void 1000print(struct match *m, char *caption, states st, int ch, FILE *d) 1001{ 1002 struct re_guts *g = m->g; 1003 int i; 1004 int first = 1; 1005 1006 if (!(m->eflags®_TRACE)) 1007 return; 1008 1009 (void)fprintf(d, "%s", caption); 1010 if (ch != '\0') 1011 (void)fprintf(d, " %s", pchar(ch)); 1012 for (i = 0; i < g->nstates; i++) 1013 if (ISSET(st, i)) { 1014 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1015 first = 0; 1016 } 1017 (void)fprintf(d, "\n"); 1018} 1019 1020/* 1021 - at - print current situation 1022 */ 1023static void 1024at(struct match *m, char *title, char *start, char *stop, sopno startst, 1025 sopno stopst) 1026{ 1027 if (!(m->eflags®_TRACE)) 1028 return; 1029 1030 (void)printf("%s %s-", title, pchar(*start)); 1031 (void)printf("%s ", pchar(*stop)); 1032 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 1033} 1034 1035#ifndef PCHARDONE 1036#define PCHARDONE /* never again */ 1037/* 1038 - pchar - make a character printable 1039 * 1040 * Is this identical to regchar() over in debug.c? Well, yes. But a 1041 * duplicate here avoids having a debugging-capable regexec.o tied to 1042 * a matching debug.o, and this is convenient. It all disappears in 1043 * the non-debug compilation anyway, so it doesn't matter much. 1044 */ 1045static char * /* -> representation */ 1046pchar(int ch) 1047{ 1048 static char pbuf[10]; 1049 1050 if (isPrint(ch) || ch == ' ') 1051 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1052 else 1053 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1054 return(pbuf); 1055} 1056#endif 1057#endif 1058 1059#undef matcher 1060#undef fast 1061#undef slow 1062#undef dissect 1063#undef backref 1064#undef step 1065#undef print 1066#undef at 1067#undef match 1068#undef nope 1069#undef step_back 1070