1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)engine.c 8.5 (Berkeley) 3/20/94 34 */ 35 36 #include <sys/cdefs.h> 37 __FBSDID("$FreeBSD$"); 38 39 #include <stdbool.h> 40 41 /* 42 * The matching engine and friends. This file is #included by regexec.c 43 * after suitable #defines of a variety of macros used herein, so that 44 * different state representations can be used without duplicating masses 45 * of code. 46 */ 47 48 #ifdef SNAMES 49 #define matcher smatcher 50 #define walk swalk 51 #define dissect sdissect 52 #define backref sbackref 53 #define step sstep 54 #define print sprint 55 #define at sat 56 #define match smat 57 #endif 58 #ifdef LNAMES 59 #define matcher lmatcher 60 #define walk lwalk 61 #define dissect ldissect 62 #define backref lbackref 63 #define step lstep 64 #define print lprint 65 #define at lat 66 #define match lmat 67 #endif 68 #ifdef MNAMES 69 #define matcher mmatcher 70 #define walk mwalk 71 #define dissect mdissect 72 #define backref mbackref 73 #define step mstep 74 #define print mprint 75 #define at mat 76 #define match mmat 77 #endif 78 79 /* another structure passed up and down to avoid zillions of parameters */ 80 struct match { 81 struct re_guts *g; 82 int eflags; 83 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 84 const char *offp; /* offsets work from here */ 85 const char *beginp; /* start of string -- virtual NUL precedes */ 86 const char *endp; /* end of string -- virtual NUL here */ 87 const char *coldp; /* can be no match starting before here */ 88 const char **lastpos; /* [nplus+1] */ 89 STATEVARS; 90 states st; /* current states */ 91 states fresh; /* states for a fresh start */ 92 states tmp; /* temporary */ 93 states empty; /* empty set of states */ 94 mbstate_t mbs; /* multibyte conversion state */ 95 }; 96 97 /* ========= begin header generated by ./mkh ========= */ 98 #ifdef __cplusplus 99 extern "C" { 100 #endif 101 102 /* === engine.c === */ 103 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 104 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 105 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); 106 static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast); 107 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); 108 #define MAX_RECURSION 100 109 #define BOL (OUT-1) 110 #define EOL (BOL-1) 111 #define BOLEOL (BOL-2) 112 #define NOTHING (BOL-3) 113 #define BOW (BOL-4) 114 #define EOW (BOL-5) 115 #define BADCHAR (BOL-6) 116 #define NONCHAR(c) ((c) <= OUT) 117 #ifdef REDEBUG 118 static void print(struct match *m, const char *caption, states st, int ch, FILE *d); 119 #endif 120 #ifdef REDEBUG 121 static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst); 122 #endif 123 #ifdef REDEBUG 124 static const char *pchar(int ch); 125 #endif 126 127 #ifdef __cplusplus 128 } 129 #endif 130 /* ========= end header generated by ./mkh ========= */ 131 132 #ifdef REDEBUG 133 #define SP(t, s, c) print(m, t, s, c, stdout) 134 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 135 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 136 #else 137 #define SP(t, s, c) /* nothing */ 138 #define AT(t, p1, p2, s1, s2) /* nothing */ 139 #define NOTE(s) /* nothing */ 140 #endif 141 142 /* 143 - matcher - the actual matching engine 144 == static int matcher(struct re_guts *g, const char *string, \ 145 == size_t nmatch, regmatch_t pmatch[], int eflags); 146 */ 147 static int /* 0 success, REG_NOMATCH failure */ 148 matcher(struct re_guts *g, 149 const char *string, 150 size_t nmatch, 151 regmatch_t pmatch[], 152 int eflags) 153 { 154 const char *endp; 155 size_t i; 156 struct match mv; 157 struct match *m = &mv; 158 const char *dp = NULL; 159 const sopno gf = g->firststate+1; /* +1 for OEND */ 160 const sopno gl = g->laststate; 161 const char *start; 162 const char *stop; 163 /* Boyer-Moore algorithms variables */ 164 const char *pp; 165 int cj, mj; 166 const char *mustfirst; 167 const char *mustlast; 168 int *matchjump; 169 int *charjump; 170 171 /* simplify the situation where possible */ 172 if (g->cflags®_NOSUB) 173 nmatch = 0; 174 if (eflags®_STARTEND) { 175 start = string + pmatch[0].rm_so; 176 stop = string + pmatch[0].rm_eo; 177 } else { 178 start = string; 179 stop = start + strlen(start); 180 } 181 if (stop < start) 182 return(REG_INVARG); 183 184 /* prescreening; this does wonders for this rather slow code */ 185 if (g->must != NULL) { 186 if (g->charjump != NULL && g->matchjump != NULL) { 187 mustfirst = g->must; 188 mustlast = g->must + g->mlen - 1; 189 charjump = g->charjump; 190 matchjump = g->matchjump; 191 pp = mustlast; 192 for (dp = start+g->mlen-1; dp < stop;) { 193 /* Fast skip non-matches */ 194 while (dp < stop && charjump[(int)*dp]) 195 dp += charjump[(int)*dp]; 196 197 if (dp >= stop) 198 break; 199 200 /* Greedy matcher */ 201 /* We depend on not being used for 202 * for strings of length 1 203 */ 204 while (*--dp == *--pp && pp != mustfirst); 205 206 if (*dp == *pp) 207 break; 208 209 /* Jump to next possible match */ 210 mj = matchjump[pp - mustfirst]; 211 cj = charjump[(int)*dp]; 212 dp += (cj < mj ? mj : cj); 213 pp = mustlast; 214 } 215 if (pp != mustfirst) 216 return(REG_NOMATCH); 217 } else { 218 for (dp = start; dp < stop; dp++) 219 if (*dp == g->must[0] && 220 stop - dp >= g->mlen && 221 memcmp(dp, g->must, (size_t)g->mlen) == 0) 222 break; 223 if (dp == stop) /* we didn't find g->must */ 224 return(REG_NOMATCH); 225 } 226 } 227 228 /* match struct setup */ 229 m->g = g; 230 m->eflags = eflags; 231 m->pmatch = NULL; 232 m->lastpos = NULL; 233 m->offp = string; 234 m->beginp = start; 235 m->endp = stop; 236 STATESETUP(m, 4); 237 SETUP(m->st); 238 SETUP(m->fresh); 239 SETUP(m->tmp); 240 SETUP(m->empty); 241 CLEAR(m->empty); 242 ZAPSTATE(&m->mbs); 243 244 /* Adjust start according to moffset, to speed things up */ 245 if (dp != NULL && g->moffset > -1) 246 start = ((dp - g->moffset) < start) ? start : dp - g->moffset; 247 248 SP("mloop", m->st, *start); 249 250 /* this loop does only one repetition except for backrefs */ 251 for (;;) { 252 endp = walk(m, start, stop, gf, gl, true); 253 if (endp == NULL) { /* a miss */ 254 if (m->pmatch != NULL) 255 free((char *)m->pmatch); 256 if (m->lastpos != NULL) 257 free((char *)m->lastpos); 258 STATETEARDOWN(m); 259 return(REG_NOMATCH); 260 } 261 if (nmatch == 0 && !g->backrefs) 262 break; /* no further info needed */ 263 264 /* where? */ 265 assert(m->coldp != NULL); 266 for (;;) { 267 NOTE("finding start"); 268 endp = walk(m, m->coldp, stop, gf, gl, false); 269 if (endp != NULL) 270 break; 271 assert(m->coldp < m->endp); 272 m->coldp += XMBRTOWC(NULL, m->coldp, 273 m->endp - m->coldp, &m->mbs, 0); 274 } 275 if (nmatch == 1 && !g->backrefs) 276 break; /* no further info needed */ 277 278 /* oh my, he wants the subexpressions... */ 279 if (m->pmatch == NULL) 280 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 281 sizeof(regmatch_t)); 282 if (m->pmatch == NULL) { 283 STATETEARDOWN(m); 284 return(REG_ESPACE); 285 } 286 for (i = 1; i <= m->g->nsub; i++) 287 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 288 if (!g->backrefs && !(m->eflags®_BACKR)) { 289 NOTE("dissecting"); 290 dp = dissect(m, m->coldp, endp, gf, gl); 291 } else { 292 if (g->nplus > 0 && m->lastpos == NULL) 293 m->lastpos = malloc((g->nplus+1) * 294 sizeof(const char *)); 295 if (g->nplus > 0 && m->lastpos == NULL) { 296 free(m->pmatch); 297 STATETEARDOWN(m); 298 return(REG_ESPACE); 299 } 300 NOTE("backref dissect"); 301 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 302 } 303 if (dp != NULL) 304 break; 305 306 /* uh-oh... we couldn't find a subexpression-level match */ 307 assert(g->backrefs); /* must be back references doing it */ 308 assert(g->nplus == 0 || m->lastpos != NULL); 309 for (;;) { 310 if (dp != NULL || endp <= m->coldp) 311 break; /* defeat */ 312 NOTE("backoff"); 313 endp = walk(m, m->coldp, endp-1, gf, gl, false); 314 if (endp == NULL) 315 break; /* defeat */ 316 /* try it on a shorter possibility */ 317 #ifndef NDEBUG 318 for (i = 1; i <= m->g->nsub; i++) { 319 assert(m->pmatch[i].rm_so == -1); 320 assert(m->pmatch[i].rm_eo == -1); 321 } 322 #endif 323 NOTE("backoff dissect"); 324 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 325 } 326 assert(dp == NULL || dp == endp); 327 if (dp != NULL) /* found a shorter one */ 328 break; 329 330 /* despite initial appearances, there is no match here */ 331 NOTE("false alarm"); 332 /* recycle starting later */ 333 start = m->coldp + XMBRTOWC(NULL, m->coldp, 334 stop - m->coldp, &m->mbs, 0); 335 assert(start <= stop); 336 } 337 338 /* fill in the details if requested */ 339 if (nmatch > 0) { 340 pmatch[0].rm_so = m->coldp - m->offp; 341 pmatch[0].rm_eo = endp - m->offp; 342 } 343 if (nmatch > 1) { 344 assert(m->pmatch != NULL); 345 for (i = 1; i < nmatch; i++) 346 if (i <= m->g->nsub) 347 pmatch[i] = m->pmatch[i]; 348 else { 349 pmatch[i].rm_so = -1; 350 pmatch[i].rm_eo = -1; 351 } 352 } 353 354 if (m->pmatch != NULL) 355 free((char *)m->pmatch); 356 if (m->lastpos != NULL) 357 free((char *)m->lastpos); 358 STATETEARDOWN(m); 359 return(0); 360 } 361 362 /* 363 - dissect - figure out what matched what, no back references 364 == static const char *dissect(struct match *m, const char *start, \ 365 == const char *stop, sopno startst, sopno stopst); 366 */ 367 static const char * /* == stop (success) always */ 368 dissect(struct match *m, 369 const char *start, 370 const char *stop, 371 sopno startst, 372 sopno stopst) 373 { 374 int i; 375 sopno ss; /* start sop of current subRE */ 376 sopno es; /* end sop of current subRE */ 377 const char *sp; /* start of string matched by it */ 378 const char *stp; /* string matched by it cannot pass here */ 379 const char *rest; /* start of rest of string */ 380 const char *tail; /* string unmatched by rest of RE */ 381 sopno ssub; /* start sop of subsubRE */ 382 sopno esub; /* end sop of subsubRE */ 383 const char *ssp; /* start of string matched by subsubRE */ 384 const char *sep; /* end of string matched by subsubRE */ 385 const char *oldssp; /* previous ssp */ 386 const char *dp; 387 388 AT("diss", start, stop, startst, stopst); 389 sp = start; 390 for (ss = startst; ss < stopst; ss = es) { 391 /* identify end of subRE */ 392 es = ss; 393 switch (OP(m->g->strip[es])) { 394 case OPLUS_: 395 case OQUEST_: 396 es += OPND(m->g->strip[es]); 397 break; 398 case OCH_: 399 while (OP(m->g->strip[es]) != O_CH) 400 es += OPND(m->g->strip[es]); 401 break; 402 } 403 es++; 404 405 /* figure out what it matched */ 406 switch (OP(m->g->strip[ss])) { 407 case OEND: 408 assert(nope); 409 break; 410 case OCHAR: 411 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 412 break; 413 case OBOL: 414 case OEOL: 415 case OBOW: 416 case OEOW: 417 break; 418 case OANY: 419 case OANYOF: 420 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 421 break; 422 case OBACK_: 423 case O_BACK: 424 assert(nope); 425 break; 426 /* cases where length of match is hard to find */ 427 case OQUEST_: 428 stp = stop; 429 for (;;) { 430 /* how long could this one be? */ 431 rest = walk(m, sp, stp, ss, es, false); 432 assert(rest != NULL); /* it did match */ 433 /* could the rest match the rest? */ 434 tail = walk(m, rest, stop, es, stopst, false); 435 if (tail == stop) 436 break; /* yes! */ 437 /* no -- try a shorter match for this one */ 438 stp = rest - 1; 439 assert(stp >= sp); /* it did work */ 440 } 441 ssub = ss + 1; 442 esub = es - 1; 443 /* did innards match? */ 444 if (walk(m, sp, rest, ssub, esub, false) != NULL) { 445 dp = dissect(m, sp, rest, ssub, esub); 446 assert(dp == rest); 447 } else /* no */ 448 assert(sp == rest); 449 sp = rest; 450 break; 451 case OPLUS_: 452 stp = stop; 453 for (;;) { 454 /* how long could this one be? */ 455 rest = walk(m, sp, stp, ss, es, false); 456 assert(rest != NULL); /* it did match */ 457 /* could the rest match the rest? */ 458 tail = walk(m, rest, stop, es, stopst, false); 459 if (tail == stop) 460 break; /* yes! */ 461 /* no -- try a shorter match for this one */ 462 stp = rest - 1; 463 assert(stp >= sp); /* it did work */ 464 } 465 ssub = ss + 1; 466 esub = es - 1; 467 ssp = sp; 468 oldssp = ssp; 469 for (;;) { /* find last match of innards */ 470 sep = walk(m, ssp, rest, ssub, esub, false); 471 if (sep == NULL || sep == ssp) 472 break; /* failed or matched null */ 473 oldssp = ssp; /* on to next try */ 474 ssp = sep; 475 } 476 if (sep == NULL) { 477 /* last successful match */ 478 sep = ssp; 479 ssp = oldssp; 480 } 481 assert(sep == rest); /* must exhaust substring */ 482 assert(walk(m, ssp, sep, ssub, esub, false) == rest); 483 dp = dissect(m, ssp, sep, ssub, esub); 484 assert(dp == sep); 485 sp = rest; 486 break; 487 case OCH_: 488 stp = stop; 489 for (;;) { 490 /* how long could this one be? */ 491 rest = walk(m, sp, stp, ss, es, false); 492 assert(rest != NULL); /* it did match */ 493 /* could the rest match the rest? */ 494 tail = walk(m, rest, stop, es, stopst, false); 495 if (tail == stop) 496 break; /* yes! */ 497 /* no -- try a shorter match for this one */ 498 stp = rest - 1; 499 assert(stp >= sp); /* it did work */ 500 } 501 ssub = ss + 1; 502 esub = ss + OPND(m->g->strip[ss]) - 1; 503 assert(OP(m->g->strip[esub]) == OOR1); 504 for (;;) { /* find first matching branch */ 505 if (walk(m, sp, rest, ssub, esub, false) == rest) 506 break; /* it matched all of it */ 507 /* that one missed, try next one */ 508 assert(OP(m->g->strip[esub]) == OOR1); 509 esub++; 510 assert(OP(m->g->strip[esub]) == OOR2); 511 ssub = esub + 1; 512 esub += OPND(m->g->strip[esub]); 513 if (OP(m->g->strip[esub]) == OOR2) 514 esub--; 515 else 516 assert(OP(m->g->strip[esub]) == O_CH); 517 } 518 dp = dissect(m, sp, rest, ssub, esub); 519 assert(dp == rest); 520 sp = rest; 521 break; 522 case O_PLUS: 523 case O_QUEST: 524 case OOR1: 525 case OOR2: 526 case O_CH: 527 assert(nope); 528 break; 529 case OLPAREN: 530 i = OPND(m->g->strip[ss]); 531 assert(0 < i && i <= m->g->nsub); 532 m->pmatch[i].rm_so = sp - m->offp; 533 break; 534 case ORPAREN: 535 i = OPND(m->g->strip[ss]); 536 assert(0 < i && i <= m->g->nsub); 537 m->pmatch[i].rm_eo = sp - m->offp; 538 break; 539 default: /* uh oh */ 540 assert(nope); 541 break; 542 } 543 } 544 545 assert(sp == stop); 546 return(sp); 547 } 548 549 /* 550 - backref - figure out what matched what, figuring in back references 551 == static const char *backref(struct match *m, const char *start, \ 552 == const char *stop, sopno startst, sopno stopst, sopno lev); 553 */ 554 static const char * /* == stop (success) or NULL (failure) */ 555 backref(struct match *m, 556 const char *start, 557 const char *stop, 558 sopno startst, 559 sopno stopst, 560 sopno lev, /* PLUS nesting level */ 561 int rec) 562 { 563 int i; 564 sopno ss; /* start sop of current subRE */ 565 const char *sp; /* start of string matched by it */ 566 sopno ssub; /* start sop of subsubRE */ 567 sopno esub; /* end sop of subsubRE */ 568 const char *ssp; /* start of string matched by subsubRE */ 569 const char *dp; 570 size_t len; 571 int hard; 572 sop s; 573 regoff_t offsave; 574 cset *cs; 575 wint_t wc; 576 577 AT("back", start, stop, startst, stopst); 578 sp = start; 579 580 /* get as far as we can with easy stuff */ 581 hard = 0; 582 for (ss = startst; !hard && ss < stopst; ss++) 583 switch (OP(s = m->g->strip[ss])) { 584 case OCHAR: 585 if (sp == stop) 586 return(NULL); 587 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 588 if (wc != OPND(s)) 589 return(NULL); 590 break; 591 case OANY: 592 if (sp == stop) 593 return(NULL); 594 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 595 if (wc == BADCHAR) 596 return (NULL); 597 break; 598 case OANYOF: 599 if (sp == stop) 600 return (NULL); 601 cs = &m->g->sets[OPND(s)]; 602 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 603 if (wc == BADCHAR || !CHIN(cs, wc)) 604 return(NULL); 605 break; 606 case OBOL: 607 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 608 (sp > m->offp && sp < m->endp && 609 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) 610 { /* yes */ } 611 else 612 return(NULL); 613 break; 614 case OEOL: 615 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 616 (sp < m->endp && *sp == '\n' && 617 (m->g->cflags®_NEWLINE)) ) 618 { /* yes */ } 619 else 620 return(NULL); 621 break; 622 case OBOW: 623 if (sp < m->endp && ISWORD(*sp) && 624 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 625 (sp > m->offp && !ISWORD(*(sp-1))))) 626 { /* yes */ } 627 else 628 return(NULL); 629 break; 630 case OEOW: 631 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 632 (sp < m->endp && *sp == '\n' && 633 (m->g->cflags®_NEWLINE)) || 634 (sp < m->endp && !ISWORD(*sp)) ) && 635 (sp > m->beginp && ISWORD(*(sp-1))) ) 636 { /* yes */ } 637 else 638 return(NULL); 639 break; 640 case O_QUEST: 641 break; 642 case OOR1: /* matches null but needs to skip */ 643 ss++; 644 s = m->g->strip[ss]; 645 do { 646 assert(OP(s) == OOR2); 647 ss += OPND(s); 648 } while (OP(s = m->g->strip[ss]) != O_CH); 649 /* note that the ss++ gets us past the O_CH */ 650 break; 651 default: /* have to make a choice */ 652 hard = 1; 653 break; 654 } 655 if (!hard) { /* that was it! */ 656 if (sp != stop) 657 return(NULL); 658 return(sp); 659 } 660 ss--; /* adjust for the for's final increment */ 661 662 /* the hard stuff */ 663 AT("hard", sp, stop, ss, stopst); 664 s = m->g->strip[ss]; 665 switch (OP(s)) { 666 case OBACK_: /* the vilest depths */ 667 i = OPND(s); 668 assert(0 < i && i <= m->g->nsub); 669 if (m->pmatch[i].rm_eo == -1) 670 return(NULL); 671 assert(m->pmatch[i].rm_so != -1); 672 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 673 if (len == 0 && rec++ > MAX_RECURSION) 674 return(NULL); 675 assert(stop - m->beginp >= len); 676 if (sp > stop - len) 677 return(NULL); /* not enough left to match */ 678 ssp = m->offp + m->pmatch[i].rm_so; 679 if (memcmp(sp, ssp, len) != 0) 680 return(NULL); 681 while (m->g->strip[ss] != SOP(O_BACK, i)) 682 ss++; 683 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 684 case OQUEST_: /* to null or not */ 685 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 686 if (dp != NULL) 687 return(dp); /* not */ 688 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 689 case OPLUS_: 690 assert(m->lastpos != NULL); 691 assert(lev+1 <= m->g->nplus); 692 m->lastpos[lev+1] = sp; 693 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 694 case O_PLUS: 695 if (sp == m->lastpos[lev]) /* last pass matched null */ 696 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 697 /* try another pass */ 698 m->lastpos[lev] = sp; 699 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 700 if (dp == NULL) 701 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 702 else 703 return(dp); 704 case OCH_: /* find the right one, if any */ 705 ssub = ss + 1; 706 esub = ss + OPND(s) - 1; 707 assert(OP(m->g->strip[esub]) == OOR1); 708 for (;;) { /* find first matching branch */ 709 dp = backref(m, sp, stop, ssub, esub, lev, rec); 710 if (dp != NULL) 711 return(dp); 712 /* that one missed, try next one */ 713 if (OP(m->g->strip[esub]) == O_CH) 714 return(NULL); /* there is none */ 715 esub++; 716 assert(OP(m->g->strip[esub]) == OOR2); 717 ssub = esub + 1; 718 esub += OPND(m->g->strip[esub]); 719 if (OP(m->g->strip[esub]) == OOR2) 720 esub--; 721 else 722 assert(OP(m->g->strip[esub]) == O_CH); 723 } 724 /* NOTREACHED */ 725 break; 726 case OLPAREN: /* must undo assignment if rest fails */ 727 i = OPND(s); 728 assert(0 < i && i <= m->g->nsub); 729 offsave = m->pmatch[i].rm_so; 730 m->pmatch[i].rm_so = sp - m->offp; 731 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 732 if (dp != NULL) 733 return(dp); 734 m->pmatch[i].rm_so = offsave; 735 return(NULL); 736 case ORPAREN: /* must undo assignment if rest fails */ 737 i = OPND(s); 738 assert(0 < i && i <= m->g->nsub); 739 offsave = m->pmatch[i].rm_eo; 740 m->pmatch[i].rm_eo = sp - m->offp; 741 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 742 if (dp != NULL) 743 return(dp); 744 m->pmatch[i].rm_eo = offsave; 745 return(NULL); 746 default: /* uh oh */ 747 assert(nope); 748 break; 749 } 750 751 /* "can't happen" */ 752 assert(nope); 753 /* NOTREACHED */ 754 return "shut up gcc"; 755 } 756 757 /* 758 - walk - step through the string either quickly or slowly 759 == static const char *walk(struct match *m, const char *start, \ 760 == const char *stop, sopno startst, sopno stopst, bool fast); 761 */ 762 static const char * /* where it ended, or NULL */ 763 walk(struct match *m, const char *start, const char *stop, sopno startst, 764 sopno stopst, bool fast) 765 { 766 states st = m->st; 767 states fresh = m->fresh; 768 states empty = m->empty; 769 states tmp = m->tmp; 770 const char *p = start; 771 wint_t c; 772 wint_t lastc; /* previous c */ 773 wint_t flagch; 774 int i; 775 const char *matchp; /* last p at which a match ended */ 776 size_t clen; 777 778 AT("slow", start, stop, startst, stopst); 779 CLEAR(st); 780 SET1(st, startst); 781 SP("sstart", st, *p); 782 st = step(m->g, startst, stopst, st, NOTHING, st); 783 if (fast) 784 ASSIGN(fresh, st); 785 matchp = NULL; 786 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 787 c = OUT; 788 else { 789 /* 790 * XXX Wrong if the previous character was multi-byte. 791 * Newline never is (in encodings supported by FreeBSD), 792 * so this only breaks the ISWORD tests below. 793 */ 794 c = (uch)*(start - 1); 795 } 796 for (;;) { 797 /* next character */ 798 lastc = c; 799 if (p == m->endp) { 800 c = OUT; 801 clen = 0; 802 } else 803 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 804 805 if (fast && EQ(st, fresh)) 806 matchp = p; 807 808 /* is there an EOL and/or BOL between lastc and c? */ 809 flagch = '\0'; 810 i = 0; 811 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 812 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 813 flagch = BOL; 814 i = m->g->nbol; 815 } 816 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 817 (c == OUT && !(m->eflags®_NOTEOL)) ) { 818 flagch = (flagch == BOL) ? BOLEOL : EOL; 819 i += m->g->neol; 820 } 821 if (i != 0) { 822 for (; i > 0; i--) 823 st = step(m->g, startst, stopst, st, flagch, st); 824 SP("sboleol", st, c); 825 } 826 827 /* how about a word boundary? */ 828 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 829 (c != OUT && ISWORD(c)) ) { 830 flagch = BOW; 831 } 832 if ( (lastc != OUT && ISWORD(lastc)) && 833 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 834 flagch = EOW; 835 } 836 if (flagch == BOW || flagch == EOW) { 837 st = step(m->g, startst, stopst, st, flagch, st); 838 SP("sboweow", st, c); 839 } 840 841 /* are we done? */ 842 if (ISSET(st, stopst)) { 843 if (fast) 844 break; 845 else 846 matchp = p; 847 } 848 if (EQ(st, empty) || p == stop || clen > stop - p) 849 break; /* NOTE BREAK OUT */ 850 851 /* no, we must deal with this character */ 852 ASSIGN(tmp, st); 853 if (fast) 854 ASSIGN(st, fresh); 855 else 856 ASSIGN(st, empty); 857 assert(c != OUT); 858 st = step(m->g, startst, stopst, tmp, c, st); 859 SP("saft", st, c); 860 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 861 p += clen; 862 } 863 864 if (fast) { 865 assert(matchp != NULL); 866 m->coldp = matchp; 867 if (ISSET(st, stopst)) 868 return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 869 else 870 return (NULL); 871 } else 872 return (matchp); 873 } 874 875 /* 876 - step - map set of states reachable before char to set reachable after 877 == static states step(struct re_guts *g, sopno start, sopno stop, \ 878 == states bef, int ch, states aft); 879 == #define BOL (OUT-1) 880 == #define EOL (BOL-1) 881 == #define BOLEOL (BOL-2) 882 == #define NOTHING (BOL-3) 883 == #define BOW (BOL-4) 884 == #define EOW (BOL-5) 885 == #define BADCHAR (BOL-6) 886 == #define NONCHAR(c) ((c) <= OUT) 887 */ 888 static states 889 step(struct re_guts *g, 890 sopno start, /* start state within strip */ 891 sopno stop, /* state after stop state within strip */ 892 states bef, /* states reachable before */ 893 wint_t ch, /* character or NONCHAR code */ 894 states aft) /* states already known reachable after */ 895 { 896 cset *cs; 897 sop s; 898 sopno pc; 899 onestate here; /* note, macros know this name */ 900 sopno look; 901 int i; 902 903 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 904 s = g->strip[pc]; 905 switch (OP(s)) { 906 case OEND: 907 assert(pc == stop-1); 908 break; 909 case OCHAR: 910 /* only characters can match */ 911 assert(!NONCHAR(ch) || ch != OPND(s)); 912 if (ch == OPND(s)) 913 FWD(aft, bef, 1); 914 break; 915 case OBOL: 916 if (ch == BOL || ch == BOLEOL) 917 FWD(aft, bef, 1); 918 break; 919 case OEOL: 920 if (ch == EOL || ch == BOLEOL) 921 FWD(aft, bef, 1); 922 break; 923 case OBOW: 924 if (ch == BOW) 925 FWD(aft, bef, 1); 926 break; 927 case OEOW: 928 if (ch == EOW) 929 FWD(aft, bef, 1); 930 break; 931 case OANY: 932 if (!NONCHAR(ch)) 933 FWD(aft, bef, 1); 934 break; 935 case OANYOF: 936 cs = &g->sets[OPND(s)]; 937 if (!NONCHAR(ch) && CHIN(cs, ch)) 938 FWD(aft, bef, 1); 939 break; 940 case OBACK_: /* ignored here */ 941 case O_BACK: 942 FWD(aft, aft, 1); 943 break; 944 case OPLUS_: /* forward, this is just an empty */ 945 FWD(aft, aft, 1); 946 break; 947 case O_PLUS: /* both forward and back */ 948 FWD(aft, aft, 1); 949 i = ISSETBACK(aft, OPND(s)); 950 BACK(aft, aft, OPND(s)); 951 if (!i && ISSETBACK(aft, OPND(s))) { 952 /* oho, must reconsider loop body */ 953 pc -= OPND(s) + 1; 954 INIT(here, pc); 955 } 956 break; 957 case OQUEST_: /* two branches, both forward */ 958 FWD(aft, aft, 1); 959 FWD(aft, aft, OPND(s)); 960 break; 961 case O_QUEST: /* just an empty */ 962 FWD(aft, aft, 1); 963 break; 964 case OLPAREN: /* not significant here */ 965 case ORPAREN: 966 FWD(aft, aft, 1); 967 break; 968 case OCH_: /* mark the first two branches */ 969 FWD(aft, aft, 1); 970 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 971 FWD(aft, aft, OPND(s)); 972 break; 973 case OOR1: /* done a branch, find the O_CH */ 974 if (ISSTATEIN(aft, here)) { 975 for (look = 1; 976 OP(s = g->strip[pc+look]) != O_CH; 977 look += OPND(s)) 978 assert(OP(s) == OOR2); 979 FWD(aft, aft, look + 1); 980 } 981 break; 982 case OOR2: /* propagate OCH_'s marking */ 983 FWD(aft, aft, 1); 984 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 985 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 986 FWD(aft, aft, OPND(s)); 987 } 988 break; 989 case O_CH: /* just empty */ 990 FWD(aft, aft, 1); 991 break; 992 default: /* ooooops... */ 993 assert(nope); 994 break; 995 } 996 } 997 998 return(aft); 999 } 1000 1001 #ifdef REDEBUG 1002 /* 1003 - print - print a set of states 1004 == #ifdef REDEBUG 1005 == static void print(struct match *m, const char *caption, states st, \ 1006 == int ch, FILE *d); 1007 == #endif 1008 */ 1009 static void 1010 print(struct match *m, 1011 const char *caption, 1012 states st, 1013 int ch, 1014 FILE *d) 1015 { 1016 struct re_guts *g = m->g; 1017 sopno i; 1018 int first = 1; 1019 1020 if (!(m->eflags®_TRACE)) 1021 return; 1022 1023 fprintf(d, "%s", caption); 1024 if (ch != '\0') 1025 fprintf(d, " %s", pchar(ch)); 1026 for (i = 0; i < g->nstates; i++) 1027 if (ISSET(st, i)) { 1028 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1029 first = 0; 1030 } 1031 fprintf(d, "\n"); 1032 } 1033 1034 /* 1035 - at - print current situation 1036 == #ifdef REDEBUG 1037 == static void at(struct match *m, const char *title, const char *start, \ 1038 == const char *stop, sopno startst, sopno stopst); 1039 == #endif 1040 */ 1041 static void 1042 at( struct match *m, 1043 const char *title, 1044 const char *start, 1045 const char *stop, 1046 sopno startst, 1047 sopno stopst) 1048 { 1049 if (!(m->eflags®_TRACE)) 1050 return; 1051 1052 printf("%s %s-", title, pchar(*start)); 1053 printf("%s ", pchar(*stop)); 1054 printf("%ld-%ld\n", (long)startst, (long)stopst); 1055 } 1056 1057 #ifndef PCHARDONE 1058 #define PCHARDONE /* never again */ 1059 /* 1060 - pchar - make a character printable 1061 == #ifdef REDEBUG 1062 == static const char *pchar(int ch); 1063 == #endif 1064 * 1065 * Is this identical to regchar() over in debug.c? Well, yes. But a 1066 * duplicate here avoids having a debugging-capable regexec.o tied to 1067 * a matching debug.o, and this is convenient. It all disappears in 1068 * the non-debug compilation anyway, so it doesn't matter much. 1069 */ 1070 static const char * /* -> representation */ 1071 pchar(int ch) 1072 { 1073 static char pbuf[10]; 1074 1075 if (isprint((uch)ch) || ch == ' ') 1076 sprintf(pbuf, "%c", ch); 1077 else 1078 sprintf(pbuf, "\\%o", ch); 1079 return(pbuf); 1080 } 1081 #endif 1082 #endif 1083 1084 #undef matcher 1085 #undef walk 1086 #undef dissect 1087 #undef backref 1088 #undef step 1089 #undef print 1090 #undef at 1091 #undef match 1092