1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)engine.c 8.5 (Berkeley) 3/20/94 34 */ 35 36 #include <sys/cdefs.h> 37 __FBSDID("$FreeBSD$"); 38 39 /* 40 * The matching engine and friends. This file is #included by regexec.c 41 * after suitable #defines of a variety of macros used herein, so that 42 * different state representations can be used without duplicating masses 43 * of code. 44 */ 45 46 #ifdef SNAMES 47 #define matcher smatcher 48 #define fast sfast 49 #define slow sslow 50 #define dissect sdissect 51 #define backref sbackref 52 #define step sstep 53 #define print sprint 54 #define at sat 55 #define match smat 56 #endif 57 #ifdef LNAMES 58 #define matcher lmatcher 59 #define fast lfast 60 #define slow lslow 61 #define dissect ldissect 62 #define backref lbackref 63 #define step lstep 64 #define print lprint 65 #define at lat 66 #define match lmat 67 #endif 68 #ifdef MNAMES 69 #define matcher mmatcher 70 #define fast mfast 71 #define slow mslow 72 #define dissect mdissect 73 #define backref mbackref 74 #define step mstep 75 #define print mprint 76 #define at mat 77 #define match mmat 78 #endif 79 80 /* another structure passed up and down to avoid zillions of parameters */ 81 struct match { 82 struct re_guts *g; 83 int eflags; 84 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 85 const char *offp; /* offsets work from here */ 86 const char *beginp; /* start of string -- virtual NUL precedes */ 87 const char *endp; /* end of string -- virtual NUL here */ 88 const char *coldp; /* can be no match starting before here */ 89 const char **lastpos; /* [nplus+1] */ 90 STATEVARS; 91 states st; /* current states */ 92 states fresh; /* states for a fresh start */ 93 states tmp; /* temporary */ 94 states empty; /* empty set of states */ 95 mbstate_t mbs; /* multibyte conversion state */ 96 }; 97 98 /* ========= begin header generated by ./mkh ========= */ 99 #ifdef __cplusplus 100 extern "C" { 101 #endif 102 103 /* === engine.c === */ 104 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 105 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 106 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); 107 static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 108 static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 109 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); 110 #define MAX_RECURSION 100 111 #define BOL (OUT-1) 112 #define EOL (BOL-1) 113 #define BOLEOL (BOL-2) 114 #define NOTHING (BOL-3) 115 #define BOW (BOL-4) 116 #define EOW (BOL-5) 117 #define BADCHAR (BOL-6) 118 #define NONCHAR(c) ((c) <= OUT) 119 #ifdef REDEBUG 120 static void print(struct match *m, const char *caption, states st, int ch, FILE *d); 121 #endif 122 #ifdef REDEBUG 123 static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst); 124 #endif 125 #ifdef REDEBUG 126 static const char *pchar(int ch); 127 #endif 128 129 #ifdef __cplusplus 130 } 131 #endif 132 /* ========= end header generated by ./mkh ========= */ 133 134 #ifdef REDEBUG 135 #define SP(t, s, c) print(m, t, s, c, stdout) 136 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 137 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 138 #else 139 #define SP(t, s, c) /* nothing */ 140 #define AT(t, p1, p2, s1, s2) /* nothing */ 141 #define NOTE(s) /* nothing */ 142 #endif 143 144 /* 145 - matcher - the actual matching engine 146 == static int matcher(struct re_guts *g, const char *string, \ 147 == size_t nmatch, regmatch_t pmatch[], int eflags); 148 */ 149 static int /* 0 success, REG_NOMATCH failure */ 150 matcher(struct re_guts *g, 151 const char *string, 152 size_t nmatch, 153 regmatch_t pmatch[], 154 int eflags) 155 { 156 const char *endp; 157 int i; 158 struct match mv; 159 struct match *m = &mv; 160 const char *dp; 161 const sopno gf = g->firststate+1; /* +1 for OEND */ 162 const sopno gl = g->laststate; 163 const char *start; 164 const char *stop; 165 /* Boyer-Moore algorithms variables */ 166 const char *pp; 167 int cj, mj; 168 const char *mustfirst; 169 const char *mustlast; 170 int *matchjump; 171 int *charjump; 172 173 /* simplify the situation where possible */ 174 if (g->cflags®_NOSUB) 175 nmatch = 0; 176 if (eflags®_STARTEND) { 177 start = string + pmatch[0].rm_so; 178 stop = string + pmatch[0].rm_eo; 179 } else { 180 start = string; 181 stop = start + strlen(start); 182 } 183 if (stop < start) 184 return(REG_INVARG); 185 186 /* prescreening; this does wonders for this rather slow code */ 187 if (g->must != NULL) { 188 if (g->charjump != NULL && g->matchjump != NULL) { 189 mustfirst = g->must; 190 mustlast = g->must + g->mlen - 1; 191 charjump = g->charjump; 192 matchjump = g->matchjump; 193 pp = mustlast; 194 for (dp = start+g->mlen-1; dp < stop;) { 195 /* Fast skip non-matches */ 196 while (dp < stop && charjump[(int)*dp]) 197 dp += charjump[(int)*dp]; 198 199 if (dp >= stop) 200 break; 201 202 /* Greedy matcher */ 203 /* We depend on not being used for 204 * for strings of length 1 205 */ 206 while (*--dp == *--pp && pp != mustfirst); 207 208 if (*dp == *pp) 209 break; 210 211 /* Jump to next possible match */ 212 mj = matchjump[pp - mustfirst]; 213 cj = charjump[(int)*dp]; 214 dp += (cj < mj ? mj : cj); 215 pp = mustlast; 216 } 217 if (pp != mustfirst) 218 return(REG_NOMATCH); 219 } else { 220 for (dp = start; dp < stop; dp++) 221 if (*dp == g->must[0] && 222 stop - dp >= g->mlen && 223 memcmp(dp, g->must, (size_t)g->mlen) == 0) 224 break; 225 if (dp == stop) /* we didn't find g->must */ 226 return(REG_NOMATCH); 227 } 228 } 229 230 /* match struct setup */ 231 m->g = g; 232 m->eflags = eflags; 233 m->pmatch = NULL; 234 m->lastpos = NULL; 235 m->offp = string; 236 m->beginp = start; 237 m->endp = stop; 238 STATESETUP(m, 4); 239 SETUP(m->st); 240 SETUP(m->fresh); 241 SETUP(m->tmp); 242 SETUP(m->empty); 243 CLEAR(m->empty); 244 ZAPSTATE(&m->mbs); 245 246 /* Adjust start according to moffset, to speed things up */ 247 if (g->moffset > -1) 248 start = ((dp - g->moffset) < start) ? start : dp - g->moffset; 249 250 SP("mloop", m->st, *start); 251 252 /* this loop does only one repetition except for backrefs */ 253 for (;;) { 254 endp = fast(m, start, stop, gf, gl); 255 if (endp == NULL) { /* a miss */ 256 if (m->pmatch != NULL) 257 free((char *)m->pmatch); 258 if (m->lastpos != NULL) 259 free((char *)m->lastpos); 260 STATETEARDOWN(m); 261 return(REG_NOMATCH); 262 } 263 if (nmatch == 0 && !g->backrefs) 264 break; /* no further info needed */ 265 266 /* where? */ 267 assert(m->coldp != NULL); 268 for (;;) { 269 NOTE("finding start"); 270 endp = slow(m, m->coldp, stop, gf, gl); 271 if (endp != NULL) 272 break; 273 assert(m->coldp < m->endp); 274 m->coldp += XMBRTOWC(NULL, m->coldp, 275 m->endp - m->coldp, &m->mbs, 0); 276 } 277 if (nmatch == 1 && !g->backrefs) 278 break; /* no further info needed */ 279 280 /* oh my, he wants the subexpressions... */ 281 if (m->pmatch == NULL) 282 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 283 sizeof(regmatch_t)); 284 if (m->pmatch == NULL) { 285 STATETEARDOWN(m); 286 return(REG_ESPACE); 287 } 288 for (i = 1; i <= m->g->nsub; i++) 289 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 290 if (!g->backrefs && !(m->eflags®_BACKR)) { 291 NOTE("dissecting"); 292 dp = dissect(m, m->coldp, endp, gf, gl); 293 } else { 294 if (g->nplus > 0 && m->lastpos == NULL) 295 m->lastpos = malloc((g->nplus+1) * 296 sizeof(const char *)); 297 if (g->nplus > 0 && m->lastpos == NULL) { 298 free(m->pmatch); 299 STATETEARDOWN(m); 300 return(REG_ESPACE); 301 } 302 NOTE("backref dissect"); 303 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 304 } 305 if (dp != NULL) 306 break; 307 308 /* uh-oh... we couldn't find a subexpression-level match */ 309 assert(g->backrefs); /* must be back references doing it */ 310 assert(g->nplus == 0 || m->lastpos != NULL); 311 for (;;) { 312 if (dp != NULL || endp <= m->coldp) 313 break; /* defeat */ 314 NOTE("backoff"); 315 endp = slow(m, m->coldp, endp-1, gf, gl); 316 if (endp == NULL) 317 break; /* defeat */ 318 /* try it on a shorter possibility */ 319 #ifndef NDEBUG 320 for (i = 1; i <= m->g->nsub; i++) { 321 assert(m->pmatch[i].rm_so == -1); 322 assert(m->pmatch[i].rm_eo == -1); 323 } 324 #endif 325 NOTE("backoff dissect"); 326 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 327 } 328 assert(dp == NULL || dp == endp); 329 if (dp != NULL) /* found a shorter one */ 330 break; 331 332 /* despite initial appearances, there is no match here */ 333 NOTE("false alarm"); 334 /* recycle starting later */ 335 start = m->coldp + XMBRTOWC(NULL, m->coldp, 336 stop - m->coldp, &m->mbs, 0); 337 assert(start <= stop); 338 } 339 340 /* fill in the details if requested */ 341 if (nmatch > 0) { 342 pmatch[0].rm_so = m->coldp - m->offp; 343 pmatch[0].rm_eo = endp - m->offp; 344 } 345 if (nmatch > 1) { 346 assert(m->pmatch != NULL); 347 for (i = 1; i < nmatch; i++) 348 if (i <= m->g->nsub) 349 pmatch[i] = m->pmatch[i]; 350 else { 351 pmatch[i].rm_so = -1; 352 pmatch[i].rm_eo = -1; 353 } 354 } 355 356 if (m->pmatch != NULL) 357 free((char *)m->pmatch); 358 if (m->lastpos != NULL) 359 free((char *)m->lastpos); 360 STATETEARDOWN(m); 361 return(0); 362 } 363 364 /* 365 - dissect - figure out what matched what, no back references 366 == static const char *dissect(struct match *m, const char *start, \ 367 == const char *stop, sopno startst, sopno stopst); 368 */ 369 static const char * /* == stop (success) always */ 370 dissect(struct match *m, 371 const char *start, 372 const char *stop, 373 sopno startst, 374 sopno stopst) 375 { 376 int i; 377 sopno ss; /* start sop of current subRE */ 378 sopno es; /* end sop of current subRE */ 379 const char *sp; /* start of string matched by it */ 380 const char *stp; /* string matched by it cannot pass here */ 381 const char *rest; /* start of rest of string */ 382 const char *tail; /* string unmatched by rest of RE */ 383 sopno ssub; /* start sop of subsubRE */ 384 sopno esub; /* end sop of subsubRE */ 385 const char *ssp; /* start of string matched by subsubRE */ 386 const char *sep; /* end of string matched by subsubRE */ 387 const char *oldssp; /* previous ssp */ 388 const char *dp; 389 390 AT("diss", start, stop, startst, stopst); 391 sp = start; 392 for (ss = startst; ss < stopst; ss = es) { 393 /* identify end of subRE */ 394 es = ss; 395 switch (OP(m->g->strip[es])) { 396 case OPLUS_: 397 case OQUEST_: 398 es += OPND(m->g->strip[es]); 399 break; 400 case OCH_: 401 while (OP(m->g->strip[es]) != O_CH) 402 es += OPND(m->g->strip[es]); 403 break; 404 } 405 es++; 406 407 /* figure out what it matched */ 408 switch (OP(m->g->strip[ss])) { 409 case OEND: 410 assert(nope); 411 break; 412 case OCHAR: 413 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 414 break; 415 case OBOL: 416 case OEOL: 417 case OBOW: 418 case OEOW: 419 break; 420 case OANY: 421 case OANYOF: 422 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 423 break; 424 case OBACK_: 425 case O_BACK: 426 assert(nope); 427 break; 428 /* cases where length of match is hard to find */ 429 case OQUEST_: 430 stp = stop; 431 for (;;) { 432 /* how long could this one be? */ 433 rest = slow(m, sp, stp, ss, es); 434 assert(rest != NULL); /* it did match */ 435 /* could the rest match the rest? */ 436 tail = slow(m, rest, stop, es, stopst); 437 if (tail == stop) 438 break; /* yes! */ 439 /* no -- try a shorter match for this one */ 440 stp = rest - 1; 441 assert(stp >= sp); /* it did work */ 442 } 443 ssub = ss + 1; 444 esub = es - 1; 445 /* did innards match? */ 446 if (slow(m, sp, rest, ssub, esub) != NULL) { 447 dp = dissect(m, sp, rest, ssub, esub); 448 assert(dp == rest); 449 } else /* no */ 450 assert(sp == rest); 451 sp = rest; 452 break; 453 case OPLUS_: 454 stp = stop; 455 for (;;) { 456 /* how long could this one be? */ 457 rest = slow(m, sp, stp, ss, es); 458 assert(rest != NULL); /* it did match */ 459 /* could the rest match the rest? */ 460 tail = slow(m, rest, stop, es, stopst); 461 if (tail == stop) 462 break; /* yes! */ 463 /* no -- try a shorter match for this one */ 464 stp = rest - 1; 465 assert(stp >= sp); /* it did work */ 466 } 467 ssub = ss + 1; 468 esub = es - 1; 469 ssp = sp; 470 oldssp = ssp; 471 for (;;) { /* find last match of innards */ 472 sep = slow(m, ssp, rest, ssub, esub); 473 if (sep == NULL || sep == ssp) 474 break; /* failed or matched null */ 475 oldssp = ssp; /* on to next try */ 476 ssp = sep; 477 } 478 if (sep == NULL) { 479 /* last successful match */ 480 sep = ssp; 481 ssp = oldssp; 482 } 483 assert(sep == rest); /* must exhaust substring */ 484 assert(slow(m, ssp, sep, ssub, esub) == rest); 485 dp = dissect(m, ssp, sep, ssub, esub); 486 assert(dp == sep); 487 sp = rest; 488 break; 489 case OCH_: 490 stp = stop; 491 for (;;) { 492 /* how long could this one be? */ 493 rest = slow(m, sp, stp, ss, es); 494 assert(rest != NULL); /* it did match */ 495 /* could the rest match the rest? */ 496 tail = slow(m, rest, stop, es, stopst); 497 if (tail == stop) 498 break; /* yes! */ 499 /* no -- try a shorter match for this one */ 500 stp = rest - 1; 501 assert(stp >= sp); /* it did work */ 502 } 503 ssub = ss + 1; 504 esub = ss + OPND(m->g->strip[ss]) - 1; 505 assert(OP(m->g->strip[esub]) == OOR1); 506 for (;;) { /* find first matching branch */ 507 if (slow(m, sp, rest, ssub, esub) == rest) 508 break; /* it matched all of it */ 509 /* that one missed, try next one */ 510 assert(OP(m->g->strip[esub]) == OOR1); 511 esub++; 512 assert(OP(m->g->strip[esub]) == OOR2); 513 ssub = esub + 1; 514 esub += OPND(m->g->strip[esub]); 515 if (OP(m->g->strip[esub]) == OOR2) 516 esub--; 517 else 518 assert(OP(m->g->strip[esub]) == O_CH); 519 } 520 dp = dissect(m, sp, rest, ssub, esub); 521 assert(dp == rest); 522 sp = rest; 523 break; 524 case O_PLUS: 525 case O_QUEST: 526 case OOR1: 527 case OOR2: 528 case O_CH: 529 assert(nope); 530 break; 531 case OLPAREN: 532 i = OPND(m->g->strip[ss]); 533 assert(0 < i && i <= m->g->nsub); 534 m->pmatch[i].rm_so = sp - m->offp; 535 break; 536 case ORPAREN: 537 i = OPND(m->g->strip[ss]); 538 assert(0 < i && i <= m->g->nsub); 539 m->pmatch[i].rm_eo = sp - m->offp; 540 break; 541 default: /* uh oh */ 542 assert(nope); 543 break; 544 } 545 } 546 547 assert(sp == stop); 548 return(sp); 549 } 550 551 /* 552 - backref - figure out what matched what, figuring in back references 553 == static const char *backref(struct match *m, const char *start, \ 554 == const char *stop, sopno startst, sopno stopst, sopno lev); 555 */ 556 static const char * /* == stop (success) or NULL (failure) */ 557 backref(struct match *m, 558 const char *start, 559 const char *stop, 560 sopno startst, 561 sopno stopst, 562 sopno lev, /* PLUS nesting level */ 563 int rec) 564 { 565 int i; 566 sopno ss; /* start sop of current subRE */ 567 const char *sp; /* start of string matched by it */ 568 sopno ssub; /* start sop of subsubRE */ 569 sopno esub; /* end sop of subsubRE */ 570 const char *ssp; /* start of string matched by subsubRE */ 571 const char *dp; 572 size_t len; 573 int hard; 574 sop s; 575 regoff_t offsave; 576 cset *cs; 577 wint_t wc; 578 579 AT("back", start, stop, startst, stopst); 580 sp = start; 581 582 /* get as far as we can with easy stuff */ 583 hard = 0; 584 for (ss = startst; !hard && ss < stopst; ss++) 585 switch (OP(s = m->g->strip[ss])) { 586 case OCHAR: 587 if (sp == stop) 588 return(NULL); 589 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 590 if (wc != OPND(s)) 591 return(NULL); 592 break; 593 case OANY: 594 if (sp == stop) 595 return(NULL); 596 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 597 if (wc == BADCHAR) 598 return (NULL); 599 break; 600 case OANYOF: 601 if (sp == stop) 602 return (NULL); 603 cs = &m->g->sets[OPND(s)]; 604 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 605 if (wc == BADCHAR || !CHIN(cs, wc)) 606 return(NULL); 607 break; 608 case OBOL: 609 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 610 (sp < m->endp && *(sp-1) == '\n' && 611 (m->g->cflags®_NEWLINE)) ) 612 { /* yes */ } 613 else 614 return(NULL); 615 break; 616 case OEOL: 617 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 618 (sp < m->endp && *sp == '\n' && 619 (m->g->cflags®_NEWLINE)) ) 620 { /* yes */ } 621 else 622 return(NULL); 623 break; 624 case OBOW: 625 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 626 (sp < m->endp && *(sp-1) == '\n' && 627 (m->g->cflags®_NEWLINE)) || 628 (sp > m->beginp && 629 !ISWORD(*(sp-1))) ) && 630 (sp < m->endp && ISWORD(*sp)) ) 631 { /* yes */ } 632 else 633 return(NULL); 634 break; 635 case OEOW: 636 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 637 (sp < m->endp && *sp == '\n' && 638 (m->g->cflags®_NEWLINE)) || 639 (sp < m->endp && !ISWORD(*sp)) ) && 640 (sp > m->beginp && ISWORD(*(sp-1))) ) 641 { /* yes */ } 642 else 643 return(NULL); 644 break; 645 case O_QUEST: 646 break; 647 case OOR1: /* matches null but needs to skip */ 648 ss++; 649 s = m->g->strip[ss]; 650 do { 651 assert(OP(s) == OOR2); 652 ss += OPND(s); 653 } while (OP(s = m->g->strip[ss]) != O_CH); 654 /* note that the ss++ gets us past the O_CH */ 655 break; 656 default: /* have to make a choice */ 657 hard = 1; 658 break; 659 } 660 if (!hard) { /* that was it! */ 661 if (sp != stop) 662 return(NULL); 663 return(sp); 664 } 665 ss--; /* adjust for the for's final increment */ 666 667 /* the hard stuff */ 668 AT("hard", sp, stop, ss, stopst); 669 s = m->g->strip[ss]; 670 switch (OP(s)) { 671 case OBACK_: /* the vilest depths */ 672 i = OPND(s); 673 assert(0 < i && i <= m->g->nsub); 674 if (m->pmatch[i].rm_eo == -1) 675 return(NULL); 676 assert(m->pmatch[i].rm_so != -1); 677 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 678 if (len == 0 && rec++ > MAX_RECURSION) 679 return(NULL); 680 assert(stop - m->beginp >= len); 681 if (sp > stop - len) 682 return(NULL); /* not enough left to match */ 683 ssp = m->offp + m->pmatch[i].rm_so; 684 if (memcmp(sp, ssp, len) != 0) 685 return(NULL); 686 while (m->g->strip[ss] != SOP(O_BACK, i)) 687 ss++; 688 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 689 break; 690 case OQUEST_: /* to null or not */ 691 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 692 if (dp != NULL) 693 return(dp); /* not */ 694 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 695 break; 696 case OPLUS_: 697 assert(m->lastpos != NULL); 698 assert(lev+1 <= m->g->nplus); 699 m->lastpos[lev+1] = sp; 700 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 701 break; 702 case O_PLUS: 703 if (sp == m->lastpos[lev]) /* last pass matched null */ 704 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 705 /* try another pass */ 706 m->lastpos[lev] = sp; 707 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 708 if (dp == NULL) 709 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 710 else 711 return(dp); 712 break; 713 case OCH_: /* find the right one, if any */ 714 ssub = ss + 1; 715 esub = ss + OPND(s) - 1; 716 assert(OP(m->g->strip[esub]) == OOR1); 717 for (;;) { /* find first matching branch */ 718 dp = backref(m, sp, stop, ssub, esub, lev, rec); 719 if (dp != NULL) 720 return(dp); 721 /* that one missed, try next one */ 722 if (OP(m->g->strip[esub]) == O_CH) 723 return(NULL); /* there is none */ 724 esub++; 725 assert(OP(m->g->strip[esub]) == OOR2); 726 ssub = esub + 1; 727 esub += OPND(m->g->strip[esub]); 728 if (OP(m->g->strip[esub]) == OOR2) 729 esub--; 730 else 731 assert(OP(m->g->strip[esub]) == O_CH); 732 } 733 break; 734 case OLPAREN: /* must undo assignment if rest fails */ 735 i = OPND(s); 736 assert(0 < i && i <= m->g->nsub); 737 offsave = m->pmatch[i].rm_so; 738 m->pmatch[i].rm_so = sp - m->offp; 739 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 740 if (dp != NULL) 741 return(dp); 742 m->pmatch[i].rm_so = offsave; 743 return(NULL); 744 break; 745 case ORPAREN: /* must undo assignment if rest fails */ 746 i = OPND(s); 747 assert(0 < i && i <= m->g->nsub); 748 offsave = m->pmatch[i].rm_eo; 749 m->pmatch[i].rm_eo = sp - m->offp; 750 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 751 if (dp != NULL) 752 return(dp); 753 m->pmatch[i].rm_eo = offsave; 754 return(NULL); 755 break; 756 default: /* uh oh */ 757 assert(nope); 758 break; 759 } 760 761 /* "can't happen" */ 762 assert(nope); 763 /* NOTREACHED */ 764 return "shut up gcc"; 765 } 766 767 /* 768 - fast - step through the string at top speed 769 == static const char *fast(struct match *m, const char *start, \ 770 == const char *stop, sopno startst, sopno stopst); 771 */ 772 static const char * /* where tentative match ended, or NULL */ 773 fast( struct match *m, 774 const char *start, 775 const char *stop, 776 sopno startst, 777 sopno stopst) 778 { 779 states st = m->st; 780 states fresh = m->fresh; 781 states tmp = m->tmp; 782 const char *p = start; 783 wint_t c; 784 wint_t lastc; /* previous c */ 785 wint_t flagch; 786 int i; 787 const char *coldp; /* last p after which no match was underway */ 788 size_t clen; 789 790 CLEAR(st); 791 SET1(st, startst); 792 SP("fast", st, *p); 793 st = step(m->g, startst, stopst, st, NOTHING, st); 794 ASSIGN(fresh, st); 795 SP("start", st, *p); 796 coldp = NULL; 797 if (start == m->beginp) 798 c = OUT; 799 else { 800 /* 801 * XXX Wrong if the previous character was multi-byte. 802 * Newline never is (in encodings supported by FreeBSD), 803 * so this only breaks the ISWORD tests below. 804 */ 805 c = (uch)*(start - 1); 806 } 807 for (;;) { 808 /* next character */ 809 lastc = c; 810 if (p == m->endp) { 811 clen = 0; 812 c = OUT; 813 } else 814 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 815 if (EQ(st, fresh)) 816 coldp = p; 817 818 /* is there an EOL and/or BOL between lastc and c? */ 819 flagch = '\0'; 820 i = 0; 821 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 822 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 823 flagch = BOL; 824 i = m->g->nbol; 825 } 826 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 827 (c == OUT && !(m->eflags®_NOTEOL)) ) { 828 flagch = (flagch == BOL) ? BOLEOL : EOL; 829 i += m->g->neol; 830 } 831 if (i != 0) { 832 for (; i > 0; i--) 833 st = step(m->g, startst, stopst, st, flagch, st); 834 SP("boleol", st, c); 835 } 836 837 /* how about a word boundary? */ 838 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 839 (c != OUT && ISWORD(c)) ) { 840 flagch = BOW; 841 } 842 if ( (lastc != OUT && ISWORD(lastc)) && 843 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 844 flagch = EOW; 845 } 846 if (flagch == BOW || flagch == EOW) { 847 st = step(m->g, startst, stopst, st, flagch, st); 848 SP("boweow", st, c); 849 } 850 851 /* are we done? */ 852 if (ISSET(st, stopst) || p == stop || clen > stop - p) 853 break; /* NOTE BREAK OUT */ 854 855 /* no, we must deal with this character */ 856 ASSIGN(tmp, st); 857 ASSIGN(st, fresh); 858 assert(c != OUT); 859 st = step(m->g, startst, stopst, tmp, c, st); 860 SP("aft", st, c); 861 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 862 p += clen; 863 } 864 865 assert(coldp != NULL); 866 m->coldp = coldp; 867 if (ISSET(st, stopst)) 868 return(p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 869 else 870 return(NULL); 871 } 872 873 /* 874 - slow - step through the string more deliberately 875 == static const char *slow(struct match *m, const char *start, \ 876 == const char *stop, sopno startst, sopno stopst); 877 */ 878 static const char * /* where it ended */ 879 slow( struct match *m, 880 const char *start, 881 const char *stop, 882 sopno startst, 883 sopno stopst) 884 { 885 states st = m->st; 886 states empty = m->empty; 887 states tmp = m->tmp; 888 const char *p = start; 889 wint_t c; 890 wint_t lastc; /* previous c */ 891 wint_t flagch; 892 int i; 893 const char *matchp; /* last p at which a match ended */ 894 size_t clen; 895 896 AT("slow", start, stop, startst, stopst); 897 CLEAR(st); 898 SET1(st, startst); 899 SP("sstart", st, *p); 900 st = step(m->g, startst, stopst, st, NOTHING, st); 901 matchp = NULL; 902 if (start == m->beginp) 903 c = OUT; 904 else { 905 /* 906 * XXX Wrong if the previous character was multi-byte. 907 * Newline never is (in encodings supported by FreeBSD), 908 * so this only breaks the ISWORD tests below. 909 */ 910 c = (uch)*(start - 1); 911 } 912 for (;;) { 913 /* next character */ 914 lastc = c; 915 if (p == m->endp) { 916 c = OUT; 917 clen = 0; 918 } else 919 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 920 921 /* is there an EOL and/or BOL between lastc and c? */ 922 flagch = '\0'; 923 i = 0; 924 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 925 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 926 flagch = BOL; 927 i = m->g->nbol; 928 } 929 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 930 (c == OUT && !(m->eflags®_NOTEOL)) ) { 931 flagch = (flagch == BOL) ? BOLEOL : EOL; 932 i += m->g->neol; 933 } 934 if (i != 0) { 935 for (; i > 0; i--) 936 st = step(m->g, startst, stopst, st, flagch, st); 937 SP("sboleol", st, c); 938 } 939 940 /* how about a word boundary? */ 941 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 942 (c != OUT && ISWORD(c)) ) { 943 flagch = BOW; 944 } 945 if ( (lastc != OUT && ISWORD(lastc)) && 946 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 947 flagch = EOW; 948 } 949 if (flagch == BOW || flagch == EOW) { 950 st = step(m->g, startst, stopst, st, flagch, st); 951 SP("sboweow", st, c); 952 } 953 954 /* are we done? */ 955 if (ISSET(st, stopst)) 956 matchp = p; 957 if (EQ(st, empty) || p == stop || clen > stop - p) 958 break; /* NOTE BREAK OUT */ 959 960 /* no, we must deal with this character */ 961 ASSIGN(tmp, st); 962 ASSIGN(st, empty); 963 assert(c != OUT); 964 st = step(m->g, startst, stopst, tmp, c, st); 965 SP("saft", st, c); 966 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 967 p += clen; 968 } 969 970 return(matchp); 971 } 972 973 974 /* 975 - step - map set of states reachable before char to set reachable after 976 == static states step(struct re_guts *g, sopno start, sopno stop, \ 977 == states bef, int ch, states aft); 978 == #define BOL (OUT-1) 979 == #define EOL (BOL-1) 980 == #define BOLEOL (BOL-2) 981 == #define NOTHING (BOL-3) 982 == #define BOW (BOL-4) 983 == #define EOW (BOL-5) 984 == #define BADCHAR (BOL-6) 985 == #define NONCHAR(c) ((c) <= OUT) 986 */ 987 static states 988 step(struct re_guts *g, 989 sopno start, /* start state within strip */ 990 sopno stop, /* state after stop state within strip */ 991 states bef, /* states reachable before */ 992 wint_t ch, /* character or NONCHAR code */ 993 states aft) /* states already known reachable after */ 994 { 995 cset *cs; 996 sop s; 997 sopno pc; 998 onestate here; /* note, macros know this name */ 999 sopno look; 1000 int i; 1001 1002 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 1003 s = g->strip[pc]; 1004 switch (OP(s)) { 1005 case OEND: 1006 assert(pc == stop-1); 1007 break; 1008 case OCHAR: 1009 /* only characters can match */ 1010 assert(!NONCHAR(ch) || ch != OPND(s)); 1011 if (ch == OPND(s)) 1012 FWD(aft, bef, 1); 1013 break; 1014 case OBOL: 1015 if (ch == BOL || ch == BOLEOL) 1016 FWD(aft, bef, 1); 1017 break; 1018 case OEOL: 1019 if (ch == EOL || ch == BOLEOL) 1020 FWD(aft, bef, 1); 1021 break; 1022 case OBOW: 1023 if (ch == BOW) 1024 FWD(aft, bef, 1); 1025 break; 1026 case OEOW: 1027 if (ch == EOW) 1028 FWD(aft, bef, 1); 1029 break; 1030 case OANY: 1031 if (!NONCHAR(ch)) 1032 FWD(aft, bef, 1); 1033 break; 1034 case OANYOF: 1035 cs = &g->sets[OPND(s)]; 1036 if (!NONCHAR(ch) && CHIN(cs, ch)) 1037 FWD(aft, bef, 1); 1038 break; 1039 case OBACK_: /* ignored here */ 1040 case O_BACK: 1041 FWD(aft, aft, 1); 1042 break; 1043 case OPLUS_: /* forward, this is just an empty */ 1044 FWD(aft, aft, 1); 1045 break; 1046 case O_PLUS: /* both forward and back */ 1047 FWD(aft, aft, 1); 1048 i = ISSETBACK(aft, OPND(s)); 1049 BACK(aft, aft, OPND(s)); 1050 if (!i && ISSETBACK(aft, OPND(s))) { 1051 /* oho, must reconsider loop body */ 1052 pc -= OPND(s) + 1; 1053 INIT(here, pc); 1054 } 1055 break; 1056 case OQUEST_: /* two branches, both forward */ 1057 FWD(aft, aft, 1); 1058 FWD(aft, aft, OPND(s)); 1059 break; 1060 case O_QUEST: /* just an empty */ 1061 FWD(aft, aft, 1); 1062 break; 1063 case OLPAREN: /* not significant here */ 1064 case ORPAREN: 1065 FWD(aft, aft, 1); 1066 break; 1067 case OCH_: /* mark the first two branches */ 1068 FWD(aft, aft, 1); 1069 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1070 FWD(aft, aft, OPND(s)); 1071 break; 1072 case OOR1: /* done a branch, find the O_CH */ 1073 if (ISSTATEIN(aft, here)) { 1074 for (look = 1; 1075 OP(s = g->strip[pc+look]) != O_CH; 1076 look += OPND(s)) 1077 assert(OP(s) == OOR2); 1078 FWD(aft, aft, look + 1); 1079 } 1080 break; 1081 case OOR2: /* propagate OCH_'s marking */ 1082 FWD(aft, aft, 1); 1083 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1084 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1085 FWD(aft, aft, OPND(s)); 1086 } 1087 break; 1088 case O_CH: /* just empty */ 1089 FWD(aft, aft, 1); 1090 break; 1091 default: /* ooooops... */ 1092 assert(nope); 1093 break; 1094 } 1095 } 1096 1097 return(aft); 1098 } 1099 1100 #ifdef REDEBUG 1101 /* 1102 - print - print a set of states 1103 == #ifdef REDEBUG 1104 == static void print(struct match *m, const char *caption, states st, \ 1105 == int ch, FILE *d); 1106 == #endif 1107 */ 1108 static void 1109 print(struct match *m, 1110 const char *caption, 1111 states st, 1112 int ch, 1113 FILE *d) 1114 { 1115 struct re_guts *g = m->g; 1116 int i; 1117 int first = 1; 1118 1119 if (!(m->eflags®_TRACE)) 1120 return; 1121 1122 fprintf(d, "%s", caption); 1123 if (ch != '\0') 1124 fprintf(d, " %s", pchar(ch)); 1125 for (i = 0; i < g->nstates; i++) 1126 if (ISSET(st, i)) { 1127 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1128 first = 0; 1129 } 1130 fprintf(d, "\n"); 1131 } 1132 1133 /* 1134 - at - print current situation 1135 == #ifdef REDEBUG 1136 == static void at(struct match *m, const char *title, const char *start, \ 1137 == const char *stop, sopno startst, sopno stopst); 1138 == #endif 1139 */ 1140 static void 1141 at( struct match *m, 1142 const char *title, 1143 const char *start, 1144 const char *stop, 1145 sopno startst, 1146 sopno stopst) 1147 { 1148 if (!(m->eflags®_TRACE)) 1149 return; 1150 1151 printf("%s %s-", title, pchar(*start)); 1152 printf("%s ", pchar(*stop)); 1153 printf("%ld-%ld\n", (long)startst, (long)stopst); 1154 } 1155 1156 #ifndef PCHARDONE 1157 #define PCHARDONE /* never again */ 1158 /* 1159 - pchar - make a character printable 1160 == #ifdef REDEBUG 1161 == static const char *pchar(int ch); 1162 == #endif 1163 * 1164 * Is this identical to regchar() over in debug.c? Well, yes. But a 1165 * duplicate here avoids having a debugging-capable regexec.o tied to 1166 * a matching debug.o, and this is convenient. It all disappears in 1167 * the non-debug compilation anyway, so it doesn't matter much. 1168 */ 1169 static const char * /* -> representation */ 1170 pchar(int ch) 1171 { 1172 static char pbuf[10]; 1173 1174 if (isprint((uch)ch) || ch == ' ') 1175 sprintf(pbuf, "%c", ch); 1176 else 1177 sprintf(pbuf, "\\%o", ch); 1178 return(pbuf); 1179 } 1180 #endif 1181 #endif 1182 1183 #undef matcher 1184 #undef fast 1185 #undef slow 1186 #undef dissect 1187 #undef backref 1188 #undef step 1189 #undef print 1190 #undef at 1191 #undef match 1192