1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)engine.c 8.5 (Berkeley) 3/20/94 34 */ 35 36 #include <sys/cdefs.h> 37 __FBSDID("$FreeBSD$"); 38 39 /* 40 * The matching engine and friends. This file is #included by regexec.c 41 * after suitable #defines of a variety of macros used herein, so that 42 * different state representations can be used without duplicating masses 43 * of code. 44 */ 45 46 #ifdef SNAMES 47 #define matcher smatcher 48 #define fast sfast 49 #define slow sslow 50 #define dissect sdissect 51 #define backref sbackref 52 #define step sstep 53 #define print sprint 54 #define at sat 55 #define match smat 56 #endif 57 #ifdef LNAMES 58 #define matcher lmatcher 59 #define fast lfast 60 #define slow lslow 61 #define dissect ldissect 62 #define backref lbackref 63 #define step lstep 64 #define print lprint 65 #define at lat 66 #define match lmat 67 #endif 68 #ifdef MNAMES 69 #define matcher mmatcher 70 #define fast mfast 71 #define slow mslow 72 #define dissect mdissect 73 #define backref mbackref 74 #define step mstep 75 #define print mprint 76 #define at mat 77 #define match mmat 78 #endif 79 80 /* another structure passed up and down to avoid zillions of parameters */ 81 struct match { 82 struct re_guts *g; 83 int eflags; 84 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 85 const char *offp; /* offsets work from here */ 86 const char *beginp; /* start of string -- virtual NUL precedes */ 87 const char *endp; /* end of string -- virtual NUL here */ 88 const char *coldp; /* can be no match starting before here */ 89 const char **lastpos; /* [nplus+1] */ 90 STATEVARS; 91 states st; /* current states */ 92 states fresh; /* states for a fresh start */ 93 states tmp; /* temporary */ 94 states empty; /* empty set of states */ 95 mbstate_t mbs; /* multibyte conversion state */ 96 }; 97 98 /* ========= begin header generated by ./mkh ========= */ 99 #ifdef __cplusplus 100 extern "C" { 101 #endif 102 103 /* === engine.c === */ 104 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 105 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 106 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); 107 static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 108 static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 109 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); 110 #define MAX_RECURSION 100 111 #define BOL (OUT-1) 112 #define EOL (BOL-1) 113 #define BOLEOL (BOL-2) 114 #define NOTHING (BOL-3) 115 #define BOW (BOL-4) 116 #define EOW (BOL-5) 117 #define BADCHAR (BOL-6) 118 #define NONCHAR(c) ((c) <= OUT) 119 #ifdef REDEBUG 120 static void print(struct match *m, const char *caption, states st, int ch, FILE *d); 121 #endif 122 #ifdef REDEBUG 123 static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst); 124 #endif 125 #ifdef REDEBUG 126 static const char *pchar(int ch); 127 #endif 128 129 #ifdef __cplusplus 130 } 131 #endif 132 /* ========= end header generated by ./mkh ========= */ 133 134 #ifdef REDEBUG 135 #define SP(t, s, c) print(m, t, s, c, stdout) 136 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 137 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 138 #else 139 #define SP(t, s, c) /* nothing */ 140 #define AT(t, p1, p2, s1, s2) /* nothing */ 141 #define NOTE(s) /* nothing */ 142 #endif 143 144 /* 145 - matcher - the actual matching engine 146 == static int matcher(struct re_guts *g, const char *string, \ 147 == size_t nmatch, regmatch_t pmatch[], int eflags); 148 */ 149 static int /* 0 success, REG_NOMATCH failure */ 150 matcher(struct re_guts *g, 151 const char *string, 152 size_t nmatch, 153 regmatch_t pmatch[], 154 int eflags) 155 { 156 const char *endp; 157 int i; 158 struct match mv; 159 struct match *m = &mv; 160 const char *dp; 161 const sopno gf = g->firststate+1; /* +1 for OEND */ 162 const sopno gl = g->laststate; 163 const char *start; 164 const char *stop; 165 /* Boyer-Moore algorithms variables */ 166 const char *pp; 167 int cj, mj; 168 const char *mustfirst; 169 const char *mustlast; 170 int *matchjump; 171 int *charjump; 172 173 /* simplify the situation where possible */ 174 if (g->cflags®_NOSUB) 175 nmatch = 0; 176 if (eflags®_STARTEND) { 177 start = string + pmatch[0].rm_so; 178 stop = string + pmatch[0].rm_eo; 179 } else { 180 start = string; 181 stop = start + strlen(start); 182 } 183 if (stop < start) 184 return(REG_INVARG); 185 186 /* prescreening; this does wonders for this rather slow code */ 187 if (g->must != NULL) { 188 if (g->charjump != NULL && g->matchjump != NULL) { 189 mustfirst = g->must; 190 mustlast = g->must + g->mlen - 1; 191 charjump = g->charjump; 192 matchjump = g->matchjump; 193 pp = mustlast; 194 for (dp = start+g->mlen-1; dp < stop;) { 195 /* Fast skip non-matches */ 196 while (dp < stop && charjump[(int)*dp]) 197 dp += charjump[(int)*dp]; 198 199 if (dp >= stop) 200 break; 201 202 /* Greedy matcher */ 203 /* We depend on not being used for 204 * for strings of length 1 205 */ 206 while (*--dp == *--pp && pp != mustfirst); 207 208 if (*dp == *pp) 209 break; 210 211 /* Jump to next possible match */ 212 mj = matchjump[pp - mustfirst]; 213 cj = charjump[(int)*dp]; 214 dp += (cj < mj ? mj : cj); 215 pp = mustlast; 216 } 217 if (pp != mustfirst) 218 return(REG_NOMATCH); 219 } else { 220 for (dp = start; dp < stop; dp++) 221 if (*dp == g->must[0] && 222 stop - dp >= g->mlen && 223 memcmp(dp, g->must, (size_t)g->mlen) == 0) 224 break; 225 if (dp == stop) /* we didn't find g->must */ 226 return(REG_NOMATCH); 227 } 228 } 229 230 /* match struct setup */ 231 m->g = g; 232 m->eflags = eflags; 233 m->pmatch = NULL; 234 m->lastpos = NULL; 235 m->offp = string; 236 m->beginp = start; 237 m->endp = stop; 238 STATESETUP(m, 4); 239 SETUP(m->st); 240 SETUP(m->fresh); 241 SETUP(m->tmp); 242 SETUP(m->empty); 243 CLEAR(m->empty); 244 ZAPSTATE(&m->mbs); 245 246 /* Adjust start according to moffset, to speed things up */ 247 if (g->moffset > -1) 248 start = ((dp - g->moffset) < start) ? start : dp - g->moffset; 249 250 /* this loop does only one repetition except for backrefs */ 251 for (;;) { 252 endp = fast(m, start, stop, gf, gl); 253 if (endp == NULL) { /* a miss */ 254 if (m->pmatch != NULL) 255 free((char *)m->pmatch); 256 if (m->lastpos != NULL) 257 free((char *)m->lastpos); 258 STATETEARDOWN(m); 259 return(REG_NOMATCH); 260 } 261 if (nmatch == 0 && !g->backrefs) 262 break; /* no further info needed */ 263 264 /* where? */ 265 assert(m->coldp != NULL); 266 for (;;) { 267 NOTE("finding start"); 268 endp = slow(m, m->coldp, stop, gf, gl); 269 if (endp != NULL) 270 break; 271 assert(m->coldp < m->endp); 272 m->coldp += XMBRTOWC(NULL, m->coldp, 273 m->endp - m->coldp, &m->mbs, 0); 274 } 275 if (nmatch == 1 && !g->backrefs) 276 break; /* no further info needed */ 277 278 /* oh my, he wants the subexpressions... */ 279 if (m->pmatch == NULL) 280 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 281 sizeof(regmatch_t)); 282 if (m->pmatch == NULL) { 283 STATETEARDOWN(m); 284 return(REG_ESPACE); 285 } 286 for (i = 1; i <= m->g->nsub; i++) 287 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 288 if (!g->backrefs && !(m->eflags®_BACKR)) { 289 NOTE("dissecting"); 290 dp = dissect(m, m->coldp, endp, gf, gl); 291 } else { 292 if (g->nplus > 0 && m->lastpos == NULL) 293 m->lastpos = malloc((g->nplus+1) * 294 sizeof(const char *)); 295 if (g->nplus > 0 && m->lastpos == NULL) { 296 free(m->pmatch); 297 STATETEARDOWN(m); 298 return(REG_ESPACE); 299 } 300 NOTE("backref dissect"); 301 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 302 } 303 if (dp != NULL) 304 break; 305 306 /* uh-oh... we couldn't find a subexpression-level match */ 307 assert(g->backrefs); /* must be back references doing it */ 308 assert(g->nplus == 0 || m->lastpos != NULL); 309 for (;;) { 310 if (dp != NULL || endp <= m->coldp) 311 break; /* defeat */ 312 NOTE("backoff"); 313 endp = slow(m, m->coldp, endp-1, gf, gl); 314 if (endp == NULL) 315 break; /* defeat */ 316 /* try it on a shorter possibility */ 317 #ifndef NDEBUG 318 for (i = 1; i <= m->g->nsub; i++) { 319 assert(m->pmatch[i].rm_so == -1); 320 assert(m->pmatch[i].rm_eo == -1); 321 } 322 #endif 323 NOTE("backoff dissect"); 324 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 325 } 326 assert(dp == NULL || dp == endp); 327 if (dp != NULL) /* found a shorter one */ 328 break; 329 330 /* despite initial appearances, there is no match here */ 331 NOTE("false alarm"); 332 /* recycle starting later */ 333 start = m->coldp + XMBRTOWC(NULL, m->coldp, 334 stop - m->coldp, &m->mbs, 0); 335 assert(start <= stop); 336 } 337 338 /* fill in the details if requested */ 339 if (nmatch > 0) { 340 pmatch[0].rm_so = m->coldp - m->offp; 341 pmatch[0].rm_eo = endp - m->offp; 342 } 343 if (nmatch > 1) { 344 assert(m->pmatch != NULL); 345 for (i = 1; i < nmatch; i++) 346 if (i <= m->g->nsub) 347 pmatch[i] = m->pmatch[i]; 348 else { 349 pmatch[i].rm_so = -1; 350 pmatch[i].rm_eo = -1; 351 } 352 } 353 354 if (m->pmatch != NULL) 355 free((char *)m->pmatch); 356 if (m->lastpos != NULL) 357 free((char *)m->lastpos); 358 STATETEARDOWN(m); 359 return(0); 360 } 361 362 /* 363 - dissect - figure out what matched what, no back references 364 == static const char *dissect(struct match *m, const char *start, \ 365 == const char *stop, sopno startst, sopno stopst); 366 */ 367 static const char * /* == stop (success) always */ 368 dissect(struct match *m, 369 const char *start, 370 const char *stop, 371 sopno startst, 372 sopno stopst) 373 { 374 int i; 375 sopno ss; /* start sop of current subRE */ 376 sopno es; /* end sop of current subRE */ 377 const char *sp; /* start of string matched by it */ 378 const char *stp; /* string matched by it cannot pass here */ 379 const char *rest; /* start of rest of string */ 380 const char *tail; /* string unmatched by rest of RE */ 381 sopno ssub; /* start sop of subsubRE */ 382 sopno esub; /* end sop of subsubRE */ 383 const char *ssp; /* start of string matched by subsubRE */ 384 const char *sep; /* end of string matched by subsubRE */ 385 const char *oldssp; /* previous ssp */ 386 const char *dp; 387 388 AT("diss", start, stop, startst, stopst); 389 sp = start; 390 for (ss = startst; ss < stopst; ss = es) { 391 /* identify end of subRE */ 392 es = ss; 393 switch (OP(m->g->strip[es])) { 394 case OPLUS_: 395 case OQUEST_: 396 es += OPND(m->g->strip[es]); 397 break; 398 case OCH_: 399 while (OP(m->g->strip[es]) != O_CH) 400 es += OPND(m->g->strip[es]); 401 break; 402 } 403 es++; 404 405 /* figure out what it matched */ 406 switch (OP(m->g->strip[ss])) { 407 case OEND: 408 assert(nope); 409 break; 410 case OCHAR: 411 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 412 break; 413 case OBOL: 414 case OEOL: 415 case OBOW: 416 case OEOW: 417 break; 418 case OANY: 419 case OANYOF: 420 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 421 break; 422 case OBACK_: 423 case O_BACK: 424 assert(nope); 425 break; 426 /* cases where length of match is hard to find */ 427 case OQUEST_: 428 stp = stop; 429 for (;;) { 430 /* how long could this one be? */ 431 rest = slow(m, sp, stp, ss, es); 432 assert(rest != NULL); /* it did match */ 433 /* could the rest match the rest? */ 434 tail = slow(m, rest, stop, es, stopst); 435 if (tail == stop) 436 break; /* yes! */ 437 /* no -- try a shorter match for this one */ 438 stp = rest - 1; 439 assert(stp >= sp); /* it did work */ 440 } 441 ssub = ss + 1; 442 esub = es - 1; 443 /* did innards match? */ 444 if (slow(m, sp, rest, ssub, esub) != NULL) { 445 dp = dissect(m, sp, rest, ssub, esub); 446 assert(dp == rest); 447 } else /* no */ 448 assert(sp == rest); 449 sp = rest; 450 break; 451 case OPLUS_: 452 stp = stop; 453 for (;;) { 454 /* how long could this one be? */ 455 rest = slow(m, sp, stp, ss, es); 456 assert(rest != NULL); /* it did match */ 457 /* could the rest match the rest? */ 458 tail = slow(m, rest, stop, es, stopst); 459 if (tail == stop) 460 break; /* yes! */ 461 /* no -- try a shorter match for this one */ 462 stp = rest - 1; 463 assert(stp >= sp); /* it did work */ 464 } 465 ssub = ss + 1; 466 esub = es - 1; 467 ssp = sp; 468 oldssp = ssp; 469 for (;;) { /* find last match of innards */ 470 sep = slow(m, ssp, rest, ssub, esub); 471 if (sep == NULL || sep == ssp) 472 break; /* failed or matched null */ 473 oldssp = ssp; /* on to next try */ 474 ssp = sep; 475 } 476 if (sep == NULL) { 477 /* last successful match */ 478 sep = ssp; 479 ssp = oldssp; 480 } 481 assert(sep == rest); /* must exhaust substring */ 482 assert(slow(m, ssp, sep, ssub, esub) == rest); 483 dp = dissect(m, ssp, sep, ssub, esub); 484 assert(dp == sep); 485 sp = rest; 486 break; 487 case OCH_: 488 stp = stop; 489 for (;;) { 490 /* how long could this one be? */ 491 rest = slow(m, sp, stp, ss, es); 492 assert(rest != NULL); /* it did match */ 493 /* could the rest match the rest? */ 494 tail = slow(m, rest, stop, es, stopst); 495 if (tail == stop) 496 break; /* yes! */ 497 /* no -- try a shorter match for this one */ 498 stp = rest - 1; 499 assert(stp >= sp); /* it did work */ 500 } 501 ssub = ss + 1; 502 esub = ss + OPND(m->g->strip[ss]) - 1; 503 assert(OP(m->g->strip[esub]) == OOR1); 504 for (;;) { /* find first matching branch */ 505 if (slow(m, sp, rest, ssub, esub) == rest) 506 break; /* it matched all of it */ 507 /* that one missed, try next one */ 508 assert(OP(m->g->strip[esub]) == OOR1); 509 esub++; 510 assert(OP(m->g->strip[esub]) == OOR2); 511 ssub = esub + 1; 512 esub += OPND(m->g->strip[esub]); 513 if (OP(m->g->strip[esub]) == OOR2) 514 esub--; 515 else 516 assert(OP(m->g->strip[esub]) == O_CH); 517 } 518 dp = dissect(m, sp, rest, ssub, esub); 519 assert(dp == rest); 520 sp = rest; 521 break; 522 case O_PLUS: 523 case O_QUEST: 524 case OOR1: 525 case OOR2: 526 case O_CH: 527 assert(nope); 528 break; 529 case OLPAREN: 530 i = OPND(m->g->strip[ss]); 531 assert(0 < i && i <= m->g->nsub); 532 m->pmatch[i].rm_so = sp - m->offp; 533 break; 534 case ORPAREN: 535 i = OPND(m->g->strip[ss]); 536 assert(0 < i && i <= m->g->nsub); 537 m->pmatch[i].rm_eo = sp - m->offp; 538 break; 539 default: /* uh oh */ 540 assert(nope); 541 break; 542 } 543 } 544 545 assert(sp == stop); 546 return(sp); 547 } 548 549 /* 550 - backref - figure out what matched what, figuring in back references 551 == static const char *backref(struct match *m, const char *start, \ 552 == const char *stop, sopno startst, sopno stopst, sopno lev); 553 */ 554 static const char * /* == stop (success) or NULL (failure) */ 555 backref(struct match *m, 556 const char *start, 557 const char *stop, 558 sopno startst, 559 sopno stopst, 560 sopno lev, /* PLUS nesting level */ 561 int rec) 562 { 563 int i; 564 sopno ss; /* start sop of current subRE */ 565 const char *sp; /* start of string matched by it */ 566 sopno ssub; /* start sop of subsubRE */ 567 sopno esub; /* end sop of subsubRE */ 568 const char *ssp; /* start of string matched by subsubRE */ 569 const char *dp; 570 size_t len; 571 int hard; 572 sop s; 573 regoff_t offsave; 574 cset *cs; 575 wint_t wc; 576 577 AT("back", start, stop, startst, stopst); 578 sp = start; 579 580 /* get as far as we can with easy stuff */ 581 hard = 0; 582 for (ss = startst; !hard && ss < stopst; ss++) 583 switch (OP(s = m->g->strip[ss])) { 584 case OCHAR: 585 if (sp == stop) 586 return(NULL); 587 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 588 if (wc != OPND(s)) 589 return(NULL); 590 break; 591 case OANY: 592 if (sp == stop) 593 return(NULL); 594 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 595 if (wc == BADCHAR) 596 return (NULL); 597 break; 598 case OANYOF: 599 if (sp == stop) 600 return (NULL); 601 cs = &m->g->sets[OPND(s)]; 602 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 603 if (wc == BADCHAR || !CHIN(cs, wc)) 604 return(NULL); 605 break; 606 case OBOL: 607 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 608 (sp < m->endp && *(sp-1) == '\n' && 609 (m->g->cflags®_NEWLINE)) ) 610 { /* yes */ } 611 else 612 return(NULL); 613 break; 614 case OEOL: 615 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 616 (sp < m->endp && *sp == '\n' && 617 (m->g->cflags®_NEWLINE)) ) 618 { /* yes */ } 619 else 620 return(NULL); 621 break; 622 case OBOW: 623 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 624 (sp < m->endp && *(sp-1) == '\n' && 625 (m->g->cflags®_NEWLINE)) || 626 (sp > m->beginp && 627 !ISWORD(*(sp-1))) ) && 628 (sp < m->endp && ISWORD(*sp)) ) 629 { /* yes */ } 630 else 631 return(NULL); 632 break; 633 case OEOW: 634 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 635 (sp < m->endp && *sp == '\n' && 636 (m->g->cflags®_NEWLINE)) || 637 (sp < m->endp && !ISWORD(*sp)) ) && 638 (sp > m->beginp && ISWORD(*(sp-1))) ) 639 { /* yes */ } 640 else 641 return(NULL); 642 break; 643 case O_QUEST: 644 break; 645 case OOR1: /* matches null but needs to skip */ 646 ss++; 647 s = m->g->strip[ss]; 648 do { 649 assert(OP(s) == OOR2); 650 ss += OPND(s); 651 } while (OP(s = m->g->strip[ss]) != O_CH); 652 /* note that the ss++ gets us past the O_CH */ 653 break; 654 default: /* have to make a choice */ 655 hard = 1; 656 break; 657 } 658 if (!hard) { /* that was it! */ 659 if (sp != stop) 660 return(NULL); 661 return(sp); 662 } 663 ss--; /* adjust for the for's final increment */ 664 665 /* the hard stuff */ 666 AT("hard", sp, stop, ss, stopst); 667 s = m->g->strip[ss]; 668 switch (OP(s)) { 669 case OBACK_: /* the vilest depths */ 670 i = OPND(s); 671 assert(0 < i && i <= m->g->nsub); 672 if (m->pmatch[i].rm_eo == -1) 673 return(NULL); 674 assert(m->pmatch[i].rm_so != -1); 675 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 676 if (len == 0 && rec++ > MAX_RECURSION) 677 return(NULL); 678 assert(stop - m->beginp >= len); 679 if (sp > stop - len) 680 return(NULL); /* not enough left to match */ 681 ssp = m->offp + m->pmatch[i].rm_so; 682 if (memcmp(sp, ssp, len) != 0) 683 return(NULL); 684 while (m->g->strip[ss] != SOP(O_BACK, i)) 685 ss++; 686 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 687 break; 688 case OQUEST_: /* to null or not */ 689 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 690 if (dp != NULL) 691 return(dp); /* not */ 692 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 693 break; 694 case OPLUS_: 695 assert(m->lastpos != NULL); 696 assert(lev+1 <= m->g->nplus); 697 m->lastpos[lev+1] = sp; 698 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 699 break; 700 case O_PLUS: 701 if (sp == m->lastpos[lev]) /* last pass matched null */ 702 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 703 /* try another pass */ 704 m->lastpos[lev] = sp; 705 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 706 if (dp == NULL) 707 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 708 else 709 return(dp); 710 break; 711 case OCH_: /* find the right one, if any */ 712 ssub = ss + 1; 713 esub = ss + OPND(s) - 1; 714 assert(OP(m->g->strip[esub]) == OOR1); 715 for (;;) { /* find first matching branch */ 716 dp = backref(m, sp, stop, ssub, esub, lev, rec); 717 if (dp != NULL) 718 return(dp); 719 /* that one missed, try next one */ 720 if (OP(m->g->strip[esub]) == O_CH) 721 return(NULL); /* there is none */ 722 esub++; 723 assert(OP(m->g->strip[esub]) == OOR2); 724 ssub = esub + 1; 725 esub += OPND(m->g->strip[esub]); 726 if (OP(m->g->strip[esub]) == OOR2) 727 esub--; 728 else 729 assert(OP(m->g->strip[esub]) == O_CH); 730 } 731 break; 732 case OLPAREN: /* must undo assignment if rest fails */ 733 i = OPND(s); 734 assert(0 < i && i <= m->g->nsub); 735 offsave = m->pmatch[i].rm_so; 736 m->pmatch[i].rm_so = sp - m->offp; 737 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 738 if (dp != NULL) 739 return(dp); 740 m->pmatch[i].rm_so = offsave; 741 return(NULL); 742 break; 743 case ORPAREN: /* must undo assignment if rest fails */ 744 i = OPND(s); 745 assert(0 < i && i <= m->g->nsub); 746 offsave = m->pmatch[i].rm_eo; 747 m->pmatch[i].rm_eo = sp - m->offp; 748 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 749 if (dp != NULL) 750 return(dp); 751 m->pmatch[i].rm_eo = offsave; 752 return(NULL); 753 break; 754 default: /* uh oh */ 755 assert(nope); 756 break; 757 } 758 759 /* "can't happen" */ 760 assert(nope); 761 /* NOTREACHED */ 762 return "shut up gcc"; 763 } 764 765 /* 766 - fast - step through the string at top speed 767 == static const char *fast(struct match *m, const char *start, \ 768 == const char *stop, sopno startst, sopno stopst); 769 */ 770 static const char * /* where tentative match ended, or NULL */ 771 fast( struct match *m, 772 const char *start, 773 const char *stop, 774 sopno startst, 775 sopno stopst) 776 { 777 states st = m->st; 778 states fresh = m->fresh; 779 states tmp = m->tmp; 780 const char *p = start; 781 wint_t c; 782 wint_t lastc; /* previous c */ 783 wint_t flagch; 784 int i; 785 const char *coldp; /* last p after which no match was underway */ 786 size_t clen; 787 788 CLEAR(st); 789 SET1(st, startst); 790 st = step(m->g, startst, stopst, st, NOTHING, st); 791 ASSIGN(fresh, st); 792 SP("start", st, *p); 793 coldp = NULL; 794 if (start == m->beginp) 795 c = OUT; 796 else { 797 /* 798 * XXX Wrong if the previous character was multi-byte. 799 * Newline never is (in encodings supported by FreeBSD), 800 * so this only breaks the ISWORD tests below. 801 */ 802 c = (uch)*(start - 1); 803 } 804 for (;;) { 805 /* next character */ 806 lastc = c; 807 if (p == m->endp) { 808 clen = 0; 809 c = OUT; 810 } else 811 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 812 if (EQ(st, fresh)) 813 coldp = p; 814 815 /* is there an EOL and/or BOL between lastc and c? */ 816 flagch = '\0'; 817 i = 0; 818 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 819 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 820 flagch = BOL; 821 i = m->g->nbol; 822 } 823 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 824 (c == OUT && !(m->eflags®_NOTEOL)) ) { 825 flagch = (flagch == BOL) ? BOLEOL : EOL; 826 i += m->g->neol; 827 } 828 if (i != 0) { 829 for (; i > 0; i--) 830 st = step(m->g, startst, stopst, st, flagch, st); 831 SP("boleol", st, c); 832 } 833 834 /* how about a word boundary? */ 835 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 836 (c != OUT && ISWORD(c)) ) { 837 flagch = BOW; 838 } 839 if ( (lastc != OUT && ISWORD(lastc)) && 840 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 841 flagch = EOW; 842 } 843 if (flagch == BOW || flagch == EOW) { 844 st = step(m->g, startst, stopst, st, flagch, st); 845 SP("boweow", st, c); 846 } 847 848 /* are we done? */ 849 if (ISSET(st, stopst) || p == stop || clen > stop - p) 850 break; /* NOTE BREAK OUT */ 851 852 /* no, we must deal with this character */ 853 ASSIGN(tmp, st); 854 ASSIGN(st, fresh); 855 assert(c != OUT); 856 st = step(m->g, startst, stopst, tmp, c, st); 857 SP("aft", st, c); 858 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 859 p += clen; 860 } 861 862 assert(coldp != NULL); 863 m->coldp = coldp; 864 if (ISSET(st, stopst)) 865 return(p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 866 else 867 return(NULL); 868 } 869 870 /* 871 - slow - step through the string more deliberately 872 == static const char *slow(struct match *m, const char *start, \ 873 == const char *stop, sopno startst, sopno stopst); 874 */ 875 static const char * /* where it ended */ 876 slow( struct match *m, 877 const char *start, 878 const char *stop, 879 sopno startst, 880 sopno stopst) 881 { 882 states st = m->st; 883 states empty = m->empty; 884 states tmp = m->tmp; 885 const char *p = start; 886 wint_t c; 887 wint_t lastc; /* previous c */ 888 wint_t flagch; 889 int i; 890 const char *matchp; /* last p at which a match ended */ 891 size_t clen; 892 893 AT("slow", start, stop, startst, stopst); 894 CLEAR(st); 895 SET1(st, startst); 896 SP("sstart", st, *p); 897 st = step(m->g, startst, stopst, st, NOTHING, st); 898 matchp = NULL; 899 if (start == m->beginp) 900 c = OUT; 901 else { 902 /* 903 * XXX Wrong if the previous character was multi-byte. 904 * Newline never is (in encodings supported by FreeBSD), 905 * so this only breaks the ISWORD tests below. 906 */ 907 c = (uch)*(start - 1); 908 } 909 for (;;) { 910 /* next character */ 911 lastc = c; 912 if (p == m->endp) { 913 c = OUT; 914 clen = 0; 915 } else 916 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 917 918 /* is there an EOL and/or BOL between lastc and c? */ 919 flagch = '\0'; 920 i = 0; 921 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 922 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 923 flagch = BOL; 924 i = m->g->nbol; 925 } 926 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 927 (c == OUT && !(m->eflags®_NOTEOL)) ) { 928 flagch = (flagch == BOL) ? BOLEOL : EOL; 929 i += m->g->neol; 930 } 931 if (i != 0) { 932 for (; i > 0; i--) 933 st = step(m->g, startst, stopst, st, flagch, st); 934 SP("sboleol", st, c); 935 } 936 937 /* how about a word boundary? */ 938 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 939 (c != OUT && ISWORD(c)) ) { 940 flagch = BOW; 941 } 942 if ( (lastc != OUT && ISWORD(lastc)) && 943 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 944 flagch = EOW; 945 } 946 if (flagch == BOW || flagch == EOW) { 947 st = step(m->g, startst, stopst, st, flagch, st); 948 SP("sboweow", st, c); 949 } 950 951 /* are we done? */ 952 if (ISSET(st, stopst)) 953 matchp = p; 954 if (EQ(st, empty) || p == stop || clen > stop - p) 955 break; /* NOTE BREAK OUT */ 956 957 /* no, we must deal with this character */ 958 ASSIGN(tmp, st); 959 ASSIGN(st, empty); 960 assert(c != OUT); 961 st = step(m->g, startst, stopst, tmp, c, st); 962 SP("saft", st, c); 963 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 964 p += clen; 965 } 966 967 return(matchp); 968 } 969 970 971 /* 972 - step - map set of states reachable before char to set reachable after 973 == static states step(struct re_guts *g, sopno start, sopno stop, \ 974 == states bef, int ch, states aft); 975 == #define BOL (OUT-1) 976 == #define EOL (BOL-1) 977 == #define BOLEOL (BOL-2) 978 == #define NOTHING (BOL-3) 979 == #define BOW (BOL-4) 980 == #define EOW (BOL-5) 981 == #define BADCHAR (BOL-6) 982 == #define NONCHAR(c) ((c) <= OUT) 983 */ 984 static states 985 step(struct re_guts *g, 986 sopno start, /* start state within strip */ 987 sopno stop, /* state after stop state within strip */ 988 states bef, /* states reachable before */ 989 wint_t ch, /* character or NONCHAR code */ 990 states aft) /* states already known reachable after */ 991 { 992 cset *cs; 993 sop s; 994 sopno pc; 995 onestate here; /* note, macros know this name */ 996 sopno look; 997 int i; 998 999 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 1000 s = g->strip[pc]; 1001 switch (OP(s)) { 1002 case OEND: 1003 assert(pc == stop-1); 1004 break; 1005 case OCHAR: 1006 /* only characters can match */ 1007 assert(!NONCHAR(ch) || ch != OPND(s)); 1008 if (ch == OPND(s)) 1009 FWD(aft, bef, 1); 1010 break; 1011 case OBOL: 1012 if (ch == BOL || ch == BOLEOL) 1013 FWD(aft, bef, 1); 1014 break; 1015 case OEOL: 1016 if (ch == EOL || ch == BOLEOL) 1017 FWD(aft, bef, 1); 1018 break; 1019 case OBOW: 1020 if (ch == BOW) 1021 FWD(aft, bef, 1); 1022 break; 1023 case OEOW: 1024 if (ch == EOW) 1025 FWD(aft, bef, 1); 1026 break; 1027 case OANY: 1028 if (!NONCHAR(ch)) 1029 FWD(aft, bef, 1); 1030 break; 1031 case OANYOF: 1032 cs = &g->sets[OPND(s)]; 1033 if (!NONCHAR(ch) && CHIN(cs, ch)) 1034 FWD(aft, bef, 1); 1035 break; 1036 case OBACK_: /* ignored here */ 1037 case O_BACK: 1038 FWD(aft, aft, 1); 1039 break; 1040 case OPLUS_: /* forward, this is just an empty */ 1041 FWD(aft, aft, 1); 1042 break; 1043 case O_PLUS: /* both forward and back */ 1044 FWD(aft, aft, 1); 1045 i = ISSETBACK(aft, OPND(s)); 1046 BACK(aft, aft, OPND(s)); 1047 if (!i && ISSETBACK(aft, OPND(s))) { 1048 /* oho, must reconsider loop body */ 1049 pc -= OPND(s) + 1; 1050 INIT(here, pc); 1051 } 1052 break; 1053 case OQUEST_: /* two branches, both forward */ 1054 FWD(aft, aft, 1); 1055 FWD(aft, aft, OPND(s)); 1056 break; 1057 case O_QUEST: /* just an empty */ 1058 FWD(aft, aft, 1); 1059 break; 1060 case OLPAREN: /* not significant here */ 1061 case ORPAREN: 1062 FWD(aft, aft, 1); 1063 break; 1064 case OCH_: /* mark the first two branches */ 1065 FWD(aft, aft, 1); 1066 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1067 FWD(aft, aft, OPND(s)); 1068 break; 1069 case OOR1: /* done a branch, find the O_CH */ 1070 if (ISSTATEIN(aft, here)) { 1071 for (look = 1; 1072 OP(s = g->strip[pc+look]) != O_CH; 1073 look += OPND(s)) 1074 assert(OP(s) == OOR2); 1075 FWD(aft, aft, look); 1076 } 1077 break; 1078 case OOR2: /* propagate OCH_'s marking */ 1079 FWD(aft, aft, 1); 1080 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1081 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1082 FWD(aft, aft, OPND(s)); 1083 } 1084 break; 1085 case O_CH: /* just empty */ 1086 FWD(aft, aft, 1); 1087 break; 1088 default: /* ooooops... */ 1089 assert(nope); 1090 break; 1091 } 1092 } 1093 1094 return(aft); 1095 } 1096 1097 #ifdef REDEBUG 1098 /* 1099 - print - print a set of states 1100 == #ifdef REDEBUG 1101 == static void print(struct match *m, const char *caption, states st, \ 1102 == int ch, FILE *d); 1103 == #endif 1104 */ 1105 static void 1106 print(struct match *m, 1107 const char *caption, 1108 states st, 1109 int ch, 1110 FILE *d) 1111 { 1112 struct re_guts *g = m->g; 1113 int i; 1114 int first = 1; 1115 1116 if (!(m->eflags®_TRACE)) 1117 return; 1118 1119 fprintf(d, "%s", caption); 1120 if (ch != '\0') 1121 fprintf(d, " %s", pchar(ch)); 1122 for (i = 0; i < g->nstates; i++) 1123 if (ISSET(st, i)) { 1124 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1125 first = 0; 1126 } 1127 fprintf(d, "\n"); 1128 } 1129 1130 /* 1131 - at - print current situation 1132 == #ifdef REDEBUG 1133 == static void at(struct match *m, const char *title, const char *start, \ 1134 == const char *stop, sopno startst, sopno stopst); 1135 == #endif 1136 */ 1137 static void 1138 at( struct match *m, 1139 const char *title, 1140 const char *start, 1141 const char *stop, 1142 sopno startst, 1143 sopno stopst) 1144 { 1145 if (!(m->eflags®_TRACE)) 1146 return; 1147 1148 printf("%s %s-", title, pchar(*start)); 1149 printf("%s ", pchar(*stop)); 1150 printf("%ld-%ld\n", (long)startst, (long)stopst); 1151 } 1152 1153 #ifndef PCHARDONE 1154 #define PCHARDONE /* never again */ 1155 /* 1156 - pchar - make a character printable 1157 == #ifdef REDEBUG 1158 == static const char *pchar(int ch); 1159 == #endif 1160 * 1161 * Is this identical to regchar() over in debug.c? Well, yes. But a 1162 * duplicate here avoids having a debugging-capable regexec.o tied to 1163 * a matching debug.o, and this is convenient. It all disappears in 1164 * the non-debug compilation anyway, so it doesn't matter much. 1165 */ 1166 static const char * /* -> representation */ 1167 pchar(int ch) 1168 { 1169 static char pbuf[10]; 1170 1171 if (isprint((uch)ch) || ch == ' ') 1172 sprintf(pbuf, "%c", ch); 1173 else 1174 sprintf(pbuf, "\\%o", ch); 1175 return(pbuf); 1176 } 1177 #endif 1178 #endif 1179 1180 #undef matcher 1181 #undef fast 1182 #undef slow 1183 #undef dissect 1184 #undef backref 1185 #undef step 1186 #undef print 1187 #undef at 1188 #undef match 1189