1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)engine.c 8.5 (Berkeley) 3/20/94 34 */ 35 36 #include <sys/cdefs.h> 37 __FBSDID("$FreeBSD$"); 38 39 /* 40 * The matching engine and friends. This file is #included by regexec.c 41 * after suitable #defines of a variety of macros used herein, so that 42 * different state representations can be used without duplicating masses 43 * of code. 44 */ 45 46 #ifdef SNAMES 47 #define matcher smatcher 48 #define fast sfast 49 #define slow sslow 50 #define dissect sdissect 51 #define backref sbackref 52 #define step sstep 53 #define print sprint 54 #define at sat 55 #define match smat 56 #endif 57 #ifdef LNAMES 58 #define matcher lmatcher 59 #define fast lfast 60 #define slow lslow 61 #define dissect ldissect 62 #define backref lbackref 63 #define step lstep 64 #define print lprint 65 #define at lat 66 #define match lmat 67 #endif 68 #ifdef MNAMES 69 #define matcher mmatcher 70 #define fast mfast 71 #define slow mslow 72 #define dissect mdissect 73 #define backref mbackref 74 #define step mstep 75 #define print mprint 76 #define at mat 77 #define match mmat 78 #endif 79 80 /* another structure passed up and down to avoid zillions of parameters */ 81 struct match { 82 struct re_guts *g; 83 int eflags; 84 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 85 char *offp; /* offsets work from here */ 86 char *beginp; /* start of string -- virtual NUL precedes */ 87 char *endp; /* end of string -- virtual NUL here */ 88 char *coldp; /* can be no match starting before here */ 89 char **lastpos; /* [nplus+1] */ 90 STATEVARS; 91 states st; /* current states */ 92 states fresh; /* states for a fresh start */ 93 states tmp; /* temporary */ 94 states empty; /* empty set of states */ 95 mbstate_t mbs; /* multibyte conversion state */ 96 }; 97 98 /* ========= begin header generated by ./mkh ========= */ 99 #ifdef __cplusplus 100 extern "C" { 101 #endif 102 103 /* === engine.c === */ 104 static int matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 105 static char *dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 106 static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev); 107 static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 108 static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 109 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); 110 #define BOL (OUT-1) 111 #define EOL (BOL-1) 112 #define BOLEOL (BOL-2) 113 #define NOTHING (BOL-3) 114 #define BOW (BOL-4) 115 #define EOW (BOL-5) 116 #define BADCHAR (BOL-6) 117 #define NONCHAR(c) ((c) <= OUT) 118 #ifdef REDEBUG 119 static void print(struct match *m, char *caption, states st, int ch, FILE *d); 120 #endif 121 #ifdef REDEBUG 122 static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst); 123 #endif 124 #ifdef REDEBUG 125 static char *pchar(int ch); 126 #endif 127 128 #ifdef __cplusplus 129 } 130 #endif 131 /* ========= end header generated by ./mkh ========= */ 132 133 #ifdef REDEBUG 134 #define SP(t, s, c) print(m, t, s, c, stdout) 135 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 136 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 137 #else 138 #define SP(t, s, c) /* nothing */ 139 #define AT(t, p1, p2, s1, s2) /* nothing */ 140 #define NOTE(s) /* nothing */ 141 #endif 142 143 /* 144 - matcher - the actual matching engine 145 == static int matcher(struct re_guts *g, char *string, \ 146 == size_t nmatch, regmatch_t pmatch[], int eflags); 147 */ 148 static int /* 0 success, REG_NOMATCH failure */ 149 matcher(g, string, nmatch, pmatch, eflags) 150 struct re_guts *g; 151 char *string; 152 size_t nmatch; 153 regmatch_t pmatch[]; 154 int eflags; 155 { 156 char *endp; 157 int i; 158 struct match mv; 159 struct match *m = &mv; 160 char *dp; 161 const sopno gf = g->firststate+1; /* +1 for OEND */ 162 const sopno gl = g->laststate; 163 char *start; 164 char *stop; 165 /* Boyer-Moore algorithms variables */ 166 char *pp; 167 int cj, mj; 168 char *mustfirst; 169 char *mustlast; 170 int *matchjump; 171 int *charjump; 172 173 /* simplify the situation where possible */ 174 if (g->cflags®_NOSUB) 175 nmatch = 0; 176 if (eflags®_STARTEND) { 177 start = string + pmatch[0].rm_so; 178 stop = string + pmatch[0].rm_eo; 179 } else { 180 start = string; 181 stop = start + strlen(start); 182 } 183 if (stop < start) 184 return(REG_INVARG); 185 186 /* prescreening; this does wonders for this rather slow code */ 187 if (g->must != NULL) { 188 if (g->charjump != NULL && g->matchjump != NULL) { 189 mustfirst = g->must; 190 mustlast = g->must + g->mlen - 1; 191 charjump = g->charjump; 192 matchjump = g->matchjump; 193 pp = mustlast; 194 for (dp = start+g->mlen-1; dp < stop;) { 195 /* Fast skip non-matches */ 196 while (dp < stop && charjump[(int)*dp]) 197 dp += charjump[(int)*dp]; 198 199 if (dp >= stop) 200 break; 201 202 /* Greedy matcher */ 203 /* We depend on not being used for 204 * for strings of length 1 205 */ 206 while (*--dp == *--pp && pp != mustfirst); 207 208 if (*dp == *pp) 209 break; 210 211 /* Jump to next possible match */ 212 mj = matchjump[pp - mustfirst]; 213 cj = charjump[(int)*dp]; 214 dp += (cj < mj ? mj : cj); 215 pp = mustlast; 216 } 217 if (pp != mustfirst) 218 return(REG_NOMATCH); 219 } else { 220 for (dp = start; dp < stop; dp++) 221 if (*dp == g->must[0] && 222 stop - dp >= g->mlen && 223 memcmp(dp, g->must, (size_t)g->mlen) == 0) 224 break; 225 if (dp == stop) /* we didn't find g->must */ 226 return(REG_NOMATCH); 227 } 228 } 229 230 /* match struct setup */ 231 m->g = g; 232 m->eflags = eflags; 233 m->pmatch = NULL; 234 m->lastpos = NULL; 235 m->offp = string; 236 m->beginp = start; 237 m->endp = stop; 238 STATESETUP(m, 4); 239 SETUP(m->st); 240 SETUP(m->fresh); 241 SETUP(m->tmp); 242 SETUP(m->empty); 243 CLEAR(m->empty); 244 ZAPSTATE(&m->mbs); 245 246 /* Adjust start according to moffset, to speed things up */ 247 if (g->moffset > -1) 248 start = ((dp - g->moffset) < start) ? start : dp - g->moffset; 249 250 /* this loop does only one repetition except for backrefs */ 251 for (;;) { 252 endp = fast(m, start, stop, gf, gl); 253 if (endp == NULL) { /* a miss */ 254 if (m->pmatch != NULL) 255 free((char *)m->pmatch); 256 if (m->lastpos != NULL) 257 free((char *)m->lastpos); 258 STATETEARDOWN(m); 259 return(REG_NOMATCH); 260 } 261 if (nmatch == 0 && !g->backrefs) 262 break; /* no further info needed */ 263 264 /* where? */ 265 assert(m->coldp != NULL); 266 for (;;) { 267 NOTE("finding start"); 268 endp = slow(m, m->coldp, stop, gf, gl); 269 if (endp != NULL) 270 break; 271 assert(m->coldp < m->endp); 272 m->coldp += XMBRTOWC(NULL, m->coldp, 273 m->endp - m->coldp, &m->mbs, 0); 274 } 275 if (nmatch == 1 && !g->backrefs) 276 break; /* no further info needed */ 277 278 /* oh my, he wants the subexpressions... */ 279 if (m->pmatch == NULL) 280 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 281 sizeof(regmatch_t)); 282 if (m->pmatch == NULL) { 283 STATETEARDOWN(m); 284 return(REG_ESPACE); 285 } 286 for (i = 1; i <= m->g->nsub; i++) 287 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 288 if (!g->backrefs && !(m->eflags®_BACKR)) { 289 NOTE("dissecting"); 290 dp = dissect(m, m->coldp, endp, gf, gl); 291 } else { 292 if (g->nplus > 0 && m->lastpos == NULL) 293 m->lastpos = (char **)malloc((g->nplus+1) * 294 sizeof(char *)); 295 if (g->nplus > 0 && m->lastpos == NULL) { 296 free(m->pmatch); 297 STATETEARDOWN(m); 298 return(REG_ESPACE); 299 } 300 NOTE("backref dissect"); 301 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 302 } 303 if (dp != NULL) 304 break; 305 306 /* uh-oh... we couldn't find a subexpression-level match */ 307 assert(g->backrefs); /* must be back references doing it */ 308 assert(g->nplus == 0 || m->lastpos != NULL); 309 for (;;) { 310 if (dp != NULL || endp <= m->coldp) 311 break; /* defeat */ 312 NOTE("backoff"); 313 endp = slow(m, m->coldp, endp-1, gf, gl); 314 if (endp == NULL) 315 break; /* defeat */ 316 /* try it on a shorter possibility */ 317 #ifndef NDEBUG 318 for (i = 1; i <= m->g->nsub; i++) { 319 assert(m->pmatch[i].rm_so == -1); 320 assert(m->pmatch[i].rm_eo == -1); 321 } 322 #endif 323 NOTE("backoff dissect"); 324 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 325 } 326 assert(dp == NULL || dp == endp); 327 if (dp != NULL) /* found a shorter one */ 328 break; 329 330 /* despite initial appearances, there is no match here */ 331 NOTE("false alarm"); 332 /* recycle starting later */ 333 start = m->coldp + XMBRTOWC(NULL, m->coldp, 334 stop - m->coldp, &m->mbs, 0); 335 assert(start <= stop); 336 } 337 338 /* fill in the details if requested */ 339 if (nmatch > 0) { 340 pmatch[0].rm_so = m->coldp - m->offp; 341 pmatch[0].rm_eo = endp - m->offp; 342 } 343 if (nmatch > 1) { 344 assert(m->pmatch != NULL); 345 for (i = 1; i < nmatch; i++) 346 if (i <= m->g->nsub) 347 pmatch[i] = m->pmatch[i]; 348 else { 349 pmatch[i].rm_so = -1; 350 pmatch[i].rm_eo = -1; 351 } 352 } 353 354 if (m->pmatch != NULL) 355 free((char *)m->pmatch); 356 if (m->lastpos != NULL) 357 free((char *)m->lastpos); 358 STATETEARDOWN(m); 359 return(0); 360 } 361 362 /* 363 - dissect - figure out what matched what, no back references 364 == static char *dissect(struct match *m, char *start, \ 365 == char *stop, sopno startst, sopno stopst); 366 */ 367 static char * /* == stop (success) always */ 368 dissect(m, start, stop, startst, stopst) 369 struct match *m; 370 char *start; 371 char *stop; 372 sopno startst; 373 sopno stopst; 374 { 375 int i; 376 sopno ss; /* start sop of current subRE */ 377 sopno es; /* end sop of current subRE */ 378 char *sp; /* start of string matched by it */ 379 char *stp; /* string matched by it cannot pass here */ 380 char *rest; /* start of rest of string */ 381 char *tail; /* string unmatched by rest of RE */ 382 sopno ssub; /* start sop of subsubRE */ 383 sopno esub; /* end sop of subsubRE */ 384 char *ssp; /* start of string matched by subsubRE */ 385 char *sep; /* end of string matched by subsubRE */ 386 char *oldssp; /* previous ssp */ 387 char *dp; 388 389 AT("diss", start, stop, startst, stopst); 390 sp = start; 391 for (ss = startst; ss < stopst; ss = es) { 392 /* identify end of subRE */ 393 es = ss; 394 switch (OP(m->g->strip[es])) { 395 case OPLUS_: 396 case OQUEST_: 397 es += OPND(m->g->strip[es]); 398 break; 399 case OCH_: 400 while (OP(m->g->strip[es]) != O_CH) 401 es += OPND(m->g->strip[es]); 402 break; 403 } 404 es++; 405 406 /* figure out what it matched */ 407 switch (OP(m->g->strip[ss])) { 408 case OEND: 409 assert(nope); 410 break; 411 case OCHAR: 412 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 413 break; 414 case OBOL: 415 case OEOL: 416 case OBOW: 417 case OEOW: 418 break; 419 case OANY: 420 case OANYOF: 421 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 422 break; 423 case OBACK_: 424 case O_BACK: 425 assert(nope); 426 break; 427 /* cases where length of match is hard to find */ 428 case OQUEST_: 429 stp = stop; 430 for (;;) { 431 /* how long could this one be? */ 432 rest = slow(m, sp, stp, ss, es); 433 assert(rest != NULL); /* it did match */ 434 /* could the rest match the rest? */ 435 tail = slow(m, rest, stop, es, stopst); 436 if (tail == stop) 437 break; /* yes! */ 438 /* no -- try a shorter match for this one */ 439 stp = rest - 1; 440 assert(stp >= sp); /* it did work */ 441 } 442 ssub = ss + 1; 443 esub = es - 1; 444 /* did innards match? */ 445 if (slow(m, sp, rest, ssub, esub) != NULL) { 446 dp = dissect(m, sp, rest, ssub, esub); 447 assert(dp == rest); 448 } else /* no */ 449 assert(sp == rest); 450 sp = rest; 451 break; 452 case OPLUS_: 453 stp = stop; 454 for (;;) { 455 /* how long could this one be? */ 456 rest = slow(m, sp, stp, ss, es); 457 assert(rest != NULL); /* it did match */ 458 /* could the rest match the rest? */ 459 tail = slow(m, rest, stop, es, stopst); 460 if (tail == stop) 461 break; /* yes! */ 462 /* no -- try a shorter match for this one */ 463 stp = rest - 1; 464 assert(stp >= sp); /* it did work */ 465 } 466 ssub = ss + 1; 467 esub = es - 1; 468 ssp = sp; 469 oldssp = ssp; 470 for (;;) { /* find last match of innards */ 471 sep = slow(m, ssp, rest, ssub, esub); 472 if (sep == NULL || sep == ssp) 473 break; /* failed or matched null */ 474 oldssp = ssp; /* on to next try */ 475 ssp = sep; 476 } 477 if (sep == NULL) { 478 /* last successful match */ 479 sep = ssp; 480 ssp = oldssp; 481 } 482 assert(sep == rest); /* must exhaust substring */ 483 assert(slow(m, ssp, sep, ssub, esub) == rest); 484 dp = dissect(m, ssp, sep, ssub, esub); 485 assert(dp == sep); 486 sp = rest; 487 break; 488 case OCH_: 489 stp = stop; 490 for (;;) { 491 /* how long could this one be? */ 492 rest = slow(m, sp, stp, ss, es); 493 assert(rest != NULL); /* it did match */ 494 /* could the rest match the rest? */ 495 tail = slow(m, rest, stop, es, stopst); 496 if (tail == stop) 497 break; /* yes! */ 498 /* no -- try a shorter match for this one */ 499 stp = rest - 1; 500 assert(stp >= sp); /* it did work */ 501 } 502 ssub = ss + 1; 503 esub = ss + OPND(m->g->strip[ss]) - 1; 504 assert(OP(m->g->strip[esub]) == OOR1); 505 for (;;) { /* find first matching branch */ 506 if (slow(m, sp, rest, ssub, esub) == rest) 507 break; /* it matched all of it */ 508 /* that one missed, try next one */ 509 assert(OP(m->g->strip[esub]) == OOR1); 510 esub++; 511 assert(OP(m->g->strip[esub]) == OOR2); 512 ssub = esub + 1; 513 esub += OPND(m->g->strip[esub]); 514 if (OP(m->g->strip[esub]) == OOR2) 515 esub--; 516 else 517 assert(OP(m->g->strip[esub]) == O_CH); 518 } 519 dp = dissect(m, sp, rest, ssub, esub); 520 assert(dp == rest); 521 sp = rest; 522 break; 523 case O_PLUS: 524 case O_QUEST: 525 case OOR1: 526 case OOR2: 527 case O_CH: 528 assert(nope); 529 break; 530 case OLPAREN: 531 i = OPND(m->g->strip[ss]); 532 assert(0 < i && i <= m->g->nsub); 533 m->pmatch[i].rm_so = sp - m->offp; 534 break; 535 case ORPAREN: 536 i = OPND(m->g->strip[ss]); 537 assert(0 < i && i <= m->g->nsub); 538 m->pmatch[i].rm_eo = sp - m->offp; 539 break; 540 default: /* uh oh */ 541 assert(nope); 542 break; 543 } 544 } 545 546 assert(sp == stop); 547 return(sp); 548 } 549 550 /* 551 - backref - figure out what matched what, figuring in back references 552 == static char *backref(struct match *m, char *start, \ 553 == char *stop, sopno startst, sopno stopst, sopno lev); 554 */ 555 static char * /* == stop (success) or NULL (failure) */ 556 backref(m, start, stop, startst, stopst, lev) 557 struct match *m; 558 char *start; 559 char *stop; 560 sopno startst; 561 sopno stopst; 562 sopno lev; /* PLUS nesting level */ 563 { 564 int i; 565 sopno ss; /* start sop of current subRE */ 566 char *sp; /* start of string matched by it */ 567 sopno ssub; /* start sop of subsubRE */ 568 sopno esub; /* end sop of subsubRE */ 569 char *ssp; /* start of string matched by subsubRE */ 570 char *dp; 571 size_t len; 572 int hard; 573 sop s; 574 regoff_t offsave; 575 cset *cs; 576 wint_t wc; 577 578 AT("back", start, stop, startst, stopst); 579 sp = start; 580 581 /* get as far as we can with easy stuff */ 582 hard = 0; 583 for (ss = startst; !hard && ss < stopst; ss++) 584 switch (OP(s = m->g->strip[ss])) { 585 case OCHAR: 586 if (sp == stop) 587 return(NULL); 588 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 589 if (wc != OPND(s)) 590 return(NULL); 591 break; 592 case OANY: 593 if (sp == stop) 594 return(NULL); 595 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 596 if (wc == BADCHAR) 597 return (NULL); 598 break; 599 case OANYOF: 600 if (sp == stop) 601 return (NULL); 602 cs = &m->g->sets[OPND(s)]; 603 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 604 if (wc == BADCHAR || !CHIN(cs, wc)) 605 return(NULL); 606 break; 607 case OBOL: 608 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 609 (sp < m->endp && *(sp-1) == '\n' && 610 (m->g->cflags®_NEWLINE)) ) 611 { /* yes */ } 612 else 613 return(NULL); 614 break; 615 case OEOL: 616 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 617 (sp < m->endp && *sp == '\n' && 618 (m->g->cflags®_NEWLINE)) ) 619 { /* yes */ } 620 else 621 return(NULL); 622 break; 623 case OBOW: 624 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 625 (sp < m->endp && *(sp-1) == '\n' && 626 (m->g->cflags®_NEWLINE)) || 627 (sp > m->beginp && 628 !ISWORD(*(sp-1))) ) && 629 (sp < m->endp && ISWORD(*sp)) ) 630 { /* yes */ } 631 else 632 return(NULL); 633 break; 634 case OEOW: 635 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 636 (sp < m->endp && *sp == '\n' && 637 (m->g->cflags®_NEWLINE)) || 638 (sp < m->endp && !ISWORD(*sp)) ) && 639 (sp > m->beginp && ISWORD(*(sp-1))) ) 640 { /* yes */ } 641 else 642 return(NULL); 643 break; 644 case O_QUEST: 645 break; 646 case OOR1: /* matches null but needs to skip */ 647 ss++; 648 s = m->g->strip[ss]; 649 do { 650 assert(OP(s) == OOR2); 651 ss += OPND(s); 652 } while (OP(s = m->g->strip[ss]) != O_CH); 653 /* note that the ss++ gets us past the O_CH */ 654 break; 655 default: /* have to make a choice */ 656 hard = 1; 657 break; 658 } 659 if (!hard) { /* that was it! */ 660 if (sp != stop) 661 return(NULL); 662 return(sp); 663 } 664 ss--; /* adjust for the for's final increment */ 665 666 /* the hard stuff */ 667 AT("hard", sp, stop, ss, stopst); 668 s = m->g->strip[ss]; 669 switch (OP(s)) { 670 case OBACK_: /* the vilest depths */ 671 i = OPND(s); 672 assert(0 < i && i <= m->g->nsub); 673 if (m->pmatch[i].rm_eo == -1) 674 return(NULL); 675 assert(m->pmatch[i].rm_so != -1); 676 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 677 assert(stop - m->beginp >= len); 678 if (sp > stop - len) 679 return(NULL); /* not enough left to match */ 680 ssp = m->offp + m->pmatch[i].rm_so; 681 if (memcmp(sp, ssp, len) != 0) 682 return(NULL); 683 while (m->g->strip[ss] != SOP(O_BACK, i)) 684 ss++; 685 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 686 break; 687 case OQUEST_: /* to null or not */ 688 dp = backref(m, sp, stop, ss+1, stopst, lev); 689 if (dp != NULL) 690 return(dp); /* not */ 691 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); 692 break; 693 case OPLUS_: 694 assert(m->lastpos != NULL); 695 assert(lev+1 <= m->g->nplus); 696 m->lastpos[lev+1] = sp; 697 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 698 break; 699 case O_PLUS: 700 if (sp == m->lastpos[lev]) /* last pass matched null */ 701 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 702 /* try another pass */ 703 m->lastpos[lev] = sp; 704 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); 705 if (dp == NULL) 706 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 707 else 708 return(dp); 709 break; 710 case OCH_: /* find the right one, if any */ 711 ssub = ss + 1; 712 esub = ss + OPND(s) - 1; 713 assert(OP(m->g->strip[esub]) == OOR1); 714 for (;;) { /* find first matching branch */ 715 dp = backref(m, sp, stop, ssub, esub, lev); 716 if (dp != NULL) 717 return(dp); 718 /* that one missed, try next one */ 719 if (OP(m->g->strip[esub]) == O_CH) 720 return(NULL); /* there is none */ 721 esub++; 722 assert(OP(m->g->strip[esub]) == OOR2); 723 ssub = esub + 1; 724 esub += OPND(m->g->strip[esub]); 725 if (OP(m->g->strip[esub]) == OOR2) 726 esub--; 727 else 728 assert(OP(m->g->strip[esub]) == O_CH); 729 } 730 break; 731 case OLPAREN: /* must undo assignment if rest fails */ 732 i = OPND(s); 733 assert(0 < i && i <= m->g->nsub); 734 offsave = m->pmatch[i].rm_so; 735 m->pmatch[i].rm_so = sp - m->offp; 736 dp = backref(m, sp, stop, ss+1, stopst, lev); 737 if (dp != NULL) 738 return(dp); 739 m->pmatch[i].rm_so = offsave; 740 return(NULL); 741 break; 742 case ORPAREN: /* must undo assignment if rest fails */ 743 i = OPND(s); 744 assert(0 < i && i <= m->g->nsub); 745 offsave = m->pmatch[i].rm_eo; 746 m->pmatch[i].rm_eo = sp - m->offp; 747 dp = backref(m, sp, stop, ss+1, stopst, lev); 748 if (dp != NULL) 749 return(dp); 750 m->pmatch[i].rm_eo = offsave; 751 return(NULL); 752 break; 753 default: /* uh oh */ 754 assert(nope); 755 break; 756 } 757 758 /* "can't happen" */ 759 assert(nope); 760 /* NOTREACHED */ 761 return "shut up gcc"; 762 } 763 764 /* 765 - fast - step through the string at top speed 766 == static char *fast(struct match *m, char *start, \ 767 == char *stop, sopno startst, sopno stopst); 768 */ 769 static char * /* where tentative match ended, or NULL */ 770 fast(m, start, stop, startst, stopst) 771 struct match *m; 772 char *start; 773 char *stop; 774 sopno startst; 775 sopno stopst; 776 { 777 states st = m->st; 778 states fresh = m->fresh; 779 states tmp = m->tmp; 780 char *p = start; 781 wint_t c; 782 wint_t lastc; /* previous c */ 783 wint_t flagch; 784 int i; 785 char *coldp; /* last p after which no match was underway */ 786 size_t clen; 787 788 CLEAR(st); 789 SET1(st, startst); 790 st = step(m->g, startst, stopst, st, NOTHING, st); 791 ASSIGN(fresh, st); 792 SP("start", st, *p); 793 coldp = NULL; 794 if (start == m->beginp) 795 c = OUT; 796 else { 797 /* 798 * XXX Wrong if the previous character was multi-byte. 799 * Newline never is (in encodings supported by FreeBSD), 800 * so this only breaks the ISWORD tests below. 801 */ 802 c = (uch)*(start - 1); 803 } 804 for (;;) { 805 /* next character */ 806 lastc = c; 807 if (p == m->endp) { 808 clen = 0; 809 c = OUT; 810 } else 811 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 812 if (EQ(st, fresh)) 813 coldp = p; 814 815 /* is there an EOL and/or BOL between lastc and c? */ 816 flagch = '\0'; 817 i = 0; 818 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 819 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 820 flagch = BOL; 821 i = m->g->nbol; 822 } 823 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 824 (c == OUT && !(m->eflags®_NOTEOL)) ) { 825 flagch = (flagch == BOL) ? BOLEOL : EOL; 826 i += m->g->neol; 827 } 828 if (i != 0) { 829 for (; i > 0; i--) 830 st = step(m->g, startst, stopst, st, flagch, st); 831 SP("boleol", st, c); 832 } 833 834 /* how about a word boundary? */ 835 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 836 (c != OUT && ISWORD(c)) ) { 837 flagch = BOW; 838 } 839 if ( (lastc != OUT && ISWORD(lastc)) && 840 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 841 flagch = EOW; 842 } 843 if (flagch == BOW || flagch == EOW) { 844 st = step(m->g, startst, stopst, st, flagch, st); 845 SP("boweow", st, c); 846 } 847 848 /* are we done? */ 849 if (ISSET(st, stopst) || p == stop || clen > stop - p) 850 break; /* NOTE BREAK OUT */ 851 852 /* no, we must deal with this character */ 853 ASSIGN(tmp, st); 854 ASSIGN(st, fresh); 855 assert(c != OUT); 856 st = step(m->g, startst, stopst, tmp, c, st); 857 SP("aft", st, c); 858 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 859 p += clen; 860 } 861 862 assert(coldp != NULL); 863 m->coldp = coldp; 864 if (ISSET(st, stopst)) 865 return(p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 866 else 867 return(NULL); 868 } 869 870 /* 871 - slow - step through the string more deliberately 872 == static char *slow(struct match *m, char *start, \ 873 == char *stop, sopno startst, sopno stopst); 874 */ 875 static char * /* where it ended */ 876 slow(m, start, stop, startst, stopst) 877 struct match *m; 878 char *start; 879 char *stop; 880 sopno startst; 881 sopno stopst; 882 { 883 states st = m->st; 884 states empty = m->empty; 885 states tmp = m->tmp; 886 char *p = start; 887 wint_t c; 888 wint_t lastc; /* previous c */ 889 wint_t flagch; 890 int i; 891 char *matchp; /* last p at which a match ended */ 892 size_t clen; 893 894 AT("slow", start, stop, startst, stopst); 895 CLEAR(st); 896 SET1(st, startst); 897 SP("sstart", st, *p); 898 st = step(m->g, startst, stopst, st, NOTHING, st); 899 matchp = NULL; 900 if (start == m->beginp) 901 c = OUT; 902 else { 903 /* 904 * XXX Wrong if the previous character was multi-byte. 905 * Newline never is (in encodings supported by FreeBSD), 906 * so this only breaks the ISWORD tests below. 907 */ 908 c = (uch)*(start - 1); 909 } 910 for (;;) { 911 /* next character */ 912 lastc = c; 913 if (p == m->endp) { 914 c = OUT; 915 clen = 0; 916 } else 917 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 918 919 /* is there an EOL and/or BOL between lastc and c? */ 920 flagch = '\0'; 921 i = 0; 922 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 923 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 924 flagch = BOL; 925 i = m->g->nbol; 926 } 927 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 928 (c == OUT && !(m->eflags®_NOTEOL)) ) { 929 flagch = (flagch == BOL) ? BOLEOL : EOL; 930 i += m->g->neol; 931 } 932 if (i != 0) { 933 for (; i > 0; i--) 934 st = step(m->g, startst, stopst, st, flagch, st); 935 SP("sboleol", st, c); 936 } 937 938 /* how about a word boundary? */ 939 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 940 (c != OUT && ISWORD(c)) ) { 941 flagch = BOW; 942 } 943 if ( (lastc != OUT && ISWORD(lastc)) && 944 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 945 flagch = EOW; 946 } 947 if (flagch == BOW || flagch == EOW) { 948 st = step(m->g, startst, stopst, st, flagch, st); 949 SP("sboweow", st, c); 950 } 951 952 /* are we done? */ 953 if (ISSET(st, stopst)) 954 matchp = p; 955 if (EQ(st, empty) || p == stop || clen > stop - p) 956 break; /* NOTE BREAK OUT */ 957 958 /* no, we must deal with this character */ 959 ASSIGN(tmp, st); 960 ASSIGN(st, empty); 961 assert(c != OUT); 962 st = step(m->g, startst, stopst, tmp, c, st); 963 SP("saft", st, c); 964 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 965 p += clen; 966 } 967 968 return(matchp); 969 } 970 971 972 /* 973 - step - map set of states reachable before char to set reachable after 974 == static states step(struct re_guts *g, sopno start, sopno stop, \ 975 == states bef, int ch, states aft); 976 == #define BOL (OUT-1) 977 == #define EOL (BOL-1) 978 == #define BOLEOL (BOL-2) 979 == #define NOTHING (BOL-3) 980 == #define BOW (BOL-4) 981 == #define EOW (BOL-5) 982 == #define BADCHAR (BOL-6) 983 == #define NONCHAR(c) ((c) <= OUT) 984 */ 985 static states 986 step(g, start, stop, bef, ch, aft) 987 struct re_guts *g; 988 sopno start; /* start state within strip */ 989 sopno stop; /* state after stop state within strip */ 990 states bef; /* states reachable before */ 991 wint_t ch; /* character or NONCHAR code */ 992 states aft; /* states already known reachable after */ 993 { 994 cset *cs; 995 sop s; 996 sopno pc; 997 onestate here; /* note, macros know this name */ 998 sopno look; 999 int i; 1000 1001 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 1002 s = g->strip[pc]; 1003 switch (OP(s)) { 1004 case OEND: 1005 assert(pc == stop-1); 1006 break; 1007 case OCHAR: 1008 /* only characters can match */ 1009 assert(!NONCHAR(ch) || ch != OPND(s)); 1010 if (ch == OPND(s)) 1011 FWD(aft, bef, 1); 1012 break; 1013 case OBOL: 1014 if (ch == BOL || ch == BOLEOL) 1015 FWD(aft, bef, 1); 1016 break; 1017 case OEOL: 1018 if (ch == EOL || ch == BOLEOL) 1019 FWD(aft, bef, 1); 1020 break; 1021 case OBOW: 1022 if (ch == BOW) 1023 FWD(aft, bef, 1); 1024 break; 1025 case OEOW: 1026 if (ch == EOW) 1027 FWD(aft, bef, 1); 1028 break; 1029 case OANY: 1030 if (!NONCHAR(ch)) 1031 FWD(aft, bef, 1); 1032 break; 1033 case OANYOF: 1034 cs = &g->sets[OPND(s)]; 1035 if (!NONCHAR(ch) && CHIN(cs, ch)) 1036 FWD(aft, bef, 1); 1037 break; 1038 case OBACK_: /* ignored here */ 1039 case O_BACK: 1040 FWD(aft, aft, 1); 1041 break; 1042 case OPLUS_: /* forward, this is just an empty */ 1043 FWD(aft, aft, 1); 1044 break; 1045 case O_PLUS: /* both forward and back */ 1046 FWD(aft, aft, 1); 1047 i = ISSETBACK(aft, OPND(s)); 1048 BACK(aft, aft, OPND(s)); 1049 if (!i && ISSETBACK(aft, OPND(s))) { 1050 /* oho, must reconsider loop body */ 1051 pc -= OPND(s) + 1; 1052 INIT(here, pc); 1053 } 1054 break; 1055 case OQUEST_: /* two branches, both forward */ 1056 FWD(aft, aft, 1); 1057 FWD(aft, aft, OPND(s)); 1058 break; 1059 case O_QUEST: /* just an empty */ 1060 FWD(aft, aft, 1); 1061 break; 1062 case OLPAREN: /* not significant here */ 1063 case ORPAREN: 1064 FWD(aft, aft, 1); 1065 break; 1066 case OCH_: /* mark the first two branches */ 1067 FWD(aft, aft, 1); 1068 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1069 FWD(aft, aft, OPND(s)); 1070 break; 1071 case OOR1: /* done a branch, find the O_CH */ 1072 if (ISSTATEIN(aft, here)) { 1073 for (look = 1; 1074 OP(s = g->strip[pc+look]) != O_CH; 1075 look += OPND(s)) 1076 assert(OP(s) == OOR2); 1077 FWD(aft, aft, look); 1078 } 1079 break; 1080 case OOR2: /* propagate OCH_'s marking */ 1081 FWD(aft, aft, 1); 1082 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1083 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1084 FWD(aft, aft, OPND(s)); 1085 } 1086 break; 1087 case O_CH: /* just empty */ 1088 FWD(aft, aft, 1); 1089 break; 1090 default: /* ooooops... */ 1091 assert(nope); 1092 break; 1093 } 1094 } 1095 1096 return(aft); 1097 } 1098 1099 #ifdef REDEBUG 1100 /* 1101 - print - print a set of states 1102 == #ifdef REDEBUG 1103 == static void print(struct match *m, char *caption, states st, \ 1104 == int ch, FILE *d); 1105 == #endif 1106 */ 1107 static void 1108 print(m, caption, st, ch, d) 1109 struct match *m; 1110 char *caption; 1111 states st; 1112 int ch; 1113 FILE *d; 1114 { 1115 struct re_guts *g = m->g; 1116 int i; 1117 int first = 1; 1118 1119 if (!(m->eflags®_TRACE)) 1120 return; 1121 1122 fprintf(d, "%s", caption); 1123 if (ch != '\0') 1124 fprintf(d, " %s", pchar(ch)); 1125 for (i = 0; i < g->nstates; i++) 1126 if (ISSET(st, i)) { 1127 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1128 first = 0; 1129 } 1130 fprintf(d, "\n"); 1131 } 1132 1133 /* 1134 - at - print current situation 1135 == #ifdef REDEBUG 1136 == static void at(struct match *m, char *title, char *start, char *stop, \ 1137 == sopno startst, sopno stopst); 1138 == #endif 1139 */ 1140 static void 1141 at(m, title, start, stop, startst, stopst) 1142 struct match *m; 1143 char *title; 1144 char *start; 1145 char *stop; 1146 sopno startst; 1147 sopno stopst; 1148 { 1149 if (!(m->eflags®_TRACE)) 1150 return; 1151 1152 printf("%s %s-", title, pchar(*start)); 1153 printf("%s ", pchar(*stop)); 1154 printf("%ld-%ld\n", (long)startst, (long)stopst); 1155 } 1156 1157 #ifndef PCHARDONE 1158 #define PCHARDONE /* never again */ 1159 /* 1160 - pchar - make a character printable 1161 == #ifdef REDEBUG 1162 == static char *pchar(int ch); 1163 == #endif 1164 * 1165 * Is this identical to regchar() over in debug.c? Well, yes. But a 1166 * duplicate here avoids having a debugging-capable regexec.o tied to 1167 * a matching debug.o, and this is convenient. It all disappears in 1168 * the non-debug compilation anyway, so it doesn't matter much. 1169 */ 1170 static char * /* -> representation */ 1171 pchar(ch) 1172 int ch; 1173 { 1174 static char pbuf[10]; 1175 1176 if (isprint((uch)ch) || ch == ' ') 1177 sprintf(pbuf, "%c", ch); 1178 else 1179 sprintf(pbuf, "\\%o", ch); 1180 return(pbuf); 1181 } 1182 #endif 1183 #endif 1184 1185 #undef matcher 1186 #undef fast 1187 #undef slow 1188 #undef dissect 1189 #undef backref 1190 #undef step 1191 #undef print 1192 #undef at 1193 #undef match 1194