1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)engine.c 8.5 (Berkeley) 3/20/94 38 */ 39 40 #include <sys/cdefs.h> 41 __FBSDID("$FreeBSD$"); 42 43 /* 44 * The matching engine and friends. This file is #included by regexec.c 45 * after suitable #defines of a variety of macros used herein, so that 46 * different state representations can be used without duplicating masses 47 * of code. 48 */ 49 50 #ifdef SNAMES 51 #define matcher smatcher 52 #define fast sfast 53 #define slow sslow 54 #define dissect sdissect 55 #define backref sbackref 56 #define step sstep 57 #define print sprint 58 #define at sat 59 #define match smat 60 #endif 61 #ifdef LNAMES 62 #define matcher lmatcher 63 #define fast lfast 64 #define slow lslow 65 #define dissect ldissect 66 #define backref lbackref 67 #define step lstep 68 #define print lprint 69 #define at lat 70 #define match lmat 71 #endif 72 73 /* another structure passed up and down to avoid zillions of parameters */ 74 struct match { 75 struct re_guts *g; 76 int eflags; 77 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 78 char *offp; /* offsets work from here */ 79 char *beginp; /* start of string -- virtual NUL precedes */ 80 char *endp; /* end of string -- virtual NUL here */ 81 char *coldp; /* can be no match starting before here */ 82 char **lastpos; /* [nplus+1] */ 83 STATEVARS; 84 states st; /* current states */ 85 states fresh; /* states for a fresh start */ 86 states tmp; /* temporary */ 87 states empty; /* empty set of states */ 88 }; 89 90 /* ========= begin header generated by ./mkh ========= */ 91 #ifdef __cplusplus 92 extern "C" { 93 #endif 94 95 /* === engine.c === */ 96 static int matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 97 static char *dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 98 static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev); 99 static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 100 static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 101 static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft); 102 #define BOL (OUT+1) 103 #define EOL (BOL+1) 104 #define BOLEOL (BOL+2) 105 #define NOTHING (BOL+3) 106 #define BOW (BOL+4) 107 #define EOW (BOL+5) 108 #define CODEMAX (BOL+5) /* highest code used */ 109 #define NONCHAR(c) ((c) > CHAR_MAX) 110 #define NNONCHAR (CODEMAX-CHAR_MAX) 111 #ifdef REDEBUG 112 static void print(struct match *m, char *caption, states st, int ch, FILE *d); 113 #endif 114 #ifdef REDEBUG 115 static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst); 116 #endif 117 #ifdef REDEBUG 118 static char *pchar(int ch); 119 #endif 120 121 #ifdef __cplusplus 122 } 123 #endif 124 /* ========= end header generated by ./mkh ========= */ 125 126 #ifdef REDEBUG 127 #define SP(t, s, c) print(m, t, s, c, stdout) 128 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 129 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 130 #else 131 #define SP(t, s, c) /* nothing */ 132 #define AT(t, p1, p2, s1, s2) /* nothing */ 133 #define NOTE(s) /* nothing */ 134 #endif 135 136 /* 137 - matcher - the actual matching engine 138 == static int matcher(struct re_guts *g, char *string, \ 139 == size_t nmatch, regmatch_t pmatch[], int eflags); 140 */ 141 static int /* 0 success, REG_NOMATCH failure */ 142 matcher(g, string, nmatch, pmatch, eflags) 143 struct re_guts *g; 144 char *string; 145 size_t nmatch; 146 regmatch_t pmatch[]; 147 int eflags; 148 { 149 char *endp; 150 int i; 151 struct match mv; 152 struct match *m = &mv; 153 char *dp; 154 const sopno gf = g->firststate+1; /* +1 for OEND */ 155 const sopno gl = g->laststate; 156 char *start; 157 char *stop; 158 /* Boyer-Moore algorithms variables */ 159 char *pp; 160 int cj, mj; 161 char *mustfirst; 162 char *mustlast; 163 int *matchjump; 164 int *charjump; 165 166 /* simplify the situation where possible */ 167 if (g->cflags®_NOSUB) 168 nmatch = 0; 169 if (eflags®_STARTEND) { 170 start = string + pmatch[0].rm_so; 171 stop = string + pmatch[0].rm_eo; 172 } else { 173 start = string; 174 stop = start + strlen(start); 175 } 176 if (stop < start) 177 return(REG_INVARG); 178 179 /* prescreening; this does wonders for this rather slow code */ 180 if (g->must != NULL) { 181 if (g->charjump != NULL && g->matchjump != NULL) { 182 mustfirst = g->must; 183 mustlast = g->must + g->mlen - 1; 184 charjump = g->charjump; 185 matchjump = g->matchjump; 186 pp = mustlast; 187 for (dp = start+g->mlen-1; dp < stop;) { 188 /* Fast skip non-matches */ 189 while (dp < stop && charjump[*dp]) 190 dp += charjump[*dp]; 191 192 if (dp >= stop) 193 break; 194 195 /* Greedy matcher */ 196 /* We depend on not being used for 197 * for strings of length 1 198 */ 199 while (*--dp == *--pp && pp != mustfirst); 200 201 if (*dp == *pp) 202 break; 203 204 /* Jump to next possible match */ 205 mj = matchjump[pp - mustfirst]; 206 cj = charjump[*dp]; 207 dp += (cj < mj ? mj : cj); 208 pp = mustlast; 209 } 210 if (pp != mustfirst) 211 return(REG_NOMATCH); 212 } else { 213 for (dp = start; dp < stop; dp++) 214 if (*dp == g->must[0] && 215 stop - dp >= g->mlen && 216 memcmp(dp, g->must, (size_t)g->mlen) == 0) 217 break; 218 if (dp == stop) /* we didn't find g->must */ 219 return(REG_NOMATCH); 220 } 221 } 222 223 /* match struct setup */ 224 m->g = g; 225 m->eflags = eflags; 226 m->pmatch = NULL; 227 m->lastpos = NULL; 228 m->offp = string; 229 m->beginp = start; 230 m->endp = stop; 231 STATESETUP(m, 4); 232 SETUP(m->st); 233 SETUP(m->fresh); 234 SETUP(m->tmp); 235 SETUP(m->empty); 236 CLEAR(m->empty); 237 238 /* Adjust start according to moffset, to speed things up */ 239 if (g->moffset > -1) 240 start = ((dp - g->moffset) < start) ? start : dp - g->moffset; 241 242 /* this loop does only one repetition except for backrefs */ 243 for (;;) { 244 endp = fast(m, start, stop, gf, gl); 245 if (endp == NULL) { /* a miss */ 246 STATETEARDOWN(m); 247 return(REG_NOMATCH); 248 } 249 if (nmatch == 0 && !g->backrefs) 250 break; /* no further info needed */ 251 252 /* where? */ 253 assert(m->coldp != NULL); 254 for (;;) { 255 NOTE("finding start"); 256 endp = slow(m, m->coldp, stop, gf, gl); 257 if (endp != NULL) 258 break; 259 assert(m->coldp < m->endp); 260 m->coldp++; 261 } 262 if (nmatch == 1 && !g->backrefs) 263 break; /* no further info needed */ 264 265 /* oh my, he wants the subexpressions... */ 266 if (m->pmatch == NULL) 267 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 268 sizeof(regmatch_t)); 269 if (m->pmatch == NULL) { 270 STATETEARDOWN(m); 271 return(REG_ESPACE); 272 } 273 for (i = 1; i <= m->g->nsub; i++) 274 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 275 if (!g->backrefs && !(m->eflags®_BACKR)) { 276 NOTE("dissecting"); 277 dp = dissect(m, m->coldp, endp, gf, gl); 278 } else { 279 if (g->nplus > 0 && m->lastpos == NULL) 280 m->lastpos = (char **)malloc((g->nplus+1) * 281 sizeof(char *)); 282 if (g->nplus > 0 && m->lastpos == NULL) { 283 free(m->pmatch); 284 STATETEARDOWN(m); 285 return(REG_ESPACE); 286 } 287 NOTE("backref dissect"); 288 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 289 } 290 if (dp != NULL) 291 break; 292 293 /* uh-oh... we couldn't find a subexpression-level match */ 294 assert(g->backrefs); /* must be back references doing it */ 295 assert(g->nplus == 0 || m->lastpos != NULL); 296 for (;;) { 297 if (dp != NULL || endp <= m->coldp) 298 break; /* defeat */ 299 NOTE("backoff"); 300 endp = slow(m, m->coldp, endp-1, gf, gl); 301 if (endp == NULL) 302 break; /* defeat */ 303 /* try it on a shorter possibility */ 304 #ifndef NDEBUG 305 for (i = 1; i <= m->g->nsub; i++) { 306 assert(m->pmatch[i].rm_so == -1); 307 assert(m->pmatch[i].rm_eo == -1); 308 } 309 #endif 310 NOTE("backoff dissect"); 311 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 312 } 313 assert(dp == NULL || dp == endp); 314 if (dp != NULL) /* found a shorter one */ 315 break; 316 317 /* despite initial appearances, there is no match here */ 318 NOTE("false alarm"); 319 start = m->coldp + 1; /* recycle starting later */ 320 assert(start <= stop); 321 } 322 323 /* fill in the details if requested */ 324 if (nmatch > 0) { 325 pmatch[0].rm_so = m->coldp - m->offp; 326 pmatch[0].rm_eo = endp - m->offp; 327 } 328 if (nmatch > 1) { 329 assert(m->pmatch != NULL); 330 for (i = 1; i < nmatch; i++) 331 if (i <= m->g->nsub) 332 pmatch[i] = m->pmatch[i]; 333 else { 334 pmatch[i].rm_so = -1; 335 pmatch[i].rm_eo = -1; 336 } 337 } 338 339 if (m->pmatch != NULL) 340 free((char *)m->pmatch); 341 if (m->lastpos != NULL) 342 free((char *)m->lastpos); 343 STATETEARDOWN(m); 344 return(0); 345 } 346 347 /* 348 - dissect - figure out what matched what, no back references 349 == static char *dissect(struct match *m, char *start, \ 350 == char *stop, sopno startst, sopno stopst); 351 */ 352 static char * /* == stop (success) always */ 353 dissect(m, start, stop, startst, stopst) 354 struct match *m; 355 char *start; 356 char *stop; 357 sopno startst; 358 sopno stopst; 359 { 360 int i; 361 sopno ss; /* start sop of current subRE */ 362 sopno es; /* end sop of current subRE */ 363 char *sp; /* start of string matched by it */ 364 char *stp; /* string matched by it cannot pass here */ 365 char *rest; /* start of rest of string */ 366 char *tail; /* string unmatched by rest of RE */ 367 sopno ssub; /* start sop of subsubRE */ 368 sopno esub; /* end sop of subsubRE */ 369 char *ssp; /* start of string matched by subsubRE */ 370 char *sep; /* end of string matched by subsubRE */ 371 char *oldssp; /* previous ssp */ 372 char *dp; 373 374 AT("diss", start, stop, startst, stopst); 375 sp = start; 376 for (ss = startst; ss < stopst; ss = es) { 377 /* identify end of subRE */ 378 es = ss; 379 switch (OP(m->g->strip[es])) { 380 case OPLUS_: 381 case OQUEST_: 382 es += OPND(m->g->strip[es]); 383 break; 384 case OCH_: 385 while (OP(m->g->strip[es]) != O_CH) 386 es += OPND(m->g->strip[es]); 387 break; 388 } 389 es++; 390 391 /* figure out what it matched */ 392 switch (OP(m->g->strip[ss])) { 393 case OEND: 394 assert(nope); 395 break; 396 case OCHAR: 397 sp++; 398 break; 399 case OBOL: 400 case OEOL: 401 case OBOW: 402 case OEOW: 403 break; 404 case OANY: 405 case OANYOF: 406 sp++; 407 break; 408 case OBACK_: 409 case O_BACK: 410 assert(nope); 411 break; 412 /* cases where length of match is hard to find */ 413 case OQUEST_: 414 stp = stop; 415 for (;;) { 416 /* how long could this one be? */ 417 rest = slow(m, sp, stp, ss, es); 418 assert(rest != NULL); /* it did match */ 419 /* could the rest match the rest? */ 420 tail = slow(m, rest, stop, es, stopst); 421 if (tail == stop) 422 break; /* yes! */ 423 /* no -- try a shorter match for this one */ 424 stp = rest - 1; 425 assert(stp >= sp); /* it did work */ 426 } 427 ssub = ss + 1; 428 esub = es - 1; 429 /* did innards match? */ 430 if (slow(m, sp, rest, ssub, esub) != NULL) { 431 dp = dissect(m, sp, rest, ssub, esub); 432 assert(dp == rest); 433 } else /* no */ 434 assert(sp == rest); 435 sp = rest; 436 break; 437 case OPLUS_: 438 stp = stop; 439 for (;;) { 440 /* how long could this one be? */ 441 rest = slow(m, sp, stp, ss, es); 442 assert(rest != NULL); /* it did match */ 443 /* could the rest match the rest? */ 444 tail = slow(m, rest, stop, es, stopst); 445 if (tail == stop) 446 break; /* yes! */ 447 /* no -- try a shorter match for this one */ 448 stp = rest - 1; 449 assert(stp >= sp); /* it did work */ 450 } 451 ssub = ss + 1; 452 esub = es - 1; 453 ssp = sp; 454 oldssp = ssp; 455 for (;;) { /* find last match of innards */ 456 sep = slow(m, ssp, rest, ssub, esub); 457 if (sep == NULL || sep == ssp) 458 break; /* failed or matched null */ 459 oldssp = ssp; /* on to next try */ 460 ssp = sep; 461 } 462 if (sep == NULL) { 463 /* last successful match */ 464 sep = ssp; 465 ssp = oldssp; 466 } 467 assert(sep == rest); /* must exhaust substring */ 468 assert(slow(m, ssp, sep, ssub, esub) == rest); 469 dp = dissect(m, ssp, sep, ssub, esub); 470 assert(dp == sep); 471 sp = rest; 472 break; 473 case OCH_: 474 stp = stop; 475 for (;;) { 476 /* how long could this one be? */ 477 rest = slow(m, sp, stp, ss, es); 478 assert(rest != NULL); /* it did match */ 479 /* could the rest match the rest? */ 480 tail = slow(m, rest, stop, es, stopst); 481 if (tail == stop) 482 break; /* yes! */ 483 /* no -- try a shorter match for this one */ 484 stp = rest - 1; 485 assert(stp >= sp); /* it did work */ 486 } 487 ssub = ss + 1; 488 esub = ss + OPND(m->g->strip[ss]) - 1; 489 assert(OP(m->g->strip[esub]) == OOR1); 490 for (;;) { /* find first matching branch */ 491 if (slow(m, sp, rest, ssub, esub) == rest) 492 break; /* it matched all of it */ 493 /* that one missed, try next one */ 494 assert(OP(m->g->strip[esub]) == OOR1); 495 esub++; 496 assert(OP(m->g->strip[esub]) == OOR2); 497 ssub = esub + 1; 498 esub += OPND(m->g->strip[esub]); 499 if (OP(m->g->strip[esub]) == OOR2) 500 esub--; 501 else 502 assert(OP(m->g->strip[esub]) == O_CH); 503 } 504 dp = dissect(m, sp, rest, ssub, esub); 505 assert(dp == rest); 506 sp = rest; 507 break; 508 case O_PLUS: 509 case O_QUEST: 510 case OOR1: 511 case OOR2: 512 case O_CH: 513 assert(nope); 514 break; 515 case OLPAREN: 516 i = OPND(m->g->strip[ss]); 517 assert(0 < i && i <= m->g->nsub); 518 m->pmatch[i].rm_so = sp - m->offp; 519 break; 520 case ORPAREN: 521 i = OPND(m->g->strip[ss]); 522 assert(0 < i && i <= m->g->nsub); 523 m->pmatch[i].rm_eo = sp - m->offp; 524 break; 525 default: /* uh oh */ 526 assert(nope); 527 break; 528 } 529 } 530 531 assert(sp == stop); 532 return(sp); 533 } 534 535 /* 536 - backref - figure out what matched what, figuring in back references 537 == static char *backref(struct match *m, char *start, \ 538 == char *stop, sopno startst, sopno stopst, sopno lev); 539 */ 540 static char * /* == stop (success) or NULL (failure) */ 541 backref(m, start, stop, startst, stopst, lev) 542 struct match *m; 543 char *start; 544 char *stop; 545 sopno startst; 546 sopno stopst; 547 sopno lev; /* PLUS nesting level */ 548 { 549 int i; 550 sopno ss; /* start sop of current subRE */ 551 char *sp; /* start of string matched by it */ 552 sopno ssub; /* start sop of subsubRE */ 553 sopno esub; /* end sop of subsubRE */ 554 char *ssp; /* start of string matched by subsubRE */ 555 char *dp; 556 size_t len; 557 int hard; 558 sop s; 559 regoff_t offsave; 560 cset *cs; 561 562 AT("back", start, stop, startst, stopst); 563 sp = start; 564 565 /* get as far as we can with easy stuff */ 566 hard = 0; 567 for (ss = startst; !hard && ss < stopst; ss++) 568 switch (OP(s = m->g->strip[ss])) { 569 case OCHAR: 570 if (sp == stop || *sp++ != (char)OPND(s)) 571 return(NULL); 572 break; 573 case OANY: 574 if (sp == stop) 575 return(NULL); 576 sp++; 577 break; 578 case OANYOF: 579 cs = &m->g->sets[OPND(s)]; 580 if (sp == stop || !CHIN(cs, *sp++)) 581 return(NULL); 582 break; 583 case OBOL: 584 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 585 (sp < m->endp && *(sp-1) == '\n' && 586 (m->g->cflags®_NEWLINE)) ) 587 { /* yes */ } 588 else 589 return(NULL); 590 break; 591 case OEOL: 592 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 593 (sp < m->endp && *sp == '\n' && 594 (m->g->cflags®_NEWLINE)) ) 595 { /* yes */ } 596 else 597 return(NULL); 598 break; 599 case OBOW: 600 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 601 (sp < m->endp && *(sp-1) == '\n' && 602 (m->g->cflags®_NEWLINE)) || 603 (sp > m->beginp && 604 !ISWORD(*(sp-1))) ) && 605 (sp < m->endp && ISWORD(*sp)) ) 606 { /* yes */ } 607 else 608 return(NULL); 609 break; 610 case OEOW: 611 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 612 (sp < m->endp && *sp == '\n' && 613 (m->g->cflags®_NEWLINE)) || 614 (sp < m->endp && !ISWORD(*sp)) ) && 615 (sp > m->beginp && ISWORD(*(sp-1))) ) 616 { /* yes */ } 617 else 618 return(NULL); 619 break; 620 case O_QUEST: 621 break; 622 case OOR1: /* matches null but needs to skip */ 623 ss++; 624 s = m->g->strip[ss]; 625 do { 626 assert(OP(s) == OOR2); 627 ss += OPND(s); 628 } while (OP(s = m->g->strip[ss]) != O_CH); 629 /* note that the ss++ gets us past the O_CH */ 630 break; 631 default: /* have to make a choice */ 632 hard = 1; 633 break; 634 } 635 if (!hard) { /* that was it! */ 636 if (sp != stop) 637 return(NULL); 638 return(sp); 639 } 640 ss--; /* adjust for the for's final increment */ 641 642 /* the hard stuff */ 643 AT("hard", sp, stop, ss, stopst); 644 s = m->g->strip[ss]; 645 switch (OP(s)) { 646 case OBACK_: /* the vilest depths */ 647 i = OPND(s); 648 assert(0 < i && i <= m->g->nsub); 649 if (m->pmatch[i].rm_eo == -1) 650 return(NULL); 651 assert(m->pmatch[i].rm_so != -1); 652 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 653 assert(stop - m->beginp >= len); 654 if (sp > stop - len) 655 return(NULL); /* not enough left to match */ 656 ssp = m->offp + m->pmatch[i].rm_so; 657 if (memcmp(sp, ssp, len) != 0) 658 return(NULL); 659 while (m->g->strip[ss] != SOP(O_BACK, i)) 660 ss++; 661 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 662 break; 663 case OQUEST_: /* to null or not */ 664 dp = backref(m, sp, stop, ss+1, stopst, lev); 665 if (dp != NULL) 666 return(dp); /* not */ 667 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); 668 break; 669 case OPLUS_: 670 assert(m->lastpos != NULL); 671 assert(lev+1 <= m->g->nplus); 672 m->lastpos[lev+1] = sp; 673 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 674 break; 675 case O_PLUS: 676 if (sp == m->lastpos[lev]) /* last pass matched null */ 677 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 678 /* try another pass */ 679 m->lastpos[lev] = sp; 680 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); 681 if (dp == NULL) 682 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 683 else 684 return(dp); 685 break; 686 case OCH_: /* find the right one, if any */ 687 ssub = ss + 1; 688 esub = ss + OPND(s) - 1; 689 assert(OP(m->g->strip[esub]) == OOR1); 690 for (;;) { /* find first matching branch */ 691 dp = backref(m, sp, stop, ssub, esub, lev); 692 if (dp != NULL) 693 return(dp); 694 /* that one missed, try next one */ 695 if (OP(m->g->strip[esub]) == O_CH) 696 return(NULL); /* there is none */ 697 esub++; 698 assert(OP(m->g->strip[esub]) == OOR2); 699 ssub = esub + 1; 700 esub += OPND(m->g->strip[esub]); 701 if (OP(m->g->strip[esub]) == OOR2) 702 esub--; 703 else 704 assert(OP(m->g->strip[esub]) == O_CH); 705 } 706 break; 707 case OLPAREN: /* must undo assignment if rest fails */ 708 i = OPND(s); 709 assert(0 < i && i <= m->g->nsub); 710 offsave = m->pmatch[i].rm_so; 711 m->pmatch[i].rm_so = sp - m->offp; 712 dp = backref(m, sp, stop, ss+1, stopst, lev); 713 if (dp != NULL) 714 return(dp); 715 m->pmatch[i].rm_so = offsave; 716 return(NULL); 717 break; 718 case ORPAREN: /* must undo assignment if rest fails */ 719 i = OPND(s); 720 assert(0 < i && i <= m->g->nsub); 721 offsave = m->pmatch[i].rm_eo; 722 m->pmatch[i].rm_eo = sp - m->offp; 723 dp = backref(m, sp, stop, ss+1, stopst, lev); 724 if (dp != NULL) 725 return(dp); 726 m->pmatch[i].rm_eo = offsave; 727 return(NULL); 728 break; 729 default: /* uh oh */ 730 assert(nope); 731 break; 732 } 733 734 /* "can't happen" */ 735 assert(nope); 736 /* NOTREACHED */ 737 return "shut up gcc"; 738 } 739 740 /* 741 - fast - step through the string at top speed 742 == static char *fast(struct match *m, char *start, \ 743 == char *stop, sopno startst, sopno stopst); 744 */ 745 static char * /* where tentative match ended, or NULL */ 746 fast(m, start, stop, startst, stopst) 747 struct match *m; 748 char *start; 749 char *stop; 750 sopno startst; 751 sopno stopst; 752 { 753 states st = m->st; 754 states fresh = m->fresh; 755 states tmp = m->tmp; 756 char *p = start; 757 int c = (start == m->beginp) ? OUT : *(start-1); 758 int lastc; /* previous c */ 759 int flagch; 760 int i; 761 char *coldp; /* last p after which no match was underway */ 762 763 CLEAR(st); 764 SET1(st, startst); 765 st = step(m->g, startst, stopst, st, NOTHING, st); 766 ASSIGN(fresh, st); 767 SP("start", st, *p); 768 coldp = NULL; 769 for (;;) { 770 /* next character */ 771 lastc = c; 772 c = (p == m->endp) ? OUT : *p; 773 if (EQ(st, fresh)) 774 coldp = p; 775 776 /* is there an EOL and/or BOL between lastc and c? */ 777 flagch = '\0'; 778 i = 0; 779 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 780 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 781 flagch = BOL; 782 i = m->g->nbol; 783 } 784 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 785 (c == OUT && !(m->eflags®_NOTEOL)) ) { 786 flagch = (flagch == BOL) ? BOLEOL : EOL; 787 i += m->g->neol; 788 } 789 if (i != 0) { 790 for (; i > 0; i--) 791 st = step(m->g, startst, stopst, st, flagch, st); 792 SP("boleol", st, c); 793 } 794 795 /* how about a word boundary? */ 796 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 797 (c != OUT && ISWORD(c)) ) { 798 flagch = BOW; 799 } 800 if ( (lastc != OUT && ISWORD(lastc)) && 801 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 802 flagch = EOW; 803 } 804 if (flagch == BOW || flagch == EOW) { 805 st = step(m->g, startst, stopst, st, flagch, st); 806 SP("boweow", st, c); 807 } 808 809 /* are we done? */ 810 if (ISSET(st, stopst) || p == stop) 811 break; /* NOTE BREAK OUT */ 812 813 /* no, we must deal with this character */ 814 ASSIGN(tmp, st); 815 ASSIGN(st, fresh); 816 assert(c != OUT); 817 st = step(m->g, startst, stopst, tmp, c, st); 818 SP("aft", st, c); 819 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 820 p++; 821 } 822 823 assert(coldp != NULL); 824 m->coldp = coldp; 825 if (ISSET(st, stopst)) 826 return(p+1); 827 else 828 return(NULL); 829 } 830 831 /* 832 - slow - step through the string more deliberately 833 == static char *slow(struct match *m, char *start, \ 834 == char *stop, sopno startst, sopno stopst); 835 */ 836 static char * /* where it ended */ 837 slow(m, start, stop, startst, stopst) 838 struct match *m; 839 char *start; 840 char *stop; 841 sopno startst; 842 sopno stopst; 843 { 844 states st = m->st; 845 states empty = m->empty; 846 states tmp = m->tmp; 847 char *p = start; 848 int c = (start == m->beginp) ? OUT : *(start-1); 849 int lastc; /* previous c */ 850 int flagch; 851 int i; 852 char *matchp; /* last p at which a match ended */ 853 854 AT("slow", start, stop, startst, stopst); 855 CLEAR(st); 856 SET1(st, startst); 857 SP("sstart", st, *p); 858 st = step(m->g, startst, stopst, st, NOTHING, st); 859 matchp = NULL; 860 for (;;) { 861 /* next character */ 862 lastc = c; 863 c = (p == m->endp) ? OUT : *p; 864 865 /* is there an EOL and/or BOL between lastc and c? */ 866 flagch = '\0'; 867 i = 0; 868 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 869 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 870 flagch = BOL; 871 i = m->g->nbol; 872 } 873 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 874 (c == OUT && !(m->eflags®_NOTEOL)) ) { 875 flagch = (flagch == BOL) ? BOLEOL : EOL; 876 i += m->g->neol; 877 } 878 if (i != 0) { 879 for (; i > 0; i--) 880 st = step(m->g, startst, stopst, st, flagch, st); 881 SP("sboleol", st, c); 882 } 883 884 /* how about a word boundary? */ 885 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 886 (c != OUT && ISWORD(c)) ) { 887 flagch = BOW; 888 } 889 if ( (lastc != OUT && ISWORD(lastc)) && 890 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 891 flagch = EOW; 892 } 893 if (flagch == BOW || flagch == EOW) { 894 st = step(m->g, startst, stopst, st, flagch, st); 895 SP("sboweow", st, c); 896 } 897 898 /* are we done? */ 899 if (ISSET(st, stopst)) 900 matchp = p; 901 if (EQ(st, empty) || p == stop) 902 break; /* NOTE BREAK OUT */ 903 904 /* no, we must deal with this character */ 905 ASSIGN(tmp, st); 906 ASSIGN(st, empty); 907 assert(c != OUT); 908 st = step(m->g, startst, stopst, tmp, c, st); 909 SP("saft", st, c); 910 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 911 p++; 912 } 913 914 return(matchp); 915 } 916 917 918 /* 919 - step - map set of states reachable before char to set reachable after 920 == static states step(struct re_guts *g, sopno start, sopno stop, \ 921 == states bef, int ch, states aft); 922 == #define BOL (OUT+1) 923 == #define EOL (BOL+1) 924 == #define BOLEOL (BOL+2) 925 == #define NOTHING (BOL+3) 926 == #define BOW (BOL+4) 927 == #define EOW (BOL+5) 928 == #define CODEMAX (BOL+5) // highest code used 929 == #define NONCHAR(c) ((c) > CHAR_MAX) 930 == #define NNONCHAR (CODEMAX-CHAR_MAX) 931 */ 932 static states 933 step(g, start, stop, bef, ch, aft) 934 struct re_guts *g; 935 sopno start; /* start state within strip */ 936 sopno stop; /* state after stop state within strip */ 937 states bef; /* states reachable before */ 938 int ch; /* character or NONCHAR code */ 939 states aft; /* states already known reachable after */ 940 { 941 cset *cs; 942 sop s; 943 sopno pc; 944 onestate here; /* note, macros know this name */ 945 sopno look; 946 int i; 947 948 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 949 s = g->strip[pc]; 950 switch (OP(s)) { 951 case OEND: 952 assert(pc == stop-1); 953 break; 954 case OCHAR: 955 /* only characters can match */ 956 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 957 if (ch == (char)OPND(s)) 958 FWD(aft, bef, 1); 959 break; 960 case OBOL: 961 if (ch == BOL || ch == BOLEOL) 962 FWD(aft, bef, 1); 963 break; 964 case OEOL: 965 if (ch == EOL || ch == BOLEOL) 966 FWD(aft, bef, 1); 967 break; 968 case OBOW: 969 if (ch == BOW) 970 FWD(aft, bef, 1); 971 break; 972 case OEOW: 973 if (ch == EOW) 974 FWD(aft, bef, 1); 975 break; 976 case OANY: 977 if (!NONCHAR(ch)) 978 FWD(aft, bef, 1); 979 break; 980 case OANYOF: 981 cs = &g->sets[OPND(s)]; 982 if (!NONCHAR(ch) && CHIN(cs, ch)) 983 FWD(aft, bef, 1); 984 break; 985 case OBACK_: /* ignored here */ 986 case O_BACK: 987 FWD(aft, aft, 1); 988 break; 989 case OPLUS_: /* forward, this is just an empty */ 990 FWD(aft, aft, 1); 991 break; 992 case O_PLUS: /* both forward and back */ 993 FWD(aft, aft, 1); 994 i = ISSETBACK(aft, OPND(s)); 995 BACK(aft, aft, OPND(s)); 996 if (!i && ISSETBACK(aft, OPND(s))) { 997 /* oho, must reconsider loop body */ 998 pc -= OPND(s) + 1; 999 INIT(here, pc); 1000 } 1001 break; 1002 case OQUEST_: /* two branches, both forward */ 1003 FWD(aft, aft, 1); 1004 FWD(aft, aft, OPND(s)); 1005 break; 1006 case O_QUEST: /* just an empty */ 1007 FWD(aft, aft, 1); 1008 break; 1009 case OLPAREN: /* not significant here */ 1010 case ORPAREN: 1011 FWD(aft, aft, 1); 1012 break; 1013 case OCH_: /* mark the first two branches */ 1014 FWD(aft, aft, 1); 1015 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1016 FWD(aft, aft, OPND(s)); 1017 break; 1018 case OOR1: /* done a branch, find the O_CH */ 1019 if (ISSTATEIN(aft, here)) { 1020 for (look = 1; 1021 OP(s = g->strip[pc+look]) != O_CH; 1022 look += OPND(s)) 1023 assert(OP(s) == OOR2); 1024 FWD(aft, aft, look); 1025 } 1026 break; 1027 case OOR2: /* propagate OCH_'s marking */ 1028 FWD(aft, aft, 1); 1029 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1030 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1031 FWD(aft, aft, OPND(s)); 1032 } 1033 break; 1034 case O_CH: /* just empty */ 1035 FWD(aft, aft, 1); 1036 break; 1037 default: /* ooooops... */ 1038 assert(nope); 1039 break; 1040 } 1041 } 1042 1043 return(aft); 1044 } 1045 1046 #ifdef REDEBUG 1047 /* 1048 - print - print a set of states 1049 == #ifdef REDEBUG 1050 == static void print(struct match *m, char *caption, states st, \ 1051 == int ch, FILE *d); 1052 == #endif 1053 */ 1054 static void 1055 print(m, caption, st, ch, d) 1056 struct match *m; 1057 char *caption; 1058 states st; 1059 int ch; 1060 FILE *d; 1061 { 1062 struct re_guts *g = m->g; 1063 int i; 1064 int first = 1; 1065 1066 if (!(m->eflags®_TRACE)) 1067 return; 1068 1069 fprintf(d, "%s", caption); 1070 if (ch != '\0') 1071 fprintf(d, " %s", pchar(ch)); 1072 for (i = 0; i < g->nstates; i++) 1073 if (ISSET(st, i)) { 1074 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1075 first = 0; 1076 } 1077 fprintf(d, "\n"); 1078 } 1079 1080 /* 1081 - at - print current situation 1082 == #ifdef REDEBUG 1083 == static void at(struct match *m, char *title, char *start, char *stop, \ 1084 == sopno startst, sopno stopst); 1085 == #endif 1086 */ 1087 static void 1088 at(m, title, start, stop, startst, stopst) 1089 struct match *m; 1090 char *title; 1091 char *start; 1092 char *stop; 1093 sopno startst; 1094 sopno stopst; 1095 { 1096 if (!(m->eflags®_TRACE)) 1097 return; 1098 1099 printf("%s %s-", title, pchar(*start)); 1100 printf("%s ", pchar(*stop)); 1101 printf("%ld-%ld\n", (long)startst, (long)stopst); 1102 } 1103 1104 #ifndef PCHARDONE 1105 #define PCHARDONE /* never again */ 1106 /* 1107 - pchar - make a character printable 1108 == #ifdef REDEBUG 1109 == static char *pchar(int ch); 1110 == #endif 1111 * 1112 * Is this identical to regchar() over in debug.c? Well, yes. But a 1113 * duplicate here avoids having a debugging-capable regexec.o tied to 1114 * a matching debug.o, and this is convenient. It all disappears in 1115 * the non-debug compilation anyway, so it doesn't matter much. 1116 */ 1117 static char * /* -> representation */ 1118 pchar(ch) 1119 int ch; 1120 { 1121 static char pbuf[10]; 1122 1123 if (isprint((uch)ch) || ch == ' ') 1124 sprintf(pbuf, "%c", ch); 1125 else 1126 sprintf(pbuf, "\\%o", ch); 1127 return(pbuf); 1128 } 1129 #endif 1130 #endif 1131 1132 #undef matcher 1133 #undef fast 1134 #undef slow 1135 #undef dissect 1136 #undef backref 1137 #undef step 1138 #undef print 1139 #undef at 1140 #undef match 1141