1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)engine.c 8.5 (Berkeley) 3/20/94 38 */ 39 40 /* 41 * The matching engine and friends. This file is #included by regexec.c 42 * after suitable #defines of a variety of macros used herein, so that 43 * different state representations can be used without duplicating masses 44 * of code. 45 */ 46 47 #ifdef SNAMES 48 #define matcher smatcher 49 #define fast sfast 50 #define slow sslow 51 #define dissect sdissect 52 #define backref sbackref 53 #define step sstep 54 #define print sprint 55 #define at sat 56 #define match smat 57 #endif 58 #ifdef LNAMES 59 #define matcher lmatcher 60 #define fast lfast 61 #define slow lslow 62 #define dissect ldissect 63 #define backref lbackref 64 #define step lstep 65 #define print lprint 66 #define at lat 67 #define match lmat 68 #endif 69 70 /* another structure passed up and down to avoid zillions of parameters */ 71 struct match { 72 struct re_guts *g; 73 int eflags; 74 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 75 char *offp; /* offsets work from here */ 76 char *beginp; /* start of string -- virtual NUL precedes */ 77 char *endp; /* end of string -- virtual NUL here */ 78 char *coldp; /* can be no match starting before here */ 79 char **lastpos; /* [nplus+1] */ 80 STATEVARS; 81 states st; /* current states */ 82 states fresh; /* states for a fresh start */ 83 states tmp; /* temporary */ 84 states empty; /* empty set of states */ 85 }; 86 87 /* ========= begin header generated by ./mkh ========= */ 88 #ifdef __cplusplus 89 extern "C" { 90 #endif 91 92 /* === engine.c === */ 93 static int matcher __P((struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags)); 94 static char *dissect __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 95 static char *backref __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev)); 96 static char *fast __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 97 static char *slow __P((struct match *m, char *start, char *stop, sopno startst, sopno stopst)); 98 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft)); 99 #define BOL (OUT+1) 100 #define EOL (BOL+1) 101 #define BOLEOL (BOL+2) 102 #define NOTHING (BOL+3) 103 #define BOW (BOL+4) 104 #define EOW (BOL+5) 105 #define CODEMAX (BOL+5) /* highest code used */ 106 #define NONCHAR(c) ((c) > CHAR_MAX) 107 #define NNONCHAR (CODEMAX-CHAR_MAX) 108 #ifdef REDEBUG 109 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d)); 110 #endif 111 #ifdef REDEBUG 112 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst)); 113 #endif 114 #ifdef REDEBUG 115 static char *pchar __P((int ch)); 116 #endif 117 118 #ifdef __cplusplus 119 } 120 #endif 121 /* ========= end header generated by ./mkh ========= */ 122 123 #ifdef REDEBUG 124 #define SP(t, s, c) print(m, t, s, c, stdout) 125 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 126 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 127 #else 128 #define SP(t, s, c) /* nothing */ 129 #define AT(t, p1, p2, s1, s2) /* nothing */ 130 #define NOTE(s) /* nothing */ 131 #endif 132 133 /* 134 - matcher - the actual matching engine 135 == static int matcher(register struct re_guts *g, char *string, \ 136 == size_t nmatch, regmatch_t pmatch[], int eflags); 137 */ 138 static int /* 0 success, REG_NOMATCH failure */ 139 matcher(g, string, nmatch, pmatch, eflags) 140 register struct re_guts *g; 141 char *string; 142 size_t nmatch; 143 regmatch_t pmatch[]; 144 int eflags; 145 { 146 register char *endp; 147 register int i; 148 struct match mv; 149 register struct match *m = &mv; 150 register char *dp; 151 register const sopno gf = g->firststate+1; /* +1 for OEND */ 152 register const sopno gl = g->laststate; 153 char *start; 154 char *stop; 155 156 /* simplify the situation where possible */ 157 if (g->cflags®_NOSUB) 158 nmatch = 0; 159 if (eflags®_STARTEND) { 160 start = string + pmatch[0].rm_so; 161 stop = string + pmatch[0].rm_eo; 162 } else { 163 start = string; 164 stop = start + strlen(start); 165 } 166 if (stop < start) 167 return(REG_INVARG); 168 169 /* prescreening; this does wonders for this rather slow code */ 170 if (g->must != NULL) { 171 for (dp = start; dp < stop; dp++) 172 if (*dp == g->must[0] && stop - dp >= g->mlen && 173 memcmp(dp, g->must, (size_t)g->mlen) == 0) 174 break; 175 if (dp == stop) /* we didn't find g->must */ 176 return(REG_NOMATCH); 177 } 178 179 /* match struct setup */ 180 m->g = g; 181 m->eflags = eflags; 182 m->pmatch = NULL; 183 m->lastpos = NULL; 184 m->offp = string; 185 m->beginp = start; 186 m->endp = stop; 187 STATESETUP(m, 4); 188 SETUP(m->st); 189 SETUP(m->fresh); 190 SETUP(m->tmp); 191 SETUP(m->empty); 192 CLEAR(m->empty); 193 194 /* this loop does only one repetition except for backrefs */ 195 for (;;) { 196 endp = fast(m, start, stop, gf, gl); 197 if (endp == NULL) { /* a miss */ 198 STATETEARDOWN(m); 199 return(REG_NOMATCH); 200 } 201 if (nmatch == 0 && !g->backrefs) 202 break; /* no further info needed */ 203 204 /* where? */ 205 assert(m->coldp != NULL); 206 for (;;) { 207 NOTE("finding start"); 208 endp = slow(m, m->coldp, stop, gf, gl); 209 if (endp != NULL) 210 break; 211 assert(m->coldp < m->endp); 212 m->coldp++; 213 } 214 if (nmatch == 1 && !g->backrefs) 215 break; /* no further info needed */ 216 217 /* oh my, he wants the subexpressions... */ 218 if (m->pmatch == NULL) 219 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 220 sizeof(regmatch_t)); 221 if (m->pmatch == NULL) { 222 STATETEARDOWN(m); 223 return(REG_ESPACE); 224 } 225 for (i = 1; i <= m->g->nsub; i++) 226 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 227 if (!g->backrefs && !(m->eflags®_BACKR)) { 228 NOTE("dissecting"); 229 dp = dissect(m, m->coldp, endp, gf, gl); 230 } else { 231 if (g->nplus > 0 && m->lastpos == NULL) 232 m->lastpos = (char **)malloc((g->nplus+1) * 233 sizeof(char *)); 234 if (g->nplus > 0 && m->lastpos == NULL) { 235 free(m->pmatch); 236 STATETEARDOWN(m); 237 return(REG_ESPACE); 238 } 239 NOTE("backref dissect"); 240 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 241 } 242 if (dp != NULL) 243 break; 244 245 /* uh-oh... we couldn't find a subexpression-level match */ 246 assert(g->backrefs); /* must be back references doing it */ 247 assert(g->nplus == 0 || m->lastpos != NULL); 248 for (;;) { 249 if (dp != NULL || endp <= m->coldp) 250 break; /* defeat */ 251 NOTE("backoff"); 252 endp = slow(m, m->coldp, endp-1, gf, gl); 253 if (endp == NULL) 254 break; /* defeat */ 255 /* try it on a shorter possibility */ 256 #ifndef NDEBUG 257 for (i = 1; i <= m->g->nsub; i++) { 258 assert(m->pmatch[i].rm_so == -1); 259 assert(m->pmatch[i].rm_eo == -1); 260 } 261 #endif 262 NOTE("backoff dissect"); 263 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 264 } 265 assert(dp == NULL || dp == endp); 266 if (dp != NULL) /* found a shorter one */ 267 break; 268 269 /* despite initial appearances, there is no match here */ 270 NOTE("false alarm"); 271 start = m->coldp + 1; /* recycle starting later */ 272 assert(start <= stop); 273 } 274 275 /* fill in the details if requested */ 276 if (nmatch > 0) { 277 pmatch[0].rm_so = m->coldp - m->offp; 278 pmatch[0].rm_eo = endp - m->offp; 279 } 280 if (nmatch > 1) { 281 assert(m->pmatch != NULL); 282 for (i = 1; i < nmatch; i++) 283 if (i <= m->g->nsub) 284 pmatch[i] = m->pmatch[i]; 285 else { 286 pmatch[i].rm_so = -1; 287 pmatch[i].rm_eo = -1; 288 } 289 } 290 291 if (m->pmatch != NULL) 292 free((char *)m->pmatch); 293 if (m->lastpos != NULL) 294 free((char *)m->lastpos); 295 STATETEARDOWN(m); 296 return(0); 297 } 298 299 /* 300 - dissect - figure out what matched what, no back references 301 == static char *dissect(register struct match *m, char *start, \ 302 == char *stop, sopno startst, sopno stopst); 303 */ 304 static char * /* == stop (success) always */ 305 dissect(m, start, stop, startst, stopst) 306 register struct match *m; 307 char *start; 308 char *stop; 309 sopno startst; 310 sopno stopst; 311 { 312 register int i; 313 register sopno ss; /* start sop of current subRE */ 314 register sopno es; /* end sop of current subRE */ 315 register char *sp; /* start of string matched by it */ 316 register char *stp; /* string matched by it cannot pass here */ 317 register char *rest; /* start of rest of string */ 318 register char *tail; /* string unmatched by rest of RE */ 319 register sopno ssub; /* start sop of subsubRE */ 320 register sopno esub; /* end sop of subsubRE */ 321 register char *ssp; /* start of string matched by subsubRE */ 322 register char *sep; /* end of string matched by subsubRE */ 323 register char *oldssp; /* previous ssp */ 324 register char *dp; 325 326 AT("diss", start, stop, startst, stopst); 327 sp = start; 328 for (ss = startst; ss < stopst; ss = es) { 329 /* identify end of subRE */ 330 es = ss; 331 switch (OP(m->g->strip[es])) { 332 case OPLUS_: 333 case OQUEST_: 334 es += OPND(m->g->strip[es]); 335 break; 336 case OCH_: 337 while (OP(m->g->strip[es]) != O_CH) 338 es += OPND(m->g->strip[es]); 339 break; 340 } 341 es++; 342 343 /* figure out what it matched */ 344 switch (OP(m->g->strip[ss])) { 345 case OEND: 346 assert(nope); 347 break; 348 case OCHAR: 349 sp++; 350 break; 351 case OBOL: 352 case OEOL: 353 case OBOW: 354 case OEOW: 355 break; 356 case OANY: 357 case OANYOF: 358 sp++; 359 break; 360 case OBACK_: 361 case O_BACK: 362 assert(nope); 363 break; 364 /* cases where length of match is hard to find */ 365 case OQUEST_: 366 stp = stop; 367 for (;;) { 368 /* how long could this one be? */ 369 rest = slow(m, sp, stp, ss, es); 370 assert(rest != NULL); /* it did match */ 371 /* could the rest match the rest? */ 372 tail = slow(m, rest, stop, es, stopst); 373 if (tail == stop) 374 break; /* yes! */ 375 /* no -- try a shorter match for this one */ 376 stp = rest - 1; 377 assert(stp >= sp); /* it did work */ 378 } 379 ssub = ss + 1; 380 esub = es - 1; 381 /* did innards match? */ 382 if (slow(m, sp, rest, ssub, esub) != NULL) { 383 dp = dissect(m, sp, rest, ssub, esub); 384 assert(dp == rest); 385 } else /* no */ 386 assert(sp == rest); 387 sp = rest; 388 break; 389 case OPLUS_: 390 stp = stop; 391 for (;;) { 392 /* how long could this one be? */ 393 rest = slow(m, sp, stp, ss, es); 394 assert(rest != NULL); /* it did match */ 395 /* could the rest match the rest? */ 396 tail = slow(m, rest, stop, es, stopst); 397 if (tail == stop) 398 break; /* yes! */ 399 /* no -- try a shorter match for this one */ 400 stp = rest - 1; 401 assert(stp >= sp); /* it did work */ 402 } 403 ssub = ss + 1; 404 esub = es - 1; 405 ssp = sp; 406 oldssp = ssp; 407 for (;;) { /* find last match of innards */ 408 sep = slow(m, ssp, rest, ssub, esub); 409 if (sep == NULL || sep == ssp) 410 break; /* failed or matched null */ 411 oldssp = ssp; /* on to next try */ 412 ssp = sep; 413 } 414 if (sep == NULL) { 415 /* last successful match */ 416 sep = ssp; 417 ssp = oldssp; 418 } 419 assert(sep == rest); /* must exhaust substring */ 420 assert(slow(m, ssp, sep, ssub, esub) == rest); 421 dp = dissect(m, ssp, sep, ssub, esub); 422 assert(dp == sep); 423 sp = rest; 424 break; 425 case OCH_: 426 stp = stop; 427 for (;;) { 428 /* how long could this one be? */ 429 rest = slow(m, sp, stp, ss, es); 430 assert(rest != NULL); /* it did match */ 431 /* could the rest match the rest? */ 432 tail = slow(m, rest, stop, es, stopst); 433 if (tail == stop) 434 break; /* yes! */ 435 /* no -- try a shorter match for this one */ 436 stp = rest - 1; 437 assert(stp >= sp); /* it did work */ 438 } 439 ssub = ss + 1; 440 esub = ss + OPND(m->g->strip[ss]) - 1; 441 assert(OP(m->g->strip[esub]) == OOR1); 442 for (;;) { /* find first matching branch */ 443 if (slow(m, sp, rest, ssub, esub) == rest) 444 break; /* it matched all of it */ 445 /* that one missed, try next one */ 446 assert(OP(m->g->strip[esub]) == OOR1); 447 esub++; 448 assert(OP(m->g->strip[esub]) == OOR2); 449 ssub = esub + 1; 450 esub += OPND(m->g->strip[esub]); 451 if (OP(m->g->strip[esub]) == OOR2) 452 esub--; 453 else 454 assert(OP(m->g->strip[esub]) == O_CH); 455 } 456 dp = dissect(m, sp, rest, ssub, esub); 457 assert(dp == rest); 458 sp = rest; 459 break; 460 case O_PLUS: 461 case O_QUEST: 462 case OOR1: 463 case OOR2: 464 case O_CH: 465 assert(nope); 466 break; 467 case OLPAREN: 468 i = OPND(m->g->strip[ss]); 469 assert(0 < i && i <= m->g->nsub); 470 m->pmatch[i].rm_so = sp - m->offp; 471 break; 472 case ORPAREN: 473 i = OPND(m->g->strip[ss]); 474 assert(0 < i && i <= m->g->nsub); 475 m->pmatch[i].rm_eo = sp - m->offp; 476 break; 477 default: /* uh oh */ 478 assert(nope); 479 break; 480 } 481 } 482 483 assert(sp == stop); 484 return(sp); 485 } 486 487 /* 488 - backref - figure out what matched what, figuring in back references 489 == static char *backref(register struct match *m, char *start, \ 490 == char *stop, sopno startst, sopno stopst, sopno lev); 491 */ 492 static char * /* == stop (success) or NULL (failure) */ 493 backref(m, start, stop, startst, stopst, lev) 494 register struct match *m; 495 char *start; 496 char *stop; 497 sopno startst; 498 sopno stopst; 499 sopno lev; /* PLUS nesting level */ 500 { 501 register int i; 502 register sopno ss; /* start sop of current subRE */ 503 register char *sp; /* start of string matched by it */ 504 register sopno ssub; /* start sop of subsubRE */ 505 register sopno esub; /* end sop of subsubRE */ 506 register char *ssp; /* start of string matched by subsubRE */ 507 register char *dp; 508 register size_t len; 509 register int hard; 510 register sop s; 511 register regoff_t offsave; 512 register cset *cs; 513 514 AT("back", start, stop, startst, stopst); 515 sp = start; 516 517 /* get as far as we can with easy stuff */ 518 hard = 0; 519 for (ss = startst; !hard && ss < stopst; ss++) 520 switch (OP(s = m->g->strip[ss])) { 521 case OCHAR: 522 if (sp == stop || *sp++ != (char)OPND(s)) 523 return(NULL); 524 break; 525 case OANY: 526 if (sp == stop) 527 return(NULL); 528 sp++; 529 break; 530 case OANYOF: 531 cs = &m->g->sets[OPND(s)]; 532 if (sp == stop || !CHIN(cs, *sp++)) 533 return(NULL); 534 break; 535 case OBOL: 536 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 537 (sp < m->endp && *(sp-1) == '\n' && 538 (m->g->cflags®_NEWLINE)) ) 539 { /* yes */ } 540 else 541 return(NULL); 542 break; 543 case OEOL: 544 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 545 (sp < m->endp && *sp == '\n' && 546 (m->g->cflags®_NEWLINE)) ) 547 { /* yes */ } 548 else 549 return(NULL); 550 break; 551 case OBOW: 552 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 553 (sp < m->endp && *(sp-1) == '\n' && 554 (m->g->cflags®_NEWLINE)) || 555 (sp > m->beginp && 556 !ISWORD(*(sp-1))) ) && 557 (sp < m->endp && ISWORD(*sp)) ) 558 { /* yes */ } 559 else 560 return(NULL); 561 break; 562 case OEOW: 563 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 564 (sp < m->endp && *sp == '\n' && 565 (m->g->cflags®_NEWLINE)) || 566 (sp < m->endp && !ISWORD(*sp)) ) && 567 (sp > m->beginp && ISWORD(*(sp-1))) ) 568 { /* yes */ } 569 else 570 return(NULL); 571 break; 572 case O_QUEST: 573 break; 574 case OOR1: /* matches null but needs to skip */ 575 ss++; 576 s = m->g->strip[ss]; 577 do { 578 assert(OP(s) == OOR2); 579 ss += OPND(s); 580 } while (OP(s = m->g->strip[ss]) != O_CH); 581 /* note that the ss++ gets us past the O_CH */ 582 break; 583 default: /* have to make a choice */ 584 hard = 1; 585 break; 586 } 587 if (!hard) { /* that was it! */ 588 if (sp != stop) 589 return(NULL); 590 return(sp); 591 } 592 ss--; /* adjust for the for's final increment */ 593 594 /* the hard stuff */ 595 AT("hard", sp, stop, ss, stopst); 596 s = m->g->strip[ss]; 597 switch (OP(s)) { 598 case OBACK_: /* the vilest depths */ 599 i = OPND(s); 600 assert(0 < i && i <= m->g->nsub); 601 if (m->pmatch[i].rm_eo == -1) 602 return(NULL); 603 assert(m->pmatch[i].rm_so != -1); 604 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 605 assert(stop - m->beginp >= len); 606 if (sp > stop - len) 607 return(NULL); /* not enough left to match */ 608 ssp = m->offp + m->pmatch[i].rm_so; 609 if (memcmp(sp, ssp, len) != 0) 610 return(NULL); 611 while (m->g->strip[ss] != SOP(O_BACK, i)) 612 ss++; 613 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 614 break; 615 case OQUEST_: /* to null or not */ 616 dp = backref(m, sp, stop, ss+1, stopst, lev); 617 if (dp != NULL) 618 return(dp); /* not */ 619 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); 620 break; 621 case OPLUS_: 622 assert(m->lastpos != NULL); 623 assert(lev+1 <= m->g->nplus); 624 m->lastpos[lev+1] = sp; 625 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 626 break; 627 case O_PLUS: 628 if (sp == m->lastpos[lev]) /* last pass matched null */ 629 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 630 /* try another pass */ 631 m->lastpos[lev] = sp; 632 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); 633 if (dp == NULL) 634 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 635 else 636 return(dp); 637 break; 638 case OCH_: /* find the right one, if any */ 639 ssub = ss + 1; 640 esub = ss + OPND(s) - 1; 641 assert(OP(m->g->strip[esub]) == OOR1); 642 for (;;) { /* find first matching branch */ 643 dp = backref(m, sp, stop, ssub, esub, lev); 644 if (dp != NULL) 645 return(dp); 646 /* that one missed, try next one */ 647 if (OP(m->g->strip[esub]) == O_CH) 648 return(NULL); /* there is none */ 649 esub++; 650 assert(OP(m->g->strip[esub]) == OOR2); 651 ssub = esub + 1; 652 esub += OPND(m->g->strip[esub]); 653 if (OP(m->g->strip[esub]) == OOR2) 654 esub--; 655 else 656 assert(OP(m->g->strip[esub]) == O_CH); 657 } 658 break; 659 case OLPAREN: /* must undo assignment if rest fails */ 660 i = OPND(s); 661 assert(0 < i && i <= m->g->nsub); 662 offsave = m->pmatch[i].rm_so; 663 m->pmatch[i].rm_so = sp - m->offp; 664 dp = backref(m, sp, stop, ss+1, stopst, lev); 665 if (dp != NULL) 666 return(dp); 667 m->pmatch[i].rm_so = offsave; 668 return(NULL); 669 break; 670 case ORPAREN: /* must undo assignment if rest fails */ 671 i = OPND(s); 672 assert(0 < i && i <= m->g->nsub); 673 offsave = m->pmatch[i].rm_eo; 674 m->pmatch[i].rm_eo = sp - m->offp; 675 dp = backref(m, sp, stop, ss+1, stopst, lev); 676 if (dp != NULL) 677 return(dp); 678 m->pmatch[i].rm_eo = offsave; 679 return(NULL); 680 break; 681 default: /* uh oh */ 682 assert(nope); 683 break; 684 } 685 686 /* "can't happen" */ 687 assert(nope); 688 /* NOTREACHED */ 689 return "shut up gcc"; 690 } 691 692 /* 693 - fast - step through the string at top speed 694 == static char *fast(register struct match *m, char *start, \ 695 == char *stop, sopno startst, sopno stopst); 696 */ 697 static char * /* where tentative match ended, or NULL */ 698 fast(m, start, stop, startst, stopst) 699 register struct match *m; 700 char *start; 701 char *stop; 702 sopno startst; 703 sopno stopst; 704 { 705 register states st = m->st; 706 register states fresh = m->fresh; 707 register states tmp = m->tmp; 708 register char *p = start; 709 register int c = (start == m->beginp) ? OUT : *(start-1); 710 register int lastc; /* previous c */ 711 register int flagch; 712 register int i; 713 register char *coldp; /* last p after which no match was underway */ 714 715 CLEAR(st); 716 SET1(st, startst); 717 st = step(m->g, startst, stopst, st, NOTHING, st); 718 ASSIGN(fresh, st); 719 SP("start", st, *p); 720 coldp = NULL; 721 for (;;) { 722 /* next character */ 723 lastc = c; 724 c = (p == m->endp) ? OUT : *p; 725 if (EQ(st, fresh)) 726 coldp = p; 727 728 /* is there an EOL and/or BOL between lastc and c? */ 729 flagch = '\0'; 730 i = 0; 731 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 732 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 733 flagch = BOL; 734 i = m->g->nbol; 735 } 736 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 737 (c == OUT && !(m->eflags®_NOTEOL)) ) { 738 flagch = (flagch == BOL) ? BOLEOL : EOL; 739 i += m->g->neol; 740 } 741 if (i != 0) { 742 for (; i > 0; i--) 743 st = step(m->g, startst, stopst, st, flagch, st); 744 SP("boleol", st, c); 745 } 746 747 /* how about a word boundary? */ 748 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 749 (c != OUT && ISWORD(c)) ) { 750 flagch = BOW; 751 } 752 if ( (lastc != OUT && ISWORD(lastc)) && 753 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 754 flagch = EOW; 755 } 756 if (flagch == BOW || flagch == EOW) { 757 st = step(m->g, startst, stopst, st, flagch, st); 758 SP("boweow", st, c); 759 } 760 761 /* are we done? */ 762 if (ISSET(st, stopst) || p == stop) 763 break; /* NOTE BREAK OUT */ 764 765 /* no, we must deal with this character */ 766 ASSIGN(tmp, st); 767 ASSIGN(st, fresh); 768 assert(c != OUT); 769 st = step(m->g, startst, stopst, tmp, c, st); 770 SP("aft", st, c); 771 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 772 p++; 773 } 774 775 assert(coldp != NULL); 776 m->coldp = coldp; 777 if (ISSET(st, stopst)) 778 return(p+1); 779 else 780 return(NULL); 781 } 782 783 /* 784 - slow - step through the string more deliberately 785 == static char *slow(register struct match *m, char *start, \ 786 == char *stop, sopno startst, sopno stopst); 787 */ 788 static char * /* where it ended */ 789 slow(m, start, stop, startst, stopst) 790 register struct match *m; 791 char *start; 792 char *stop; 793 sopno startst; 794 sopno stopst; 795 { 796 register states st = m->st; 797 register states empty = m->empty; 798 register states tmp = m->tmp; 799 register char *p = start; 800 register int c = (start == m->beginp) ? OUT : *(start-1); 801 register int lastc; /* previous c */ 802 register int flagch; 803 register int i; 804 register char *matchp; /* last p at which a match ended */ 805 806 AT("slow", start, stop, startst, stopst); 807 CLEAR(st); 808 SET1(st, startst); 809 SP("sstart", st, *p); 810 st = step(m->g, startst, stopst, st, NOTHING, st); 811 matchp = NULL; 812 for (;;) { 813 /* next character */ 814 lastc = c; 815 c = (p == m->endp) ? OUT : *p; 816 817 /* is there an EOL and/or BOL between lastc and c? */ 818 flagch = '\0'; 819 i = 0; 820 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 821 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 822 flagch = BOL; 823 i = m->g->nbol; 824 } 825 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 826 (c == OUT && !(m->eflags®_NOTEOL)) ) { 827 flagch = (flagch == BOL) ? BOLEOL : EOL; 828 i += m->g->neol; 829 } 830 if (i != 0) { 831 for (; i > 0; i--) 832 st = step(m->g, startst, stopst, st, flagch, st); 833 SP("sboleol", st, c); 834 } 835 836 /* how about a word boundary? */ 837 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 838 (c != OUT && ISWORD(c)) ) { 839 flagch = BOW; 840 } 841 if ( (lastc != OUT && ISWORD(lastc)) && 842 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 843 flagch = EOW; 844 } 845 if (flagch == BOW || flagch == EOW) { 846 st = step(m->g, startst, stopst, st, flagch, st); 847 SP("sboweow", st, c); 848 } 849 850 /* are we done? */ 851 if (ISSET(st, stopst)) 852 matchp = p; 853 if (EQ(st, empty) || p == stop) 854 break; /* NOTE BREAK OUT */ 855 856 /* no, we must deal with this character */ 857 ASSIGN(tmp, st); 858 ASSIGN(st, empty); 859 assert(c != OUT); 860 st = step(m->g, startst, stopst, tmp, c, st); 861 SP("saft", st, c); 862 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 863 p++; 864 } 865 866 return(matchp); 867 } 868 869 870 /* 871 - step - map set of states reachable before char to set reachable after 872 == static states step(register struct re_guts *g, sopno start, sopno stop, \ 873 == register states bef, int ch, register states aft); 874 == #define BOL (OUT+1) 875 == #define EOL (BOL+1) 876 == #define BOLEOL (BOL+2) 877 == #define NOTHING (BOL+3) 878 == #define BOW (BOL+4) 879 == #define EOW (BOL+5) 880 == #define CODEMAX (BOL+5) // highest code used 881 == #define NONCHAR(c) ((c) > CHAR_MAX) 882 == #define NNONCHAR (CODEMAX-CHAR_MAX) 883 */ 884 static states 885 step(g, start, stop, bef, ch, aft) 886 register struct re_guts *g; 887 sopno start; /* start state within strip */ 888 sopno stop; /* state after stop state within strip */ 889 register states bef; /* states reachable before */ 890 int ch; /* character or NONCHAR code */ 891 register states aft; /* states already known reachable after */ 892 { 893 register cset *cs; 894 register sop s; 895 register sopno pc; 896 register onestate here; /* note, macros know this name */ 897 register sopno look; 898 register int i; 899 900 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 901 s = g->strip[pc]; 902 switch (OP(s)) { 903 case OEND: 904 assert(pc == stop-1); 905 break; 906 case OCHAR: 907 /* only characters can match */ 908 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 909 if (ch == (char)OPND(s)) 910 FWD(aft, bef, 1); 911 break; 912 case OBOL: 913 if (ch == BOL || ch == BOLEOL) 914 FWD(aft, bef, 1); 915 break; 916 case OEOL: 917 if (ch == EOL || ch == BOLEOL) 918 FWD(aft, bef, 1); 919 break; 920 case OBOW: 921 if (ch == BOW) 922 FWD(aft, bef, 1); 923 break; 924 case OEOW: 925 if (ch == EOW) 926 FWD(aft, bef, 1); 927 break; 928 case OANY: 929 if (!NONCHAR(ch)) 930 FWD(aft, bef, 1); 931 break; 932 case OANYOF: 933 cs = &g->sets[OPND(s)]; 934 if (!NONCHAR(ch) && CHIN(cs, ch)) 935 FWD(aft, bef, 1); 936 break; 937 case OBACK_: /* ignored here */ 938 case O_BACK: 939 FWD(aft, aft, 1); 940 break; 941 case OPLUS_: /* forward, this is just an empty */ 942 FWD(aft, aft, 1); 943 break; 944 case O_PLUS: /* both forward and back */ 945 FWD(aft, aft, 1); 946 i = ISSETBACK(aft, OPND(s)); 947 BACK(aft, aft, OPND(s)); 948 if (!i && ISSETBACK(aft, OPND(s))) { 949 /* oho, must reconsider loop body */ 950 pc -= OPND(s) + 1; 951 INIT(here, pc); 952 } 953 break; 954 case OQUEST_: /* two branches, both forward */ 955 FWD(aft, aft, 1); 956 FWD(aft, aft, OPND(s)); 957 break; 958 case O_QUEST: /* just an empty */ 959 FWD(aft, aft, 1); 960 break; 961 case OLPAREN: /* not significant here */ 962 case ORPAREN: 963 FWD(aft, aft, 1); 964 break; 965 case OCH_: /* mark the first two branches */ 966 FWD(aft, aft, 1); 967 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 968 FWD(aft, aft, OPND(s)); 969 break; 970 case OOR1: /* done a branch, find the O_CH */ 971 if (ISSTATEIN(aft, here)) { 972 for (look = 1; 973 OP(s = g->strip[pc+look]) != O_CH; 974 look += OPND(s)) 975 assert(OP(s) == OOR2); 976 FWD(aft, aft, look); 977 } 978 break; 979 case OOR2: /* propagate OCH_'s marking */ 980 FWD(aft, aft, 1); 981 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 982 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 983 FWD(aft, aft, OPND(s)); 984 } 985 break; 986 case O_CH: /* just empty */ 987 FWD(aft, aft, 1); 988 break; 989 default: /* ooooops... */ 990 assert(nope); 991 break; 992 } 993 } 994 995 return(aft); 996 } 997 998 #ifdef REDEBUG 999 /* 1000 - print - print a set of states 1001 == #ifdef REDEBUG 1002 == static void print(struct match *m, char *caption, states st, \ 1003 == int ch, FILE *d); 1004 == #endif 1005 */ 1006 static void 1007 print(m, caption, st, ch, d) 1008 struct match *m; 1009 char *caption; 1010 states st; 1011 int ch; 1012 FILE *d; 1013 { 1014 register struct re_guts *g = m->g; 1015 register int i; 1016 register int first = 1; 1017 1018 if (!(m->eflags®_TRACE)) 1019 return; 1020 1021 fprintf(d, "%s", caption); 1022 if (ch != '\0') 1023 fprintf(d, " %s", pchar(ch)); 1024 for (i = 0; i < g->nstates; i++) 1025 if (ISSET(st, i)) { 1026 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1027 first = 0; 1028 } 1029 fprintf(d, "\n"); 1030 } 1031 1032 /* 1033 - at - print current situation 1034 == #ifdef REDEBUG 1035 == static void at(struct match *m, char *title, char *start, char *stop, \ 1036 == sopno startst, sopno stopst); 1037 == #endif 1038 */ 1039 static void 1040 at(m, title, start, stop, startst, stopst) 1041 struct match *m; 1042 char *title; 1043 char *start; 1044 char *stop; 1045 sopno startst; 1046 sopno stopst; 1047 { 1048 if (!(m->eflags®_TRACE)) 1049 return; 1050 1051 printf("%s %s-", title, pchar(*start)); 1052 printf("%s ", pchar(*stop)); 1053 printf("%ld-%ld\n", (long)startst, (long)stopst); 1054 } 1055 1056 #ifndef PCHARDONE 1057 #define PCHARDONE /* never again */ 1058 /* 1059 - pchar - make a character printable 1060 == #ifdef REDEBUG 1061 == static char *pchar(int ch); 1062 == #endif 1063 * 1064 * Is this identical to regchar() over in debug.c? Well, yes. But a 1065 * duplicate here avoids having a debugging-capable regexec.o tied to 1066 * a matching debug.o, and this is convenient. It all disappears in 1067 * the non-debug compilation anyway, so it doesn't matter much. 1068 */ 1069 static char * /* -> representation */ 1070 pchar(ch) 1071 int ch; 1072 { 1073 static char pbuf[10]; 1074 1075 if (isprint((uch)ch) || ch == ' ') 1076 sprintf(pbuf, "%c", ch); 1077 else 1078 sprintf(pbuf, "\\%o", ch); 1079 return(pbuf); 1080 } 1081 #endif 1082 #endif 1083 1084 #undef matcher 1085 #undef fast 1086 #undef slow 1087 #undef dissect 1088 #undef backref 1089 #undef step 1090 #undef print 1091 #undef at 1092 #undef match 1093