1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)engine.c 8.5 (Berkeley) 3/20/94 38 */ 39 40 #include <sys/cdefs.h> 41 __FBSDID("$FreeBSD$"); 42 43 /* 44 * The matching engine and friends. This file is #included by regexec.c 45 * after suitable #defines of a variety of macros used herein, so that 46 * different state representations can be used without duplicating masses 47 * of code. 48 */ 49 50 #ifdef SNAMES 51 #define matcher smatcher 52 #define fast sfast 53 #define slow sslow 54 #define dissect sdissect 55 #define backref sbackref 56 #define step sstep 57 #define print sprint 58 #define at sat 59 #define match smat 60 #endif 61 #ifdef LNAMES 62 #define matcher lmatcher 63 #define fast lfast 64 #define slow lslow 65 #define dissect ldissect 66 #define backref lbackref 67 #define step lstep 68 #define print lprint 69 #define at lat 70 #define match lmat 71 #endif 72 #ifdef MNAMES 73 #define matcher mmatcher 74 #define fast mfast 75 #define slow mslow 76 #define dissect mdissect 77 #define backref mbackref 78 #define step mstep 79 #define print mprint 80 #define at mat 81 #define match mmat 82 #endif 83 84 /* another structure passed up and down to avoid zillions of parameters */ 85 struct match { 86 struct re_guts *g; 87 int eflags; 88 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 89 char *offp; /* offsets work from here */ 90 char *beginp; /* start of string -- virtual NUL precedes */ 91 char *endp; /* end of string -- virtual NUL here */ 92 char *coldp; /* can be no match starting before here */ 93 char **lastpos; /* [nplus+1] */ 94 STATEVARS; 95 states st; /* current states */ 96 states fresh; /* states for a fresh start */ 97 states tmp; /* temporary */ 98 states empty; /* empty set of states */ 99 mbstate_t mbs; /* multibyte conversion state */ 100 }; 101 102 /* ========= begin header generated by ./mkh ========= */ 103 #ifdef __cplusplus 104 extern "C" { 105 #endif 106 107 /* === engine.c === */ 108 static int matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 109 static char *dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 110 static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev); 111 static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 112 static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 113 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); 114 #define BOL (OUT-1) 115 #define EOL (BOL-1) 116 #define BOLEOL (BOL-2) 117 #define NOTHING (BOL-3) 118 #define BOW (BOL-4) 119 #define EOW (BOL-5) 120 #define BADCHAR (BOL-6) 121 #define NONCHAR(c) ((c) <= OUT) 122 #ifdef REDEBUG 123 static void print(struct match *m, char *caption, states st, int ch, FILE *d); 124 #endif 125 #ifdef REDEBUG 126 static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst); 127 #endif 128 #ifdef REDEBUG 129 static char *pchar(int ch); 130 #endif 131 132 #ifdef __cplusplus 133 } 134 #endif 135 /* ========= end header generated by ./mkh ========= */ 136 137 #ifdef REDEBUG 138 #define SP(t, s, c) print(m, t, s, c, stdout) 139 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 140 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 141 #else 142 #define SP(t, s, c) /* nothing */ 143 #define AT(t, p1, p2, s1, s2) /* nothing */ 144 #define NOTE(s) /* nothing */ 145 #endif 146 147 /* 148 - matcher - the actual matching engine 149 == static int matcher(struct re_guts *g, char *string, \ 150 == size_t nmatch, regmatch_t pmatch[], int eflags); 151 */ 152 static int /* 0 success, REG_NOMATCH failure */ 153 matcher(g, string, nmatch, pmatch, eflags) 154 struct re_guts *g; 155 char *string; 156 size_t nmatch; 157 regmatch_t pmatch[]; 158 int eflags; 159 { 160 char *endp; 161 int i; 162 struct match mv; 163 struct match *m = &mv; 164 char *dp; 165 const sopno gf = g->firststate+1; /* +1 for OEND */ 166 const sopno gl = g->laststate; 167 char *start; 168 char *stop; 169 /* Boyer-Moore algorithms variables */ 170 char *pp; 171 int cj, mj; 172 char *mustfirst; 173 char *mustlast; 174 int *matchjump; 175 int *charjump; 176 177 /* simplify the situation where possible */ 178 if (g->cflags®_NOSUB) 179 nmatch = 0; 180 if (eflags®_STARTEND) { 181 start = string + pmatch[0].rm_so; 182 stop = string + pmatch[0].rm_eo; 183 } else { 184 start = string; 185 stop = start + strlen(start); 186 } 187 if (stop < start) 188 return(REG_INVARG); 189 190 /* prescreening; this does wonders for this rather slow code */ 191 if (g->must != NULL) { 192 if (g->charjump != NULL && g->matchjump != NULL) { 193 mustfirst = g->must; 194 mustlast = g->must + g->mlen - 1; 195 charjump = g->charjump; 196 matchjump = g->matchjump; 197 pp = mustlast; 198 for (dp = start+g->mlen-1; dp < stop;) { 199 /* Fast skip non-matches */ 200 while (dp < stop && charjump[(int)*dp]) 201 dp += charjump[(int)*dp]; 202 203 if (dp >= stop) 204 break; 205 206 /* Greedy matcher */ 207 /* We depend on not being used for 208 * for strings of length 1 209 */ 210 while (*--dp == *--pp && pp != mustfirst); 211 212 if (*dp == *pp) 213 break; 214 215 /* Jump to next possible match */ 216 mj = matchjump[pp - mustfirst]; 217 cj = charjump[(int)*dp]; 218 dp += (cj < mj ? mj : cj); 219 pp = mustlast; 220 } 221 if (pp != mustfirst) 222 return(REG_NOMATCH); 223 } else { 224 for (dp = start; dp < stop; dp++) 225 if (*dp == g->must[0] && 226 stop - dp >= g->mlen && 227 memcmp(dp, g->must, (size_t)g->mlen) == 0) 228 break; 229 if (dp == stop) /* we didn't find g->must */ 230 return(REG_NOMATCH); 231 } 232 } 233 234 /* match struct setup */ 235 m->g = g; 236 m->eflags = eflags; 237 m->pmatch = NULL; 238 m->lastpos = NULL; 239 m->offp = string; 240 m->beginp = start; 241 m->endp = stop; 242 STATESETUP(m, 4); 243 SETUP(m->st); 244 SETUP(m->fresh); 245 SETUP(m->tmp); 246 SETUP(m->empty); 247 CLEAR(m->empty); 248 ZAPSTATE(&m->mbs); 249 250 /* Adjust start according to moffset, to speed things up */ 251 if (g->moffset > -1) 252 start = ((dp - g->moffset) < start) ? start : dp - g->moffset; 253 254 /* this loop does only one repetition except for backrefs */ 255 for (;;) { 256 endp = fast(m, start, stop, gf, gl); 257 if (endp == NULL) { /* a miss */ 258 if (m->pmatch != NULL) 259 free((char *)m->pmatch); 260 if (m->lastpos != NULL) 261 free((char *)m->lastpos); 262 STATETEARDOWN(m); 263 return(REG_NOMATCH); 264 } 265 if (nmatch == 0 && !g->backrefs) 266 break; /* no further info needed */ 267 268 /* where? */ 269 assert(m->coldp != NULL); 270 for (;;) { 271 NOTE("finding start"); 272 endp = slow(m, m->coldp, stop, gf, gl); 273 if (endp != NULL) 274 break; 275 assert(m->coldp < m->endp); 276 m->coldp += XMBRTOWC(NULL, m->coldp, 277 m->endp - m->coldp, &m->mbs, 0); 278 } 279 if (nmatch == 1 && !g->backrefs) 280 break; /* no further info needed */ 281 282 /* oh my, he wants the subexpressions... */ 283 if (m->pmatch == NULL) 284 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 285 sizeof(regmatch_t)); 286 if (m->pmatch == NULL) { 287 STATETEARDOWN(m); 288 return(REG_ESPACE); 289 } 290 for (i = 1; i <= m->g->nsub; i++) 291 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 292 if (!g->backrefs && !(m->eflags®_BACKR)) { 293 NOTE("dissecting"); 294 dp = dissect(m, m->coldp, endp, gf, gl); 295 } else { 296 if (g->nplus > 0 && m->lastpos == NULL) 297 m->lastpos = (char **)malloc((g->nplus+1) * 298 sizeof(char *)); 299 if (g->nplus > 0 && m->lastpos == NULL) { 300 free(m->pmatch); 301 STATETEARDOWN(m); 302 return(REG_ESPACE); 303 } 304 NOTE("backref dissect"); 305 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 306 } 307 if (dp != NULL) 308 break; 309 310 /* uh-oh... we couldn't find a subexpression-level match */ 311 assert(g->backrefs); /* must be back references doing it */ 312 assert(g->nplus == 0 || m->lastpos != NULL); 313 for (;;) { 314 if (dp != NULL || endp <= m->coldp) 315 break; /* defeat */ 316 NOTE("backoff"); 317 endp = slow(m, m->coldp, endp-1, gf, gl); 318 if (endp == NULL) 319 break; /* defeat */ 320 /* try it on a shorter possibility */ 321 #ifndef NDEBUG 322 for (i = 1; i <= m->g->nsub; i++) { 323 assert(m->pmatch[i].rm_so == -1); 324 assert(m->pmatch[i].rm_eo == -1); 325 } 326 #endif 327 NOTE("backoff dissect"); 328 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 329 } 330 assert(dp == NULL || dp == endp); 331 if (dp != NULL) /* found a shorter one */ 332 break; 333 334 /* despite initial appearances, there is no match here */ 335 NOTE("false alarm"); 336 /* recycle starting later */ 337 start = m->coldp + XMBRTOWC(NULL, m->coldp, 338 stop - m->coldp, &m->mbs, 0); 339 assert(start <= stop); 340 } 341 342 /* fill in the details if requested */ 343 if (nmatch > 0) { 344 pmatch[0].rm_so = m->coldp - m->offp; 345 pmatch[0].rm_eo = endp - m->offp; 346 } 347 if (nmatch > 1) { 348 assert(m->pmatch != NULL); 349 for (i = 1; i < nmatch; i++) 350 if (i <= m->g->nsub) 351 pmatch[i] = m->pmatch[i]; 352 else { 353 pmatch[i].rm_so = -1; 354 pmatch[i].rm_eo = -1; 355 } 356 } 357 358 if (m->pmatch != NULL) 359 free((char *)m->pmatch); 360 if (m->lastpos != NULL) 361 free((char *)m->lastpos); 362 STATETEARDOWN(m); 363 return(0); 364 } 365 366 /* 367 - dissect - figure out what matched what, no back references 368 == static char *dissect(struct match *m, char *start, \ 369 == char *stop, sopno startst, sopno stopst); 370 */ 371 static char * /* == stop (success) always */ 372 dissect(m, start, stop, startst, stopst) 373 struct match *m; 374 char *start; 375 char *stop; 376 sopno startst; 377 sopno stopst; 378 { 379 int i; 380 sopno ss; /* start sop of current subRE */ 381 sopno es; /* end sop of current subRE */ 382 char *sp; /* start of string matched by it */ 383 char *stp; /* string matched by it cannot pass here */ 384 char *rest; /* start of rest of string */ 385 char *tail; /* string unmatched by rest of RE */ 386 sopno ssub; /* start sop of subsubRE */ 387 sopno esub; /* end sop of subsubRE */ 388 char *ssp; /* start of string matched by subsubRE */ 389 char *sep; /* end of string matched by subsubRE */ 390 char *oldssp; /* previous ssp */ 391 char *dp; 392 393 AT("diss", start, stop, startst, stopst); 394 sp = start; 395 for (ss = startst; ss < stopst; ss = es) { 396 /* identify end of subRE */ 397 es = ss; 398 switch (OP(m->g->strip[es])) { 399 case OPLUS_: 400 case OQUEST_: 401 es += OPND(m->g->strip[es]); 402 break; 403 case OCH_: 404 while (OP(m->g->strip[es]) != O_CH) 405 es += OPND(m->g->strip[es]); 406 break; 407 } 408 es++; 409 410 /* figure out what it matched */ 411 switch (OP(m->g->strip[ss])) { 412 case OEND: 413 assert(nope); 414 break; 415 case OCHAR: 416 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 417 break; 418 case OBOL: 419 case OEOL: 420 case OBOW: 421 case OEOW: 422 break; 423 case OANY: 424 case OANYOF: 425 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 426 break; 427 case OBACK_: 428 case O_BACK: 429 assert(nope); 430 break; 431 /* cases where length of match is hard to find */ 432 case OQUEST_: 433 stp = stop; 434 for (;;) { 435 /* how long could this one be? */ 436 rest = slow(m, sp, stp, ss, es); 437 assert(rest != NULL); /* it did match */ 438 /* could the rest match the rest? */ 439 tail = slow(m, rest, stop, es, stopst); 440 if (tail == stop) 441 break; /* yes! */ 442 /* no -- try a shorter match for this one */ 443 stp = rest - 1; 444 assert(stp >= sp); /* it did work */ 445 } 446 ssub = ss + 1; 447 esub = es - 1; 448 /* did innards match? */ 449 if (slow(m, sp, rest, ssub, esub) != NULL) { 450 dp = dissect(m, sp, rest, ssub, esub); 451 assert(dp == rest); 452 } else /* no */ 453 assert(sp == rest); 454 sp = rest; 455 break; 456 case OPLUS_: 457 stp = stop; 458 for (;;) { 459 /* how long could this one be? */ 460 rest = slow(m, sp, stp, ss, es); 461 assert(rest != NULL); /* it did match */ 462 /* could the rest match the rest? */ 463 tail = slow(m, rest, stop, es, stopst); 464 if (tail == stop) 465 break; /* yes! */ 466 /* no -- try a shorter match for this one */ 467 stp = rest - 1; 468 assert(stp >= sp); /* it did work */ 469 } 470 ssub = ss + 1; 471 esub = es - 1; 472 ssp = sp; 473 oldssp = ssp; 474 for (;;) { /* find last match of innards */ 475 sep = slow(m, ssp, rest, ssub, esub); 476 if (sep == NULL || sep == ssp) 477 break; /* failed or matched null */ 478 oldssp = ssp; /* on to next try */ 479 ssp = sep; 480 } 481 if (sep == NULL) { 482 /* last successful match */ 483 sep = ssp; 484 ssp = oldssp; 485 } 486 assert(sep == rest); /* must exhaust substring */ 487 assert(slow(m, ssp, sep, ssub, esub) == rest); 488 dp = dissect(m, ssp, sep, ssub, esub); 489 assert(dp == sep); 490 sp = rest; 491 break; 492 case OCH_: 493 stp = stop; 494 for (;;) { 495 /* how long could this one be? */ 496 rest = slow(m, sp, stp, ss, es); 497 assert(rest != NULL); /* it did match */ 498 /* could the rest match the rest? */ 499 tail = slow(m, rest, stop, es, stopst); 500 if (tail == stop) 501 break; /* yes! */ 502 /* no -- try a shorter match for this one */ 503 stp = rest - 1; 504 assert(stp >= sp); /* it did work */ 505 } 506 ssub = ss + 1; 507 esub = ss + OPND(m->g->strip[ss]) - 1; 508 assert(OP(m->g->strip[esub]) == OOR1); 509 for (;;) { /* find first matching branch */ 510 if (slow(m, sp, rest, ssub, esub) == rest) 511 break; /* it matched all of it */ 512 /* that one missed, try next one */ 513 assert(OP(m->g->strip[esub]) == OOR1); 514 esub++; 515 assert(OP(m->g->strip[esub]) == OOR2); 516 ssub = esub + 1; 517 esub += OPND(m->g->strip[esub]); 518 if (OP(m->g->strip[esub]) == OOR2) 519 esub--; 520 else 521 assert(OP(m->g->strip[esub]) == O_CH); 522 } 523 dp = dissect(m, sp, rest, ssub, esub); 524 assert(dp == rest); 525 sp = rest; 526 break; 527 case O_PLUS: 528 case O_QUEST: 529 case OOR1: 530 case OOR2: 531 case O_CH: 532 assert(nope); 533 break; 534 case OLPAREN: 535 i = OPND(m->g->strip[ss]); 536 assert(0 < i && i <= m->g->nsub); 537 m->pmatch[i].rm_so = sp - m->offp; 538 break; 539 case ORPAREN: 540 i = OPND(m->g->strip[ss]); 541 assert(0 < i && i <= m->g->nsub); 542 m->pmatch[i].rm_eo = sp - m->offp; 543 break; 544 default: /* uh oh */ 545 assert(nope); 546 break; 547 } 548 } 549 550 assert(sp == stop); 551 return(sp); 552 } 553 554 /* 555 - backref - figure out what matched what, figuring in back references 556 == static char *backref(struct match *m, char *start, \ 557 == char *stop, sopno startst, sopno stopst, sopno lev); 558 */ 559 static char * /* == stop (success) or NULL (failure) */ 560 backref(m, start, stop, startst, stopst, lev) 561 struct match *m; 562 char *start; 563 char *stop; 564 sopno startst; 565 sopno stopst; 566 sopno lev; /* PLUS nesting level */ 567 { 568 int i; 569 sopno ss; /* start sop of current subRE */ 570 char *sp; /* start of string matched by it */ 571 sopno ssub; /* start sop of subsubRE */ 572 sopno esub; /* end sop of subsubRE */ 573 char *ssp; /* start of string matched by subsubRE */ 574 char *dp; 575 size_t len; 576 int hard; 577 sop s; 578 regoff_t offsave; 579 cset *cs; 580 wint_t wc; 581 582 AT("back", start, stop, startst, stopst); 583 sp = start; 584 585 /* get as far as we can with easy stuff */ 586 hard = 0; 587 for (ss = startst; !hard && ss < stopst; ss++) 588 switch (OP(s = m->g->strip[ss])) { 589 case OCHAR: 590 if (sp == stop) 591 return(NULL); 592 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 593 if (wc != OPND(s)) 594 return(NULL); 595 break; 596 case OANY: 597 if (sp == stop) 598 return(NULL); 599 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 600 if (wc == BADCHAR) 601 return (NULL); 602 break; 603 case OANYOF: 604 if (sp == stop) 605 return (NULL); 606 cs = &m->g->sets[OPND(s)]; 607 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 608 if (wc == BADCHAR || !CHIN(cs, wc)) 609 return(NULL); 610 break; 611 case OBOL: 612 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 613 (sp < m->endp && *(sp-1) == '\n' && 614 (m->g->cflags®_NEWLINE)) ) 615 { /* yes */ } 616 else 617 return(NULL); 618 break; 619 case OEOL: 620 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 621 (sp < m->endp && *sp == '\n' && 622 (m->g->cflags®_NEWLINE)) ) 623 { /* yes */ } 624 else 625 return(NULL); 626 break; 627 case OBOW: 628 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 629 (sp < m->endp && *(sp-1) == '\n' && 630 (m->g->cflags®_NEWLINE)) || 631 (sp > m->beginp && 632 !ISWORD(*(sp-1))) ) && 633 (sp < m->endp && ISWORD(*sp)) ) 634 { /* yes */ } 635 else 636 return(NULL); 637 break; 638 case OEOW: 639 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 640 (sp < m->endp && *sp == '\n' && 641 (m->g->cflags®_NEWLINE)) || 642 (sp < m->endp && !ISWORD(*sp)) ) && 643 (sp > m->beginp && ISWORD(*(sp-1))) ) 644 { /* yes */ } 645 else 646 return(NULL); 647 break; 648 case O_QUEST: 649 break; 650 case OOR1: /* matches null but needs to skip */ 651 ss++; 652 s = m->g->strip[ss]; 653 do { 654 assert(OP(s) == OOR2); 655 ss += OPND(s); 656 } while (OP(s = m->g->strip[ss]) != O_CH); 657 /* note that the ss++ gets us past the O_CH */ 658 break; 659 default: /* have to make a choice */ 660 hard = 1; 661 break; 662 } 663 if (!hard) { /* that was it! */ 664 if (sp != stop) 665 return(NULL); 666 return(sp); 667 } 668 ss--; /* adjust for the for's final increment */ 669 670 /* the hard stuff */ 671 AT("hard", sp, stop, ss, stopst); 672 s = m->g->strip[ss]; 673 switch (OP(s)) { 674 case OBACK_: /* the vilest depths */ 675 i = OPND(s); 676 assert(0 < i && i <= m->g->nsub); 677 if (m->pmatch[i].rm_eo == -1) 678 return(NULL); 679 assert(m->pmatch[i].rm_so != -1); 680 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 681 assert(stop - m->beginp >= len); 682 if (sp > stop - len) 683 return(NULL); /* not enough left to match */ 684 ssp = m->offp + m->pmatch[i].rm_so; 685 if (memcmp(sp, ssp, len) != 0) 686 return(NULL); 687 while (m->g->strip[ss] != SOP(O_BACK, i)) 688 ss++; 689 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 690 break; 691 case OQUEST_: /* to null or not */ 692 dp = backref(m, sp, stop, ss+1, stopst, lev); 693 if (dp != NULL) 694 return(dp); /* not */ 695 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); 696 break; 697 case OPLUS_: 698 assert(m->lastpos != NULL); 699 assert(lev+1 <= m->g->nplus); 700 m->lastpos[lev+1] = sp; 701 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 702 break; 703 case O_PLUS: 704 if (sp == m->lastpos[lev]) /* last pass matched null */ 705 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 706 /* try another pass */ 707 m->lastpos[lev] = sp; 708 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); 709 if (dp == NULL) 710 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 711 else 712 return(dp); 713 break; 714 case OCH_: /* find the right one, if any */ 715 ssub = ss + 1; 716 esub = ss + OPND(s) - 1; 717 assert(OP(m->g->strip[esub]) == OOR1); 718 for (;;) { /* find first matching branch */ 719 dp = backref(m, sp, stop, ssub, esub, lev); 720 if (dp != NULL) 721 return(dp); 722 /* that one missed, try next one */ 723 if (OP(m->g->strip[esub]) == O_CH) 724 return(NULL); /* there is none */ 725 esub++; 726 assert(OP(m->g->strip[esub]) == OOR2); 727 ssub = esub + 1; 728 esub += OPND(m->g->strip[esub]); 729 if (OP(m->g->strip[esub]) == OOR2) 730 esub--; 731 else 732 assert(OP(m->g->strip[esub]) == O_CH); 733 } 734 break; 735 case OLPAREN: /* must undo assignment if rest fails */ 736 i = OPND(s); 737 assert(0 < i && i <= m->g->nsub); 738 offsave = m->pmatch[i].rm_so; 739 m->pmatch[i].rm_so = sp - m->offp; 740 dp = backref(m, sp, stop, ss+1, stopst, lev); 741 if (dp != NULL) 742 return(dp); 743 m->pmatch[i].rm_so = offsave; 744 return(NULL); 745 break; 746 case ORPAREN: /* must undo assignment if rest fails */ 747 i = OPND(s); 748 assert(0 < i && i <= m->g->nsub); 749 offsave = m->pmatch[i].rm_eo; 750 m->pmatch[i].rm_eo = sp - m->offp; 751 dp = backref(m, sp, stop, ss+1, stopst, lev); 752 if (dp != NULL) 753 return(dp); 754 m->pmatch[i].rm_eo = offsave; 755 return(NULL); 756 break; 757 default: /* uh oh */ 758 assert(nope); 759 break; 760 } 761 762 /* "can't happen" */ 763 assert(nope); 764 /* NOTREACHED */ 765 return "shut up gcc"; 766 } 767 768 /* 769 - fast - step through the string at top speed 770 == static char *fast(struct match *m, char *start, \ 771 == char *stop, sopno startst, sopno stopst); 772 */ 773 static char * /* where tentative match ended, or NULL */ 774 fast(m, start, stop, startst, stopst) 775 struct match *m; 776 char *start; 777 char *stop; 778 sopno startst; 779 sopno stopst; 780 { 781 states st = m->st; 782 states fresh = m->fresh; 783 states tmp = m->tmp; 784 char *p = start; 785 wint_t c; 786 wint_t lastc; /* previous c */ 787 wint_t flagch; 788 int i; 789 char *coldp; /* last p after which no match was underway */ 790 size_t clen; 791 792 CLEAR(st); 793 SET1(st, startst); 794 st = step(m->g, startst, stopst, st, NOTHING, st); 795 ASSIGN(fresh, st); 796 SP("start", st, *p); 797 coldp = NULL; 798 if (start == m->beginp) 799 c = OUT; 800 else { 801 /* 802 * XXX Wrong if the previous character was multi-byte. 803 * Newline never is (in encodings supported by FreeBSD), 804 * so this only breaks the ISWORD tests below. 805 */ 806 c = (uch)*(start - 1); 807 } 808 for (;;) { 809 /* next character */ 810 lastc = c; 811 if (p == m->endp) 812 c = OUT; 813 else 814 clen = XMBRTOWC(&c, p, stop - p, &m->mbs, BADCHAR); 815 if (EQ(st, fresh)) 816 coldp = p; 817 818 /* is there an EOL and/or BOL between lastc and c? */ 819 flagch = '\0'; 820 i = 0; 821 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 822 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 823 flagch = BOL; 824 i = m->g->nbol; 825 } 826 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 827 (c == OUT && !(m->eflags®_NOTEOL)) ) { 828 flagch = (flagch == BOL) ? BOLEOL : EOL; 829 i += m->g->neol; 830 } 831 if (i != 0) { 832 for (; i > 0; i--) 833 st = step(m->g, startst, stopst, st, flagch, st); 834 SP("boleol", st, c); 835 } 836 837 /* how about a word boundary? */ 838 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 839 (c != OUT && ISWORD(c)) ) { 840 flagch = BOW; 841 } 842 if ( (lastc != OUT && ISWORD(lastc)) && 843 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 844 flagch = EOW; 845 } 846 if (flagch == BOW || flagch == EOW) { 847 st = step(m->g, startst, stopst, st, flagch, st); 848 SP("boweow", st, c); 849 } 850 851 /* are we done? */ 852 if (ISSET(st, stopst) || p == stop) 853 break; /* NOTE BREAK OUT */ 854 855 /* no, we must deal with this character */ 856 ASSIGN(tmp, st); 857 ASSIGN(st, fresh); 858 assert(c != OUT); 859 st = step(m->g, startst, stopst, tmp, c, st); 860 SP("aft", st, c); 861 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 862 p += clen; 863 } 864 865 assert(coldp != NULL); 866 m->coldp = coldp; 867 if (ISSET(st, stopst)) 868 return(p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 869 else 870 return(NULL); 871 } 872 873 /* 874 - slow - step through the string more deliberately 875 == static char *slow(struct match *m, char *start, \ 876 == char *stop, sopno startst, sopno stopst); 877 */ 878 static char * /* where it ended */ 879 slow(m, start, stop, startst, stopst) 880 struct match *m; 881 char *start; 882 char *stop; 883 sopno startst; 884 sopno stopst; 885 { 886 states st = m->st; 887 states empty = m->empty; 888 states tmp = m->tmp; 889 char *p = start; 890 wint_t c; 891 wint_t lastc; /* previous c */ 892 wint_t flagch; 893 int i; 894 char *matchp; /* last p at which a match ended */ 895 size_t clen; 896 897 AT("slow", start, stop, startst, stopst); 898 CLEAR(st); 899 SET1(st, startst); 900 SP("sstart", st, *p); 901 st = step(m->g, startst, stopst, st, NOTHING, st); 902 matchp = NULL; 903 if (start == m->beginp) 904 c = OUT; 905 else { 906 /* 907 * XXX Wrong if the previous character was multi-byte. 908 * Newline never is (in encodings supported by FreeBSD), 909 * so this only breaks the ISWORD tests below. 910 */ 911 c = (uch)*(start - 1); 912 } 913 for (;;) { 914 /* next character */ 915 lastc = c; 916 if (p == m->endp) { 917 c = OUT; 918 clen = 0; 919 } else 920 clen = XMBRTOWC(&c, p, stop - p, &m->mbs, BADCHAR); 921 922 /* is there an EOL and/or BOL between lastc and c? */ 923 flagch = '\0'; 924 i = 0; 925 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 926 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 927 flagch = BOL; 928 i = m->g->nbol; 929 } 930 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 931 (c == OUT && !(m->eflags®_NOTEOL)) ) { 932 flagch = (flagch == BOL) ? BOLEOL : EOL; 933 i += m->g->neol; 934 } 935 if (i != 0) { 936 for (; i > 0; i--) 937 st = step(m->g, startst, stopst, st, flagch, st); 938 SP("sboleol", st, c); 939 } 940 941 /* how about a word boundary? */ 942 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 943 (c != OUT && ISWORD(c)) ) { 944 flagch = BOW; 945 } 946 if ( (lastc != OUT && ISWORD(lastc)) && 947 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 948 flagch = EOW; 949 } 950 if (flagch == BOW || flagch == EOW) { 951 st = step(m->g, startst, stopst, st, flagch, st); 952 SP("sboweow", st, c); 953 } 954 955 /* are we done? */ 956 if (ISSET(st, stopst)) 957 matchp = p; 958 if (EQ(st, empty) || p == stop) 959 break; /* NOTE BREAK OUT */ 960 961 /* no, we must deal with this character */ 962 ASSIGN(tmp, st); 963 ASSIGN(st, empty); 964 assert(c != OUT); 965 st = step(m->g, startst, stopst, tmp, c, st); 966 SP("saft", st, c); 967 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 968 p += clen; 969 } 970 971 return(matchp); 972 } 973 974 975 /* 976 - step - map set of states reachable before char to set reachable after 977 == static states step(struct re_guts *g, sopno start, sopno stop, \ 978 == states bef, int ch, states aft); 979 == #define BOL (OUT-1) 980 == #define EOL (BOL-1) 981 == #define BOLEOL (BOL-2) 982 == #define NOTHING (BOL-3) 983 == #define BOW (BOL-4) 984 == #define EOW (BOL-5) 985 == #define BADCHAR (BOL-6) 986 == #define NONCHAR(c) ((c) <= OUT) 987 */ 988 static states 989 step(g, start, stop, bef, ch, aft) 990 struct re_guts *g; 991 sopno start; /* start state within strip */ 992 sopno stop; /* state after stop state within strip */ 993 states bef; /* states reachable before */ 994 wint_t ch; /* character or NONCHAR code */ 995 states aft; /* states already known reachable after */ 996 { 997 cset *cs; 998 sop s; 999 sopno pc; 1000 onestate here; /* note, macros know this name */ 1001 sopno look; 1002 int i; 1003 1004 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 1005 s = g->strip[pc]; 1006 switch (OP(s)) { 1007 case OEND: 1008 assert(pc == stop-1); 1009 break; 1010 case OCHAR: 1011 /* only characters can match */ 1012 assert(!NONCHAR(ch) || ch != OPND(s)); 1013 if (ch == OPND(s)) 1014 FWD(aft, bef, 1); 1015 break; 1016 case OBOL: 1017 if (ch == BOL || ch == BOLEOL) 1018 FWD(aft, bef, 1); 1019 break; 1020 case OEOL: 1021 if (ch == EOL || ch == BOLEOL) 1022 FWD(aft, bef, 1); 1023 break; 1024 case OBOW: 1025 if (ch == BOW) 1026 FWD(aft, bef, 1); 1027 break; 1028 case OEOW: 1029 if (ch == EOW) 1030 FWD(aft, bef, 1); 1031 break; 1032 case OANY: 1033 if (!NONCHAR(ch)) 1034 FWD(aft, bef, 1); 1035 break; 1036 case OANYOF: 1037 cs = &g->sets[OPND(s)]; 1038 if (!NONCHAR(ch) && CHIN(cs, ch)) 1039 FWD(aft, bef, 1); 1040 break; 1041 case OBACK_: /* ignored here */ 1042 case O_BACK: 1043 FWD(aft, aft, 1); 1044 break; 1045 case OPLUS_: /* forward, this is just an empty */ 1046 FWD(aft, aft, 1); 1047 break; 1048 case O_PLUS: /* both forward and back */ 1049 FWD(aft, aft, 1); 1050 i = ISSETBACK(aft, OPND(s)); 1051 BACK(aft, aft, OPND(s)); 1052 if (!i && ISSETBACK(aft, OPND(s))) { 1053 /* oho, must reconsider loop body */ 1054 pc -= OPND(s) + 1; 1055 INIT(here, pc); 1056 } 1057 break; 1058 case OQUEST_: /* two branches, both forward */ 1059 FWD(aft, aft, 1); 1060 FWD(aft, aft, OPND(s)); 1061 break; 1062 case O_QUEST: /* just an empty */ 1063 FWD(aft, aft, 1); 1064 break; 1065 case OLPAREN: /* not significant here */ 1066 case ORPAREN: 1067 FWD(aft, aft, 1); 1068 break; 1069 case OCH_: /* mark the first two branches */ 1070 FWD(aft, aft, 1); 1071 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1072 FWD(aft, aft, OPND(s)); 1073 break; 1074 case OOR1: /* done a branch, find the O_CH */ 1075 if (ISSTATEIN(aft, here)) { 1076 for (look = 1; 1077 OP(s = g->strip[pc+look]) != O_CH; 1078 look += OPND(s)) 1079 assert(OP(s) == OOR2); 1080 FWD(aft, aft, look); 1081 } 1082 break; 1083 case OOR2: /* propagate OCH_'s marking */ 1084 FWD(aft, aft, 1); 1085 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1086 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1087 FWD(aft, aft, OPND(s)); 1088 } 1089 break; 1090 case O_CH: /* just empty */ 1091 FWD(aft, aft, 1); 1092 break; 1093 default: /* ooooops... */ 1094 assert(nope); 1095 break; 1096 } 1097 } 1098 1099 return(aft); 1100 } 1101 1102 #ifdef REDEBUG 1103 /* 1104 - print - print a set of states 1105 == #ifdef REDEBUG 1106 == static void print(struct match *m, char *caption, states st, \ 1107 == int ch, FILE *d); 1108 == #endif 1109 */ 1110 static void 1111 print(m, caption, st, ch, d) 1112 struct match *m; 1113 char *caption; 1114 states st; 1115 int ch; 1116 FILE *d; 1117 { 1118 struct re_guts *g = m->g; 1119 int i; 1120 int first = 1; 1121 1122 if (!(m->eflags®_TRACE)) 1123 return; 1124 1125 fprintf(d, "%s", caption); 1126 if (ch != '\0') 1127 fprintf(d, " %s", pchar(ch)); 1128 for (i = 0; i < g->nstates; i++) 1129 if (ISSET(st, i)) { 1130 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1131 first = 0; 1132 } 1133 fprintf(d, "\n"); 1134 } 1135 1136 /* 1137 - at - print current situation 1138 == #ifdef REDEBUG 1139 == static void at(struct match *m, char *title, char *start, char *stop, \ 1140 == sopno startst, sopno stopst); 1141 == #endif 1142 */ 1143 static void 1144 at(m, title, start, stop, startst, stopst) 1145 struct match *m; 1146 char *title; 1147 char *start; 1148 char *stop; 1149 sopno startst; 1150 sopno stopst; 1151 { 1152 if (!(m->eflags®_TRACE)) 1153 return; 1154 1155 printf("%s %s-", title, pchar(*start)); 1156 printf("%s ", pchar(*stop)); 1157 printf("%ld-%ld\n", (long)startst, (long)stopst); 1158 } 1159 1160 #ifndef PCHARDONE 1161 #define PCHARDONE /* never again */ 1162 /* 1163 - pchar - make a character printable 1164 == #ifdef REDEBUG 1165 == static char *pchar(int ch); 1166 == #endif 1167 * 1168 * Is this identical to regchar() over in debug.c? Well, yes. But a 1169 * duplicate here avoids having a debugging-capable regexec.o tied to 1170 * a matching debug.o, and this is convenient. It all disappears in 1171 * the non-debug compilation anyway, so it doesn't matter much. 1172 */ 1173 static char * /* -> representation */ 1174 pchar(ch) 1175 int ch; 1176 { 1177 static char pbuf[10]; 1178 1179 if (isprint((uch)ch) || ch == ' ') 1180 sprintf(pbuf, "%c", ch); 1181 else 1182 sprintf(pbuf, "\\%o", ch); 1183 return(pbuf); 1184 } 1185 #endif 1186 #endif 1187 1188 #undef matcher 1189 #undef fast 1190 #undef slow 1191 #undef dissect 1192 #undef backref 1193 #undef step 1194 #undef print 1195 #undef at 1196 #undef match 1197