1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)engine.c 8.5 (Berkeley) 3/20/94 34 */ 35 36 #include <sys/cdefs.h> 37 __FBSDID("$FreeBSD$"); 38 39 /* 40 * The matching engine and friends. This file is #included by regexec.c 41 * after suitable #defines of a variety of macros used herein, so that 42 * different state representations can be used without duplicating masses 43 * of code. 44 */ 45 46 #ifdef SNAMES 47 #define matcher smatcher 48 #define fast sfast 49 #define slow sslow 50 #define dissect sdissect 51 #define backref sbackref 52 #define step sstep 53 #define print sprint 54 #define at sat 55 #define match smat 56 #endif 57 #ifdef LNAMES 58 #define matcher lmatcher 59 #define fast lfast 60 #define slow lslow 61 #define dissect ldissect 62 #define backref lbackref 63 #define step lstep 64 #define print lprint 65 #define at lat 66 #define match lmat 67 #endif 68 #ifdef MNAMES 69 #define matcher mmatcher 70 #define fast mfast 71 #define slow mslow 72 #define dissect mdissect 73 #define backref mbackref 74 #define step mstep 75 #define print mprint 76 #define at mat 77 #define match mmat 78 #endif 79 80 /* another structure passed up and down to avoid zillions of parameters */ 81 struct match { 82 struct re_guts *g; 83 int eflags; 84 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 85 char *offp; /* offsets work from here */ 86 char *beginp; /* start of string -- virtual NUL precedes */ 87 char *endp; /* end of string -- virtual NUL here */ 88 char *coldp; /* can be no match starting before here */ 89 char **lastpos; /* [nplus+1] */ 90 STATEVARS; 91 states st; /* current states */ 92 states fresh; /* states for a fresh start */ 93 states tmp; /* temporary */ 94 states empty; /* empty set of states */ 95 mbstate_t mbs; /* multibyte conversion state */ 96 }; 97 98 /* ========= begin header generated by ./mkh ========= */ 99 #ifdef __cplusplus 100 extern "C" { 101 #endif 102 103 /* === engine.c === */ 104 static int matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 105 static char *dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 106 static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev, int); 107 static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 108 static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst); 109 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); 110 #define MAX_RECURSION 100 111 #define BOL (OUT-1) 112 #define EOL (BOL-1) 113 #define BOLEOL (BOL-2) 114 #define NOTHING (BOL-3) 115 #define BOW (BOL-4) 116 #define EOW (BOL-5) 117 #define BADCHAR (BOL-6) 118 #define NONCHAR(c) ((c) <= OUT) 119 #ifdef REDEBUG 120 static void print(struct match *m, char *caption, states st, int ch, FILE *d); 121 #endif 122 #ifdef REDEBUG 123 static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst); 124 #endif 125 #ifdef REDEBUG 126 static char *pchar(int ch); 127 #endif 128 129 #ifdef __cplusplus 130 } 131 #endif 132 /* ========= end header generated by ./mkh ========= */ 133 134 #ifdef REDEBUG 135 #define SP(t, s, c) print(m, t, s, c, stdout) 136 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 137 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 138 #else 139 #define SP(t, s, c) /* nothing */ 140 #define AT(t, p1, p2, s1, s2) /* nothing */ 141 #define NOTE(s) /* nothing */ 142 #endif 143 144 /* 145 - matcher - the actual matching engine 146 == static int matcher(struct re_guts *g, char *string, \ 147 == size_t nmatch, regmatch_t pmatch[], int eflags); 148 */ 149 static int /* 0 success, REG_NOMATCH failure */ 150 matcher(g, string, nmatch, pmatch, eflags) 151 struct re_guts *g; 152 char *string; 153 size_t nmatch; 154 regmatch_t pmatch[]; 155 int eflags; 156 { 157 char *endp; 158 int i; 159 struct match mv; 160 struct match *m = &mv; 161 char *dp; 162 const sopno gf = g->firststate+1; /* +1 for OEND */ 163 const sopno gl = g->laststate; 164 char *start; 165 char *stop; 166 /* Boyer-Moore algorithms variables */ 167 char *pp; 168 int cj, mj; 169 char *mustfirst; 170 char *mustlast; 171 int *matchjump; 172 int *charjump; 173 174 /* simplify the situation where possible */ 175 if (g->cflags®_NOSUB) 176 nmatch = 0; 177 if (eflags®_STARTEND) { 178 start = string + pmatch[0].rm_so; 179 stop = string + pmatch[0].rm_eo; 180 } else { 181 start = string; 182 stop = start + strlen(start); 183 } 184 if (stop < start) 185 return(REG_INVARG); 186 187 /* prescreening; this does wonders for this rather slow code */ 188 if (g->must != NULL) { 189 if (g->charjump != NULL && g->matchjump != NULL) { 190 mustfirst = g->must; 191 mustlast = g->must + g->mlen - 1; 192 charjump = g->charjump; 193 matchjump = g->matchjump; 194 pp = mustlast; 195 for (dp = start+g->mlen-1; dp < stop;) { 196 /* Fast skip non-matches */ 197 while (dp < stop && charjump[(int)*dp]) 198 dp += charjump[(int)*dp]; 199 200 if (dp >= stop) 201 break; 202 203 /* Greedy matcher */ 204 /* We depend on not being used for 205 * for strings of length 1 206 */ 207 while (*--dp == *--pp && pp != mustfirst); 208 209 if (*dp == *pp) 210 break; 211 212 /* Jump to next possible match */ 213 mj = matchjump[pp - mustfirst]; 214 cj = charjump[(int)*dp]; 215 dp += (cj < mj ? mj : cj); 216 pp = mustlast; 217 } 218 if (pp != mustfirst) 219 return(REG_NOMATCH); 220 } else { 221 for (dp = start; dp < stop; dp++) 222 if (*dp == g->must[0] && 223 stop - dp >= g->mlen && 224 memcmp(dp, g->must, (size_t)g->mlen) == 0) 225 break; 226 if (dp == stop) /* we didn't find g->must */ 227 return(REG_NOMATCH); 228 } 229 } 230 231 /* match struct setup */ 232 m->g = g; 233 m->eflags = eflags; 234 m->pmatch = NULL; 235 m->lastpos = NULL; 236 m->offp = string; 237 m->beginp = start; 238 m->endp = stop; 239 STATESETUP(m, 4); 240 SETUP(m->st); 241 SETUP(m->fresh); 242 SETUP(m->tmp); 243 SETUP(m->empty); 244 CLEAR(m->empty); 245 ZAPSTATE(&m->mbs); 246 247 /* Adjust start according to moffset, to speed things up */ 248 if (g->moffset > -1) 249 start = ((dp - g->moffset) < start) ? start : dp - g->moffset; 250 251 /* this loop does only one repetition except for backrefs */ 252 for (;;) { 253 endp = fast(m, start, stop, gf, gl); 254 if (endp == NULL) { /* a miss */ 255 if (m->pmatch != NULL) 256 free((char *)m->pmatch); 257 if (m->lastpos != NULL) 258 free((char *)m->lastpos); 259 STATETEARDOWN(m); 260 return(REG_NOMATCH); 261 } 262 if (nmatch == 0 && !g->backrefs) 263 break; /* no further info needed */ 264 265 /* where? */ 266 assert(m->coldp != NULL); 267 for (;;) { 268 NOTE("finding start"); 269 endp = slow(m, m->coldp, stop, gf, gl); 270 if (endp != NULL) 271 break; 272 assert(m->coldp < m->endp); 273 m->coldp += XMBRTOWC(NULL, m->coldp, 274 m->endp - m->coldp, &m->mbs, 0); 275 } 276 if (nmatch == 1 && !g->backrefs) 277 break; /* no further info needed */ 278 279 /* oh my, he wants the subexpressions... */ 280 if (m->pmatch == NULL) 281 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 282 sizeof(regmatch_t)); 283 if (m->pmatch == NULL) { 284 STATETEARDOWN(m); 285 return(REG_ESPACE); 286 } 287 for (i = 1; i <= m->g->nsub; i++) 288 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 289 if (!g->backrefs && !(m->eflags®_BACKR)) { 290 NOTE("dissecting"); 291 dp = dissect(m, m->coldp, endp, gf, gl); 292 } else { 293 if (g->nplus > 0 && m->lastpos == NULL) 294 m->lastpos = (char **)malloc((g->nplus+1) * 295 sizeof(char *)); 296 if (g->nplus > 0 && m->lastpos == NULL) { 297 free(m->pmatch); 298 STATETEARDOWN(m); 299 return(REG_ESPACE); 300 } 301 NOTE("backref dissect"); 302 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 303 } 304 if (dp != NULL) 305 break; 306 307 /* uh-oh... we couldn't find a subexpression-level match */ 308 assert(g->backrefs); /* must be back references doing it */ 309 assert(g->nplus == 0 || m->lastpos != NULL); 310 for (;;) { 311 if (dp != NULL || endp <= m->coldp) 312 break; /* defeat */ 313 NOTE("backoff"); 314 endp = slow(m, m->coldp, endp-1, gf, gl); 315 if (endp == NULL) 316 break; /* defeat */ 317 /* try it on a shorter possibility */ 318 #ifndef NDEBUG 319 for (i = 1; i <= m->g->nsub; i++) { 320 assert(m->pmatch[i].rm_so == -1); 321 assert(m->pmatch[i].rm_eo == -1); 322 } 323 #endif 324 NOTE("backoff dissect"); 325 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 326 } 327 assert(dp == NULL || dp == endp); 328 if (dp != NULL) /* found a shorter one */ 329 break; 330 331 /* despite initial appearances, there is no match here */ 332 NOTE("false alarm"); 333 /* recycle starting later */ 334 start = m->coldp + XMBRTOWC(NULL, m->coldp, 335 stop - m->coldp, &m->mbs, 0); 336 assert(start <= stop); 337 } 338 339 /* fill in the details if requested */ 340 if (nmatch > 0) { 341 pmatch[0].rm_so = m->coldp - m->offp; 342 pmatch[0].rm_eo = endp - m->offp; 343 } 344 if (nmatch > 1) { 345 assert(m->pmatch != NULL); 346 for (i = 1; i < nmatch; i++) 347 if (i <= m->g->nsub) 348 pmatch[i] = m->pmatch[i]; 349 else { 350 pmatch[i].rm_so = -1; 351 pmatch[i].rm_eo = -1; 352 } 353 } 354 355 if (m->pmatch != NULL) 356 free((char *)m->pmatch); 357 if (m->lastpos != NULL) 358 free((char *)m->lastpos); 359 STATETEARDOWN(m); 360 return(0); 361 } 362 363 /* 364 - dissect - figure out what matched what, no back references 365 == static char *dissect(struct match *m, char *start, \ 366 == char *stop, sopno startst, sopno stopst); 367 */ 368 static char * /* == stop (success) always */ 369 dissect(m, start, stop, startst, stopst) 370 struct match *m; 371 char *start; 372 char *stop; 373 sopno startst; 374 sopno stopst; 375 { 376 int i; 377 sopno ss; /* start sop of current subRE */ 378 sopno es; /* end sop of current subRE */ 379 char *sp; /* start of string matched by it */ 380 char *stp; /* string matched by it cannot pass here */ 381 char *rest; /* start of rest of string */ 382 char *tail; /* string unmatched by rest of RE */ 383 sopno ssub; /* start sop of subsubRE */ 384 sopno esub; /* end sop of subsubRE */ 385 char *ssp; /* start of string matched by subsubRE */ 386 char *sep; /* end of string matched by subsubRE */ 387 char *oldssp; /* previous ssp */ 388 char *dp; 389 390 AT("diss", start, stop, startst, stopst); 391 sp = start; 392 for (ss = startst; ss < stopst; ss = es) { 393 /* identify end of subRE */ 394 es = ss; 395 switch (OP(m->g->strip[es])) { 396 case OPLUS_: 397 case OQUEST_: 398 es += OPND(m->g->strip[es]); 399 break; 400 case OCH_: 401 while (OP(m->g->strip[es]) != O_CH) 402 es += OPND(m->g->strip[es]); 403 break; 404 } 405 es++; 406 407 /* figure out what it matched */ 408 switch (OP(m->g->strip[ss])) { 409 case OEND: 410 assert(nope); 411 break; 412 case OCHAR: 413 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 414 break; 415 case OBOL: 416 case OEOL: 417 case OBOW: 418 case OEOW: 419 break; 420 case OANY: 421 case OANYOF: 422 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 423 break; 424 case OBACK_: 425 case O_BACK: 426 assert(nope); 427 break; 428 /* cases where length of match is hard to find */ 429 case OQUEST_: 430 stp = stop; 431 for (;;) { 432 /* how long could this one be? */ 433 rest = slow(m, sp, stp, ss, es); 434 assert(rest != NULL); /* it did match */ 435 /* could the rest match the rest? */ 436 tail = slow(m, rest, stop, es, stopst); 437 if (tail == stop) 438 break; /* yes! */ 439 /* no -- try a shorter match for this one */ 440 stp = rest - 1; 441 assert(stp >= sp); /* it did work */ 442 } 443 ssub = ss + 1; 444 esub = es - 1; 445 /* did innards match? */ 446 if (slow(m, sp, rest, ssub, esub) != NULL) { 447 dp = dissect(m, sp, rest, ssub, esub); 448 assert(dp == rest); 449 } else /* no */ 450 assert(sp == rest); 451 sp = rest; 452 break; 453 case OPLUS_: 454 stp = stop; 455 for (;;) { 456 /* how long could this one be? */ 457 rest = slow(m, sp, stp, ss, es); 458 assert(rest != NULL); /* it did match */ 459 /* could the rest match the rest? */ 460 tail = slow(m, rest, stop, es, stopst); 461 if (tail == stop) 462 break; /* yes! */ 463 /* no -- try a shorter match for this one */ 464 stp = rest - 1; 465 assert(stp >= sp); /* it did work */ 466 } 467 ssub = ss + 1; 468 esub = es - 1; 469 ssp = sp; 470 oldssp = ssp; 471 for (;;) { /* find last match of innards */ 472 sep = slow(m, ssp, rest, ssub, esub); 473 if (sep == NULL || sep == ssp) 474 break; /* failed or matched null */ 475 oldssp = ssp; /* on to next try */ 476 ssp = sep; 477 } 478 if (sep == NULL) { 479 /* last successful match */ 480 sep = ssp; 481 ssp = oldssp; 482 } 483 assert(sep == rest); /* must exhaust substring */ 484 assert(slow(m, ssp, sep, ssub, esub) == rest); 485 dp = dissect(m, ssp, sep, ssub, esub); 486 assert(dp == sep); 487 sp = rest; 488 break; 489 case OCH_: 490 stp = stop; 491 for (;;) { 492 /* how long could this one be? */ 493 rest = slow(m, sp, stp, ss, es); 494 assert(rest != NULL); /* it did match */ 495 /* could the rest match the rest? */ 496 tail = slow(m, rest, stop, es, stopst); 497 if (tail == stop) 498 break; /* yes! */ 499 /* no -- try a shorter match for this one */ 500 stp = rest - 1; 501 assert(stp >= sp); /* it did work */ 502 } 503 ssub = ss + 1; 504 esub = ss + OPND(m->g->strip[ss]) - 1; 505 assert(OP(m->g->strip[esub]) == OOR1); 506 for (;;) { /* find first matching branch */ 507 if (slow(m, sp, rest, ssub, esub) == rest) 508 break; /* it matched all of it */ 509 /* that one missed, try next one */ 510 assert(OP(m->g->strip[esub]) == OOR1); 511 esub++; 512 assert(OP(m->g->strip[esub]) == OOR2); 513 ssub = esub + 1; 514 esub += OPND(m->g->strip[esub]); 515 if (OP(m->g->strip[esub]) == OOR2) 516 esub--; 517 else 518 assert(OP(m->g->strip[esub]) == O_CH); 519 } 520 dp = dissect(m, sp, rest, ssub, esub); 521 assert(dp == rest); 522 sp = rest; 523 break; 524 case O_PLUS: 525 case O_QUEST: 526 case OOR1: 527 case OOR2: 528 case O_CH: 529 assert(nope); 530 break; 531 case OLPAREN: 532 i = OPND(m->g->strip[ss]); 533 assert(0 < i && i <= m->g->nsub); 534 m->pmatch[i].rm_so = sp - m->offp; 535 break; 536 case ORPAREN: 537 i = OPND(m->g->strip[ss]); 538 assert(0 < i && i <= m->g->nsub); 539 m->pmatch[i].rm_eo = sp - m->offp; 540 break; 541 default: /* uh oh */ 542 assert(nope); 543 break; 544 } 545 } 546 547 assert(sp == stop); 548 return(sp); 549 } 550 551 /* 552 - backref - figure out what matched what, figuring in back references 553 == static char *backref(struct match *m, char *start, \ 554 == char *stop, sopno startst, sopno stopst, sopno lev); 555 */ 556 static char * /* == stop (success) or NULL (failure) */ 557 backref(m, start, stop, startst, stopst, lev, rec) 558 struct match *m; 559 char *start; 560 char *stop; 561 sopno startst; 562 sopno stopst; 563 sopno lev; /* PLUS nesting level */ 564 int rec; 565 { 566 int i; 567 sopno ss; /* start sop of current subRE */ 568 char *sp; /* start of string matched by it */ 569 sopno ssub; /* start sop of subsubRE */ 570 sopno esub; /* end sop of subsubRE */ 571 char *ssp; /* start of string matched by subsubRE */ 572 char *dp; 573 size_t len; 574 int hard; 575 sop s; 576 regoff_t offsave; 577 cset *cs; 578 wint_t wc; 579 580 AT("back", start, stop, startst, stopst); 581 sp = start; 582 583 /* get as far as we can with easy stuff */ 584 hard = 0; 585 for (ss = startst; !hard && ss < stopst; ss++) 586 switch (OP(s = m->g->strip[ss])) { 587 case OCHAR: 588 if (sp == stop) 589 return(NULL); 590 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 591 if (wc != OPND(s)) 592 return(NULL); 593 break; 594 case OANY: 595 if (sp == stop) 596 return(NULL); 597 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 598 if (wc == BADCHAR) 599 return (NULL); 600 break; 601 case OANYOF: 602 if (sp == stop) 603 return (NULL); 604 cs = &m->g->sets[OPND(s)]; 605 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 606 if (wc == BADCHAR || !CHIN(cs, wc)) 607 return(NULL); 608 break; 609 case OBOL: 610 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 611 (sp < m->endp && *(sp-1) == '\n' && 612 (m->g->cflags®_NEWLINE)) ) 613 { /* yes */ } 614 else 615 return(NULL); 616 break; 617 case OEOL: 618 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 619 (sp < m->endp && *sp == '\n' && 620 (m->g->cflags®_NEWLINE)) ) 621 { /* yes */ } 622 else 623 return(NULL); 624 break; 625 case OBOW: 626 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 627 (sp < m->endp && *(sp-1) == '\n' && 628 (m->g->cflags®_NEWLINE)) || 629 (sp > m->beginp && 630 !ISWORD(*(sp-1))) ) && 631 (sp < m->endp && ISWORD(*sp)) ) 632 { /* yes */ } 633 else 634 return(NULL); 635 break; 636 case OEOW: 637 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 638 (sp < m->endp && *sp == '\n' && 639 (m->g->cflags®_NEWLINE)) || 640 (sp < m->endp && !ISWORD(*sp)) ) && 641 (sp > m->beginp && ISWORD(*(sp-1))) ) 642 { /* yes */ } 643 else 644 return(NULL); 645 break; 646 case O_QUEST: 647 break; 648 case OOR1: /* matches null but needs to skip */ 649 ss++; 650 s = m->g->strip[ss]; 651 do { 652 assert(OP(s) == OOR2); 653 ss += OPND(s); 654 } while (OP(s = m->g->strip[ss]) != O_CH); 655 /* note that the ss++ gets us past the O_CH */ 656 break; 657 default: /* have to make a choice */ 658 hard = 1; 659 break; 660 } 661 if (!hard) { /* that was it! */ 662 if (sp != stop) 663 return(NULL); 664 return(sp); 665 } 666 ss--; /* adjust for the for's final increment */ 667 668 /* the hard stuff */ 669 AT("hard", sp, stop, ss, stopst); 670 s = m->g->strip[ss]; 671 switch (OP(s)) { 672 case OBACK_: /* the vilest depths */ 673 i = OPND(s); 674 assert(0 < i && i <= m->g->nsub); 675 if (m->pmatch[i].rm_eo == -1) 676 return(NULL); 677 assert(m->pmatch[i].rm_so != -1); 678 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 679 if (len == 0 && rec++ > MAX_RECURSION) 680 return(NULL); 681 assert(stop - m->beginp >= len); 682 if (sp > stop - len) 683 return(NULL); /* not enough left to match */ 684 ssp = m->offp + m->pmatch[i].rm_so; 685 if (memcmp(sp, ssp, len) != 0) 686 return(NULL); 687 while (m->g->strip[ss] != SOP(O_BACK, i)) 688 ss++; 689 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 690 break; 691 case OQUEST_: /* to null or not */ 692 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 693 if (dp != NULL) 694 return(dp); /* not */ 695 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 696 break; 697 case OPLUS_: 698 assert(m->lastpos != NULL); 699 assert(lev+1 <= m->g->nplus); 700 m->lastpos[lev+1] = sp; 701 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 702 break; 703 case O_PLUS: 704 if (sp == m->lastpos[lev]) /* last pass matched null */ 705 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 706 /* try another pass */ 707 m->lastpos[lev] = sp; 708 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 709 if (dp == NULL) 710 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 711 else 712 return(dp); 713 break; 714 case OCH_: /* find the right one, if any */ 715 ssub = ss + 1; 716 esub = ss + OPND(s) - 1; 717 assert(OP(m->g->strip[esub]) == OOR1); 718 for (;;) { /* find first matching branch */ 719 dp = backref(m, sp, stop, ssub, esub, lev, rec); 720 if (dp != NULL) 721 return(dp); 722 /* that one missed, try next one */ 723 if (OP(m->g->strip[esub]) == O_CH) 724 return(NULL); /* there is none */ 725 esub++; 726 assert(OP(m->g->strip[esub]) == OOR2); 727 ssub = esub + 1; 728 esub += OPND(m->g->strip[esub]); 729 if (OP(m->g->strip[esub]) == OOR2) 730 esub--; 731 else 732 assert(OP(m->g->strip[esub]) == O_CH); 733 } 734 break; 735 case OLPAREN: /* must undo assignment if rest fails */ 736 i = OPND(s); 737 assert(0 < i && i <= m->g->nsub); 738 offsave = m->pmatch[i].rm_so; 739 m->pmatch[i].rm_so = sp - m->offp; 740 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 741 if (dp != NULL) 742 return(dp); 743 m->pmatch[i].rm_so = offsave; 744 return(NULL); 745 break; 746 case ORPAREN: /* must undo assignment if rest fails */ 747 i = OPND(s); 748 assert(0 < i && i <= m->g->nsub); 749 offsave = m->pmatch[i].rm_eo; 750 m->pmatch[i].rm_eo = sp - m->offp; 751 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 752 if (dp != NULL) 753 return(dp); 754 m->pmatch[i].rm_eo = offsave; 755 return(NULL); 756 break; 757 default: /* uh oh */ 758 assert(nope); 759 break; 760 } 761 762 /* "can't happen" */ 763 assert(nope); 764 /* NOTREACHED */ 765 return "shut up gcc"; 766 } 767 768 /* 769 - fast - step through the string at top speed 770 == static char *fast(struct match *m, char *start, \ 771 == char *stop, sopno startst, sopno stopst); 772 */ 773 static char * /* where tentative match ended, or NULL */ 774 fast(m, start, stop, startst, stopst) 775 struct match *m; 776 char *start; 777 char *stop; 778 sopno startst; 779 sopno stopst; 780 { 781 states st = m->st; 782 states fresh = m->fresh; 783 states tmp = m->tmp; 784 char *p = start; 785 wint_t c; 786 wint_t lastc; /* previous c */ 787 wint_t flagch; 788 int i; 789 char *coldp; /* last p after which no match was underway */ 790 size_t clen; 791 792 CLEAR(st); 793 SET1(st, startst); 794 st = step(m->g, startst, stopst, st, NOTHING, st); 795 ASSIGN(fresh, st); 796 SP("start", st, *p); 797 coldp = NULL; 798 if (start == m->beginp) 799 c = OUT; 800 else { 801 /* 802 * XXX Wrong if the previous character was multi-byte. 803 * Newline never is (in encodings supported by FreeBSD), 804 * so this only breaks the ISWORD tests below. 805 */ 806 c = (uch)*(start - 1); 807 } 808 for (;;) { 809 /* next character */ 810 lastc = c; 811 if (p == m->endp) { 812 clen = 0; 813 c = OUT; 814 } else 815 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 816 if (EQ(st, fresh)) 817 coldp = p; 818 819 /* is there an EOL and/or BOL between lastc and c? */ 820 flagch = '\0'; 821 i = 0; 822 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 823 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 824 flagch = BOL; 825 i = m->g->nbol; 826 } 827 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 828 (c == OUT && !(m->eflags®_NOTEOL)) ) { 829 flagch = (flagch == BOL) ? BOLEOL : EOL; 830 i += m->g->neol; 831 } 832 if (i != 0) { 833 for (; i > 0; i--) 834 st = step(m->g, startst, stopst, st, flagch, st); 835 SP("boleol", st, c); 836 } 837 838 /* how about a word boundary? */ 839 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 840 (c != OUT && ISWORD(c)) ) { 841 flagch = BOW; 842 } 843 if ( (lastc != OUT && ISWORD(lastc)) && 844 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 845 flagch = EOW; 846 } 847 if (flagch == BOW || flagch == EOW) { 848 st = step(m->g, startst, stopst, st, flagch, st); 849 SP("boweow", st, c); 850 } 851 852 /* are we done? */ 853 if (ISSET(st, stopst) || p == stop || clen > stop - p) 854 break; /* NOTE BREAK OUT */ 855 856 /* no, we must deal with this character */ 857 ASSIGN(tmp, st); 858 ASSIGN(st, fresh); 859 assert(c != OUT); 860 st = step(m->g, startst, stopst, tmp, c, st); 861 SP("aft", st, c); 862 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 863 p += clen; 864 } 865 866 assert(coldp != NULL); 867 m->coldp = coldp; 868 if (ISSET(st, stopst)) 869 return(p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 870 else 871 return(NULL); 872 } 873 874 /* 875 - slow - step through the string more deliberately 876 == static char *slow(struct match *m, char *start, \ 877 == char *stop, sopno startst, sopno stopst); 878 */ 879 static char * /* where it ended */ 880 slow(m, start, stop, startst, stopst) 881 struct match *m; 882 char *start; 883 char *stop; 884 sopno startst; 885 sopno stopst; 886 { 887 states st = m->st; 888 states empty = m->empty; 889 states tmp = m->tmp; 890 char *p = start; 891 wint_t c; 892 wint_t lastc; /* previous c */ 893 wint_t flagch; 894 int i; 895 char *matchp; /* last p at which a match ended */ 896 size_t clen; 897 898 AT("slow", start, stop, startst, stopst); 899 CLEAR(st); 900 SET1(st, startst); 901 SP("sstart", st, *p); 902 st = step(m->g, startst, stopst, st, NOTHING, st); 903 matchp = NULL; 904 if (start == m->beginp) 905 c = OUT; 906 else { 907 /* 908 * XXX Wrong if the previous character was multi-byte. 909 * Newline never is (in encodings supported by FreeBSD), 910 * so this only breaks the ISWORD tests below. 911 */ 912 c = (uch)*(start - 1); 913 } 914 for (;;) { 915 /* next character */ 916 lastc = c; 917 if (p == m->endp) { 918 c = OUT; 919 clen = 0; 920 } else 921 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 922 923 /* is there an EOL and/or BOL between lastc and c? */ 924 flagch = '\0'; 925 i = 0; 926 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 927 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 928 flagch = BOL; 929 i = m->g->nbol; 930 } 931 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 932 (c == OUT && !(m->eflags®_NOTEOL)) ) { 933 flagch = (flagch == BOL) ? BOLEOL : EOL; 934 i += m->g->neol; 935 } 936 if (i != 0) { 937 for (; i > 0; i--) 938 st = step(m->g, startst, stopst, st, flagch, st); 939 SP("sboleol", st, c); 940 } 941 942 /* how about a word boundary? */ 943 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 944 (c != OUT && ISWORD(c)) ) { 945 flagch = BOW; 946 } 947 if ( (lastc != OUT && ISWORD(lastc)) && 948 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 949 flagch = EOW; 950 } 951 if (flagch == BOW || flagch == EOW) { 952 st = step(m->g, startst, stopst, st, flagch, st); 953 SP("sboweow", st, c); 954 } 955 956 /* are we done? */ 957 if (ISSET(st, stopst)) 958 matchp = p; 959 if (EQ(st, empty) || p == stop || clen > stop - p) 960 break; /* NOTE BREAK OUT */ 961 962 /* no, we must deal with this character */ 963 ASSIGN(tmp, st); 964 ASSIGN(st, empty); 965 assert(c != OUT); 966 st = step(m->g, startst, stopst, tmp, c, st); 967 SP("saft", st, c); 968 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 969 p += clen; 970 } 971 972 return(matchp); 973 } 974 975 976 /* 977 - step - map set of states reachable before char to set reachable after 978 == static states step(struct re_guts *g, sopno start, sopno stop, \ 979 == states bef, int ch, states aft); 980 == #define BOL (OUT-1) 981 == #define EOL (BOL-1) 982 == #define BOLEOL (BOL-2) 983 == #define NOTHING (BOL-3) 984 == #define BOW (BOL-4) 985 == #define EOW (BOL-5) 986 == #define BADCHAR (BOL-6) 987 == #define NONCHAR(c) ((c) <= OUT) 988 */ 989 static states 990 step(g, start, stop, bef, ch, aft) 991 struct re_guts *g; 992 sopno start; /* start state within strip */ 993 sopno stop; /* state after stop state within strip */ 994 states bef; /* states reachable before */ 995 wint_t ch; /* character or NONCHAR code */ 996 states aft; /* states already known reachable after */ 997 { 998 cset *cs; 999 sop s; 1000 sopno pc; 1001 onestate here; /* note, macros know this name */ 1002 sopno look; 1003 int i; 1004 1005 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 1006 s = g->strip[pc]; 1007 switch (OP(s)) { 1008 case OEND: 1009 assert(pc == stop-1); 1010 break; 1011 case OCHAR: 1012 /* only characters can match */ 1013 assert(!NONCHAR(ch) || ch != OPND(s)); 1014 if (ch == OPND(s)) 1015 FWD(aft, bef, 1); 1016 break; 1017 case OBOL: 1018 if (ch == BOL || ch == BOLEOL) 1019 FWD(aft, bef, 1); 1020 break; 1021 case OEOL: 1022 if (ch == EOL || ch == BOLEOL) 1023 FWD(aft, bef, 1); 1024 break; 1025 case OBOW: 1026 if (ch == BOW) 1027 FWD(aft, bef, 1); 1028 break; 1029 case OEOW: 1030 if (ch == EOW) 1031 FWD(aft, bef, 1); 1032 break; 1033 case OANY: 1034 if (!NONCHAR(ch)) 1035 FWD(aft, bef, 1); 1036 break; 1037 case OANYOF: 1038 cs = &g->sets[OPND(s)]; 1039 if (!NONCHAR(ch) && CHIN(cs, ch)) 1040 FWD(aft, bef, 1); 1041 break; 1042 case OBACK_: /* ignored here */ 1043 case O_BACK: 1044 FWD(aft, aft, 1); 1045 break; 1046 case OPLUS_: /* forward, this is just an empty */ 1047 FWD(aft, aft, 1); 1048 break; 1049 case O_PLUS: /* both forward and back */ 1050 FWD(aft, aft, 1); 1051 i = ISSETBACK(aft, OPND(s)); 1052 BACK(aft, aft, OPND(s)); 1053 if (!i && ISSETBACK(aft, OPND(s))) { 1054 /* oho, must reconsider loop body */ 1055 pc -= OPND(s) + 1; 1056 INIT(here, pc); 1057 } 1058 break; 1059 case OQUEST_: /* two branches, both forward */ 1060 FWD(aft, aft, 1); 1061 FWD(aft, aft, OPND(s)); 1062 break; 1063 case O_QUEST: /* just an empty */ 1064 FWD(aft, aft, 1); 1065 break; 1066 case OLPAREN: /* not significant here */ 1067 case ORPAREN: 1068 FWD(aft, aft, 1); 1069 break; 1070 case OCH_: /* mark the first two branches */ 1071 FWD(aft, aft, 1); 1072 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1073 FWD(aft, aft, OPND(s)); 1074 break; 1075 case OOR1: /* done a branch, find the O_CH */ 1076 if (ISSTATEIN(aft, here)) { 1077 for (look = 1; 1078 OP(s = g->strip[pc+look]) != O_CH; 1079 look += OPND(s)) 1080 assert(OP(s) == OOR2); 1081 FWD(aft, aft, look); 1082 } 1083 break; 1084 case OOR2: /* propagate OCH_'s marking */ 1085 FWD(aft, aft, 1); 1086 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1087 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1088 FWD(aft, aft, OPND(s)); 1089 } 1090 break; 1091 case O_CH: /* just empty */ 1092 FWD(aft, aft, 1); 1093 break; 1094 default: /* ooooops... */ 1095 assert(nope); 1096 break; 1097 } 1098 } 1099 1100 return(aft); 1101 } 1102 1103 #ifdef REDEBUG 1104 /* 1105 - print - print a set of states 1106 == #ifdef REDEBUG 1107 == static void print(struct match *m, char *caption, states st, \ 1108 == int ch, FILE *d); 1109 == #endif 1110 */ 1111 static void 1112 print(m, caption, st, ch, d) 1113 struct match *m; 1114 char *caption; 1115 states st; 1116 int ch; 1117 FILE *d; 1118 { 1119 struct re_guts *g = m->g; 1120 int i; 1121 int first = 1; 1122 1123 if (!(m->eflags®_TRACE)) 1124 return; 1125 1126 fprintf(d, "%s", caption); 1127 if (ch != '\0') 1128 fprintf(d, " %s", pchar(ch)); 1129 for (i = 0; i < g->nstates; i++) 1130 if (ISSET(st, i)) { 1131 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1132 first = 0; 1133 } 1134 fprintf(d, "\n"); 1135 } 1136 1137 /* 1138 - at - print current situation 1139 == #ifdef REDEBUG 1140 == static void at(struct match *m, char *title, char *start, char *stop, \ 1141 == sopno startst, sopno stopst); 1142 == #endif 1143 */ 1144 static void 1145 at(m, title, start, stop, startst, stopst) 1146 struct match *m; 1147 char *title; 1148 char *start; 1149 char *stop; 1150 sopno startst; 1151 sopno stopst; 1152 { 1153 if (!(m->eflags®_TRACE)) 1154 return; 1155 1156 printf("%s %s-", title, pchar(*start)); 1157 printf("%s ", pchar(*stop)); 1158 printf("%ld-%ld\n", (long)startst, (long)stopst); 1159 } 1160 1161 #ifndef PCHARDONE 1162 #define PCHARDONE /* never again */ 1163 /* 1164 - pchar - make a character printable 1165 == #ifdef REDEBUG 1166 == static char *pchar(int ch); 1167 == #endif 1168 * 1169 * Is this identical to regchar() over in debug.c? Well, yes. But a 1170 * duplicate here avoids having a debugging-capable regexec.o tied to 1171 * a matching debug.o, and this is convenient. It all disappears in 1172 * the non-debug compilation anyway, so it doesn't matter much. 1173 */ 1174 static char * /* -> representation */ 1175 pchar(ch) 1176 int ch; 1177 { 1178 static char pbuf[10]; 1179 1180 if (isprint((uch)ch) || ch == ' ') 1181 sprintf(pbuf, "%c", ch); 1182 else 1183 sprintf(pbuf, "\\%o", ch); 1184 return(pbuf); 1185 } 1186 #endif 1187 #endif 1188 1189 #undef matcher 1190 #undef fast 1191 #undef slow 1192 #undef dissect 1193 #undef backref 1194 #undef step 1195 #undef print 1196 #undef at 1197 #undef match 1198