1 /* 2 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 3 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 4 * Copyright (c) 1992, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Henry Spencer. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * The matching engine and friends. This file is #included by regexec.c 37 * after suitable #defines of a variety of macros used herein, so that 38 * different state representations can be used without duplicating masses 39 * of code. 40 */ 41 42 #ifdef SNAMES 43 #define matcher smatcher 44 #define fast sfast 45 #define slow sslow 46 #define dissect sdissect 47 #define backref sbackref 48 #define step sstep 49 #define print sprint 50 #define at sat 51 #define match smat 52 #endif 53 #ifdef LNAMES 54 #define matcher lmatcher 55 #define fast lfast 56 #define slow lslow 57 #define dissect ldissect 58 #define backref lbackref 59 #define step lstep 60 #define print lprint 61 #define at lat 62 #define match lmat 63 #endif 64 #ifdef MNAMES 65 #define matcher mmatcher 66 #define fast mfast 67 #define slow mslow 68 #define dissect mdissect 69 #define backref mbackref 70 #define step mstep 71 #define print mprint 72 #define at mat 73 #define match mmat 74 #endif 75 76 /* another structure passed up and down to avoid zillions of parameters */ 77 struct match { 78 struct re_guts *g; 79 int eflags; 80 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 81 const char *offp; /* offsets work from here */ 82 const char *beginp; /* start of string -- virtual NUL precedes */ 83 const char *endp; /* end of string -- virtual NUL here */ 84 const char *coldp; /* can be no match starting before here */ 85 const char **lastpos; /* [nplus+1] */ 86 STATEVARS; 87 states st; /* current states */ 88 states fresh; /* states for a fresh start */ 89 states tmp; /* temporary */ 90 states empty; /* empty set of states */ 91 mbstate_t mbs; /* multibyte conversion state */ 92 }; 93 94 /* ========= begin header generated by ./mkh ========= */ 95 #ifdef __cplusplus 96 extern "C" { 97 #endif 98 99 /* === engine.c === */ 100 static int matcher(struct re_guts *, const char *, size_t, regmatch_t[], int); 101 static const char *dissect(struct match *, const char *, const char *, 102 sopno, sopno); 103 static const char *backref(struct match *, const char *, const char *, sopno, 104 sopno, sopno, int); 105 static const char *fast(struct match *, const char *, const char *, sopno, 106 sopno); 107 static const char *slow(struct match *, const char *, const char *, sopno, 108 sopno); 109 static states step(struct re_guts *, sopno, sopno, states, wint_t, states); 110 #define MAX_RECURSION 100 111 #define BOL (OUT-1) 112 #define EOL (BOL-1) 113 #define BOLEOL (BOL-2) 114 #define NOTHING (BOL-3) 115 #define BOW (BOL-4) 116 #define EOW (BOL-5) 117 #define BADCHAR (BOL-6) 118 #define NONCHAR(c) ((c) <= OUT) 119 #ifdef REDEBUG 120 static void print(struct match *, const char *, states, int, FILE *); 121 #endif 122 #ifdef REDEBUG 123 static void at(struct match *, const char *, const char *, const char *, 124 sopno, sopno); 125 #endif 126 #ifdef REDEBUG 127 static const char *pchar(int ch); 128 #endif 129 130 #ifdef __cplusplus 131 } 132 #endif 133 /* ========= end header generated by ./mkh ========= */ 134 135 #ifdef REDEBUG 136 #define SP(t, s, c) print(m, t, s, c, stdout) 137 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 138 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 139 #else 140 #define SP(t, s, c) /* nothing */ 141 #define AT(t, p1, p2, s1, s2) /* nothing */ 142 #define NOTE(s) /* nothing */ 143 #endif 144 145 /* 146 * matcher - the actual matching engine 147 */ 148 static int /* 0 success, REG_NOMATCH failure */ 149 matcher(struct re_guts *g, 150 const char *string, 151 size_t nmatch, 152 regmatch_t pmatch[], 153 int eflags) 154 { 155 const char *endp; 156 int i; 157 struct match mv; 158 struct match *m = &mv; 159 const char *dp; 160 const sopno gf = g->firststate+1; /* +1 for OEND */ 161 const sopno gl = g->laststate; 162 const char *start; 163 const char *stop; 164 /* Boyer-Moore algorithms variables */ 165 const char *pp; 166 int cj, mj; 167 const char *mustfirst; 168 const char *mustlast; 169 int *matchjump; 170 int *charjump; 171 172 /* simplify the situation where possible */ 173 if (g->cflags®_NOSUB) 174 nmatch = 0; 175 176 if (eflags®_STARTEND) { 177 start = string + pmatch[0].rm_so; 178 stop = string + pmatch[0].rm_eo; 179 } else { 180 start = string; 181 stop = start + strlen(start); 182 } 183 184 if (stop < start) 185 return (REG_EFATAL); 186 187 /* prescreening; this does wonders for this rather slow code */ 188 if (g->must != NULL) { 189 if (g->charjump != NULL && g->matchjump != NULL) { 190 mustfirst = g->must; 191 mustlast = g->must + g->mlen - 1; 192 charjump = g->charjump; 193 matchjump = g->matchjump; 194 pp = mustlast; 195 for (dp = start+g->mlen-1; dp < stop; ) { 196 /* Fast skip non-matches */ 197 while (dp < stop && charjump[(int)*dp]) 198 dp += charjump[(int)*dp]; 199 200 if (dp >= stop) 201 break; 202 203 /* Greedy matcher */ 204 /* 205 * We depend on not being used for 206 * for strings of length 1 207 */ 208 while (*--dp == *--pp && pp != mustfirst) 209 ; 210 211 if (*dp == *pp) 212 break; 213 214 /* Jump to next possible match */ 215 mj = matchjump[pp - mustfirst]; 216 cj = charjump[(int)*dp]; 217 dp += (cj < mj ? mj : cj); 218 pp = mustlast; 219 } 220 if (pp != mustfirst) 221 return (REG_NOMATCH); 222 } else { 223 for (dp = start; dp < stop; dp++) 224 if (*dp == g->must[0] && 225 stop - dp >= g->mlen && 226 memcmp(dp, g->must, (size_t)g->mlen) == 0) 227 break; 228 if (dp == stop) /* we didn't find g->must */ 229 return (REG_NOMATCH); 230 } 231 } 232 233 /* match struct setup */ 234 m->g = g; 235 m->eflags = eflags; 236 m->pmatch = NULL; 237 m->lastpos = NULL; 238 m->offp = string; 239 m->beginp = start; 240 m->endp = stop; 241 STATESETUP(m, 4); 242 SETUP(m->st); 243 SETUP(m->fresh); 244 SETUP(m->tmp); 245 SETUP(m->empty); 246 CLEAR(m->empty); 247 ZAPSTATE(&m->mbs); 248 249 /* Adjust start according to moffset, to speed things up */ 250 if (g->moffset > -1) 251 start = ((dp - g->moffset) < start) ? start : dp - g->moffset; 252 253 SP("mloop", m->st, *start); 254 255 /* this loop does only one repetition except for backrefs */ 256 for (;;) { 257 endp = fast(m, start, stop, gf, gl); 258 if (endp == NULL) { /* a miss */ 259 if (m->pmatch != NULL) 260 free((char *)m->pmatch); 261 if (m->lastpos != NULL) 262 free((char *)m->lastpos); 263 STATETEARDOWN(m); 264 return (REG_NOMATCH); 265 } 266 if (nmatch == 0 && !g->backrefs) 267 break; /* no further info needed */ 268 269 /* where? */ 270 assert(m->coldp != NULL); 271 for (;;) { 272 NOTE("finding start"); 273 endp = slow(m, m->coldp, stop, gf, gl); 274 if (endp != NULL) 275 break; 276 assert(m->coldp < m->endp); 277 m->coldp += XMBRTOWC(NULL, m->coldp, 278 m->endp - m->coldp, &m->mbs, 0); 279 } 280 if (nmatch == 1 && !g->backrefs) 281 break; /* no further info needed */ 282 283 /* oh my, he wants the subexpressions... */ 284 if (m->pmatch == NULL) 285 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 286 sizeof (regmatch_t)); 287 if (m->pmatch == NULL) { 288 STATETEARDOWN(m); 289 return (REG_ESPACE); 290 } 291 for (i = 1; i <= m->g->nsub; i++) 292 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 293 /* NB: FreeBSD has REG_BACKR, we do not */ 294 if (!g->backrefs /* && !(m->eflags®_BACKR) */) { 295 NOTE("dissecting"); 296 dp = dissect(m, m->coldp, endp, gf, gl); 297 } else { 298 if (g->nplus > 0 && m->lastpos == NULL) 299 m->lastpos = malloc((g->nplus+1) * 300 sizeof (const char *)); 301 if (g->nplus > 0 && m->lastpos == NULL) { 302 free(m->pmatch); 303 STATETEARDOWN(m); 304 return (REG_ESPACE); 305 } 306 NOTE("backref dissect"); 307 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 308 } 309 if (dp != NULL) 310 break; 311 312 /* uh-oh... we couldn't find a subexpression-level match */ 313 assert(g->backrefs); /* must be back references doing it */ 314 assert(g->nplus == 0 || m->lastpos != NULL); 315 for (;;) { 316 if (dp != NULL || endp <= m->coldp) 317 break; /* defeat */ 318 NOTE("backoff"); 319 endp = slow(m, m->coldp, endp-1, gf, gl); 320 if (endp == NULL) 321 break; /* defeat */ 322 /* try it on a shorter possibility */ 323 #ifndef NDEBUG 324 for (i = 1; i <= m->g->nsub; i++) { 325 assert(m->pmatch[i].rm_so == -1); 326 assert(m->pmatch[i].rm_eo == -1); 327 } 328 #endif 329 NOTE("backoff dissect"); 330 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 331 } 332 assert(dp == NULL || dp == endp); 333 if (dp != NULL) /* found a shorter one */ 334 break; 335 336 /* despite initial appearances, there is no match here */ 337 NOTE("false alarm"); 338 /* recycle starting later */ 339 start = m->coldp + XMBRTOWC(NULL, m->coldp, 340 stop - m->coldp, &m->mbs, 0); 341 assert(start <= stop); 342 } 343 344 /* fill in the details if requested */ 345 if (nmatch > 0) { 346 pmatch[0].rm_so = m->coldp - m->offp; 347 pmatch[0].rm_eo = endp - m->offp; 348 } 349 if (nmatch > 1) { 350 assert(m->pmatch != NULL); 351 for (i = 1; i < nmatch; i++) 352 if (i <= m->g->nsub) 353 pmatch[i] = m->pmatch[i]; 354 else { 355 pmatch[i].rm_so = -1; 356 pmatch[i].rm_eo = -1; 357 } 358 } 359 360 if (m->pmatch != NULL) 361 free((char *)m->pmatch); 362 if (m->lastpos != NULL) 363 free((char *)m->lastpos); 364 STATETEARDOWN(m); 365 return (0); 366 } 367 368 /* 369 * dissect - figure out what matched what, no back references 370 */ 371 static const char * 372 dissect(struct match *m, const char *start, const char *stop, sopno startst, 373 sopno stopst) 374 { 375 int i; 376 sopno ss; /* start sop of current subRE */ 377 sopno es; /* end sop of current subRE */ 378 const char *sp; /* start of string matched by it */ 379 const char *stp; /* string matched by it cannot pass here */ 380 const char *rest; /* start of rest of string */ 381 const char *tail; /* string unmatched by rest of RE */ 382 sopno ssub; /* start sop of subsubRE */ 383 sopno esub; /* end sop of subsubRE */ 384 const char *ssp; /* start of string matched by subsubRE */ 385 const char *sep; /* end of string matched by subsubRE */ 386 const char *oldssp; /* previous ssp */ 387 const char *dp; 388 389 AT("diss", start, stop, startst, stopst); 390 sp = start; 391 for (ss = startst; ss < stopst; ss = es) { 392 /* identify end of subRE */ 393 es = ss; 394 switch (OP(m->g->strip[es])) { 395 case OPLUS_: 396 case OQUEST_: 397 es += OPND(m->g->strip[es]); 398 break; 399 case OCH_: 400 while (OP(m->g->strip[es]) != O_CH) 401 es += OPND(m->g->strip[es]); 402 break; 403 } 404 es++; 405 406 /* figure out what it matched */ 407 switch (OP(m->g->strip[ss])) { 408 case OEND: 409 assert(0); 410 break; 411 case OCHAR: 412 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 413 break; 414 case OBOL: 415 case OEOL: 416 case OBOW: 417 case OEOW: 418 break; 419 case OANY: 420 case OANYOF: 421 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 422 break; 423 case OBACK_: 424 case O_BACK: 425 assert(0); 426 break; 427 /* cases where length of match is hard to find */ 428 case OQUEST_: 429 stp = stop; 430 for (;;) { 431 /* how long could this one be? */ 432 rest = slow(m, sp, stp, ss, es); 433 assert(rest != NULL); /* it did match */ 434 /* could the rest match the rest? */ 435 tail = slow(m, rest, stop, es, stopst); 436 if (tail == stop) 437 break; /* yes! */ 438 /* no -- try a shorter match for this one */ 439 stp = rest - 1; 440 assert(stp >= sp); /* it did work */ 441 } 442 ssub = ss + 1; 443 esub = es - 1; 444 /* did innards match? */ 445 if (slow(m, sp, rest, ssub, esub) != NULL) { 446 dp = dissect(m, sp, rest, ssub, esub); 447 assert(dp == rest); 448 #if defined(__lint) 449 (void) dp; 450 #endif 451 } else /* no */ 452 assert(sp == rest); 453 sp = rest; 454 break; 455 case OPLUS_: 456 stp = stop; 457 for (;;) { 458 /* how long could this one be? */ 459 rest = slow(m, sp, stp, ss, es); 460 assert(rest != NULL); /* it did match */ 461 /* could the rest match the rest? */ 462 tail = slow(m, rest, stop, es, stopst); 463 if (tail == stop) 464 break; /* yes! */ 465 /* no -- try a shorter match for this one */ 466 stp = rest - 1; 467 assert(stp >= sp); /* it did work */ 468 } 469 ssub = ss + 1; 470 esub = es - 1; 471 ssp = sp; 472 oldssp = ssp; 473 for (;;) { /* find last match of innards */ 474 sep = slow(m, ssp, rest, ssub, esub); 475 if (sep == NULL || sep == ssp) 476 break; /* failed or matched null */ 477 oldssp = ssp; /* on to next try */ 478 ssp = sep; 479 } 480 if (sep == NULL) { 481 /* last successful match */ 482 sep = ssp; 483 ssp = oldssp; 484 } 485 assert(sep == rest); /* must exhaust substring */ 486 assert(slow(m, ssp, sep, ssub, esub) == rest); 487 dp = dissect(m, ssp, sep, ssub, esub); 488 assert(dp == sep); 489 sp = rest; 490 break; 491 case OCH_: 492 stp = stop; 493 for (;;) { 494 /* how long could this one be? */ 495 rest = slow(m, sp, stp, ss, es); 496 assert(rest != NULL); /* it did match */ 497 /* could the rest match the rest? */ 498 tail = slow(m, rest, stop, es, stopst); 499 if (tail == stop) 500 break; /* yes! */ 501 /* no -- try a shorter match for this one */ 502 stp = rest - 1; 503 assert(stp >= sp); /* it did work */ 504 } 505 ssub = ss + 1; 506 esub = ss + OPND(m->g->strip[ss]) - 1; 507 assert(OP(m->g->strip[esub]) == OOR1); 508 for (;;) { /* find first matching branch */ 509 if (slow(m, sp, rest, ssub, esub) == rest) 510 break; /* it matched all of it */ 511 /* that one missed, try next one */ 512 assert(OP(m->g->strip[esub]) == OOR1); 513 esub++; 514 assert(OP(m->g->strip[esub]) == OOR2); 515 ssub = esub + 1; 516 esub += OPND(m->g->strip[esub]); 517 if (OP(m->g->strip[esub]) == OOR2) 518 esub--; 519 else 520 assert(OP(m->g->strip[esub]) == O_CH); 521 } 522 dp = dissect(m, sp, rest, ssub, esub); 523 assert(dp == rest); 524 sp = rest; 525 break; 526 case O_PLUS: 527 case O_QUEST: 528 case OOR1: 529 case OOR2: 530 case O_CH: 531 assert(0); 532 break; 533 case OLPAREN: 534 i = OPND(m->g->strip[ss]); 535 assert(0 < i && i <= m->g->nsub); 536 m->pmatch[i].rm_so = sp - m->offp; 537 break; 538 case ORPAREN: 539 i = OPND(m->g->strip[ss]); 540 assert(0 < i && i <= m->g->nsub); 541 m->pmatch[i].rm_eo = sp - m->offp; 542 break; 543 default: /* uh oh */ 544 assert(0); 545 break; 546 } 547 } 548 549 assert(sp == stop); 550 return (sp); 551 } 552 553 /* 554 * backref - figure out what matched what, figuring in back references 555 */ 556 static const char * 557 backref(struct match *m, const char *start, const char *stop, sopno startst, 558 sopno stopst, sopno lev, /* PLUS nesting level */ 559 int rec) 560 { 561 int i; 562 sopno ss; /* start sop of current subRE */ 563 const char *sp; /* start of string matched by it */ 564 sopno ssub; /* start sop of subsubRE */ 565 sopno esub; /* end sop of subsubRE */ 566 const char *ssp; /* start of string matched by subsubRE */ 567 const char *dp; 568 size_t len; 569 int hard; 570 sop s; 571 regoff_t offsave; 572 cset *cs; 573 wint_t wc; 574 575 AT("back", start, stop, startst, stopst); 576 sp = start; 577 578 /* get as far as we can with easy stuff */ 579 hard = 0; 580 for (ss = startst; !hard && ss < stopst; ss++) 581 switch (OP(s = m->g->strip[ss])) { 582 case OCHAR: 583 if (sp == stop) 584 return (NULL); 585 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 586 if (wc != OPND(s)) 587 return (NULL); 588 break; 589 case OANY: 590 if (sp == stop) 591 return (NULL); 592 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 593 if (wc == BADCHAR) 594 return (NULL); 595 break; 596 case OANYOF: 597 if (sp == stop) 598 return (NULL); 599 cs = &m->g->sets[OPND(s)]; 600 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 601 if (wc == BADCHAR || !CHIN(cs, wc)) 602 return (NULL); 603 break; 604 case OBOL: 605 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 606 (sp < m->endp && *(sp-1) == '\n' && 607 (m->g->cflags®_NEWLINE))) { 608 break; 609 } 610 else 611 return (NULL); 612 break; 613 case OEOL: 614 if ((sp == m->endp && !(m->eflags®_NOTEOL)) || 615 (sp < m->endp && *sp == '\n' && 616 (m->g->cflags®_NEWLINE))) { 617 break; 618 } 619 else 620 return (NULL); 621 break; 622 case OBOW: 623 if (((sp == m->beginp && !(m->eflags®_NOTBOL)) || 624 (sp < m->endp && *(sp-1) == '\n' && 625 (m->g->cflags®_NEWLINE)) || 626 (sp > m->beginp && !ISWORD(*(sp-1)))) && 627 (sp < m->endp && ISWORD(*sp))) { 628 break; 629 } else 630 return (NULL); 631 break; 632 case OEOW: 633 if (((sp == m->endp && !(m->eflags®_NOTEOL)) || 634 (sp < m->endp && *sp == '\n' && 635 (m->g->cflags®_NEWLINE)) || 636 (sp < m->endp && !ISWORD(*sp))) && 637 (sp > m->beginp && ISWORD(*(sp-1)))) { 638 break; 639 } else 640 return (NULL); 641 break; 642 case O_QUEST: 643 break; 644 case OOR1: /* matches null but needs to skip */ 645 ss++; 646 s = m->g->strip[ss]; 647 do { 648 assert(OP(s) == OOR2); 649 ss += OPND(s); 650 } while (OP(s = m->g->strip[ss]) != O_CH); 651 /* note that the ss++ gets us past the O_CH */ 652 break; 653 default: /* have to make a choice */ 654 hard = 1; 655 break; 656 } 657 if (!hard) { /* that was it! */ 658 if (sp != stop) 659 return (NULL); 660 return (sp); 661 } 662 ss--; /* adjust for the for's final increment */ 663 664 /* the hard stuff */ 665 AT("hard", sp, stop, ss, stopst); 666 s = m->g->strip[ss]; 667 switch (OP(s)) { 668 case OBACK_: /* the vilest depths */ 669 i = OPND(s); 670 assert(0 < i && i <= m->g->nsub); 671 if (m->pmatch[i].rm_eo == -1) 672 return (NULL); 673 assert(m->pmatch[i].rm_so != -1); 674 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 675 if (len == 0 && rec++ > MAX_RECURSION) 676 return (NULL); 677 assert(stop - m->beginp >= len); 678 if (sp > stop - len) 679 return (NULL); /* not enough left to match */ 680 ssp = m->offp + m->pmatch[i].rm_so; 681 if (memcmp(sp, ssp, len) != 0) 682 return (NULL); 683 while (m->g->strip[ss] != SOP(O_BACK, i)) 684 ss++; 685 return (backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 686 break; 687 case OQUEST_: /* to null or not */ 688 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 689 if (dp != NULL) 690 return (dp); /* not */ 691 return (backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 692 break; 693 case OPLUS_: 694 assert(m->lastpos != NULL); 695 assert(lev+1 <= m->g->nplus); 696 m->lastpos[lev+1] = sp; 697 return (backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 698 break; 699 case O_PLUS: 700 if (sp == m->lastpos[lev]) /* last pass matched null */ 701 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 702 /* try another pass */ 703 m->lastpos[lev] = sp; 704 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 705 if (dp == NULL) 706 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 707 else 708 return (dp); 709 break; 710 case OCH_: /* find the right one, if any */ 711 ssub = ss + 1; 712 esub = ss + OPND(s) - 1; 713 assert(OP(m->g->strip[esub]) == OOR1); 714 for (;;) { /* find first matching branch */ 715 dp = backref(m, sp, stop, ssub, esub, lev, rec); 716 if (dp != NULL) 717 return (dp); 718 /* that one missed, try next one */ 719 if (OP(m->g->strip[esub]) == O_CH) 720 return (NULL); /* there is none */ 721 esub++; 722 assert(OP(m->g->strip[esub]) == OOR2); 723 ssub = esub + 1; 724 esub += OPND(m->g->strip[esub]); 725 if (OP(m->g->strip[esub]) == OOR2) 726 esub--; 727 else 728 assert(OP(m->g->strip[esub]) == O_CH); 729 } 730 break; 731 case OLPAREN: /* must undo assignment if rest fails */ 732 i = OPND(s); 733 assert(0 < i && i <= m->g->nsub); 734 offsave = m->pmatch[i].rm_so; 735 m->pmatch[i].rm_so = sp - m->offp; 736 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 737 if (dp != NULL) 738 return (dp); 739 m->pmatch[i].rm_so = offsave; 740 return (NULL); 741 break; 742 case ORPAREN: /* must undo assignment if rest fails */ 743 i = OPND(s); 744 assert(0 < i && i <= m->g->nsub); 745 offsave = m->pmatch[i].rm_eo; 746 m->pmatch[i].rm_eo = sp - m->offp; 747 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 748 if (dp != NULL) 749 return (dp); 750 m->pmatch[i].rm_eo = offsave; 751 return (NULL); 752 break; 753 default: /* uh oh */ 754 assert(0); 755 break; 756 } 757 758 /* "can't happen" */ 759 assert(0); 760 return (NULL); 761 } 762 763 /* 764 * fast - step through the string at top speed 765 */ 766 static const char * 767 fast(struct match *m, const char *start, const char *stop, sopno startst, 768 sopno stopst) 769 { 770 states st = m->st; 771 states fresh = m->fresh; 772 states tmp = m->tmp; 773 const char *p = start; 774 wint_t c; 775 wint_t lastc; /* previous c */ 776 wint_t flagch; 777 int i; 778 const char *coldp; /* last p after which no match was underway */ 779 size_t clen; 780 781 CLEAR(st); 782 SET1(st, startst); 783 SP("fast", st, *p); 784 st = step(m->g, startst, stopst, st, NOTHING, st); 785 ASSIGN(fresh, st); 786 SP("start", st, *p); 787 coldp = NULL; 788 if (start == m->beginp) 789 c = OUT; 790 else { 791 /* 792 * XXX Wrong if the previous character was multi-byte. 793 * Newline never is (in encodings supported by FreeBSD), 794 * so this only breaks the ISWORD tests below. 795 */ 796 c = (uch)*(start - 1); 797 } 798 for (;;) { 799 /* next character */ 800 lastc = c; 801 if (p == m->endp) { 802 clen = 0; 803 c = OUT; 804 } else 805 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 806 if (EQ(st, fresh)) 807 coldp = p; 808 809 /* is there an EOL and/or BOL between lastc and c? */ 810 flagch = '\0'; 811 i = 0; 812 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 813 (lastc == OUT && !(m->eflags®_NOTBOL))) { 814 flagch = BOL; 815 i = m->g->nbol; 816 } 817 if ((c == '\n' && m->g->cflags®_NEWLINE) || 818 (c == OUT && !(m->eflags®_NOTEOL))) { 819 flagch = (flagch == BOL) ? BOLEOL : EOL; 820 i += m->g->neol; 821 } 822 if (i != 0) { 823 for (; i > 0; i--) 824 st = step(m->g, startst, stopst, st, 825 flagch, st); 826 SP("boleol", st, c); 827 } 828 829 /* how about a word boundary? */ 830 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 831 (c != OUT && ISWORD(c))) { 832 flagch = BOW; 833 } 834 if ((lastc != OUT && ISWORD(lastc)) && 835 (flagch == EOL || (c != OUT && !ISWORD(c)))) { 836 flagch = EOW; 837 } 838 if (flagch == BOW || flagch == EOW) { 839 st = step(m->g, startst, stopst, st, flagch, st); 840 SP("boweow", st, c); 841 } 842 843 /* are we done? */ 844 if (ISSET(st, stopst) || p == stop || clen > stop - p) 845 break; /* NOTE BREAK OUT */ 846 847 /* no, we must deal with this character */ 848 ASSIGN(tmp, st); 849 ASSIGN(st, fresh); 850 assert(c != OUT); 851 st = step(m->g, startst, stopst, tmp, c, st); 852 SP("aft", st, c); 853 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 854 p += clen; 855 } 856 857 assert(coldp != NULL); 858 m->coldp = coldp; 859 if (ISSET(st, stopst)) 860 return (p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 861 else 862 return (NULL); 863 } 864 865 /* 866 * slow - step through the string more deliberately 867 */ 868 static const char * 869 slow(struct match *m, const char *start, const char *stop, sopno startst, 870 sopno stopst) 871 { 872 states st = m->st; 873 states empty = m->empty; 874 states tmp = m->tmp; 875 const char *p = start; 876 wint_t c; 877 wint_t lastc; /* previous c */ 878 wint_t flagch; 879 int i; 880 const char *matchp; /* last p at which a match ended */ 881 size_t clen; 882 883 AT("slow", start, stop, startst, stopst); 884 CLEAR(st); 885 SET1(st, startst); 886 SP("sstart", st, *p); 887 st = step(m->g, startst, stopst, st, NOTHING, st); 888 matchp = NULL; 889 if (start == m->beginp) 890 c = OUT; 891 else { 892 /* 893 * XXX Wrong if the previous character was multi-byte. 894 * Newline never is (in encodings supported by FreeBSD), 895 * so this only breaks the ISWORD tests below. 896 */ 897 c = (uch)*(start - 1); 898 } 899 for (;;) { 900 /* next character */ 901 lastc = c; 902 if (p == m->endp) { 903 c = OUT; 904 clen = 0; 905 } else 906 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 907 908 /* is there an EOL and/or BOL between lastc and c? */ 909 flagch = '\0'; 910 i = 0; 911 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 912 (lastc == OUT && !(m->eflags®_NOTBOL))) { 913 flagch = BOL; 914 i = m->g->nbol; 915 } 916 if ((c == '\n' && m->g->cflags®_NEWLINE) || 917 (c == OUT && !(m->eflags®_NOTEOL))) { 918 flagch = (flagch == BOL) ? BOLEOL : EOL; 919 i += m->g->neol; 920 } 921 if (i != 0) { 922 for (; i > 0; i--) 923 st = step(m->g, startst, stopst, st, 924 flagch, st); 925 SP("sboleol", st, c); 926 } 927 928 /* how about a word boundary? */ 929 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 930 (c != OUT && ISWORD(c))) { 931 flagch = BOW; 932 } 933 if ((lastc != OUT && ISWORD(lastc)) && 934 (flagch == EOL || (c != OUT && !ISWORD(c)))) { 935 flagch = EOW; 936 } 937 if (flagch == BOW || flagch == EOW) { 938 st = step(m->g, startst, stopst, st, flagch, st); 939 SP("sboweow", st, c); 940 } 941 942 /* are we done? */ 943 if (ISSET(st, stopst)) 944 matchp = p; 945 if (EQ(st, empty) || p == stop || clen > stop - p) 946 break; /* NOTE BREAK OUT */ 947 948 /* no, we must deal with this character */ 949 ASSIGN(tmp, st); 950 ASSIGN(st, empty); 951 assert(c != OUT); 952 st = step(m->g, startst, stopst, tmp, c, st); 953 SP("saft", st, c); 954 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 955 p += clen; 956 } 957 958 return (matchp); 959 } 960 961 962 /* 963 * step - map set of states reachable before char to set reachable after 964 */ 965 static states 966 step(struct re_guts *g, 967 sopno start, /* start state within strip */ 968 sopno stop, /* state after stop state within strip */ 969 states bef, /* states reachable before */ 970 wint_t ch, /* character or NONCHAR code */ 971 states aft) /* states already known reachable after */ 972 { 973 cset *cs; 974 sop s; 975 sopno pc; 976 onestate here; /* note, macros know this name */ 977 sopno look; 978 int i; 979 980 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 981 s = g->strip[pc]; 982 switch (OP(s)) { 983 case OEND: 984 assert(pc == stop-1); 985 break; 986 case OCHAR: 987 /* only characters can match */ 988 assert(!NONCHAR(ch) || ch != OPND(s)); 989 if (ch == OPND(s)) 990 FWD(aft, bef, 1); 991 break; 992 case OBOL: 993 if (ch == BOL || ch == BOLEOL) 994 FWD(aft, bef, 1); 995 break; 996 case OEOL: 997 if (ch == EOL || ch == BOLEOL) 998 FWD(aft, bef, 1); 999 break; 1000 case OBOW: 1001 if (ch == BOW) 1002 FWD(aft, bef, 1); 1003 break; 1004 case OEOW: 1005 if (ch == EOW) 1006 FWD(aft, bef, 1); 1007 break; 1008 case OANY: 1009 if (!NONCHAR(ch)) 1010 FWD(aft, bef, 1); 1011 break; 1012 case OANYOF: 1013 cs = &g->sets[OPND(s)]; 1014 if (!NONCHAR(ch) && CHIN(cs, ch)) 1015 FWD(aft, bef, 1); 1016 break; 1017 case OBACK_: /* ignored here */ 1018 case O_BACK: 1019 FWD(aft, aft, 1); 1020 break; 1021 case OPLUS_: /* forward, this is just an empty */ 1022 FWD(aft, aft, 1); 1023 break; 1024 case O_PLUS: /* both forward and back */ 1025 FWD(aft, aft, 1); 1026 i = ISSETBACK(aft, OPND(s)); 1027 BACK(aft, aft, OPND(s)); 1028 if (!i && ISSETBACK(aft, OPND(s))) { 1029 /* oho, must reconsider loop body */ 1030 pc -= OPND(s) + 1; 1031 INIT(here, pc); 1032 } 1033 break; 1034 case OQUEST_: /* two branches, both forward */ 1035 FWD(aft, aft, 1); 1036 FWD(aft, aft, OPND(s)); 1037 break; 1038 case O_QUEST: /* just an empty */ 1039 FWD(aft, aft, 1); 1040 break; 1041 case OLPAREN: /* not significant here */ 1042 case ORPAREN: 1043 FWD(aft, aft, 1); 1044 break; 1045 case OCH_: /* mark the first two branches */ 1046 FWD(aft, aft, 1); 1047 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1048 FWD(aft, aft, OPND(s)); 1049 break; 1050 case OOR1: /* done a branch, find the O_CH */ 1051 if (ISSTATEIN(aft, here)) { 1052 for (look = 1; 1053 OP(s = g->strip[pc+look]) != O_CH; 1054 look += OPND(s)) 1055 assert(OP(s) == OOR2); 1056 FWD(aft, aft, look + 1); 1057 } 1058 break; 1059 case OOR2: /* propagate OCH_'s marking */ 1060 FWD(aft, aft, 1); 1061 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1062 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1063 FWD(aft, aft, OPND(s)); 1064 } 1065 break; 1066 case O_CH: /* just empty */ 1067 FWD(aft, aft, 1); 1068 break; 1069 default: /* ooooops... */ 1070 assert(0); 1071 break; 1072 } 1073 } 1074 1075 return (aft); 1076 } 1077 1078 #ifdef REDEBUG 1079 /* 1080 * print - print a set of states 1081 */ 1082 static void 1083 print(struct match *m, const char *caption, states st, int ch, FILE *d) 1084 { 1085 struct re_guts *g = m->g; 1086 int i; 1087 int first = 1; 1088 1089 if (!(m->eflags®_TRACE)) 1090 return; 1091 1092 (void) fprintf(d, "%s", caption); 1093 if (ch != '\0') 1094 (void) fprintf(d, " %s", pchar(ch)); 1095 for (i = 0; i < g->nstates; i++) 1096 if (ISSET(st, i)) { 1097 (void) fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1098 first = 0; 1099 } 1100 (void) fprintf(d, "\n"); 1101 } 1102 1103 /* 1104 * at - print current situation 1105 */ 1106 static void 1107 at(struct match *m, const char *title, const char *start, const char *stop, 1108 sopno startst, sopno stopst) 1109 { 1110 if (!(m->eflags®_TRACE)) 1111 return; 1112 1113 (void) printf("%s %s-", title, pchar(*start)); 1114 (void) printf("%s ", pchar(*stop)); 1115 (void) printf("%ld-%ld\n", (long)startst, (long)stopst); 1116 } 1117 1118 #ifndef PCHARDONE 1119 #define PCHARDONE /* never again */ 1120 /* 1121 * pchar - make a character printable 1122 * 1123 * Is this identical to regchar() over in debug.c? Well, yes. But a 1124 * duplicate here avoids having a debugging-capable regexec.o tied to 1125 * a matching debug.o, and this is convenient. It all disappears in 1126 * the non-debug compilation anyway, so it doesn't matter much. 1127 */ 1128 static const char * 1129 pchar(int ch) 1130 { 1131 static char pbuf[10]; 1132 1133 if (isprint((uch)ch) || ch == ' ') 1134 (void) sprintf(pbuf, "%c", ch); 1135 else 1136 (void) sprintf(pbuf, "\\%o", ch); 1137 return (pbuf); 1138 } 1139 #endif 1140 #endif 1141 1142 #undef matcher 1143 #undef fast 1144 #undef slow 1145 #undef dissect 1146 #undef backref 1147 #undef step 1148 #undef print 1149 #undef at 1150 #undef match 1151