1 /* 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 36 * Use is subject to license terms. 37 */ 38 39 /* 40 * The matching engine and friends. This file is #included by regexec.c 41 * after suitable #defines of a variety of macros used herein, so that 42 * different state representations can be used without duplicating masses 43 * of code. 44 */ 45 46 #ifdef SNAMES 47 #define matcher smatcher 48 #define fast sfast 49 #define slow sslow 50 #define dissect sdissect 51 #define backref sbackref 52 #define step sstep 53 #define print sprint 54 #define at sat 55 #define match smat 56 #endif 57 #ifdef LNAMES 58 #define matcher lmatcher 59 #define fast lfast 60 #define slow lslow 61 #define dissect ldissect 62 #define backref lbackref 63 #define step lstep 64 #define print lprint 65 #define at lat 66 #define match lmat 67 #endif 68 #ifdef MNAMES 69 #define matcher mmatcher 70 #define fast mfast 71 #define slow mslow 72 #define dissect mdissect 73 #define backref mbackref 74 #define step mstep 75 #define print mprint 76 #define at mat 77 #define match mmat 78 #endif 79 80 /* another structure passed up and down to avoid zillions of parameters */ 81 struct match { 82 struct re_guts *g; 83 int eflags; 84 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 85 const char *offp; /* offsets work from here */ 86 const char *beginp; /* start of string -- virtual NUL precedes */ 87 const char *endp; /* end of string -- virtual NUL here */ 88 const char *coldp; /* can be no match starting before here */ 89 const char **lastpos; /* [nplus+1] */ 90 STATEVARS; 91 states st; /* current states */ 92 states fresh; /* states for a fresh start */ 93 states tmp; /* temporary */ 94 states empty; /* empty set of states */ 95 mbstate_t mbs; /* multibyte conversion state */ 96 }; 97 98 /* ========= begin header generated by ./mkh ========= */ 99 #ifdef __cplusplus 100 extern "C" { 101 #endif 102 103 /* === engine.c === */ 104 static int matcher(struct re_guts *, const char *, size_t, regmatch_t[], int); 105 static const char *dissect(struct match *, const char *, const char *, 106 sopno, sopno); 107 static const char *backref(struct match *, const char *, const char *, sopno, 108 sopno, sopno, int); 109 static const char *fast(struct match *, const char *, const char *, sopno, 110 sopno); 111 static const char *slow(struct match *, const char *, const char *, sopno, 112 sopno); 113 static states step(struct re_guts *, sopno, sopno, states, wint_t, states); 114 #define MAX_RECURSION 100 115 #define BOL (OUT-1) 116 #define EOL (BOL-1) 117 #define BOLEOL (BOL-2) 118 #define NOTHING (BOL-3) 119 #define BOW (BOL-4) 120 #define EOW (BOL-5) 121 #define BADCHAR (BOL-6) 122 #define NONCHAR(c) ((c) <= OUT) 123 #ifdef REDEBUG 124 static void print(struct match *, const char *, states, int, FILE *); 125 #endif 126 #ifdef REDEBUG 127 static void at(struct match *, const char *, const char *, const char *, 128 sopno, sopno); 129 #endif 130 #ifdef REDEBUG 131 static const char *pchar(int ch); 132 #endif 133 134 #ifdef __cplusplus 135 } 136 #endif 137 /* ========= end header generated by ./mkh ========= */ 138 139 #ifdef REDEBUG 140 #define SP(t, s, c) print(m, t, s, c, stdout) 141 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 142 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 143 #else 144 #define SP(t, s, c) /* nothing */ 145 #define AT(t, p1, p2, s1, s2) /* nothing */ 146 #define NOTE(s) /* nothing */ 147 #endif 148 149 /* 150 * matcher - the actual matching engine 151 */ 152 static int /* 0 success, REG_NOMATCH failure */ 153 matcher(struct re_guts *g, 154 const char *string, 155 size_t nmatch, 156 regmatch_t pmatch[], 157 int eflags) 158 { 159 const char *endp; 160 int i; 161 struct match mv; 162 struct match *m = &mv; 163 const char *dp; 164 const sopno gf = g->firststate+1; /* +1 for OEND */ 165 const sopno gl = g->laststate; 166 const char *start; 167 const char *stop; 168 /* Boyer-Moore algorithms variables */ 169 const char *pp; 170 int cj, mj; 171 const char *mustfirst; 172 const char *mustlast; 173 int *matchjump; 174 int *charjump; 175 176 /* simplify the situation where possible */ 177 if (g->cflags®_NOSUB) 178 nmatch = 0; 179 #ifdef REG_STARTEND 180 if (eflags®_STARTEND) { 181 start = string + pmatch[0].rm_so; 182 stop = string + pmatch[0].rm_eo; 183 } else { 184 start = string; 185 stop = start + strlen(start); 186 } 187 #else 188 start = string; 189 stop = start + strlen(start); 190 #endif 191 if (stop < start) 192 return (REG_EFATAL); 193 194 /* prescreening; this does wonders for this rather slow code */ 195 if (g->must != NULL) { 196 if (g->charjump != NULL && g->matchjump != NULL) { 197 mustfirst = g->must; 198 mustlast = g->must + g->mlen - 1; 199 charjump = g->charjump; 200 matchjump = g->matchjump; 201 pp = mustlast; 202 for (dp = start+g->mlen-1; dp < stop; ) { 203 /* Fast skip non-matches */ 204 while (dp < stop && charjump[(int)*dp]) 205 dp += charjump[(int)*dp]; 206 207 if (dp >= stop) 208 break; 209 210 /* Greedy matcher */ 211 /* 212 * We depend on not being used for 213 * for strings of length 1 214 */ 215 while (*--dp == *--pp && pp != mustfirst) 216 ; 217 218 if (*dp == *pp) 219 break; 220 221 /* Jump to next possible match */ 222 mj = matchjump[pp - mustfirst]; 223 cj = charjump[(int)*dp]; 224 dp += (cj < mj ? mj : cj); 225 pp = mustlast; 226 } 227 if (pp != mustfirst) 228 return (REG_NOMATCH); 229 } else { 230 for (dp = start; dp < stop; dp++) 231 if (*dp == g->must[0] && 232 stop - dp >= g->mlen && 233 memcmp(dp, g->must, (size_t)g->mlen) == 0) 234 break; 235 if (dp == stop) /* we didn't find g->must */ 236 return (REG_NOMATCH); 237 } 238 } 239 240 /* match struct setup */ 241 m->g = g; 242 m->eflags = eflags; 243 m->pmatch = NULL; 244 m->lastpos = NULL; 245 m->offp = string; 246 m->beginp = start; 247 m->endp = stop; 248 STATESETUP(m, 4); 249 SETUP(m->st); 250 SETUP(m->fresh); 251 SETUP(m->tmp); 252 SETUP(m->empty); 253 CLEAR(m->empty); 254 ZAPSTATE(&m->mbs); 255 256 /* Adjust start according to moffset, to speed things up */ 257 if (g->moffset > -1) 258 start = ((dp - g->moffset) < start) ? start : dp - g->moffset; 259 260 SP("mloop", m->st, *start); 261 262 /* this loop does only one repetition except for backrefs */ 263 for (;;) { 264 endp = fast(m, start, stop, gf, gl); 265 if (endp == NULL) { /* a miss */ 266 if (m->pmatch != NULL) 267 free((char *)m->pmatch); 268 if (m->lastpos != NULL) 269 free((char *)m->lastpos); 270 STATETEARDOWN(m); 271 return (REG_NOMATCH); 272 } 273 if (nmatch == 0 && !g->backrefs) 274 break; /* no further info needed */ 275 276 /* where? */ 277 assert(m->coldp != NULL); 278 for (;;) { 279 NOTE("finding start"); 280 endp = slow(m, m->coldp, stop, gf, gl); 281 if (endp != NULL) 282 break; 283 assert(m->coldp < m->endp); 284 m->coldp += XMBRTOWC(NULL, m->coldp, 285 m->endp - m->coldp, &m->mbs, 0); 286 } 287 if (nmatch == 1 && !g->backrefs) 288 break; /* no further info needed */ 289 290 /* oh my, he wants the subexpressions... */ 291 if (m->pmatch == NULL) 292 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 293 sizeof (regmatch_t)); 294 if (m->pmatch == NULL) { 295 STATETEARDOWN(m); 296 return (REG_ESPACE); 297 } 298 for (i = 1; i <= m->g->nsub; i++) 299 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 300 /* NB: FreeBSD has REG_BACKR, we do not */ 301 if (!g->backrefs /* && !(m->eflags®_BACKR) */) { 302 NOTE("dissecting"); 303 dp = dissect(m, m->coldp, endp, gf, gl); 304 } else { 305 if (g->nplus > 0 && m->lastpos == NULL) 306 m->lastpos = malloc((g->nplus+1) * 307 sizeof (const char *)); 308 if (g->nplus > 0 && m->lastpos == NULL) { 309 free(m->pmatch); 310 STATETEARDOWN(m); 311 return (REG_ESPACE); 312 } 313 NOTE("backref dissect"); 314 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 315 } 316 if (dp != NULL) 317 break; 318 319 /* uh-oh... we couldn't find a subexpression-level match */ 320 assert(g->backrefs); /* must be back references doing it */ 321 assert(g->nplus == 0 || m->lastpos != NULL); 322 for (;;) { 323 if (dp != NULL || endp <= m->coldp) 324 break; /* defeat */ 325 NOTE("backoff"); 326 endp = slow(m, m->coldp, endp-1, gf, gl); 327 if (endp == NULL) 328 break; /* defeat */ 329 /* try it on a shorter possibility */ 330 #ifndef NDEBUG 331 for (i = 1; i <= m->g->nsub; i++) { 332 assert(m->pmatch[i].rm_so == -1); 333 assert(m->pmatch[i].rm_eo == -1); 334 } 335 #endif 336 NOTE("backoff dissect"); 337 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 338 } 339 assert(dp == NULL || dp == endp); 340 if (dp != NULL) /* found a shorter one */ 341 break; 342 343 /* despite initial appearances, there is no match here */ 344 NOTE("false alarm"); 345 /* recycle starting later */ 346 start = m->coldp + XMBRTOWC(NULL, m->coldp, 347 stop - m->coldp, &m->mbs, 0); 348 assert(start <= stop); 349 } 350 351 /* fill in the details if requested */ 352 if (nmatch > 0) { 353 pmatch[0].rm_so = m->coldp - m->offp; 354 pmatch[0].rm_eo = endp - m->offp; 355 } 356 if (nmatch > 1) { 357 assert(m->pmatch != NULL); 358 for (i = 1; i < nmatch; i++) 359 if (i <= m->g->nsub) 360 pmatch[i] = m->pmatch[i]; 361 else { 362 pmatch[i].rm_so = -1; 363 pmatch[i].rm_eo = -1; 364 } 365 } 366 367 if (m->pmatch != NULL) 368 free((char *)m->pmatch); 369 if (m->lastpos != NULL) 370 free((char *)m->lastpos); 371 STATETEARDOWN(m); 372 return (0); 373 } 374 375 /* 376 * dissect - figure out what matched what, no back references 377 */ 378 static const char * 379 dissect(struct match *m, const char *start, const char *stop, sopno startst, 380 sopno stopst) 381 { 382 int i; 383 sopno ss; /* start sop of current subRE */ 384 sopno es; /* end sop of current subRE */ 385 const char *sp; /* start of string matched by it */ 386 const char *stp; /* string matched by it cannot pass here */ 387 const char *rest; /* start of rest of string */ 388 const char *tail; /* string unmatched by rest of RE */ 389 sopno ssub; /* start sop of subsubRE */ 390 sopno esub; /* end sop of subsubRE */ 391 const char *ssp; /* start of string matched by subsubRE */ 392 const char *sep; /* end of string matched by subsubRE */ 393 const char *oldssp; /* previous ssp */ 394 const char *dp; 395 396 AT("diss", start, stop, startst, stopst); 397 sp = start; 398 for (ss = startst; ss < stopst; ss = es) { 399 /* identify end of subRE */ 400 es = ss; 401 switch (OP(m->g->strip[es])) { 402 case OPLUS_: 403 case OQUEST_: 404 es += OPND(m->g->strip[es]); 405 break; 406 case OCH_: 407 while (OP(m->g->strip[es]) != O_CH) 408 es += OPND(m->g->strip[es]); 409 break; 410 } 411 es++; 412 413 /* figure out what it matched */ 414 switch (OP(m->g->strip[ss])) { 415 case OEND: 416 assert(0); 417 break; 418 case OCHAR: 419 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 420 break; 421 case OBOL: 422 case OEOL: 423 case OBOW: 424 case OEOW: 425 break; 426 case OANY: 427 case OANYOF: 428 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 429 break; 430 case OBACK_: 431 case O_BACK: 432 assert(0); 433 break; 434 /* cases where length of match is hard to find */ 435 case OQUEST_: 436 stp = stop; 437 for (;;) { 438 /* how long could this one be? */ 439 rest = slow(m, sp, stp, ss, es); 440 assert(rest != NULL); /* it did match */ 441 /* could the rest match the rest? */ 442 tail = slow(m, rest, stop, es, stopst); 443 if (tail == stop) 444 break; /* yes! */ 445 /* no -- try a shorter match for this one */ 446 stp = rest - 1; 447 assert(stp >= sp); /* it did work */ 448 } 449 ssub = ss + 1; 450 esub = es - 1; 451 /* did innards match? */ 452 if (slow(m, sp, rest, ssub, esub) != NULL) { 453 dp = dissect(m, sp, rest, ssub, esub); 454 assert(dp == rest); 455 #if defined(__lint) 456 (void) dp; 457 #endif 458 } else /* no */ 459 assert(sp == rest); 460 sp = rest; 461 break; 462 case OPLUS_: 463 stp = stop; 464 for (;;) { 465 /* how long could this one be? */ 466 rest = slow(m, sp, stp, ss, es); 467 assert(rest != NULL); /* it did match */ 468 /* could the rest match the rest? */ 469 tail = slow(m, rest, stop, es, stopst); 470 if (tail == stop) 471 break; /* yes! */ 472 /* no -- try a shorter match for this one */ 473 stp = rest - 1; 474 assert(stp >= sp); /* it did work */ 475 } 476 ssub = ss + 1; 477 esub = es - 1; 478 ssp = sp; 479 oldssp = ssp; 480 for (;;) { /* find last match of innards */ 481 sep = slow(m, ssp, rest, ssub, esub); 482 if (sep == NULL || sep == ssp) 483 break; /* failed or matched null */ 484 oldssp = ssp; /* on to next try */ 485 ssp = sep; 486 } 487 if (sep == NULL) { 488 /* last successful match */ 489 sep = ssp; 490 ssp = oldssp; 491 } 492 assert(sep == rest); /* must exhaust substring */ 493 assert(slow(m, ssp, sep, ssub, esub) == rest); 494 dp = dissect(m, ssp, sep, ssub, esub); 495 assert(dp == sep); 496 sp = rest; 497 break; 498 case OCH_: 499 stp = stop; 500 for (;;) { 501 /* how long could this one be? */ 502 rest = slow(m, sp, stp, ss, es); 503 assert(rest != NULL); /* it did match */ 504 /* could the rest match the rest? */ 505 tail = slow(m, rest, stop, es, stopst); 506 if (tail == stop) 507 break; /* yes! */ 508 /* no -- try a shorter match for this one */ 509 stp = rest - 1; 510 assert(stp >= sp); /* it did work */ 511 } 512 ssub = ss + 1; 513 esub = ss + OPND(m->g->strip[ss]) - 1; 514 assert(OP(m->g->strip[esub]) == OOR1); 515 for (;;) { /* find first matching branch */ 516 if (slow(m, sp, rest, ssub, esub) == rest) 517 break; /* it matched all of it */ 518 /* that one missed, try next one */ 519 assert(OP(m->g->strip[esub]) == OOR1); 520 esub++; 521 assert(OP(m->g->strip[esub]) == OOR2); 522 ssub = esub + 1; 523 esub += OPND(m->g->strip[esub]); 524 if (OP(m->g->strip[esub]) == OOR2) 525 esub--; 526 else 527 assert(OP(m->g->strip[esub]) == O_CH); 528 } 529 dp = dissect(m, sp, rest, ssub, esub); 530 assert(dp == rest); 531 sp = rest; 532 break; 533 case O_PLUS: 534 case O_QUEST: 535 case OOR1: 536 case OOR2: 537 case O_CH: 538 assert(0); 539 break; 540 case OLPAREN: 541 i = OPND(m->g->strip[ss]); 542 assert(0 < i && i <= m->g->nsub); 543 m->pmatch[i].rm_so = sp - m->offp; 544 break; 545 case ORPAREN: 546 i = OPND(m->g->strip[ss]); 547 assert(0 < i && i <= m->g->nsub); 548 m->pmatch[i].rm_eo = sp - m->offp; 549 break; 550 default: /* uh oh */ 551 assert(0); 552 break; 553 } 554 } 555 556 assert(sp == stop); 557 return (sp); 558 } 559 560 /* 561 * backref - figure out what matched what, figuring in back references 562 */ 563 static const char * 564 backref(struct match *m, const char *start, const char *stop, sopno startst, 565 sopno stopst, sopno lev, /* PLUS nesting level */ 566 int rec) 567 { 568 int i; 569 sopno ss; /* start sop of current subRE */ 570 const char *sp; /* start of string matched by it */ 571 sopno ssub; /* start sop of subsubRE */ 572 sopno esub; /* end sop of subsubRE */ 573 const char *ssp; /* start of string matched by subsubRE */ 574 const char *dp; 575 size_t len; 576 int hard; 577 sop s; 578 regoff_t offsave; 579 cset *cs; 580 wint_t wc; 581 582 AT("back", start, stop, startst, stopst); 583 sp = start; 584 585 /* get as far as we can with easy stuff */ 586 hard = 0; 587 for (ss = startst; !hard && ss < stopst; ss++) 588 switch (OP(s = m->g->strip[ss])) { 589 case OCHAR: 590 if (sp == stop) 591 return (NULL); 592 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 593 if (wc != OPND(s)) 594 return (NULL); 595 break; 596 case OANY: 597 if (sp == stop) 598 return (NULL); 599 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 600 if (wc == BADCHAR) 601 return (NULL); 602 break; 603 case OANYOF: 604 if (sp == stop) 605 return (NULL); 606 cs = &m->g->sets[OPND(s)]; 607 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 608 if (wc == BADCHAR || !CHIN(cs, wc)) 609 return (NULL); 610 break; 611 case OBOL: 612 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 613 (sp < m->endp && *(sp-1) == '\n' && 614 (m->g->cflags®_NEWLINE))) { 615 break; 616 } 617 else 618 return (NULL); 619 break; 620 case OEOL: 621 if ((sp == m->endp && !(m->eflags®_NOTEOL)) || 622 (sp < m->endp && *sp == '\n' && 623 (m->g->cflags®_NEWLINE))) { 624 break; 625 } 626 else 627 return (NULL); 628 break; 629 case OBOW: 630 if (((sp == m->beginp && !(m->eflags®_NOTBOL)) || 631 (sp < m->endp && *(sp-1) == '\n' && 632 (m->g->cflags®_NEWLINE)) || 633 (sp > m->beginp && !ISWORD(*(sp-1)))) && 634 (sp < m->endp && ISWORD(*sp))) { 635 break; 636 } else 637 return (NULL); 638 break; 639 case OEOW: 640 if (((sp == m->endp && !(m->eflags®_NOTEOL)) || 641 (sp < m->endp && *sp == '\n' && 642 (m->g->cflags®_NEWLINE)) || 643 (sp < m->endp && !ISWORD(*sp))) && 644 (sp > m->beginp && ISWORD(*(sp-1)))) { 645 break; 646 } else 647 return (NULL); 648 break; 649 case O_QUEST: 650 break; 651 case OOR1: /* matches null but needs to skip */ 652 ss++; 653 s = m->g->strip[ss]; 654 do { 655 assert(OP(s) == OOR2); 656 ss += OPND(s); 657 } while (OP(s = m->g->strip[ss]) != O_CH); 658 /* note that the ss++ gets us past the O_CH */ 659 break; 660 default: /* have to make a choice */ 661 hard = 1; 662 break; 663 } 664 if (!hard) { /* that was it! */ 665 if (sp != stop) 666 return (NULL); 667 return (sp); 668 } 669 ss--; /* adjust for the for's final increment */ 670 671 /* the hard stuff */ 672 AT("hard", sp, stop, ss, stopst); 673 s = m->g->strip[ss]; 674 switch (OP(s)) { 675 case OBACK_: /* the vilest depths */ 676 i = OPND(s); 677 assert(0 < i && i <= m->g->nsub); 678 if (m->pmatch[i].rm_eo == -1) 679 return (NULL); 680 assert(m->pmatch[i].rm_so != -1); 681 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 682 if (len == 0 && rec++ > MAX_RECURSION) 683 return (NULL); 684 assert(stop - m->beginp >= len); 685 if (sp > stop - len) 686 return (NULL); /* not enough left to match */ 687 ssp = m->offp + m->pmatch[i].rm_so; 688 if (memcmp(sp, ssp, len) != 0) 689 return (NULL); 690 while (m->g->strip[ss] != SOP(O_BACK, i)) 691 ss++; 692 return (backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 693 break; 694 case OQUEST_: /* to null or not */ 695 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 696 if (dp != NULL) 697 return (dp); /* not */ 698 return (backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 699 break; 700 case OPLUS_: 701 assert(m->lastpos != NULL); 702 assert(lev+1 <= m->g->nplus); 703 m->lastpos[lev+1] = sp; 704 return (backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 705 break; 706 case O_PLUS: 707 if (sp == m->lastpos[lev]) /* last pass matched null */ 708 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 709 /* try another pass */ 710 m->lastpos[lev] = sp; 711 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 712 if (dp == NULL) 713 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 714 else 715 return (dp); 716 break; 717 case OCH_: /* find the right one, if any */ 718 ssub = ss + 1; 719 esub = ss + OPND(s) - 1; 720 assert(OP(m->g->strip[esub]) == OOR1); 721 for (;;) { /* find first matching branch */ 722 dp = backref(m, sp, stop, ssub, esub, lev, rec); 723 if (dp != NULL) 724 return (dp); 725 /* that one missed, try next one */ 726 if (OP(m->g->strip[esub]) == O_CH) 727 return (NULL); /* there is none */ 728 esub++; 729 assert(OP(m->g->strip[esub]) == OOR2); 730 ssub = esub + 1; 731 esub += OPND(m->g->strip[esub]); 732 if (OP(m->g->strip[esub]) == OOR2) 733 esub--; 734 else 735 assert(OP(m->g->strip[esub]) == O_CH); 736 } 737 break; 738 case OLPAREN: /* must undo assignment if rest fails */ 739 i = OPND(s); 740 assert(0 < i && i <= m->g->nsub); 741 offsave = m->pmatch[i].rm_so; 742 m->pmatch[i].rm_so = sp - m->offp; 743 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 744 if (dp != NULL) 745 return (dp); 746 m->pmatch[i].rm_so = offsave; 747 return (NULL); 748 break; 749 case ORPAREN: /* must undo assignment if rest fails */ 750 i = OPND(s); 751 assert(0 < i && i <= m->g->nsub); 752 offsave = m->pmatch[i].rm_eo; 753 m->pmatch[i].rm_eo = sp - m->offp; 754 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 755 if (dp != NULL) 756 return (dp); 757 m->pmatch[i].rm_eo = offsave; 758 return (NULL); 759 break; 760 default: /* uh oh */ 761 assert(0); 762 break; 763 } 764 765 /* "can't happen" */ 766 assert(0); 767 return (NULL); 768 } 769 770 /* 771 * fast - step through the string at top speed 772 */ 773 static const char * 774 fast(struct match *m, const char *start, const char *stop, sopno startst, 775 sopno stopst) 776 { 777 states st = m->st; 778 states fresh = m->fresh; 779 states tmp = m->tmp; 780 const char *p = start; 781 wint_t c; 782 wint_t lastc; /* previous c */ 783 wint_t flagch; 784 int i; 785 const char *coldp; /* last p after which no match was underway */ 786 size_t clen; 787 788 CLEAR(st); 789 SET1(st, startst); 790 SP("fast", st, *p); 791 st = step(m->g, startst, stopst, st, NOTHING, st); 792 ASSIGN(fresh, st); 793 SP("start", st, *p); 794 coldp = NULL; 795 if (start == m->beginp) 796 c = OUT; 797 else { 798 /* 799 * XXX Wrong if the previous character was multi-byte. 800 * Newline never is (in encodings supported by FreeBSD), 801 * so this only breaks the ISWORD tests below. 802 */ 803 c = (uch)*(start - 1); 804 } 805 for (;;) { 806 /* next character */ 807 lastc = c; 808 if (p == m->endp) { 809 clen = 0; 810 c = OUT; 811 } else 812 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 813 if (EQ(st, fresh)) 814 coldp = p; 815 816 /* is there an EOL and/or BOL between lastc and c? */ 817 flagch = '\0'; 818 i = 0; 819 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 820 (lastc == OUT && !(m->eflags®_NOTBOL))) { 821 flagch = BOL; 822 i = m->g->nbol; 823 } 824 if ((c == '\n' && m->g->cflags®_NEWLINE) || 825 (c == OUT && !(m->eflags®_NOTEOL))) { 826 flagch = (flagch == BOL) ? BOLEOL : EOL; 827 i += m->g->neol; 828 } 829 if (i != 0) { 830 for (; i > 0; i--) 831 st = step(m->g, startst, stopst, st, 832 flagch, st); 833 SP("boleol", st, c); 834 } 835 836 /* how about a word boundary? */ 837 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 838 (c != OUT && ISWORD(c))) { 839 flagch = BOW; 840 } 841 if ((lastc != OUT && ISWORD(lastc)) && 842 (flagch == EOL || (c != OUT && !ISWORD(c)))) { 843 flagch = EOW; 844 } 845 if (flagch == BOW || flagch == EOW) { 846 st = step(m->g, startst, stopst, st, flagch, st); 847 SP("boweow", st, c); 848 } 849 850 /* are we done? */ 851 if (ISSET(st, stopst) || p == stop || clen > stop - p) 852 break; /* NOTE BREAK OUT */ 853 854 /* no, we must deal with this character */ 855 ASSIGN(tmp, st); 856 ASSIGN(st, fresh); 857 assert(c != OUT); 858 st = step(m->g, startst, stopst, tmp, c, st); 859 SP("aft", st, c); 860 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 861 p += clen; 862 } 863 864 assert(coldp != NULL); 865 m->coldp = coldp; 866 if (ISSET(st, stopst)) 867 return (p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 868 else 869 return (NULL); 870 } 871 872 /* 873 * slow - step through the string more deliberately 874 */ 875 static const char * 876 slow(struct match *m, const char *start, const char *stop, sopno startst, 877 sopno stopst) 878 { 879 states st = m->st; 880 states empty = m->empty; 881 states tmp = m->tmp; 882 const char *p = start; 883 wint_t c; 884 wint_t lastc; /* previous c */ 885 wint_t flagch; 886 int i; 887 const char *matchp; /* last p at which a match ended */ 888 size_t clen; 889 890 AT("slow", start, stop, startst, stopst); 891 CLEAR(st); 892 SET1(st, startst); 893 SP("sstart", st, *p); 894 st = step(m->g, startst, stopst, st, NOTHING, st); 895 matchp = NULL; 896 if (start == m->beginp) 897 c = OUT; 898 else { 899 /* 900 * XXX Wrong if the previous character was multi-byte. 901 * Newline never is (in encodings supported by FreeBSD), 902 * so this only breaks the ISWORD tests below. 903 */ 904 c = (uch)*(start - 1); 905 } 906 for (;;) { 907 /* next character */ 908 lastc = c; 909 if (p == m->endp) { 910 c = OUT; 911 clen = 0; 912 } else 913 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 914 915 /* is there an EOL and/or BOL between lastc and c? */ 916 flagch = '\0'; 917 i = 0; 918 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 919 (lastc == OUT && !(m->eflags®_NOTBOL))) { 920 flagch = BOL; 921 i = m->g->nbol; 922 } 923 if ((c == '\n' && m->g->cflags®_NEWLINE) || 924 (c == OUT && !(m->eflags®_NOTEOL))) { 925 flagch = (flagch == BOL) ? BOLEOL : EOL; 926 i += m->g->neol; 927 } 928 if (i != 0) { 929 for (; i > 0; i--) 930 st = step(m->g, startst, stopst, st, 931 flagch, st); 932 SP("sboleol", st, c); 933 } 934 935 /* how about a word boundary? */ 936 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 937 (c != OUT && ISWORD(c))) { 938 flagch = BOW; 939 } 940 if ((lastc != OUT && ISWORD(lastc)) && 941 (flagch == EOL || (c != OUT && !ISWORD(c)))) { 942 flagch = EOW; 943 } 944 if (flagch == BOW || flagch == EOW) { 945 st = step(m->g, startst, stopst, st, flagch, st); 946 SP("sboweow", st, c); 947 } 948 949 /* are we done? */ 950 if (ISSET(st, stopst)) 951 matchp = p; 952 if (EQ(st, empty) || p == stop || clen > stop - p) 953 break; /* NOTE BREAK OUT */ 954 955 /* no, we must deal with this character */ 956 ASSIGN(tmp, st); 957 ASSIGN(st, empty); 958 assert(c != OUT); 959 st = step(m->g, startst, stopst, tmp, c, st); 960 SP("saft", st, c); 961 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 962 p += clen; 963 } 964 965 return (matchp); 966 } 967 968 969 /* 970 * step - map set of states reachable before char to set reachable after 971 */ 972 static states 973 step(struct re_guts *g, 974 sopno start, /* start state within strip */ 975 sopno stop, /* state after stop state within strip */ 976 states bef, /* states reachable before */ 977 wint_t ch, /* character or NONCHAR code */ 978 states aft) /* states already known reachable after */ 979 { 980 cset *cs; 981 sop s; 982 sopno pc; 983 onestate here; /* note, macros know this name */ 984 sopno look; 985 int i; 986 987 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 988 s = g->strip[pc]; 989 switch (OP(s)) { 990 case OEND: 991 assert(pc == stop-1); 992 break; 993 case OCHAR: 994 /* only characters can match */ 995 assert(!NONCHAR(ch) || ch != OPND(s)); 996 if (ch == OPND(s)) 997 FWD(aft, bef, 1); 998 break; 999 case OBOL: 1000 if (ch == BOL || ch == BOLEOL) 1001 FWD(aft, bef, 1); 1002 break; 1003 case OEOL: 1004 if (ch == EOL || ch == BOLEOL) 1005 FWD(aft, bef, 1); 1006 break; 1007 case OBOW: 1008 if (ch == BOW) 1009 FWD(aft, bef, 1); 1010 break; 1011 case OEOW: 1012 if (ch == EOW) 1013 FWD(aft, bef, 1); 1014 break; 1015 case OANY: 1016 if (!NONCHAR(ch)) 1017 FWD(aft, bef, 1); 1018 break; 1019 case OANYOF: 1020 cs = &g->sets[OPND(s)]; 1021 if (!NONCHAR(ch) && CHIN(cs, ch)) 1022 FWD(aft, bef, 1); 1023 break; 1024 case OBACK_: /* ignored here */ 1025 case O_BACK: 1026 FWD(aft, aft, 1); 1027 break; 1028 case OPLUS_: /* forward, this is just an empty */ 1029 FWD(aft, aft, 1); 1030 break; 1031 case O_PLUS: /* both forward and back */ 1032 FWD(aft, aft, 1); 1033 i = ISSETBACK(aft, OPND(s)); 1034 BACK(aft, aft, OPND(s)); 1035 if (!i && ISSETBACK(aft, OPND(s))) { 1036 /* oho, must reconsider loop body */ 1037 pc -= OPND(s) + 1; 1038 INIT(here, pc); 1039 } 1040 break; 1041 case OQUEST_: /* two branches, both forward */ 1042 FWD(aft, aft, 1); 1043 FWD(aft, aft, OPND(s)); 1044 break; 1045 case O_QUEST: /* just an empty */ 1046 FWD(aft, aft, 1); 1047 break; 1048 case OLPAREN: /* not significant here */ 1049 case ORPAREN: 1050 FWD(aft, aft, 1); 1051 break; 1052 case OCH_: /* mark the first two branches */ 1053 FWD(aft, aft, 1); 1054 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1055 FWD(aft, aft, OPND(s)); 1056 break; 1057 case OOR1: /* done a branch, find the O_CH */ 1058 if (ISSTATEIN(aft, here)) { 1059 for (look = 1; 1060 OP(s = g->strip[pc+look]) != O_CH; 1061 look += OPND(s)) 1062 assert(OP(s) == OOR2); 1063 FWD(aft, aft, look + 1); 1064 } 1065 break; 1066 case OOR2: /* propagate OCH_'s marking */ 1067 FWD(aft, aft, 1); 1068 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1069 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1070 FWD(aft, aft, OPND(s)); 1071 } 1072 break; 1073 case O_CH: /* just empty */ 1074 FWD(aft, aft, 1); 1075 break; 1076 default: /* ooooops... */ 1077 assert(0); 1078 break; 1079 } 1080 } 1081 1082 return (aft); 1083 } 1084 1085 #ifdef REDEBUG 1086 /* 1087 * print - print a set of states 1088 */ 1089 static void 1090 print(struct match *m, const char *caption, states st, int ch, FILE *d) 1091 { 1092 struct re_guts *g = m->g; 1093 int i; 1094 int first = 1; 1095 1096 if (!(m->eflags®_TRACE)) 1097 return; 1098 1099 (void) fprintf(d, "%s", caption); 1100 if (ch != '\0') 1101 (void) fprintf(d, " %s", pchar(ch)); 1102 for (i = 0; i < g->nstates; i++) 1103 if (ISSET(st, i)) { 1104 (void) fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1105 first = 0; 1106 } 1107 (void) fprintf(d, "\n"); 1108 } 1109 1110 /* 1111 * at - print current situation 1112 */ 1113 static void 1114 at(struct match *m, const char *title, const char *start, const char *stop, 1115 sopno startst, sopno stopst) 1116 { 1117 if (!(m->eflags®_TRACE)) 1118 return; 1119 1120 (void) printf("%s %s-", title, pchar(*start)); 1121 (void) printf("%s ", pchar(*stop)); 1122 (void) printf("%ld-%ld\n", (long)startst, (long)stopst); 1123 } 1124 1125 #ifndef PCHARDONE 1126 #define PCHARDONE /* never again */ 1127 /* 1128 * pchar - make a character printable 1129 * 1130 * Is this identical to regchar() over in debug.c? Well, yes. But a 1131 * duplicate here avoids having a debugging-capable regexec.o tied to 1132 * a matching debug.o, and this is convenient. It all disappears in 1133 * the non-debug compilation anyway, so it doesn't matter much. 1134 */ 1135 static const char * 1136 pchar(int ch) 1137 { 1138 static char pbuf[10]; 1139 1140 if (isprint((uch)ch) || ch == ' ') 1141 (void) sprintf(pbuf, "%c", ch); 1142 else 1143 (void) sprintf(pbuf, "\\%o", ch); 1144 return (pbuf); 1145 } 1146 #endif 1147 #endif 1148 1149 #undef matcher 1150 #undef fast 1151 #undef slow 1152 #undef dissect 1153 #undef backref 1154 #undef step 1155 #undef print 1156 #undef at 1157 #undef match 1158