1 /* 2 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 3 * Copyright 2012 Milan Jurik. All rights reserved. 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 4. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 /* 37 * The matching engine and friends. This file is #included by regexec.c 38 * after suitable #defines of a variety of macros used herein, so that 39 * different state representations can be used without duplicating masses 40 * of code. 41 */ 42 43 #ifdef SNAMES 44 #define matcher smatcher 45 #define fast sfast 46 #define slow sslow 47 #define dissect sdissect 48 #define backref sbackref 49 #define step sstep 50 #define print sprint 51 #define at sat 52 #define match smat 53 #endif 54 #ifdef LNAMES 55 #define matcher lmatcher 56 #define fast lfast 57 #define slow lslow 58 #define dissect ldissect 59 #define backref lbackref 60 #define step lstep 61 #define print lprint 62 #define at lat 63 #define match lmat 64 #endif 65 #ifdef MNAMES 66 #define matcher mmatcher 67 #define fast mfast 68 #define slow mslow 69 #define dissect mdissect 70 #define backref mbackref 71 #define step mstep 72 #define print mprint 73 #define at mat 74 #define match mmat 75 #endif 76 77 /* another structure passed up and down to avoid zillions of parameters */ 78 struct match { 79 struct re_guts *g; 80 int eflags; 81 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 82 const char *offp; /* offsets work from here */ 83 const char *beginp; /* start of string -- virtual NUL precedes */ 84 const char *endp; /* end of string -- virtual NUL here */ 85 const char *coldp; /* can be no match starting before here */ 86 const char **lastpos; /* [nplus+1] */ 87 STATEVARS; 88 states st; /* current states */ 89 states fresh; /* states for a fresh start */ 90 states tmp; /* temporary */ 91 states empty; /* empty set of states */ 92 mbstate_t mbs; /* multibyte conversion state */ 93 }; 94 95 /* ========= begin header generated by ./mkh ========= */ 96 #ifdef __cplusplus 97 extern "C" { 98 #endif 99 100 /* === engine.c === */ 101 static int matcher(struct re_guts *, const char *, size_t, regmatch_t[], int); 102 static const char *dissect(struct match *, const char *, const char *, 103 sopno, sopno); 104 static const char *backref(struct match *, const char *, const char *, sopno, 105 sopno, sopno, int); 106 static const char *fast(struct match *, const char *, const char *, sopno, 107 sopno); 108 static const char *slow(struct match *, const char *, const char *, sopno, 109 sopno); 110 static states step(struct re_guts *, sopno, sopno, states, wint_t, states); 111 #define MAX_RECURSION 100 112 #define BOL (OUT-1) 113 #define EOL (BOL-1) 114 #define BOLEOL (BOL-2) 115 #define NOTHING (BOL-3) 116 #define BOW (BOL-4) 117 #define EOW (BOL-5) 118 #define BADCHAR (BOL-6) 119 #define NONCHAR(c) ((c) <= OUT) 120 #ifdef REDEBUG 121 static void print(struct match *, const char *, states, int, FILE *); 122 #endif 123 #ifdef REDEBUG 124 static void at(struct match *, const char *, const char *, const char *, 125 sopno, sopno); 126 #endif 127 #ifdef REDEBUG 128 static const char *pchar(int ch); 129 #endif 130 131 #ifdef __cplusplus 132 } 133 #endif 134 /* ========= end header generated by ./mkh ========= */ 135 136 #ifdef REDEBUG 137 #define SP(t, s, c) print(m, t, s, c, stdout) 138 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 139 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 140 #else 141 #define SP(t, s, c) /* nothing */ 142 #define AT(t, p1, p2, s1, s2) /* nothing */ 143 #define NOTE(s) /* nothing */ 144 #endif 145 146 /* 147 * matcher - the actual matching engine 148 */ 149 static int /* 0 success, REG_NOMATCH failure */ 150 matcher(struct re_guts *g, 151 const char *string, 152 size_t nmatch, 153 regmatch_t pmatch[], 154 int eflags) 155 { 156 const char *endp; 157 int i; 158 struct match mv; 159 struct match *m = &mv; 160 const char *dp; 161 const sopno gf = g->firststate+1; /* +1 for OEND */ 162 const sopno gl = g->laststate; 163 const char *start; 164 const char *stop; 165 /* Boyer-Moore algorithms variables */ 166 const char *pp; 167 int cj, mj; 168 const char *mustfirst; 169 const char *mustlast; 170 int *matchjump; 171 int *charjump; 172 173 /* simplify the situation where possible */ 174 if (g->cflags®_NOSUB) 175 nmatch = 0; 176 177 if (eflags®_STARTEND) { 178 start = string + pmatch[0].rm_so; 179 stop = string + pmatch[0].rm_eo; 180 } else { 181 start = string; 182 stop = start + strlen(start); 183 } 184 185 if (stop < start) 186 return (REG_EFATAL); 187 188 /* prescreening; this does wonders for this rather slow code */ 189 if (g->must != NULL) { 190 if (g->charjump != NULL && g->matchjump != NULL) { 191 mustfirst = g->must; 192 mustlast = g->must + g->mlen - 1; 193 charjump = g->charjump; 194 matchjump = g->matchjump; 195 pp = mustlast; 196 for (dp = start+g->mlen-1; dp < stop; ) { 197 /* Fast skip non-matches */ 198 while (dp < stop && charjump[(int)*dp]) 199 dp += charjump[(int)*dp]; 200 201 if (dp >= stop) 202 break; 203 204 /* Greedy matcher */ 205 /* 206 * We depend on not being used for 207 * for strings of length 1 208 */ 209 while (*--dp == *--pp && pp != mustfirst) 210 ; 211 212 if (*dp == *pp) 213 break; 214 215 /* Jump to next possible match */ 216 mj = matchjump[pp - mustfirst]; 217 cj = charjump[(int)*dp]; 218 dp += (cj < mj ? mj : cj); 219 pp = mustlast; 220 } 221 if (pp != mustfirst) 222 return (REG_NOMATCH); 223 } else { 224 for (dp = start; dp < stop; dp++) 225 if (*dp == g->must[0] && 226 stop - dp >= g->mlen && 227 memcmp(dp, g->must, (size_t)g->mlen) == 0) 228 break; 229 if (dp == stop) /* we didn't find g->must */ 230 return (REG_NOMATCH); 231 } 232 } 233 234 /* match struct setup */ 235 m->g = g; 236 m->eflags = eflags; 237 m->pmatch = NULL; 238 m->lastpos = NULL; 239 m->offp = string; 240 m->beginp = start; 241 m->endp = stop; 242 STATESETUP(m, 4); 243 SETUP(m->st); 244 SETUP(m->fresh); 245 SETUP(m->tmp); 246 SETUP(m->empty); 247 CLEAR(m->empty); 248 ZAPSTATE(&m->mbs); 249 250 /* Adjust start according to moffset, to speed things up */ 251 if (g->moffset > -1) 252 start = ((dp - g->moffset) < start) ? start : dp - g->moffset; 253 254 SP("mloop", m->st, *start); 255 256 /* this loop does only one repetition except for backrefs */ 257 for (;;) { 258 endp = fast(m, start, stop, gf, gl); 259 if (endp == NULL) { /* a miss */ 260 if (m->pmatch != NULL) 261 free((char *)m->pmatch); 262 if (m->lastpos != NULL) 263 free((char *)m->lastpos); 264 STATETEARDOWN(m); 265 return (REG_NOMATCH); 266 } 267 if (nmatch == 0 && !g->backrefs) 268 break; /* no further info needed */ 269 270 /* where? */ 271 assert(m->coldp != NULL); 272 for (;;) { 273 NOTE("finding start"); 274 endp = slow(m, m->coldp, stop, gf, gl); 275 if (endp != NULL) 276 break; 277 assert(m->coldp < m->endp); 278 m->coldp += XMBRTOWC(NULL, m->coldp, 279 m->endp - m->coldp, &m->mbs, 0); 280 } 281 if (nmatch == 1 && !g->backrefs) 282 break; /* no further info needed */ 283 284 /* oh my, he wants the subexpressions... */ 285 if (m->pmatch == NULL) 286 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 287 sizeof (regmatch_t)); 288 if (m->pmatch == NULL) { 289 STATETEARDOWN(m); 290 return (REG_ESPACE); 291 } 292 for (i = 1; i <= m->g->nsub; i++) 293 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 294 /* NB: FreeBSD has REG_BACKR, we do not */ 295 if (!g->backrefs /* && !(m->eflags®_BACKR) */) { 296 NOTE("dissecting"); 297 dp = dissect(m, m->coldp, endp, gf, gl); 298 } else { 299 if (g->nplus > 0 && m->lastpos == NULL) 300 m->lastpos = malloc((g->nplus+1) * 301 sizeof (const char *)); 302 if (g->nplus > 0 && m->lastpos == NULL) { 303 free(m->pmatch); 304 STATETEARDOWN(m); 305 return (REG_ESPACE); 306 } 307 NOTE("backref dissect"); 308 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 309 } 310 if (dp != NULL) 311 break; 312 313 /* uh-oh... we couldn't find a subexpression-level match */ 314 assert(g->backrefs); /* must be back references doing it */ 315 assert(g->nplus == 0 || m->lastpos != NULL); 316 for (;;) { 317 if (dp != NULL || endp <= m->coldp) 318 break; /* defeat */ 319 NOTE("backoff"); 320 endp = slow(m, m->coldp, endp-1, gf, gl); 321 if (endp == NULL) 322 break; /* defeat */ 323 /* try it on a shorter possibility */ 324 #ifndef NDEBUG 325 for (i = 1; i <= m->g->nsub; i++) { 326 assert(m->pmatch[i].rm_so == -1); 327 assert(m->pmatch[i].rm_eo == -1); 328 } 329 #endif 330 NOTE("backoff dissect"); 331 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 332 } 333 assert(dp == NULL || dp == endp); 334 if (dp != NULL) /* found a shorter one */ 335 break; 336 337 /* despite initial appearances, there is no match here */ 338 NOTE("false alarm"); 339 /* recycle starting later */ 340 start = m->coldp + XMBRTOWC(NULL, m->coldp, 341 stop - m->coldp, &m->mbs, 0); 342 assert(start <= stop); 343 } 344 345 /* fill in the details if requested */ 346 if (nmatch > 0) { 347 pmatch[0].rm_so = m->coldp - m->offp; 348 pmatch[0].rm_eo = endp - m->offp; 349 } 350 if (nmatch > 1) { 351 assert(m->pmatch != NULL); 352 for (i = 1; i < nmatch; i++) 353 if (i <= m->g->nsub) 354 pmatch[i] = m->pmatch[i]; 355 else { 356 pmatch[i].rm_so = -1; 357 pmatch[i].rm_eo = -1; 358 } 359 } 360 361 if (m->pmatch != NULL) 362 free((char *)m->pmatch); 363 if (m->lastpos != NULL) 364 free((char *)m->lastpos); 365 STATETEARDOWN(m); 366 return (0); 367 } 368 369 /* 370 * dissect - figure out what matched what, no back references 371 */ 372 static const char * 373 dissect(struct match *m, const char *start, const char *stop, sopno startst, 374 sopno stopst) 375 { 376 int i; 377 sopno ss; /* start sop of current subRE */ 378 sopno es; /* end sop of current subRE */ 379 const char *sp; /* start of string matched by it */ 380 const char *stp; /* string matched by it cannot pass here */ 381 const char *rest; /* start of rest of string */ 382 const char *tail; /* string unmatched by rest of RE */ 383 sopno ssub; /* start sop of subsubRE */ 384 sopno esub; /* end sop of subsubRE */ 385 const char *ssp; /* start of string matched by subsubRE */ 386 const char *sep; /* end of string matched by subsubRE */ 387 const char *oldssp; /* previous ssp */ 388 const char *dp; 389 390 AT("diss", start, stop, startst, stopst); 391 sp = start; 392 for (ss = startst; ss < stopst; ss = es) { 393 /* identify end of subRE */ 394 es = ss; 395 switch (OP(m->g->strip[es])) { 396 case OPLUS_: 397 case OQUEST_: 398 es += OPND(m->g->strip[es]); 399 break; 400 case OCH_: 401 while (OP(m->g->strip[es]) != O_CH) 402 es += OPND(m->g->strip[es]); 403 break; 404 } 405 es++; 406 407 /* figure out what it matched */ 408 switch (OP(m->g->strip[ss])) { 409 case OEND: 410 assert(0); 411 break; 412 case OCHAR: 413 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 414 break; 415 case OBOL: 416 case OEOL: 417 case OBOW: 418 case OEOW: 419 break; 420 case OANY: 421 case OANYOF: 422 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 423 break; 424 case OBACK_: 425 case O_BACK: 426 assert(0); 427 break; 428 /* cases where length of match is hard to find */ 429 case OQUEST_: 430 stp = stop; 431 for (;;) { 432 /* how long could this one be? */ 433 rest = slow(m, sp, stp, ss, es); 434 assert(rest != NULL); /* it did match */ 435 /* could the rest match the rest? */ 436 tail = slow(m, rest, stop, es, stopst); 437 if (tail == stop) 438 break; /* yes! */ 439 /* no -- try a shorter match for this one */ 440 stp = rest - 1; 441 assert(stp >= sp); /* it did work */ 442 } 443 ssub = ss + 1; 444 esub = es - 1; 445 /* did innards match? */ 446 if (slow(m, sp, rest, ssub, esub) != NULL) { 447 dp = dissect(m, sp, rest, ssub, esub); 448 assert(dp == rest); 449 #if defined(__lint) 450 (void) dp; 451 #endif 452 } else /* no */ 453 assert(sp == rest); 454 sp = rest; 455 break; 456 case OPLUS_: 457 stp = stop; 458 for (;;) { 459 /* how long could this one be? */ 460 rest = slow(m, sp, stp, ss, es); 461 assert(rest != NULL); /* it did match */ 462 /* could the rest match the rest? */ 463 tail = slow(m, rest, stop, es, stopst); 464 if (tail == stop) 465 break; /* yes! */ 466 /* no -- try a shorter match for this one */ 467 stp = rest - 1; 468 assert(stp >= sp); /* it did work */ 469 } 470 ssub = ss + 1; 471 esub = es - 1; 472 ssp = sp; 473 oldssp = ssp; 474 for (;;) { /* find last match of innards */ 475 sep = slow(m, ssp, rest, ssub, esub); 476 if (sep == NULL || sep == ssp) 477 break; /* failed or matched null */ 478 oldssp = ssp; /* on to next try */ 479 ssp = sep; 480 } 481 if (sep == NULL) { 482 /* last successful match */ 483 sep = ssp; 484 ssp = oldssp; 485 } 486 assert(sep == rest); /* must exhaust substring */ 487 assert(slow(m, ssp, sep, ssub, esub) == rest); 488 dp = dissect(m, ssp, sep, ssub, esub); 489 assert(dp == sep); 490 sp = rest; 491 break; 492 case OCH_: 493 stp = stop; 494 for (;;) { 495 /* how long could this one be? */ 496 rest = slow(m, sp, stp, ss, es); 497 assert(rest != NULL); /* it did match */ 498 /* could the rest match the rest? */ 499 tail = slow(m, rest, stop, es, stopst); 500 if (tail == stop) 501 break; /* yes! */ 502 /* no -- try a shorter match for this one */ 503 stp = rest - 1; 504 assert(stp >= sp); /* it did work */ 505 } 506 ssub = ss + 1; 507 esub = ss + OPND(m->g->strip[ss]) - 1; 508 assert(OP(m->g->strip[esub]) == OOR1); 509 for (;;) { /* find first matching branch */ 510 if (slow(m, sp, rest, ssub, esub) == rest) 511 break; /* it matched all of it */ 512 /* that one missed, try next one */ 513 assert(OP(m->g->strip[esub]) == OOR1); 514 esub++; 515 assert(OP(m->g->strip[esub]) == OOR2); 516 ssub = esub + 1; 517 esub += OPND(m->g->strip[esub]); 518 if (OP(m->g->strip[esub]) == OOR2) 519 esub--; 520 else 521 assert(OP(m->g->strip[esub]) == O_CH); 522 } 523 dp = dissect(m, sp, rest, ssub, esub); 524 assert(dp == rest); 525 sp = rest; 526 break; 527 case O_PLUS: 528 case O_QUEST: 529 case OOR1: 530 case OOR2: 531 case O_CH: 532 assert(0); 533 break; 534 case OLPAREN: 535 i = OPND(m->g->strip[ss]); 536 assert(0 < i && i <= m->g->nsub); 537 m->pmatch[i].rm_so = sp - m->offp; 538 break; 539 case ORPAREN: 540 i = OPND(m->g->strip[ss]); 541 assert(0 < i && i <= m->g->nsub); 542 m->pmatch[i].rm_eo = sp - m->offp; 543 break; 544 default: /* uh oh */ 545 assert(0); 546 break; 547 } 548 } 549 550 assert(sp == stop); 551 return (sp); 552 } 553 554 /* 555 * backref - figure out what matched what, figuring in back references 556 */ 557 static const char * 558 backref(struct match *m, const char *start, const char *stop, sopno startst, 559 sopno stopst, sopno lev, /* PLUS nesting level */ 560 int rec) 561 { 562 int i; 563 sopno ss; /* start sop of current subRE */ 564 const char *sp; /* start of string matched by it */ 565 sopno ssub; /* start sop of subsubRE */ 566 sopno esub; /* end sop of subsubRE */ 567 const char *ssp; /* start of string matched by subsubRE */ 568 const char *dp; 569 size_t len; 570 int hard; 571 sop s; 572 regoff_t offsave; 573 cset *cs; 574 wint_t wc; 575 576 AT("back", start, stop, startst, stopst); 577 sp = start; 578 579 /* get as far as we can with easy stuff */ 580 hard = 0; 581 for (ss = startst; !hard && ss < stopst; ss++) 582 switch (OP(s = m->g->strip[ss])) { 583 case OCHAR: 584 if (sp == stop) 585 return (NULL); 586 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 587 if (wc != OPND(s)) 588 return (NULL); 589 break; 590 case OANY: 591 if (sp == stop) 592 return (NULL); 593 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 594 if (wc == BADCHAR) 595 return (NULL); 596 break; 597 case OANYOF: 598 if (sp == stop) 599 return (NULL); 600 cs = &m->g->sets[OPND(s)]; 601 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 602 if (wc == BADCHAR || !CHIN(cs, wc)) 603 return (NULL); 604 break; 605 case OBOL: 606 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 607 (sp < m->endp && *(sp-1) == '\n' && 608 (m->g->cflags®_NEWLINE))) { 609 break; 610 } 611 return (NULL); 612 case OEOL: 613 if ((sp == m->endp && !(m->eflags®_NOTEOL)) || 614 (sp < m->endp && *sp == '\n' && 615 (m->g->cflags®_NEWLINE))) { 616 break; 617 } 618 return (NULL); 619 case OBOW: 620 if (((sp == m->beginp && !(m->eflags®_NOTBOL)) || 621 (sp < m->endp && *(sp-1) == '\n' && 622 (m->g->cflags®_NEWLINE)) || 623 (sp > m->beginp && !ISWORD(*(sp-1)))) && 624 (sp < m->endp && ISWORD(*sp))) { 625 break; 626 } 627 return (NULL); 628 case OEOW: 629 if (((sp == m->endp && !(m->eflags®_NOTEOL)) || 630 (sp < m->endp && *sp == '\n' && 631 (m->g->cflags®_NEWLINE)) || 632 (sp < m->endp && !ISWORD(*sp))) && 633 (sp > m->beginp && ISWORD(*(sp-1)))) { 634 break; 635 } 636 return (NULL); 637 case O_QUEST: 638 break; 639 case OOR1: /* matches null but needs to skip */ 640 ss++; 641 s = m->g->strip[ss]; 642 do { 643 assert(OP(s) == OOR2); 644 ss += OPND(s); 645 } while (OP(s = m->g->strip[ss]) != O_CH); 646 /* note that the ss++ gets us past the O_CH */ 647 break; 648 default: /* have to make a choice */ 649 hard = 1; 650 break; 651 } 652 if (!hard) { /* that was it! */ 653 if (sp != stop) 654 return (NULL); 655 return (sp); 656 } 657 ss--; /* adjust for the for's final increment */ 658 659 /* the hard stuff */ 660 AT("hard", sp, stop, ss, stopst); 661 s = m->g->strip[ss]; 662 switch (OP(s)) { 663 case OBACK_: /* the vilest depths */ 664 i = OPND(s); 665 assert(0 < i && i <= m->g->nsub); 666 if (m->pmatch[i].rm_eo == -1) 667 return (NULL); 668 assert(m->pmatch[i].rm_so != -1); 669 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 670 if (len == 0 && rec++ > MAX_RECURSION) 671 return (NULL); 672 assert(stop - m->beginp >= len); 673 if (sp > stop - len) 674 return (NULL); /* not enough left to match */ 675 ssp = m->offp + m->pmatch[i].rm_so; 676 if (memcmp(sp, ssp, len) != 0) 677 return (NULL); 678 while (m->g->strip[ss] != SOP(O_BACK, i)) 679 ss++; 680 return (backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 681 case OQUEST_: /* to null or not */ 682 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 683 if (dp != NULL) 684 return (dp); /* not */ 685 return (backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 686 case OPLUS_: 687 assert(m->lastpos != NULL); 688 assert(lev+1 <= m->g->nplus); 689 m->lastpos[lev+1] = sp; 690 return (backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 691 case O_PLUS: 692 if (sp == m->lastpos[lev]) /* last pass matched null */ 693 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 694 /* try another pass */ 695 m->lastpos[lev] = sp; 696 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 697 if (dp == NULL) 698 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 699 return (dp); 700 case OCH_: /* find the right one, if any */ 701 ssub = ss + 1; 702 esub = ss + OPND(s) - 1; 703 assert(OP(m->g->strip[esub]) == OOR1); 704 for (;;) { /* find first matching branch */ 705 dp = backref(m, sp, stop, ssub, esub, lev, rec); 706 if (dp != NULL) 707 return (dp); 708 /* that one missed, try next one */ 709 if (OP(m->g->strip[esub]) == O_CH) 710 return (NULL); /* there is none */ 711 esub++; 712 assert(OP(m->g->strip[esub]) == OOR2); 713 ssub = esub + 1; 714 esub += OPND(m->g->strip[esub]); 715 if (OP(m->g->strip[esub]) == OOR2) 716 esub--; 717 else 718 assert(OP(m->g->strip[esub]) == O_CH); 719 } 720 /* NOTREACHED */ 721 break; 722 case OLPAREN: /* must undo assignment if rest fails */ 723 i = OPND(s); 724 assert(0 < i && i <= m->g->nsub); 725 offsave = m->pmatch[i].rm_so; 726 m->pmatch[i].rm_so = sp - m->offp; 727 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 728 if (dp != NULL) 729 return (dp); 730 m->pmatch[i].rm_so = offsave; 731 return (NULL); 732 case ORPAREN: /* must undo assignment if rest fails */ 733 i = OPND(s); 734 assert(0 < i && i <= m->g->nsub); 735 offsave = m->pmatch[i].rm_eo; 736 m->pmatch[i].rm_eo = sp - m->offp; 737 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 738 if (dp != NULL) 739 return (dp); 740 m->pmatch[i].rm_eo = offsave; 741 return (NULL); 742 default: /* uh oh */ 743 assert(0); 744 break; 745 } 746 747 /* "can't happen" */ 748 assert(0); 749 return (NULL); 750 } 751 752 /* 753 * fast - step through the string at top speed 754 */ 755 static const char * 756 fast(struct match *m, const char *start, const char *stop, sopno startst, 757 sopno stopst) 758 { 759 states st = m->st; 760 states fresh = m->fresh; 761 states tmp = m->tmp; 762 const char *p = start; 763 wint_t c; 764 wint_t lastc; /* previous c */ 765 wint_t flagch; 766 int i; 767 const char *coldp; /* last p after which no match was underway */ 768 size_t clen; 769 770 CLEAR(st); 771 SET1(st, startst); 772 SP("fast", st, *p); 773 st = step(m->g, startst, stopst, st, NOTHING, st); 774 ASSIGN(fresh, st); 775 SP("start", st, *p); 776 coldp = NULL; 777 if (start == m->beginp) 778 c = OUT; 779 else { 780 /* 781 * XXX Wrong if the previous character was multi-byte. 782 * Newline never is (in encodings supported by FreeBSD), 783 * so this only breaks the ISWORD tests below. 784 */ 785 c = (uch)*(start - 1); 786 } 787 for (;;) { 788 /* next character */ 789 lastc = c; 790 if (p == m->endp) { 791 clen = 0; 792 c = OUT; 793 } else 794 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 795 if (EQ(st, fresh)) 796 coldp = p; 797 798 /* is there an EOL and/or BOL between lastc and c? */ 799 flagch = '\0'; 800 i = 0; 801 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 802 (lastc == OUT && !(m->eflags®_NOTBOL))) { 803 flagch = BOL; 804 i = m->g->nbol; 805 } 806 if ((c == '\n' && m->g->cflags®_NEWLINE) || 807 (c == OUT && !(m->eflags®_NOTEOL))) { 808 flagch = (flagch == BOL) ? BOLEOL : EOL; 809 i += m->g->neol; 810 } 811 if (i != 0) { 812 for (; i > 0; i--) 813 st = step(m->g, startst, stopst, st, 814 flagch, st); 815 SP("boleol", st, c); 816 } 817 818 /* how about a word boundary? */ 819 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 820 (c != OUT && ISWORD(c))) { 821 flagch = BOW; 822 } 823 if ((lastc != OUT && ISWORD(lastc)) && 824 (flagch == EOL || (c != OUT && !ISWORD(c)))) { 825 flagch = EOW; 826 } 827 if (flagch == BOW || flagch == EOW) { 828 st = step(m->g, startst, stopst, st, flagch, st); 829 SP("boweow", st, c); 830 } 831 832 /* are we done? */ 833 if (ISSET(st, stopst) || p == stop || clen > stop - p) 834 break; /* NOTE BREAK OUT */ 835 836 /* no, we must deal with this character */ 837 ASSIGN(tmp, st); 838 ASSIGN(st, fresh); 839 assert(c != OUT); 840 st = step(m->g, startst, stopst, tmp, c, st); 841 SP("aft", st, c); 842 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 843 p += clen; 844 } 845 846 assert(coldp != NULL); 847 m->coldp = coldp; 848 if (ISSET(st, stopst)) 849 return (p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 850 else 851 return (NULL); 852 } 853 854 /* 855 * slow - step through the string more deliberately 856 */ 857 static const char * 858 slow(struct match *m, const char *start, const char *stop, sopno startst, 859 sopno stopst) 860 { 861 states st = m->st; 862 states empty = m->empty; 863 states tmp = m->tmp; 864 const char *p = start; 865 wint_t c; 866 wint_t lastc; /* previous c */ 867 wint_t flagch; 868 int i; 869 const char *matchp; /* last p at which a match ended */ 870 size_t clen; 871 872 AT("slow", start, stop, startst, stopst); 873 CLEAR(st); 874 SET1(st, startst); 875 SP("sstart", st, *p); 876 st = step(m->g, startst, stopst, st, NOTHING, st); 877 matchp = NULL; 878 if (start == m->beginp) 879 c = OUT; 880 else { 881 /* 882 * XXX Wrong if the previous character was multi-byte. 883 * Newline never is (in encodings supported by FreeBSD), 884 * so this only breaks the ISWORD tests below. 885 */ 886 c = (uch)*(start - 1); 887 } 888 for (;;) { 889 /* next character */ 890 lastc = c; 891 if (p == m->endp) { 892 c = OUT; 893 clen = 0; 894 } else 895 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 896 897 /* is there an EOL and/or BOL between lastc and c? */ 898 flagch = '\0'; 899 i = 0; 900 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 901 (lastc == OUT && !(m->eflags®_NOTBOL))) { 902 flagch = BOL; 903 i = m->g->nbol; 904 } 905 if ((c == '\n' && m->g->cflags®_NEWLINE) || 906 (c == OUT && !(m->eflags®_NOTEOL))) { 907 flagch = (flagch == BOL) ? BOLEOL : EOL; 908 i += m->g->neol; 909 } 910 if (i != 0) { 911 for (; i > 0; i--) 912 st = step(m->g, startst, stopst, st, 913 flagch, st); 914 SP("sboleol", st, c); 915 } 916 917 /* how about a word boundary? */ 918 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 919 (c != OUT && ISWORD(c))) { 920 flagch = BOW; 921 } 922 if ((lastc != OUT && ISWORD(lastc)) && 923 (flagch == EOL || (c != OUT && !ISWORD(c)))) { 924 flagch = EOW; 925 } 926 if (flagch == BOW || flagch == EOW) { 927 st = step(m->g, startst, stopst, st, flagch, st); 928 SP("sboweow", st, c); 929 } 930 931 /* are we done? */ 932 if (ISSET(st, stopst)) 933 matchp = p; 934 if (EQ(st, empty) || p == stop || clen > stop - p) 935 break; /* NOTE BREAK OUT */ 936 937 /* no, we must deal with this character */ 938 ASSIGN(tmp, st); 939 ASSIGN(st, empty); 940 assert(c != OUT); 941 st = step(m->g, startst, stopst, tmp, c, st); 942 SP("saft", st, c); 943 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 944 p += clen; 945 } 946 947 return (matchp); 948 } 949 950 951 /* 952 * step - map set of states reachable before char to set reachable after 953 */ 954 static states 955 step(struct re_guts *g, 956 sopno start, /* start state within strip */ 957 sopno stop, /* state after stop state within strip */ 958 states bef, /* states reachable before */ 959 wint_t ch, /* character or NONCHAR code */ 960 states aft) /* states already known reachable after */ 961 { 962 cset *cs; 963 sop s; 964 sopno pc; 965 onestate here; /* note, macros know this name */ 966 sopno look; 967 int i; 968 969 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 970 s = g->strip[pc]; 971 switch (OP(s)) { 972 case OEND: 973 assert(pc == stop-1); 974 break; 975 case OCHAR: 976 /* only characters can match */ 977 assert(!NONCHAR(ch) || ch != OPND(s)); 978 if (ch == OPND(s)) 979 FWD(aft, bef, 1); 980 break; 981 case OBOL: 982 if (ch == BOL || ch == BOLEOL) 983 FWD(aft, bef, 1); 984 break; 985 case OEOL: 986 if (ch == EOL || ch == BOLEOL) 987 FWD(aft, bef, 1); 988 break; 989 case OBOW: 990 if (ch == BOW) 991 FWD(aft, bef, 1); 992 break; 993 case OEOW: 994 if (ch == EOW) 995 FWD(aft, bef, 1); 996 break; 997 case OANY: 998 if (!NONCHAR(ch)) 999 FWD(aft, bef, 1); 1000 break; 1001 case OANYOF: 1002 cs = &g->sets[OPND(s)]; 1003 if (!NONCHAR(ch) && CHIN(cs, ch)) 1004 FWD(aft, bef, 1); 1005 break; 1006 case OBACK_: /* ignored here */ 1007 case O_BACK: 1008 FWD(aft, aft, 1); 1009 break; 1010 case OPLUS_: /* forward, this is just an empty */ 1011 FWD(aft, aft, 1); 1012 break; 1013 case O_PLUS: /* both forward and back */ 1014 FWD(aft, aft, 1); 1015 i = ISSETBACK(aft, OPND(s)); 1016 BACK(aft, aft, OPND(s)); 1017 if (!i && ISSETBACK(aft, OPND(s))) { 1018 /* oho, must reconsider loop body */ 1019 pc -= OPND(s) + 1; 1020 INIT(here, pc); 1021 } 1022 break; 1023 case OQUEST_: /* two branches, both forward */ 1024 FWD(aft, aft, 1); 1025 FWD(aft, aft, OPND(s)); 1026 break; 1027 case O_QUEST: /* just an empty */ 1028 FWD(aft, aft, 1); 1029 break; 1030 case OLPAREN: /* not significant here */ 1031 case ORPAREN: 1032 FWD(aft, aft, 1); 1033 break; 1034 case OCH_: /* mark the first two branches */ 1035 FWD(aft, aft, 1); 1036 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1037 FWD(aft, aft, OPND(s)); 1038 break; 1039 case OOR1: /* done a branch, find the O_CH */ 1040 if (ISSTATEIN(aft, here)) { 1041 for (look = 1; 1042 OP(s = g->strip[pc+look]) != O_CH; 1043 look += OPND(s)) 1044 assert(OP(s) == OOR2); 1045 FWD(aft, aft, look + 1); 1046 } 1047 break; 1048 case OOR2: /* propagate OCH_'s marking */ 1049 FWD(aft, aft, 1); 1050 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1051 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1052 FWD(aft, aft, OPND(s)); 1053 } 1054 break; 1055 case O_CH: /* just empty */ 1056 FWD(aft, aft, 1); 1057 break; 1058 default: /* ooooops... */ 1059 assert(0); 1060 break; 1061 } 1062 } 1063 1064 return (aft); 1065 } 1066 1067 #ifdef REDEBUG 1068 /* 1069 * print - print a set of states 1070 */ 1071 static void 1072 print(struct match *m, const char *caption, states st, int ch, FILE *d) 1073 { 1074 struct re_guts *g = m->g; 1075 int i; 1076 int first = 1; 1077 1078 if (!(m->eflags®_TRACE)) 1079 return; 1080 1081 (void) fprintf(d, "%s", caption); 1082 if (ch != '\0') 1083 (void) fprintf(d, " %s", pchar(ch)); 1084 for (i = 0; i < g->nstates; i++) 1085 if (ISSET(st, i)) { 1086 (void) fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1087 first = 0; 1088 } 1089 (void) fprintf(d, "\n"); 1090 } 1091 1092 /* 1093 * at - print current situation 1094 */ 1095 static void 1096 at(struct match *m, const char *title, const char *start, const char *stop, 1097 sopno startst, sopno stopst) 1098 { 1099 if (!(m->eflags®_TRACE)) 1100 return; 1101 1102 (void) printf("%s %s-", title, pchar(*start)); 1103 (void) printf("%s ", pchar(*stop)); 1104 (void) printf("%ld-%ld\n", (long)startst, (long)stopst); 1105 } 1106 1107 #ifndef PCHARDONE 1108 #define PCHARDONE /* never again */ 1109 /* 1110 * pchar - make a character printable 1111 * 1112 * Is this identical to regchar() over in debug.c? Well, yes. But a 1113 * duplicate here avoids having a debugging-capable regexec.o tied to 1114 * a matching debug.o, and this is convenient. It all disappears in 1115 * the non-debug compilation anyway, so it doesn't matter much. 1116 */ 1117 static const char * 1118 pchar(int ch) 1119 { 1120 static char pbuf[10]; 1121 1122 if (isprint((uch)ch) || ch == ' ') 1123 (void) sprintf(pbuf, "%c", ch); 1124 else 1125 (void) sprintf(pbuf, "\\%o", ch); 1126 return (pbuf); 1127 } 1128 #endif 1129 #endif 1130 1131 #undef matcher 1132 #undef fast 1133 #undef slow 1134 #undef dissect 1135 #undef backref 1136 #undef step 1137 #undef print 1138 #undef at 1139 #undef match 1140