1 /* 2 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 3 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 4 * Copyright (c) 1992, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Henry Spencer. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * The matching engine and friends. This file is #included by regexec.c 37 * after suitable #defines of a variety of macros used herein, so that 38 * different state representations can be used without duplicating masses 39 * of code. 40 */ 41 42 #ifdef SNAMES 43 #define matcher smatcher 44 #define fast sfast 45 #define slow sslow 46 #define dissect sdissect 47 #define backref sbackref 48 #define step sstep 49 #define print sprint 50 #define at sat 51 #define match smat 52 #endif 53 #ifdef LNAMES 54 #define matcher lmatcher 55 #define fast lfast 56 #define slow lslow 57 #define dissect ldissect 58 #define backref lbackref 59 #define step lstep 60 #define print lprint 61 #define at lat 62 #define match lmat 63 #endif 64 #ifdef MNAMES 65 #define matcher mmatcher 66 #define fast mfast 67 #define slow mslow 68 #define dissect mdissect 69 #define backref mbackref 70 #define step mstep 71 #define print mprint 72 #define at mat 73 #define match mmat 74 #endif 75 76 /* another structure passed up and down to avoid zillions of parameters */ 77 struct match { 78 struct re_guts *g; 79 int eflags; 80 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 81 const char *offp; /* offsets work from here */ 82 const char *beginp; /* start of string -- virtual NUL precedes */ 83 const char *endp; /* end of string -- virtual NUL here */ 84 const char *coldp; /* can be no match starting before here */ 85 const char **lastpos; /* [nplus+1] */ 86 STATEVARS; 87 states st; /* current states */ 88 states fresh; /* states for a fresh start */ 89 states tmp; /* temporary */ 90 states empty; /* empty set of states */ 91 mbstate_t mbs; /* multibyte conversion state */ 92 }; 93 94 /* ========= begin header generated by ./mkh ========= */ 95 #ifdef __cplusplus 96 extern "C" { 97 #endif 98 99 /* === engine.c === */ 100 static int matcher(struct re_guts *, const char *, size_t, regmatch_t[], int); 101 static const char *dissect(struct match *, const char *, const char *, 102 sopno, sopno); 103 static const char *backref(struct match *, const char *, const char *, sopno, 104 sopno, sopno, int); 105 static const char *fast(struct match *, const char *, const char *, sopno, 106 sopno); 107 static const char *slow(struct match *, const char *, const char *, sopno, 108 sopno); 109 static states step(struct re_guts *, sopno, sopno, states, wint_t, states); 110 #define MAX_RECURSION 100 111 #define BOL (OUT-1) 112 #define EOL (BOL-1) 113 #define BOLEOL (BOL-2) 114 #define NOTHING (BOL-3) 115 #define BOW (BOL-4) 116 #define EOW (BOL-5) 117 #define BADCHAR (BOL-6) 118 #define NONCHAR(c) ((c) <= OUT) 119 #ifdef REDEBUG 120 static void print(struct match *, const char *, states, int, FILE *); 121 #endif 122 #ifdef REDEBUG 123 static void at(struct match *, const char *, const char *, const char *, 124 sopno, sopno); 125 #endif 126 #ifdef REDEBUG 127 static const char *pchar(int ch); 128 #endif 129 130 #ifdef __cplusplus 131 } 132 #endif 133 /* ========= end header generated by ./mkh ========= */ 134 135 #ifdef REDEBUG 136 #define SP(t, s, c) print(m, t, s, c, stdout) 137 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 138 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 139 #else 140 #define SP(t, s, c) /* nothing */ 141 #define AT(t, p1, p2, s1, s2) /* nothing */ 142 #define NOTE(s) /* nothing */ 143 #endif 144 145 /* 146 * matcher - the actual matching engine 147 */ 148 static int /* 0 success, REG_NOMATCH failure */ 149 matcher(struct re_guts *g, 150 const char *string, 151 size_t nmatch, 152 regmatch_t pmatch[], 153 int eflags) 154 { 155 const char *endp; 156 int i; 157 struct match mv; 158 struct match *m = &mv; 159 const char *dp; 160 const sopno gf = g->firststate+1; /* +1 for OEND */ 161 const sopno gl = g->laststate; 162 const char *start; 163 const char *stop; 164 /* Boyer-Moore algorithms variables */ 165 const char *pp; 166 int cj, mj; 167 const char *mustfirst; 168 const char *mustlast; 169 int *matchjump; 170 int *charjump; 171 172 /* simplify the situation where possible */ 173 if (g->cflags®_NOSUB) 174 nmatch = 0; 175 #ifdef REG_STARTEND 176 if (eflags®_STARTEND) { 177 start = string + pmatch[0].rm_so; 178 stop = string + pmatch[0].rm_eo; 179 } else { 180 start = string; 181 stop = start + strlen(start); 182 } 183 #else 184 start = string; 185 stop = start + strlen(start); 186 #endif 187 if (stop < start) 188 return (REG_EFATAL); 189 190 /* prescreening; this does wonders for this rather slow code */ 191 if (g->must != NULL) { 192 if (g->charjump != NULL && g->matchjump != NULL) { 193 mustfirst = g->must; 194 mustlast = g->must + g->mlen - 1; 195 charjump = g->charjump; 196 matchjump = g->matchjump; 197 pp = mustlast; 198 for (dp = start+g->mlen-1; dp < stop; ) { 199 /* Fast skip non-matches */ 200 while (dp < stop && charjump[(int)*dp]) 201 dp += charjump[(int)*dp]; 202 203 if (dp >= stop) 204 break; 205 206 /* Greedy matcher */ 207 /* 208 * We depend on not being used for 209 * for strings of length 1 210 */ 211 while (*--dp == *--pp && pp != mustfirst) 212 ; 213 214 if (*dp == *pp) 215 break; 216 217 /* Jump to next possible match */ 218 mj = matchjump[pp - mustfirst]; 219 cj = charjump[(int)*dp]; 220 dp += (cj < mj ? mj : cj); 221 pp = mustlast; 222 } 223 if (pp != mustfirst) 224 return (REG_NOMATCH); 225 } else { 226 for (dp = start; dp < stop; dp++) 227 if (*dp == g->must[0] && 228 stop - dp >= g->mlen && 229 memcmp(dp, g->must, (size_t)g->mlen) == 0) 230 break; 231 if (dp == stop) /* we didn't find g->must */ 232 return (REG_NOMATCH); 233 } 234 } 235 236 /* match struct setup */ 237 m->g = g; 238 m->eflags = eflags; 239 m->pmatch = NULL; 240 m->lastpos = NULL; 241 m->offp = string; 242 m->beginp = start; 243 m->endp = stop; 244 STATESETUP(m, 4); 245 SETUP(m->st); 246 SETUP(m->fresh); 247 SETUP(m->tmp); 248 SETUP(m->empty); 249 CLEAR(m->empty); 250 ZAPSTATE(&m->mbs); 251 252 /* Adjust start according to moffset, to speed things up */ 253 if (g->moffset > -1) 254 start = ((dp - g->moffset) < start) ? start : dp - g->moffset; 255 256 SP("mloop", m->st, *start); 257 258 /* this loop does only one repetition except for backrefs */ 259 for (;;) { 260 endp = fast(m, start, stop, gf, gl); 261 if (endp == NULL) { /* a miss */ 262 if (m->pmatch != NULL) 263 free((char *)m->pmatch); 264 if (m->lastpos != NULL) 265 free((char *)m->lastpos); 266 STATETEARDOWN(m); 267 return (REG_NOMATCH); 268 } 269 if (nmatch == 0 && !g->backrefs) 270 break; /* no further info needed */ 271 272 /* where? */ 273 assert(m->coldp != NULL); 274 for (;;) { 275 NOTE("finding start"); 276 endp = slow(m, m->coldp, stop, gf, gl); 277 if (endp != NULL) 278 break; 279 assert(m->coldp < m->endp); 280 m->coldp += XMBRTOWC(NULL, m->coldp, 281 m->endp - m->coldp, &m->mbs, 0); 282 } 283 if (nmatch == 1 && !g->backrefs) 284 break; /* no further info needed */ 285 286 /* oh my, he wants the subexpressions... */ 287 if (m->pmatch == NULL) 288 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 289 sizeof (regmatch_t)); 290 if (m->pmatch == NULL) { 291 STATETEARDOWN(m); 292 return (REG_ESPACE); 293 } 294 for (i = 1; i <= m->g->nsub; i++) 295 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 296 /* NB: FreeBSD has REG_BACKR, we do not */ 297 if (!g->backrefs /* && !(m->eflags®_BACKR) */) { 298 NOTE("dissecting"); 299 dp = dissect(m, m->coldp, endp, gf, gl); 300 } else { 301 if (g->nplus > 0 && m->lastpos == NULL) 302 m->lastpos = malloc((g->nplus+1) * 303 sizeof (const char *)); 304 if (g->nplus > 0 && m->lastpos == NULL) { 305 free(m->pmatch); 306 STATETEARDOWN(m); 307 return (REG_ESPACE); 308 } 309 NOTE("backref dissect"); 310 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 311 } 312 if (dp != NULL) 313 break; 314 315 /* uh-oh... we couldn't find a subexpression-level match */ 316 assert(g->backrefs); /* must be back references doing it */ 317 assert(g->nplus == 0 || m->lastpos != NULL); 318 for (;;) { 319 if (dp != NULL || endp <= m->coldp) 320 break; /* defeat */ 321 NOTE("backoff"); 322 endp = slow(m, m->coldp, endp-1, gf, gl); 323 if (endp == NULL) 324 break; /* defeat */ 325 /* try it on a shorter possibility */ 326 #ifndef NDEBUG 327 for (i = 1; i <= m->g->nsub; i++) { 328 assert(m->pmatch[i].rm_so == -1); 329 assert(m->pmatch[i].rm_eo == -1); 330 } 331 #endif 332 NOTE("backoff dissect"); 333 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 334 } 335 assert(dp == NULL || dp == endp); 336 if (dp != NULL) /* found a shorter one */ 337 break; 338 339 /* despite initial appearances, there is no match here */ 340 NOTE("false alarm"); 341 /* recycle starting later */ 342 start = m->coldp + XMBRTOWC(NULL, m->coldp, 343 stop - m->coldp, &m->mbs, 0); 344 assert(start <= stop); 345 } 346 347 /* fill in the details if requested */ 348 if (nmatch > 0) { 349 pmatch[0].rm_so = m->coldp - m->offp; 350 pmatch[0].rm_eo = endp - m->offp; 351 } 352 if (nmatch > 1) { 353 assert(m->pmatch != NULL); 354 for (i = 1; i < nmatch; i++) 355 if (i <= m->g->nsub) 356 pmatch[i] = m->pmatch[i]; 357 else { 358 pmatch[i].rm_so = -1; 359 pmatch[i].rm_eo = -1; 360 } 361 } 362 363 if (m->pmatch != NULL) 364 free((char *)m->pmatch); 365 if (m->lastpos != NULL) 366 free((char *)m->lastpos); 367 STATETEARDOWN(m); 368 return (0); 369 } 370 371 /* 372 * dissect - figure out what matched what, no back references 373 */ 374 static const char * 375 dissect(struct match *m, const char *start, const char *stop, sopno startst, 376 sopno stopst) 377 { 378 int i; 379 sopno ss; /* start sop of current subRE */ 380 sopno es; /* end sop of current subRE */ 381 const char *sp; /* start of string matched by it */ 382 const char *stp; /* string matched by it cannot pass here */ 383 const char *rest; /* start of rest of string */ 384 const char *tail; /* string unmatched by rest of RE */ 385 sopno ssub; /* start sop of subsubRE */ 386 sopno esub; /* end sop of subsubRE */ 387 const char *ssp; /* start of string matched by subsubRE */ 388 const char *sep; /* end of string matched by subsubRE */ 389 const char *oldssp; /* previous ssp */ 390 const char *dp; 391 392 AT("diss", start, stop, startst, stopst); 393 sp = start; 394 for (ss = startst; ss < stopst; ss = es) { 395 /* identify end of subRE */ 396 es = ss; 397 switch (OP(m->g->strip[es])) { 398 case OPLUS_: 399 case OQUEST_: 400 es += OPND(m->g->strip[es]); 401 break; 402 case OCH_: 403 while (OP(m->g->strip[es]) != O_CH) 404 es += OPND(m->g->strip[es]); 405 break; 406 } 407 es++; 408 409 /* figure out what it matched */ 410 switch (OP(m->g->strip[ss])) { 411 case OEND: 412 assert(0); 413 break; 414 case OCHAR: 415 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 416 break; 417 case OBOL: 418 case OEOL: 419 case OBOW: 420 case OEOW: 421 break; 422 case OANY: 423 case OANYOF: 424 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 425 break; 426 case OBACK_: 427 case O_BACK: 428 assert(0); 429 break; 430 /* cases where length of match is hard to find */ 431 case OQUEST_: 432 stp = stop; 433 for (;;) { 434 /* how long could this one be? */ 435 rest = slow(m, sp, stp, ss, es); 436 assert(rest != NULL); /* it did match */ 437 /* could the rest match the rest? */ 438 tail = slow(m, rest, stop, es, stopst); 439 if (tail == stop) 440 break; /* yes! */ 441 /* no -- try a shorter match for this one */ 442 stp = rest - 1; 443 assert(stp >= sp); /* it did work */ 444 } 445 ssub = ss + 1; 446 esub = es - 1; 447 /* did innards match? */ 448 if (slow(m, sp, rest, ssub, esub) != NULL) { 449 dp = dissect(m, sp, rest, ssub, esub); 450 assert(dp == rest); 451 #if defined(__lint) 452 (void) dp; 453 #endif 454 } else /* no */ 455 assert(sp == rest); 456 sp = rest; 457 break; 458 case OPLUS_: 459 stp = stop; 460 for (;;) { 461 /* how long could this one be? */ 462 rest = slow(m, sp, stp, ss, es); 463 assert(rest != NULL); /* it did match */ 464 /* could the rest match the rest? */ 465 tail = slow(m, rest, stop, es, stopst); 466 if (tail == stop) 467 break; /* yes! */ 468 /* no -- try a shorter match for this one */ 469 stp = rest - 1; 470 assert(stp >= sp); /* it did work */ 471 } 472 ssub = ss + 1; 473 esub = es - 1; 474 ssp = sp; 475 oldssp = ssp; 476 for (;;) { /* find last match of innards */ 477 sep = slow(m, ssp, rest, ssub, esub); 478 if (sep == NULL || sep == ssp) 479 break; /* failed or matched null */ 480 oldssp = ssp; /* on to next try */ 481 ssp = sep; 482 } 483 if (sep == NULL) { 484 /* last successful match */ 485 sep = ssp; 486 ssp = oldssp; 487 } 488 assert(sep == rest); /* must exhaust substring */ 489 assert(slow(m, ssp, sep, ssub, esub) == rest); 490 dp = dissect(m, ssp, sep, ssub, esub); 491 assert(dp == sep); 492 sp = rest; 493 break; 494 case OCH_: 495 stp = stop; 496 for (;;) { 497 /* how long could this one be? */ 498 rest = slow(m, sp, stp, ss, es); 499 assert(rest != NULL); /* it did match */ 500 /* could the rest match the rest? */ 501 tail = slow(m, rest, stop, es, stopst); 502 if (tail == stop) 503 break; /* yes! */ 504 /* no -- try a shorter match for this one */ 505 stp = rest - 1; 506 assert(stp >= sp); /* it did work */ 507 } 508 ssub = ss + 1; 509 esub = ss + OPND(m->g->strip[ss]) - 1; 510 assert(OP(m->g->strip[esub]) == OOR1); 511 for (;;) { /* find first matching branch */ 512 if (slow(m, sp, rest, ssub, esub) == rest) 513 break; /* it matched all of it */ 514 /* that one missed, try next one */ 515 assert(OP(m->g->strip[esub]) == OOR1); 516 esub++; 517 assert(OP(m->g->strip[esub]) == OOR2); 518 ssub = esub + 1; 519 esub += OPND(m->g->strip[esub]); 520 if (OP(m->g->strip[esub]) == OOR2) 521 esub--; 522 else 523 assert(OP(m->g->strip[esub]) == O_CH); 524 } 525 dp = dissect(m, sp, rest, ssub, esub); 526 assert(dp == rest); 527 sp = rest; 528 break; 529 case O_PLUS: 530 case O_QUEST: 531 case OOR1: 532 case OOR2: 533 case O_CH: 534 assert(0); 535 break; 536 case OLPAREN: 537 i = OPND(m->g->strip[ss]); 538 assert(0 < i && i <= m->g->nsub); 539 m->pmatch[i].rm_so = sp - m->offp; 540 break; 541 case ORPAREN: 542 i = OPND(m->g->strip[ss]); 543 assert(0 < i && i <= m->g->nsub); 544 m->pmatch[i].rm_eo = sp - m->offp; 545 break; 546 default: /* uh oh */ 547 assert(0); 548 break; 549 } 550 } 551 552 assert(sp == stop); 553 return (sp); 554 } 555 556 /* 557 * backref - figure out what matched what, figuring in back references 558 */ 559 static const char * 560 backref(struct match *m, const char *start, const char *stop, sopno startst, 561 sopno stopst, sopno lev, /* PLUS nesting level */ 562 int rec) 563 { 564 int i; 565 sopno ss; /* start sop of current subRE */ 566 const char *sp; /* start of string matched by it */ 567 sopno ssub; /* start sop of subsubRE */ 568 sopno esub; /* end sop of subsubRE */ 569 const char *ssp; /* start of string matched by subsubRE */ 570 const char *dp; 571 size_t len; 572 int hard; 573 sop s; 574 regoff_t offsave; 575 cset *cs; 576 wint_t wc; 577 578 AT("back", start, stop, startst, stopst); 579 sp = start; 580 581 /* get as far as we can with easy stuff */ 582 hard = 0; 583 for (ss = startst; !hard && ss < stopst; ss++) 584 switch (OP(s = m->g->strip[ss])) { 585 case OCHAR: 586 if (sp == stop) 587 return (NULL); 588 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 589 if (wc != OPND(s)) 590 return (NULL); 591 break; 592 case OANY: 593 if (sp == stop) 594 return (NULL); 595 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 596 if (wc == BADCHAR) 597 return (NULL); 598 break; 599 case OANYOF: 600 if (sp == stop) 601 return (NULL); 602 cs = &m->g->sets[OPND(s)]; 603 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 604 if (wc == BADCHAR || !CHIN(cs, wc)) 605 return (NULL); 606 break; 607 case OBOL: 608 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 609 (sp < m->endp && *(sp-1) == '\n' && 610 (m->g->cflags®_NEWLINE))) { 611 break; 612 } 613 else 614 return (NULL); 615 break; 616 case OEOL: 617 if ((sp == m->endp && !(m->eflags®_NOTEOL)) || 618 (sp < m->endp && *sp == '\n' && 619 (m->g->cflags®_NEWLINE))) { 620 break; 621 } 622 else 623 return (NULL); 624 break; 625 case OBOW: 626 if (((sp == m->beginp && !(m->eflags®_NOTBOL)) || 627 (sp < m->endp && *(sp-1) == '\n' && 628 (m->g->cflags®_NEWLINE)) || 629 (sp > m->beginp && !ISWORD(*(sp-1)))) && 630 (sp < m->endp && ISWORD(*sp))) { 631 break; 632 } else 633 return (NULL); 634 break; 635 case OEOW: 636 if (((sp == m->endp && !(m->eflags®_NOTEOL)) || 637 (sp < m->endp && *sp == '\n' && 638 (m->g->cflags®_NEWLINE)) || 639 (sp < m->endp && !ISWORD(*sp))) && 640 (sp > m->beginp && ISWORD(*(sp-1)))) { 641 break; 642 } else 643 return (NULL); 644 break; 645 case O_QUEST: 646 break; 647 case OOR1: /* matches null but needs to skip */ 648 ss++; 649 s = m->g->strip[ss]; 650 do { 651 assert(OP(s) == OOR2); 652 ss += OPND(s); 653 } while (OP(s = m->g->strip[ss]) != O_CH); 654 /* note that the ss++ gets us past the O_CH */ 655 break; 656 default: /* have to make a choice */ 657 hard = 1; 658 break; 659 } 660 if (!hard) { /* that was it! */ 661 if (sp != stop) 662 return (NULL); 663 return (sp); 664 } 665 ss--; /* adjust for the for's final increment */ 666 667 /* the hard stuff */ 668 AT("hard", sp, stop, ss, stopst); 669 s = m->g->strip[ss]; 670 switch (OP(s)) { 671 case OBACK_: /* the vilest depths */ 672 i = OPND(s); 673 assert(0 < i && i <= m->g->nsub); 674 if (m->pmatch[i].rm_eo == -1) 675 return (NULL); 676 assert(m->pmatch[i].rm_so != -1); 677 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 678 if (len == 0 && rec++ > MAX_RECURSION) 679 return (NULL); 680 assert(stop - m->beginp >= len); 681 if (sp > stop - len) 682 return (NULL); /* not enough left to match */ 683 ssp = m->offp + m->pmatch[i].rm_so; 684 if (memcmp(sp, ssp, len) != 0) 685 return (NULL); 686 while (m->g->strip[ss] != SOP(O_BACK, i)) 687 ss++; 688 return (backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 689 break; 690 case OQUEST_: /* to null or not */ 691 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 692 if (dp != NULL) 693 return (dp); /* not */ 694 return (backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 695 break; 696 case OPLUS_: 697 assert(m->lastpos != NULL); 698 assert(lev+1 <= m->g->nplus); 699 m->lastpos[lev+1] = sp; 700 return (backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 701 break; 702 case O_PLUS: 703 if (sp == m->lastpos[lev]) /* last pass matched null */ 704 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 705 /* try another pass */ 706 m->lastpos[lev] = sp; 707 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 708 if (dp == NULL) 709 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 710 else 711 return (dp); 712 break; 713 case OCH_: /* find the right one, if any */ 714 ssub = ss + 1; 715 esub = ss + OPND(s) - 1; 716 assert(OP(m->g->strip[esub]) == OOR1); 717 for (;;) { /* find first matching branch */ 718 dp = backref(m, sp, stop, ssub, esub, lev, rec); 719 if (dp != NULL) 720 return (dp); 721 /* that one missed, try next one */ 722 if (OP(m->g->strip[esub]) == O_CH) 723 return (NULL); /* there is none */ 724 esub++; 725 assert(OP(m->g->strip[esub]) == OOR2); 726 ssub = esub + 1; 727 esub += OPND(m->g->strip[esub]); 728 if (OP(m->g->strip[esub]) == OOR2) 729 esub--; 730 else 731 assert(OP(m->g->strip[esub]) == O_CH); 732 } 733 break; 734 case OLPAREN: /* must undo assignment if rest fails */ 735 i = OPND(s); 736 assert(0 < i && i <= m->g->nsub); 737 offsave = m->pmatch[i].rm_so; 738 m->pmatch[i].rm_so = sp - m->offp; 739 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 740 if (dp != NULL) 741 return (dp); 742 m->pmatch[i].rm_so = offsave; 743 return (NULL); 744 break; 745 case ORPAREN: /* must undo assignment if rest fails */ 746 i = OPND(s); 747 assert(0 < i && i <= m->g->nsub); 748 offsave = m->pmatch[i].rm_eo; 749 m->pmatch[i].rm_eo = sp - m->offp; 750 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 751 if (dp != NULL) 752 return (dp); 753 m->pmatch[i].rm_eo = offsave; 754 return (NULL); 755 break; 756 default: /* uh oh */ 757 assert(0); 758 break; 759 } 760 761 /* "can't happen" */ 762 assert(0); 763 return (NULL); 764 } 765 766 /* 767 * fast - step through the string at top speed 768 */ 769 static const char * 770 fast(struct match *m, const char *start, const char *stop, sopno startst, 771 sopno stopst) 772 { 773 states st = m->st; 774 states fresh = m->fresh; 775 states tmp = m->tmp; 776 const char *p = start; 777 wint_t c; 778 wint_t lastc; /* previous c */ 779 wint_t flagch; 780 int i; 781 const char *coldp; /* last p after which no match was underway */ 782 size_t clen; 783 784 CLEAR(st); 785 SET1(st, startst); 786 SP("fast", st, *p); 787 st = step(m->g, startst, stopst, st, NOTHING, st); 788 ASSIGN(fresh, st); 789 SP("start", st, *p); 790 coldp = NULL; 791 if (start == m->beginp) 792 c = OUT; 793 else { 794 /* 795 * XXX Wrong if the previous character was multi-byte. 796 * Newline never is (in encodings supported by FreeBSD), 797 * so this only breaks the ISWORD tests below. 798 */ 799 c = (uch)*(start - 1); 800 } 801 for (;;) { 802 /* next character */ 803 lastc = c; 804 if (p == m->endp) { 805 clen = 0; 806 c = OUT; 807 } else 808 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 809 if (EQ(st, fresh)) 810 coldp = p; 811 812 /* is there an EOL and/or BOL between lastc and c? */ 813 flagch = '\0'; 814 i = 0; 815 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 816 (lastc == OUT && !(m->eflags®_NOTBOL))) { 817 flagch = BOL; 818 i = m->g->nbol; 819 } 820 if ((c == '\n' && m->g->cflags®_NEWLINE) || 821 (c == OUT && !(m->eflags®_NOTEOL))) { 822 flagch = (flagch == BOL) ? BOLEOL : EOL; 823 i += m->g->neol; 824 } 825 if (i != 0) { 826 for (; i > 0; i--) 827 st = step(m->g, startst, stopst, st, 828 flagch, st); 829 SP("boleol", st, c); 830 } 831 832 /* how about a word boundary? */ 833 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 834 (c != OUT && ISWORD(c))) { 835 flagch = BOW; 836 } 837 if ((lastc != OUT && ISWORD(lastc)) && 838 (flagch == EOL || (c != OUT && !ISWORD(c)))) { 839 flagch = EOW; 840 } 841 if (flagch == BOW || flagch == EOW) { 842 st = step(m->g, startst, stopst, st, flagch, st); 843 SP("boweow", st, c); 844 } 845 846 /* are we done? */ 847 if (ISSET(st, stopst) || p == stop || clen > stop - p) 848 break; /* NOTE BREAK OUT */ 849 850 /* no, we must deal with this character */ 851 ASSIGN(tmp, st); 852 ASSIGN(st, fresh); 853 assert(c != OUT); 854 st = step(m->g, startst, stopst, tmp, c, st); 855 SP("aft", st, c); 856 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 857 p += clen; 858 } 859 860 assert(coldp != NULL); 861 m->coldp = coldp; 862 if (ISSET(st, stopst)) 863 return (p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 864 else 865 return (NULL); 866 } 867 868 /* 869 * slow - step through the string more deliberately 870 */ 871 static const char * 872 slow(struct match *m, const char *start, const char *stop, sopno startst, 873 sopno stopst) 874 { 875 states st = m->st; 876 states empty = m->empty; 877 states tmp = m->tmp; 878 const char *p = start; 879 wint_t c; 880 wint_t lastc; /* previous c */ 881 wint_t flagch; 882 int i; 883 const char *matchp; /* last p at which a match ended */ 884 size_t clen; 885 886 AT("slow", start, stop, startst, stopst); 887 CLEAR(st); 888 SET1(st, startst); 889 SP("sstart", st, *p); 890 st = step(m->g, startst, stopst, st, NOTHING, st); 891 matchp = NULL; 892 if (start == m->beginp) 893 c = OUT; 894 else { 895 /* 896 * XXX Wrong if the previous character was multi-byte. 897 * Newline never is (in encodings supported by FreeBSD), 898 * so this only breaks the ISWORD tests below. 899 */ 900 c = (uch)*(start - 1); 901 } 902 for (;;) { 903 /* next character */ 904 lastc = c; 905 if (p == m->endp) { 906 c = OUT; 907 clen = 0; 908 } else 909 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 910 911 /* is there an EOL and/or BOL between lastc and c? */ 912 flagch = '\0'; 913 i = 0; 914 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 915 (lastc == OUT && !(m->eflags®_NOTBOL))) { 916 flagch = BOL; 917 i = m->g->nbol; 918 } 919 if ((c == '\n' && m->g->cflags®_NEWLINE) || 920 (c == OUT && !(m->eflags®_NOTEOL))) { 921 flagch = (flagch == BOL) ? BOLEOL : EOL; 922 i += m->g->neol; 923 } 924 if (i != 0) { 925 for (; i > 0; i--) 926 st = step(m->g, startst, stopst, st, 927 flagch, st); 928 SP("sboleol", st, c); 929 } 930 931 /* how about a word boundary? */ 932 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 933 (c != OUT && ISWORD(c))) { 934 flagch = BOW; 935 } 936 if ((lastc != OUT && ISWORD(lastc)) && 937 (flagch == EOL || (c != OUT && !ISWORD(c)))) { 938 flagch = EOW; 939 } 940 if (flagch == BOW || flagch == EOW) { 941 st = step(m->g, startst, stopst, st, flagch, st); 942 SP("sboweow", st, c); 943 } 944 945 /* are we done? */ 946 if (ISSET(st, stopst)) 947 matchp = p; 948 if (EQ(st, empty) || p == stop || clen > stop - p) 949 break; /* NOTE BREAK OUT */ 950 951 /* no, we must deal with this character */ 952 ASSIGN(tmp, st); 953 ASSIGN(st, empty); 954 assert(c != OUT); 955 st = step(m->g, startst, stopst, tmp, c, st); 956 SP("saft", st, c); 957 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 958 p += clen; 959 } 960 961 return (matchp); 962 } 963 964 965 /* 966 * step - map set of states reachable before char to set reachable after 967 */ 968 static states 969 step(struct re_guts *g, 970 sopno start, /* start state within strip */ 971 sopno stop, /* state after stop state within strip */ 972 states bef, /* states reachable before */ 973 wint_t ch, /* character or NONCHAR code */ 974 states aft) /* states already known reachable after */ 975 { 976 cset *cs; 977 sop s; 978 sopno pc; 979 onestate here; /* note, macros know this name */ 980 sopno look; 981 int i; 982 983 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 984 s = g->strip[pc]; 985 switch (OP(s)) { 986 case OEND: 987 assert(pc == stop-1); 988 break; 989 case OCHAR: 990 /* only characters can match */ 991 assert(!NONCHAR(ch) || ch != OPND(s)); 992 if (ch == OPND(s)) 993 FWD(aft, bef, 1); 994 break; 995 case OBOL: 996 if (ch == BOL || ch == BOLEOL) 997 FWD(aft, bef, 1); 998 break; 999 case OEOL: 1000 if (ch == EOL || ch == BOLEOL) 1001 FWD(aft, bef, 1); 1002 break; 1003 case OBOW: 1004 if (ch == BOW) 1005 FWD(aft, bef, 1); 1006 break; 1007 case OEOW: 1008 if (ch == EOW) 1009 FWD(aft, bef, 1); 1010 break; 1011 case OANY: 1012 if (!NONCHAR(ch)) 1013 FWD(aft, bef, 1); 1014 break; 1015 case OANYOF: 1016 cs = &g->sets[OPND(s)]; 1017 if (!NONCHAR(ch) && CHIN(cs, ch)) 1018 FWD(aft, bef, 1); 1019 break; 1020 case OBACK_: /* ignored here */ 1021 case O_BACK: 1022 FWD(aft, aft, 1); 1023 break; 1024 case OPLUS_: /* forward, this is just an empty */ 1025 FWD(aft, aft, 1); 1026 break; 1027 case O_PLUS: /* both forward and back */ 1028 FWD(aft, aft, 1); 1029 i = ISSETBACK(aft, OPND(s)); 1030 BACK(aft, aft, OPND(s)); 1031 if (!i && ISSETBACK(aft, OPND(s))) { 1032 /* oho, must reconsider loop body */ 1033 pc -= OPND(s) + 1; 1034 INIT(here, pc); 1035 } 1036 break; 1037 case OQUEST_: /* two branches, both forward */ 1038 FWD(aft, aft, 1); 1039 FWD(aft, aft, OPND(s)); 1040 break; 1041 case O_QUEST: /* just an empty */ 1042 FWD(aft, aft, 1); 1043 break; 1044 case OLPAREN: /* not significant here */ 1045 case ORPAREN: 1046 FWD(aft, aft, 1); 1047 break; 1048 case OCH_: /* mark the first two branches */ 1049 FWD(aft, aft, 1); 1050 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1051 FWD(aft, aft, OPND(s)); 1052 break; 1053 case OOR1: /* done a branch, find the O_CH */ 1054 if (ISSTATEIN(aft, here)) { 1055 for (look = 1; 1056 OP(s = g->strip[pc+look]) != O_CH; 1057 look += OPND(s)) 1058 assert(OP(s) == OOR2); 1059 FWD(aft, aft, look + 1); 1060 } 1061 break; 1062 case OOR2: /* propagate OCH_'s marking */ 1063 FWD(aft, aft, 1); 1064 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1065 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1066 FWD(aft, aft, OPND(s)); 1067 } 1068 break; 1069 case O_CH: /* just empty */ 1070 FWD(aft, aft, 1); 1071 break; 1072 default: /* ooooops... */ 1073 assert(0); 1074 break; 1075 } 1076 } 1077 1078 return (aft); 1079 } 1080 1081 #ifdef REDEBUG 1082 /* 1083 * print - print a set of states 1084 */ 1085 static void 1086 print(struct match *m, const char *caption, states st, int ch, FILE *d) 1087 { 1088 struct re_guts *g = m->g; 1089 int i; 1090 int first = 1; 1091 1092 if (!(m->eflags®_TRACE)) 1093 return; 1094 1095 (void) fprintf(d, "%s", caption); 1096 if (ch != '\0') 1097 (void) fprintf(d, " %s", pchar(ch)); 1098 for (i = 0; i < g->nstates; i++) 1099 if (ISSET(st, i)) { 1100 (void) fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1101 first = 0; 1102 } 1103 (void) fprintf(d, "\n"); 1104 } 1105 1106 /* 1107 * at - print current situation 1108 */ 1109 static void 1110 at(struct match *m, const char *title, const char *start, const char *stop, 1111 sopno startst, sopno stopst) 1112 { 1113 if (!(m->eflags®_TRACE)) 1114 return; 1115 1116 (void) printf("%s %s-", title, pchar(*start)); 1117 (void) printf("%s ", pchar(*stop)); 1118 (void) printf("%ld-%ld\n", (long)startst, (long)stopst); 1119 } 1120 1121 #ifndef PCHARDONE 1122 #define PCHARDONE /* never again */ 1123 /* 1124 * pchar - make a character printable 1125 * 1126 * Is this identical to regchar() over in debug.c? Well, yes. But a 1127 * duplicate here avoids having a debugging-capable regexec.o tied to 1128 * a matching debug.o, and this is convenient. It all disappears in 1129 * the non-debug compilation anyway, so it doesn't matter much. 1130 */ 1131 static const char * 1132 pchar(int ch) 1133 { 1134 static char pbuf[10]; 1135 1136 if (isprint((uch)ch) || ch == ' ') 1137 (void) sprintf(pbuf, "%c", ch); 1138 else 1139 (void) sprintf(pbuf, "\\%o", ch); 1140 return (pbuf); 1141 } 1142 #endif 1143 #endif 1144 1145 #undef matcher 1146 #undef fast 1147 #undef slow 1148 #undef dissect 1149 #undef backref 1150 #undef step 1151 #undef print 1152 #undef at 1153 #undef match 1154