1 /* 2 * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. 3 * Copyright 2012 Milan Jurik. All rights reserved. 4 * Copyright (c) 2016 by Delphix. All rights reserved. 5 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 6 * Copyright (c) 1992, 1993, 1994 7 * The Regents of the University of California. All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * Henry Spencer. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 */ 36 37 #include <stdbool.h> 38 39 /* 40 * The matching engine and friends. This file is #included by regexec.c 41 * after suitable #defines of a variety of macros used herein, so that 42 * different state representations can be used without duplicating masses 43 * of code. 44 */ 45 46 #ifdef SNAMES 47 #define stepback sstepback 48 #define matcher smatcher 49 #define walk swalk 50 #define dissect sdissect 51 #define backref sbackref 52 #define step sstep 53 #define print sprint 54 #define at sat 55 #define match smat 56 #endif 57 #ifdef LNAMES 58 #define stepback lstepback 59 #define matcher lmatcher 60 #define walk lwalk 61 #define dissect ldissect 62 #define backref lbackref 63 #define step lstep 64 #define print lprint 65 #define at lat 66 #define match lmat 67 #endif 68 #ifdef MNAMES 69 #define stepback mstepback 70 #define matcher mmatcher 71 #define walk mwalk 72 #define dissect mdissect 73 #define backref mbackref 74 #define step mstep 75 #define print mprint 76 #define at mat 77 #define match mmat 78 #endif 79 80 /* another structure passed up and down to avoid zillions of parameters */ 81 struct match { 82 struct re_guts *g; 83 int eflags; 84 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 85 const char *offp; /* offsets work from here */ 86 const char *beginp; /* start of string -- virtual NUL precedes */ 87 const char *endp; /* end of string -- virtual NUL here */ 88 const char *coldp; /* can be no match starting before here */ 89 const char **lastpos; /* [nplus+1] */ 90 STATEVARS; 91 states st; /* current states */ 92 states fresh; /* states for a fresh start */ 93 states tmp; /* temporary */ 94 states empty; /* empty set of states */ 95 mbstate_t mbs; /* multibyte conversion state */ 96 }; 97 98 /* ========= begin header generated by ./mkh ========= */ 99 #ifdef __cplusplus 100 extern "C" { 101 #endif 102 103 /* === engine.c === */ 104 static int matcher(struct re_guts *, const char *, size_t, regmatch_t[], int); 105 static const char *dissect(struct match *, const char *, const char *, 106 sopno, sopno); 107 static const char *backref(struct match *, const char *, const char *, sopno, 108 sopno, sopno, int); 109 static const char *walk(struct match *m, const char *start, const char *stop, 110 sopno startst, sopno stopst, bool fast); 111 static states step(struct re_guts *, sopno, sopno, states, wint_t, states); 112 #define MAX_RECURSION 100 113 #define BOL (OUT-1) 114 #define EOL (BOL-1) 115 #define BOLEOL (BOL-2) 116 #define NOTHING (BOL-3) 117 #define BOW (BOL-4) 118 #define EOW (BOL-5) 119 #define BADCHAR (BOL-6) 120 #define NONCHAR(c) ((c) <= OUT) 121 #ifdef REDEBUG 122 static void print(struct match *, const char *, states, int, FILE *); 123 #endif 124 #ifdef REDEBUG 125 static void at(struct match *, const char *, const char *, const char *, 126 sopno, sopno); 127 #endif 128 #ifdef REDEBUG 129 static const char *pchar(int ch); 130 #endif 131 132 #ifdef __cplusplus 133 } 134 #endif 135 /* ========= end header generated by ./mkh ========= */ 136 137 #ifdef REDEBUG 138 #define SP(t, s, c) print(m, t, s, c, stdout) 139 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 140 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 141 #else 142 #define SP(t, s, c) /* nothing */ 143 #define AT(t, p1, p2, s1, s2) /* nothing */ 144 #define NOTE(s) /* nothing */ 145 #endif 146 147 /* 148 * Given a multibyte string pointed to by start, step back nchar characters 149 * from current position pointed to by cur. 150 */ 151 static const char * 152 stepback(const char *start, const char *cur, int nchar) 153 { 154 const char *ret; 155 int wc, mbc; 156 mbstate_t mbs; 157 size_t clen; 158 159 if (MB_CUR_MAX == 1) 160 return ((cur - nchar) > start ? cur - nchar : NULL); 161 162 ret = cur; 163 for (wc = nchar; wc > 0; wc--) { 164 for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) { 165 if ((ret - mbc) < start) 166 return (NULL); 167 memset(&mbs, 0, sizeof (mbs)); 168 clen = mbrtowc(NULL, ret - mbc, mbc, &mbs); 169 if (clen != (size_t)-1 && clen != (size_t)-2) 170 break; 171 } 172 if (mbc > MB_CUR_MAX) 173 return (NULL); 174 ret -= mbc; 175 } 176 177 return (ret); 178 } 179 180 /* 181 * matcher - the actual matching engine 182 */ 183 static int /* 0 success, REG_NOMATCH failure */ 184 matcher(struct re_guts *g, const char *string, size_t nmatch, 185 regmatch_t pmatch[], int eflags) 186 { 187 const char *endp; 188 size_t i; 189 struct match mv; 190 struct match *m = &mv; 191 const char *dp = NULL; 192 const sopno gf = g->firststate+1; /* +1 for OEND */ 193 const sopno gl = g->laststate; 194 const char *start; 195 const char *stop; 196 /* Boyer-Moore algorithms variables */ 197 const char *pp; 198 int cj, mj; 199 const char *mustfirst; 200 const char *mustlast; 201 int *matchjump; 202 int *charjump; 203 204 /* simplify the situation where possible */ 205 if (g->cflags®_NOSUB) 206 nmatch = 0; 207 208 if (eflags®_STARTEND) { 209 start = string + pmatch[0].rm_so; 210 stop = string + pmatch[0].rm_eo; 211 } else { 212 start = string; 213 stop = start + strlen(start); 214 } 215 216 if (stop < start) 217 return (REG_EFATAL); 218 219 /* prescreening; this does wonders for this rather slow code */ 220 if (g->must != NULL) { 221 if (g->charjump != NULL && g->matchjump != NULL) { 222 mustfirst = g->must; 223 mustlast = g->must + g->mlen - 1; 224 charjump = g->charjump; 225 matchjump = g->matchjump; 226 pp = mustlast; 227 for (dp = start+g->mlen-1; dp < stop; ) { 228 /* Fast skip non-matches */ 229 while (dp < stop && charjump[(int)*dp]) 230 dp += charjump[(int)*dp]; 231 232 if (dp >= stop) 233 break; 234 235 /* Greedy matcher */ 236 /* 237 * We depend on not being used for 238 * for strings of length 1 239 */ 240 while (*--dp == *--pp && pp != mustfirst) 241 ; 242 243 if (*dp == *pp) 244 break; 245 246 /* Jump to next possible match */ 247 mj = matchjump[pp - mustfirst]; 248 cj = charjump[(int)*dp]; 249 dp += (cj < mj ? mj : cj); 250 pp = mustlast; 251 } 252 if (pp != mustfirst) 253 return (REG_NOMATCH); 254 } else { 255 for (dp = start; dp < stop; dp++) 256 if (*dp == g->must[0] && 257 stop - dp >= g->mlen && 258 memcmp(dp, g->must, (size_t)g->mlen) == 0) 259 break; 260 if (dp == stop) /* we didn't find g->must */ 261 return (REG_NOMATCH); 262 } 263 } 264 265 /* match struct setup */ 266 m->g = g; 267 m->eflags = eflags; 268 m->pmatch = NULL; 269 m->lastpos = NULL; 270 m->offp = string; 271 m->beginp = start; 272 m->endp = stop; 273 STATESETUP(m, 4); 274 SETUP(m->st); 275 SETUP(m->fresh); 276 SETUP(m->tmp); 277 SETUP(m->empty); 278 CLEAR(m->empty); 279 ZAPSTATE(&m->mbs); 280 281 /* Adjust start according to moffset, to speed things up */ 282 if (dp != NULL && g->moffset > -1) { 283 const char *nstart; 284 285 nstart = stepback(start, dp, g->moffset); 286 if (nstart != NULL) 287 start = nstart; 288 } 289 290 SP("mloop", m->st, *start); 291 292 /* this loop does only one repetition except for backrefs */ 293 for (;;) { 294 endp = walk(m, start, stop, gf, gl, true); 295 if (endp == NULL) { /* a miss */ 296 if (m->pmatch != NULL) 297 free((char *)m->pmatch); 298 if (m->lastpos != NULL) 299 free((char *)m->lastpos); 300 STATETEARDOWN(m); 301 return (REG_NOMATCH); 302 } 303 if (nmatch == 0 && !g->backrefs) 304 break; /* no further info needed */ 305 306 /* where? */ 307 assert(m->coldp != NULL); 308 for (;;) { 309 NOTE("finding start"); 310 endp = walk(m, m->coldp, stop, gf, gl, false); 311 if (endp != NULL) 312 break; 313 assert(m->coldp < m->endp); 314 m->coldp += XMBRTOWC(NULL, m->coldp, 315 m->endp - m->coldp, &m->mbs, 0); 316 } 317 if (nmatch == 1 && !g->backrefs) 318 break; /* no further info needed */ 319 320 /* oh my, it wants the subexpressions... */ 321 if (m->pmatch == NULL) 322 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 323 sizeof (regmatch_t)); 324 if (m->pmatch == NULL) { 325 STATETEARDOWN(m); 326 return (REG_ESPACE); 327 } 328 for (i = 1; i <= m->g->nsub; i++) 329 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 330 if (!g->backrefs && !(m->eflags®_BACKR)) { 331 NOTE("dissecting"); 332 dp = dissect(m, m->coldp, endp, gf, gl); 333 } else { 334 if (g->nplus > 0 && m->lastpos == NULL) 335 m->lastpos = malloc((g->nplus+1) * 336 sizeof (const char *)); 337 if (g->nplus > 0 && m->lastpos == NULL) { 338 free(m->pmatch); 339 STATETEARDOWN(m); 340 return (REG_ESPACE); 341 } 342 NOTE("backref dissect"); 343 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 344 } 345 if (dp != NULL) 346 break; 347 348 /* uh-oh... we couldn't find a subexpression-level match */ 349 assert(g->backrefs); /* must be back references doing it */ 350 assert(g->nplus == 0 || m->lastpos != NULL); 351 for (;;) { 352 if (dp != NULL || endp <= m->coldp) 353 break; /* defeat */ 354 NOTE("backoff"); 355 endp = walk(m, m->coldp, endp-1, gf, gl, false); 356 if (endp == NULL) 357 break; /* defeat */ 358 /* try it on a shorter possibility */ 359 #ifndef NDEBUG 360 for (i = 1; i <= m->g->nsub; i++) { 361 assert(m->pmatch[i].rm_so == -1); 362 assert(m->pmatch[i].rm_eo == -1); 363 } 364 #endif 365 NOTE("backoff dissect"); 366 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 367 } 368 assert(dp == NULL || dp == endp); 369 if (dp != NULL) /* found a shorter one */ 370 break; 371 372 /* despite initial appearances, there is no match here */ 373 NOTE("false alarm"); 374 /* recycle starting later */ 375 start = m->coldp + XMBRTOWC(NULL, m->coldp, 376 stop - m->coldp, &m->mbs, 0); 377 assert(start <= stop); 378 } 379 380 /* fill in the details if requested */ 381 if (nmatch > 0) { 382 pmatch[0].rm_so = m->coldp - m->offp; 383 pmatch[0].rm_eo = endp - m->offp; 384 } 385 if (nmatch > 1) { 386 assert(m->pmatch != NULL); 387 for (i = 1; i < nmatch; i++) 388 if (i <= m->g->nsub) 389 pmatch[i] = m->pmatch[i]; 390 else { 391 pmatch[i].rm_so = -1; 392 pmatch[i].rm_eo = -1; 393 } 394 } 395 396 if (m->pmatch != NULL) 397 free((char *)m->pmatch); 398 if (m->lastpos != NULL) 399 free((char *)m->lastpos); 400 STATETEARDOWN(m); 401 return (0); 402 } 403 404 /* 405 * dissect - figure out what matched what, no back references 406 */ 407 static const char * 408 dissect(struct match *m, const char *start, const char *stop, sopno startst, 409 sopno stopst) 410 { 411 int i; 412 sopno ss; /* start sop of current subRE */ 413 sopno es; /* end sop of current subRE */ 414 const char *sp; /* start of string matched by it */ 415 const char *stp; /* string matched by it cannot pass here */ 416 const char *rest; /* start of rest of string */ 417 const char *tail; /* string unmatched by rest of RE */ 418 sopno ssub; /* start sop of subsubRE */ 419 sopno esub; /* end sop of subsubRE */ 420 const char *ssp; /* start of string matched by subsubRE */ 421 const char *sep; /* end of string matched by subsubRE */ 422 const char *oldssp; /* previous ssp */ 423 const char *dp __unused; 424 425 AT("diss", start, stop, startst, stopst); 426 sp = start; 427 for (ss = startst; ss < stopst; ss = es) { 428 /* identify end of subRE */ 429 es = ss; 430 switch (OP(m->g->strip[es])) { 431 case OPLUS_: 432 case OQUEST_: 433 es += OPND(m->g->strip[es]); 434 break; 435 case OCH_: 436 while (OP(m->g->strip[es]) != (sop)O_CH) 437 es += OPND(m->g->strip[es]); 438 break; 439 } 440 es++; 441 442 /* figure out what it matched */ 443 switch (OP(m->g->strip[ss])) { 444 case OEND: 445 assert(0); 446 break; 447 case OCHAR: 448 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 449 break; 450 case OBOL: 451 case OEOL: 452 case OBOW: 453 case OEOW: 454 break; 455 case OANY: 456 case OANYOF: 457 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 458 break; 459 case OBACK_: 460 case O_BACK: 461 assert(0); 462 break; 463 /* cases where length of match is hard to find */ 464 case OQUEST_: 465 stp = stop; 466 for (;;) { 467 /* how long could this one be? */ 468 rest = walk(m, sp, stp, ss, es, false); 469 assert(rest != NULL); /* it did match */ 470 /* could the rest match the rest? */ 471 tail = walk(m, rest, stop, es, stopst, false); 472 if (tail == stop) 473 break; /* yes! */ 474 /* no -- try a shorter match for this one */ 475 stp = rest - 1; 476 assert(stp >= sp); /* it did work */ 477 } 478 ssub = ss + 1; 479 esub = es - 1; 480 /* did innards match? */ 481 if (walk(m, sp, rest, ssub, esub, false) != NULL) { 482 dp = dissect(m, sp, rest, ssub, esub); 483 assert(dp == rest); 484 } else /* no */ 485 assert(sp == rest); 486 sp = rest; 487 break; 488 case OPLUS_: 489 stp = stop; 490 for (;;) { 491 /* how long could this one be? */ 492 rest = walk(m, sp, stp, ss, es, false); 493 assert(rest != NULL); /* it did match */ 494 /* could the rest match the rest? */ 495 tail = walk(m, rest, stop, es, stopst, false); 496 if (tail == stop) 497 break; /* yes! */ 498 /* no -- try a shorter match for this one */ 499 stp = rest - 1; 500 assert(stp >= sp); /* it did work */ 501 } 502 ssub = ss + 1; 503 esub = es - 1; 504 ssp = sp; 505 oldssp = ssp; 506 for (;;) { /* find last match of innards */ 507 sep = walk(m, ssp, rest, ssub, esub, false); 508 if (sep == NULL || sep == ssp) 509 break; /* failed or matched null */ 510 oldssp = ssp; /* on to next try */ 511 ssp = sep; 512 } 513 if (sep == NULL) { 514 /* last successful match */ 515 sep = ssp; 516 ssp = oldssp; 517 } 518 assert(sep == rest); /* must exhaust substring */ 519 assert(walk(m, ssp, sep, ssub, esub, false) == rest); 520 dp = dissect(m, ssp, sep, ssub, esub); 521 assert(dp == sep); 522 sp = rest; 523 break; 524 case OCH_: 525 stp = stop; 526 for (;;) { 527 /* how long could this one be? */ 528 rest = walk(m, sp, stp, ss, es, false); 529 assert(rest != NULL); /* it did match */ 530 /* could the rest match the rest? */ 531 tail = walk(m, rest, stop, es, stopst, false); 532 if (tail == stop) 533 break; /* yes! */ 534 /* no -- try a shorter match for this one */ 535 stp = rest - 1; 536 assert(stp >= sp); /* it did work */ 537 } 538 ssub = ss + 1; 539 esub = ss + OPND(m->g->strip[ss]) - 1; 540 assert(OP(m->g->strip[esub]) == OOR1); 541 for (;;) { /* find first matching branch */ 542 if (walk(m, sp, rest, ssub, esub, 543 false) == rest) 544 break; /* it matched all of it */ 545 /* that one missed, try next one */ 546 assert(OP(m->g->strip[esub]) == OOR1); 547 esub++; 548 assert(OP(m->g->strip[esub]) == OOR2); 549 ssub = esub + 1; 550 esub += OPND(m->g->strip[esub]); 551 if (OP(m->g->strip[esub]) == (sop)OOR2) 552 esub--; 553 else 554 assert(OP(m->g->strip[esub]) == O_CH); 555 } 556 dp = dissect(m, sp, rest, ssub, esub); 557 assert(dp == rest); 558 sp = rest; 559 break; 560 case O_PLUS: 561 case O_QUEST: 562 case OOR1: 563 case OOR2: 564 case O_CH: 565 assert(0); 566 break; 567 case OLPAREN: 568 i = OPND(m->g->strip[ss]); 569 assert(0 < i && i <= m->g->nsub); 570 m->pmatch[i].rm_so = sp - m->offp; 571 break; 572 case ORPAREN: 573 i = OPND(m->g->strip[ss]); 574 assert(0 < i && i <= m->g->nsub); 575 m->pmatch[i].rm_eo = sp - m->offp; 576 break; 577 default: /* uh oh */ 578 assert(0); 579 break; 580 } 581 } 582 583 assert(sp == stop); 584 return (sp); 585 } 586 587 /* 588 * backref - figure out what matched what, figuring in back references 589 */ 590 static const char * 591 backref(struct match *m, const char *start, const char *stop, sopno startst, 592 sopno stopst, sopno lev, /* PLUS nesting level */ 593 int rec) 594 { 595 int i; 596 sopno ss; /* start sop of current subRE */ 597 const char *sp; /* start of string matched by it */ 598 sopno ssub; /* start sop of subsubRE */ 599 sopno esub; /* end sop of subsubRE */ 600 const char *ssp; /* start of string matched by subsubRE */ 601 const char *dp; 602 size_t len; 603 int hard; 604 sop s; 605 regoff_t offsave; 606 cset *cs; 607 wint_t wc; 608 609 AT("back", start, stop, startst, stopst); 610 sp = start; 611 612 /* get as far as we can with easy stuff */ 613 hard = 0; 614 for (ss = startst; !hard && ss < stopst; ss++) 615 switch (OP(s = m->g->strip[ss])) { 616 case OCHAR: 617 if (sp == stop) 618 return (NULL); 619 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 620 if (wc != OPND(s)) 621 return (NULL); 622 break; 623 case OANY: 624 if (sp == stop) 625 return (NULL); 626 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 627 if (wc == BADCHAR) 628 return (NULL); 629 break; 630 case OANYOF: 631 if (sp == stop) 632 return (NULL); 633 cs = &m->g->sets[OPND(s)]; 634 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 635 if (wc == BADCHAR || !CHIN(cs, wc)) 636 return (NULL); 637 break; 638 case OBOL: 639 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 640 (sp > m->offp && sp < m->endp && 641 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) { 642 break; 643 } 644 return (NULL); 645 case OEOL: 646 if ((sp == m->endp && !(m->eflags®_NOTEOL)) || 647 (sp < m->endp && *sp == '\n' && 648 (m->g->cflags®_NEWLINE))) { 649 break; 650 } 651 return (NULL); 652 case OBOW: 653 if (sp < m->endp && ISWORD(*sp) && 654 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 655 (sp > m->offp && !ISWORD(*(sp-1))))) { 656 break; 657 } 658 return (NULL); 659 case OEOW: 660 if (((sp == m->endp && !(m->eflags®_NOTEOL)) || 661 (sp < m->endp && *sp == '\n' && 662 (m->g->cflags®_NEWLINE)) || 663 (sp < m->endp && !ISWORD(*sp))) && 664 (sp > m->beginp && ISWORD(*(sp-1)))) { 665 break; 666 } 667 return (NULL); 668 case O_QUEST: 669 break; 670 case OOR1: /* matches null but needs to skip */ 671 ss++; 672 s = m->g->strip[ss]; 673 do { 674 assert(OP(s) == OOR2); 675 ss += OPND(s); 676 } while (OP(s = m->g->strip[ss]) != (sop)O_CH); 677 /* note that the ss++ gets us past the O_CH */ 678 break; 679 default: /* have to make a choice */ 680 hard = 1; 681 break; 682 } 683 if (!hard) { /* that was it! */ 684 if (sp != stop) 685 return (NULL); 686 return (sp); 687 } 688 ss--; /* adjust for the for's final increment */ 689 690 /* the hard stuff */ 691 AT("hard", sp, stop, ss, stopst); 692 s = m->g->strip[ss]; 693 switch (OP(s)) { 694 case OBACK_: /* the vilest depths */ 695 i = OPND(s); 696 assert(0 < i && i <= m->g->nsub); 697 if (m->pmatch[i].rm_eo == -1) 698 return (NULL); 699 assert(m->pmatch[i].rm_so != -1); 700 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 701 if (len == 0 && rec++ > MAX_RECURSION) 702 return (NULL); 703 assert(stop - m->beginp >= len); 704 if (sp > stop - len) 705 return (NULL); /* not enough left to match */ 706 ssp = m->offp + m->pmatch[i].rm_so; 707 if (memcmp(sp, ssp, len) != 0) 708 return (NULL); 709 while (m->g->strip[ss] != (sop)SOP(O_BACK, i)) 710 ss++; 711 return (backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 712 case OQUEST_: /* to null or not */ 713 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 714 if (dp != NULL) 715 return (dp); /* not */ 716 return (backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 717 case OPLUS_: 718 assert(m->lastpos != NULL); 719 assert(lev+1 <= m->g->nplus); 720 m->lastpos[lev+1] = sp; 721 return (backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 722 case O_PLUS: 723 if (sp == m->lastpos[lev]) /* last pass matched null */ 724 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 725 /* try another pass */ 726 m->lastpos[lev] = sp; 727 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 728 if (dp == NULL) 729 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 730 return (dp); 731 case OCH_: /* find the right one, if any */ 732 ssub = ss + 1; 733 esub = ss + OPND(s) - 1; 734 assert(OP(m->g->strip[esub]) == OOR1); 735 for (;;) { /* find first matching branch */ 736 dp = backref(m, sp, stop, ssub, esub, lev, rec); 737 if (dp != NULL) 738 return (dp); 739 /* that one missed, try next one */ 740 if (OP(m->g->strip[esub]) == (sop)O_CH) 741 return (NULL); /* there is none */ 742 esub++; 743 assert(OP(m->g->strip[esub]) == (sop)OOR2); 744 ssub = esub + 1; 745 esub += OPND(m->g->strip[esub]); 746 if (OP(m->g->strip[esub]) == (sop)OOR2) 747 esub--; 748 else 749 assert(OP(m->g->strip[esub]) == O_CH); 750 } 751 /* NOTREACHED */ 752 break; 753 case OLPAREN: /* must undo assignment if rest fails */ 754 i = OPND(s); 755 assert(0 < i && i <= m->g->nsub); 756 offsave = m->pmatch[i].rm_so; 757 m->pmatch[i].rm_so = sp - m->offp; 758 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 759 if (dp != NULL) 760 return (dp); 761 m->pmatch[i].rm_so = offsave; 762 return (NULL); 763 case ORPAREN: /* must undo assignment if rest fails */ 764 i = OPND(s); 765 assert(0 < i && i <= m->g->nsub); 766 offsave = m->pmatch[i].rm_eo; 767 m->pmatch[i].rm_eo = sp - m->offp; 768 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 769 if (dp != NULL) 770 return (dp); 771 m->pmatch[i].rm_eo = offsave; 772 return (NULL); 773 default: /* uh oh */ 774 assert(0); 775 break; 776 } 777 778 /* "can't happen" */ 779 assert(0); 780 return (NULL); 781 } 782 783 /* 784 * Step through the string either quickly or slowly. Returns where it ended 785 * or NULL. 786 */ 787 static const char * 788 walk(struct match *m, const char *start, const char *stop, sopno startst, 789 sopno stopst, bool fast) 790 { 791 states st = m->st; 792 states fresh = m->fresh; 793 states empty = m->empty; 794 states tmp = m->tmp; 795 const char *p = start; 796 wint_t c; 797 wint_t lastc; /* previous c */ 798 wint_t flagch; 799 int i; 800 const char *matchp; /* last p at which a match ended */ 801 size_t clen; 802 803 AT("slow", start, stop, startst, stopst); 804 CLEAR(st); 805 SET1(st, startst); 806 SP("sstart", st, *p); 807 st = step(m->g, startst, stopst, st, NOTHING, st); 808 if (fast) 809 ASSIGN(fresh, st); 810 matchp = NULL; 811 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 812 c = OUT; 813 else { 814 /* 815 * XXX Wrong if the previous character was multi-byte. 816 * Newline never is (in encodings supported by FreeBSD), 817 * so this only breaks the ISWORD tests below. 818 */ 819 c = (uch)*(start - 1); 820 } 821 for (;;) { 822 /* next character */ 823 lastc = c; 824 if (p == m->endp) { 825 c = OUT; 826 clen = 0; 827 } else 828 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 829 830 if (fast && EQ(st, fresh)) 831 matchp = p; 832 833 /* is there an EOL and/or BOL between lastc and c? */ 834 flagch = '\0'; 835 i = 0; 836 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 837 (lastc == OUT && !(m->eflags®_NOTBOL))) { 838 flagch = BOL; 839 i = m->g->nbol; 840 } 841 if ((c == '\n' && m->g->cflags®_NEWLINE) || 842 (c == OUT && !(m->eflags®_NOTEOL))) { 843 flagch = (flagch == BOL) ? BOLEOL : EOL; 844 i += m->g->neol; 845 } 846 if (i != 0) { 847 for (; i > 0; i--) 848 st = step(m->g, startst, stopst, st, 849 flagch, st); 850 SP("sboleol", st, c); 851 } 852 853 /* how about a word boundary? */ 854 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 855 (c != OUT && ISWORD(c))) { 856 flagch = BOW; 857 } 858 if ((lastc != OUT && ISWORD(lastc)) && 859 (flagch == EOL || (c != OUT && !ISWORD(c)))) { 860 flagch = EOW; 861 } 862 if (flagch == BOW || flagch == EOW) { 863 st = step(m->g, startst, stopst, st, flagch, st); 864 SP("sboweow", st, c); 865 } 866 867 /* are we done? */ 868 if (ISSET(st, stopst)) { 869 if (fast) 870 break; 871 else 872 matchp = p; 873 } 874 if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p)) 875 break; /* NOTE BREAK OUT */ 876 877 /* no, we must deal with this character */ 878 ASSIGN(tmp, st); 879 if (fast) 880 ASSIGN(st, fresh); 881 else 882 ASSIGN(st, empty); 883 assert(c != OUT); 884 st = step(m->g, startst, stopst, tmp, c, st); 885 SP("saft", st, c); 886 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 887 p += clen; 888 } 889 890 if (fast) { 891 assert(matchp != NULL); 892 m->coldp = matchp; 893 if (ISSET(st, stopst)) 894 return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 895 else 896 return (NULL); 897 } else 898 return (matchp); 899 } 900 901 /* 902 * step - map set of states reachable before char to set reachable after 903 */ 904 static states 905 step(struct re_guts *g, 906 sopno start, /* start state within strip */ 907 sopno stop, /* state after stop state within strip */ 908 states bef, /* states reachable before */ 909 wint_t ch, /* character or NONCHAR code */ 910 states aft) /* states already known reachable after */ 911 { 912 cset *cs; 913 sop s; 914 sopno pc; 915 onestate here; /* note, macros know this name */ 916 sopno look; 917 int i; 918 919 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 920 s = g->strip[pc]; 921 switch (OP(s)) { 922 case OEND: 923 assert(pc == stop-1); 924 break; 925 case OCHAR: 926 /* only characters can match */ 927 assert(!NONCHAR(ch) || ch != OPND(s)); 928 if (ch == OPND(s)) 929 FWD(aft, bef, 1); 930 break; 931 case OBOL: 932 if (ch == BOL || ch == BOLEOL) 933 FWD(aft, bef, 1); 934 break; 935 case OEOL: 936 if (ch == EOL || ch == BOLEOL) 937 FWD(aft, bef, 1); 938 break; 939 case OBOW: 940 if (ch == BOW) 941 FWD(aft, bef, 1); 942 break; 943 case OEOW: 944 if (ch == EOW) 945 FWD(aft, bef, 1); 946 break; 947 case OANY: 948 if (!NONCHAR(ch)) 949 FWD(aft, bef, 1); 950 break; 951 case OANYOF: 952 cs = &g->sets[OPND(s)]; 953 if (!NONCHAR(ch) && CHIN(cs, ch)) 954 FWD(aft, bef, 1); 955 break; 956 case OBACK_: /* ignored here */ 957 case O_BACK: 958 FWD(aft, aft, 1); 959 break; 960 case OPLUS_: /* forward, this is just an empty */ 961 FWD(aft, aft, 1); 962 break; 963 case O_PLUS: /* both forward and back */ 964 FWD(aft, aft, 1); 965 i = ISSETBACK(aft, OPND(s)); 966 BACK(aft, aft, OPND(s)); 967 if (!i && ISSETBACK(aft, OPND(s))) { 968 /* oho, must reconsider loop body */ 969 pc -= OPND(s) + 1; 970 INIT(here, pc); 971 } 972 break; 973 case OQUEST_: /* two branches, both forward */ 974 FWD(aft, aft, 1); 975 FWD(aft, aft, OPND(s)); 976 break; 977 case O_QUEST: /* just an empty */ 978 FWD(aft, aft, 1); 979 break; 980 case OLPAREN: /* not significant here */ 981 case ORPAREN: 982 FWD(aft, aft, 1); 983 break; 984 case OCH_: /* mark the first two branches */ 985 FWD(aft, aft, 1); 986 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 987 FWD(aft, aft, OPND(s)); 988 break; 989 case OOR1: /* done a branch, find the O_CH */ 990 if (ISSTATEIN(aft, here)) { 991 for (look = 1; 992 OP(s = g->strip[pc+look]) != (sop)O_CH; 993 look += OPND(s)) 994 assert(OP(s) == (sop)OOR2); 995 FWD(aft, aft, look + 1); 996 } 997 break; 998 case OOR2: /* propagate OCH_'s marking */ 999 FWD(aft, aft, 1); 1000 if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) { 1001 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 1002 FWD(aft, aft, OPND(s)); 1003 } 1004 break; 1005 case O_CH: /* just empty */ 1006 FWD(aft, aft, 1); 1007 break; 1008 default: /* ooooops... */ 1009 assert(0); 1010 break; 1011 } 1012 } 1013 1014 return (aft); 1015 } 1016 1017 #ifdef REDEBUG 1018 /* 1019 * print - print a set of states 1020 */ 1021 static void 1022 print(struct match *m, const char *caption, states st, int ch, FILE *d) 1023 { 1024 struct re_guts *g = m->g; 1025 sopno i; 1026 int first = 1; 1027 1028 if (!(m->eflags®_TRACE)) 1029 return; 1030 1031 (void) fprintf(d, "%s", caption); 1032 if (ch != '\0') 1033 (void) fprintf(d, " %s", pchar(ch)); 1034 for (i = 0; i < g->nstates; i++) 1035 if (ISSET(st, i)) { 1036 (void) fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1037 first = 0; 1038 } 1039 (void) fprintf(d, "\n"); 1040 } 1041 1042 /* 1043 * at - print current situation 1044 */ 1045 static void 1046 at(struct match *m, const char *title, const char *start, const char *stop, 1047 sopno startst, sopno stopst) 1048 { 1049 if (!(m->eflags®_TRACE)) 1050 return; 1051 1052 (void) printf("%s %s-", title, pchar(*start)); 1053 (void) printf("%s ", pchar(*stop)); 1054 (void) printf("%ld-%ld\n", (long)startst, (long)stopst); 1055 } 1056 1057 #ifndef PCHARDONE 1058 #define PCHARDONE /* never again */ 1059 /* 1060 * pchar - make a character printable 1061 * 1062 * Is this identical to regchar() over in debug.c? Well, yes. But a 1063 * duplicate here avoids having a debugging-capable regexec.o tied to 1064 * a matching debug.o, and this is convenient. It all disappears in 1065 * the non-debug compilation anyway, so it doesn't matter much. 1066 */ 1067 static const char * 1068 pchar(int ch) 1069 { 1070 static char pbuf[10]; 1071 1072 if (isprint((uch)ch) || ch == ' ') 1073 (void) sprintf(pbuf, "%c", ch); 1074 else 1075 (void) sprintf(pbuf, "\\%o", ch); 1076 return (pbuf); 1077 } 1078 #endif 1079 #endif 1080 1081 #undef stepback 1082 #undef matcher 1083 #undef walk 1084 #undef dissect 1085 #undef backref 1086 #undef step 1087 #undef print 1088 #undef at 1089 #undef match 1090