1 /* 2 * Copyright 2023 Bill Sommerfeld <sommerfeld@hamachi.org> 3 * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. 4 * Copyright 2012 Milan Jurik. All rights reserved. 5 * Copyright (c) 2016 by Delphix. All rights reserved. 6 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 7 * Copyright (c) 1992, 1993, 1994 8 * The Regents of the University of California. All rights reserved. 9 * 10 * This code is derived from software contributed to Berkeley by 11 * Henry Spencer. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 */ 37 38 #include <stdbool.h> 39 40 /* 41 * The matching engine and friends. This file is #included by regexec.c 42 * after suitable #defines of a variety of macros used herein, so that 43 * different state representations can be used without duplicating masses 44 * of code. 45 */ 46 47 #ifdef SNAMES 48 #define stepback sstepback 49 #define matcher smatcher 50 #define walk swalk 51 #define dissect sdissect 52 #define backref sbackref 53 #define step sstep 54 #define print sprint 55 #define at sat 56 #define match smat 57 #endif 58 #ifdef LNAMES 59 #define stepback lstepback 60 #define matcher lmatcher 61 #define walk lwalk 62 #define dissect ldissect 63 #define backref lbackref 64 #define step lstep 65 #define print lprint 66 #define at lat 67 #define match lmat 68 #endif 69 #ifdef MNAMES 70 #define stepback mstepback 71 #define matcher mmatcher 72 #define walk mwalk 73 #define dissect mdissect 74 #define backref mbackref 75 #define step mstep 76 #define print mprint 77 #define at mat 78 #define match mmat 79 #endif 80 81 /* another structure passed up and down to avoid zillions of parameters */ 82 struct match { 83 struct re_guts *g; 84 int eflags; 85 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 86 const char *offp; /* offsets work from here */ 87 const char *beginp; /* start of string -- virtual NUL precedes */ 88 const char *endp; /* end of string -- virtual NUL here */ 89 const char *coldp; /* can be no match starting before here */ 90 const char **lastpos; /* [nplus+1] */ 91 STATEVARS; 92 states st; /* current states */ 93 states fresh; /* states for a fresh start */ 94 states tmp; /* temporary */ 95 states empty; /* empty set of states */ 96 mbstate_t mbs; /* multibyte conversion state */ 97 }; 98 99 /* ========= begin header generated by ./mkh ========= */ 100 #ifdef __cplusplus 101 extern "C" { 102 #endif 103 104 /* === engine.c === */ 105 static int matcher(struct re_guts *, const char *, size_t, regmatch_t[], int); 106 static const char *dissect(struct match *, const char *, const char *, 107 sopno, sopno); 108 static const char *backref(struct match *, const char *, const char *, sopno, 109 sopno, sopno, int); 110 static const char *walk(struct match *m, const char *start, const char *stop, 111 sopno startst, sopno stopst, bool fast); 112 static states step(struct re_guts *, sopno, sopno, states, wint_t, states); 113 #define MAX_RECURSION 100 114 #define BOL (OUT-1) 115 #define EOL (BOL-1) 116 #define BOLEOL (BOL-2) 117 #define NOTHING (BOL-3) 118 #define BOW (BOL-4) 119 #define EOW (BOL-5) 120 #define BADCHAR (BOL-6) 121 #define NONCHAR(c) ((c) <= OUT) 122 #ifdef REDEBUG 123 static void print(struct match *, const char *, states, int, FILE *); 124 #endif 125 #ifdef REDEBUG 126 static void at(struct match *, const char *, const char *, const char *, 127 sopno, sopno); 128 #endif 129 #ifdef REDEBUG 130 static const char *pchar(int ch); 131 #endif 132 133 #ifdef __cplusplus 134 } 135 #endif 136 /* ========= end header generated by ./mkh ========= */ 137 138 #ifdef REDEBUG 139 #define SP(t, s, c) print(m, t, s, c, stdout) 140 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 141 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 142 #else 143 #define SP(t, s, c) /* nothing */ 144 #define AT(t, p1, p2, s1, s2) /* nothing */ 145 #define NOTE(s) /* nothing */ 146 #endif 147 148 /* 149 * Given a multibyte string pointed to by start, step back nchar characters 150 * from current position pointed to by cur. 151 */ 152 static const char * 153 stepback(const char *start, const char *cur, int nchar, unsigned int max) 154 { 155 const char *ret; 156 int wc, mbc; 157 mbstate_t mbs; 158 size_t clen; 159 160 if (max == 1) 161 return ((cur - nchar) > start ? cur - nchar : NULL); 162 163 ret = cur; 164 for (wc = nchar; wc > 0; wc--) { 165 for (mbc = 1; mbc <= max; mbc++) { 166 if ((ret - mbc) < start) 167 return (NULL); 168 memset(&mbs, 0, sizeof (mbs)); 169 clen = mbrtowc(NULL, ret - mbc, mbc, &mbs); 170 if (clen != (size_t)-1 && clen != (size_t)-2) 171 break; 172 } 173 if (mbc > max) 174 return (NULL); 175 ret -= mbc; 176 } 177 178 return (ret); 179 } 180 181 /* 182 * matcher - the actual matching engine 183 */ 184 static int /* 0 success, REG_NOMATCH failure */ 185 matcher(struct re_guts *g, const char *string, size_t nmatch, 186 regmatch_t pmatch[], int eflags) 187 { 188 const char *endp; 189 size_t i; 190 struct match mv; 191 struct match *m = &mv; 192 const char *dp = NULL; 193 const sopno gf = g->firststate+1; /* +1 for OEND */ 194 const sopno gl = g->laststate; 195 const char *start; 196 const char *stop; 197 /* Boyer-Moore algorithms variables */ 198 const char *pp; 199 int cj, mj; 200 const char *mustfirst; 201 const char *mustlast; 202 int *matchjump; 203 int *charjump; 204 205 /* simplify the situation where possible */ 206 if (g->cflags®_NOSUB) 207 nmatch = 0; 208 209 if (eflags®_STARTEND) { 210 start = string + pmatch[0].rm_so; 211 stop = string + pmatch[0].rm_eo; 212 } else { 213 start = string; 214 stop = start + strlen(start); 215 } 216 217 if (stop < start) 218 return (REG_EFATAL); 219 220 /* prescreening; this does wonders for this rather slow code */ 221 if (g->must != NULL) { 222 if (g->charjump != NULL && g->matchjump != NULL) { 223 mustfirst = g->must; 224 mustlast = g->must + g->mlen - 1; 225 charjump = g->charjump; 226 matchjump = g->matchjump; 227 pp = mustlast; 228 for (dp = start+g->mlen-1; dp < stop; ) { 229 /* Fast skip non-matches */ 230 while (dp < stop && charjump[(int)*dp]) 231 dp += charjump[(int)*dp]; 232 233 if (dp >= stop) 234 break; 235 236 /* Greedy matcher */ 237 /* 238 * We depend on not being used for 239 * for strings of length 1 240 */ 241 while (*--dp == *--pp && pp != mustfirst) 242 ; 243 244 if (*dp == *pp) 245 break; 246 247 /* Jump to next possible match */ 248 mj = matchjump[pp - mustfirst]; 249 cj = charjump[(int)*dp]; 250 dp += (cj < mj ? mj : cj); 251 pp = mustlast; 252 } 253 if (pp != mustfirst) 254 return (REG_NOMATCH); 255 } else { 256 for (dp = start; dp < stop; dp++) 257 if (*dp == g->must[0] && 258 stop - dp >= g->mlen && 259 memcmp(dp, g->must, (size_t)g->mlen) == 0) 260 break; 261 if (dp == stop) /* we didn't find g->must */ 262 return (REG_NOMATCH); 263 } 264 } 265 266 /* match struct setup */ 267 m->g = g; 268 m->eflags = eflags; 269 m->pmatch = NULL; 270 m->lastpos = NULL; 271 m->offp = string; 272 m->beginp = start; 273 m->endp = stop; 274 STATESETUP(m, 4); 275 SETUP(m->st); 276 SETUP(m->fresh); 277 SETUP(m->tmp); 278 SETUP(m->empty); 279 CLEAR(m->empty); 280 ZAPSTATE(&m->mbs); 281 282 /* Adjust start according to moffset, to speed things up */ 283 if (dp != NULL && g->moffset > -1) { 284 const char *nstart; 285 286 nstart = stepback(start, dp, g->moffset, g->mb_cur_max); 287 if (nstart != NULL) 288 start = nstart; 289 } 290 291 SP("mloop", m->st, *start); 292 293 /* this loop does only one repetition except for backrefs */ 294 for (;;) { 295 endp = walk(m, start, stop, gf, gl, true); 296 if (endp == NULL) { /* a miss */ 297 if (m->pmatch != NULL) 298 free((char *)m->pmatch); 299 if (m->lastpos != NULL) 300 free((char *)m->lastpos); 301 STATETEARDOWN(m); 302 return (REG_NOMATCH); 303 } 304 if (nmatch == 0 && !g->backrefs) 305 break; /* no further info needed */ 306 307 /* where? */ 308 assert(m->coldp != NULL); 309 for (;;) { 310 NOTE("finding start"); 311 endp = walk(m, m->coldp, stop, gf, gl, false); 312 if (endp != NULL) 313 break; 314 assert(m->coldp < m->endp); 315 m->coldp += XMBRTOWC(NULL, m->coldp, 316 m->endp - m->coldp, &m->mbs, 0); 317 } 318 if (nmatch == 1 && !g->backrefs) 319 break; /* no further info needed */ 320 321 /* oh my, it wants the subexpressions... */ 322 if (m->pmatch == NULL) 323 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 324 sizeof (regmatch_t)); 325 if (m->pmatch == NULL) { 326 STATETEARDOWN(m); 327 return (REG_ESPACE); 328 } 329 for (i = 1; i <= m->g->nsub; i++) 330 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 331 if (!g->backrefs && !(m->eflags®_BACKR)) { 332 NOTE("dissecting"); 333 dp = dissect(m, m->coldp, endp, gf, gl); 334 } else { 335 if (g->nplus > 0 && m->lastpos == NULL) 336 m->lastpos = malloc((g->nplus+1) * 337 sizeof (const char *)); 338 if (g->nplus > 0 && m->lastpos == NULL) { 339 free(m->pmatch); 340 STATETEARDOWN(m); 341 return (REG_ESPACE); 342 } 343 NOTE("backref dissect"); 344 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 345 } 346 if (dp != NULL) 347 break; 348 349 /* uh-oh... we couldn't find a subexpression-level match */ 350 assert(g->backrefs); /* must be back references doing it */ 351 assert(g->nplus == 0 || m->lastpos != NULL); 352 for (;;) { 353 if (dp != NULL || endp <= m->coldp) 354 break; /* defeat */ 355 NOTE("backoff"); 356 endp = walk(m, m->coldp, endp-1, gf, gl, false); 357 if (endp == NULL) 358 break; /* defeat */ 359 /* try it on a shorter possibility */ 360 #ifndef NDEBUG 361 for (i = 1; i <= m->g->nsub; i++) { 362 assert(m->pmatch[i].rm_so == -1); 363 assert(m->pmatch[i].rm_eo == -1); 364 } 365 #endif 366 NOTE("backoff dissect"); 367 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 368 } 369 assert(dp == NULL || dp == endp); 370 if (dp != NULL) /* found a shorter one */ 371 break; 372 373 /* despite initial appearances, there is no match here */ 374 NOTE("false alarm"); 375 /* recycle starting later */ 376 start = m->coldp + XMBRTOWC(NULL, m->coldp, 377 stop - m->coldp, &m->mbs, 0); 378 assert(start <= stop); 379 } 380 381 /* fill in the details if requested */ 382 if (nmatch > 0) { 383 pmatch[0].rm_so = m->coldp - m->offp; 384 pmatch[0].rm_eo = endp - m->offp; 385 } 386 if (nmatch > 1) { 387 assert(m->pmatch != NULL); 388 for (i = 1; i < nmatch; i++) 389 if (i <= m->g->nsub) 390 pmatch[i] = m->pmatch[i]; 391 else { 392 pmatch[i].rm_so = -1; 393 pmatch[i].rm_eo = -1; 394 } 395 } 396 397 if (m->pmatch != NULL) 398 free((char *)m->pmatch); 399 if (m->lastpos != NULL) 400 free((char *)m->lastpos); 401 STATETEARDOWN(m); 402 return (0); 403 } 404 405 /* 406 * dissect - figure out what matched what, no back references 407 */ 408 static const char * 409 dissect(struct match *m, const char *start, const char *stop, sopno startst, 410 sopno stopst) 411 { 412 int i; 413 sopno ss; /* start sop of current subRE */ 414 sopno es; /* end sop of current subRE */ 415 const char *sp; /* start of string matched by it */ 416 const char *stp; /* string matched by it cannot pass here */ 417 const char *rest; /* start of rest of string */ 418 const char *tail; /* string unmatched by rest of RE */ 419 sopno ssub; /* start sop of subsubRE */ 420 sopno esub; /* end sop of subsubRE */ 421 const char *ssp; /* start of string matched by subsubRE */ 422 const char *sep; /* end of string matched by subsubRE */ 423 const char *oldssp; /* previous ssp */ 424 const char *dp __unused; 425 426 AT("diss", start, stop, startst, stopst); 427 sp = start; 428 for (ss = startst; ss < stopst; ss = es) { 429 /* identify end of subRE */ 430 es = ss; 431 switch (OP(m->g->strip[es])) { 432 case OPLUS_: 433 case OQUEST_: 434 es += OPND(m->g->strip[es]); 435 break; 436 case OCH_: 437 while (OP(m->g->strip[es]) != (sop)O_CH) 438 es += OPND(m->g->strip[es]); 439 break; 440 } 441 es++; 442 443 /* figure out what it matched */ 444 switch (OP(m->g->strip[ss])) { 445 case OEND: 446 assert(0); 447 break; 448 case OCHAR: 449 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 450 break; 451 case OBOL: 452 case OEOL: 453 case OBOW: 454 case OEOW: 455 break; 456 case OANY: 457 case OANYOF: 458 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 459 break; 460 case OBACK_: 461 case O_BACK: 462 assert(0); 463 break; 464 /* cases where length of match is hard to find */ 465 case OQUEST_: 466 stp = stop; 467 for (;;) { 468 /* how long could this one be? */ 469 rest = walk(m, sp, stp, ss, es, false); 470 assert(rest != NULL); /* it did match */ 471 /* could the rest match the rest? */ 472 tail = walk(m, rest, stop, es, stopst, false); 473 if (tail == stop) 474 break; /* yes! */ 475 /* no -- try a shorter match for this one */ 476 stp = rest - 1; 477 assert(stp >= sp); /* it did work */ 478 } 479 ssub = ss + 1; 480 esub = es - 1; 481 /* did innards match? */ 482 if (walk(m, sp, rest, ssub, esub, false) != NULL) { 483 dp = dissect(m, sp, rest, ssub, esub); 484 assert(dp == rest); 485 } else /* no */ 486 assert(sp == rest); 487 sp = rest; 488 break; 489 case OPLUS_: 490 stp = stop; 491 for (;;) { 492 /* how long could this one be? */ 493 rest = walk(m, sp, stp, ss, es, false); 494 assert(rest != NULL); /* it did match */ 495 /* could the rest match the rest? */ 496 tail = walk(m, rest, stop, es, stopst, false); 497 if (tail == stop) 498 break; /* yes! */ 499 /* no -- try a shorter match for this one */ 500 stp = rest - 1; 501 assert(stp >= sp); /* it did work */ 502 } 503 ssub = ss + 1; 504 esub = es - 1; 505 ssp = sp; 506 oldssp = ssp; 507 for (;;) { /* find last match of innards */ 508 sep = walk(m, ssp, rest, ssub, esub, false); 509 if (sep == NULL || sep == ssp) 510 break; /* failed or matched null */ 511 oldssp = ssp; /* on to next try */ 512 ssp = sep; 513 } 514 if (sep == NULL) { 515 /* last successful match */ 516 sep = ssp; 517 ssp = oldssp; 518 } 519 assert(sep == rest); /* must exhaust substring */ 520 assert(walk(m, ssp, sep, ssub, esub, false) == rest); 521 dp = dissect(m, ssp, sep, ssub, esub); 522 assert(dp == sep); 523 sp = rest; 524 break; 525 case OCH_: 526 stp = stop; 527 for (;;) { 528 /* how long could this one be? */ 529 rest = walk(m, sp, stp, ss, es, false); 530 assert(rest != NULL); /* it did match */ 531 /* could the rest match the rest? */ 532 tail = walk(m, rest, stop, es, stopst, false); 533 if (tail == stop) 534 break; /* yes! */ 535 /* no -- try a shorter match for this one */ 536 stp = rest - 1; 537 assert(stp >= sp); /* it did work */ 538 } 539 ssub = ss + 1; 540 esub = ss + OPND(m->g->strip[ss]) - 1; 541 assert(OP(m->g->strip[esub]) == OOR1); 542 for (;;) { /* find first matching branch */ 543 if (walk(m, sp, rest, ssub, esub, 544 false) == rest) 545 break; /* it matched all of it */ 546 /* that one missed, try next one */ 547 assert(OP(m->g->strip[esub]) == OOR1); 548 esub++; 549 assert(OP(m->g->strip[esub]) == OOR2); 550 ssub = esub + 1; 551 esub += OPND(m->g->strip[esub]); 552 if (OP(m->g->strip[esub]) == (sop)OOR2) 553 esub--; 554 else 555 assert(OP(m->g->strip[esub]) == O_CH); 556 } 557 dp = dissect(m, sp, rest, ssub, esub); 558 assert(dp == rest); 559 sp = rest; 560 break; 561 case O_PLUS: 562 case O_QUEST: 563 case OOR1: 564 case OOR2: 565 case O_CH: 566 assert(0); 567 break; 568 case OLPAREN: 569 i = OPND(m->g->strip[ss]); 570 assert(0 < i && i <= m->g->nsub); 571 m->pmatch[i].rm_so = sp - m->offp; 572 break; 573 case ORPAREN: 574 i = OPND(m->g->strip[ss]); 575 assert(0 < i && i <= m->g->nsub); 576 m->pmatch[i].rm_eo = sp - m->offp; 577 break; 578 default: /* uh oh */ 579 assert(0); 580 break; 581 } 582 } 583 584 assert(sp == stop); 585 return (sp); 586 } 587 588 /* 589 * backref - figure out what matched what, figuring in back references 590 */ 591 static const char * 592 backref(struct match *m, const char *start, const char *stop, sopno startst, 593 sopno stopst, sopno lev, /* PLUS nesting level */ 594 int rec) 595 { 596 int i; 597 sopno ss; /* start sop of current subRE */ 598 const char *sp; /* start of string matched by it */ 599 sopno ssub; /* start sop of subsubRE */ 600 sopno esub; /* end sop of subsubRE */ 601 const char *ssp; /* start of string matched by subsubRE */ 602 const char *dp; 603 size_t len; 604 int hard; 605 sop s; 606 regoff_t offsave; 607 cset *cs; 608 wint_t wc; 609 610 AT("back", start, stop, startst, stopst); 611 sp = start; 612 613 /* get as far as we can with easy stuff */ 614 hard = 0; 615 for (ss = startst; !hard && ss < stopst; ss++) 616 switch (OP(s = m->g->strip[ss])) { 617 case OCHAR: 618 if (sp == stop) 619 return (NULL); 620 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 621 if (wc != OPND(s)) 622 return (NULL); 623 break; 624 case OANY: 625 if (sp == stop) 626 return (NULL); 627 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 628 if (wc == BADCHAR) 629 return (NULL); 630 break; 631 case OANYOF: 632 if (sp == stop) 633 return (NULL); 634 cs = &m->g->sets[OPND(s)]; 635 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 636 if (wc == BADCHAR || !CHIN(NC_ENGINE, cs, wc)) 637 return (NULL); 638 break; 639 case OBOL: 640 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 641 (sp > m->offp && sp < m->endp && 642 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) { 643 break; 644 } 645 return (NULL); 646 case OEOL: 647 if ((sp == m->endp && !(m->eflags®_NOTEOL)) || 648 (sp < m->endp && *sp == '\n' && 649 (m->g->cflags®_NEWLINE))) { 650 break; 651 } 652 return (NULL); 653 case OBOW: 654 if (sp < m->endp && ISWORD(*sp) && 655 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 656 (sp > m->offp && !ISWORD(*(sp-1))))) { 657 break; 658 } 659 return (NULL); 660 case OEOW: 661 if (((sp == m->endp && !(m->eflags®_NOTEOL)) || 662 (sp < m->endp && *sp == '\n' && 663 (m->g->cflags®_NEWLINE)) || 664 (sp < m->endp && !ISWORD(*sp))) && 665 (sp > m->beginp && ISWORD(*(sp-1)))) { 666 break; 667 } 668 return (NULL); 669 case O_QUEST: 670 break; 671 case OOR1: /* matches null but needs to skip */ 672 ss++; 673 s = m->g->strip[ss]; 674 do { 675 assert(OP(s) == OOR2); 676 ss += OPND(s); 677 } while (OP(s = m->g->strip[ss]) != (sop)O_CH); 678 /* note that the ss++ gets us past the O_CH */ 679 break; 680 default: /* have to make a choice */ 681 hard = 1; 682 break; 683 } 684 if (!hard) { /* that was it! */ 685 if (sp != stop) 686 return (NULL); 687 return (sp); 688 } 689 ss--; /* adjust for the for's final increment */ 690 691 /* the hard stuff */ 692 AT("hard", sp, stop, ss, stopst); 693 s = m->g->strip[ss]; 694 switch (OP(s)) { 695 case OBACK_: /* the vilest depths */ 696 i = OPND(s); 697 assert(0 < i && i <= m->g->nsub); 698 if (m->pmatch[i].rm_eo == -1) 699 return (NULL); 700 assert(m->pmatch[i].rm_so != -1); 701 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 702 if (len == 0 && rec++ > MAX_RECURSION) 703 return (NULL); 704 assert(stop - m->beginp >= len); 705 if (sp > stop - len) 706 return (NULL); /* not enough left to match */ 707 ssp = m->offp + m->pmatch[i].rm_so; 708 if (memcmp(sp, ssp, len) != 0) 709 return (NULL); 710 while (m->g->strip[ss] != (sop)SOP(O_BACK, i)) 711 ss++; 712 return (backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 713 case OQUEST_: /* to null or not */ 714 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 715 if (dp != NULL) 716 return (dp); /* not */ 717 return (backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 718 case OPLUS_: 719 assert(m->lastpos != NULL); 720 assert(lev+1 <= m->g->nplus); 721 m->lastpos[lev+1] = sp; 722 return (backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 723 case O_PLUS: 724 if (sp == m->lastpos[lev]) /* last pass matched null */ 725 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 726 /* try another pass */ 727 m->lastpos[lev] = sp; 728 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 729 if (dp == NULL) 730 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 731 return (dp); 732 case OCH_: /* find the right one, if any */ 733 ssub = ss + 1; 734 esub = ss + OPND(s) - 1; 735 assert(OP(m->g->strip[esub]) == OOR1); 736 for (;;) { /* find first matching branch */ 737 dp = backref(m, sp, stop, ssub, esub, lev, rec); 738 if (dp != NULL) 739 return (dp); 740 /* that one missed, try next one */ 741 if (OP(m->g->strip[esub]) == (sop)O_CH) 742 return (NULL); /* there is none */ 743 esub++; 744 assert(OP(m->g->strip[esub]) == (sop)OOR2); 745 ssub = esub + 1; 746 esub += OPND(m->g->strip[esub]); 747 if (OP(m->g->strip[esub]) == (sop)OOR2) 748 esub--; 749 else 750 assert(OP(m->g->strip[esub]) == O_CH); 751 } 752 /* NOTREACHED */ 753 break; 754 case OLPAREN: /* must undo assignment if rest fails */ 755 i = OPND(s); 756 assert(0 < i && i <= m->g->nsub); 757 offsave = m->pmatch[i].rm_so; 758 m->pmatch[i].rm_so = sp - m->offp; 759 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 760 if (dp != NULL) 761 return (dp); 762 m->pmatch[i].rm_so = offsave; 763 return (NULL); 764 case ORPAREN: /* must undo assignment if rest fails */ 765 i = OPND(s); 766 assert(0 < i && i <= m->g->nsub); 767 offsave = m->pmatch[i].rm_eo; 768 m->pmatch[i].rm_eo = sp - m->offp; 769 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 770 if (dp != NULL) 771 return (dp); 772 m->pmatch[i].rm_eo = offsave; 773 return (NULL); 774 default: /* uh oh */ 775 assert(0); 776 break; 777 } 778 779 /* "can't happen" */ 780 assert(0); 781 return (NULL); 782 } 783 784 /* 785 * Step through the string either quickly or slowly. Returns where it ended 786 * or NULL. 787 */ 788 static const char * 789 walk(struct match *m, const char *start, const char *stop, sopno startst, 790 sopno stopst, bool fast) 791 { 792 states st = m->st; 793 states fresh = m->fresh; 794 states empty = m->empty; 795 states tmp = m->tmp; 796 const char *p = start; 797 wint_t c; 798 wint_t lastc; /* previous c */ 799 wint_t flagch; 800 int i; 801 const char *matchp; /* last p at which a match ended */ 802 size_t clen; 803 804 AT("slow", start, stop, startst, stopst); 805 CLEAR(st); 806 SET1(st, startst); 807 SP("sstart", st, *p); 808 st = step(m->g, startst, stopst, st, NOTHING, st); 809 if (fast) 810 ASSIGN(fresh, st); 811 matchp = NULL; 812 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 813 c = OUT; 814 else { 815 /* 816 * XXX Wrong if the previous character was multi-byte. 817 * Newline never is (in encodings supported by FreeBSD), 818 * so this only breaks the ISWORD tests below. 819 */ 820 c = (uch)*(start - 1); 821 } 822 for (;;) { 823 /* next character */ 824 lastc = c; 825 if (p == m->endp) { 826 c = OUT; 827 clen = 0; 828 } else 829 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 830 831 if (fast && EQ(st, fresh)) 832 matchp = p; 833 834 /* is there an EOL and/or BOL between lastc and c? */ 835 flagch = '\0'; 836 i = 0; 837 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 838 (lastc == OUT && !(m->eflags®_NOTBOL))) { 839 flagch = BOL; 840 i = m->g->nbol; 841 } 842 if ((c == '\n' && m->g->cflags®_NEWLINE) || 843 (c == OUT && !(m->eflags®_NOTEOL))) { 844 flagch = (flagch == BOL) ? BOLEOL : EOL; 845 i += m->g->neol; 846 } 847 if (i != 0) { 848 for (; i > 0; i--) 849 st = step(m->g, startst, stopst, st, 850 flagch, st); 851 SP("sboleol", st, c); 852 } 853 854 /* how about a word boundary? */ 855 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 856 (c != OUT && ISWORD(c))) { 857 flagch = BOW; 858 } 859 if ((lastc != OUT && ISWORD(lastc)) && 860 (flagch == EOL || (c != OUT && !ISWORD(c)))) { 861 flagch = EOW; 862 } 863 if (flagch == BOW || flagch == EOW) { 864 st = step(m->g, startst, stopst, st, flagch, st); 865 SP("sboweow", st, c); 866 } 867 868 /* are we done? */ 869 if (ISSET(st, stopst)) { 870 if (fast) 871 break; 872 else 873 matchp = p; 874 } 875 if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p)) 876 break; /* NOTE BREAK OUT */ 877 878 /* no, we must deal with this character */ 879 ASSIGN(tmp, st); 880 if (fast) 881 ASSIGN(st, fresh); 882 else 883 ASSIGN(st, empty); 884 assert(c != OUT); 885 st = step(m->g, startst, stopst, tmp, c, st); 886 SP("saft", st, c); 887 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 888 p += clen; 889 } 890 891 if (fast) { 892 assert(matchp != NULL); 893 m->coldp = matchp; 894 if (ISSET(st, stopst)) 895 return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 896 else 897 return (NULL); 898 } else 899 return (matchp); 900 } 901 902 /* 903 * step - map set of states reachable before char to set reachable after 904 */ 905 static states 906 step(struct re_guts *g, 907 sopno start, /* start state within strip */ 908 sopno stop, /* state after stop state within strip */ 909 states bef, /* states reachable before */ 910 wint_t ch, /* character or NONCHAR code */ 911 states aft) /* states already known reachable after */ 912 { 913 cset *cs; 914 sop s; 915 sopno pc; 916 onestate here; /* note, macros know this name */ 917 sopno look; 918 int i; 919 920 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 921 s = g->strip[pc]; 922 switch (OP(s)) { 923 case OEND: 924 assert(pc == stop-1); 925 break; 926 case OCHAR: 927 /* only characters can match */ 928 assert(!NONCHAR(ch) || ch != OPND(s)); 929 if (ch == OPND(s)) 930 FWD(aft, bef, 1); 931 break; 932 case OBOL: 933 if (ch == BOL || ch == BOLEOL) 934 FWD(aft, bef, 1); 935 break; 936 case OEOL: 937 if (ch == EOL || ch == BOLEOL) 938 FWD(aft, bef, 1); 939 break; 940 case OBOW: 941 if (ch == BOW) 942 FWD(aft, bef, 1); 943 break; 944 case OEOW: 945 if (ch == EOW) 946 FWD(aft, bef, 1); 947 break; 948 case OANY: 949 if (!NONCHAR(ch)) 950 FWD(aft, bef, 1); 951 break; 952 case OANYOF: 953 cs = &g->sets[OPND(s)]; 954 if (!NONCHAR(ch) && CHIN(NC_ENGINE, cs, ch)) 955 FWD(aft, bef, 1); 956 break; 957 case OBACK_: /* ignored here */ 958 case O_BACK: 959 FWD(aft, aft, 1); 960 break; 961 case OPLUS_: /* forward, this is just an empty */ 962 FWD(aft, aft, 1); 963 break; 964 case O_PLUS: /* both forward and back */ 965 FWD(aft, aft, 1); 966 i = ISSETBACK(aft, OPND(s)); 967 BACK(aft, aft, OPND(s)); 968 if (!i && ISSETBACK(aft, OPND(s))) { 969 /* oho, must reconsider loop body */ 970 pc -= OPND(s) + 1; 971 INIT(here, pc); 972 } 973 break; 974 case OQUEST_: /* two branches, both forward */ 975 FWD(aft, aft, 1); 976 FWD(aft, aft, OPND(s)); 977 break; 978 case O_QUEST: /* just an empty */ 979 FWD(aft, aft, 1); 980 break; 981 case OLPAREN: /* not significant here */ 982 case ORPAREN: 983 FWD(aft, aft, 1); 984 break; 985 case OCH_: /* mark the first two branches */ 986 FWD(aft, aft, 1); 987 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 988 FWD(aft, aft, OPND(s)); 989 break; 990 case OOR1: /* done a branch, find the O_CH */ 991 if (ISSTATEIN(aft, here)) { 992 for (look = 1; 993 OP(s = g->strip[pc+look]) != (sop)O_CH; 994 look += OPND(s)) 995 assert(OP(s) == (sop)OOR2); 996 FWD(aft, aft, look + 1); 997 } 998 break; 999 case OOR2: /* propagate OCH_'s marking */ 1000 FWD(aft, aft, 1); 1001 if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) { 1002 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 1003 FWD(aft, aft, OPND(s)); 1004 } 1005 break; 1006 case O_CH: /* just empty */ 1007 FWD(aft, aft, 1); 1008 break; 1009 default: /* ooooops... */ 1010 assert(0); 1011 break; 1012 } 1013 } 1014 1015 return (aft); 1016 } 1017 1018 #ifdef REDEBUG 1019 /* 1020 * print - print a set of states 1021 */ 1022 static void 1023 print(struct match *m, const char *caption, states st, int ch, FILE *d) 1024 { 1025 struct re_guts *g = m->g; 1026 sopno i; 1027 int first = 1; 1028 1029 if (!(m->eflags®_TRACE)) 1030 return; 1031 1032 (void) fprintf(d, "%s", caption); 1033 if (ch != '\0') 1034 (void) fprintf(d, " %s", pchar(ch)); 1035 for (i = 0; i < g->nstates; i++) 1036 if (ISSET(st, i)) { 1037 (void) fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1038 first = 0; 1039 } 1040 (void) fprintf(d, "\n"); 1041 } 1042 1043 /* 1044 * at - print current situation 1045 */ 1046 static void 1047 at(struct match *m, const char *title, const char *start, const char *stop, 1048 sopno startst, sopno stopst) 1049 { 1050 if (!(m->eflags®_TRACE)) 1051 return; 1052 1053 (void) printf("%s %s-", title, pchar(*start)); 1054 (void) printf("%s ", pchar(*stop)); 1055 (void) printf("%ld-%ld\n", (long)startst, (long)stopst); 1056 } 1057 1058 #ifndef PCHARDONE 1059 #define PCHARDONE /* never again */ 1060 /* 1061 * pchar - make a character printable 1062 * 1063 * Is this identical to regchar() over in debug.c? Well, yes. But a 1064 * duplicate here avoids having a debugging-capable regexec.o tied to 1065 * a matching debug.o, and this is convenient. It all disappears in 1066 * the non-debug compilation anyway, so it doesn't matter much. 1067 */ 1068 static const char * 1069 pchar(int ch) 1070 { 1071 static char pbuf[10]; 1072 1073 if (isprint((uch)ch) || ch == ' ') 1074 (void) sprintf(pbuf, "%c", ch); 1075 else 1076 (void) sprintf(pbuf, "\\%o", ch); 1077 return (pbuf); 1078 } 1079 #endif 1080 #endif 1081 1082 #undef stepback 1083 #undef matcher 1084 #undef walk 1085 #undef dissect 1086 #undef backref 1087 #undef step 1088 #undef print 1089 #undef at 1090 #undef match 1091