1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38 #include <sys/cdefs.h> 39 __FBSDID("$FreeBSD$"); 40 41 #include <stdbool.h> 42 43 /* 44 * The matching engine and friends. This file is #included by regexec.c 45 * after suitable #defines of a variety of macros used herein, so that 46 * different state representations can be used without duplicating masses 47 * of code. 48 */ 49 50 #ifdef SNAMES 51 #define stepback sstepback 52 #define matcher smatcher 53 #define walk swalk 54 #define dissect sdissect 55 #define backref sbackref 56 #define step sstep 57 #define print sprint 58 #define at sat 59 #define match smat 60 #endif 61 #ifdef LNAMES 62 #define stepback lstepback 63 #define matcher lmatcher 64 #define walk lwalk 65 #define dissect ldissect 66 #define backref lbackref 67 #define step lstep 68 #define print lprint 69 #define at lat 70 #define match lmat 71 #endif 72 #ifdef MNAMES 73 #define stepback mstepback 74 #define matcher mmatcher 75 #define walk mwalk 76 #define dissect mdissect 77 #define backref mbackref 78 #define step mstep 79 #define print mprint 80 #define at mat 81 #define match mmat 82 #endif 83 84 /* another structure passed up and down to avoid zillions of parameters */ 85 struct match { 86 struct re_guts *g; 87 int eflags; 88 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 89 const char *offp; /* offsets work from here */ 90 const char *beginp; /* start of string -- virtual NUL precedes */ 91 const char *endp; /* end of string -- virtual NUL here */ 92 const char *coldp; /* can be no match starting before here */ 93 const char **lastpos; /* [nplus+1] */ 94 STATEVARS; 95 states st; /* current states */ 96 states fresh; /* states for a fresh start */ 97 states tmp; /* temporary */ 98 states empty; /* empty set of states */ 99 mbstate_t mbs; /* multibyte conversion state */ 100 }; 101 102 /* ========= begin header generated by ./mkh ========= */ 103 #ifdef __cplusplus 104 extern "C" { 105 #endif 106 107 /* === engine.c === */ 108 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 109 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 110 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); 111 static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast); 112 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags); 113 #define MAX_RECURSION 100 114 #define BOL (OUT-1) 115 #define EOL (BOL-1) 116 #define BOLEOL (BOL-2) 117 #define NOTHING (BOL-3) 118 #define BOW (BOL-4) 119 #define EOW (BOL-5) 120 #define BADCHAR (BOL-6) 121 #define NONCHAR(c) ((c) <= OUT) 122 /* sflags */ 123 #define SBOS 0x0001 124 #define SEOS 0x0002 125 126 #ifdef REDEBUG 127 static void print(struct match *m, const char *caption, states st, int ch, FILE *d); 128 #endif 129 #ifdef REDEBUG 130 static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst); 131 #endif 132 #ifdef REDEBUG 133 static const char *pchar(int ch); 134 #endif 135 136 #ifdef __cplusplus 137 } 138 #endif 139 /* ========= end header generated by ./mkh ========= */ 140 141 #ifdef REDEBUG 142 #define SP(t, s, c) print(m, t, s, c, stdout) 143 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 144 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 145 #else 146 #define SP(t, s, c) /* nothing */ 147 #define AT(t, p1, p2, s1, s2) /* nothing */ 148 #define NOTE(s) /* nothing */ 149 #endif 150 151 /* 152 * Given a multibyte string pointed to by start, step back nchar characters 153 * from current position pointed to by cur. 154 */ 155 static const char * 156 stepback(const char *start, const char *cur, int nchar) 157 { 158 const char *ret; 159 int wc, mbc; 160 mbstate_t mbs; 161 size_t clen; 162 163 if (MB_CUR_MAX == 1) 164 return ((cur - nchar) > start ? cur - nchar : NULL); 165 166 ret = cur; 167 for (wc = nchar; wc > 0; wc--) { 168 for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) { 169 if ((ret - mbc) < start) 170 return (NULL); 171 memset(&mbs, 0, sizeof(mbs)); 172 clen = mbrtowc(NULL, ret - mbc, mbc, &mbs); 173 if (clen != (size_t)-1 && clen != (size_t)-2) 174 break; 175 } 176 if (mbc > MB_CUR_MAX) 177 return (NULL); 178 ret -= mbc; 179 } 180 181 return (ret); 182 } 183 184 /* 185 - matcher - the actual matching engine 186 == static int matcher(struct re_guts *g, const char *string, \ 187 == size_t nmatch, regmatch_t pmatch[], int eflags); 188 */ 189 static int /* 0 success, REG_NOMATCH failure */ 190 matcher(struct re_guts *g, 191 const char *string, 192 size_t nmatch, 193 regmatch_t pmatch[], 194 int eflags) 195 { 196 const char *endp; 197 size_t i; 198 struct match mv; 199 struct match *m = &mv; 200 const char *dp = NULL; 201 const sopno gf = g->firststate+1; /* +1 for OEND */ 202 const sopno gl = g->laststate; 203 const char *start; 204 const char *stop; 205 /* Boyer-Moore algorithms variables */ 206 const char *pp; 207 int cj, mj; 208 const char *mustfirst; 209 const char *mustlast; 210 int *matchjump; 211 int *charjump; 212 213 /* simplify the situation where possible */ 214 if (g->cflags®_NOSUB) 215 nmatch = 0; 216 if (eflags®_STARTEND) { 217 start = string + pmatch[0].rm_so; 218 stop = string + pmatch[0].rm_eo; 219 } else { 220 start = string; 221 stop = start + strlen(start); 222 } 223 if (stop < start) 224 return(REG_INVARG); 225 226 /* prescreening; this does wonders for this rather slow code */ 227 if (g->must != NULL) { 228 if (g->charjump != NULL && g->matchjump != NULL) { 229 mustfirst = g->must; 230 mustlast = g->must + g->mlen - 1; 231 charjump = g->charjump; 232 matchjump = g->matchjump; 233 pp = mustlast; 234 for (dp = start+g->mlen-1; dp < stop;) { 235 /* Fast skip non-matches */ 236 while (dp < stop && charjump[(int)*dp]) 237 dp += charjump[(int)*dp]; 238 239 if (dp >= stop) 240 break; 241 242 /* Greedy matcher */ 243 /* We depend on not being used for 244 * for strings of length 1 245 */ 246 while (*--dp == *--pp && pp != mustfirst); 247 248 if (*dp == *pp) 249 break; 250 251 /* Jump to next possible match */ 252 mj = matchjump[pp - mustfirst]; 253 cj = charjump[(int)*dp]; 254 dp += (cj < mj ? mj : cj); 255 pp = mustlast; 256 } 257 if (pp != mustfirst) 258 return(REG_NOMATCH); 259 } else { 260 for (dp = start; dp < stop; dp++) 261 if (*dp == g->must[0] && 262 stop - dp >= g->mlen && 263 memcmp(dp, g->must, (size_t)g->mlen) == 0) 264 break; 265 if (dp == stop) /* we didn't find g->must */ 266 return(REG_NOMATCH); 267 } 268 } 269 270 /* match struct setup */ 271 m->g = g; 272 m->eflags = eflags; 273 m->pmatch = NULL; 274 m->lastpos = NULL; 275 m->offp = string; 276 m->beginp = start; 277 m->endp = stop; 278 STATESETUP(m, 4); 279 SETUP(m->st); 280 SETUP(m->fresh); 281 SETUP(m->tmp); 282 SETUP(m->empty); 283 CLEAR(m->empty); 284 ZAPSTATE(&m->mbs); 285 286 /* Adjust start according to moffset, to speed things up */ 287 if (dp != NULL && g->moffset > -1) { 288 const char *nstart; 289 290 nstart = stepback(start, dp, g->moffset); 291 if (nstart != NULL) 292 start = nstart; 293 } 294 295 SP("mloop", m->st, *start); 296 297 /* this loop does only one repetition except for backrefs */ 298 for (;;) { 299 endp = walk(m, start, stop, gf, gl, true); 300 if (endp == NULL) { /* a miss */ 301 if (m->pmatch != NULL) 302 free((char *)m->pmatch); 303 if (m->lastpos != NULL) 304 free((char *)m->lastpos); 305 STATETEARDOWN(m); 306 return(REG_NOMATCH); 307 } 308 if (nmatch == 0 && !g->backrefs) 309 break; /* no further info needed */ 310 311 /* where? */ 312 assert(m->coldp != NULL); 313 for (;;) { 314 NOTE("finding start"); 315 endp = walk(m, m->coldp, stop, gf, gl, false); 316 if (endp != NULL) 317 break; 318 assert(m->coldp < m->endp); 319 m->coldp += XMBRTOWC(NULL, m->coldp, 320 m->endp - m->coldp, &m->mbs, 0); 321 } 322 if (nmatch == 1 && !g->backrefs) 323 break; /* no further info needed */ 324 325 /* oh my, he wants the subexpressions... */ 326 if (m->pmatch == NULL) 327 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 328 sizeof(regmatch_t)); 329 if (m->pmatch == NULL) { 330 STATETEARDOWN(m); 331 return(REG_ESPACE); 332 } 333 for (i = 1; i <= m->g->nsub; i++) 334 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 335 if (!g->backrefs && !(m->eflags®_BACKR)) { 336 NOTE("dissecting"); 337 dp = dissect(m, m->coldp, endp, gf, gl); 338 } else { 339 if (g->nplus > 0 && m->lastpos == NULL) 340 m->lastpos = malloc((g->nplus+1) * 341 sizeof(const char *)); 342 if (g->nplus > 0 && m->lastpos == NULL) { 343 free(m->pmatch); 344 STATETEARDOWN(m); 345 return(REG_ESPACE); 346 } 347 NOTE("backref dissect"); 348 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 349 } 350 if (dp != NULL) 351 break; 352 353 /* uh-oh... we couldn't find a subexpression-level match */ 354 assert(g->backrefs); /* must be back references doing it */ 355 assert(g->nplus == 0 || m->lastpos != NULL); 356 for (;;) { 357 if (dp != NULL || endp <= m->coldp) 358 break; /* defeat */ 359 NOTE("backoff"); 360 endp = walk(m, m->coldp, endp-1, gf, gl, false); 361 if (endp == NULL) 362 break; /* defeat */ 363 /* try it on a shorter possibility */ 364 #ifndef NDEBUG 365 for (i = 1; i <= m->g->nsub; i++) { 366 assert(m->pmatch[i].rm_so == -1); 367 assert(m->pmatch[i].rm_eo == -1); 368 } 369 #endif 370 NOTE("backoff dissect"); 371 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 372 } 373 assert(dp == NULL || dp == endp); 374 if (dp != NULL) /* found a shorter one */ 375 break; 376 377 /* despite initial appearances, there is no match here */ 378 NOTE("false alarm"); 379 /* recycle starting later */ 380 start = m->coldp + XMBRTOWC(NULL, m->coldp, 381 stop - m->coldp, &m->mbs, 0); 382 assert(start <= stop); 383 } 384 385 /* fill in the details if requested */ 386 if (nmatch > 0) { 387 pmatch[0].rm_so = m->coldp - m->offp; 388 pmatch[0].rm_eo = endp - m->offp; 389 } 390 if (nmatch > 1) { 391 assert(m->pmatch != NULL); 392 for (i = 1; i < nmatch; i++) 393 if (i <= m->g->nsub) 394 pmatch[i] = m->pmatch[i]; 395 else { 396 pmatch[i].rm_so = -1; 397 pmatch[i].rm_eo = -1; 398 } 399 } 400 401 if (m->pmatch != NULL) 402 free((char *)m->pmatch); 403 if (m->lastpos != NULL) 404 free((char *)m->lastpos); 405 STATETEARDOWN(m); 406 return(0); 407 } 408 409 /* 410 - dissect - figure out what matched what, no back references 411 == static const char *dissect(struct match *m, const char *start, \ 412 == const char *stop, sopno startst, sopno stopst); 413 */ 414 static const char * /* == stop (success) always */ 415 dissect(struct match *m, 416 const char *start, 417 const char *stop, 418 sopno startst, 419 sopno stopst) 420 { 421 int i; 422 sopno ss; /* start sop of current subRE */ 423 sopno es; /* end sop of current subRE */ 424 const char *sp; /* start of string matched by it */ 425 const char *stp; /* string matched by it cannot pass here */ 426 const char *rest; /* start of rest of string */ 427 const char *tail; /* string unmatched by rest of RE */ 428 sopno ssub; /* start sop of subsubRE */ 429 sopno esub; /* end sop of subsubRE */ 430 const char *ssp; /* start of string matched by subsubRE */ 431 const char *sep; /* end of string matched by subsubRE */ 432 const char *oldssp; /* previous ssp */ 433 const char *dp __unused; 434 435 AT("diss", start, stop, startst, stopst); 436 sp = start; 437 for (ss = startst; ss < stopst; ss = es) { 438 /* identify end of subRE */ 439 es = ss; 440 switch (OP(m->g->strip[es])) { 441 case OPLUS_: 442 case OQUEST_: 443 es += OPND(m->g->strip[es]); 444 break; 445 case OCH_: 446 while (OP(m->g->strip[es]) != (sop)O_CH) 447 es += OPND(m->g->strip[es]); 448 break; 449 } 450 es++; 451 452 /* figure out what it matched */ 453 switch (OP(m->g->strip[ss])) { 454 case OEND: 455 assert(nope); 456 break; 457 case OCHAR: 458 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 459 break; 460 case OBOL: 461 case OEOL: 462 case OBOW: 463 case OEOW: 464 case OBOS: 465 case OEOS: 466 break; 467 case OANY: 468 case OANYOF: 469 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 470 break; 471 case OBACK_: 472 case O_BACK: 473 assert(nope); 474 break; 475 /* cases where length of match is hard to find */ 476 case OQUEST_: 477 stp = stop; 478 for (;;) { 479 /* how long could this one be? */ 480 rest = walk(m, sp, stp, ss, es, false); 481 assert(rest != NULL); /* it did match */ 482 /* could the rest match the rest? */ 483 tail = walk(m, rest, stop, es, stopst, false); 484 if (tail == stop) 485 break; /* yes! */ 486 /* no -- try a shorter match for this one */ 487 stp = rest - 1; 488 assert(stp >= sp); /* it did work */ 489 } 490 ssub = ss + 1; 491 esub = es - 1; 492 /* did innards match? */ 493 if (walk(m, sp, rest, ssub, esub, false) != NULL) { 494 dp = dissect(m, sp, rest, ssub, esub); 495 assert(dp == rest); 496 } else /* no */ 497 assert(sp == rest); 498 sp = rest; 499 break; 500 case OPLUS_: 501 stp = stop; 502 for (;;) { 503 /* how long could this one be? */ 504 rest = walk(m, sp, stp, ss, es, false); 505 assert(rest != NULL); /* it did match */ 506 /* could the rest match the rest? */ 507 tail = walk(m, rest, stop, es, stopst, false); 508 if (tail == stop) 509 break; /* yes! */ 510 /* no -- try a shorter match for this one */ 511 stp = rest - 1; 512 assert(stp >= sp); /* it did work */ 513 } 514 ssub = ss + 1; 515 esub = es - 1; 516 ssp = sp; 517 oldssp = ssp; 518 for (;;) { /* find last match of innards */ 519 sep = walk(m, ssp, rest, ssub, esub, false); 520 if (sep == NULL || sep == ssp) 521 break; /* failed or matched null */ 522 oldssp = ssp; /* on to next try */ 523 ssp = sep; 524 } 525 if (sep == NULL) { 526 /* last successful match */ 527 sep = ssp; 528 ssp = oldssp; 529 } 530 assert(sep == rest); /* must exhaust substring */ 531 assert(walk(m, ssp, sep, ssub, esub, false) == rest); 532 dp = dissect(m, ssp, sep, ssub, esub); 533 assert(dp == sep); 534 sp = rest; 535 break; 536 case OCH_: 537 stp = stop; 538 for (;;) { 539 /* how long could this one be? */ 540 rest = walk(m, sp, stp, ss, es, false); 541 assert(rest != NULL); /* it did match */ 542 /* could the rest match the rest? */ 543 tail = walk(m, rest, stop, es, stopst, false); 544 if (tail == stop) 545 break; /* yes! */ 546 /* no -- try a shorter match for this one */ 547 stp = rest - 1; 548 assert(stp >= sp); /* it did work */ 549 } 550 ssub = ss + 1; 551 esub = ss + OPND(m->g->strip[ss]) - 1; 552 assert(OP(m->g->strip[esub]) == OOR1); 553 for (;;) { /* find first matching branch */ 554 if (walk(m, sp, rest, ssub, esub, false) == rest) 555 break; /* it matched all of it */ 556 /* that one missed, try next one */ 557 assert(OP(m->g->strip[esub]) == OOR1); 558 esub++; 559 assert(OP(m->g->strip[esub]) == OOR2); 560 ssub = esub + 1; 561 esub += OPND(m->g->strip[esub]); 562 if (OP(m->g->strip[esub]) == (sop)OOR2) 563 esub--; 564 else 565 assert(OP(m->g->strip[esub]) == O_CH); 566 } 567 dp = dissect(m, sp, rest, ssub, esub); 568 assert(dp == rest); 569 sp = rest; 570 break; 571 case O_PLUS: 572 case O_QUEST: 573 case OOR1: 574 case OOR2: 575 case O_CH: 576 assert(nope); 577 break; 578 case OLPAREN: 579 i = OPND(m->g->strip[ss]); 580 assert(0 < i && i <= m->g->nsub); 581 m->pmatch[i].rm_so = sp - m->offp; 582 break; 583 case ORPAREN: 584 i = OPND(m->g->strip[ss]); 585 assert(0 < i && i <= m->g->nsub); 586 m->pmatch[i].rm_eo = sp - m->offp; 587 break; 588 default: /* uh oh */ 589 assert(nope); 590 break; 591 } 592 } 593 594 assert(sp == stop); 595 return(sp); 596 } 597 598 #define ISBOW(m, sp) \ 599 (sp < m->endp && ISWORD(*sp) && \ 600 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || \ 601 (sp > m->offp && !ISWORD(*(sp-1))))) 602 #define ISEOW(m, sp) \ 603 (((sp == m->endp && !(m->eflags®_NOTEOL)) || \ 604 (sp < m->endp && *sp == '\n' && \ 605 (m->g->cflags®_NEWLINE)) || \ 606 (sp < m->endp && !ISWORD(*sp)) ) && \ 607 (sp > m->beginp && ISWORD(*(sp-1)))) \ 608 609 /* 610 - backref - figure out what matched what, figuring in back references 611 == static const char *backref(struct match *m, const char *start, \ 612 == const char *stop, sopno startst, sopno stopst, sopno lev); 613 */ 614 static const char * /* == stop (success) or NULL (failure) */ 615 backref(struct match *m, 616 const char *start, 617 const char *stop, 618 sopno startst, 619 sopno stopst, 620 sopno lev, /* PLUS nesting level */ 621 int rec) 622 { 623 int i; 624 sopno ss; /* start sop of current subRE */ 625 const char *sp; /* start of string matched by it */ 626 sopno ssub; /* start sop of subsubRE */ 627 sopno esub; /* end sop of subsubRE */ 628 const char *ssp; /* start of string matched by subsubRE */ 629 const char *dp; 630 size_t len; 631 int hard; 632 sop s; 633 regoff_t offsave; 634 cset *cs; 635 wint_t wc; 636 637 AT("back", start, stop, startst, stopst); 638 sp = start; 639 640 /* get as far as we can with easy stuff */ 641 hard = 0; 642 for (ss = startst; !hard && ss < stopst; ss++) 643 switch (OP(s = m->g->strip[ss])) { 644 case OCHAR: 645 if (sp == stop) 646 return(NULL); 647 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 648 if (wc != OPND(s)) 649 return(NULL); 650 break; 651 case OANY: 652 if (sp == stop) 653 return(NULL); 654 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 655 if (wc == BADCHAR) 656 return (NULL); 657 break; 658 case OANYOF: 659 if (sp == stop) 660 return (NULL); 661 cs = &m->g->sets[OPND(s)]; 662 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 663 if (wc == BADCHAR || !CHIN(cs, wc)) 664 return(NULL); 665 break; 666 case OBOS: 667 if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0) 668 { /* yes */ } 669 else 670 return(NULL); 671 break; 672 case OEOS: 673 if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0) 674 { /* yes */ } 675 else 676 return(NULL); 677 break; 678 case OBOL: 679 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 680 (sp > m->offp && sp < m->endp && 681 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) 682 { /* yes */ } 683 else 684 return(NULL); 685 break; 686 case OEOL: 687 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 688 (sp < m->endp && *sp == '\n' && 689 (m->g->cflags®_NEWLINE)) ) 690 { /* yes */ } 691 else 692 return(NULL); 693 break; 694 case OBOW: 695 if (ISBOW(m, sp)) 696 { /* yes */ } 697 else 698 return(NULL); 699 break; 700 case OEOW: 701 if (ISEOW(m, sp)) 702 { /* yes */ } 703 else 704 return(NULL); 705 break; 706 case O_QUEST: 707 break; 708 case OOR1: /* matches null but needs to skip */ 709 ss++; 710 s = m->g->strip[ss]; 711 do { 712 assert(OP(s) == OOR2); 713 ss += OPND(s); 714 } while (OP(s = m->g->strip[ss]) != (sop)O_CH); 715 /* note that the ss++ gets us past the O_CH */ 716 break; 717 default: /* have to make a choice */ 718 hard = 1; 719 break; 720 } 721 if (!hard) { /* that was it! */ 722 if (sp != stop) 723 return(NULL); 724 return(sp); 725 } 726 ss--; /* adjust for the for's final increment */ 727 728 /* the hard stuff */ 729 AT("hard", sp, stop, ss, stopst); 730 s = m->g->strip[ss]; 731 switch (OP(s)) { 732 case OBACK_: /* the vilest depths */ 733 i = OPND(s); 734 assert(0 < i && i <= m->g->nsub); 735 if (m->pmatch[i].rm_eo == -1) 736 return(NULL); 737 assert(m->pmatch[i].rm_so != -1); 738 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 739 if (len == 0 && rec++ > MAX_RECURSION) 740 return(NULL); 741 assert(stop - m->beginp >= len); 742 if (sp > stop - len) 743 return(NULL); /* not enough left to match */ 744 ssp = m->offp + m->pmatch[i].rm_so; 745 if (memcmp(sp, ssp, len) != 0) 746 return(NULL); 747 while (m->g->strip[ss] != (sop)SOP(O_BACK, i)) 748 ss++; 749 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 750 case OQUEST_: /* to null or not */ 751 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 752 if (dp != NULL) 753 return(dp); /* not */ 754 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 755 case OPLUS_: 756 assert(m->lastpos != NULL); 757 assert(lev+1 <= m->g->nplus); 758 m->lastpos[lev+1] = sp; 759 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 760 case O_PLUS: 761 if (sp == m->lastpos[lev]) /* last pass matched null */ 762 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 763 /* try another pass */ 764 m->lastpos[lev] = sp; 765 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 766 if (dp == NULL) 767 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 768 else 769 return(dp); 770 case OCH_: /* find the right one, if any */ 771 ssub = ss + 1; 772 esub = ss + OPND(s) - 1; 773 assert(OP(m->g->strip[esub]) == OOR1); 774 for (;;) { /* find first matching branch */ 775 dp = backref(m, sp, stop, ssub, esub, lev, rec); 776 if (dp != NULL) 777 return(dp); 778 /* that one missed, try next one */ 779 if (OP(m->g->strip[esub]) == (sop)O_CH) 780 return(NULL); /* there is none */ 781 esub++; 782 assert(OP(m->g->strip[esub]) == (sop)OOR2); 783 ssub = esub + 1; 784 esub += OPND(m->g->strip[esub]); 785 if (OP(m->g->strip[esub]) == (sop)OOR2) 786 esub--; 787 else 788 assert(OP(m->g->strip[esub]) == O_CH); 789 } 790 /* NOTREACHED */ 791 break; 792 case OLPAREN: /* must undo assignment if rest fails */ 793 i = OPND(s); 794 assert(0 < i && i <= m->g->nsub); 795 offsave = m->pmatch[i].rm_so; 796 m->pmatch[i].rm_so = sp - m->offp; 797 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 798 if (dp != NULL) 799 return(dp); 800 m->pmatch[i].rm_so = offsave; 801 return(NULL); 802 case ORPAREN: /* must undo assignment if rest fails */ 803 i = OPND(s); 804 assert(0 < i && i <= m->g->nsub); 805 offsave = m->pmatch[i].rm_eo; 806 m->pmatch[i].rm_eo = sp - m->offp; 807 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 808 if (dp != NULL) 809 return(dp); 810 m->pmatch[i].rm_eo = offsave; 811 return(NULL); 812 default: /* uh oh */ 813 assert(nope); 814 break; 815 } 816 817 /* "can't happen" */ 818 assert(nope); 819 /* NOTREACHED */ 820 return "shut up gcc"; 821 } 822 823 /* 824 - walk - step through the string either quickly or slowly 825 == static const char *walk(struct match *m, const char *start, \ 826 == const char *stop, sopno startst, sopno stopst, bool fast); 827 */ 828 static const char * /* where it ended, or NULL */ 829 walk(struct match *m, const char *start, const char *stop, sopno startst, 830 sopno stopst, bool fast) 831 { 832 states st = m->st; 833 states fresh = m->fresh; 834 states empty = m->empty; 835 states tmp = m->tmp; 836 const char *p = start; 837 wint_t c; 838 wint_t lastc; /* previous c */ 839 wint_t flagch; 840 int i, sflags; 841 const char *matchp; /* last p at which a match ended */ 842 size_t clen; 843 844 sflags = 0; 845 AT("slow", start, stop, startst, stopst); 846 CLEAR(st); 847 SET1(st, startst); 848 SP("sstart", st, *p); 849 st = step(m->g, startst, stopst, st, NOTHING, st, sflags); 850 if (fast) 851 ASSIGN(fresh, st); 852 matchp = NULL; 853 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 854 c = OUT; 855 else { 856 /* 857 * XXX Wrong if the previous character was multi-byte. 858 * Newline never is (in encodings supported by FreeBSD), 859 * so this only breaks the ISWORD tests below. 860 */ 861 c = (uch)*(start - 1); 862 } 863 for (;;) { 864 /* next character */ 865 lastc = c; 866 sflags = 0; 867 if (p == m->endp) { 868 c = OUT; 869 clen = 0; 870 } else 871 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 872 873 if (fast && EQ(st, fresh)) 874 matchp = p; 875 876 /* is there an EOL and/or BOL between lastc and c? */ 877 flagch = '\0'; 878 i = 0; 879 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 880 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 881 flagch = BOL; 882 i = m->g->nbol; 883 } 884 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 885 (c == OUT && !(m->eflags®_NOTEOL)) ) { 886 flagch = (flagch == BOL) ? BOLEOL : EOL; 887 i += m->g->neol; 888 } 889 if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) { 890 sflags |= SBOS; 891 /* Step one more for BOS. */ 892 i++; 893 } 894 if (c == OUT && (m->eflags & REG_NOTEOL) == 0) { 895 sflags |= SEOS; 896 /* Step one more for EOS. */ 897 i++; 898 } 899 if (i != 0) { 900 for (; i > 0; i--) 901 st = step(m->g, startst, stopst, st, flagch, st, 902 sflags); 903 SP("sboleol", st, c); 904 } 905 906 /* how about a word boundary? */ 907 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 908 (c != OUT && ISWORD(c)) ) { 909 flagch = BOW; 910 } 911 if ( (lastc != OUT && ISWORD(lastc)) && 912 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 913 flagch = EOW; 914 } 915 if (flagch == BOW || flagch == EOW) { 916 st = step(m->g, startst, stopst, st, flagch, st, sflags); 917 SP("sboweow", st, c); 918 } 919 920 /* are we done? */ 921 if (ISSET(st, stopst)) { 922 if (fast) 923 break; 924 else 925 matchp = p; 926 } 927 if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p)) 928 break; /* NOTE BREAK OUT */ 929 930 /* no, we must deal with this character */ 931 ASSIGN(tmp, st); 932 if (fast) 933 ASSIGN(st, fresh); 934 else 935 ASSIGN(st, empty); 936 assert(c != OUT); 937 st = step(m->g, startst, stopst, tmp, c, st, sflags); 938 SP("saft", st, c); 939 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags), 940 st)); 941 p += clen; 942 } 943 944 if (fast) { 945 assert(matchp != NULL); 946 m->coldp = matchp; 947 if (ISSET(st, stopst)) 948 return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 949 else 950 return (NULL); 951 } else 952 return (matchp); 953 } 954 955 /* 956 - step - map set of states reachable before char to set reachable after 957 == static states step(struct re_guts *g, sopno start, sopno stop, \ 958 == states bef, int ch, states aft); 959 == #define BOL (OUT-1) 960 == #define EOL (BOL-1) 961 == #define BOLEOL (BOL-2) 962 == #define NOTHING (BOL-3) 963 == #define BOW (BOL-4) 964 == #define EOW (BOL-5) 965 == #define BADCHAR (BOL-6) 966 == #define NONCHAR(c) ((c) <= OUT) 967 */ 968 static states 969 step(struct re_guts *g, 970 sopno start, /* start state within strip */ 971 sopno stop, /* state after stop state within strip */ 972 states bef, /* states reachable before */ 973 wint_t ch, /* character or NONCHAR code */ 974 states aft, /* states already known reachable after */ 975 int sflags) /* state flags */ 976 { 977 cset *cs; 978 sop s; 979 sopno pc; 980 onestate here; /* note, macros know this name */ 981 sopno look; 982 int i; 983 984 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 985 s = g->strip[pc]; 986 switch (OP(s)) { 987 case OEND: 988 assert(pc == stop-1); 989 break; 990 case OCHAR: 991 /* only characters can match */ 992 assert(!NONCHAR(ch) || ch != OPND(s)); 993 if (ch == OPND(s)) 994 FWD(aft, bef, 1); 995 break; 996 case OBOS: 997 if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0) 998 FWD(aft, bef, 1); 999 break; 1000 case OEOS: 1001 if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0) 1002 FWD(aft, bef, 1); 1003 break; 1004 case OBOL: 1005 if (ch == BOL || ch == BOLEOL) 1006 FWD(aft, bef, 1); 1007 break; 1008 case OEOL: 1009 if (ch == EOL || ch == BOLEOL) 1010 FWD(aft, bef, 1); 1011 break; 1012 case OBOW: 1013 if (ch == BOW) 1014 FWD(aft, bef, 1); 1015 break; 1016 case OEOW: 1017 if (ch == EOW) 1018 FWD(aft, bef, 1); 1019 break; 1020 case OANY: 1021 if (!NONCHAR(ch)) 1022 FWD(aft, bef, 1); 1023 break; 1024 case OANYOF: 1025 cs = &g->sets[OPND(s)]; 1026 if (!NONCHAR(ch) && CHIN(cs, ch)) 1027 FWD(aft, bef, 1); 1028 break; 1029 case OBACK_: /* ignored here */ 1030 case O_BACK: 1031 FWD(aft, aft, 1); 1032 break; 1033 case OPLUS_: /* forward, this is just an empty */ 1034 FWD(aft, aft, 1); 1035 break; 1036 case O_PLUS: /* both forward and back */ 1037 FWD(aft, aft, 1); 1038 i = ISSETBACK(aft, OPND(s)); 1039 BACK(aft, aft, OPND(s)); 1040 if (!i && ISSETBACK(aft, OPND(s))) { 1041 /* oho, must reconsider loop body */ 1042 pc -= OPND(s) + 1; 1043 INIT(here, pc); 1044 } 1045 break; 1046 case OQUEST_: /* two branches, both forward */ 1047 FWD(aft, aft, 1); 1048 FWD(aft, aft, OPND(s)); 1049 break; 1050 case O_QUEST: /* just an empty */ 1051 FWD(aft, aft, 1); 1052 break; 1053 case OLPAREN: /* not significant here */ 1054 case ORPAREN: 1055 FWD(aft, aft, 1); 1056 break; 1057 case OCH_: /* mark the first two branches */ 1058 FWD(aft, aft, 1); 1059 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 1060 FWD(aft, aft, OPND(s)); 1061 break; 1062 case OOR1: /* done a branch, find the O_CH */ 1063 if (ISSTATEIN(aft, here)) { 1064 for (look = 1; 1065 OP(s = g->strip[pc+look]) != (sop)O_CH; 1066 look += OPND(s)) 1067 assert(OP(s) == (sop)OOR2); 1068 FWD(aft, aft, look + 1); 1069 } 1070 break; 1071 case OOR2: /* propagate OCH_'s marking */ 1072 FWD(aft, aft, 1); 1073 if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) { 1074 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 1075 FWD(aft, aft, OPND(s)); 1076 } 1077 break; 1078 case O_CH: /* just empty */ 1079 FWD(aft, aft, 1); 1080 break; 1081 default: /* ooooops... */ 1082 assert(nope); 1083 break; 1084 } 1085 } 1086 1087 return(aft); 1088 } 1089 1090 #ifdef REDEBUG 1091 /* 1092 - print - print a set of states 1093 == #ifdef REDEBUG 1094 == static void print(struct match *m, const char *caption, states st, \ 1095 == int ch, FILE *d); 1096 == #endif 1097 */ 1098 static void 1099 print(struct match *m, 1100 const char *caption, 1101 states st, 1102 int ch, 1103 FILE *d) 1104 { 1105 struct re_guts *g = m->g; 1106 sopno i; 1107 int first = 1; 1108 1109 if (!(m->eflags®_TRACE)) 1110 return; 1111 1112 fprintf(d, "%s", caption); 1113 if (ch != '\0') 1114 fprintf(d, " %s", pchar(ch)); 1115 for (i = 0; i < g->nstates; i++) 1116 if (ISSET(st, i)) { 1117 fprintf(d, "%s%lu", (first) ? "\t" : ", ", i); 1118 first = 0; 1119 } 1120 fprintf(d, "\n"); 1121 } 1122 1123 /* 1124 - at - print current situation 1125 == #ifdef REDEBUG 1126 == static void at(struct match *m, const char *title, const char *start, \ 1127 == const char *stop, sopno startst, sopno stopst); 1128 == #endif 1129 */ 1130 static void 1131 at( struct match *m, 1132 const char *title, 1133 const char *start, 1134 const char *stop, 1135 sopno startst, 1136 sopno stopst) 1137 { 1138 if (!(m->eflags®_TRACE)) 1139 return; 1140 1141 printf("%s %s-", title, pchar(*start)); 1142 printf("%s ", pchar(*stop)); 1143 printf("%ld-%ld\n", (long)startst, (long)stopst); 1144 } 1145 1146 #ifndef PCHARDONE 1147 #define PCHARDONE /* never again */ 1148 /* 1149 - pchar - make a character printable 1150 == #ifdef REDEBUG 1151 == static const char *pchar(int ch); 1152 == #endif 1153 * 1154 * Is this identical to regchar() over in debug.c? Well, yes. But a 1155 * duplicate here avoids having a debugging-capable regexec.o tied to 1156 * a matching debug.o, and this is convenient. It all disappears in 1157 * the non-debug compilation anyway, so it doesn't matter much. 1158 */ 1159 static const char * /* -> representation */ 1160 pchar(int ch) 1161 { 1162 static char pbuf[10]; 1163 1164 if (isprint((uch)ch) || ch == ' ') 1165 sprintf(pbuf, "%c", ch); 1166 else 1167 sprintf(pbuf, "\\%o", ch); 1168 return(pbuf); 1169 } 1170 #endif 1171 #endif 1172 1173 #undef stepback 1174 #undef matcher 1175 #undef walk 1176 #undef dissect 1177 #undef backref 1178 #undef step 1179 #undef print 1180 #undef at 1181 #undef match 1182