1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38 #include <sys/cdefs.h> 39 __FBSDID("$FreeBSD$"); 40 41 #include <stdbool.h> 42 43 /* 44 * The matching engine and friends. This file is #included by regexec.c 45 * after suitable #defines of a variety of macros used herein, so that 46 * different state representations can be used without duplicating masses 47 * of code. 48 */ 49 50 #ifdef SNAMES 51 #define stepback sstepback 52 #define matcher smatcher 53 #define walk swalk 54 #define dissect sdissect 55 #define backref sbackref 56 #define step sstep 57 #define print sprint 58 #define at sat 59 #define match smat 60 #endif 61 #ifdef LNAMES 62 #define stepback lstepback 63 #define matcher lmatcher 64 #define walk lwalk 65 #define dissect ldissect 66 #define backref lbackref 67 #define step lstep 68 #define print lprint 69 #define at lat 70 #define match lmat 71 #endif 72 #ifdef MNAMES 73 #define stepback mstepback 74 #define matcher mmatcher 75 #define walk mwalk 76 #define dissect mdissect 77 #define backref mbackref 78 #define step mstep 79 #define print mprint 80 #define at mat 81 #define match mmat 82 #endif 83 84 /* another structure passed up and down to avoid zillions of parameters */ 85 struct match { 86 struct re_guts *g; 87 int eflags; 88 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 89 const char *offp; /* offsets work from here */ 90 const char *beginp; /* start of string -- virtual NUL precedes */ 91 const char *endp; /* end of string -- virtual NUL here */ 92 const char *coldp; /* can be no match starting before here */ 93 const char **lastpos; /* [nplus+1] */ 94 STATEVARS; 95 states st; /* current states */ 96 states fresh; /* states for a fresh start */ 97 states tmp; /* temporary */ 98 states empty; /* empty set of states */ 99 mbstate_t mbs; /* multibyte conversion state */ 100 }; 101 102 /* ========= begin header generated by ./mkh ========= */ 103 #ifdef __cplusplus 104 extern "C" { 105 #endif 106 107 /* === engine.c === */ 108 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 109 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 110 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); 111 static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast); 112 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags); 113 #define MAX_RECURSION 100 114 #define BOL (OUT-1) 115 #define EOL (BOL-1) 116 #define BOLEOL (BOL-2) 117 #define NOTHING (BOL-3) 118 #define BOW (BOL-4) 119 #define EOW (BOL-5) 120 #define BADCHAR (BOL-6) 121 #define NWBND (BOL-7) 122 #define NONCHAR(c) ((c) <= OUT) 123 /* sflags */ 124 #define SBOS 0x0001 125 #define SEOS 0x0002 126 127 #ifdef REDEBUG 128 static void print(struct match *m, const char *caption, states st, int ch, FILE *d); 129 #endif 130 #ifdef REDEBUG 131 static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst); 132 #endif 133 #ifdef REDEBUG 134 static const char *pchar(int ch); 135 #endif 136 137 #ifdef __cplusplus 138 } 139 #endif 140 /* ========= end header generated by ./mkh ========= */ 141 142 #ifdef REDEBUG 143 #define SP(t, s, c) print(m, t, s, c, stdout) 144 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 145 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 146 #else 147 #define SP(t, s, c) /* nothing */ 148 #define AT(t, p1, p2, s1, s2) /* nothing */ 149 #define NOTE(s) /* nothing */ 150 #endif 151 152 /* 153 * Given a multibyte string pointed to by start, step back nchar characters 154 * from current position pointed to by cur. 155 */ 156 static const char * 157 stepback(const char *start, const char *cur, int nchar) 158 { 159 const char *ret; 160 int wc, mbc; 161 mbstate_t mbs; 162 size_t clen; 163 164 if (MB_CUR_MAX == 1) 165 return ((cur - nchar) > start ? cur - nchar : NULL); 166 167 ret = cur; 168 for (wc = nchar; wc > 0; wc--) { 169 for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) { 170 if ((ret - mbc) < start) 171 return (NULL); 172 memset(&mbs, 0, sizeof(mbs)); 173 clen = mbrtowc(NULL, ret - mbc, mbc, &mbs); 174 if (clen != (size_t)-1 && clen != (size_t)-2) 175 break; 176 } 177 if (mbc > MB_CUR_MAX) 178 return (NULL); 179 ret -= mbc; 180 } 181 182 return (ret); 183 } 184 185 /* 186 - matcher - the actual matching engine 187 == static int matcher(struct re_guts *g, const char *string, \ 188 == size_t nmatch, regmatch_t pmatch[], int eflags); 189 */ 190 static int /* 0 success, REG_NOMATCH failure */ 191 matcher(struct re_guts *g, 192 const char *string, 193 size_t nmatch, 194 regmatch_t pmatch[], 195 int eflags) 196 { 197 const char *endp; 198 size_t i; 199 struct match mv; 200 struct match *m = &mv; 201 const char *dp = NULL; 202 const sopno gf = g->firststate+1; /* +1 for OEND */ 203 const sopno gl = g->laststate; 204 const char *start; 205 const char *stop; 206 /* Boyer-Moore algorithms variables */ 207 const char *pp; 208 int cj, mj; 209 const char *mustfirst; 210 const char *mustlast; 211 int *matchjump; 212 int *charjump; 213 214 /* simplify the situation where possible */ 215 if (g->cflags®_NOSUB) 216 nmatch = 0; 217 if (eflags®_STARTEND) { 218 start = string + pmatch[0].rm_so; 219 stop = string + pmatch[0].rm_eo; 220 } else { 221 start = string; 222 stop = start + strlen(start); 223 } 224 if (stop < start) 225 return(REG_INVARG); 226 227 /* prescreening; this does wonders for this rather slow code */ 228 if (g->must != NULL) { 229 if (g->charjump != NULL && g->matchjump != NULL) { 230 mustfirst = g->must; 231 mustlast = g->must + g->mlen - 1; 232 charjump = g->charjump; 233 matchjump = g->matchjump; 234 pp = mustlast; 235 for (dp = start+g->mlen-1; dp < stop;) { 236 /* Fast skip non-matches */ 237 while (dp < stop && charjump[(int)*dp]) 238 dp += charjump[(int)*dp]; 239 240 if (dp >= stop) 241 break; 242 243 /* Greedy matcher */ 244 /* We depend on not being used for 245 * for strings of length 1 246 */ 247 while (*--dp == *--pp && pp != mustfirst); 248 249 if (*dp == *pp) 250 break; 251 252 /* Jump to next possible match */ 253 mj = matchjump[pp - mustfirst]; 254 cj = charjump[(int)*dp]; 255 dp += (cj < mj ? mj : cj); 256 pp = mustlast; 257 } 258 if (pp != mustfirst) 259 return(REG_NOMATCH); 260 } else { 261 for (dp = start; dp < stop; dp++) 262 if (*dp == g->must[0] && 263 stop - dp >= g->mlen && 264 memcmp(dp, g->must, (size_t)g->mlen) == 0) 265 break; 266 if (dp == stop) /* we didn't find g->must */ 267 return(REG_NOMATCH); 268 } 269 } 270 271 /* match struct setup */ 272 m->g = g; 273 m->eflags = eflags; 274 m->pmatch = NULL; 275 m->lastpos = NULL; 276 m->offp = string; 277 m->beginp = start; 278 m->endp = stop; 279 STATESETUP(m, 4); 280 SETUP(m->st); 281 SETUP(m->fresh); 282 SETUP(m->tmp); 283 SETUP(m->empty); 284 CLEAR(m->empty); 285 ZAPSTATE(&m->mbs); 286 287 /* Adjust start according to moffset, to speed things up */ 288 if (dp != NULL && g->moffset > -1) { 289 const char *nstart; 290 291 nstart = stepback(start, dp, g->moffset); 292 if (nstart != NULL) 293 start = nstart; 294 } 295 296 SP("mloop", m->st, *start); 297 298 /* this loop does only one repetition except for backrefs */ 299 for (;;) { 300 endp = walk(m, start, stop, gf, gl, true); 301 if (endp == NULL) { /* a miss */ 302 if (m->pmatch != NULL) 303 free((char *)m->pmatch); 304 if (m->lastpos != NULL) 305 free((char *)m->lastpos); 306 STATETEARDOWN(m); 307 return(REG_NOMATCH); 308 } 309 if (nmatch == 0 && !g->backrefs) 310 break; /* no further info needed */ 311 312 /* where? */ 313 assert(m->coldp != NULL); 314 for (;;) { 315 NOTE("finding start"); 316 endp = walk(m, m->coldp, stop, gf, gl, false); 317 if (endp != NULL) 318 break; 319 assert(m->coldp < m->endp); 320 m->coldp += XMBRTOWC(NULL, m->coldp, 321 m->endp - m->coldp, &m->mbs, 0); 322 } 323 if (nmatch == 1 && !g->backrefs) 324 break; /* no further info needed */ 325 326 /* oh my, he wants the subexpressions... */ 327 if (m->pmatch == NULL) 328 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 329 sizeof(regmatch_t)); 330 if (m->pmatch == NULL) { 331 STATETEARDOWN(m); 332 return(REG_ESPACE); 333 } 334 for (i = 1; i <= m->g->nsub; i++) 335 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 336 if (!g->backrefs && !(m->eflags®_BACKR)) { 337 NOTE("dissecting"); 338 dp = dissect(m, m->coldp, endp, gf, gl); 339 } else { 340 if (g->nplus > 0 && m->lastpos == NULL) 341 m->lastpos = malloc((g->nplus+1) * 342 sizeof(const char *)); 343 if (g->nplus > 0 && m->lastpos == NULL) { 344 free(m->pmatch); 345 STATETEARDOWN(m); 346 return(REG_ESPACE); 347 } 348 NOTE("backref dissect"); 349 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 350 } 351 if (dp != NULL) 352 break; 353 354 /* uh-oh... we couldn't find a subexpression-level match */ 355 assert(g->backrefs); /* must be back references doing it */ 356 assert(g->nplus == 0 || m->lastpos != NULL); 357 for (;;) { 358 if (dp != NULL || endp <= m->coldp) 359 break; /* defeat */ 360 NOTE("backoff"); 361 endp = walk(m, m->coldp, endp-1, gf, gl, false); 362 if (endp == NULL) 363 break; /* defeat */ 364 /* try it on a shorter possibility */ 365 #ifndef NDEBUG 366 for (i = 1; i <= m->g->nsub; i++) { 367 assert(m->pmatch[i].rm_so == -1); 368 assert(m->pmatch[i].rm_eo == -1); 369 } 370 #endif 371 NOTE("backoff dissect"); 372 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 373 } 374 assert(dp == NULL || dp == endp); 375 if (dp != NULL) /* found a shorter one */ 376 break; 377 378 /* despite initial appearances, there is no match here */ 379 NOTE("false alarm"); 380 /* recycle starting later */ 381 start = m->coldp + XMBRTOWC(NULL, m->coldp, 382 stop - m->coldp, &m->mbs, 0); 383 assert(start <= stop); 384 } 385 386 /* fill in the details if requested */ 387 if (nmatch > 0) { 388 pmatch[0].rm_so = m->coldp - m->offp; 389 pmatch[0].rm_eo = endp - m->offp; 390 } 391 if (nmatch > 1) { 392 assert(m->pmatch != NULL); 393 for (i = 1; i < nmatch; i++) 394 if (i <= m->g->nsub) 395 pmatch[i] = m->pmatch[i]; 396 else { 397 pmatch[i].rm_so = -1; 398 pmatch[i].rm_eo = -1; 399 } 400 } 401 402 if (m->pmatch != NULL) 403 free((char *)m->pmatch); 404 if (m->lastpos != NULL) 405 free((char *)m->lastpos); 406 STATETEARDOWN(m); 407 return(0); 408 } 409 410 /* 411 - dissect - figure out what matched what, no back references 412 == static const char *dissect(struct match *m, const char *start, \ 413 == const char *stop, sopno startst, sopno stopst); 414 */ 415 static const char * /* == stop (success) always */ 416 dissect(struct match *m, 417 const char *start, 418 const char *stop, 419 sopno startst, 420 sopno stopst) 421 { 422 int i; 423 sopno ss; /* start sop of current subRE */ 424 sopno es; /* end sop of current subRE */ 425 const char *sp; /* start of string matched by it */ 426 const char *stp; /* string matched by it cannot pass here */ 427 const char *rest; /* start of rest of string */ 428 const char *tail; /* string unmatched by rest of RE */ 429 sopno ssub; /* start sop of subsubRE */ 430 sopno esub; /* end sop of subsubRE */ 431 const char *ssp; /* start of string matched by subsubRE */ 432 const char *sep; /* end of string matched by subsubRE */ 433 const char *oldssp; /* previous ssp */ 434 const char *dp __unused; 435 436 AT("diss", start, stop, startst, stopst); 437 sp = start; 438 for (ss = startst; ss < stopst; ss = es) { 439 /* identify end of subRE */ 440 es = ss; 441 switch (OP(m->g->strip[es])) { 442 case OPLUS_: 443 case OQUEST_: 444 es += OPND(m->g->strip[es]); 445 break; 446 case OCH_: 447 while (OP(m->g->strip[es]) != (sop)O_CH) 448 es += OPND(m->g->strip[es]); 449 break; 450 } 451 es++; 452 453 /* figure out what it matched */ 454 switch (OP(m->g->strip[ss])) { 455 case OEND: 456 assert(nope); 457 break; 458 case OCHAR: 459 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 460 break; 461 case OBOL: 462 case OEOL: 463 case OBOW: 464 case OEOW: 465 case OBOS: 466 case OEOS: 467 case OWBND: 468 case ONWBND: 469 break; 470 case OANY: 471 case OANYOF: 472 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 473 break; 474 case OBACK_: 475 case O_BACK: 476 assert(nope); 477 break; 478 /* cases where length of match is hard to find */ 479 case OQUEST_: 480 stp = stop; 481 for (;;) { 482 /* how long could this one be? */ 483 rest = walk(m, sp, stp, ss, es, false); 484 assert(rest != NULL); /* it did match */ 485 /* could the rest match the rest? */ 486 tail = walk(m, rest, stop, es, stopst, false); 487 if (tail == stop) 488 break; /* yes! */ 489 /* no -- try a shorter match for this one */ 490 stp = rest - 1; 491 assert(stp >= sp); /* it did work */ 492 } 493 ssub = ss + 1; 494 esub = es - 1; 495 /* did innards match? */ 496 if (walk(m, sp, rest, ssub, esub, false) != NULL) { 497 dp = dissect(m, sp, rest, ssub, esub); 498 assert(dp == rest); 499 } else /* no */ 500 assert(sp == rest); 501 sp = rest; 502 break; 503 case OPLUS_: 504 stp = stop; 505 for (;;) { 506 /* how long could this one be? */ 507 rest = walk(m, sp, stp, ss, es, false); 508 assert(rest != NULL); /* it did match */ 509 /* could the rest match the rest? */ 510 tail = walk(m, rest, stop, es, stopst, false); 511 if (tail == stop) 512 break; /* yes! */ 513 /* no -- try a shorter match for this one */ 514 stp = rest - 1; 515 assert(stp >= sp); /* it did work */ 516 } 517 ssub = ss + 1; 518 esub = es - 1; 519 ssp = sp; 520 oldssp = ssp; 521 for (;;) { /* find last match of innards */ 522 sep = walk(m, ssp, rest, ssub, esub, false); 523 if (sep == NULL || sep == ssp) 524 break; /* failed or matched null */ 525 oldssp = ssp; /* on to next try */ 526 ssp = sep; 527 } 528 if (sep == NULL) { 529 /* last successful match */ 530 sep = ssp; 531 ssp = oldssp; 532 } 533 assert(sep == rest); /* must exhaust substring */ 534 assert(walk(m, ssp, sep, ssub, esub, false) == rest); 535 dp = dissect(m, ssp, sep, ssub, esub); 536 assert(dp == sep); 537 sp = rest; 538 break; 539 case OCH_: 540 stp = stop; 541 for (;;) { 542 /* how long could this one be? */ 543 rest = walk(m, sp, stp, ss, es, false); 544 assert(rest != NULL); /* it did match */ 545 /* could the rest match the rest? */ 546 tail = walk(m, rest, stop, es, stopst, false); 547 if (tail == stop) 548 break; /* yes! */ 549 /* no -- try a shorter match for this one */ 550 stp = rest - 1; 551 assert(stp >= sp); /* it did work */ 552 } 553 ssub = ss + 1; 554 esub = ss + OPND(m->g->strip[ss]) - 1; 555 assert(OP(m->g->strip[esub]) == OOR1); 556 for (;;) { /* find first matching branch */ 557 if (walk(m, sp, rest, ssub, esub, false) == rest) 558 break; /* it matched all of it */ 559 /* that one missed, try next one */ 560 assert(OP(m->g->strip[esub]) == OOR1); 561 esub++; 562 assert(OP(m->g->strip[esub]) == OOR2); 563 ssub = esub + 1; 564 esub += OPND(m->g->strip[esub]); 565 if (OP(m->g->strip[esub]) == (sop)OOR2) 566 esub--; 567 else 568 assert(OP(m->g->strip[esub]) == O_CH); 569 } 570 dp = dissect(m, sp, rest, ssub, esub); 571 assert(dp == rest); 572 sp = rest; 573 break; 574 case O_PLUS: 575 case O_QUEST: 576 case OOR1: 577 case OOR2: 578 case O_CH: 579 assert(nope); 580 break; 581 case OLPAREN: 582 i = OPND(m->g->strip[ss]); 583 assert(0 < i && i <= m->g->nsub); 584 m->pmatch[i].rm_so = sp - m->offp; 585 break; 586 case ORPAREN: 587 i = OPND(m->g->strip[ss]); 588 assert(0 < i && i <= m->g->nsub); 589 m->pmatch[i].rm_eo = sp - m->offp; 590 break; 591 default: /* uh oh */ 592 assert(nope); 593 break; 594 } 595 } 596 597 assert(sp == stop); 598 return(sp); 599 } 600 601 #define ISBOW(m, sp) \ 602 (sp < m->endp && ISWORD(*sp) && \ 603 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || \ 604 (sp > m->offp && !ISWORD(*(sp-1))))) 605 #define ISEOW(m, sp) \ 606 (((sp == m->endp && !(m->eflags®_NOTEOL)) || \ 607 (sp < m->endp && *sp == '\n' && \ 608 (m->g->cflags®_NEWLINE)) || \ 609 (sp < m->endp && !ISWORD(*sp)) ) && \ 610 (sp > m->beginp && ISWORD(*(sp-1)))) \ 611 612 /* 613 - backref - figure out what matched what, figuring in back references 614 == static const char *backref(struct match *m, const char *start, \ 615 == const char *stop, sopno startst, sopno stopst, sopno lev); 616 */ 617 static const char * /* == stop (success) or NULL (failure) */ 618 backref(struct match *m, 619 const char *start, 620 const char *stop, 621 sopno startst, 622 sopno stopst, 623 sopno lev, /* PLUS nesting level */ 624 int rec) 625 { 626 int i; 627 sopno ss; /* start sop of current subRE */ 628 const char *sp; /* start of string matched by it */ 629 sopno ssub; /* start sop of subsubRE */ 630 sopno esub; /* end sop of subsubRE */ 631 const char *ssp; /* start of string matched by subsubRE */ 632 const char *dp; 633 size_t len; 634 int hard; 635 sop s; 636 regoff_t offsave; 637 cset *cs; 638 wint_t wc; 639 640 AT("back", start, stop, startst, stopst); 641 sp = start; 642 643 /* get as far as we can with easy stuff */ 644 hard = 0; 645 for (ss = startst; !hard && ss < stopst; ss++) 646 switch (OP(s = m->g->strip[ss])) { 647 case OCHAR: 648 if (sp == stop) 649 return(NULL); 650 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 651 if (wc != OPND(s)) 652 return(NULL); 653 break; 654 case OANY: 655 if (sp == stop) 656 return(NULL); 657 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 658 if (wc == BADCHAR) 659 return (NULL); 660 break; 661 case OANYOF: 662 if (sp == stop) 663 return (NULL); 664 cs = &m->g->sets[OPND(s)]; 665 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 666 if (wc == BADCHAR || !CHIN(cs, wc)) 667 return(NULL); 668 break; 669 case OBOS: 670 if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0) 671 { /* yes */ } 672 else 673 return(NULL); 674 break; 675 case OEOS: 676 if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0) 677 { /* yes */ } 678 else 679 return(NULL); 680 break; 681 case OBOL: 682 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 683 (sp > m->offp && sp < m->endp && 684 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) 685 { /* yes */ } 686 else 687 return(NULL); 688 break; 689 case OEOL: 690 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 691 (sp < m->endp && *sp == '\n' && 692 (m->g->cflags®_NEWLINE)) ) 693 { /* yes */ } 694 else 695 return(NULL); 696 break; 697 case OWBND: 698 if (ISBOW(m, sp) || ISEOW(m, sp)) 699 { /* yes */ } 700 else 701 return(NULL); 702 break; 703 case ONWBND: 704 if (((sp == m->beginp) && !ISWORD(*sp)) || 705 (sp == m->endp && !ISWORD(*(sp - 1)))) 706 { /* yes, beginning/end of subject */ } 707 else if (ISWORD(*(sp - 1)) == ISWORD(*sp)) 708 { /* yes, beginning/end of subject */ } 709 else 710 return(NULL); 711 break; 712 case OBOW: 713 if (ISBOW(m, sp)) 714 { /* yes */ } 715 else 716 return(NULL); 717 break; 718 case OEOW: 719 if (ISEOW(m, sp)) 720 { /* yes */ } 721 else 722 return(NULL); 723 break; 724 case O_QUEST: 725 break; 726 case OOR1: /* matches null but needs to skip */ 727 ss++; 728 s = m->g->strip[ss]; 729 do { 730 assert(OP(s) == OOR2); 731 ss += OPND(s); 732 } while (OP(s = m->g->strip[ss]) != (sop)O_CH); 733 /* note that the ss++ gets us past the O_CH */ 734 break; 735 default: /* have to make a choice */ 736 hard = 1; 737 break; 738 } 739 if (!hard) { /* that was it! */ 740 if (sp != stop) 741 return(NULL); 742 return(sp); 743 } 744 ss--; /* adjust for the for's final increment */ 745 746 /* the hard stuff */ 747 AT("hard", sp, stop, ss, stopst); 748 s = m->g->strip[ss]; 749 switch (OP(s)) { 750 case OBACK_: /* the vilest depths */ 751 i = OPND(s); 752 assert(0 < i && i <= m->g->nsub); 753 if (m->pmatch[i].rm_eo == -1) 754 return(NULL); 755 assert(m->pmatch[i].rm_so != -1); 756 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 757 if (len == 0 && rec++ > MAX_RECURSION) 758 return(NULL); 759 assert(stop - m->beginp >= len); 760 if (sp > stop - len) 761 return(NULL); /* not enough left to match */ 762 ssp = m->offp + m->pmatch[i].rm_so; 763 if (memcmp(sp, ssp, len) != 0) 764 return(NULL); 765 while (m->g->strip[ss] != (sop)SOP(O_BACK, i)) 766 ss++; 767 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 768 case OQUEST_: /* to null or not */ 769 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 770 if (dp != NULL) 771 return(dp); /* not */ 772 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 773 case OPLUS_: 774 assert(m->lastpos != NULL); 775 assert(lev+1 <= m->g->nplus); 776 m->lastpos[lev+1] = sp; 777 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 778 case O_PLUS: 779 if (sp == m->lastpos[lev]) /* last pass matched null */ 780 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 781 /* try another pass */ 782 m->lastpos[lev] = sp; 783 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 784 if (dp == NULL) 785 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 786 else 787 return(dp); 788 case OCH_: /* find the right one, if any */ 789 ssub = ss + 1; 790 esub = ss + OPND(s) - 1; 791 assert(OP(m->g->strip[esub]) == OOR1); 792 for (;;) { /* find first matching branch */ 793 dp = backref(m, sp, stop, ssub, esub, lev, rec); 794 if (dp != NULL) 795 return(dp); 796 /* that one missed, try next one */ 797 if (OP(m->g->strip[esub]) == (sop)O_CH) 798 return(NULL); /* there is none */ 799 esub++; 800 assert(OP(m->g->strip[esub]) == (sop)OOR2); 801 ssub = esub + 1; 802 esub += OPND(m->g->strip[esub]); 803 if (OP(m->g->strip[esub]) == (sop)OOR2) 804 esub--; 805 else 806 assert(OP(m->g->strip[esub]) == O_CH); 807 } 808 /* NOTREACHED */ 809 break; 810 case OLPAREN: /* must undo assignment if rest fails */ 811 i = OPND(s); 812 assert(0 < i && i <= m->g->nsub); 813 offsave = m->pmatch[i].rm_so; 814 m->pmatch[i].rm_so = sp - m->offp; 815 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 816 if (dp != NULL) 817 return(dp); 818 m->pmatch[i].rm_so = offsave; 819 return(NULL); 820 case ORPAREN: /* must undo assignment if rest fails */ 821 i = OPND(s); 822 assert(0 < i && i <= m->g->nsub); 823 offsave = m->pmatch[i].rm_eo; 824 m->pmatch[i].rm_eo = sp - m->offp; 825 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 826 if (dp != NULL) 827 return(dp); 828 m->pmatch[i].rm_eo = offsave; 829 return(NULL); 830 default: /* uh oh */ 831 assert(nope); 832 break; 833 } 834 835 /* "can't happen" */ 836 assert(nope); 837 /* NOTREACHED */ 838 return "shut up gcc"; 839 } 840 841 /* 842 - walk - step through the string either quickly or slowly 843 == static const char *walk(struct match *m, const char *start, \ 844 == const char *stop, sopno startst, sopno stopst, bool fast); 845 */ 846 static const char * /* where it ended, or NULL */ 847 walk(struct match *m, const char *start, const char *stop, sopno startst, 848 sopno stopst, bool fast) 849 { 850 states st = m->st; 851 states fresh = m->fresh; 852 states empty = m->empty; 853 states tmp = m->tmp; 854 const char *p = start; 855 wint_t c; 856 wint_t lastc; /* previous c */ 857 wint_t flagch; 858 int i, sflags; 859 const char *matchp; /* last p at which a match ended */ 860 size_t clen; 861 862 sflags = 0; 863 AT("slow", start, stop, startst, stopst); 864 CLEAR(st); 865 SET1(st, startst); 866 SP("sstart", st, *p); 867 st = step(m->g, startst, stopst, st, NOTHING, st, sflags); 868 if (fast) 869 ASSIGN(fresh, st); 870 matchp = NULL; 871 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 872 c = OUT; 873 else { 874 /* 875 * XXX Wrong if the previous character was multi-byte. 876 * Newline never is (in encodings supported by FreeBSD), 877 * so this only breaks the ISWORD tests below. 878 */ 879 c = (uch)*(start - 1); 880 } 881 for (;;) { 882 /* next character */ 883 lastc = c; 884 sflags = 0; 885 if (p == m->endp) { 886 c = OUT; 887 clen = 0; 888 } else 889 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 890 891 if (fast && EQ(st, fresh)) 892 matchp = p; 893 894 /* is there an EOL and/or BOL between lastc and c? */ 895 flagch = '\0'; 896 i = 0; 897 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 898 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 899 flagch = BOL; 900 i = m->g->nbol; 901 } 902 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 903 (c == OUT && !(m->eflags®_NOTEOL)) ) { 904 flagch = (flagch == BOL) ? BOLEOL : EOL; 905 i += m->g->neol; 906 } 907 if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) { 908 sflags |= SBOS; 909 /* Step one more for BOS. */ 910 i++; 911 } 912 if (c == OUT && (m->eflags & REG_NOTEOL) == 0) { 913 sflags |= SEOS; 914 /* Step one more for EOS. */ 915 i++; 916 } 917 if (i != 0) { 918 for (; i > 0; i--) 919 st = step(m->g, startst, stopst, st, flagch, st, 920 sflags); 921 SP("sboleol", st, c); 922 } 923 924 /* how about a word boundary? */ 925 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 926 (c != OUT && ISWORD(c)) ) { 927 flagch = BOW; 928 } 929 if ( (lastc != OUT && ISWORD(lastc)) && 930 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 931 flagch = EOW; 932 } 933 if (flagch == BOW || flagch == EOW) { 934 st = step(m->g, startst, stopst, st, flagch, st, sflags); 935 SP("sboweow", st, c); 936 } 937 if (lastc != OUT && c != OUT && 938 ISWORD(lastc) == ISWORD(c)) { 939 flagch = NWBND; 940 } else if ((lastc == OUT && !ISWORD(c)) || 941 (c == OUT && !ISWORD(lastc))) { 942 flagch = NWBND; 943 } 944 if (flagch == NWBND) { 945 st = step(m->g, startst, stopst, st, flagch, st, sflags); 946 SP("snwbnd", st, c); 947 } 948 949 /* are we done? */ 950 if (ISSET(st, stopst)) { 951 if (fast) 952 break; 953 else 954 matchp = p; 955 } 956 if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p)) 957 break; /* NOTE BREAK OUT */ 958 959 /* no, we must deal with this character */ 960 ASSIGN(tmp, st); 961 if (fast) 962 ASSIGN(st, fresh); 963 else 964 ASSIGN(st, empty); 965 assert(c != OUT); 966 st = step(m->g, startst, stopst, tmp, c, st, sflags); 967 SP("saft", st, c); 968 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags), 969 st)); 970 p += clen; 971 } 972 973 if (fast) { 974 assert(matchp != NULL); 975 m->coldp = matchp; 976 if (ISSET(st, stopst)) 977 return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 978 else 979 return (NULL); 980 } else 981 return (matchp); 982 } 983 984 /* 985 - step - map set of states reachable before char to set reachable after 986 == static states step(struct re_guts *g, sopno start, sopno stop, \ 987 == states bef, int ch, states aft); 988 == #define BOL (OUT-1) 989 == #define EOL (BOL-1) 990 == #define BOLEOL (BOL-2) 991 == #define NOTHING (BOL-3) 992 == #define BOW (BOL-4) 993 == #define EOW (BOL-5) 994 == #define BADCHAR (BOL-6) 995 == #define NONCHAR(c) ((c) <= OUT) 996 */ 997 static states 998 step(struct re_guts *g, 999 sopno start, /* start state within strip */ 1000 sopno stop, /* state after stop state within strip */ 1001 states bef, /* states reachable before */ 1002 wint_t ch, /* character or NONCHAR code */ 1003 states aft, /* states already known reachable after */ 1004 int sflags) /* state flags */ 1005 { 1006 cset *cs; 1007 sop s; 1008 sopno pc; 1009 onestate here; /* note, macros know this name */ 1010 sopno look; 1011 int i; 1012 1013 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 1014 s = g->strip[pc]; 1015 switch (OP(s)) { 1016 case OEND: 1017 assert(pc == stop-1); 1018 break; 1019 case OCHAR: 1020 /* only characters can match */ 1021 assert(!NONCHAR(ch) || ch != OPND(s)); 1022 if (ch == OPND(s)) 1023 FWD(aft, bef, 1); 1024 break; 1025 case OBOS: 1026 if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0) 1027 FWD(aft, bef, 1); 1028 break; 1029 case OEOS: 1030 if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0) 1031 FWD(aft, bef, 1); 1032 break; 1033 case OBOL: 1034 if (ch == BOL || ch == BOLEOL) 1035 FWD(aft, bef, 1); 1036 break; 1037 case OEOL: 1038 if (ch == EOL || ch == BOLEOL) 1039 FWD(aft, bef, 1); 1040 break; 1041 case OBOW: 1042 if (ch == BOW) 1043 FWD(aft, bef, 1); 1044 break; 1045 case OEOW: 1046 if (ch == EOW) 1047 FWD(aft, bef, 1); 1048 break; 1049 case OWBND: 1050 if (ch == BOW || ch == EOW) 1051 FWD(aft, bef, 1); 1052 break; 1053 case ONWBND: 1054 if (ch == NWBND) 1055 FWD(aft, aft, 1); 1056 break; 1057 case OANY: 1058 if (!NONCHAR(ch)) 1059 FWD(aft, bef, 1); 1060 break; 1061 case OANYOF: 1062 cs = &g->sets[OPND(s)]; 1063 if (!NONCHAR(ch) && CHIN(cs, ch)) 1064 FWD(aft, bef, 1); 1065 break; 1066 case OBACK_: /* ignored here */ 1067 case O_BACK: 1068 FWD(aft, aft, 1); 1069 break; 1070 case OPLUS_: /* forward, this is just an empty */ 1071 FWD(aft, aft, 1); 1072 break; 1073 case O_PLUS: /* both forward and back */ 1074 FWD(aft, aft, 1); 1075 i = ISSETBACK(aft, OPND(s)); 1076 BACK(aft, aft, OPND(s)); 1077 if (!i && ISSETBACK(aft, OPND(s))) { 1078 /* oho, must reconsider loop body */ 1079 pc -= OPND(s) + 1; 1080 INIT(here, pc); 1081 } 1082 break; 1083 case OQUEST_: /* two branches, both forward */ 1084 FWD(aft, aft, 1); 1085 FWD(aft, aft, OPND(s)); 1086 break; 1087 case O_QUEST: /* just an empty */ 1088 FWD(aft, aft, 1); 1089 break; 1090 case OLPAREN: /* not significant here */ 1091 case ORPAREN: 1092 FWD(aft, aft, 1); 1093 break; 1094 case OCH_: /* mark the first two branches */ 1095 FWD(aft, aft, 1); 1096 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 1097 FWD(aft, aft, OPND(s)); 1098 break; 1099 case OOR1: /* done a branch, find the O_CH */ 1100 if (ISSTATEIN(aft, here)) { 1101 for (look = 1; 1102 OP(s = g->strip[pc+look]) != (sop)O_CH; 1103 look += OPND(s)) 1104 assert(OP(s) == (sop)OOR2); 1105 FWD(aft, aft, look + 1); 1106 } 1107 break; 1108 case OOR2: /* propagate OCH_'s marking */ 1109 FWD(aft, aft, 1); 1110 if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) { 1111 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 1112 FWD(aft, aft, OPND(s)); 1113 } 1114 break; 1115 case O_CH: /* just empty */ 1116 FWD(aft, aft, 1); 1117 break; 1118 default: /* ooooops... */ 1119 assert(nope); 1120 break; 1121 } 1122 } 1123 1124 return(aft); 1125 } 1126 1127 #ifdef REDEBUG 1128 /* 1129 - print - print a set of states 1130 == #ifdef REDEBUG 1131 == static void print(struct match *m, const char *caption, states st, \ 1132 == int ch, FILE *d); 1133 == #endif 1134 */ 1135 static void 1136 print(struct match *m, 1137 const char *caption, 1138 states st, 1139 int ch, 1140 FILE *d) 1141 { 1142 struct re_guts *g = m->g; 1143 sopno i; 1144 int first = 1; 1145 1146 if (!(m->eflags®_TRACE)) 1147 return; 1148 1149 fprintf(d, "%s", caption); 1150 if (ch != '\0') 1151 fprintf(d, " %s", pchar(ch)); 1152 for (i = 0; i < g->nstates; i++) 1153 if (ISSET(st, i)) { 1154 fprintf(d, "%s%lu", (first) ? "\t" : ", ", i); 1155 first = 0; 1156 } 1157 fprintf(d, "\n"); 1158 } 1159 1160 /* 1161 - at - print current situation 1162 == #ifdef REDEBUG 1163 == static void at(struct match *m, const char *title, const char *start, \ 1164 == const char *stop, sopno startst, sopno stopst); 1165 == #endif 1166 */ 1167 static void 1168 at( struct match *m, 1169 const char *title, 1170 const char *start, 1171 const char *stop, 1172 sopno startst, 1173 sopno stopst) 1174 { 1175 if (!(m->eflags®_TRACE)) 1176 return; 1177 1178 printf("%s %s-", title, pchar(*start)); 1179 printf("%s ", pchar(*stop)); 1180 printf("%ld-%ld\n", (long)startst, (long)stopst); 1181 } 1182 1183 #ifndef PCHARDONE 1184 #define PCHARDONE /* never again */ 1185 /* 1186 - pchar - make a character printable 1187 == #ifdef REDEBUG 1188 == static const char *pchar(int ch); 1189 == #endif 1190 * 1191 * Is this identical to regchar() over in debug.c? Well, yes. But a 1192 * duplicate here avoids having a debugging-capable regexec.o tied to 1193 * a matching debug.o, and this is convenient. It all disappears in 1194 * the non-debug compilation anyway, so it doesn't matter much. 1195 */ 1196 static const char * /* -> representation */ 1197 pchar(int ch) 1198 { 1199 static char pbuf[10]; 1200 1201 if (isprint((uch)ch) || ch == ' ') 1202 sprintf(pbuf, "%c", ch); 1203 else 1204 sprintf(pbuf, "\\%o", ch); 1205 return(pbuf); 1206 } 1207 #endif 1208 #endif 1209 1210 #undef stepback 1211 #undef matcher 1212 #undef walk 1213 #undef dissect 1214 #undef backref 1215 #undef step 1216 #undef print 1217 #undef at 1218 #undef match 1219