1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38 #include <stdbool.h> 39 40 /* 41 * The matching engine and friends. This file is #included by regexec.c 42 * after suitable #defines of a variety of macros used herein, so that 43 * different state representations can be used without duplicating masses 44 * of code. 45 */ 46 47 #ifdef SNAMES 48 #define stepback sstepback 49 #define matcher smatcher 50 #define walk swalk 51 #define dissect sdissect 52 #define backref sbackref 53 #define step sstep 54 #define print sprint 55 #define at sat 56 #define match smat 57 #endif 58 #ifdef LNAMES 59 #define stepback lstepback 60 #define matcher lmatcher 61 #define walk lwalk 62 #define dissect ldissect 63 #define backref lbackref 64 #define step lstep 65 #define print lprint 66 #define at lat 67 #define match lmat 68 #endif 69 #ifdef MNAMES 70 #define stepback mstepback 71 #define matcher mmatcher 72 #define walk mwalk 73 #define dissect mdissect 74 #define backref mbackref 75 #define step mstep 76 #define print mprint 77 #define at mat 78 #define match mmat 79 #endif 80 81 /* another structure passed up and down to avoid zillions of parameters */ 82 struct match { 83 struct re_guts *g; 84 int eflags; 85 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 86 const char *offp; /* offsets work from here */ 87 const char *beginp; /* start of string -- virtual NUL precedes */ 88 const char *endp; /* end of string -- virtual NUL here */ 89 const char *coldp; /* can be no match starting before here */ 90 const char **lastpos; /* [nplus+1] */ 91 STATEVARS; 92 states st; /* current states */ 93 states fresh; /* states for a fresh start */ 94 states tmp; /* temporary */ 95 states empty; /* empty set of states */ 96 mbstate_t mbs; /* multibyte conversion state */ 97 }; 98 99 /* ========= begin header generated by ./mkh ========= */ 100 #ifdef __cplusplus 101 extern "C" { 102 #endif 103 104 /* === engine.c === */ 105 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 106 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 107 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); 108 static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast); 109 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags); 110 #define MAX_RECURSION 100 111 #define BOL (OUT-1) 112 #define EOL (BOL-1) 113 #define BOLEOL (BOL-2) 114 #define NOTHING (BOL-3) 115 #define BOW (BOL-4) 116 #define EOW (BOL-5) 117 #define BADCHAR (BOL-6) 118 #define NWBND (BOL-7) 119 #define NONCHAR(c) ((c) <= OUT) 120 /* sflags */ 121 #define SBOS 0x0001 122 #define SEOS 0x0002 123 124 #ifdef REDEBUG 125 static void print(struct match *m, const char *caption, states st, int ch, FILE *d); 126 #endif 127 #ifdef REDEBUG 128 static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst); 129 #endif 130 #ifdef REDEBUG 131 static const char *pchar(int ch); 132 #endif 133 134 #ifdef __cplusplus 135 } 136 #endif 137 /* ========= end header generated by ./mkh ========= */ 138 139 #ifdef REDEBUG 140 #define SP(t, s, c) print(m, t, s, c, stdout) 141 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 142 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 143 #else 144 #define SP(t, s, c) /* nothing */ 145 #define AT(t, p1, p2, s1, s2) /* nothing */ 146 #define NOTE(s) /* nothing */ 147 #endif 148 149 /* 150 * Given a multibyte string pointed to by start, step back nchar characters 151 * from current position pointed to by cur. 152 */ 153 static const char * 154 stepback(const char *start, const char *cur, int nchar) 155 { 156 const char *ret; 157 int wc, mbc; 158 mbstate_t mbs; 159 size_t clen; 160 161 if (MB_CUR_MAX == 1) 162 return ((cur - nchar) > start ? cur - nchar : NULL); 163 164 ret = cur; 165 for (wc = nchar; wc > 0; wc--) { 166 for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) { 167 if ((ret - mbc) < start) 168 return (NULL); 169 memset(&mbs, 0, sizeof(mbs)); 170 clen = mbrtowc(NULL, ret - mbc, mbc, &mbs); 171 if (clen != (size_t)-1 && clen != (size_t)-2) 172 break; 173 } 174 if (mbc > MB_CUR_MAX) 175 return (NULL); 176 ret -= mbc; 177 } 178 179 return (ret); 180 } 181 182 /* 183 - matcher - the actual matching engine 184 == static int matcher(struct re_guts *g, const char *string, \ 185 == size_t nmatch, regmatch_t pmatch[], int eflags); 186 */ 187 static int /* 0 success, REG_NOMATCH failure */ 188 matcher(struct re_guts *g, 189 const char *string, 190 size_t nmatch, 191 regmatch_t pmatch[], 192 int eflags) 193 { 194 const char *endp; 195 size_t i; 196 struct match mv; 197 struct match *m = &mv; 198 const char *dp = NULL; 199 const sopno gf = g->firststate+1; /* +1 for OEND */ 200 const sopno gl = g->laststate; 201 const char *start; 202 const char *stop; 203 /* Boyer-Moore algorithms variables */ 204 const char *pp; 205 int cj, mj; 206 const char *mustfirst; 207 const char *mustlast; 208 int *matchjump; 209 int *charjump; 210 211 /* simplify the situation where possible */ 212 if (g->cflags®_NOSUB) 213 nmatch = 0; 214 if (eflags®_STARTEND) { 215 start = string + pmatch[0].rm_so; 216 stop = string + pmatch[0].rm_eo; 217 } else { 218 start = string; 219 stop = start + strlen(start); 220 } 221 if (stop < start) 222 return(REG_INVARG); 223 224 /* prescreening; this does wonders for this rather slow code */ 225 if (g->must != NULL) { 226 if (g->charjump != NULL && g->matchjump != NULL) { 227 mustfirst = g->must; 228 mustlast = g->must + g->mlen - 1; 229 charjump = g->charjump; 230 matchjump = g->matchjump; 231 pp = mustlast; 232 for (dp = start+g->mlen-1; dp < stop;) { 233 /* Fast skip non-matches */ 234 while (dp < stop && charjump[(int)*dp]) 235 dp += charjump[(int)*dp]; 236 237 if (dp >= stop) 238 break; 239 240 /* Greedy matcher */ 241 /* We depend on not being used for 242 * for strings of length 1 243 */ 244 while (*--dp == *--pp && pp != mustfirst); 245 246 if (*dp == *pp) 247 break; 248 249 /* Jump to next possible match */ 250 mj = matchjump[pp - mustfirst]; 251 cj = charjump[(int)*dp]; 252 dp += (cj < mj ? mj : cj); 253 pp = mustlast; 254 } 255 if (pp != mustfirst) 256 return(REG_NOMATCH); 257 } else { 258 for (dp = start; dp < stop; dp++) 259 if (*dp == g->must[0] && 260 stop - dp >= g->mlen && 261 memcmp(dp, g->must, (size_t)g->mlen) == 0) 262 break; 263 if (dp == stop) /* we didn't find g->must */ 264 return(REG_NOMATCH); 265 } 266 } 267 268 /* match struct setup */ 269 m->g = g; 270 m->eflags = eflags; 271 m->pmatch = NULL; 272 m->lastpos = NULL; 273 m->offp = string; 274 m->beginp = start; 275 m->endp = stop; 276 STATESETUP(m, 4); 277 SETUP(m->st); 278 SETUP(m->fresh); 279 SETUP(m->tmp); 280 SETUP(m->empty); 281 CLEAR(m->empty); 282 ZAPSTATE(&m->mbs); 283 284 /* Adjust start according to moffset, to speed things up */ 285 if (dp != NULL && g->moffset > -1) { 286 const char *nstart; 287 288 nstart = stepback(start, dp, g->moffset); 289 if (nstart != NULL) 290 start = nstart; 291 } 292 293 SP("mloop", m->st, *start); 294 295 /* this loop does only one repetition except for backrefs */ 296 for (;;) { 297 endp = walk(m, start, stop, gf, gl, true); 298 if (endp == NULL) { /* a miss */ 299 if (m->pmatch != NULL) 300 free((char *)m->pmatch); 301 if (m->lastpos != NULL) 302 free((char *)m->lastpos); 303 STATETEARDOWN(m); 304 return(REG_NOMATCH); 305 } 306 if (nmatch == 0 && !g->backrefs) 307 break; /* no further info needed */ 308 309 /* where? */ 310 assert(m->coldp != NULL); 311 for (;;) { 312 NOTE("finding start"); 313 endp = walk(m, m->coldp, stop, gf, gl, false); 314 if (endp != NULL) 315 break; 316 assert(m->coldp < m->endp); 317 m->coldp += XMBRTOWC(NULL, m->coldp, 318 m->endp - m->coldp, &m->mbs, 0); 319 } 320 if (nmatch == 1 && !g->backrefs) 321 break; /* no further info needed */ 322 323 /* oh my, he wants the subexpressions... */ 324 if (m->pmatch == NULL) 325 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 326 sizeof(regmatch_t)); 327 if (m->pmatch == NULL) { 328 STATETEARDOWN(m); 329 return(REG_ESPACE); 330 } 331 for (i = 1; i <= m->g->nsub; i++) 332 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 333 if (!g->backrefs && !(m->eflags®_BACKR)) { 334 NOTE("dissecting"); 335 dp = dissect(m, m->coldp, endp, gf, gl); 336 } else { 337 if (g->nplus > 0 && m->lastpos == NULL) 338 m->lastpos = malloc((g->nplus+1) * 339 sizeof(const char *)); 340 if (g->nplus > 0 && m->lastpos == NULL) { 341 free(m->pmatch); 342 STATETEARDOWN(m); 343 return(REG_ESPACE); 344 } 345 NOTE("backref dissect"); 346 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 347 } 348 if (dp != NULL) 349 break; 350 351 /* uh-oh... we couldn't find a subexpression-level match */ 352 assert(g->backrefs); /* must be back references doing it */ 353 assert(g->nplus == 0 || m->lastpos != NULL); 354 for (;;) { 355 if (dp != NULL || endp <= m->coldp) 356 break; /* defeat */ 357 NOTE("backoff"); 358 endp = walk(m, m->coldp, endp-1, gf, gl, false); 359 if (endp == NULL) 360 break; /* defeat */ 361 /* try it on a shorter possibility */ 362 #ifndef NDEBUG 363 for (i = 1; i <= m->g->nsub; i++) { 364 assert(m->pmatch[i].rm_so == -1); 365 assert(m->pmatch[i].rm_eo == -1); 366 } 367 #endif 368 NOTE("backoff dissect"); 369 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 370 } 371 assert(dp == NULL || dp == endp); 372 if (dp != NULL) /* found a shorter one */ 373 break; 374 375 /* despite initial appearances, there is no match here */ 376 NOTE("false alarm"); 377 /* recycle starting later */ 378 start = m->coldp + XMBRTOWC(NULL, m->coldp, 379 stop - m->coldp, &m->mbs, 0); 380 assert(start <= stop); 381 } 382 383 /* fill in the details if requested */ 384 if (nmatch > 0) { 385 pmatch[0].rm_so = m->coldp - m->offp; 386 pmatch[0].rm_eo = endp - m->offp; 387 } 388 if (nmatch > 1) { 389 assert(m->pmatch != NULL); 390 for (i = 1; i < nmatch; i++) 391 if (i <= m->g->nsub) 392 pmatch[i] = m->pmatch[i]; 393 else { 394 pmatch[i].rm_so = -1; 395 pmatch[i].rm_eo = -1; 396 } 397 } 398 399 if (m->pmatch != NULL) 400 free((char *)m->pmatch); 401 if (m->lastpos != NULL) 402 free((char *)m->lastpos); 403 STATETEARDOWN(m); 404 return(0); 405 } 406 407 /* 408 - dissect - figure out what matched what, no back references 409 == static const char *dissect(struct match *m, const char *start, \ 410 == const char *stop, sopno startst, sopno stopst); 411 */ 412 static const char * /* == stop (success) always */ 413 dissect(struct match *m, 414 const char *start, 415 const char *stop, 416 sopno startst, 417 sopno stopst) 418 { 419 int i; 420 sopno ss; /* start sop of current subRE */ 421 sopno es; /* end sop of current subRE */ 422 const char *sp; /* start of string matched by it */ 423 const char *stp; /* string matched by it cannot pass here */ 424 const char *rest; /* start of rest of string */ 425 const char *tail; /* string unmatched by rest of RE */ 426 sopno ssub; /* start sop of subsubRE */ 427 sopno esub; /* end sop of subsubRE */ 428 const char *ssp; /* start of string matched by subsubRE */ 429 const char *sep; /* end of string matched by subsubRE */ 430 const char *oldssp; /* previous ssp */ 431 const char *dp __unused; 432 433 AT("diss", start, stop, startst, stopst); 434 sp = start; 435 for (ss = startst; ss < stopst; ss = es) { 436 /* identify end of subRE */ 437 es = ss; 438 switch (OP(m->g->strip[es])) { 439 case OPLUS_: 440 case OQUEST_: 441 es += OPND(m->g->strip[es]); 442 break; 443 case OCH_: 444 while (OP(m->g->strip[es]) != (sop)O_CH) 445 es += OPND(m->g->strip[es]); 446 break; 447 } 448 es++; 449 450 /* figure out what it matched */ 451 switch (OP(m->g->strip[ss])) { 452 case OEND: 453 assert(nope); 454 break; 455 case OCHAR: 456 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 457 break; 458 case OBOL: 459 case OEOL: 460 case OBOW: 461 case OEOW: 462 case OBOS: 463 case OEOS: 464 case OWBND: 465 case ONWBND: 466 break; 467 case OANY: 468 case OANYOF: 469 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 470 break; 471 case OBACK_: 472 case O_BACK: 473 assert(nope); 474 break; 475 /* cases where length of match is hard to find */ 476 case OQUEST_: 477 stp = stop; 478 for (;;) { 479 /* how long could this one be? */ 480 rest = walk(m, sp, stp, ss, es, false); 481 assert(rest != NULL); /* it did match */ 482 /* could the rest match the rest? */ 483 tail = walk(m, rest, stop, es, stopst, false); 484 if (tail == stop) 485 break; /* yes! */ 486 /* no -- try a shorter match for this one */ 487 stp = rest - 1; 488 assert(stp >= sp); /* it did work */ 489 } 490 ssub = ss + 1; 491 esub = es - 1; 492 /* did innards match? */ 493 if (walk(m, sp, rest, ssub, esub, false) != NULL) { 494 dp = dissect(m, sp, rest, ssub, esub); 495 assert(dp == rest); 496 } else /* no */ 497 assert(sp == rest); 498 sp = rest; 499 break; 500 case OPLUS_: 501 stp = stop; 502 for (;;) { 503 /* how long could this one be? */ 504 rest = walk(m, sp, stp, ss, es, false); 505 assert(rest != NULL); /* it did match */ 506 /* could the rest match the rest? */ 507 tail = walk(m, rest, stop, es, stopst, false); 508 if (tail == stop) 509 break; /* yes! */ 510 /* no -- try a shorter match for this one */ 511 stp = rest - 1; 512 assert(stp >= sp); /* it did work */ 513 } 514 ssub = ss + 1; 515 esub = es - 1; 516 ssp = sp; 517 oldssp = ssp; 518 for (;;) { /* find last match of innards */ 519 sep = walk(m, ssp, rest, ssub, esub, false); 520 if (sep == NULL || sep == ssp) 521 break; /* failed or matched null */ 522 oldssp = ssp; /* on to next try */ 523 ssp = sep; 524 } 525 if (sep == NULL) { 526 /* last successful match */ 527 sep = ssp; 528 ssp = oldssp; 529 } 530 assert(sep == rest); /* must exhaust substring */ 531 assert(walk(m, ssp, sep, ssub, esub, false) == rest); 532 dp = dissect(m, ssp, sep, ssub, esub); 533 assert(dp == sep); 534 sp = rest; 535 break; 536 case OCH_: 537 stp = stop; 538 for (;;) { 539 /* how long could this one be? */ 540 rest = walk(m, sp, stp, ss, es, false); 541 assert(rest != NULL); /* it did match */ 542 /* could the rest match the rest? */ 543 tail = walk(m, rest, stop, es, stopst, false); 544 if (tail == stop) 545 break; /* yes! */ 546 /* no -- try a shorter match for this one */ 547 stp = rest - 1; 548 assert(stp >= sp); /* it did work */ 549 } 550 ssub = ss + 1; 551 esub = ss + OPND(m->g->strip[ss]) - 1; 552 assert(OP(m->g->strip[esub]) == OOR1); 553 for (;;) { /* find first matching branch */ 554 if (walk(m, sp, rest, ssub, esub, false) == rest) 555 break; /* it matched all of it */ 556 /* that one missed, try next one */ 557 assert(OP(m->g->strip[esub]) == OOR1); 558 esub++; 559 assert(OP(m->g->strip[esub]) == OOR2); 560 ssub = esub + 1; 561 esub += OPND(m->g->strip[esub]); 562 if (OP(m->g->strip[esub]) == (sop)OOR2) 563 esub--; 564 else 565 assert(OP(m->g->strip[esub]) == O_CH); 566 } 567 dp = dissect(m, sp, rest, ssub, esub); 568 assert(dp == rest); 569 sp = rest; 570 break; 571 case O_PLUS: 572 case O_QUEST: 573 case OOR1: 574 case OOR2: 575 case O_CH: 576 assert(nope); 577 break; 578 case OLPAREN: 579 i = OPND(m->g->strip[ss]); 580 assert(0 < i && i <= m->g->nsub); 581 m->pmatch[i].rm_so = sp - m->offp; 582 break; 583 case ORPAREN: 584 i = OPND(m->g->strip[ss]); 585 assert(0 < i && i <= m->g->nsub); 586 m->pmatch[i].rm_eo = sp - m->offp; 587 break; 588 default: /* uh oh */ 589 assert(nope); 590 break; 591 } 592 } 593 594 assert(sp == stop); 595 return(sp); 596 } 597 598 #define ISBOW(m, sp) \ 599 (sp < m->endp && ISWORD(*sp) && \ 600 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || \ 601 (sp > m->offp && !ISWORD(*(sp-1))))) 602 #define ISEOW(m, sp) \ 603 (((sp == m->endp && !(m->eflags®_NOTEOL)) || \ 604 (sp < m->endp && *sp == '\n' && \ 605 (m->g->cflags®_NEWLINE)) || \ 606 (sp < m->endp && !ISWORD(*sp)) ) && \ 607 (sp > m->beginp && ISWORD(*(sp-1)))) \ 608 609 /* 610 - backref - figure out what matched what, figuring in back references 611 == static const char *backref(struct match *m, const char *start, \ 612 == const char *stop, sopno startst, sopno stopst, sopno lev); 613 */ 614 static const char * /* == stop (success) or NULL (failure) */ 615 backref(struct match *m, 616 const char *start, 617 const char *stop, 618 sopno startst, 619 sopno stopst, 620 sopno lev, /* PLUS nesting level */ 621 int rec) 622 { 623 int i; 624 sopno ss; /* start sop of current subRE */ 625 const char *sp; /* start of string matched by it */ 626 sopno ssub; /* start sop of subsubRE */ 627 sopno esub; /* end sop of subsubRE */ 628 const char *ssp; /* start of string matched by subsubRE */ 629 const char *dp; 630 size_t len; 631 int hard; 632 sop s; 633 regoff_t offsave; 634 cset *cs; 635 wint_t wc; 636 637 AT("back", start, stop, startst, stopst); 638 sp = start; 639 640 /* get as far as we can with easy stuff */ 641 hard = 0; 642 for (ss = startst; !hard && ss < stopst; ss++) 643 switch (OP(s = m->g->strip[ss])) { 644 case OCHAR: 645 if (sp == stop) 646 return(NULL); 647 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 648 if (wc != OPND(s)) 649 return(NULL); 650 break; 651 case OANY: 652 if (sp == stop) 653 return(NULL); 654 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 655 if (wc == BADCHAR) 656 return (NULL); 657 break; 658 case OANYOF: 659 if (sp == stop) 660 return (NULL); 661 cs = &m->g->sets[OPND(s)]; 662 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 663 if (wc == BADCHAR || !CHIN(cs, wc)) 664 return(NULL); 665 break; 666 case OBOS: 667 if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0) 668 { /* yes */ } 669 else 670 return(NULL); 671 break; 672 case OEOS: 673 if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0) 674 { /* yes */ } 675 else 676 return(NULL); 677 break; 678 case OBOL: 679 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 680 (sp > m->offp && sp < m->endp && 681 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) 682 { /* yes */ } 683 else 684 return(NULL); 685 break; 686 case OEOL: 687 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 688 (sp < m->endp && *sp == '\n' && 689 (m->g->cflags®_NEWLINE)) ) 690 { /* yes */ } 691 else 692 return(NULL); 693 break; 694 case OWBND: 695 if (ISBOW(m, sp) || ISEOW(m, sp)) 696 { /* yes */ } 697 else 698 return(NULL); 699 break; 700 case ONWBND: 701 if (((sp == m->beginp) && !ISWORD(*sp)) || 702 (sp == m->endp && !ISWORD(*(sp - 1)))) 703 { /* yes, beginning/end of subject */ } 704 else if (ISWORD(*(sp - 1)) == ISWORD(*sp)) 705 { /* yes, beginning/end of subject */ } 706 else 707 return(NULL); 708 break; 709 case OBOW: 710 if (ISBOW(m, sp)) 711 { /* yes */ } 712 else 713 return(NULL); 714 break; 715 case OEOW: 716 if (ISEOW(m, sp)) 717 { /* yes */ } 718 else 719 return(NULL); 720 break; 721 case O_QUEST: 722 break; 723 case OOR1: /* matches null but needs to skip */ 724 ss++; 725 s = m->g->strip[ss]; 726 do { 727 assert(OP(s) == OOR2); 728 ss += OPND(s); 729 } while (OP(s = m->g->strip[ss]) != (sop)O_CH); 730 /* note that the ss++ gets us past the O_CH */ 731 break; 732 default: /* have to make a choice */ 733 hard = 1; 734 break; 735 } 736 if (!hard) { /* that was it! */ 737 if (sp != stop) 738 return(NULL); 739 return(sp); 740 } 741 ss--; /* adjust for the for's final increment */ 742 743 /* the hard stuff */ 744 AT("hard", sp, stop, ss, stopst); 745 s = m->g->strip[ss]; 746 switch (OP(s)) { 747 case OBACK_: /* the vilest depths */ 748 i = OPND(s); 749 assert(0 < i && i <= m->g->nsub); 750 if (m->pmatch[i].rm_eo == -1) 751 return(NULL); 752 assert(m->pmatch[i].rm_so != -1); 753 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 754 if (len == 0 && rec++ > MAX_RECURSION) 755 return(NULL); 756 assert(stop - m->beginp >= len); 757 if (sp > stop - len) 758 return(NULL); /* not enough left to match */ 759 ssp = m->offp + m->pmatch[i].rm_so; 760 if (memcmp(sp, ssp, len) != 0) 761 return(NULL); 762 while (m->g->strip[ss] != (sop)SOP(O_BACK, i)) 763 ss++; 764 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 765 case OQUEST_: /* to null or not */ 766 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 767 if (dp != NULL) 768 return(dp); /* not */ 769 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 770 case OPLUS_: 771 assert(m->lastpos != NULL); 772 assert(lev+1 <= m->g->nplus); 773 m->lastpos[lev+1] = sp; 774 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 775 case O_PLUS: 776 if (sp == m->lastpos[lev]) /* last pass matched null */ 777 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 778 /* try another pass */ 779 m->lastpos[lev] = sp; 780 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 781 if (dp == NULL) 782 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 783 else 784 return(dp); 785 case OCH_: /* find the right one, if any */ 786 ssub = ss + 1; 787 esub = ss + OPND(s) - 1; 788 assert(OP(m->g->strip[esub]) == OOR1); 789 for (;;) { /* find first matching branch */ 790 dp = backref(m, sp, stop, ssub, esub, lev, rec); 791 if (dp != NULL) 792 return(dp); 793 /* that one missed, try next one */ 794 if (OP(m->g->strip[esub]) == (sop)O_CH) 795 return(NULL); /* there is none */ 796 esub++; 797 assert(OP(m->g->strip[esub]) == (sop)OOR2); 798 ssub = esub + 1; 799 esub += OPND(m->g->strip[esub]); 800 if (OP(m->g->strip[esub]) == (sop)OOR2) 801 esub--; 802 else 803 assert(OP(m->g->strip[esub]) == O_CH); 804 } 805 /* NOTREACHED */ 806 break; 807 case OLPAREN: /* must undo assignment if rest fails */ 808 i = OPND(s); 809 assert(0 < i && i <= m->g->nsub); 810 offsave = m->pmatch[i].rm_so; 811 m->pmatch[i].rm_so = sp - m->offp; 812 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 813 if (dp != NULL) 814 return(dp); 815 m->pmatch[i].rm_so = offsave; 816 return(NULL); 817 case ORPAREN: /* must undo assignment if rest fails */ 818 i = OPND(s); 819 assert(0 < i && i <= m->g->nsub); 820 offsave = m->pmatch[i].rm_eo; 821 m->pmatch[i].rm_eo = sp - m->offp; 822 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 823 if (dp != NULL) 824 return(dp); 825 m->pmatch[i].rm_eo = offsave; 826 return(NULL); 827 default: /* uh oh */ 828 assert(nope); 829 break; 830 } 831 832 /* "can't happen" */ 833 assert(nope); 834 /* NOTREACHED */ 835 return "shut up gcc"; 836 } 837 838 /* 839 - walk - step through the string either quickly or slowly 840 == static const char *walk(struct match *m, const char *start, \ 841 == const char *stop, sopno startst, sopno stopst, bool fast); 842 */ 843 static const char * /* where it ended, or NULL */ 844 walk(struct match *m, const char *start, const char *stop, sopno startst, 845 sopno stopst, bool fast) 846 { 847 states st = m->st; 848 states fresh = m->fresh; 849 states empty = m->empty; 850 states tmp = m->tmp; 851 const char *p = start; 852 wint_t c; 853 wint_t lastc; /* previous c */ 854 wint_t flagch; 855 int i, sflags; 856 const char *matchp; /* last p at which a match ended */ 857 size_t clen; 858 859 sflags = 0; 860 AT("slow", start, stop, startst, stopst); 861 CLEAR(st); 862 SET1(st, startst); 863 SP("sstart", st, *p); 864 st = step(m->g, startst, stopst, st, NOTHING, st, sflags); 865 if (fast) 866 ASSIGN(fresh, st); 867 matchp = NULL; 868 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 869 c = OUT; 870 else { 871 /* 872 * XXX Wrong if the previous character was multi-byte. 873 * Newline never is (in encodings supported by FreeBSD), 874 * so this only breaks the ISWORD tests below. 875 */ 876 c = (uch)*(start - 1); 877 } 878 for (;;) { 879 /* next character */ 880 lastc = c; 881 sflags = 0; 882 if (p == m->endp) { 883 c = OUT; 884 clen = 0; 885 } else 886 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 887 888 if (fast && EQ(st, fresh)) 889 matchp = p; 890 891 /* is there an EOL and/or BOL between lastc and c? */ 892 flagch = '\0'; 893 i = 0; 894 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 895 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 896 flagch = BOL; 897 i = m->g->nbol; 898 } 899 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 900 (c == OUT && !(m->eflags®_NOTEOL)) ) { 901 flagch = (flagch == BOL) ? BOLEOL : EOL; 902 i += m->g->neol; 903 } 904 if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) { 905 sflags |= SBOS; 906 /* Step one more for BOS. */ 907 i++; 908 } 909 if (c == OUT && (m->eflags & REG_NOTEOL) == 0) { 910 sflags |= SEOS; 911 /* Step one more for EOS. */ 912 i++; 913 } 914 if (i != 0) { 915 for (; i > 0; i--) 916 st = step(m->g, startst, stopst, st, flagch, st, 917 sflags); 918 SP("sboleol", st, c); 919 } 920 921 /* how about a word boundary? */ 922 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 923 (c != OUT && ISWORD(c)) ) { 924 flagch = BOW; 925 } 926 if ( (lastc != OUT && ISWORD(lastc)) && 927 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 928 flagch = EOW; 929 } 930 if (flagch == BOW || flagch == EOW) { 931 st = step(m->g, startst, stopst, st, flagch, st, sflags); 932 SP("sboweow", st, c); 933 } 934 if (lastc != OUT && c != OUT && 935 ISWORD(lastc) == ISWORD(c)) { 936 flagch = NWBND; 937 } else if ((lastc == OUT && !ISWORD(c)) || 938 (c == OUT && !ISWORD(lastc))) { 939 flagch = NWBND; 940 } 941 if (flagch == NWBND) { 942 st = step(m->g, startst, stopst, st, flagch, st, sflags); 943 SP("snwbnd", st, c); 944 } 945 946 /* are we done? */ 947 if (ISSET(st, stopst)) { 948 if (fast) 949 break; 950 else 951 matchp = p; 952 } 953 if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p)) 954 break; /* NOTE BREAK OUT */ 955 956 /* no, we must deal with this character */ 957 ASSIGN(tmp, st); 958 if (fast) 959 ASSIGN(st, fresh); 960 else 961 ASSIGN(st, empty); 962 assert(c != OUT); 963 st = step(m->g, startst, stopst, tmp, c, st, sflags); 964 SP("saft", st, c); 965 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags), 966 st)); 967 p += clen; 968 } 969 970 if (fast) { 971 assert(matchp != NULL); 972 m->coldp = matchp; 973 if (ISSET(st, stopst)) 974 return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 975 else 976 return (NULL); 977 } else 978 return (matchp); 979 } 980 981 /* 982 - step - map set of states reachable before char to set reachable after 983 == static states step(struct re_guts *g, sopno start, sopno stop, \ 984 == states bef, int ch, states aft); 985 == #define BOL (OUT-1) 986 == #define EOL (BOL-1) 987 == #define BOLEOL (BOL-2) 988 == #define NOTHING (BOL-3) 989 == #define BOW (BOL-4) 990 == #define EOW (BOL-5) 991 == #define BADCHAR (BOL-6) 992 == #define NONCHAR(c) ((c) <= OUT) 993 */ 994 static states 995 step(struct re_guts *g, 996 sopno start, /* start state within strip */ 997 sopno stop, /* state after stop state within strip */ 998 states bef, /* states reachable before */ 999 wint_t ch, /* character or NONCHAR code */ 1000 states aft, /* states already known reachable after */ 1001 int sflags) /* state flags */ 1002 { 1003 cset *cs; 1004 sop s; 1005 sopno pc; 1006 onestate here; /* note, macros know this name */ 1007 sopno look; 1008 int i; 1009 1010 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 1011 s = g->strip[pc]; 1012 switch (OP(s)) { 1013 case OEND: 1014 assert(pc == stop-1); 1015 break; 1016 case OCHAR: 1017 /* only characters can match */ 1018 assert(!NONCHAR(ch) || ch != OPND(s)); 1019 if (ch == OPND(s)) 1020 FWD(aft, bef, 1); 1021 break; 1022 case OBOS: 1023 if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0) 1024 FWD(aft, bef, 1); 1025 break; 1026 case OEOS: 1027 if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0) 1028 FWD(aft, bef, 1); 1029 break; 1030 case OBOL: 1031 if (ch == BOL || ch == BOLEOL) 1032 FWD(aft, bef, 1); 1033 break; 1034 case OEOL: 1035 if (ch == EOL || ch == BOLEOL) 1036 FWD(aft, bef, 1); 1037 break; 1038 case OBOW: 1039 if (ch == BOW) 1040 FWD(aft, bef, 1); 1041 break; 1042 case OEOW: 1043 if (ch == EOW) 1044 FWD(aft, bef, 1); 1045 break; 1046 case OWBND: 1047 if (ch == BOW || ch == EOW) 1048 FWD(aft, bef, 1); 1049 break; 1050 case ONWBND: 1051 if (ch == NWBND) 1052 FWD(aft, aft, 1); 1053 break; 1054 case OANY: 1055 if (!NONCHAR(ch)) 1056 FWD(aft, bef, 1); 1057 break; 1058 case OANYOF: 1059 cs = &g->sets[OPND(s)]; 1060 if (!NONCHAR(ch) && CHIN(cs, ch)) 1061 FWD(aft, bef, 1); 1062 break; 1063 case OBACK_: /* ignored here */ 1064 case O_BACK: 1065 FWD(aft, aft, 1); 1066 break; 1067 case OPLUS_: /* forward, this is just an empty */ 1068 FWD(aft, aft, 1); 1069 break; 1070 case O_PLUS: /* both forward and back */ 1071 FWD(aft, aft, 1); 1072 i = ISSETBACK(aft, OPND(s)); 1073 BACK(aft, aft, OPND(s)); 1074 if (!i && ISSETBACK(aft, OPND(s))) { 1075 /* oho, must reconsider loop body */ 1076 pc -= OPND(s) + 1; 1077 INIT(here, pc); 1078 } 1079 break; 1080 case OQUEST_: /* two branches, both forward */ 1081 FWD(aft, aft, 1); 1082 FWD(aft, aft, OPND(s)); 1083 break; 1084 case O_QUEST: /* just an empty */ 1085 FWD(aft, aft, 1); 1086 break; 1087 case OLPAREN: /* not significant here */ 1088 case ORPAREN: 1089 FWD(aft, aft, 1); 1090 break; 1091 case OCH_: /* mark the first two branches */ 1092 FWD(aft, aft, 1); 1093 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 1094 FWD(aft, aft, OPND(s)); 1095 break; 1096 case OOR1: /* done a branch, find the O_CH */ 1097 if (ISSTATEIN(aft, here)) { 1098 for (look = 1; 1099 OP(s = g->strip[pc+look]) != (sop)O_CH; 1100 look += OPND(s)) 1101 assert(OP(s) == (sop)OOR2); 1102 FWD(aft, aft, look + 1); 1103 } 1104 break; 1105 case OOR2: /* propagate OCH_'s marking */ 1106 FWD(aft, aft, 1); 1107 if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) { 1108 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 1109 FWD(aft, aft, OPND(s)); 1110 } 1111 break; 1112 case O_CH: /* just empty */ 1113 FWD(aft, aft, 1); 1114 break; 1115 default: /* ooooops... */ 1116 assert(nope); 1117 break; 1118 } 1119 } 1120 1121 return(aft); 1122 } 1123 1124 #ifdef REDEBUG 1125 /* 1126 - print - print a set of states 1127 == #ifdef REDEBUG 1128 == static void print(struct match *m, const char *caption, states st, \ 1129 == int ch, FILE *d); 1130 == #endif 1131 */ 1132 static void 1133 print(struct match *m, 1134 const char *caption, 1135 states st, 1136 int ch, 1137 FILE *d) 1138 { 1139 struct re_guts *g = m->g; 1140 sopno i; 1141 int first = 1; 1142 1143 if (!(m->eflags®_TRACE)) 1144 return; 1145 1146 fprintf(d, "%s", caption); 1147 if (ch != '\0') 1148 fprintf(d, " %s", pchar(ch)); 1149 for (i = 0; i < g->nstates; i++) 1150 if (ISSET(st, i)) { 1151 fprintf(d, "%s%lu", (first) ? "\t" : ", ", i); 1152 first = 0; 1153 } 1154 fprintf(d, "\n"); 1155 } 1156 1157 /* 1158 - at - print current situation 1159 == #ifdef REDEBUG 1160 == static void at(struct match *m, const char *title, const char *start, \ 1161 == const char *stop, sopno startst, sopno stopst); 1162 == #endif 1163 */ 1164 static void 1165 at( struct match *m, 1166 const char *title, 1167 const char *start, 1168 const char *stop, 1169 sopno startst, 1170 sopno stopst) 1171 { 1172 if (!(m->eflags®_TRACE)) 1173 return; 1174 1175 printf("%s %s-", title, pchar(*start)); 1176 printf("%s ", pchar(*stop)); 1177 printf("%ld-%ld\n", (long)startst, (long)stopst); 1178 } 1179 1180 #ifndef PCHARDONE 1181 #define PCHARDONE /* never again */ 1182 /* 1183 - pchar - make a character printable 1184 == #ifdef REDEBUG 1185 == static const char *pchar(int ch); 1186 == #endif 1187 * 1188 * Is this identical to regchar() over in debug.c? Well, yes. But a 1189 * duplicate here avoids having a debugging-capable regexec.o tied to 1190 * a matching debug.o, and this is convenient. It all disappears in 1191 * the non-debug compilation anyway, so it doesn't matter much. 1192 */ 1193 static const char * /* -> representation */ 1194 pchar(int ch) 1195 { 1196 static char pbuf[10]; 1197 1198 if (isprint((uch)ch) || ch == ' ') 1199 sprintf(pbuf, "%c", ch); 1200 else 1201 sprintf(pbuf, "\\%o", ch); 1202 return(pbuf); 1203 } 1204 #endif 1205 #endif 1206 1207 #undef stepback 1208 #undef matcher 1209 #undef walk 1210 #undef dissect 1211 #undef backref 1212 #undef step 1213 #undef print 1214 #undef at 1215 #undef match 1216