1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38 #include <sys/cdefs.h> 39 #include <stdbool.h> 40 41 /* 42 * The matching engine and friends. This file is #included by regexec.c 43 * after suitable #defines of a variety of macros used herein, so that 44 * different state representations can be used without duplicating masses 45 * of code. 46 */ 47 48 #ifdef SNAMES 49 #define stepback sstepback 50 #define matcher smatcher 51 #define walk swalk 52 #define dissect sdissect 53 #define backref sbackref 54 #define step sstep 55 #define print sprint 56 #define at sat 57 #define match smat 58 #endif 59 #ifdef LNAMES 60 #define stepback lstepback 61 #define matcher lmatcher 62 #define walk lwalk 63 #define dissect ldissect 64 #define backref lbackref 65 #define step lstep 66 #define print lprint 67 #define at lat 68 #define match lmat 69 #endif 70 #ifdef MNAMES 71 #define stepback mstepback 72 #define matcher mmatcher 73 #define walk mwalk 74 #define dissect mdissect 75 #define backref mbackref 76 #define step mstep 77 #define print mprint 78 #define at mat 79 #define match mmat 80 #endif 81 82 /* another structure passed up and down to avoid zillions of parameters */ 83 struct match { 84 struct re_guts *g; 85 int eflags; 86 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 87 const char *offp; /* offsets work from here */ 88 const char *beginp; /* start of string -- virtual NUL precedes */ 89 const char *endp; /* end of string -- virtual NUL here */ 90 const char *coldp; /* can be no match starting before here */ 91 const char **lastpos; /* [nplus+1] */ 92 STATEVARS; 93 states st; /* current states */ 94 states fresh; /* states for a fresh start */ 95 states tmp; /* temporary */ 96 states empty; /* empty set of states */ 97 mbstate_t mbs; /* multibyte conversion state */ 98 }; 99 100 /* ========= begin header generated by ./mkh ========= */ 101 #ifdef __cplusplus 102 extern "C" { 103 #endif 104 105 /* === engine.c === */ 106 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 107 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 108 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); 109 static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast); 110 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags); 111 #define MAX_RECURSION 100 112 #define BOL (OUT-1) 113 #define EOL (BOL-1) 114 #define BOLEOL (BOL-2) 115 #define NOTHING (BOL-3) 116 #define BOW (BOL-4) 117 #define EOW (BOL-5) 118 #define BADCHAR (BOL-6) 119 #define NWBND (BOL-7) 120 #define NONCHAR(c) ((c) <= OUT) 121 /* sflags */ 122 #define SBOS 0x0001 123 #define SEOS 0x0002 124 125 #ifdef REDEBUG 126 static void print(struct match *m, const char *caption, states st, int ch, FILE *d); 127 #endif 128 #ifdef REDEBUG 129 static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst); 130 #endif 131 #ifdef REDEBUG 132 static const char *pchar(int ch); 133 #endif 134 135 #ifdef __cplusplus 136 } 137 #endif 138 /* ========= end header generated by ./mkh ========= */ 139 140 #ifdef REDEBUG 141 #define SP(t, s, c) print(m, t, s, c, stdout) 142 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 143 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 144 #else 145 #define SP(t, s, c) /* nothing */ 146 #define AT(t, p1, p2, s1, s2) /* nothing */ 147 #define NOTE(s) /* nothing */ 148 #endif 149 150 /* 151 * Given a multibyte string pointed to by start, step back nchar characters 152 * from current position pointed to by cur. 153 */ 154 static const char * 155 stepback(const char *start, const char *cur, int nchar) 156 { 157 const char *ret; 158 int wc, mbc; 159 mbstate_t mbs; 160 size_t clen; 161 162 if (MB_CUR_MAX == 1) 163 return ((cur - nchar) > start ? cur - nchar : NULL); 164 165 ret = cur; 166 for (wc = nchar; wc > 0; wc--) { 167 for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) { 168 if ((ret - mbc) < start) 169 return (NULL); 170 memset(&mbs, 0, sizeof(mbs)); 171 clen = mbrtowc(NULL, ret - mbc, mbc, &mbs); 172 if (clen != (size_t)-1 && clen != (size_t)-2) 173 break; 174 } 175 if (mbc > MB_CUR_MAX) 176 return (NULL); 177 ret -= mbc; 178 } 179 180 return (ret); 181 } 182 183 /* 184 - matcher - the actual matching engine 185 == static int matcher(struct re_guts *g, const char *string, \ 186 == size_t nmatch, regmatch_t pmatch[], int eflags); 187 */ 188 static int /* 0 success, REG_NOMATCH failure */ 189 matcher(struct re_guts *g, 190 const char *string, 191 size_t nmatch, 192 regmatch_t pmatch[], 193 int eflags) 194 { 195 const char *endp; 196 size_t i; 197 struct match mv; 198 struct match *m = &mv; 199 const char *dp = NULL; 200 const sopno gf = g->firststate+1; /* +1 for OEND */ 201 const sopno gl = g->laststate; 202 const char *start; 203 const char *stop; 204 /* Boyer-Moore algorithms variables */ 205 const char *pp; 206 int cj, mj; 207 const char *mustfirst; 208 const char *mustlast; 209 int *matchjump; 210 int *charjump; 211 212 /* simplify the situation where possible */ 213 if (g->cflags®_NOSUB) 214 nmatch = 0; 215 if (eflags®_STARTEND) { 216 start = string + pmatch[0].rm_so; 217 stop = string + pmatch[0].rm_eo; 218 } else { 219 start = string; 220 stop = start + strlen(start); 221 } 222 if (stop < start) 223 return(REG_INVARG); 224 225 /* prescreening; this does wonders for this rather slow code */ 226 if (g->must != NULL) { 227 if (g->charjump != NULL && g->matchjump != NULL) { 228 mustfirst = g->must; 229 mustlast = g->must + g->mlen - 1; 230 charjump = g->charjump; 231 matchjump = g->matchjump; 232 pp = mustlast; 233 for (dp = start+g->mlen-1; dp < stop;) { 234 /* Fast skip non-matches */ 235 while (dp < stop && charjump[(int)*dp]) 236 dp += charjump[(int)*dp]; 237 238 if (dp >= stop) 239 break; 240 241 /* Greedy matcher */ 242 /* We depend on not being used for 243 * for strings of length 1 244 */ 245 while (*--dp == *--pp && pp != mustfirst); 246 247 if (*dp == *pp) 248 break; 249 250 /* Jump to next possible match */ 251 mj = matchjump[pp - mustfirst]; 252 cj = charjump[(int)*dp]; 253 dp += (cj < mj ? mj : cj); 254 pp = mustlast; 255 } 256 if (pp != mustfirst) 257 return(REG_NOMATCH); 258 } else { 259 for (dp = start; dp < stop; dp++) 260 if (*dp == g->must[0] && 261 stop - dp >= g->mlen && 262 memcmp(dp, g->must, (size_t)g->mlen) == 0) 263 break; 264 if (dp == stop) /* we didn't find g->must */ 265 return(REG_NOMATCH); 266 } 267 } 268 269 /* match struct setup */ 270 m->g = g; 271 m->eflags = eflags; 272 m->pmatch = NULL; 273 m->lastpos = NULL; 274 m->offp = string; 275 m->beginp = start; 276 m->endp = stop; 277 STATESETUP(m, 4); 278 SETUP(m->st); 279 SETUP(m->fresh); 280 SETUP(m->tmp); 281 SETUP(m->empty); 282 CLEAR(m->empty); 283 ZAPSTATE(&m->mbs); 284 285 /* Adjust start according to moffset, to speed things up */ 286 if (dp != NULL && g->moffset > -1) { 287 const char *nstart; 288 289 nstart = stepback(start, dp, g->moffset); 290 if (nstart != NULL) 291 start = nstart; 292 } 293 294 SP("mloop", m->st, *start); 295 296 /* this loop does only one repetition except for backrefs */ 297 for (;;) { 298 endp = walk(m, start, stop, gf, gl, true); 299 if (endp == NULL) { /* a miss */ 300 if (m->pmatch != NULL) 301 free((char *)m->pmatch); 302 if (m->lastpos != NULL) 303 free((char *)m->lastpos); 304 STATETEARDOWN(m); 305 return(REG_NOMATCH); 306 } 307 if (nmatch == 0 && !g->backrefs) 308 break; /* no further info needed */ 309 310 /* where? */ 311 assert(m->coldp != NULL); 312 for (;;) { 313 NOTE("finding start"); 314 endp = walk(m, m->coldp, stop, gf, gl, false); 315 if (endp != NULL) 316 break; 317 assert(m->coldp < m->endp); 318 m->coldp += XMBRTOWC(NULL, m->coldp, 319 m->endp - m->coldp, &m->mbs, 0); 320 } 321 if (nmatch == 1 && !g->backrefs) 322 break; /* no further info needed */ 323 324 /* oh my, he wants the subexpressions... */ 325 if (m->pmatch == NULL) 326 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 327 sizeof(regmatch_t)); 328 if (m->pmatch == NULL) { 329 STATETEARDOWN(m); 330 return(REG_ESPACE); 331 } 332 for (i = 1; i <= m->g->nsub; i++) 333 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 334 if (!g->backrefs && !(m->eflags®_BACKR)) { 335 NOTE("dissecting"); 336 dp = dissect(m, m->coldp, endp, gf, gl); 337 } else { 338 if (g->nplus > 0 && m->lastpos == NULL) 339 m->lastpos = malloc((g->nplus+1) * 340 sizeof(const char *)); 341 if (g->nplus > 0 && m->lastpos == NULL) { 342 free(m->pmatch); 343 STATETEARDOWN(m); 344 return(REG_ESPACE); 345 } 346 NOTE("backref dissect"); 347 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 348 } 349 if (dp != NULL) 350 break; 351 352 /* uh-oh... we couldn't find a subexpression-level match */ 353 assert(g->backrefs); /* must be back references doing it */ 354 assert(g->nplus == 0 || m->lastpos != NULL); 355 for (;;) { 356 if (dp != NULL || endp <= m->coldp) 357 break; /* defeat */ 358 NOTE("backoff"); 359 endp = walk(m, m->coldp, endp-1, gf, gl, false); 360 if (endp == NULL) 361 break; /* defeat */ 362 /* try it on a shorter possibility */ 363 #ifndef NDEBUG 364 for (i = 1; i <= m->g->nsub; i++) { 365 assert(m->pmatch[i].rm_so == -1); 366 assert(m->pmatch[i].rm_eo == -1); 367 } 368 #endif 369 NOTE("backoff dissect"); 370 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 371 } 372 assert(dp == NULL || dp == endp); 373 if (dp != NULL) /* found a shorter one */ 374 break; 375 376 /* despite initial appearances, there is no match here */ 377 NOTE("false alarm"); 378 /* recycle starting later */ 379 start = m->coldp + XMBRTOWC(NULL, m->coldp, 380 stop - m->coldp, &m->mbs, 0); 381 assert(start <= stop); 382 } 383 384 /* fill in the details if requested */ 385 if (nmatch > 0) { 386 pmatch[0].rm_so = m->coldp - m->offp; 387 pmatch[0].rm_eo = endp - m->offp; 388 } 389 if (nmatch > 1) { 390 assert(m->pmatch != NULL); 391 for (i = 1; i < nmatch; i++) 392 if (i <= m->g->nsub) 393 pmatch[i] = m->pmatch[i]; 394 else { 395 pmatch[i].rm_so = -1; 396 pmatch[i].rm_eo = -1; 397 } 398 } 399 400 if (m->pmatch != NULL) 401 free((char *)m->pmatch); 402 if (m->lastpos != NULL) 403 free((char *)m->lastpos); 404 STATETEARDOWN(m); 405 return(0); 406 } 407 408 /* 409 - dissect - figure out what matched what, no back references 410 == static const char *dissect(struct match *m, const char *start, \ 411 == const char *stop, sopno startst, sopno stopst); 412 */ 413 static const char * /* == stop (success) always */ 414 dissect(struct match *m, 415 const char *start, 416 const char *stop, 417 sopno startst, 418 sopno stopst) 419 { 420 int i; 421 sopno ss; /* start sop of current subRE */ 422 sopno es; /* end sop of current subRE */ 423 const char *sp; /* start of string matched by it */ 424 const char *stp; /* string matched by it cannot pass here */ 425 const char *rest; /* start of rest of string */ 426 const char *tail; /* string unmatched by rest of RE */ 427 sopno ssub; /* start sop of subsubRE */ 428 sopno esub; /* end sop of subsubRE */ 429 const char *ssp; /* start of string matched by subsubRE */ 430 const char *sep; /* end of string matched by subsubRE */ 431 const char *oldssp; /* previous ssp */ 432 const char *dp __unused; 433 434 AT("diss", start, stop, startst, stopst); 435 sp = start; 436 for (ss = startst; ss < stopst; ss = es) { 437 /* identify end of subRE */ 438 es = ss; 439 switch (OP(m->g->strip[es])) { 440 case OPLUS_: 441 case OQUEST_: 442 es += OPND(m->g->strip[es]); 443 break; 444 case OCH_: 445 while (OP(m->g->strip[es]) != (sop)O_CH) 446 es += OPND(m->g->strip[es]); 447 break; 448 } 449 es++; 450 451 /* figure out what it matched */ 452 switch (OP(m->g->strip[ss])) { 453 case OEND: 454 assert(nope); 455 break; 456 case OCHAR: 457 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 458 break; 459 case OBOL: 460 case OEOL: 461 case OBOW: 462 case OEOW: 463 case OBOS: 464 case OEOS: 465 case OWBND: 466 case ONWBND: 467 break; 468 case OANY: 469 case OANYOF: 470 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 471 break; 472 case OBACK_: 473 case O_BACK: 474 assert(nope); 475 break; 476 /* cases where length of match is hard to find */ 477 case OQUEST_: 478 stp = stop; 479 for (;;) { 480 /* how long could this one be? */ 481 rest = walk(m, sp, stp, ss, es, false); 482 assert(rest != NULL); /* it did match */ 483 /* could the rest match the rest? */ 484 tail = walk(m, rest, stop, es, stopst, false); 485 if (tail == stop) 486 break; /* yes! */ 487 /* no -- try a shorter match for this one */ 488 stp = rest - 1; 489 assert(stp >= sp); /* it did work */ 490 } 491 ssub = ss + 1; 492 esub = es - 1; 493 /* did innards match? */ 494 if (walk(m, sp, rest, ssub, esub, false) != NULL) { 495 dp = dissect(m, sp, rest, ssub, esub); 496 assert(dp == rest); 497 } else /* no */ 498 assert(sp == rest); 499 sp = rest; 500 break; 501 case OPLUS_: 502 stp = stop; 503 for (;;) { 504 /* how long could this one be? */ 505 rest = walk(m, sp, stp, ss, es, false); 506 assert(rest != NULL); /* it did match */ 507 /* could the rest match the rest? */ 508 tail = walk(m, rest, stop, es, stopst, false); 509 if (tail == stop) 510 break; /* yes! */ 511 /* no -- try a shorter match for this one */ 512 stp = rest - 1; 513 assert(stp >= sp); /* it did work */ 514 } 515 ssub = ss + 1; 516 esub = es - 1; 517 ssp = sp; 518 oldssp = ssp; 519 for (;;) { /* find last match of innards */ 520 sep = walk(m, ssp, rest, ssub, esub, false); 521 if (sep == NULL || sep == ssp) 522 break; /* failed or matched null */ 523 oldssp = ssp; /* on to next try */ 524 ssp = sep; 525 } 526 if (sep == NULL) { 527 /* last successful match */ 528 sep = ssp; 529 ssp = oldssp; 530 } 531 assert(sep == rest); /* must exhaust substring */ 532 assert(walk(m, ssp, sep, ssub, esub, false) == rest); 533 dp = dissect(m, ssp, sep, ssub, esub); 534 assert(dp == sep); 535 sp = rest; 536 break; 537 case OCH_: 538 stp = stop; 539 for (;;) { 540 /* how long could this one be? */ 541 rest = walk(m, sp, stp, ss, es, false); 542 assert(rest != NULL); /* it did match */ 543 /* could the rest match the rest? */ 544 tail = walk(m, rest, stop, es, stopst, false); 545 if (tail == stop) 546 break; /* yes! */ 547 /* no -- try a shorter match for this one */ 548 stp = rest - 1; 549 assert(stp >= sp); /* it did work */ 550 } 551 ssub = ss + 1; 552 esub = ss + OPND(m->g->strip[ss]) - 1; 553 assert(OP(m->g->strip[esub]) == OOR1); 554 for (;;) { /* find first matching branch */ 555 if (walk(m, sp, rest, ssub, esub, false) == rest) 556 break; /* it matched all of it */ 557 /* that one missed, try next one */ 558 assert(OP(m->g->strip[esub]) == OOR1); 559 esub++; 560 assert(OP(m->g->strip[esub]) == OOR2); 561 ssub = esub + 1; 562 esub += OPND(m->g->strip[esub]); 563 if (OP(m->g->strip[esub]) == (sop)OOR2) 564 esub--; 565 else 566 assert(OP(m->g->strip[esub]) == O_CH); 567 } 568 dp = dissect(m, sp, rest, ssub, esub); 569 assert(dp == rest); 570 sp = rest; 571 break; 572 case O_PLUS: 573 case O_QUEST: 574 case OOR1: 575 case OOR2: 576 case O_CH: 577 assert(nope); 578 break; 579 case OLPAREN: 580 i = OPND(m->g->strip[ss]); 581 assert(0 < i && i <= m->g->nsub); 582 m->pmatch[i].rm_so = sp - m->offp; 583 break; 584 case ORPAREN: 585 i = OPND(m->g->strip[ss]); 586 assert(0 < i && i <= m->g->nsub); 587 m->pmatch[i].rm_eo = sp - m->offp; 588 break; 589 default: /* uh oh */ 590 assert(nope); 591 break; 592 } 593 } 594 595 assert(sp == stop); 596 return(sp); 597 } 598 599 #define ISBOW(m, sp) \ 600 (sp < m->endp && ISWORD(*sp) && \ 601 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || \ 602 (sp > m->offp && !ISWORD(*(sp-1))))) 603 #define ISEOW(m, sp) \ 604 (((sp == m->endp && !(m->eflags®_NOTEOL)) || \ 605 (sp < m->endp && *sp == '\n' && \ 606 (m->g->cflags®_NEWLINE)) || \ 607 (sp < m->endp && !ISWORD(*sp)) ) && \ 608 (sp > m->beginp && ISWORD(*(sp-1)))) \ 609 610 /* 611 - backref - figure out what matched what, figuring in back references 612 == static const char *backref(struct match *m, const char *start, \ 613 == const char *stop, sopno startst, sopno stopst, sopno lev); 614 */ 615 static const char * /* == stop (success) or NULL (failure) */ 616 backref(struct match *m, 617 const char *start, 618 const char *stop, 619 sopno startst, 620 sopno stopst, 621 sopno lev, /* PLUS nesting level */ 622 int rec) 623 { 624 int i; 625 sopno ss; /* start sop of current subRE */ 626 const char *sp; /* start of string matched by it */ 627 sopno ssub; /* start sop of subsubRE */ 628 sopno esub; /* end sop of subsubRE */ 629 const char *ssp; /* start of string matched by subsubRE */ 630 const char *dp; 631 size_t len; 632 int hard; 633 sop s; 634 regoff_t offsave; 635 cset *cs; 636 wint_t wc; 637 638 AT("back", start, stop, startst, stopst); 639 sp = start; 640 641 /* get as far as we can with easy stuff */ 642 hard = 0; 643 for (ss = startst; !hard && ss < stopst; ss++) 644 switch (OP(s = m->g->strip[ss])) { 645 case OCHAR: 646 if (sp == stop) 647 return(NULL); 648 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 649 if (wc != OPND(s)) 650 return(NULL); 651 break; 652 case OANY: 653 if (sp == stop) 654 return(NULL); 655 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 656 if (wc == BADCHAR) 657 return (NULL); 658 break; 659 case OANYOF: 660 if (sp == stop) 661 return (NULL); 662 cs = &m->g->sets[OPND(s)]; 663 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 664 if (wc == BADCHAR || !CHIN(cs, wc)) 665 return(NULL); 666 break; 667 case OBOS: 668 if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0) 669 { /* yes */ } 670 else 671 return(NULL); 672 break; 673 case OEOS: 674 if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0) 675 { /* yes */ } 676 else 677 return(NULL); 678 break; 679 case OBOL: 680 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 681 (sp > m->offp && sp < m->endp && 682 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) 683 { /* yes */ } 684 else 685 return(NULL); 686 break; 687 case OEOL: 688 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 689 (sp < m->endp && *sp == '\n' && 690 (m->g->cflags®_NEWLINE)) ) 691 { /* yes */ } 692 else 693 return(NULL); 694 break; 695 case OWBND: 696 if (ISBOW(m, sp) || ISEOW(m, sp)) 697 { /* yes */ } 698 else 699 return(NULL); 700 break; 701 case ONWBND: 702 if (((sp == m->beginp) && !ISWORD(*sp)) || 703 (sp == m->endp && !ISWORD(*(sp - 1)))) 704 { /* yes, beginning/end of subject */ } 705 else if (ISWORD(*(sp - 1)) == ISWORD(*sp)) 706 { /* yes, beginning/end of subject */ } 707 else 708 return(NULL); 709 break; 710 case OBOW: 711 if (ISBOW(m, sp)) 712 { /* yes */ } 713 else 714 return(NULL); 715 break; 716 case OEOW: 717 if (ISEOW(m, sp)) 718 { /* yes */ } 719 else 720 return(NULL); 721 break; 722 case O_QUEST: 723 break; 724 case OOR1: /* matches null but needs to skip */ 725 ss++; 726 s = m->g->strip[ss]; 727 do { 728 assert(OP(s) == OOR2); 729 ss += OPND(s); 730 } while (OP(s = m->g->strip[ss]) != (sop)O_CH); 731 /* note that the ss++ gets us past the O_CH */ 732 break; 733 default: /* have to make a choice */ 734 hard = 1; 735 break; 736 } 737 if (!hard) { /* that was it! */ 738 if (sp != stop) 739 return(NULL); 740 return(sp); 741 } 742 ss--; /* adjust for the for's final increment */ 743 744 /* the hard stuff */ 745 AT("hard", sp, stop, ss, stopst); 746 s = m->g->strip[ss]; 747 switch (OP(s)) { 748 case OBACK_: /* the vilest depths */ 749 i = OPND(s); 750 assert(0 < i && i <= m->g->nsub); 751 if (m->pmatch[i].rm_eo == -1) 752 return(NULL); 753 assert(m->pmatch[i].rm_so != -1); 754 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 755 if (len == 0 && rec++ > MAX_RECURSION) 756 return(NULL); 757 assert(stop - m->beginp >= len); 758 if (sp > stop - len) 759 return(NULL); /* not enough left to match */ 760 ssp = m->offp + m->pmatch[i].rm_so; 761 if (memcmp(sp, ssp, len) != 0) 762 return(NULL); 763 while (m->g->strip[ss] != (sop)SOP(O_BACK, i)) 764 ss++; 765 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 766 case OQUEST_: /* to null or not */ 767 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 768 if (dp != NULL) 769 return(dp); /* not */ 770 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 771 case OPLUS_: 772 assert(m->lastpos != NULL); 773 assert(lev+1 <= m->g->nplus); 774 m->lastpos[lev+1] = sp; 775 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 776 case O_PLUS: 777 if (sp == m->lastpos[lev]) /* last pass matched null */ 778 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 779 /* try another pass */ 780 m->lastpos[lev] = sp; 781 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 782 if (dp == NULL) 783 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 784 else 785 return(dp); 786 case OCH_: /* find the right one, if any */ 787 ssub = ss + 1; 788 esub = ss + OPND(s) - 1; 789 assert(OP(m->g->strip[esub]) == OOR1); 790 for (;;) { /* find first matching branch */ 791 dp = backref(m, sp, stop, ssub, esub, lev, rec); 792 if (dp != NULL) 793 return(dp); 794 /* that one missed, try next one */ 795 if (OP(m->g->strip[esub]) == (sop)O_CH) 796 return(NULL); /* there is none */ 797 esub++; 798 assert(OP(m->g->strip[esub]) == (sop)OOR2); 799 ssub = esub + 1; 800 esub += OPND(m->g->strip[esub]); 801 if (OP(m->g->strip[esub]) == (sop)OOR2) 802 esub--; 803 else 804 assert(OP(m->g->strip[esub]) == O_CH); 805 } 806 /* NOTREACHED */ 807 break; 808 case OLPAREN: /* must undo assignment if rest fails */ 809 i = OPND(s); 810 assert(0 < i && i <= m->g->nsub); 811 offsave = m->pmatch[i].rm_so; 812 m->pmatch[i].rm_so = sp - m->offp; 813 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 814 if (dp != NULL) 815 return(dp); 816 m->pmatch[i].rm_so = offsave; 817 return(NULL); 818 case ORPAREN: /* must undo assignment if rest fails */ 819 i = OPND(s); 820 assert(0 < i && i <= m->g->nsub); 821 offsave = m->pmatch[i].rm_eo; 822 m->pmatch[i].rm_eo = sp - m->offp; 823 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 824 if (dp != NULL) 825 return(dp); 826 m->pmatch[i].rm_eo = offsave; 827 return(NULL); 828 default: /* uh oh */ 829 assert(nope); 830 break; 831 } 832 833 /* "can't happen" */ 834 assert(nope); 835 /* NOTREACHED */ 836 return "shut up gcc"; 837 } 838 839 /* 840 - walk - step through the string either quickly or slowly 841 == static const char *walk(struct match *m, const char *start, \ 842 == const char *stop, sopno startst, sopno stopst, bool fast); 843 */ 844 static const char * /* where it ended, or NULL */ 845 walk(struct match *m, const char *start, const char *stop, sopno startst, 846 sopno stopst, bool fast) 847 { 848 states st = m->st; 849 states fresh = m->fresh; 850 states empty = m->empty; 851 states tmp = m->tmp; 852 const char *p = start; 853 wint_t c; 854 wint_t lastc; /* previous c */ 855 wint_t flagch; 856 int i, sflags; 857 const char *matchp; /* last p at which a match ended */ 858 size_t clen; 859 860 sflags = 0; 861 AT("slow", start, stop, startst, stopst); 862 CLEAR(st); 863 SET1(st, startst); 864 SP("sstart", st, *p); 865 st = step(m->g, startst, stopst, st, NOTHING, st, sflags); 866 if (fast) 867 ASSIGN(fresh, st); 868 matchp = NULL; 869 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 870 c = OUT; 871 else { 872 /* 873 * XXX Wrong if the previous character was multi-byte. 874 * Newline never is (in encodings supported by FreeBSD), 875 * so this only breaks the ISWORD tests below. 876 */ 877 c = (uch)*(start - 1); 878 } 879 for (;;) { 880 /* next character */ 881 lastc = c; 882 sflags = 0; 883 if (p == m->endp) { 884 c = OUT; 885 clen = 0; 886 } else 887 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 888 889 if (fast && EQ(st, fresh)) 890 matchp = p; 891 892 /* is there an EOL and/or BOL between lastc and c? */ 893 flagch = '\0'; 894 i = 0; 895 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 896 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 897 flagch = BOL; 898 i = m->g->nbol; 899 } 900 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 901 (c == OUT && !(m->eflags®_NOTEOL)) ) { 902 flagch = (flagch == BOL) ? BOLEOL : EOL; 903 i += m->g->neol; 904 } 905 if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) { 906 sflags |= SBOS; 907 /* Step one more for BOS. */ 908 i++; 909 } 910 if (c == OUT && (m->eflags & REG_NOTEOL) == 0) { 911 sflags |= SEOS; 912 /* Step one more for EOS. */ 913 i++; 914 } 915 if (i != 0) { 916 for (; i > 0; i--) 917 st = step(m->g, startst, stopst, st, flagch, st, 918 sflags); 919 SP("sboleol", st, c); 920 } 921 922 /* how about a word boundary? */ 923 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 924 (c != OUT && ISWORD(c)) ) { 925 flagch = BOW; 926 } 927 if ( (lastc != OUT && ISWORD(lastc)) && 928 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 929 flagch = EOW; 930 } 931 if (flagch == BOW || flagch == EOW) { 932 st = step(m->g, startst, stopst, st, flagch, st, sflags); 933 SP("sboweow", st, c); 934 } 935 if (lastc != OUT && c != OUT && 936 ISWORD(lastc) == ISWORD(c)) { 937 flagch = NWBND; 938 } else if ((lastc == OUT && !ISWORD(c)) || 939 (c == OUT && !ISWORD(lastc))) { 940 flagch = NWBND; 941 } 942 if (flagch == NWBND) { 943 st = step(m->g, startst, stopst, st, flagch, st, sflags); 944 SP("snwbnd", st, c); 945 } 946 947 /* are we done? */ 948 if (ISSET(st, stopst)) { 949 if (fast) 950 break; 951 else 952 matchp = p; 953 } 954 if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p)) 955 break; /* NOTE BREAK OUT */ 956 957 /* no, we must deal with this character */ 958 ASSIGN(tmp, st); 959 if (fast) 960 ASSIGN(st, fresh); 961 else 962 ASSIGN(st, empty); 963 assert(c != OUT); 964 st = step(m->g, startst, stopst, tmp, c, st, sflags); 965 SP("saft", st, c); 966 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags), 967 st)); 968 p += clen; 969 } 970 971 if (fast) { 972 assert(matchp != NULL); 973 m->coldp = matchp; 974 if (ISSET(st, stopst)) 975 return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 976 else 977 return (NULL); 978 } else 979 return (matchp); 980 } 981 982 /* 983 - step - map set of states reachable before char to set reachable after 984 == static states step(struct re_guts *g, sopno start, sopno stop, \ 985 == states bef, int ch, states aft); 986 == #define BOL (OUT-1) 987 == #define EOL (BOL-1) 988 == #define BOLEOL (BOL-2) 989 == #define NOTHING (BOL-3) 990 == #define BOW (BOL-4) 991 == #define EOW (BOL-5) 992 == #define BADCHAR (BOL-6) 993 == #define NONCHAR(c) ((c) <= OUT) 994 */ 995 static states 996 step(struct re_guts *g, 997 sopno start, /* start state within strip */ 998 sopno stop, /* state after stop state within strip */ 999 states bef, /* states reachable before */ 1000 wint_t ch, /* character or NONCHAR code */ 1001 states aft, /* states already known reachable after */ 1002 int sflags) /* state flags */ 1003 { 1004 cset *cs; 1005 sop s; 1006 sopno pc; 1007 onestate here; /* note, macros know this name */ 1008 sopno look; 1009 int i; 1010 1011 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 1012 s = g->strip[pc]; 1013 switch (OP(s)) { 1014 case OEND: 1015 assert(pc == stop-1); 1016 break; 1017 case OCHAR: 1018 /* only characters can match */ 1019 assert(!NONCHAR(ch) || ch != OPND(s)); 1020 if (ch == OPND(s)) 1021 FWD(aft, bef, 1); 1022 break; 1023 case OBOS: 1024 if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0) 1025 FWD(aft, bef, 1); 1026 break; 1027 case OEOS: 1028 if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0) 1029 FWD(aft, bef, 1); 1030 break; 1031 case OBOL: 1032 if (ch == BOL || ch == BOLEOL) 1033 FWD(aft, bef, 1); 1034 break; 1035 case OEOL: 1036 if (ch == EOL || ch == BOLEOL) 1037 FWD(aft, bef, 1); 1038 break; 1039 case OBOW: 1040 if (ch == BOW) 1041 FWD(aft, bef, 1); 1042 break; 1043 case OEOW: 1044 if (ch == EOW) 1045 FWD(aft, bef, 1); 1046 break; 1047 case OWBND: 1048 if (ch == BOW || ch == EOW) 1049 FWD(aft, bef, 1); 1050 break; 1051 case ONWBND: 1052 if (ch == NWBND) 1053 FWD(aft, aft, 1); 1054 break; 1055 case OANY: 1056 if (!NONCHAR(ch)) 1057 FWD(aft, bef, 1); 1058 break; 1059 case OANYOF: 1060 cs = &g->sets[OPND(s)]; 1061 if (!NONCHAR(ch) && CHIN(cs, ch)) 1062 FWD(aft, bef, 1); 1063 break; 1064 case OBACK_: /* ignored here */ 1065 case O_BACK: 1066 FWD(aft, aft, 1); 1067 break; 1068 case OPLUS_: /* forward, this is just an empty */ 1069 FWD(aft, aft, 1); 1070 break; 1071 case O_PLUS: /* both forward and back */ 1072 FWD(aft, aft, 1); 1073 i = ISSETBACK(aft, OPND(s)); 1074 BACK(aft, aft, OPND(s)); 1075 if (!i && ISSETBACK(aft, OPND(s))) { 1076 /* oho, must reconsider loop body */ 1077 pc -= OPND(s) + 1; 1078 INIT(here, pc); 1079 } 1080 break; 1081 case OQUEST_: /* two branches, both forward */ 1082 FWD(aft, aft, 1); 1083 FWD(aft, aft, OPND(s)); 1084 break; 1085 case O_QUEST: /* just an empty */ 1086 FWD(aft, aft, 1); 1087 break; 1088 case OLPAREN: /* not significant here */ 1089 case ORPAREN: 1090 FWD(aft, aft, 1); 1091 break; 1092 case OCH_: /* mark the first two branches */ 1093 FWD(aft, aft, 1); 1094 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 1095 FWD(aft, aft, OPND(s)); 1096 break; 1097 case OOR1: /* done a branch, find the O_CH */ 1098 if (ISSTATEIN(aft, here)) { 1099 for (look = 1; 1100 OP(s = g->strip[pc+look]) != (sop)O_CH; 1101 look += OPND(s)) 1102 assert(OP(s) == (sop)OOR2); 1103 FWD(aft, aft, look + 1); 1104 } 1105 break; 1106 case OOR2: /* propagate OCH_'s marking */ 1107 FWD(aft, aft, 1); 1108 if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) { 1109 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 1110 FWD(aft, aft, OPND(s)); 1111 } 1112 break; 1113 case O_CH: /* just empty */ 1114 FWD(aft, aft, 1); 1115 break; 1116 default: /* ooooops... */ 1117 assert(nope); 1118 break; 1119 } 1120 } 1121 1122 return(aft); 1123 } 1124 1125 #ifdef REDEBUG 1126 /* 1127 - print - print a set of states 1128 == #ifdef REDEBUG 1129 == static void print(struct match *m, const char *caption, states st, \ 1130 == int ch, FILE *d); 1131 == #endif 1132 */ 1133 static void 1134 print(struct match *m, 1135 const char *caption, 1136 states st, 1137 int ch, 1138 FILE *d) 1139 { 1140 struct re_guts *g = m->g; 1141 sopno i; 1142 int first = 1; 1143 1144 if (!(m->eflags®_TRACE)) 1145 return; 1146 1147 fprintf(d, "%s", caption); 1148 if (ch != '\0') 1149 fprintf(d, " %s", pchar(ch)); 1150 for (i = 0; i < g->nstates; i++) 1151 if (ISSET(st, i)) { 1152 fprintf(d, "%s%lu", (first) ? "\t" : ", ", i); 1153 first = 0; 1154 } 1155 fprintf(d, "\n"); 1156 } 1157 1158 /* 1159 - at - print current situation 1160 == #ifdef REDEBUG 1161 == static void at(struct match *m, const char *title, const char *start, \ 1162 == const char *stop, sopno startst, sopno stopst); 1163 == #endif 1164 */ 1165 static void 1166 at( struct match *m, 1167 const char *title, 1168 const char *start, 1169 const char *stop, 1170 sopno startst, 1171 sopno stopst) 1172 { 1173 if (!(m->eflags®_TRACE)) 1174 return; 1175 1176 printf("%s %s-", title, pchar(*start)); 1177 printf("%s ", pchar(*stop)); 1178 printf("%ld-%ld\n", (long)startst, (long)stopst); 1179 } 1180 1181 #ifndef PCHARDONE 1182 #define PCHARDONE /* never again */ 1183 /* 1184 - pchar - make a character printable 1185 == #ifdef REDEBUG 1186 == static const char *pchar(int ch); 1187 == #endif 1188 * 1189 * Is this identical to regchar() over in debug.c? Well, yes. But a 1190 * duplicate here avoids having a debugging-capable regexec.o tied to 1191 * a matching debug.o, and this is convenient. It all disappears in 1192 * the non-debug compilation anyway, so it doesn't matter much. 1193 */ 1194 static const char * /* -> representation */ 1195 pchar(int ch) 1196 { 1197 static char pbuf[10]; 1198 1199 if (isprint((uch)ch) || ch == ' ') 1200 sprintf(pbuf, "%c", ch); 1201 else 1202 sprintf(pbuf, "\\%o", ch); 1203 return(pbuf); 1204 } 1205 #endif 1206 #endif 1207 1208 #undef stepback 1209 #undef matcher 1210 #undef walk 1211 #undef dissect 1212 #undef backref 1213 #undef step 1214 #undef print 1215 #undef at 1216 #undef match 1217