1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #include <stdbool.h> 37 38 /* 39 * The matching engine and friends. This file is #included by regexec.c 40 * after suitable #defines of a variety of macros used herein, so that 41 * different state representations can be used without duplicating masses 42 * of code. 43 */ 44 45 #ifdef SNAMES 46 #define stepback sstepback 47 #define matcher smatcher 48 #define walk swalk 49 #define dissect sdissect 50 #define backref sbackref 51 #define step sstep 52 #define print sprint 53 #define at sat 54 #define match smat 55 #endif 56 #ifdef LNAMES 57 #define stepback lstepback 58 #define matcher lmatcher 59 #define walk lwalk 60 #define dissect ldissect 61 #define backref lbackref 62 #define step lstep 63 #define print lprint 64 #define at lat 65 #define match lmat 66 #endif 67 #ifdef MNAMES 68 #define stepback mstepback 69 #define matcher mmatcher 70 #define walk mwalk 71 #define dissect mdissect 72 #define backref mbackref 73 #define step mstep 74 #define print mprint 75 #define at mat 76 #define match mmat 77 #endif 78 79 /* another structure passed up and down to avoid zillions of parameters */ 80 struct match { 81 struct re_guts *g; 82 int eflags; 83 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 84 const char *offp; /* offsets work from here */ 85 const char *beginp; /* start of string -- virtual NUL precedes */ 86 const char *endp; /* end of string -- virtual NUL here */ 87 const char *coldp; /* can be no match starting before here */ 88 const char **lastpos; /* [nplus+1] */ 89 STATEVARS; 90 states st; /* current states */ 91 states fresh; /* states for a fresh start */ 92 states tmp; /* temporary */ 93 states empty; /* empty set of states */ 94 mbstate_t mbs; /* multibyte conversion state */ 95 }; 96 97 /* ========= begin header generated by ./mkh ========= */ 98 #ifdef __cplusplus 99 extern "C" { 100 #endif 101 102 /* === engine.c === */ 103 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 104 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 105 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); 106 static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast); 107 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags); 108 #define MAX_RECURSION 100 109 #define BOL (OUT-1) 110 #define EOL (BOL-1) 111 #define BOLEOL (BOL-2) 112 #define NOTHING (BOL-3) 113 #define BOW (BOL-4) 114 #define EOW (BOL-5) 115 #define BADCHAR (BOL-6) 116 #define NWBND (BOL-7) 117 #define NONCHAR(c) ((c) <= OUT) 118 /* sflags */ 119 #define SBOS 0x0001 120 #define SEOS 0x0002 121 122 #ifdef REDEBUG 123 static void print(struct match *m, const char *caption, states st, int ch, FILE *d); 124 #endif 125 #ifdef REDEBUG 126 static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst); 127 #endif 128 #ifdef REDEBUG 129 static const char *pchar(int ch); 130 #endif 131 132 #ifdef __cplusplus 133 } 134 #endif 135 /* ========= end header generated by ./mkh ========= */ 136 137 #ifdef REDEBUG 138 #define SP(t, s, c) print(m, t, s, c, stdout) 139 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 140 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 141 #else 142 #define SP(t, s, c) /* nothing */ 143 #define AT(t, p1, p2, s1, s2) /* nothing */ 144 #define NOTE(s) /* nothing */ 145 #endif 146 147 /* 148 * Given a multibyte string pointed to by start, step back nchar characters 149 * from current position pointed to by cur. 150 */ 151 static const char * 152 stepback(const char *start, const char *cur, int nchar) 153 { 154 const char *ret; 155 int wc, mbc; 156 mbstate_t mbs; 157 size_t clen; 158 159 if (MB_CUR_MAX == 1) 160 return ((cur - nchar) > start ? cur - nchar : NULL); 161 162 ret = cur; 163 for (wc = nchar; wc > 0; wc--) { 164 for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) { 165 if ((ret - mbc) < start) 166 return (NULL); 167 memset(&mbs, 0, sizeof(mbs)); 168 clen = mbrtowc(NULL, ret - mbc, mbc, &mbs); 169 if (clen != (size_t)-1 && clen != (size_t)-2) 170 break; 171 } 172 if (mbc > MB_CUR_MAX) 173 return (NULL); 174 ret -= mbc; 175 } 176 177 return (ret); 178 } 179 180 /* 181 - matcher - the actual matching engine 182 == static int matcher(struct re_guts *g, const char *string, \ 183 == size_t nmatch, regmatch_t pmatch[], int eflags); 184 */ 185 static int /* 0 success, REG_NOMATCH failure */ 186 matcher(struct re_guts *g, 187 const char *string, 188 size_t nmatch, 189 regmatch_t pmatch[], 190 int eflags) 191 { 192 const char *endp; 193 size_t i; 194 struct match mv; 195 struct match *m = &mv; 196 const char *dp = NULL; 197 const sopno gf = g->firststate+1; /* +1 for OEND */ 198 const sopno gl = g->laststate; 199 const char *start; 200 const char *stop; 201 /* Boyer-Moore algorithms variables */ 202 const char *pp; 203 int cj, mj; 204 const char *mustfirst; 205 const char *mustlast; 206 int *matchjump; 207 int *charjump; 208 209 /* simplify the situation where possible */ 210 if (g->cflags®_NOSUB) 211 nmatch = 0; 212 if (eflags®_STARTEND) { 213 start = string + pmatch[0].rm_so; 214 stop = string + pmatch[0].rm_eo; 215 } else { 216 start = string; 217 stop = start + strlen(start); 218 } 219 if (stop < start) 220 return(REG_INVARG); 221 222 /* prescreening; this does wonders for this rather slow code */ 223 if (g->must != NULL) { 224 if (g->charjump != NULL && g->matchjump != NULL) { 225 mustfirst = g->must; 226 mustlast = g->must + g->mlen - 1; 227 charjump = g->charjump; 228 matchjump = g->matchjump; 229 pp = mustlast; 230 for (dp = start+g->mlen-1; dp < stop;) { 231 /* Fast skip non-matches */ 232 while (dp < stop && charjump[(int)*dp]) 233 dp += charjump[(int)*dp]; 234 235 if (dp >= stop) 236 break; 237 238 /* Greedy matcher */ 239 /* We depend on not being used for 240 * for strings of length 1 241 */ 242 while (*--dp == *--pp && pp != mustfirst); 243 244 if (*dp == *pp) 245 break; 246 247 /* Jump to next possible match */ 248 mj = matchjump[pp - mustfirst]; 249 cj = charjump[(int)*dp]; 250 dp += (cj < mj ? mj : cj); 251 pp = mustlast; 252 } 253 if (pp != mustfirst) 254 return(REG_NOMATCH); 255 } else { 256 for (dp = start; dp < stop; dp++) 257 if (*dp == g->must[0] && 258 stop - dp >= g->mlen && 259 memcmp(dp, g->must, (size_t)g->mlen) == 0) 260 break; 261 if (dp == stop) /* we didn't find g->must */ 262 return(REG_NOMATCH); 263 } 264 } 265 266 /* match struct setup */ 267 m->g = g; 268 m->eflags = eflags; 269 m->pmatch = NULL; 270 m->lastpos = NULL; 271 m->offp = string; 272 m->beginp = start; 273 m->endp = stop; 274 STATESETUP(m, 4); 275 SETUP(m->st); 276 SETUP(m->fresh); 277 SETUP(m->tmp); 278 SETUP(m->empty); 279 CLEAR(m->empty); 280 ZAPSTATE(&m->mbs); 281 282 /* Adjust start according to moffset, to speed things up */ 283 if (dp != NULL && g->moffset > -1) { 284 const char *nstart; 285 286 nstart = stepback(start, dp, g->moffset); 287 if (nstart != NULL) 288 start = nstart; 289 } 290 291 SP("mloop", m->st, *start); 292 293 /* this loop does only one repetition except for backrefs */ 294 for (;;) { 295 endp = walk(m, start, stop, gf, gl, true); 296 if (endp == NULL) { /* a miss */ 297 if (m->pmatch != NULL) 298 free((char *)m->pmatch); 299 if (m->lastpos != NULL) 300 free((char *)m->lastpos); 301 STATETEARDOWN(m); 302 return(REG_NOMATCH); 303 } 304 if (nmatch == 0 && !g->backrefs) 305 break; /* no further info needed */ 306 307 /* where? */ 308 assert(m->coldp != NULL); 309 for (;;) { 310 NOTE("finding start"); 311 endp = walk(m, m->coldp, stop, gf, gl, false); 312 if (endp != NULL) 313 break; 314 assert(m->coldp < m->endp); 315 m->coldp += XMBRTOWC(NULL, m->coldp, 316 m->endp - m->coldp, &m->mbs, 0); 317 } 318 if (nmatch == 1 && !g->backrefs) 319 break; /* no further info needed */ 320 321 /* oh my, he wants the subexpressions... */ 322 if (m->pmatch == NULL) 323 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 324 sizeof(regmatch_t)); 325 if (m->pmatch == NULL) { 326 STATETEARDOWN(m); 327 return(REG_ESPACE); 328 } 329 for (i = 1; i <= m->g->nsub; i++) 330 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 331 if (!g->backrefs && !(m->eflags®_BACKR)) { 332 NOTE("dissecting"); 333 dp = dissect(m, m->coldp, endp, gf, gl); 334 } else { 335 if (g->nplus > 0 && m->lastpos == NULL) 336 m->lastpos = malloc((g->nplus+1) * 337 sizeof(const char *)); 338 if (g->nplus > 0 && m->lastpos == NULL) { 339 free(m->pmatch); 340 STATETEARDOWN(m); 341 return(REG_ESPACE); 342 } 343 NOTE("backref dissect"); 344 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 345 } 346 if (dp != NULL) 347 break; 348 349 /* uh-oh... we couldn't find a subexpression-level match */ 350 assert(g->backrefs); /* must be back references doing it */ 351 assert(g->nplus == 0 || m->lastpos != NULL); 352 for (;;) { 353 if (dp != NULL || endp <= m->coldp) 354 break; /* defeat */ 355 NOTE("backoff"); 356 endp = walk(m, m->coldp, endp-1, gf, gl, false); 357 if (endp == NULL) 358 break; /* defeat */ 359 /* try it on a shorter possibility */ 360 #ifndef NDEBUG 361 for (i = 1; i <= m->g->nsub; i++) { 362 assert(m->pmatch[i].rm_so == -1); 363 assert(m->pmatch[i].rm_eo == -1); 364 } 365 #endif 366 NOTE("backoff dissect"); 367 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 368 } 369 assert(dp == NULL || dp == endp); 370 if (dp != NULL) /* found a shorter one */ 371 break; 372 373 /* despite initial appearances, there is no match here */ 374 NOTE("false alarm"); 375 /* recycle starting later */ 376 start = m->coldp + XMBRTOWC(NULL, m->coldp, 377 stop - m->coldp, &m->mbs, 0); 378 assert(start <= stop); 379 } 380 381 /* fill in the details if requested */ 382 if (nmatch > 0) { 383 pmatch[0].rm_so = m->coldp - m->offp; 384 pmatch[0].rm_eo = endp - m->offp; 385 } 386 if (nmatch > 1) { 387 assert(m->pmatch != NULL); 388 for (i = 1; i < nmatch; i++) 389 if (i <= m->g->nsub) 390 pmatch[i] = m->pmatch[i]; 391 else { 392 pmatch[i].rm_so = -1; 393 pmatch[i].rm_eo = -1; 394 } 395 } 396 397 if (m->pmatch != NULL) 398 free((char *)m->pmatch); 399 if (m->lastpos != NULL) 400 free((char *)m->lastpos); 401 STATETEARDOWN(m); 402 return(0); 403 } 404 405 /* 406 - dissect - figure out what matched what, no back references 407 == static const char *dissect(struct match *m, const char *start, \ 408 == const char *stop, sopno startst, sopno stopst); 409 */ 410 static const char * /* == stop (success) always */ 411 dissect(struct match *m, 412 const char *start, 413 const char *stop, 414 sopno startst, 415 sopno stopst) 416 { 417 int i; 418 sopno ss; /* start sop of current subRE */ 419 sopno es; /* end sop of current subRE */ 420 const char *sp; /* start of string matched by it */ 421 const char *stp; /* string matched by it cannot pass here */ 422 const char *rest; /* start of rest of string */ 423 const char *tail; /* string unmatched by rest of RE */ 424 sopno ssub; /* start sop of subsubRE */ 425 sopno esub; /* end sop of subsubRE */ 426 const char *ssp; /* start of string matched by subsubRE */ 427 const char *sep; /* end of string matched by subsubRE */ 428 const char *oldssp; /* previous ssp */ 429 const char *dp __unused; 430 431 AT("diss", start, stop, startst, stopst); 432 sp = start; 433 for (ss = startst; ss < stopst; ss = es) { 434 /* identify end of subRE */ 435 es = ss; 436 switch (OP(m->g->strip[es])) { 437 case OPLUS_: 438 case OQUEST_: 439 es += OPND(m->g->strip[es]); 440 break; 441 case OCH_: 442 while (OP(m->g->strip[es]) != (sop)O_CH) 443 es += OPND(m->g->strip[es]); 444 break; 445 } 446 es++; 447 448 /* figure out what it matched */ 449 switch (OP(m->g->strip[ss])) { 450 case OEND: 451 assert(nope); 452 break; 453 case OCHAR: 454 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 455 break; 456 case OBOL: 457 case OEOL: 458 case OBOW: 459 case OEOW: 460 case OBOS: 461 case OEOS: 462 case OWBND: 463 case ONWBND: 464 break; 465 case OANY: 466 case OANYOF: 467 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 468 break; 469 case OBACK_: 470 case O_BACK: 471 assert(nope); 472 break; 473 /* cases where length of match is hard to find */ 474 case OQUEST_: 475 stp = stop; 476 for (;;) { 477 /* how long could this one be? */ 478 rest = walk(m, sp, stp, ss, es, false); 479 assert(rest != NULL); /* it did match */ 480 /* could the rest match the rest? */ 481 tail = walk(m, rest, stop, es, stopst, false); 482 if (tail == stop) 483 break; /* yes! */ 484 /* no -- try a shorter match for this one */ 485 stp = rest - 1; 486 assert(stp >= sp); /* it did work */ 487 } 488 ssub = ss + 1; 489 esub = es - 1; 490 /* did innards match? */ 491 if (walk(m, sp, rest, ssub, esub, false) != NULL) { 492 dp = dissect(m, sp, rest, ssub, esub); 493 assert(dp == rest); 494 } else /* no */ 495 assert(sp == rest); 496 sp = rest; 497 break; 498 case OPLUS_: 499 stp = stop; 500 for (;;) { 501 /* how long could this one be? */ 502 rest = walk(m, sp, stp, ss, es, false); 503 assert(rest != NULL); /* it did match */ 504 /* could the rest match the rest? */ 505 tail = walk(m, rest, stop, es, stopst, false); 506 if (tail == stop) 507 break; /* yes! */ 508 /* no -- try a shorter match for this one */ 509 stp = rest - 1; 510 assert(stp >= sp); /* it did work */ 511 } 512 ssub = ss + 1; 513 esub = es - 1; 514 ssp = sp; 515 oldssp = ssp; 516 for (;;) { /* find last match of innards */ 517 sep = walk(m, ssp, rest, ssub, esub, false); 518 if (sep == NULL || sep == ssp) 519 break; /* failed or matched null */ 520 oldssp = ssp; /* on to next try */ 521 ssp = sep; 522 } 523 if (sep == NULL) { 524 /* last successful match */ 525 sep = ssp; 526 ssp = oldssp; 527 } 528 assert(sep == rest); /* must exhaust substring */ 529 assert(walk(m, ssp, sep, ssub, esub, false) == rest); 530 dp = dissect(m, ssp, sep, ssub, esub); 531 assert(dp == sep); 532 sp = rest; 533 break; 534 case OCH_: 535 stp = stop; 536 for (;;) { 537 /* how long could this one be? */ 538 rest = walk(m, sp, stp, ss, es, false); 539 assert(rest != NULL); /* it did match */ 540 /* could the rest match the rest? */ 541 tail = walk(m, rest, stop, es, stopst, false); 542 if (tail == stop) 543 break; /* yes! */ 544 /* no -- try a shorter match for this one */ 545 stp = rest - 1; 546 assert(stp >= sp); /* it did work */ 547 } 548 ssub = ss + 1; 549 esub = ss + OPND(m->g->strip[ss]) - 1; 550 assert(OP(m->g->strip[esub]) == OOR1); 551 for (;;) { /* find first matching branch */ 552 if (walk(m, sp, rest, ssub, esub, false) == rest) 553 break; /* it matched all of it */ 554 /* that one missed, try next one */ 555 assert(OP(m->g->strip[esub]) == OOR1); 556 esub++; 557 assert(OP(m->g->strip[esub]) == OOR2); 558 ssub = esub + 1; 559 esub += OPND(m->g->strip[esub]); 560 if (OP(m->g->strip[esub]) == (sop)OOR2) 561 esub--; 562 else 563 assert(OP(m->g->strip[esub]) == O_CH); 564 } 565 dp = dissect(m, sp, rest, ssub, esub); 566 assert(dp == rest); 567 sp = rest; 568 break; 569 case O_PLUS: 570 case O_QUEST: 571 case OOR1: 572 case OOR2: 573 case O_CH: 574 assert(nope); 575 break; 576 case OLPAREN: 577 i = OPND(m->g->strip[ss]); 578 assert(0 < i && i <= m->g->nsub); 579 m->pmatch[i].rm_so = sp - m->offp; 580 break; 581 case ORPAREN: 582 i = OPND(m->g->strip[ss]); 583 assert(0 < i && i <= m->g->nsub); 584 m->pmatch[i].rm_eo = sp - m->offp; 585 break; 586 default: /* uh oh */ 587 assert(nope); 588 break; 589 } 590 } 591 592 assert(sp == stop); 593 return(sp); 594 } 595 596 #define ISBOW(m, sp) \ 597 (sp < m->endp && ISWORD(*sp) && \ 598 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || \ 599 (sp > m->offp && !ISWORD(*(sp-1))))) 600 #define ISEOW(m, sp) \ 601 (((sp == m->endp && !(m->eflags®_NOTEOL)) || \ 602 (sp < m->endp && *sp == '\n' && \ 603 (m->g->cflags®_NEWLINE)) || \ 604 (sp < m->endp && !ISWORD(*sp)) ) && \ 605 (sp > m->beginp && ISWORD(*(sp-1)))) \ 606 607 /* 608 - backref - figure out what matched what, figuring in back references 609 == static const char *backref(struct match *m, const char *start, \ 610 == const char *stop, sopno startst, sopno stopst, sopno lev); 611 */ 612 static const char * /* == stop (success) or NULL (failure) */ 613 backref(struct match *m, 614 const char *start, 615 const char *stop, 616 sopno startst, 617 sopno stopst, 618 sopno lev, /* PLUS nesting level */ 619 int rec) 620 { 621 int i; 622 sopno ss; /* start sop of current subRE */ 623 const char *sp; /* start of string matched by it */ 624 sopno ssub; /* start sop of subsubRE */ 625 sopno esub; /* end sop of subsubRE */ 626 const char *ssp; /* start of string matched by subsubRE */ 627 const char *dp; 628 size_t len; 629 int hard; 630 sop s; 631 regoff_t offsave; 632 cset *cs; 633 wint_t wc; 634 635 AT("back", start, stop, startst, stopst); 636 sp = start; 637 638 /* get as far as we can with easy stuff */ 639 hard = 0; 640 for (ss = startst; !hard && ss < stopst; ss++) 641 switch (OP(s = m->g->strip[ss])) { 642 case OCHAR: 643 if (sp == stop) 644 return(NULL); 645 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 646 if (wc != OPND(s)) 647 return(NULL); 648 break; 649 case OANY: 650 if (sp == stop) 651 return(NULL); 652 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 653 if (wc == BADCHAR) 654 return (NULL); 655 break; 656 case OANYOF: 657 if (sp == stop) 658 return (NULL); 659 cs = &m->g->sets[OPND(s)]; 660 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 661 if (wc == BADCHAR || !CHIN(cs, wc)) 662 return(NULL); 663 break; 664 case OBOS: 665 if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0) 666 { /* yes */ } 667 else 668 return(NULL); 669 break; 670 case OEOS: 671 if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0) 672 { /* yes */ } 673 else 674 return(NULL); 675 break; 676 case OBOL: 677 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 678 (sp > m->offp && sp < m->endp && 679 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) 680 { /* yes */ } 681 else 682 return(NULL); 683 break; 684 case OEOL: 685 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 686 (sp < m->endp && *sp == '\n' && 687 (m->g->cflags®_NEWLINE)) ) 688 { /* yes */ } 689 else 690 return(NULL); 691 break; 692 case OWBND: 693 if (ISBOW(m, sp) || ISEOW(m, sp)) 694 { /* yes */ } 695 else 696 return(NULL); 697 break; 698 case ONWBND: 699 if (((sp == m->beginp) && !ISWORD(*sp)) || 700 (sp == m->endp && !ISWORD(*(sp - 1)))) 701 { /* yes, beginning/end of subject */ } 702 else if (ISWORD(*(sp - 1)) == ISWORD(*sp)) 703 { /* yes, beginning/end of subject */ } 704 else 705 return(NULL); 706 break; 707 case OBOW: 708 if (ISBOW(m, sp)) 709 { /* yes */ } 710 else 711 return(NULL); 712 break; 713 case OEOW: 714 if (ISEOW(m, sp)) 715 { /* yes */ } 716 else 717 return(NULL); 718 break; 719 case O_QUEST: 720 break; 721 case OOR1: /* matches null but needs to skip */ 722 ss++; 723 s = m->g->strip[ss]; 724 do { 725 assert(OP(s) == OOR2); 726 ss += OPND(s); 727 } while (OP(s = m->g->strip[ss]) != (sop)O_CH); 728 /* note that the ss++ gets us past the O_CH */ 729 break; 730 default: /* have to make a choice */ 731 hard = 1; 732 break; 733 } 734 if (!hard) { /* that was it! */ 735 if (sp != stop) 736 return(NULL); 737 return(sp); 738 } 739 ss--; /* adjust for the for's final increment */ 740 741 /* the hard stuff */ 742 AT("hard", sp, stop, ss, stopst); 743 s = m->g->strip[ss]; 744 switch (OP(s)) { 745 case OBACK_: /* the vilest depths */ 746 i = OPND(s); 747 assert(0 < i && i <= m->g->nsub); 748 if (m->pmatch[i].rm_eo == -1) 749 return(NULL); 750 assert(m->pmatch[i].rm_so != -1); 751 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 752 if (len == 0 && rec++ > MAX_RECURSION) 753 return(NULL); 754 assert(stop - m->beginp >= len); 755 if (sp > stop - len) 756 return(NULL); /* not enough left to match */ 757 ssp = m->offp + m->pmatch[i].rm_so; 758 if (memcmp(sp, ssp, len) != 0) 759 return(NULL); 760 while (m->g->strip[ss] != (sop)SOP(O_BACK, i)) 761 ss++; 762 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 763 case OQUEST_: /* to null or not */ 764 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 765 if (dp != NULL) 766 return(dp); /* not */ 767 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 768 case OPLUS_: 769 assert(m->lastpos != NULL); 770 assert(lev+1 <= m->g->nplus); 771 m->lastpos[lev+1] = sp; 772 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 773 case O_PLUS: 774 if (sp == m->lastpos[lev]) /* last pass matched null */ 775 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 776 /* try another pass */ 777 m->lastpos[lev] = sp; 778 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 779 if (dp == NULL) 780 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 781 else 782 return(dp); 783 case OCH_: /* find the right one, if any */ 784 ssub = ss + 1; 785 esub = ss + OPND(s) - 1; 786 assert(OP(m->g->strip[esub]) == OOR1); 787 for (;;) { /* find first matching branch */ 788 dp = backref(m, sp, stop, ssub, esub, lev, rec); 789 if (dp != NULL) 790 return(dp); 791 /* that one missed, try next one */ 792 if (OP(m->g->strip[esub]) == (sop)O_CH) 793 return(NULL); /* there is none */ 794 esub++; 795 assert(OP(m->g->strip[esub]) == (sop)OOR2); 796 ssub = esub + 1; 797 esub += OPND(m->g->strip[esub]); 798 if (OP(m->g->strip[esub]) == (sop)OOR2) 799 esub--; 800 else 801 assert(OP(m->g->strip[esub]) == O_CH); 802 } 803 /* NOTREACHED */ 804 break; 805 case OLPAREN: /* must undo assignment if rest fails */ 806 i = OPND(s); 807 assert(0 < i && i <= m->g->nsub); 808 offsave = m->pmatch[i].rm_so; 809 m->pmatch[i].rm_so = sp - m->offp; 810 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 811 if (dp != NULL) 812 return(dp); 813 m->pmatch[i].rm_so = offsave; 814 return(NULL); 815 case ORPAREN: /* must undo assignment if rest fails */ 816 i = OPND(s); 817 assert(0 < i && i <= m->g->nsub); 818 offsave = m->pmatch[i].rm_eo; 819 m->pmatch[i].rm_eo = sp - m->offp; 820 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 821 if (dp != NULL) 822 return(dp); 823 m->pmatch[i].rm_eo = offsave; 824 return(NULL); 825 default: /* uh oh */ 826 assert(nope); 827 break; 828 } 829 830 /* "can't happen" */ 831 assert(nope); 832 /* NOTREACHED */ 833 return "shut up gcc"; 834 } 835 836 /* 837 - walk - step through the string either quickly or slowly 838 == static const char *walk(struct match *m, const char *start, \ 839 == const char *stop, sopno startst, sopno stopst, bool fast); 840 */ 841 static const char * /* where it ended, or NULL */ 842 walk(struct match *m, const char *start, const char *stop, sopno startst, 843 sopno stopst, bool fast) 844 { 845 states st = m->st; 846 states fresh = m->fresh; 847 states empty = m->empty; 848 states tmp = m->tmp; 849 const char *p = start; 850 wint_t c; 851 wint_t lastc; /* previous c */ 852 wint_t flagch; 853 int i, sflags; 854 const char *matchp; /* last p at which a match ended */ 855 size_t clen; 856 857 sflags = 0; 858 AT("slow", start, stop, startst, stopst); 859 CLEAR(st); 860 SET1(st, startst); 861 SP("sstart", st, *p); 862 st = step(m->g, startst, stopst, st, NOTHING, st, sflags); 863 if (fast) 864 ASSIGN(fresh, st); 865 matchp = NULL; 866 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 867 c = OUT; 868 else { 869 /* 870 * XXX Wrong if the previous character was multi-byte. 871 * Newline never is (in encodings supported by FreeBSD), 872 * so this only breaks the ISWORD tests below. 873 */ 874 c = (uch)*(start - 1); 875 } 876 for (;;) { 877 /* next character */ 878 lastc = c; 879 sflags = 0; 880 if (p == m->endp) { 881 c = OUT; 882 clen = 0; 883 } else 884 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 885 886 if (fast && EQ(st, fresh)) 887 matchp = p; 888 889 /* is there an EOL and/or BOL between lastc and c? */ 890 flagch = '\0'; 891 i = 0; 892 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 893 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 894 flagch = BOL; 895 i = m->g->nbol; 896 } 897 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 898 (c == OUT && !(m->eflags®_NOTEOL)) ) { 899 flagch = (flagch == BOL) ? BOLEOL : EOL; 900 i += m->g->neol; 901 } 902 if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) { 903 sflags |= SBOS; 904 /* Step one more for BOS. */ 905 i++; 906 } 907 if (c == OUT && (m->eflags & REG_NOTEOL) == 0) { 908 sflags |= SEOS; 909 /* Step one more for EOS. */ 910 i++; 911 } 912 if (i != 0) { 913 for (; i > 0; i--) 914 st = step(m->g, startst, stopst, st, flagch, st, 915 sflags); 916 SP("sboleol", st, c); 917 } 918 919 /* how about a word boundary? */ 920 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 921 (c != OUT && ISWORD(c)) ) { 922 flagch = BOW; 923 } 924 if ( (lastc != OUT && ISWORD(lastc)) && 925 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 926 flagch = EOW; 927 } 928 if (flagch == BOW || flagch == EOW) { 929 st = step(m->g, startst, stopst, st, flagch, st, sflags); 930 SP("sboweow", st, c); 931 } 932 if (lastc != OUT && c != OUT && 933 ISWORD(lastc) == ISWORD(c)) { 934 flagch = NWBND; 935 } else if ((lastc == OUT && !ISWORD(c)) || 936 (c == OUT && !ISWORD(lastc))) { 937 flagch = NWBND; 938 } 939 if (flagch == NWBND) { 940 st = step(m->g, startst, stopst, st, flagch, st, sflags); 941 SP("snwbnd", st, c); 942 } 943 944 /* are we done? */ 945 if (ISSET(st, stopst)) { 946 if (fast) 947 break; 948 else 949 matchp = p; 950 } 951 if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p)) 952 break; /* NOTE BREAK OUT */ 953 954 /* no, we must deal with this character */ 955 ASSIGN(tmp, st); 956 if (fast) 957 ASSIGN(st, fresh); 958 else 959 ASSIGN(st, empty); 960 assert(c != OUT); 961 st = step(m->g, startst, stopst, tmp, c, st, sflags); 962 SP("saft", st, c); 963 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags), 964 st)); 965 p += clen; 966 } 967 968 if (fast) { 969 assert(matchp != NULL); 970 m->coldp = matchp; 971 if (ISSET(st, stopst)) 972 return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 973 else 974 return (NULL); 975 } else 976 return (matchp); 977 } 978 979 /* 980 - step - map set of states reachable before char to set reachable after 981 == static states step(struct re_guts *g, sopno start, sopno stop, \ 982 == states bef, int ch, states aft); 983 == #define BOL (OUT-1) 984 == #define EOL (BOL-1) 985 == #define BOLEOL (BOL-2) 986 == #define NOTHING (BOL-3) 987 == #define BOW (BOL-4) 988 == #define EOW (BOL-5) 989 == #define BADCHAR (BOL-6) 990 == #define NONCHAR(c) ((c) <= OUT) 991 */ 992 static states 993 step(struct re_guts *g, 994 sopno start, /* start state within strip */ 995 sopno stop, /* state after stop state within strip */ 996 states bef, /* states reachable before */ 997 wint_t ch, /* character or NONCHAR code */ 998 states aft, /* states already known reachable after */ 999 int sflags) /* state flags */ 1000 { 1001 cset *cs; 1002 sop s; 1003 sopno pc; 1004 onestate here; /* note, macros know this name */ 1005 sopno look; 1006 int i; 1007 1008 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 1009 s = g->strip[pc]; 1010 switch (OP(s)) { 1011 case OEND: 1012 assert(pc == stop-1); 1013 break; 1014 case OCHAR: 1015 /* only characters can match */ 1016 assert(!NONCHAR(ch) || ch != OPND(s)); 1017 if (ch == OPND(s)) 1018 FWD(aft, bef, 1); 1019 break; 1020 case OBOS: 1021 if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0) 1022 FWD(aft, bef, 1); 1023 break; 1024 case OEOS: 1025 if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0) 1026 FWD(aft, bef, 1); 1027 break; 1028 case OBOL: 1029 if (ch == BOL || ch == BOLEOL) 1030 FWD(aft, bef, 1); 1031 break; 1032 case OEOL: 1033 if (ch == EOL || ch == BOLEOL) 1034 FWD(aft, bef, 1); 1035 break; 1036 case OBOW: 1037 if (ch == BOW) 1038 FWD(aft, bef, 1); 1039 break; 1040 case OEOW: 1041 if (ch == EOW) 1042 FWD(aft, bef, 1); 1043 break; 1044 case OWBND: 1045 if (ch == BOW || ch == EOW) 1046 FWD(aft, bef, 1); 1047 break; 1048 case ONWBND: 1049 if (ch == NWBND) 1050 FWD(aft, aft, 1); 1051 break; 1052 case OANY: 1053 if (!NONCHAR(ch)) 1054 FWD(aft, bef, 1); 1055 break; 1056 case OANYOF: 1057 cs = &g->sets[OPND(s)]; 1058 if (!NONCHAR(ch) && CHIN(cs, ch)) 1059 FWD(aft, bef, 1); 1060 break; 1061 case OBACK_: /* ignored here */ 1062 case O_BACK: 1063 FWD(aft, aft, 1); 1064 break; 1065 case OPLUS_: /* forward, this is just an empty */ 1066 FWD(aft, aft, 1); 1067 break; 1068 case O_PLUS: /* both forward and back */ 1069 FWD(aft, aft, 1); 1070 i = ISSETBACK(aft, OPND(s)); 1071 BACK(aft, aft, OPND(s)); 1072 if (!i && ISSETBACK(aft, OPND(s))) { 1073 /* oho, must reconsider loop body */ 1074 pc -= OPND(s) + 1; 1075 INIT(here, pc); 1076 } 1077 break; 1078 case OQUEST_: /* two branches, both forward */ 1079 FWD(aft, aft, 1); 1080 FWD(aft, aft, OPND(s)); 1081 break; 1082 case O_QUEST: /* just an empty */ 1083 FWD(aft, aft, 1); 1084 break; 1085 case OLPAREN: /* not significant here */ 1086 case ORPAREN: 1087 FWD(aft, aft, 1); 1088 break; 1089 case OCH_: /* mark the first two branches */ 1090 FWD(aft, aft, 1); 1091 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 1092 FWD(aft, aft, OPND(s)); 1093 break; 1094 case OOR1: /* done a branch, find the O_CH */ 1095 if (ISSTATEIN(aft, here)) { 1096 for (look = 1; 1097 OP(s = g->strip[pc+look]) != (sop)O_CH; 1098 look += OPND(s)) 1099 assert(OP(s) == (sop)OOR2); 1100 FWD(aft, aft, look + 1); 1101 } 1102 break; 1103 case OOR2: /* propagate OCH_'s marking */ 1104 FWD(aft, aft, 1); 1105 if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) { 1106 assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); 1107 FWD(aft, aft, OPND(s)); 1108 } 1109 break; 1110 case O_CH: /* just empty */ 1111 FWD(aft, aft, 1); 1112 break; 1113 default: /* ooooops... */ 1114 assert(nope); 1115 break; 1116 } 1117 } 1118 1119 return(aft); 1120 } 1121 1122 #ifdef REDEBUG 1123 /* 1124 - print - print a set of states 1125 == #ifdef REDEBUG 1126 == static void print(struct match *m, const char *caption, states st, \ 1127 == int ch, FILE *d); 1128 == #endif 1129 */ 1130 static void 1131 print(struct match *m, 1132 const char *caption, 1133 states st, 1134 int ch, 1135 FILE *d) 1136 { 1137 struct re_guts *g = m->g; 1138 sopno i; 1139 int first = 1; 1140 1141 if (!(m->eflags®_TRACE)) 1142 return; 1143 1144 fprintf(d, "%s", caption); 1145 if (ch != '\0') 1146 fprintf(d, " %s", pchar(ch)); 1147 for (i = 0; i < g->nstates; i++) 1148 if (ISSET(st, i)) { 1149 fprintf(d, "%s%lu", (first) ? "\t" : ", ", i); 1150 first = 0; 1151 } 1152 fprintf(d, "\n"); 1153 } 1154 1155 /* 1156 - at - print current situation 1157 == #ifdef REDEBUG 1158 == static void at(struct match *m, const char *title, const char *start, \ 1159 == const char *stop, sopno startst, sopno stopst); 1160 == #endif 1161 */ 1162 static void 1163 at( struct match *m, 1164 const char *title, 1165 const char *start, 1166 const char *stop, 1167 sopno startst, 1168 sopno stopst) 1169 { 1170 if (!(m->eflags®_TRACE)) 1171 return; 1172 1173 printf("%s %s-", title, pchar(*start)); 1174 printf("%s ", pchar(*stop)); 1175 printf("%ld-%ld\n", (long)startst, (long)stopst); 1176 } 1177 1178 #ifndef PCHARDONE 1179 #define PCHARDONE /* never again */ 1180 /* 1181 - pchar - make a character printable 1182 == #ifdef REDEBUG 1183 == static const char *pchar(int ch); 1184 == #endif 1185 * 1186 * Is this identical to regchar() over in debug.c? Well, yes. But a 1187 * duplicate here avoids having a debugging-capable regexec.o tied to 1188 * a matching debug.o, and this is convenient. It all disappears in 1189 * the non-debug compilation anyway, so it doesn't matter much. 1190 */ 1191 static const char * /* -> representation */ 1192 pchar(int ch) 1193 { 1194 static char pbuf[10]; 1195 1196 if (isprint((uch)ch) || ch == ' ') 1197 sprintf(pbuf, "%c", ch); 1198 else 1199 sprintf(pbuf, "\\%o", ch); 1200 return(pbuf); 1201 } 1202 #endif 1203 #endif 1204 1205 #undef stepback 1206 #undef matcher 1207 #undef walk 1208 #undef dissect 1209 #undef backref 1210 #undef step 1211 #undef print 1212 #undef at 1213 #undef match 1214