1 /* $NetBSD: engine.c,v 1.7 2011/11/19 17:45:11 tnozaki Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer of the University of Toronto. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.4 (Berkeley) 3/19/94 36 */ 37 38 /* 39 * The matching engine and friends. This file is #included by regexec.c 40 * after suitable #defines of a variety of macros used herein, so that 41 * different state representations can be used without duplicating masses 42 * of code. 43 */ 44 45 #ifdef SNAMES 46 #define matcher smatcher 47 #define fast sfast 48 #define slow sslow 49 #define dissect sdissect 50 #define backref sbackref 51 #define step sstep 52 #define print sprint 53 #define at sat 54 #define match smat 55 #endif 56 #ifdef LNAMES 57 #define matcher lmatcher 58 #define fast lfast 59 #define slow lslow 60 #define dissect ldissect 61 #define backref lbackref 62 #define step lstep 63 #define print lprint 64 #define at lat 65 #define match lmat 66 #endif 67 68 /* another structure passed up and down to avoid zillions of parameters */ 69 struct match { 70 struct re_guts *g; 71 int eflags; 72 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 73 const RCHAR_T *offp; /* offsets work from here */ 74 const RCHAR_T *beginp; /* start of string -- virtual NUL precedes */ 75 const RCHAR_T *endp; /* end of string -- virtual NUL here */ 76 const RCHAR_T *coldp; /* can be no match starting before here */ 77 const RCHAR_T **lastpos; /* [nplus+1] */ 78 STATEVARS; 79 states st; /* current states */ 80 states fresh; /* states for a fresh start */ 81 states tmp; /* temporary */ 82 states empty; /* empty set of states */ 83 }; 84 85 /* ========= begin header generated by ./mkh ========= */ 86 #ifdef __cplusplus 87 extern "C" { 88 #endif 89 90 /* === engine.c === */ 91 static int matcher(struct re_guts *g, const RCHAR_T *string, size_t nmatch, regmatch_t pmatch[], int eflags); 92 static const RCHAR_T *dissect(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst, sopno stopst); 93 static const RCHAR_T *backref(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst, sopno stopst, sopno lev); 94 static const RCHAR_T *fast(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst, sopno stopst); 95 static const RCHAR_T *slow(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst, sopno stopst); 96 static states step(struct re_guts *g, sopno start, sopno stop, states bef, int flag, RCHAR_T ch, states aft); 97 #define BOL (1) 98 #define EOL (BOL+1) 99 #define BOLEOL (BOL+2) 100 #define NOTHING (BOL+3) 101 #define BOW (BOL+4) 102 #define EOW (BOL+5) 103 #ifdef REDEBUG 104 static void print(struct match *m, char *caption, states st, int ch, FILE *d); 105 #endif 106 #ifdef REDEBUG 107 static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst); 108 #endif 109 #ifdef REDEBUG 110 static char *pchar(int ch); 111 #endif 112 113 #ifdef __cplusplus 114 } 115 #endif 116 /* ========= end header generated by ./mkh ========= */ 117 118 #ifdef REDEBUG 119 #define SP(t, s, c) print(m, t, s, c, stdout) 120 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 121 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 122 #else 123 #define SP(t, s, c) /* nothing */ 124 #define AT(t, p1, p2, s1, s2) /* nothing */ 125 #define NOTE(s) /* nothing */ 126 #endif 127 128 /* 129 - matcher - the actual matching engine 130 */ 131 static int /* 0 success, REG_NOMATCH failure */ 132 matcher(struct re_guts *g, const RCHAR_T *string, size_t nmatch, 133 regmatch_t pmatch[], int eflags) 134 { 135 const RCHAR_T *endp; 136 size_t i; 137 struct match mv; 138 struct match *m = &mv; 139 const RCHAR_T *dp; 140 const sopno gf = g->firststate+1; /* +1 for OEND */ 141 const sopno gl = g->laststate; 142 const RCHAR_T *start; 143 const RCHAR_T *stop; 144 145 /* simplify the situation where possible */ 146 if (g->cflags®_NOSUB) 147 nmatch = 0; 148 if (eflags®_STARTEND) { 149 start = string + pmatch[0].rm_so; 150 stop = string + pmatch[0].rm_eo; 151 } else { 152 start = string; 153 stop = start + STRLEN(start); 154 } 155 if (stop < start) 156 return(REG_INVARG); 157 158 /* prescreening; this does wonders for this rather slow code */ 159 if (g->must != NULL) { 160 for (dp = start; dp < stop; dp++) 161 if (*dp == g->must[0] && (size_t)(stop - dp) >= g->mlen && 162 MEMCMP(dp, g->must, g->mlen) == 0) 163 break; 164 if (dp == stop) /* we didn't find g->must */ 165 return(REG_NOMATCH); 166 } 167 168 /* match struct setup */ 169 m->g = g; 170 m->eflags = eflags; 171 m->pmatch = NULL; 172 m->lastpos = NULL; 173 m->offp = string; 174 m->beginp = start; 175 m->endp = stop; 176 STATESETUP(m, 4); 177 SETUP(m->st); 178 SETUP(m->fresh); 179 SETUP(m->tmp); 180 SETUP(m->empty); 181 CLEAR(m->empty); 182 183 /* this loop does only one repetition except for backrefs */ 184 for (;;) { 185 endp = fast(m, start, stop, gf, gl); 186 if (endp == NULL) { /* a miss */ 187 STATETEARDOWN(m); 188 return(REG_NOMATCH); 189 } 190 if (nmatch == 0 && !g->backrefs) 191 break; /* no further info needed */ 192 193 /* where? */ 194 assert(m->coldp != NULL); 195 for (;;) { 196 NOTE("finding start"); 197 endp = slow(m, m->coldp, stop, gf, gl); 198 if (endp != NULL) 199 break; 200 assert(m->coldp < m->endp); 201 m->coldp++; 202 } 203 if (nmatch == 1 && !g->backrefs) 204 break; /* no further info needed */ 205 206 /* oh my, he wants the subexpressions... */ 207 if (m->pmatch == NULL) 208 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 209 sizeof(regmatch_t)); 210 if (m->pmatch == NULL) { 211 STATETEARDOWN(m); 212 return(REG_ESPACE); 213 } 214 for (i = 1; i <= m->g->nsub; i++) 215 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 216 if (!g->backrefs && !(m->eflags®_BACKR)) { 217 NOTE("dissecting"); 218 dp = dissect(m, m->coldp, endp, gf, gl); 219 } else { 220 if (g->nplus > 0 && m->lastpos == NULL) 221 m->lastpos = (const RCHAR_T **)malloc((g->nplus+1) * 222 sizeof(const RCHAR_T *)); 223 if (g->nplus > 0 && m->lastpos == NULL) { 224 free(m->pmatch); 225 STATETEARDOWN(m); 226 return(REG_ESPACE); 227 } 228 NOTE("backref dissect"); 229 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 230 } 231 if (dp != NULL) 232 break; 233 234 /* uh-oh... we couldn't find a subexpression-level match */ 235 assert(g->backrefs); /* must be back references doing it */ 236 assert(g->nplus == 0 || m->lastpos != NULL); 237 for (;;) { 238 if (dp != NULL || endp <= m->coldp) 239 break; /* defeat */ 240 NOTE("backoff"); 241 endp = slow(m, m->coldp, endp-1, gf, gl); 242 if (endp == NULL) 243 break; /* defeat */ 244 /* try it on a shorter possibility */ 245 #ifndef NDEBUG 246 for (i = 1; i <= m->g->nsub; i++) { 247 assert(m->pmatch[i].rm_so == -1); 248 assert(m->pmatch[i].rm_eo == -1); 249 } 250 #endif 251 NOTE("backoff dissect"); 252 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 253 } 254 assert(dp == NULL || dp == endp); 255 if (dp != NULL) /* found a shorter one */ 256 break; 257 258 /* despite initial appearances, there is no match here */ 259 NOTE("false alarm"); 260 start = m->coldp + 1; /* recycle starting later */ 261 assert(start <= stop); 262 } 263 264 /* fill in the details if requested */ 265 if (nmatch > 0) { 266 pmatch[0].rm_so = m->coldp - m->offp; 267 pmatch[0].rm_eo = endp - m->offp; 268 } 269 if (nmatch > 1) { 270 assert(m->pmatch != NULL); 271 for (i = 1; i < nmatch; i++) 272 if (i <= m->g->nsub) 273 pmatch[i] = m->pmatch[i]; 274 else { 275 pmatch[i].rm_so = -1; 276 pmatch[i].rm_eo = -1; 277 } 278 } 279 280 if (m->pmatch != NULL) 281 free((char *)m->pmatch); 282 if (m->lastpos != NULL) 283 free((char *)m->lastpos); 284 STATETEARDOWN(m); 285 return(0); 286 } 287 288 /* 289 - dissect - figure out what matched what, no back references 290 */ 291 static const RCHAR_T * /* == stop (success) always */ 292 dissect(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, 293 sopno startst, sopno stopst) 294 { 295 int i; 296 sopno ss; /* start sop of current subRE */ 297 sopno es; /* end sop of current subRE */ 298 const RCHAR_T *sp; /* start of string matched by it */ 299 const RCHAR_T *stp; /* string matched by it cannot pass here */ 300 const RCHAR_T *rest; /* start of rest of string */ 301 const RCHAR_T *tail; /* string unmatched by rest of RE */ 302 sopno ssub; /* start sop of subsubRE */ 303 sopno esub; /* end sop of subsubRE */ 304 const RCHAR_T *ssp; /* start of string matched by subsubRE */ 305 const RCHAR_T *sep; /* end of string matched by subsubRE */ 306 const RCHAR_T *oldssp; /* previous ssp */ 307 const RCHAR_T *dp; 308 309 AT("diss", start, stop, startst, stopst); 310 sp = start; 311 for (ss = startst; ss < stopst; ss = es) { 312 /* identify end of subRE */ 313 es = ss; 314 switch (m->g->strip[es]) { 315 case OPLUS_: 316 case OQUEST_: 317 es += m->g->stripdata[es]; 318 break; 319 case OCH_: 320 while (m->g->strip[es] != O_CH) 321 es += m->g->stripdata[es]; 322 break; 323 } 324 es++; 325 326 /* figure out what it matched */ 327 switch (m->g->strip[ss]) { 328 case OEND: 329 assert(nope); 330 break; 331 case OCHAR: 332 sp++; 333 break; 334 case OBOL: 335 case OEOL: 336 case OBOW: 337 case OEOW: 338 break; 339 case OANY: 340 case OANYOF: 341 sp++; 342 break; 343 case OBACK_: 344 case O_BACK: 345 assert(nope); 346 break; 347 /* cases where length of match is hard to find */ 348 case OQUEST_: 349 stp = stop; 350 for (;;) { 351 /* how long could this one be? */ 352 rest = slow(m, sp, stp, ss, es); 353 assert(rest != NULL); /* it did match */ 354 /* could the rest match the rest? */ 355 tail = slow(m, rest, stop, es, stopst); 356 if (tail == stop) 357 break; /* yes! */ 358 /* no -- try a shorter match for this one */ 359 stp = rest - 1; 360 assert(stp >= sp); /* it did work */ 361 } 362 ssub = ss + 1; 363 esub = es - 1; 364 /* did innards match? */ 365 if (slow(m, sp, rest, ssub, esub) != NULL) { 366 dp = dissect(m, sp, rest, ssub, esub); 367 assert(dp == rest); 368 } else /* no */ 369 assert(sp == rest); 370 sp = rest; 371 break; 372 case OPLUS_: 373 stp = stop; 374 for (;;) { 375 /* how long could this one be? */ 376 rest = slow(m, sp, stp, ss, es); 377 assert(rest != NULL); /* it did match */ 378 /* could the rest match the rest? */ 379 tail = slow(m, rest, stop, es, stopst); 380 if (tail == stop) 381 break; /* yes! */ 382 /* no -- try a shorter match for this one */ 383 stp = rest - 1; 384 assert(stp >= sp); /* it did work */ 385 } 386 ssub = ss + 1; 387 esub = es - 1; 388 ssp = sp; 389 oldssp = ssp; 390 for (;;) { /* find last match of innards */ 391 sep = slow(m, ssp, rest, ssub, esub); 392 if (sep == NULL || sep == ssp) 393 break; /* failed or matched null */ 394 oldssp = ssp; /* on to next try */ 395 ssp = sep; 396 } 397 if (sep == NULL) { 398 /* last successful match */ 399 sep = ssp; 400 ssp = oldssp; 401 } 402 assert(sep == rest); /* must exhaust substring */ 403 assert(slow(m, ssp, sep, ssub, esub) == rest); 404 dp = dissect(m, ssp, sep, ssub, esub); 405 assert(dp == sep); 406 sp = rest; 407 break; 408 case OCH_: 409 stp = stop; 410 for (;;) { 411 /* how long could this one be? */ 412 rest = slow(m, sp, stp, ss, es); 413 assert(rest != NULL); /* it did match */ 414 /* could the rest match the rest? */ 415 tail = slow(m, rest, stop, es, stopst); 416 if (tail == stop) 417 break; /* yes! */ 418 /* no -- try a shorter match for this one */ 419 stp = rest - 1; 420 assert(stp >= sp); /* it did work */ 421 } 422 ssub = ss + 1; 423 esub = ss + m->g->stripdata[ss] - 1; 424 assert(m->g->strip[esub] == OOR1); 425 for (;;) { /* find first matching branch */ 426 if (slow(m, sp, rest, ssub, esub) == rest) 427 break; /* it matched all of it */ 428 /* that one missed, try next one */ 429 assert(m->g->strip[esub] == OOR1); 430 esub++; 431 assert(m->g->strip[esub] == OOR2); 432 ssub = esub + 1; 433 esub += m->g->stripdata[esub]; 434 if (m->g->strip[esub] == OOR2) 435 esub--; 436 else 437 assert(m->g->strip[esub] == O_CH); 438 } 439 dp = dissect(m, sp, rest, ssub, esub); 440 assert(dp == rest); 441 sp = rest; 442 break; 443 case O_PLUS: 444 case O_QUEST: 445 case OOR1: 446 case OOR2: 447 case O_CH: 448 assert(nope); 449 break; 450 case OLPAREN: 451 i = m->g->stripdata[ss]; 452 assert(0 < i && i <= m->g->nsub); 453 m->pmatch[i].rm_so = sp - m->offp; 454 break; 455 case ORPAREN: 456 i = m->g->stripdata[ss]; 457 assert(0 < i && i <= m->g->nsub); 458 m->pmatch[i].rm_eo = sp - m->offp; 459 break; 460 default: /* uh oh */ 461 assert(nope); 462 break; 463 } 464 } 465 466 assert(sp == stop); 467 return(sp); 468 } 469 470 /* 471 - backref - figure out what matched what, figuring in back references 472 */ 473 static const RCHAR_T * /* == stop (success) or NULL (failure) */ 474 backref(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, 475 sopno startst, sopno stopst, sopno lev) /* PLUS nesting level */ 476 { 477 int i; 478 sopno ss; /* start sop of current subRE */ 479 const RCHAR_T *sp; /* start of string matched by it */ 480 sopno ssub; /* start sop of subsubRE */ 481 sopno esub; /* end sop of subsubRE */ 482 const RCHAR_T *ssp; /* start of string matched by subsubRE */ 483 const RCHAR_T *dp; 484 size_t len; 485 int hard; 486 sop s; 487 RCHAR_T d; 488 regoff_t offsave; 489 cset *cs; 490 491 AT("back", start, stop, startst, stopst); 492 sp = start; 493 494 /* get as far as we can with easy stuff */ 495 hard = 0; 496 for (ss = startst; !hard && ss < stopst; ss++) { 497 s = m->g->strip[ss]; 498 d = m->g->stripdata[ss]; 499 switch (s) { 500 case OCHAR: 501 if (sp == stop || *sp++ != d) 502 return(NULL); 503 break; 504 case OANY: 505 if (sp == stop) 506 return(NULL); 507 sp++; 508 break; 509 case OANYOF: 510 cs = &m->g->sets[d]; 511 if (sp == stop || !CHIN(cs, *sp++)) 512 return(NULL); 513 break; 514 case OBOL: 515 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 516 (sp < m->endp && *(sp-1) == '\n' && 517 (m->g->cflags®_NEWLINE)) ) 518 { /* yes */ } 519 else 520 return(NULL); 521 break; 522 case OEOL: 523 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 524 (sp < m->endp && *sp == '\n' && 525 (m->g->cflags®_NEWLINE)) ) 526 { /* yes */ } 527 else 528 return(NULL); 529 break; 530 case OBOW: 531 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 532 (sp < m->endp && *(sp-1) == '\n' && 533 (m->g->cflags®_NEWLINE)) || 534 (sp > m->beginp && 535 !ISWORD(*(sp-1))) ) && 536 (sp < m->endp && ISWORD(*sp)) ) 537 { /* yes */ } 538 else 539 return(NULL); 540 break; 541 case OEOW: 542 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 543 (sp < m->endp && *sp == '\n' && 544 (m->g->cflags®_NEWLINE)) || 545 (sp < m->endp && !ISWORD(*sp)) ) && 546 (sp > m->beginp && ISWORD(*(sp-1))) ) 547 { /* yes */ } 548 else 549 return(NULL); 550 break; 551 case O_QUEST: 552 break; 553 case OOR1: /* matches null but needs to skip */ 554 ss++; 555 s = m->g->strip[ss]; 556 d = m->g->stripdata[ss]; 557 do { 558 assert(s == OOR2); 559 ss += d; 560 s = m->g->strip[ss]; 561 d = m->g->stripdata[ss]; 562 } while (s != O_CH); 563 /* note that the ss++ gets us past the O_CH */ 564 break; 565 default: /* have to make a choice */ 566 hard = 1; 567 break; 568 } 569 } 570 if (!hard) { /* that was it! */ 571 if (sp != stop) 572 return(NULL); 573 return(sp); 574 } 575 ss--; /* adjust for the for's final increment */ 576 577 /* the hard stuff */ 578 AT("hard", sp, stop, ss, stopst); 579 s = m->g->strip[ss]; 580 d = m->g->stripdata[ss]; 581 switch (s) { 582 case OBACK_: /* the vilest depths */ 583 i = d; 584 assert(0 < i && i <= m->g->nsub); 585 if (m->pmatch[i].rm_eo == -1) 586 return(NULL); 587 assert(m->pmatch[i].rm_so != -1); 588 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 589 assert(stop - m->beginp >= len); 590 if (sp > stop - len) 591 return(NULL); /* not enough left to match */ 592 ssp = m->offp + m->pmatch[i].rm_so; 593 if (memcmp(sp, ssp, len) != 0) 594 return(NULL); 595 while (m->g->strip[ss] != O_BACK || m->g->stripdata[ss] != i) 596 ss++; 597 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 598 break; 599 case OQUEST_: /* to null or not */ 600 dp = backref(m, sp, stop, ss+1, stopst, lev); 601 if (dp != NULL) 602 return(dp); /* not */ 603 return(backref(m, sp, stop, ss+d+1, stopst, lev)); 604 break; 605 case OPLUS_: 606 assert(m->lastpos != NULL); 607 assert(lev+1 <= m->g->nplus); 608 m->lastpos[lev+1] = sp; 609 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 610 break; 611 case O_PLUS: 612 if (sp == m->lastpos[lev]) /* last pass matched null */ 613 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 614 /* try another pass */ 615 m->lastpos[lev] = sp; 616 dp = backref(m, sp, stop, ss-d+1, stopst, lev); 617 if (dp == NULL) 618 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 619 else 620 return(dp); 621 break; 622 case OCH_: /* find the right one, if any */ 623 ssub = ss + 1; 624 esub = ss + d - 1; 625 assert(m->g->strip[esub] == OOR1); 626 for (;;) { /* find first matching branch */ 627 dp = backref(m, sp, stop, ssub, esub, lev); 628 if (dp != NULL) 629 return(dp); 630 /* that one missed, try next one */ 631 if (m->g->strip[esub] == O_CH) 632 return(NULL); /* there is none */ 633 esub++; 634 assert(m->g->strip[esub] == OOR2); 635 ssub = esub + 1; 636 esub += m->g->stripdata[esub]; 637 if (m->g->strip[esub] == OOR2) 638 esub--; 639 else 640 assert(m->g->strip[esub] == O_CH); 641 } 642 break; 643 case OLPAREN: /* must undo assignment if rest fails */ 644 i = d; 645 assert(0 < i && i <= m->g->nsub); 646 offsave = m->pmatch[i].rm_so; 647 m->pmatch[i].rm_so = sp - m->offp; 648 dp = backref(m, sp, stop, ss+1, stopst, lev); 649 if (dp != NULL) 650 return(dp); 651 m->pmatch[i].rm_so = offsave; 652 return(NULL); 653 break; 654 case ORPAREN: /* must undo assignment if rest fails */ 655 i = d; 656 assert(0 < i && i <= m->g->nsub); 657 offsave = m->pmatch[i].rm_eo; 658 m->pmatch[i].rm_eo = sp - m->offp; 659 dp = backref(m, sp, stop, ss+1, stopst, lev); 660 if (dp != NULL) 661 return(dp); 662 m->pmatch[i].rm_eo = offsave; 663 return(NULL); 664 break; 665 default: /* uh oh */ 666 assert(nope); 667 break; 668 } 669 670 /* "can't happen" */ 671 assert(nope); 672 /* NOTREACHED */ 673 return NULL; 674 } 675 676 /* 677 - fast - step through the string at top speed 678 */ 679 static const RCHAR_T * /* where tentative match ended, or NULL */ 680 fast(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst, 681 sopno stopst) 682 { 683 states st = m->st; 684 states fresh = m->fresh; 685 states tmp = m->tmp; 686 const RCHAR_T *p = start; 687 RCHAR_T c = (start == m->beginp) ? OUT : *(start-1); 688 RCHAR_T lastc; /* previous c */ 689 int flag; 690 int i; 691 const RCHAR_T *coldp; /* last p after which no match was underway */ 692 693 CLEAR(st); 694 SET1(st, startst); 695 st = step(m->g, startst, stopst, st, NOTHING, OUT, st); 696 ASSIGN(fresh, st); 697 SP("start", st, *p); 698 coldp = NULL; 699 for (;;) { 700 /* next character */ 701 lastc = c; 702 c = (p == m->endp) ? OUT : *p; 703 if (EQ(st, fresh)) 704 coldp = p; 705 706 /* is there an EOL and/or BOL between lastc and c? */ 707 flag = 0; 708 i = 0; 709 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 710 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 711 flag = BOL; 712 i = m->g->nbol; 713 } 714 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 715 (c == OUT && !(m->eflags®_NOTEOL)) ) { 716 flag = (flag == BOL) ? BOLEOL : EOL; 717 i += m->g->neol; 718 } 719 if (i != 0) { 720 for (; i > 0; i--) 721 st = step(m->g, startst, stopst, st, flag, OUT, st); 722 SP("boleol", st, c); 723 } 724 725 /* how about a word boundary? */ 726 if ( (flag == BOL || (lastc != OUT && !ISWORD(lastc))) && 727 (c != OUT && ISWORD(c)) ) { 728 flag = BOW; 729 } 730 if ( (lastc != OUT && ISWORD(lastc)) && 731 (flag == EOL || (c != OUT && !ISWORD(c))) ) { 732 flag = EOW; 733 } 734 if (flag == BOW || flag == EOW) { 735 st = step(m->g, startst, stopst, st, flag, OUT, st); 736 SP("boweow", st, c); 737 } 738 739 /* are we done? */ 740 if (ISSET(st, stopst) || p == stop) 741 break; /* NOTE BREAK OUT */ 742 743 /* no, we must deal with this character */ 744 ASSIGN(tmp, st); 745 ASSIGN(st, fresh); 746 assert(c != OUT); 747 st = step(m->g, startst, stopst, tmp, 0, c, st); 748 SP("aft", st, c); 749 assert(EQ(step(m->g, startst, stopst, st, NOTHING, OUT, st), st)); 750 p++; 751 } 752 753 assert(coldp != NULL); 754 m->coldp = coldp; 755 if (ISSET(st, stopst)) 756 return(p+1); 757 else 758 return(NULL); 759 } 760 761 /* 762 - slow - step through the string more deliberately 763 */ 764 static const RCHAR_T * /* where it ended */ 765 slow(struct match *m, const RCHAR_T *start, const RCHAR_T *stop, sopno startst, 766 sopno stopst) 767 { 768 states st = m->st; 769 states empty = m->empty; 770 states tmp = m->tmp; 771 const RCHAR_T *p = start; 772 RCHAR_T c = (start == m->beginp) ? OUT : *(start-1); 773 RCHAR_T lastc; /* previous c */ 774 int flag; 775 int i; 776 const RCHAR_T *matchp; /* last p at which a match ended */ 777 778 AT("slow", start, stop, startst, stopst); 779 CLEAR(st); 780 SET1(st, startst); 781 SP("sstart", st, *p); 782 st = step(m->g, startst, stopst, st, NOTHING, OUT, st); 783 matchp = NULL; 784 for (;;) { 785 /* next character */ 786 lastc = c; 787 c = (p == m->endp) ? OUT : *p; 788 789 /* is there an EOL and/or BOL between lastc and c? */ 790 flag = 0; 791 i = 0; 792 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 793 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 794 flag = BOL; 795 i = m->g->nbol; 796 } 797 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 798 (c == OUT && !(m->eflags®_NOTEOL)) ) { 799 flag = (flag == BOL) ? BOLEOL : EOL; 800 i += m->g->neol; 801 } 802 if (i != 0) { 803 for (; i > 0; i--) 804 st = step(m->g, startst, stopst, st, flag, OUT, st); 805 SP("sboleol", st, c); 806 } 807 808 /* how about a word boundary? */ 809 if ( (flag == BOL || (lastc != OUT && !ISWORD(lastc))) && 810 (c != OUT && ISWORD(c)) ) { 811 flag = BOW; 812 } 813 if ( (lastc != OUT && ISWORD(lastc)) && 814 (flag == EOL || (c != OUT && !ISWORD(c))) ) { 815 flag = EOW; 816 } 817 if (flag == BOW || flag == EOW) { 818 st = step(m->g, startst, stopst, st, flag, OUT, st); 819 SP("sboweow", st, c); 820 } 821 822 /* are we done? */ 823 if (ISSET(st, stopst)) 824 matchp = p; 825 if (EQ(st, empty) || p == stop) 826 break; /* NOTE BREAK OUT */ 827 828 /* no, we must deal with this character */ 829 ASSIGN(tmp, st); 830 ASSIGN(st, empty); 831 assert(c != OUT); 832 st = step(m->g, startst, stopst, tmp, 0, c, st); 833 SP("saft", st, c); 834 assert(EQ(step(m->g, startst, stopst, st, NOTHING, OUT, st), st)); 835 p++; 836 } 837 838 return(matchp); 839 } 840 841 842 /* 843 - step - map set of states reachable before char to set reachable after 844 */ 845 static states 846 step(struct re_guts *g, 847 sopno start, /* start state within strip */ 848 sopno stop, /* state after stop state within strip */ 849 states bef, /* states reachable before */ 850 int flag, /* NONCHAR flag */ 851 RCHAR_T ch, /* character code */ 852 states aft) /* states already known reachable after */ 853 { 854 cset *cs; 855 sop s; 856 RCHAR_T d; 857 sopno pc; 858 onestate here; /* note, macros know this name */ 859 sopno look; 860 int i; 861 862 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 863 s = g->strip[pc]; 864 d = g->stripdata[pc]; 865 switch (s) { 866 case OEND: 867 assert(pc == stop-1); 868 break; 869 case OCHAR: 870 /* only characters can match */ 871 assert(!flag || ch != d); 872 if (ch == d) 873 FWD(aft, bef, 1); 874 break; 875 case OBOL: 876 if (flag == BOL || flag == BOLEOL) 877 FWD(aft, bef, 1); 878 break; 879 case OEOL: 880 if (flag == EOL || flag == BOLEOL) 881 FWD(aft, bef, 1); 882 break; 883 case OBOW: 884 if (flag == BOW) 885 FWD(aft, bef, 1); 886 break; 887 case OEOW: 888 if (flag == EOW) 889 FWD(aft, bef, 1); 890 break; 891 case OANY: 892 if (!flag) 893 FWD(aft, bef, 1); 894 break; 895 case OANYOF: 896 cs = &g->sets[d]; 897 if (!flag && CHIN(cs, ch)) 898 FWD(aft, bef, 1); 899 break; 900 case OBACK_: /* ignored here */ 901 case O_BACK: 902 FWD(aft, aft, 1); 903 break; 904 case OPLUS_: /* forward, this is just an empty */ 905 FWD(aft, aft, 1); 906 break; 907 case O_PLUS: /* both forward and back */ 908 FWD(aft, aft, 1); 909 i = ISSETBACK(aft, d); 910 BACK(aft, aft, d); 911 if (!i && ISSETBACK(aft, d)) { 912 /* oho, must reconsider loop body */ 913 pc -= d + 1; 914 INIT(here, pc); 915 } 916 break; 917 case OQUEST_: /* two branches, both forward */ 918 FWD(aft, aft, 1); 919 FWD(aft, aft, d); 920 break; 921 case O_QUEST: /* just an empty */ 922 FWD(aft, aft, 1); 923 break; 924 case OLPAREN: /* not significant here */ 925 case ORPAREN: 926 FWD(aft, aft, 1); 927 break; 928 case OCH_: /* mark the first two branches */ 929 FWD(aft, aft, 1); 930 assert(OP(g->strip[pc+d]) == OOR2); 931 FWD(aft, aft, d); 932 break; 933 case OOR1: /* done a branch, find the O_CH */ 934 if (ISSTATEIN(aft, here)) { 935 for (look = 1; /**/; look += d) { 936 s = g->strip[pc+look]; 937 d = g->stripdata[pc+look]; 938 if (s == O_CH) 939 break; 940 assert(s == OOR2); 941 } 942 FWD(aft, aft, look); 943 } 944 break; 945 case OOR2: /* propagate OCH_'s marking */ 946 FWD(aft, aft, 1); 947 if (g->strip[pc+d] != O_CH) { 948 assert(g->strip[pc+d] == OOR2); 949 FWD(aft, aft, d); 950 } 951 break; 952 case O_CH: /* just empty */ 953 FWD(aft, aft, 1); 954 break; 955 default: /* ooooops... */ 956 assert(nope); 957 break; 958 } 959 } 960 961 return(aft); 962 } 963 964 #ifdef REDEBUG 965 /* 966 - print - print a set of states 967 */ 968 static void 969 print(struct match *m, char *caption, states st, int ch, FILE *d) 970 { 971 struct re_guts *g = m->g; 972 int i; 973 int first = 1; 974 975 if (!(m->eflags®_TRACE)) 976 return; 977 978 fprintf(d, "%s", caption); 979 if (ch != '\0') 980 fprintf(d, " %s", pchar(ch)); 981 for (i = 0; i < g->nstates; i++) 982 if (ISSET(st, i)) { 983 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 984 first = 0; 985 } 986 fprintf(d, "\n"); 987 } 988 989 /* 990 - at - print current situation 991 */ 992 static void 993 at(struct match *m, char *title, char *start, char *stop, sopno startst, 994 sopno stopst) 995 { 996 if (!(m->eflags®_TRACE)) 997 return; 998 999 printf("%s %s-", title, pchar(*start)); 1000 printf("%s ", pchar(*stop)); 1001 printf("%ld-%ld\n", (long)startst, (long)stopst); 1002 } 1003 1004 #ifndef PCHARDONE 1005 #define PCHARDONE /* never again */ 1006 /* 1007 - pchar - make a character printable 1008 * 1009 * Is this identical to regchar() over in debug.c? Well, yes. But a 1010 * duplicate here avoids having a debugging-capable regexec.o tied to 1011 * a matching debug.o, and this is convenient. It all disappears in 1012 * the non-debug compilation anyway, so it doesn't matter much. 1013 */ 1014 static char * /* -> representation */ 1015 pchar(int ch) 1016 { 1017 static char pbuf[10]; 1018 1019 if (isprint(ch) || ch == ' ') 1020 snprintf(pbuf, sizeof(pbuf), "%c", ch); 1021 else 1022 snprintf(pbuf, sizeof(pbuf), "\\%o", ch); 1023 return(pbuf); 1024 } 1025 #endif 1026 #endif 1027 1028 #undef matcher 1029 #undef fast 1030 #undef slow 1031 #undef dissect 1032 #undef backref 1033 #undef step 1034 #undef print 1035 #undef at 1036 #undef match 1037