1 /* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 * 21 * Beware that some of this code is subtly aware of the way operator 22 * precedence is structured in regular expressions. Serious changes in 23 * regular-expression syntax might require a total rethink. 24 * 25 * *** NOTE: this code has been altered slightly for use in Tcl. *** 26 * Slightly modified by David MacKenzie to undo most of the changes for TCL. 27 * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman 28 */ 29 30 #include "less.h" 31 #if HAVE_STDIO_H 32 #include <stdio.h> 33 #endif 34 #if HAVE_STDLIB_H 35 #include <stdlib.h> 36 #endif 37 #if HAVE_STRING_H 38 #include <string.h> 39 #endif 40 #include "regexp.h" 41 42 /* 43 * The "internal use only" fields in regexp.h are present to pass info from 44 * compile to execute that permits the execute phase to run lots faster on 45 * simple cases. They are: 46 * 47 * regstart char that must begin a match; '\0' if none obvious 48 * reganch is the match anchored (at beginning-of-line only)? 49 * regmust string (pointer into program) that match must include, or NULL 50 * regmlen length of regmust string 51 * 52 * Regstart and reganch permit very fast decisions on suitable starting points 53 * for a match, cutting down the work a lot. Regmust permits fast rejection 54 * of lines that cannot possibly match. The regmust tests are costly enough 55 * that regcomp() supplies a regmust only if the r.e. contains something 56 * potentially expensive (at present, the only such thing detected is * or + 57 * at the start of the r.e., which can involve a lot of backup). Regmlen is 58 * supplied because the test in regexec() needs it and regcomp() is 59 * computing it anyway. 60 */ 61 62 /* 63 * Structure for regexp "program". This is essentially a linear encoding 64 * of a nondeterministic finite-state machine (aka syntax charts or 65 * "railroad normal form" in parsing technology). Each node is an opcode 66 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 67 * all nodes except BRANCH implement concatenation; a "next" pointer with 68 * a BRANCH on both ends of it is connecting two alternatives. (Here we 69 * have one of the subtle syntax dependencies: an individual BRANCH (as 70 * opposed to a collection of them) is never concatenated with anything 71 * because of operator precedence.) The operand of some types of node is 72 * a literal string; for others, it is a node leading into a sub-FSM. In 73 * particular, the operand of a BRANCH node is the first node of the branch. 74 * (NB this is *not* a tree structure: the tail of the branch connects 75 * to the thing following the set of BRANCHes.) The opcodes are: 76 */ 77 78 /* definition number opnd? meaning */ 79 #undef EOL 80 #define END 0 /* no End of program. */ 81 #define BOL 1 /* no Match "" at beginning of line. */ 82 #define EOL 2 /* no Match "" at end of line. */ 83 #define ANY 3 /* no Match any one character. */ 84 #define ANYOF 4 /* str Match any character in this string. */ 85 #define ANYBUT 5 /* str Match any character not in this string. */ 86 #define BRANCH 6 /* node Match this alternative, or the next... */ 87 #define BACK 7 /* no Match "", "next" ptr points backward. */ 88 #define EXACTLY 8 /* str Match this string. */ 89 #define NOTHING 9 /* no Match empty string. */ 90 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 91 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 92 #define OPEN 20 /* no Mark this point in input as start of #n. */ 93 /* OPEN+1 is number 1, etc. */ 94 #define CLOSE 30 /* no Analogous to OPEN. */ 95 96 /* 97 * Opcode notes: 98 * 99 * BRANCH The set of branches constituting a single choice are hooked 100 * together with their "next" pointers, since precedence prevents 101 * anything being concatenated to any individual branch. The 102 * "next" pointer of the last BRANCH in a choice points to the 103 * thing following the whole choice. This is also where the 104 * final "next" pointer of each individual branch points; each 105 * branch starts with the operand node of a BRANCH node. 106 * 107 * BACK Normal "next" pointers all implicitly point forward; BACK 108 * exists to make loop structures possible. 109 * 110 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 111 * BRANCH structures using BACK. Simple cases (one character 112 * per match) are implemented with STAR and PLUS for speed 113 * and to minimize recursive plunges. 114 * 115 * OPEN,CLOSE ...are numbered at compile time. 116 */ 117 118 /* 119 * A node is one char of opcode followed by two chars of "next" pointer. 120 * "Next" pointers are stored as two 8-bit pieces, high order first. The 121 * value is a positive offset from the opcode of the node containing it. 122 * An operand, if any, simply follows the node. (Note that much of the 123 * code generation knows about this implicit relationship.) 124 * 125 * Using two bytes for the "next" pointer is vast overkill for most things, 126 * but allows patterns to get big without disasters. 127 */ 128 #define OP(p) (*(p)) 129 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 130 #define OPERAND(p) ((p) + 3) 131 132 /* 133 * See regmagic.h for one further detail of program structure. 134 */ 135 136 137 /* 138 * Utility definitions. 139 */ 140 #ifndef CHARBITS 141 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 142 #else 143 #define UCHARAT(p) ((int)*(p)&CHARBITS) 144 #endif 145 146 #define FAIL(m) { regerror(m); return(NULL); } 147 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 148 #define META "^$.[()|?+*\\" 149 150 /* 151 * Flags to be passed up and down. 152 */ 153 #define HASWIDTH 01 /* Known never to match null string. */ 154 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 155 #define SPSTART 04 /* Starts with * or +. */ 156 #define WORST 0 /* Worst case. */ 157 158 /* 159 * Global work variables for regcomp(). 160 */ 161 static char *regparse; /* Input-scan pointer. */ 162 static int regnpar; /* () count. */ 163 static char regdummy; 164 static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 165 static long regsize; /* Code size. */ 166 167 /* 168 * The first byte of the regexp internal "program" is actually this magic 169 * number; the start node begins in the second byte. 170 */ 171 #define MAGIC 0234 172 173 174 /* 175 * Forward declarations for regcomp()'s friends. 176 */ 177 #ifndef STATIC 178 #define STATIC static 179 #endif 180 STATIC char *reg(); 181 STATIC char *regbranch(); 182 STATIC char *regpiece(); 183 STATIC char *regatom(); 184 STATIC char *regnode(); 185 STATIC char *regnext(); 186 STATIC void regc(); 187 STATIC void reginsert(); 188 STATIC void regtail(); 189 STATIC void regoptail(); 190 #ifdef STRCSPN 191 STATIC int strcspn(); 192 #endif 193 194 /* 195 - regcomp - compile a regular expression into internal code 196 * 197 * We can't allocate space until we know how big the compiled form will be, 198 * but we can't compile it (and thus know how big it is) until we've got a 199 * place to put the code. So we cheat: we compile it twice, once with code 200 * generation turned off and size counting turned on, and once "for real". 201 * This also means that we don't allocate space until we are sure that the 202 * thing really will compile successfully, and we never have to move the 203 * code and thus invalidate pointers into it. (Note that it has to be in 204 * one piece because free() must be able to free it all.) 205 * 206 * Beware that the optimization-preparation code in here knows about some 207 * of the structure of the compiled regexp. 208 */ 209 regexp * 210 regcomp(char *exp) 211 { 212 regexp *r; 213 char *scan; 214 char *longest; 215 int len; 216 int flags; 217 218 if (exp == NULL) 219 FAIL("NULL argument"); 220 221 /* First pass: determine size, legality. */ 222 regparse = exp; 223 regnpar = 1; 224 regsize = 0L; 225 regcode = ®dummy; 226 regc(MAGIC); 227 if (reg(0, &flags) == NULL) 228 return(NULL); 229 230 /* Small enough for pointer-storage convention? */ 231 if (regsize >= 32767L) /* Probably could be 65535L. */ 232 FAIL("regexp too big"); 233 234 /* Allocate space. */ 235 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 236 if (r == NULL) 237 FAIL("out of space"); 238 239 /* Second pass: emit code. */ 240 regparse = exp; 241 regnpar = 1; 242 regcode = r->program; 243 regc(MAGIC); 244 if (reg(0, &flags) == NULL) 245 { 246 free(r); 247 return(NULL); 248 } 249 250 /* Dig out information for optimizations. */ 251 r->regstart = '\0'; /* Worst-case defaults. */ 252 r->reganch = 0; 253 r->regmust = NULL; 254 r->regmlen = 0; 255 scan = r->program+1; /* First BRANCH. */ 256 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 257 scan = OPERAND(scan); 258 259 /* Starting-point info. */ 260 if (OP(scan) == EXACTLY) 261 r->regstart = *OPERAND(scan); 262 else if (OP(scan) == BOL) 263 r->reganch++; 264 265 /* 266 * If there's something expensive in the r.e., find the 267 * longest literal string that must appear and make it the 268 * regmust. Resolve ties in favor of later strings, since 269 * the regstart check works with the beginning of the r.e. 270 * and avoiding duplication strengthens checking. Not a 271 * strong reason, but sufficient in the absence of others. 272 */ 273 if (flags&SPSTART) { 274 longest = NULL; 275 len = 0; 276 for (; scan != NULL; scan = regnext(scan)) 277 if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) { 278 longest = OPERAND(scan); 279 len = (int) strlen(OPERAND(scan)); 280 } 281 r->regmust = longest; 282 r->regmlen = len; 283 } 284 } 285 286 return(r); 287 } 288 289 /* 290 - reg - regular expression, i.e. main body or parenthesized thing 291 * 292 * Caller must absorb opening parenthesis. 293 * 294 * Combining parenthesis handling with the base level of regular expression 295 * is a trifle forced, but the need to tie the tails of the branches to what 296 * follows makes it hard to avoid. 297 */ 298 static char * 299 reg(int paren, int *flagp) 300 { 301 char *ret; 302 char *br; 303 char *ender; 304 int parno = 0; 305 int flags; 306 307 *flagp = HASWIDTH; /* Tentatively. */ 308 309 /* Make an OPEN node, if parenthesized. */ 310 if (paren) { 311 if (regnpar >= NSUBEXP) 312 FAIL("too many ()"); 313 parno = regnpar; 314 regnpar++; 315 ret = regnode(OPEN+parno); 316 } else 317 ret = NULL; 318 319 /* Pick up the branches, linking them together. */ 320 br = regbranch(&flags); 321 if (br == NULL) 322 return(NULL); 323 if (ret != NULL) 324 regtail(ret, br); /* OPEN -> first. */ 325 else 326 ret = br; 327 if (!(flags&HASWIDTH)) 328 *flagp &= ~HASWIDTH; 329 *flagp |= flags&SPSTART; 330 while (*regparse == '|') { 331 regparse++; 332 br = regbranch(&flags); 333 if (br == NULL) 334 return(NULL); 335 regtail(ret, br); /* BRANCH -> BRANCH. */ 336 if (!(flags&HASWIDTH)) 337 *flagp &= ~HASWIDTH; 338 *flagp |= flags&SPSTART; 339 } 340 341 /* Make a closing node, and hook it on the end. */ 342 ender = regnode((paren) ? CLOSE+parno : END); 343 regtail(ret, ender); 344 345 /* Hook the tails of the branches to the closing node. */ 346 for (br = ret; br != NULL; br = regnext(br)) 347 regoptail(br, ender); 348 349 /* Check for proper termination. */ 350 if (paren && *regparse++ != ')') { 351 FAIL("unmatched ()"); 352 } else if (!paren && *regparse != '\0') { 353 if (*regparse == ')') { 354 FAIL("unmatched ()"); 355 } else 356 FAIL("junk on end"); /* "Can't happen". */ 357 /* NOTREACHED */ 358 } 359 360 return(ret); 361 } 362 363 /* 364 - regbranch - one alternative of an | operator 365 * 366 * Implements the concatenation operator. 367 */ 368 static char * 369 regbranch(int *flagp) 370 { 371 char *ret; 372 char *chain; 373 char *latest; 374 int flags; 375 376 *flagp = WORST; /* Tentatively. */ 377 378 ret = regnode(BRANCH); 379 chain = NULL; 380 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { 381 latest = regpiece(&flags); 382 if (latest == NULL) 383 return(NULL); 384 *flagp |= flags&HASWIDTH; 385 if (chain == NULL) /* First piece. */ 386 *flagp |= flags&SPSTART; 387 else 388 regtail(chain, latest); 389 chain = latest; 390 } 391 if (chain == NULL) /* Loop ran zero times. */ 392 (void) regnode(NOTHING); 393 394 return(ret); 395 } 396 397 /* 398 - regpiece - something followed by possible [*+?] 399 * 400 * Note that the branching code sequences used for ? and the general cases 401 * of * and + are somewhat optimized: they use the same NOTHING node as 402 * both the endmarker for their branch list and the body of the last branch. 403 * It might seem that this node could be dispensed with entirely, but the 404 * endmarker role is not redundant. 405 */ 406 static char * 407 regpiece(int *flagp) 408 { 409 char *ret; 410 char op; 411 char *next; 412 int flags; 413 414 ret = regatom(&flags); 415 if (ret == NULL) 416 return(NULL); 417 418 op = *regparse; 419 if (!ISMULT(op)) { 420 *flagp = flags; 421 return(ret); 422 } 423 424 if (!(flags&HASWIDTH) && op != '?') 425 FAIL("*+ operand could be empty"); 426 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 427 428 if (op == '*' && (flags&SIMPLE)) 429 reginsert(STAR, ret); 430 else if (op == '*') { 431 /* Emit x* as (x&|), where & means "self". */ 432 reginsert(BRANCH, ret); /* Either x */ 433 regoptail(ret, regnode(BACK)); /* and loop */ 434 regoptail(ret, ret); /* back */ 435 regtail(ret, regnode(BRANCH)); /* or */ 436 regtail(ret, regnode(NOTHING)); /* null. */ 437 } else if (op == '+' && (flags&SIMPLE)) 438 reginsert(PLUS, ret); 439 else if (op == '+') { 440 /* Emit x+ as x(&|), where & means "self". */ 441 next = regnode(BRANCH); /* Either */ 442 regtail(ret, next); 443 regtail(regnode(BACK), ret); /* loop back */ 444 regtail(next, regnode(BRANCH)); /* or */ 445 regtail(ret, regnode(NOTHING)); /* null. */ 446 } else if (op == '?') { 447 /* Emit x? as (x|) */ 448 reginsert(BRANCH, ret); /* Either x */ 449 regtail(ret, regnode(BRANCH)); /* or */ 450 next = regnode(NOTHING); /* null. */ 451 regtail(ret, next); 452 regoptail(ret, next); 453 } 454 regparse++; 455 if (ISMULT(*regparse)) 456 FAIL("nested *?+"); 457 458 return(ret); 459 } 460 461 /* 462 - regatom - the lowest level 463 * 464 * Optimization: gobbles an entire sequence of ordinary characters so that 465 * it can turn them into a single node, which is smaller to store and 466 * faster to run. Backslashed characters are exceptions, each becoming a 467 * separate node; the code is simpler that way and it's not worth fixing. 468 */ 469 static char * 470 regatom(int *flagp) 471 { 472 char *ret; 473 int flags; 474 475 *flagp = WORST; /* Tentatively. */ 476 477 switch (*regparse++) { 478 case '^': 479 ret = regnode(BOL); 480 break; 481 case '$': 482 ret = regnode(EOL); 483 break; 484 case '.': 485 ret = regnode(ANY); 486 *flagp |= HASWIDTH|SIMPLE; 487 break; 488 case '[': { 489 int clss; 490 int classend; 491 492 if (*regparse == '^') { /* Complement of range. */ 493 ret = regnode(ANYBUT); 494 regparse++; 495 } else 496 ret = regnode(ANYOF); 497 if (*regparse == ']' || *regparse == '-') 498 regc(*regparse++); 499 while (*regparse != '\0' && *regparse != ']') { 500 if (*regparse == '-') { 501 regparse++; 502 if (*regparse == ']' || *regparse == '\0') 503 regc('-'); 504 else { 505 clss = UCHARAT(regparse-2)+1; 506 classend = UCHARAT(regparse); 507 if (clss > classend+1) 508 FAIL("invalid [] range"); 509 for (; clss <= classend; clss++) 510 regc(clss); 511 regparse++; 512 } 513 } else 514 regc(*regparse++); 515 } 516 regc('\0'); 517 if (*regparse != ']') 518 FAIL("unmatched []"); 519 regparse++; 520 *flagp |= HASWIDTH|SIMPLE; 521 } 522 break; 523 case '(': 524 ret = reg(1, &flags); 525 if (ret == NULL) 526 return(NULL); 527 *flagp |= flags&(HASWIDTH|SPSTART); 528 break; 529 case '\0': 530 case '|': 531 case ')': 532 FAIL("internal urp"); /* Supposed to be caught earlier. */ 533 /* NOTREACHED */ 534 break; 535 case '?': 536 case '+': 537 case '*': 538 FAIL("?+* follows nothing"); 539 /* NOTREACHED */ 540 break; 541 case '\\': 542 if (*regparse == '\0') 543 FAIL("trailing \\"); 544 ret = regnode(EXACTLY); 545 regc(*regparse++); 546 regc('\0'); 547 *flagp |= HASWIDTH|SIMPLE; 548 break; 549 default: { 550 int len; 551 char ender; 552 553 regparse--; 554 len = (int) strcspn(regparse, META); 555 if (len <= 0) 556 FAIL("internal disaster"); 557 ender = *(regparse+len); 558 if (len > 1 && ISMULT(ender)) 559 len--; /* Back off clear of ?+* operand. */ 560 *flagp |= HASWIDTH; 561 if (len == 1) 562 *flagp |= SIMPLE; 563 ret = regnode(EXACTLY); 564 while (len > 0) { 565 regc(*regparse++); 566 len--; 567 } 568 regc('\0'); 569 } 570 break; 571 } 572 573 return(ret); 574 } 575 576 /* 577 - regnode - emit a node 578 */ 579 static char * /* Location. */ 580 regnode(char op) 581 { 582 char *ret; 583 char *ptr; 584 585 ret = regcode; 586 if (ret == ®dummy) { 587 regsize += 3; 588 return(ret); 589 } 590 591 ptr = ret; 592 *ptr++ = op; 593 *ptr++ = '\0'; /* Null "next" pointer. */ 594 *ptr++ = '\0'; 595 regcode = ptr; 596 597 return(ret); 598 } 599 600 /* 601 - regc - emit (if appropriate) a byte of code 602 */ 603 static void 604 regc(char b) 605 { 606 if (regcode != ®dummy) 607 *regcode++ = b; 608 else 609 regsize++; 610 } 611 612 /* 613 - reginsert - insert an operator in front of already-emitted operand 614 * 615 * Means relocating the operand. 616 */ 617 static void 618 reginsert(char op, char *opnd) 619 { 620 char *src; 621 char *dst; 622 char *place; 623 624 if (regcode == ®dummy) { 625 regsize += 3; 626 return; 627 } 628 629 src = regcode; 630 regcode += 3; 631 dst = regcode; 632 while (src > opnd) 633 *--dst = *--src; 634 635 place = opnd; /* Op node, where operand used to be. */ 636 *place++ = op; 637 *place++ = '\0'; 638 *place++ = '\0'; 639 } 640 641 /* 642 - regtail - set the next-pointer at the end of a node chain 643 */ 644 static void 645 regtail(char *p, char *val) 646 { 647 char *scan; 648 char *temp; 649 int offset; 650 651 if (p == ®dummy) 652 return; 653 654 /* Find last node. */ 655 scan = p; 656 for (;;) { 657 temp = regnext(scan); 658 if (temp == NULL) 659 break; 660 scan = temp; 661 } 662 663 if (OP(scan) == BACK) 664 offset = (int) (scan - val); 665 else 666 offset = (int) (val - scan); 667 *(scan+1) = (offset>>8)&0377; 668 *(scan+2) = offset&0377; 669 } 670 671 /* 672 - regoptail - regtail on operand of first argument; nop if operandless 673 */ 674 static void 675 regoptail(char *p, char *val) 676 { 677 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 678 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 679 return; 680 regtail(OPERAND(p), val); 681 } 682 683 /* 684 * regexec and friends 685 */ 686 687 /* 688 * Global work variables for regexec(). 689 */ 690 static char *reginput; /* String-input pointer. */ 691 static char *regbol; /* Beginning of input, for ^ check. */ 692 static char **regstartp; /* Pointer to startp array. */ 693 static char **regendp; /* Ditto for endp. */ 694 695 /* 696 * Forwards. 697 */ 698 STATIC int regtry(); 699 STATIC int regmatch(); 700 STATIC int regrepeat(); 701 702 #ifdef DEBUG 703 int regnarrate = 0; 704 void regdump(); 705 STATIC char *regprop(); 706 #endif 707 708 /* 709 - regexec - match a regexp against a string 710 */ 711 int 712 regexec2(regexp *prog, char *string, int notbol) 713 { 714 char *s; 715 716 /* Be paranoid... */ 717 if (prog == NULL || string == NULL) { 718 regerror("NULL parameter"); 719 return(0); 720 } 721 722 /* Check validity of program. */ 723 if (UCHARAT(prog->program) != MAGIC) { 724 regerror("corrupted program"); 725 return(0); 726 } 727 728 /* If there is a "must appear" string, look for it. */ 729 if (prog->regmust != NULL) { 730 s = string; 731 while ((s = strchr(s, prog->regmust[0])) != NULL) { 732 if (strncmp(s, prog->regmust, prog->regmlen) == 0) 733 break; /* Found it. */ 734 s++; 735 } 736 if (s == NULL) /* Not present. */ 737 return(0); 738 } 739 740 /* Mark beginning of line for ^ . */ 741 if (notbol) 742 regbol = NULL; 743 else 744 regbol = string; 745 746 /* Simplest case: anchored match need be tried only once. */ 747 if (prog->reganch) 748 return(regtry(prog, string)); 749 750 /* Messy cases: unanchored match. */ 751 s = string; 752 if (prog->regstart != '\0') 753 /* We know what char it must start with. */ 754 while ((s = strchr(s, prog->regstart)) != NULL) { 755 if (regtry(prog, s)) 756 return(1); 757 s++; 758 } 759 else 760 /* We don't -- general case. */ 761 do { 762 if (regtry(prog, s)) 763 return(1); 764 } while (*s++ != '\0'); 765 766 /* Failure. */ 767 return(0); 768 } 769 770 int 771 regexec(regexp *prog, char *string) 772 { 773 return regexec2(prog, string, 0); 774 } 775 776 /* 777 - regtry - try match at specific point 778 */ 779 static int /* 0 failure, 1 success */ 780 regtry(regexp *prog, char *string) 781 { 782 int i; 783 char **sp; 784 char **ep; 785 786 reginput = string; 787 regstartp = prog->startp; 788 regendp = prog->endp; 789 790 sp = prog->startp; 791 ep = prog->endp; 792 for (i = NSUBEXP; i > 0; i--) { 793 *sp++ = NULL; 794 *ep++ = NULL; 795 } 796 if (regmatch(prog->program + 1)) { 797 prog->startp[0] = string; 798 prog->endp[0] = reginput; 799 return(1); 800 } else 801 return(0); 802 } 803 804 /* 805 - regmatch - main matching routine 806 * 807 * Conceptually the strategy is simple: check to see whether the current 808 * node matches, call self recursively to see whether the rest matches, 809 * and then act accordingly. In practice we make some effort to avoid 810 * recursion, in particular by going through "ordinary" nodes (that don't 811 * need to know whether the rest of the match failed) by a loop instead of 812 * by recursion. 813 */ 814 static int /* 0 failure, 1 success */ 815 regmatch(char *prog) 816 { 817 char *scan; /* Current node. */ 818 char *next; /* Next node. */ 819 820 scan = prog; 821 #ifdef DEBUG 822 if (scan != NULL && regnarrate) 823 fprintf(stderr, "%s(\n", regprop(scan)); 824 #endif 825 while (scan != NULL) { 826 #ifdef DEBUG 827 if (regnarrate) 828 fprintf(stderr, "%s...\n", regprop(scan)); 829 #endif 830 next = regnext(scan); 831 832 switch (OP(scan)) { 833 case BOL: 834 if (reginput != regbol) 835 return(0); 836 break; 837 case EOL: 838 if (*reginput != '\0') 839 return(0); 840 break; 841 case ANY: 842 if (*reginput == '\0') 843 return(0); 844 reginput++; 845 break; 846 case EXACTLY: { 847 int len; 848 char *opnd; 849 850 opnd = OPERAND(scan); 851 /* Inline the first character, for speed. */ 852 if (*opnd != *reginput) 853 return(0); 854 len = (int) strlen(opnd); 855 if (len > 1 && strncmp(opnd, reginput, len) != 0) 856 return(0); 857 reginput += len; 858 } 859 break; 860 case ANYOF: 861 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 862 return(0); 863 reginput++; 864 break; 865 case ANYBUT: 866 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 867 return(0); 868 reginput++; 869 break; 870 case NOTHING: 871 break; 872 case BACK: 873 break; 874 case OPEN+1: 875 case OPEN+2: 876 case OPEN+3: 877 case OPEN+4: 878 case OPEN+5: 879 case OPEN+6: 880 case OPEN+7: 881 case OPEN+8: 882 case OPEN+9: { 883 int no; 884 char *save; 885 886 no = OP(scan) - OPEN; 887 save = reginput; 888 889 if (regmatch(next)) { 890 /* 891 * Don't set startp if some later 892 * invocation of the same parentheses 893 * already has. 894 */ 895 if (regstartp[no] == NULL) 896 regstartp[no] = save; 897 return(1); 898 } else 899 return(0); 900 } 901 /* NOTREACHED */ 902 break; 903 case CLOSE+1: 904 case CLOSE+2: 905 case CLOSE+3: 906 case CLOSE+4: 907 case CLOSE+5: 908 case CLOSE+6: 909 case CLOSE+7: 910 case CLOSE+8: 911 case CLOSE+9: { 912 int no; 913 char *save; 914 915 no = OP(scan) - CLOSE; 916 save = reginput; 917 918 if (regmatch(next)) { 919 /* 920 * Don't set endp if some later 921 * invocation of the same parentheses 922 * already has. 923 */ 924 if (regendp[no] == NULL) 925 regendp[no] = save; 926 return(1); 927 } else 928 return(0); 929 } 930 /* NOTREACHED */ 931 break; 932 case BRANCH: { 933 char *save; 934 935 if (OP(next) != BRANCH) /* No choice. */ 936 next = OPERAND(scan); /* Avoid recursion. */ 937 else { 938 do { 939 save = reginput; 940 if (regmatch(OPERAND(scan))) 941 return(1); 942 reginput = save; 943 scan = regnext(scan); 944 } while (scan != NULL && OP(scan) == BRANCH); 945 return(0); 946 /* NOTREACHED */ 947 } 948 } 949 /* NOTREACHED */ 950 break; 951 case STAR: 952 case PLUS: { 953 char nextch; 954 int no; 955 char *save; 956 int min; 957 958 /* 959 * Lookahead to avoid useless match attempts 960 * when we know what character comes next. 961 */ 962 nextch = '\0'; 963 if (OP(next) == EXACTLY) 964 nextch = *OPERAND(next); 965 min = (OP(scan) == STAR) ? 0 : 1; 966 save = reginput; 967 no = regrepeat(OPERAND(scan)); 968 while (no >= min) { 969 /* If it could work, try it. */ 970 if (nextch == '\0' || *reginput == nextch) 971 if (regmatch(next)) 972 return(1); 973 /* Couldn't or didn't -- back up. */ 974 no--; 975 reginput = save + no; 976 } 977 return(0); 978 } 979 /* NOTREACHED */ 980 break; 981 case END: 982 return(1); /* Success! */ 983 /* NOTREACHED */ 984 break; 985 default: 986 regerror("memory corruption"); 987 return(0); 988 /* NOTREACHED */ 989 break; 990 } 991 992 scan = next; 993 } 994 995 /* 996 * We get here only if there's trouble -- normally "case END" is 997 * the terminating point. 998 */ 999 regerror("corrupted pointers"); 1000 return(0); 1001 } 1002 1003 /* 1004 - regrepeat - repeatedly match something simple, report how many 1005 */ 1006 static int 1007 regrepeat(char *p) 1008 { 1009 int count = 0; 1010 char *scan; 1011 char *opnd; 1012 1013 scan = reginput; 1014 opnd = OPERAND(p); 1015 switch (OP(p)) { 1016 case ANY: 1017 count = (int) strlen(scan); 1018 scan += count; 1019 break; 1020 case EXACTLY: 1021 while (*opnd == *scan) { 1022 count++; 1023 scan++; 1024 } 1025 break; 1026 case ANYOF: 1027 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1028 count++; 1029 scan++; 1030 } 1031 break; 1032 case ANYBUT: 1033 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1034 count++; 1035 scan++; 1036 } 1037 break; 1038 default: /* Oh dear. Called inappropriately. */ 1039 regerror("internal foulup"); 1040 count = 0; /* Best compromise. */ 1041 break; 1042 } 1043 reginput = scan; 1044 1045 return(count); 1046 } 1047 1048 /* 1049 - regnext - dig the "next" pointer out of a node 1050 */ 1051 static char * 1052 regnext(char *p) 1053 { 1054 int offset; 1055 1056 if (p == ®dummy) 1057 return(NULL); 1058 1059 offset = NEXT(p); 1060 if (offset == 0) 1061 return(NULL); 1062 1063 if (OP(p) == BACK) 1064 return(p-offset); 1065 else 1066 return(p+offset); 1067 } 1068 1069 #ifdef DEBUG 1070 1071 STATIC char *regprop(); 1072 1073 /* 1074 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1075 */ 1076 void 1077 regdump(regexp *r) 1078 { 1079 char *s; 1080 char op = EXACTLY; /* Arbitrary non-END op. */ 1081 char *next; 1082 1083 1084 s = r->program + 1; 1085 while (op != END) { /* While that wasn't END last time... */ 1086 op = OP(s); 1087 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1088 next = regnext(s); 1089 if (next == NULL) /* Next ptr. */ 1090 printf("(0)"); 1091 else 1092 printf("(%d)", (s-r->program)+(next-s)); 1093 s += 3; 1094 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1095 /* Literal string, where present. */ 1096 while (*s != '\0') { 1097 putchar(*s); 1098 s++; 1099 } 1100 s++; 1101 } 1102 putchar('\n'); 1103 } 1104 1105 /* Header fields of interest. */ 1106 if (r->regstart != '\0') 1107 printf("start `%c' ", r->regstart); 1108 if (r->reganch) 1109 printf("anchored "); 1110 if (r->regmust != NULL) 1111 printf("must have \"%s\"", r->regmust); 1112 printf("\n"); 1113 } 1114 1115 /* 1116 - regprop - printable representation of opcode 1117 */ 1118 static char * 1119 regprop(char *op) 1120 { 1121 char *p; 1122 static char buf[50]; 1123 1124 (void) strcpy(buf, ":"); 1125 1126 switch (OP(op)) { 1127 case BOL: 1128 p = "BOL"; 1129 break; 1130 case EOL: 1131 p = "EOL"; 1132 break; 1133 case ANY: 1134 p = "ANY"; 1135 break; 1136 case ANYOF: 1137 p = "ANYOF"; 1138 break; 1139 case ANYBUT: 1140 p = "ANYBUT"; 1141 break; 1142 case BRANCH: 1143 p = "BRANCH"; 1144 break; 1145 case EXACTLY: 1146 p = "EXACTLY"; 1147 break; 1148 case NOTHING: 1149 p = "NOTHING"; 1150 break; 1151 case BACK: 1152 p = "BACK"; 1153 break; 1154 case END: 1155 p = "END"; 1156 break; 1157 case OPEN+1: 1158 case OPEN+2: 1159 case OPEN+3: 1160 case OPEN+4: 1161 case OPEN+5: 1162 case OPEN+6: 1163 case OPEN+7: 1164 case OPEN+8: 1165 case OPEN+9: 1166 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 1167 p = NULL; 1168 break; 1169 case CLOSE+1: 1170 case CLOSE+2: 1171 case CLOSE+3: 1172 case CLOSE+4: 1173 case CLOSE+5: 1174 case CLOSE+6: 1175 case CLOSE+7: 1176 case CLOSE+8: 1177 case CLOSE+9: 1178 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1179 p = NULL; 1180 break; 1181 case STAR: 1182 p = "STAR"; 1183 break; 1184 case PLUS: 1185 p = "PLUS"; 1186 break; 1187 default: 1188 regerror("corrupted opcode"); 1189 break; 1190 } 1191 if (p != NULL) 1192 (void) strcat(buf, p); 1193 return(buf); 1194 } 1195 #endif 1196 1197 /* 1198 * The following is provided for those people who do not have strcspn() in 1199 * their C libraries. They should get off their butts and do something 1200 * about it; at least one public-domain implementation of those (highly 1201 * useful) string routines has been published on Usenet. 1202 */ 1203 #ifdef STRCSPN 1204 /* 1205 * strcspn - find length of initial segment of s1 consisting entirely 1206 * of characters not from s2 1207 */ 1208 1209 static int 1210 strcspn(char *s1, char *s2) 1211 { 1212 char *scan1; 1213 char *scan2; 1214 int count; 1215 1216 count = 0; 1217 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1218 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1219 if (*scan1 == *scan2++) 1220 return(count); 1221 count++; 1222 } 1223 return(count); 1224 } 1225 #endif 1226