1 /* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 * 21 * Beware that some of this code is subtly aware of the way operator 22 * precedence is structured in regular expressions. Serious changes in 23 * regular-expression syntax might require a total rethink. 24 * 25 * *** NOTE: this code has been altered slightly for use in Tcl. *** 26 * Slightly modified by David MacKenzie to undo most of the changes for TCL. 27 * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman 28 */ 29 30 #include "less.h" 31 #if HAVE_STDIO_H 32 #include <stdio.h> 33 #endif 34 #if HAVE_STDLIB_H 35 #include <stdlib.h> 36 #endif 37 #if HAVE_STRING_H 38 #include <string.h> 39 #endif 40 #include "regexp.h" 41 42 /* 43 * The "internal use only" fields in regexp.h are present to pass info from 44 * compile to execute that permits the execute phase to run lots faster on 45 * simple cases. They are: 46 * 47 * regstart char that must begin a match; '\0' if none obvious 48 * reganch is the match anchored (at beginning-of-line only)? 49 * regmust string (pointer into program) that match must include, or NULL 50 * regmlen length of regmust string 51 * 52 * Regstart and reganch permit very fast decisions on suitable starting points 53 * for a match, cutting down the work a lot. Regmust permits fast rejection 54 * of lines that cannot possibly match. The regmust tests are costly enough 55 * that regcomp() supplies a regmust only if the r.e. contains something 56 * potentially expensive (at present, the only such thing detected is * or + 57 * at the start of the r.e., which can involve a lot of backup). Regmlen is 58 * supplied because the test in regexec() needs it and regcomp() is 59 * computing it anyway. 60 */ 61 62 /* 63 * Structure for regexp "program". This is essentially a linear encoding 64 * of a nondeterministic finite-state machine (aka syntax charts or 65 * "railroad normal form" in parsing technology). Each node is an opcode 66 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 67 * all nodes except BRANCH implement concatenation; a "next" pointer with 68 * a BRANCH on both ends of it is connecting two alternatives. (Here we 69 * have one of the subtle syntax dependencies: an individual BRANCH (as 70 * opposed to a collection of them) is never concatenated with anything 71 * because of operator precedence.) The operand of some types of node is 72 * a literal string; for others, it is a node leading into a sub-FSM. In 73 * particular, the operand of a BRANCH node is the first node of the branch. 74 * (NB this is *not* a tree structure: the tail of the branch connects 75 * to the thing following the set of BRANCHes.) The opcodes are: 76 */ 77 78 /* definition number opnd? meaning */ 79 #undef EOL 80 #define END 0 /* no End of program. */ 81 #define BOL 1 /* no Match "" at beginning of line. */ 82 #define EOL 2 /* no Match "" at end of line. */ 83 #define ANY 3 /* no Match any one character. */ 84 #define ANYOF 4 /* str Match any character in this string. */ 85 #define ANYBUT 5 /* str Match any character not in this string. */ 86 #define BRANCH 6 /* node Match this alternative, or the next... */ 87 #define BACK 7 /* no Match "", "next" ptr points backward. */ 88 #define EXACTLY 8 /* str Match this string. */ 89 #define NOTHING 9 /* no Match empty string. */ 90 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 91 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 92 #define OPEN 20 /* no Mark this point in input as start of #n. */ 93 /* OPEN+1 is number 1, etc. */ 94 #define CLOSE 30 /* no Analogous to OPEN. */ 95 96 /* 97 * Opcode notes: 98 * 99 * BRANCH The set of branches constituting a single choice are hooked 100 * together with their "next" pointers, since precedence prevents 101 * anything being concatenated to any individual branch. The 102 * "next" pointer of the last BRANCH in a choice points to the 103 * thing following the whole choice. This is also where the 104 * final "next" pointer of each individual branch points; each 105 * branch starts with the operand node of a BRANCH node. 106 * 107 * BACK Normal "next" pointers all implicitly point forward; BACK 108 * exists to make loop structures possible. 109 * 110 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 111 * BRANCH structures using BACK. Simple cases (one character 112 * per match) are implemented with STAR and PLUS for speed 113 * and to minimize recursive plunges. 114 * 115 * OPEN,CLOSE ...are numbered at compile time. 116 */ 117 118 /* 119 * A node is one char of opcode followed by two chars of "next" pointer. 120 * "Next" pointers are stored as two 8-bit pieces, high order first. The 121 * value is a positive offset from the opcode of the node containing it. 122 * An operand, if any, simply follows the node. (Note that much of the 123 * code generation knows about this implicit relationship.) 124 * 125 * Using two bytes for the "next" pointer is vast overkill for most things, 126 * but allows patterns to get big without disasters. 127 */ 128 #define OP(p) (*(p)) 129 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 130 #define OPERAND(p) ((p) + 3) 131 132 /* 133 * See regmagic.h for one further detail of program structure. 134 */ 135 136 137 /* 138 * Utility definitions. 139 */ 140 #ifndef CHARBITS 141 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 142 #else 143 #define UCHARAT(p) ((int)*(p)&CHARBITS) 144 #endif 145 146 #define FAIL(m) { regerror(m); return(NULL); } 147 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 148 #define META "^$.[()|?+*\\" 149 150 /* 151 * Flags to be passed up and down. 152 */ 153 #define HASWIDTH 01 /* Known never to match null string. */ 154 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 155 #define SPSTART 04 /* Starts with * or +. */ 156 #define WORST 0 /* Worst case. */ 157 158 /* 159 * Global work variables for regcomp(). 160 */ 161 static char *regparse; /* Input-scan pointer. */ 162 static int regnpar; /* () count. */ 163 static char regdummy; 164 static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 165 static long regsize; /* Code size. */ 166 167 /* 168 * The first byte of the regexp internal "program" is actually this magic 169 * number; the start node begins in the second byte. 170 */ 171 #define MAGIC 0234 172 173 174 /* 175 * Forward declarations for regcomp()'s friends. 176 */ 177 #ifndef STATIC 178 #define STATIC static 179 #endif 180 STATIC char *reg(); 181 STATIC char *regbranch(); 182 STATIC char *regpiece(); 183 STATIC char *regatom(); 184 STATIC char *regnode(); 185 STATIC char *regnext(); 186 STATIC void regc(); 187 STATIC void reginsert(); 188 STATIC void regtail(); 189 STATIC void regoptail(); 190 #ifdef STRCSPN 191 STATIC int strcspn(); 192 #endif 193 194 /* 195 - regcomp - compile a regular expression into internal code 196 * 197 * We can't allocate space until we know how big the compiled form will be, 198 * but we can't compile it (and thus know how big it is) until we've got a 199 * place to put the code. So we cheat: we compile it twice, once with code 200 * generation turned off and size counting turned on, and once "for real". 201 * This also means that we don't allocate space until we are sure that the 202 * thing really will compile successfully, and we never have to move the 203 * code and thus invalidate pointers into it. (Note that it has to be in 204 * one piece because free() must be able to free it all.) 205 * 206 * Beware that the optimization-preparation code in here knows about some 207 * of the structure of the compiled regexp. 208 */ 209 regexp * 210 regcomp(exp) 211 char *exp; 212 { 213 register regexp *r; 214 register char *scan; 215 register char *longest; 216 register int len; 217 int flags; 218 219 if (exp == NULL) 220 FAIL("NULL argument"); 221 222 /* First pass: determine size, legality. */ 223 regparse = exp; 224 regnpar = 1; 225 regsize = 0L; 226 regcode = ®dummy; 227 regc(MAGIC); 228 if (reg(0, &flags) == NULL) 229 return(NULL); 230 231 /* Small enough for pointer-storage convention? */ 232 if (regsize >= 32767L) /* Probably could be 65535L. */ 233 FAIL("regexp too big"); 234 235 /* Allocate space. */ 236 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 237 if (r == NULL) 238 FAIL("out of space"); 239 240 /* Second pass: emit code. */ 241 regparse = exp; 242 regnpar = 1; 243 regcode = r->program; 244 regc(MAGIC); 245 if (reg(0, &flags) == NULL) 246 { 247 free(r); 248 return(NULL); 249 } 250 251 /* Dig out information for optimizations. */ 252 r->regstart = '\0'; /* Worst-case defaults. */ 253 r->reganch = 0; 254 r->regmust = NULL; 255 r->regmlen = 0; 256 scan = r->program+1; /* First BRANCH. */ 257 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 258 scan = OPERAND(scan); 259 260 /* Starting-point info. */ 261 if (OP(scan) == EXACTLY) 262 r->regstart = *OPERAND(scan); 263 else if (OP(scan) == BOL) 264 r->reganch++; 265 266 /* 267 * If there's something expensive in the r.e., find the 268 * longest literal string that must appear and make it the 269 * regmust. Resolve ties in favor of later strings, since 270 * the regstart check works with the beginning of the r.e. 271 * and avoiding duplication strengthens checking. Not a 272 * strong reason, but sufficient in the absence of others. 273 */ 274 if (flags&SPSTART) { 275 longest = NULL; 276 len = 0; 277 for (; scan != NULL; scan = regnext(scan)) 278 if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) { 279 longest = OPERAND(scan); 280 len = (int) strlen(OPERAND(scan)); 281 } 282 r->regmust = longest; 283 r->regmlen = len; 284 } 285 } 286 287 return(r); 288 } 289 290 /* 291 - reg - regular expression, i.e. main body or parenthesized thing 292 * 293 * Caller must absorb opening parenthesis. 294 * 295 * Combining parenthesis handling with the base level of regular expression 296 * is a trifle forced, but the need to tie the tails of the branches to what 297 * follows makes it hard to avoid. 298 */ 299 static char * 300 reg(paren, flagp) 301 int paren; /* Parenthesized? */ 302 int *flagp; 303 { 304 register char *ret; 305 register char *br; 306 register char *ender; 307 register int parno = 0; 308 int flags; 309 310 *flagp = HASWIDTH; /* Tentatively. */ 311 312 /* Make an OPEN node, if parenthesized. */ 313 if (paren) { 314 if (regnpar >= NSUBEXP) 315 FAIL("too many ()"); 316 parno = regnpar; 317 regnpar++; 318 ret = regnode(OPEN+parno); 319 } else 320 ret = NULL; 321 322 /* Pick up the branches, linking them together. */ 323 br = regbranch(&flags); 324 if (br == NULL) 325 return(NULL); 326 if (ret != NULL) 327 regtail(ret, br); /* OPEN -> first. */ 328 else 329 ret = br; 330 if (!(flags&HASWIDTH)) 331 *flagp &= ~HASWIDTH; 332 *flagp |= flags&SPSTART; 333 while (*regparse == '|') { 334 regparse++; 335 br = regbranch(&flags); 336 if (br == NULL) 337 return(NULL); 338 regtail(ret, br); /* BRANCH -> BRANCH. */ 339 if (!(flags&HASWIDTH)) 340 *flagp &= ~HASWIDTH; 341 *flagp |= flags&SPSTART; 342 } 343 344 /* Make a closing node, and hook it on the end. */ 345 ender = regnode((paren) ? CLOSE+parno : END); 346 regtail(ret, ender); 347 348 /* Hook the tails of the branches to the closing node. */ 349 for (br = ret; br != NULL; br = regnext(br)) 350 regoptail(br, ender); 351 352 /* Check for proper termination. */ 353 if (paren && *regparse++ != ')') { 354 FAIL("unmatched ()"); 355 } else if (!paren && *regparse != '\0') { 356 if (*regparse == ')') { 357 FAIL("unmatched ()"); 358 } else 359 FAIL("junk on end"); /* "Can't happen". */ 360 /* NOTREACHED */ 361 } 362 363 return(ret); 364 } 365 366 /* 367 - regbranch - one alternative of an | operator 368 * 369 * Implements the concatenation operator. 370 */ 371 static char * 372 regbranch(flagp) 373 int *flagp; 374 { 375 register char *ret; 376 register char *chain; 377 register char *latest; 378 int flags; 379 380 *flagp = WORST; /* Tentatively. */ 381 382 ret = regnode(BRANCH); 383 chain = NULL; 384 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { 385 latest = regpiece(&flags); 386 if (latest == NULL) 387 return(NULL); 388 *flagp |= flags&HASWIDTH; 389 if (chain == NULL) /* First piece. */ 390 *flagp |= flags&SPSTART; 391 else 392 regtail(chain, latest); 393 chain = latest; 394 } 395 if (chain == NULL) /* Loop ran zero times. */ 396 (void) regnode(NOTHING); 397 398 return(ret); 399 } 400 401 /* 402 - regpiece - something followed by possible [*+?] 403 * 404 * Note that the branching code sequences used for ? and the general cases 405 * of * and + are somewhat optimized: they use the same NOTHING node as 406 * both the endmarker for their branch list and the body of the last branch. 407 * It might seem that this node could be dispensed with entirely, but the 408 * endmarker role is not redundant. 409 */ 410 static char * 411 regpiece(flagp) 412 int *flagp; 413 { 414 register char *ret; 415 register char op; 416 register char *next; 417 int flags; 418 419 ret = regatom(&flags); 420 if (ret == NULL) 421 return(NULL); 422 423 op = *regparse; 424 if (!ISMULT(op)) { 425 *flagp = flags; 426 return(ret); 427 } 428 429 if (!(flags&HASWIDTH) && op != '?') 430 FAIL("*+ operand could be empty"); 431 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 432 433 if (op == '*' && (flags&SIMPLE)) 434 reginsert(STAR, ret); 435 else if (op == '*') { 436 /* Emit x* as (x&|), where & means "self". */ 437 reginsert(BRANCH, ret); /* Either x */ 438 regoptail(ret, regnode(BACK)); /* and loop */ 439 regoptail(ret, ret); /* back */ 440 regtail(ret, regnode(BRANCH)); /* or */ 441 regtail(ret, regnode(NOTHING)); /* null. */ 442 } else if (op == '+' && (flags&SIMPLE)) 443 reginsert(PLUS, ret); 444 else if (op == '+') { 445 /* Emit x+ as x(&|), where & means "self". */ 446 next = regnode(BRANCH); /* Either */ 447 regtail(ret, next); 448 regtail(regnode(BACK), ret); /* loop back */ 449 regtail(next, regnode(BRANCH)); /* or */ 450 regtail(ret, regnode(NOTHING)); /* null. */ 451 } else if (op == '?') { 452 /* Emit x? as (x|) */ 453 reginsert(BRANCH, ret); /* Either x */ 454 regtail(ret, regnode(BRANCH)); /* or */ 455 next = regnode(NOTHING); /* null. */ 456 regtail(ret, next); 457 regoptail(ret, next); 458 } 459 regparse++; 460 if (ISMULT(*regparse)) 461 FAIL("nested *?+"); 462 463 return(ret); 464 } 465 466 /* 467 - regatom - the lowest level 468 * 469 * Optimization: gobbles an entire sequence of ordinary characters so that 470 * it can turn them into a single node, which is smaller to store and 471 * faster to run. Backslashed characters are exceptions, each becoming a 472 * separate node; the code is simpler that way and it's not worth fixing. 473 */ 474 static char * 475 regatom(flagp) 476 int *flagp; 477 { 478 register char *ret; 479 int flags; 480 481 *flagp = WORST; /* Tentatively. */ 482 483 switch (*regparse++) { 484 case '^': 485 ret = regnode(BOL); 486 break; 487 case '$': 488 ret = regnode(EOL); 489 break; 490 case '.': 491 ret = regnode(ANY); 492 *flagp |= HASWIDTH|SIMPLE; 493 break; 494 case '[': { 495 register int clss; 496 register int classend; 497 498 if (*regparse == '^') { /* Complement of range. */ 499 ret = regnode(ANYBUT); 500 regparse++; 501 } else 502 ret = regnode(ANYOF); 503 if (*regparse == ']' || *regparse == '-') 504 regc(*regparse++); 505 while (*regparse != '\0' && *regparse != ']') { 506 if (*regparse == '-') { 507 regparse++; 508 if (*regparse == ']' || *regparse == '\0') 509 regc('-'); 510 else { 511 clss = UCHARAT(regparse-2)+1; 512 classend = UCHARAT(regparse); 513 if (clss > classend+1) 514 FAIL("invalid [] range"); 515 for (; clss <= classend; clss++) 516 regc(clss); 517 regparse++; 518 } 519 } else 520 regc(*regparse++); 521 } 522 regc('\0'); 523 if (*regparse != ']') 524 FAIL("unmatched []"); 525 regparse++; 526 *flagp |= HASWIDTH|SIMPLE; 527 } 528 break; 529 case '(': 530 ret = reg(1, &flags); 531 if (ret == NULL) 532 return(NULL); 533 *flagp |= flags&(HASWIDTH|SPSTART); 534 break; 535 case '\0': 536 case '|': 537 case ')': 538 FAIL("internal urp"); /* Supposed to be caught earlier. */ 539 /* NOTREACHED */ 540 break; 541 case '?': 542 case '+': 543 case '*': 544 FAIL("?+* follows nothing"); 545 /* NOTREACHED */ 546 break; 547 case '\\': 548 if (*regparse == '\0') 549 FAIL("trailing \\"); 550 ret = regnode(EXACTLY); 551 regc(*regparse++); 552 regc('\0'); 553 *flagp |= HASWIDTH|SIMPLE; 554 break; 555 default: { 556 register int len; 557 register char ender; 558 559 regparse--; 560 len = (int) strcspn(regparse, META); 561 if (len <= 0) 562 FAIL("internal disaster"); 563 ender = *(regparse+len); 564 if (len > 1 && ISMULT(ender)) 565 len--; /* Back off clear of ?+* operand. */ 566 *flagp |= HASWIDTH; 567 if (len == 1) 568 *flagp |= SIMPLE; 569 ret = regnode(EXACTLY); 570 while (len > 0) { 571 regc(*regparse++); 572 len--; 573 } 574 regc('\0'); 575 } 576 break; 577 } 578 579 return(ret); 580 } 581 582 /* 583 - regnode - emit a node 584 */ 585 static char * /* Location. */ 586 regnode(op) 587 char op; 588 { 589 register char *ret; 590 register char *ptr; 591 592 ret = regcode; 593 if (ret == ®dummy) { 594 regsize += 3; 595 return(ret); 596 } 597 598 ptr = ret; 599 *ptr++ = op; 600 *ptr++ = '\0'; /* Null "next" pointer. */ 601 *ptr++ = '\0'; 602 regcode = ptr; 603 604 return(ret); 605 } 606 607 /* 608 - regc - emit (if appropriate) a byte of code 609 */ 610 static void 611 regc(b) 612 char b; 613 { 614 if (regcode != ®dummy) 615 *regcode++ = b; 616 else 617 regsize++; 618 } 619 620 /* 621 - reginsert - insert an operator in front of already-emitted operand 622 * 623 * Means relocating the operand. 624 */ 625 static void 626 reginsert(op, opnd) 627 char op; 628 char *opnd; 629 { 630 register char *src; 631 register char *dst; 632 register char *place; 633 634 if (regcode == ®dummy) { 635 regsize += 3; 636 return; 637 } 638 639 src = regcode; 640 regcode += 3; 641 dst = regcode; 642 while (src > opnd) 643 *--dst = *--src; 644 645 place = opnd; /* Op node, where operand used to be. */ 646 *place++ = op; 647 *place++ = '\0'; 648 *place++ = '\0'; 649 } 650 651 /* 652 - regtail - set the next-pointer at the end of a node chain 653 */ 654 static void 655 regtail(p, val) 656 char *p; 657 char *val; 658 { 659 register char *scan; 660 register char *temp; 661 register int offset; 662 663 if (p == ®dummy) 664 return; 665 666 /* Find last node. */ 667 scan = p; 668 for (;;) { 669 temp = regnext(scan); 670 if (temp == NULL) 671 break; 672 scan = temp; 673 } 674 675 if (OP(scan) == BACK) 676 offset = (int) (scan - val); 677 else 678 offset = (int) (val - scan); 679 *(scan+1) = (offset>>8)&0377; 680 *(scan+2) = offset&0377; 681 } 682 683 /* 684 - regoptail - regtail on operand of first argument; nop if operandless 685 */ 686 static void 687 regoptail(p, val) 688 char *p; 689 char *val; 690 { 691 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 692 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 693 return; 694 regtail(OPERAND(p), val); 695 } 696 697 /* 698 * regexec and friends 699 */ 700 701 /* 702 * Global work variables for regexec(). 703 */ 704 static char *reginput; /* String-input pointer. */ 705 static char *regbol; /* Beginning of input, for ^ check. */ 706 static char **regstartp; /* Pointer to startp array. */ 707 static char **regendp; /* Ditto for endp. */ 708 709 /* 710 * Forwards. 711 */ 712 STATIC int regtry(); 713 STATIC int regmatch(); 714 STATIC int regrepeat(); 715 716 #ifdef DEBUG 717 int regnarrate = 0; 718 void regdump(); 719 STATIC char *regprop(); 720 #endif 721 722 /* 723 - regexec - match a regexp against a string 724 */ 725 int 726 regexec2(prog, string, notbol) 727 register regexp *prog; 728 register char *string; 729 int notbol; 730 { 731 register char *s; 732 733 /* Be paranoid... */ 734 if (prog == NULL || string == NULL) { 735 regerror("NULL parameter"); 736 return(0); 737 } 738 739 /* Check validity of program. */ 740 if (UCHARAT(prog->program) != MAGIC) { 741 regerror("corrupted program"); 742 return(0); 743 } 744 745 /* If there is a "must appear" string, look for it. */ 746 if (prog->regmust != NULL) { 747 s = string; 748 while ((s = strchr(s, prog->regmust[0])) != NULL) { 749 if (strncmp(s, prog->regmust, prog->regmlen) == 0) 750 break; /* Found it. */ 751 s++; 752 } 753 if (s == NULL) /* Not present. */ 754 return(0); 755 } 756 757 /* Mark beginning of line for ^ . */ 758 if (notbol) 759 regbol = NULL; 760 else 761 regbol = string; 762 763 /* Simplest case: anchored match need be tried only once. */ 764 if (prog->reganch) 765 return(regtry(prog, string)); 766 767 /* Messy cases: unanchored match. */ 768 s = string; 769 if (prog->regstart != '\0') 770 /* We know what char it must start with. */ 771 while ((s = strchr(s, prog->regstart)) != NULL) { 772 if (regtry(prog, s)) 773 return(1); 774 s++; 775 } 776 else 777 /* We don't -- general case. */ 778 do { 779 if (regtry(prog, s)) 780 return(1); 781 } while (*s++ != '\0'); 782 783 /* Failure. */ 784 return(0); 785 } 786 787 int 788 regexec(prog, string) 789 register regexp *prog; 790 register char *string; 791 { 792 return regexec2(prog, string, 0); 793 } 794 795 /* 796 - regtry - try match at specific point 797 */ 798 static int /* 0 failure, 1 success */ 799 regtry(prog, string) 800 regexp *prog; 801 char *string; 802 { 803 register int i; 804 register char **sp; 805 register char **ep; 806 807 reginput = string; 808 regstartp = prog->startp; 809 regendp = prog->endp; 810 811 sp = prog->startp; 812 ep = prog->endp; 813 for (i = NSUBEXP; i > 0; i--) { 814 *sp++ = NULL; 815 *ep++ = NULL; 816 } 817 if (regmatch(prog->program + 1)) { 818 prog->startp[0] = string; 819 prog->endp[0] = reginput; 820 return(1); 821 } else 822 return(0); 823 } 824 825 /* 826 - regmatch - main matching routine 827 * 828 * Conceptually the strategy is simple: check to see whether the current 829 * node matches, call self recursively to see whether the rest matches, 830 * and then act accordingly. In practice we make some effort to avoid 831 * recursion, in particular by going through "ordinary" nodes (that don't 832 * need to know whether the rest of the match failed) by a loop instead of 833 * by recursion. 834 */ 835 static int /* 0 failure, 1 success */ 836 regmatch(prog) 837 char *prog; 838 { 839 register char *scan; /* Current node. */ 840 char *next; /* Next node. */ 841 842 scan = prog; 843 #ifdef DEBUG 844 if (scan != NULL && regnarrate) 845 fprintf(stderr, "%s(\n", regprop(scan)); 846 #endif 847 while (scan != NULL) { 848 #ifdef DEBUG 849 if (regnarrate) 850 fprintf(stderr, "%s...\n", regprop(scan)); 851 #endif 852 next = regnext(scan); 853 854 switch (OP(scan)) { 855 case BOL: 856 if (reginput != regbol) 857 return(0); 858 break; 859 case EOL: 860 if (*reginput != '\0') 861 return(0); 862 break; 863 case ANY: 864 if (*reginput == '\0') 865 return(0); 866 reginput++; 867 break; 868 case EXACTLY: { 869 register int len; 870 register char *opnd; 871 872 opnd = OPERAND(scan); 873 /* Inline the first character, for speed. */ 874 if (*opnd != *reginput) 875 return(0); 876 len = (int) strlen(opnd); 877 if (len > 1 && strncmp(opnd, reginput, len) != 0) 878 return(0); 879 reginput += len; 880 } 881 break; 882 case ANYOF: 883 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 884 return(0); 885 reginput++; 886 break; 887 case ANYBUT: 888 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 889 return(0); 890 reginput++; 891 break; 892 case NOTHING: 893 break; 894 case BACK: 895 break; 896 case OPEN+1: 897 case OPEN+2: 898 case OPEN+3: 899 case OPEN+4: 900 case OPEN+5: 901 case OPEN+6: 902 case OPEN+7: 903 case OPEN+8: 904 case OPEN+9: { 905 register int no; 906 register char *save; 907 908 no = OP(scan) - OPEN; 909 save = reginput; 910 911 if (regmatch(next)) { 912 /* 913 * Don't set startp if some later 914 * invocation of the same parentheses 915 * already has. 916 */ 917 if (regstartp[no] == NULL) 918 regstartp[no] = save; 919 return(1); 920 } else 921 return(0); 922 } 923 /* NOTREACHED */ 924 break; 925 case CLOSE+1: 926 case CLOSE+2: 927 case CLOSE+3: 928 case CLOSE+4: 929 case CLOSE+5: 930 case CLOSE+6: 931 case CLOSE+7: 932 case CLOSE+8: 933 case CLOSE+9: { 934 register int no; 935 register char *save; 936 937 no = OP(scan) - CLOSE; 938 save = reginput; 939 940 if (regmatch(next)) { 941 /* 942 * Don't set endp if some later 943 * invocation of the same parentheses 944 * already has. 945 */ 946 if (regendp[no] == NULL) 947 regendp[no] = save; 948 return(1); 949 } else 950 return(0); 951 } 952 /* NOTREACHED */ 953 break; 954 case BRANCH: { 955 register char *save; 956 957 if (OP(next) != BRANCH) /* No choice. */ 958 next = OPERAND(scan); /* Avoid recursion. */ 959 else { 960 do { 961 save = reginput; 962 if (regmatch(OPERAND(scan))) 963 return(1); 964 reginput = save; 965 scan = regnext(scan); 966 } while (scan != NULL && OP(scan) == BRANCH); 967 return(0); 968 /* NOTREACHED */ 969 } 970 } 971 /* NOTREACHED */ 972 break; 973 case STAR: 974 case PLUS: { 975 register char nextch; 976 register int no; 977 register char *save; 978 register int min; 979 980 /* 981 * Lookahead to avoid useless match attempts 982 * when we know what character comes next. 983 */ 984 nextch = '\0'; 985 if (OP(next) == EXACTLY) 986 nextch = *OPERAND(next); 987 min = (OP(scan) == STAR) ? 0 : 1; 988 save = reginput; 989 no = regrepeat(OPERAND(scan)); 990 while (no >= min) { 991 /* If it could work, try it. */ 992 if (nextch == '\0' || *reginput == nextch) 993 if (regmatch(next)) 994 return(1); 995 /* Couldn't or didn't -- back up. */ 996 no--; 997 reginput = save + no; 998 } 999 return(0); 1000 } 1001 /* NOTREACHED */ 1002 break; 1003 case END: 1004 return(1); /* Success! */ 1005 /* NOTREACHED */ 1006 break; 1007 default: 1008 regerror("memory corruption"); 1009 return(0); 1010 /* NOTREACHED */ 1011 break; 1012 } 1013 1014 scan = next; 1015 } 1016 1017 /* 1018 * We get here only if there's trouble -- normally "case END" is 1019 * the terminating point. 1020 */ 1021 regerror("corrupted pointers"); 1022 return(0); 1023 } 1024 1025 /* 1026 - regrepeat - repeatedly match something simple, report how many 1027 */ 1028 static int 1029 regrepeat(p) 1030 char *p; 1031 { 1032 register int count = 0; 1033 register char *scan; 1034 register char *opnd; 1035 1036 scan = reginput; 1037 opnd = OPERAND(p); 1038 switch (OP(p)) { 1039 case ANY: 1040 count = (int) strlen(scan); 1041 scan += count; 1042 break; 1043 case EXACTLY: 1044 while (*opnd == *scan) { 1045 count++; 1046 scan++; 1047 } 1048 break; 1049 case ANYOF: 1050 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1051 count++; 1052 scan++; 1053 } 1054 break; 1055 case ANYBUT: 1056 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1057 count++; 1058 scan++; 1059 } 1060 break; 1061 default: /* Oh dear. Called inappropriately. */ 1062 regerror("internal foulup"); 1063 count = 0; /* Best compromise. */ 1064 break; 1065 } 1066 reginput = scan; 1067 1068 return(count); 1069 } 1070 1071 /* 1072 - regnext - dig the "next" pointer out of a node 1073 */ 1074 static char * 1075 regnext(p) 1076 register char *p; 1077 { 1078 register int offset; 1079 1080 if (p == ®dummy) 1081 return(NULL); 1082 1083 offset = NEXT(p); 1084 if (offset == 0) 1085 return(NULL); 1086 1087 if (OP(p) == BACK) 1088 return(p-offset); 1089 else 1090 return(p+offset); 1091 } 1092 1093 #ifdef DEBUG 1094 1095 STATIC char *regprop(); 1096 1097 /* 1098 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1099 */ 1100 void 1101 regdump(r) 1102 regexp *r; 1103 { 1104 register char *s; 1105 register char op = EXACTLY; /* Arbitrary non-END op. */ 1106 register char *next; 1107 1108 1109 s = r->program + 1; 1110 while (op != END) { /* While that wasn't END last time... */ 1111 op = OP(s); 1112 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1113 next = regnext(s); 1114 if (next == NULL) /* Next ptr. */ 1115 printf("(0)"); 1116 else 1117 printf("(%d)", (s-r->program)+(next-s)); 1118 s += 3; 1119 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1120 /* Literal string, where present. */ 1121 while (*s != '\0') { 1122 putchar(*s); 1123 s++; 1124 } 1125 s++; 1126 } 1127 putchar('\n'); 1128 } 1129 1130 /* Header fields of interest. */ 1131 if (r->regstart != '\0') 1132 printf("start `%c' ", r->regstart); 1133 if (r->reganch) 1134 printf("anchored "); 1135 if (r->regmust != NULL) 1136 printf("must have \"%s\"", r->regmust); 1137 printf("\n"); 1138 } 1139 1140 /* 1141 - regprop - printable representation of opcode 1142 */ 1143 static char * 1144 regprop(op) 1145 char *op; 1146 { 1147 register char *p; 1148 static char buf[50]; 1149 1150 (void) strcpy(buf, ":"); 1151 1152 switch (OP(op)) { 1153 case BOL: 1154 p = "BOL"; 1155 break; 1156 case EOL: 1157 p = "EOL"; 1158 break; 1159 case ANY: 1160 p = "ANY"; 1161 break; 1162 case ANYOF: 1163 p = "ANYOF"; 1164 break; 1165 case ANYBUT: 1166 p = "ANYBUT"; 1167 break; 1168 case BRANCH: 1169 p = "BRANCH"; 1170 break; 1171 case EXACTLY: 1172 p = "EXACTLY"; 1173 break; 1174 case NOTHING: 1175 p = "NOTHING"; 1176 break; 1177 case BACK: 1178 p = "BACK"; 1179 break; 1180 case END: 1181 p = "END"; 1182 break; 1183 case OPEN+1: 1184 case OPEN+2: 1185 case OPEN+3: 1186 case OPEN+4: 1187 case OPEN+5: 1188 case OPEN+6: 1189 case OPEN+7: 1190 case OPEN+8: 1191 case OPEN+9: 1192 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 1193 p = NULL; 1194 break; 1195 case CLOSE+1: 1196 case CLOSE+2: 1197 case CLOSE+3: 1198 case CLOSE+4: 1199 case CLOSE+5: 1200 case CLOSE+6: 1201 case CLOSE+7: 1202 case CLOSE+8: 1203 case CLOSE+9: 1204 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1205 p = NULL; 1206 break; 1207 case STAR: 1208 p = "STAR"; 1209 break; 1210 case PLUS: 1211 p = "PLUS"; 1212 break; 1213 default: 1214 regerror("corrupted opcode"); 1215 break; 1216 } 1217 if (p != NULL) 1218 (void) strcat(buf, p); 1219 return(buf); 1220 } 1221 #endif 1222 1223 /* 1224 * The following is provided for those people who do not have strcspn() in 1225 * their C libraries. They should get off their butts and do something 1226 * about it; at least one public-domain implementation of those (highly 1227 * useful) string routines has been published on Usenet. 1228 */ 1229 #ifdef STRCSPN 1230 /* 1231 * strcspn - find length of initial segment of s1 consisting entirely 1232 * of characters not from s2 1233 */ 1234 1235 static int 1236 strcspn(s1, s2) 1237 char *s1; 1238 char *s2; 1239 { 1240 register char *scan1; 1241 register char *scan2; 1242 register int count; 1243 1244 count = 0; 1245 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1246 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1247 if (*scan1 == *scan2++) 1248 return(count); 1249 count++; 1250 } 1251 return(count); 1252 } 1253 #endif 1254