1 /* 2 * regcomp and regexec -- regsub and regerror are elsewhere 3 * 4 * Copyright (c) 1986 by University of Toronto. 5 * Written by Henry Spencer. Not derived from licensed software. 6 * 7 * Permission is granted to anyone to use this software for any 8 * purpose on any computer system, and to redistribute it freely, 9 * subject to the following restrictions: 10 * 11 * 1. The author is not responsible for the consequences of use of 12 * this software, no matter how awful, even if they arise 13 * from defects in it. 14 * 15 * 2. The origin of this software must not be misrepresented, either 16 * by explicit claim or by omission. 17 * 18 * 3. Altered versions must be plainly marked as such, and must not 19 * be misrepresented as being the original software. 20 * 21 * Beware that some of this code is subtly aware of the way operator 22 * precedence is structured in regular expressions. Serious changes in 23 * regular-expression syntax might require a total rethink. 24 * 25 * *** NOTE: this code has been altered slightly for use in Tcl. *** 26 * Slightly modified by David MacKenzie to undo most of the changes for TCL. 27 * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman 28 */ 29 30 #include "less.h" 31 #if HAVE_STDIO_H 32 #include <stdio.h> 33 #endif 34 #if HAVE_STDLIB_H 35 #include <stdlib.h> 36 #endif 37 #if HAVE_STRING_H 38 #include <string.h> 39 #endif 40 #include "regexp.h" 41 42 /* 43 * The "internal use only" fields in regexp.h are present to pass info from 44 * compile to execute that permits the execute phase to run lots faster on 45 * simple cases. They are: 46 * 47 * regstart char that must begin a match; '\0' if none obvious 48 * reganch is the match anchored (at beginning-of-line only)? 49 * regmust string (pointer into program) that match must include, or NULL 50 * regmlen length of regmust string 51 * 52 * Regstart and reganch permit very fast decisions on suitable starting points 53 * for a match, cutting down the work a lot. Regmust permits fast rejection 54 * of lines that cannot possibly match. The regmust tests are costly enough 55 * that regcomp() supplies a regmust only if the r.e. contains something 56 * potentially expensive (at present, the only such thing detected is * or + 57 * at the start of the r.e., which can involve a lot of backup). Regmlen is 58 * supplied because the test in regexec() needs it and regcomp() is 59 * computing it anyway. 60 */ 61 62 /* 63 * Structure for regexp "program". This is essentially a linear encoding 64 * of a nondeterministic finite-state machine (aka syntax charts or 65 * "railroad normal form" in parsing technology). Each node is an opcode 66 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 67 * all nodes except BRANCH implement concatenation; a "next" pointer with 68 * a BRANCH on both ends of it is connecting two alternatives. (Here we 69 * have one of the subtle syntax dependencies: an individual BRANCH (as 70 * opposed to a collection of them) is never concatenated with anything 71 * because of operator precedence.) The operand of some types of node is 72 * a literal string; for others, it is a node leading into a sub-FSM. In 73 * particular, the operand of a BRANCH node is the first node of the branch. 74 * (NB this is *not* a tree structure: the tail of the branch connects 75 * to the thing following the set of BRANCHes.) The opcodes are: 76 */ 77 78 /* definition number opnd? meaning */ 79 #undef EOL 80 #define END 0 /* no End of program. */ 81 #define BOL 1 /* no Match "" at beginning of line. */ 82 #define EOL 2 /* no Match "" at end of line. */ 83 #define ANY 3 /* no Match any one character. */ 84 #define ANYOF 4 /* str Match any character in this string. */ 85 #define ANYBUT 5 /* str Match any character not in this string. */ 86 #define BRANCH 6 /* node Match this alternative, or the next... */ 87 #define BACK 7 /* no Match "", "next" ptr points backward. */ 88 #define EXACTLY 8 /* str Match this string. */ 89 #define NOTHING 9 /* no Match empty string. */ 90 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 91 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 92 #define OPEN 20 /* no Mark this point in input as start of #n. */ 93 /* OPEN+1 is number 1, etc. */ 94 #define CLOSE 30 /* no Analogous to OPEN. */ 95 96 /* 97 * Opcode notes: 98 * 99 * BRANCH The set of branches constituting a single choice are hooked 100 * together with their "next" pointers, since precedence prevents 101 * anything being concatenated to any individual branch. The 102 * "next" pointer of the last BRANCH in a choice points to the 103 * thing following the whole choice. This is also where the 104 * final "next" pointer of each individual branch points; each 105 * branch starts with the operand node of a BRANCH node. 106 * 107 * BACK Normal "next" pointers all implicitly point forward; BACK 108 * exists to make loop structures possible. 109 * 110 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 111 * BRANCH structures using BACK. Simple cases (one character 112 * per match) are implemented with STAR and PLUS for speed 113 * and to minimize recursive plunges. 114 * 115 * OPEN,CLOSE ...are numbered at compile time. 116 */ 117 118 /* 119 * A node is one char of opcode followed by two chars of "next" pointer. 120 * "Next" pointers are stored as two 8-bit pieces, high order first. The 121 * value is a positive offset from the opcode of the node containing it. 122 * An operand, if any, simply follows the node. (Note that much of the 123 * code generation knows about this implicit relationship.) 124 * 125 * Using two bytes for the "next" pointer is vast overkill for most things, 126 * but allows patterns to get big without disasters. 127 */ 128 #define OP(p) (*(p)) 129 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 130 #define OPERAND(p) ((p) + 3) 131 132 /* 133 * See regmagic.h for one further detail of program structure. 134 */ 135 136 137 /* 138 * Utility definitions. 139 */ 140 #ifndef CHARBITS 141 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 142 #else 143 #define UCHARAT(p) ((int)*(p)&CHARBITS) 144 #endif 145 146 #define FAIL(m) { regerror(m); return(NULL); } 147 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 148 #define META "^$.[()|?+*\\" 149 150 /* 151 * Flags to be passed up and down. 152 */ 153 #define HASWIDTH 01 /* Known never to match null string. */ 154 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 155 #define SPSTART 04 /* Starts with * or +. */ 156 #define WORST 0 /* Worst case. */ 157 158 /* 159 * Global work variables for regcomp(). 160 */ 161 static char *regparse; /* Input-scan pointer. */ 162 static int regnpar; /* () count. */ 163 static char regdummy; 164 static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 165 static long regsize; /* Code size. */ 166 167 /* 168 * The first byte of the regexp internal "program" is actually this magic 169 * number; the start node begins in the second byte. 170 */ 171 #define MAGIC 0234 172 173 174 /* 175 * Forward declarations for regcomp()'s friends. 176 */ 177 #ifndef STATIC 178 #define STATIC static 179 #endif 180 STATIC char *reg(); 181 STATIC char *regbranch(); 182 STATIC char *regpiece(); 183 STATIC char *regatom(); 184 STATIC char *regnode(); 185 STATIC char *regnext(); 186 STATIC void regc(); 187 STATIC void reginsert(); 188 STATIC void regtail(); 189 STATIC void regoptail(); 190 #ifdef STRCSPN 191 STATIC int strcspn(); 192 #endif 193 194 /* 195 - regcomp - compile a regular expression into internal code 196 * 197 * We can't allocate space until we know how big the compiled form will be, 198 * but we can't compile it (and thus know how big it is) until we've got a 199 * place to put the code. So we cheat: we compile it twice, once with code 200 * generation turned off and size counting turned on, and once "for real". 201 * This also means that we don't allocate space until we are sure that the 202 * thing really will compile successfully, and we never have to move the 203 * code and thus invalidate pointers into it. (Note that it has to be in 204 * one piece because free() must be able to free it all.) 205 * 206 * Beware that the optimization-preparation code in here knows about some 207 * of the structure of the compiled regexp. 208 */ 209 regexp * 210 regcomp(exp) 211 char *exp; 212 { 213 register regexp *r; 214 register char *scan; 215 register char *longest; 216 register int len; 217 int flags; 218 219 if (exp == NULL) 220 FAIL("NULL argument"); 221 222 /* First pass: determine size, legality. */ 223 regparse = exp; 224 regnpar = 1; 225 regsize = 0L; 226 regcode = ®dummy; 227 regc(MAGIC); 228 if (reg(0, &flags) == NULL) 229 return(NULL); 230 231 /* Small enough for pointer-storage convention? */ 232 if (regsize >= 32767L) /* Probably could be 65535L. */ 233 FAIL("regexp too big"); 234 235 /* Allocate space. */ 236 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); 237 if (r == NULL) 238 FAIL("out of space"); 239 240 /* Second pass: emit code. */ 241 regparse = exp; 242 regnpar = 1; 243 regcode = r->program; 244 regc(MAGIC); 245 if (reg(0, &flags) == NULL) 246 return(NULL); 247 248 /* Dig out information for optimizations. */ 249 r->regstart = '\0'; /* Worst-case defaults. */ 250 r->reganch = 0; 251 r->regmust = NULL; 252 r->regmlen = 0; 253 scan = r->program+1; /* First BRANCH. */ 254 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ 255 scan = OPERAND(scan); 256 257 /* Starting-point info. */ 258 if (OP(scan) == EXACTLY) 259 r->regstart = *OPERAND(scan); 260 else if (OP(scan) == BOL) 261 r->reganch++; 262 263 /* 264 * If there's something expensive in the r.e., find the 265 * longest literal string that must appear and make it the 266 * regmust. Resolve ties in favor of later strings, since 267 * the regstart check works with the beginning of the r.e. 268 * and avoiding duplication strengthens checking. Not a 269 * strong reason, but sufficient in the absence of others. 270 */ 271 if (flags&SPSTART) { 272 longest = NULL; 273 len = 0; 274 for (; scan != NULL; scan = regnext(scan)) 275 if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) { 276 longest = OPERAND(scan); 277 len = strlen(OPERAND(scan)); 278 } 279 r->regmust = longest; 280 r->regmlen = len; 281 } 282 } 283 284 return(r); 285 } 286 287 /* 288 - reg - regular expression, i.e. main body or parenthesized thing 289 * 290 * Caller must absorb opening parenthesis. 291 * 292 * Combining parenthesis handling with the base level of regular expression 293 * is a trifle forced, but the need to tie the tails of the branches to what 294 * follows makes it hard to avoid. 295 */ 296 static char * 297 reg(paren, flagp) 298 int paren; /* Parenthesized? */ 299 int *flagp; 300 { 301 register char *ret; 302 register char *br; 303 register char *ender; 304 register int parno = 0; 305 int flags; 306 307 *flagp = HASWIDTH; /* Tentatively. */ 308 309 /* Make an OPEN node, if parenthesized. */ 310 if (paren) { 311 if (regnpar >= NSUBEXP) 312 FAIL("too many ()"); 313 parno = regnpar; 314 regnpar++; 315 ret = regnode(OPEN+parno); 316 } else 317 ret = NULL; 318 319 /* Pick up the branches, linking them together. */ 320 br = regbranch(&flags); 321 if (br == NULL) 322 return(NULL); 323 if (ret != NULL) 324 regtail(ret, br); /* OPEN -> first. */ 325 else 326 ret = br; 327 if (!(flags&HASWIDTH)) 328 *flagp &= ~HASWIDTH; 329 *flagp |= flags&SPSTART; 330 while (*regparse == '|') { 331 regparse++; 332 br = regbranch(&flags); 333 if (br == NULL) 334 return(NULL); 335 regtail(ret, br); /* BRANCH -> BRANCH. */ 336 if (!(flags&HASWIDTH)) 337 *flagp &= ~HASWIDTH; 338 *flagp |= flags&SPSTART; 339 } 340 341 /* Make a closing node, and hook it on the end. */ 342 ender = regnode((paren) ? CLOSE+parno : END); 343 regtail(ret, ender); 344 345 /* Hook the tails of the branches to the closing node. */ 346 for (br = ret; br != NULL; br = regnext(br)) 347 regoptail(br, ender); 348 349 /* Check for proper termination. */ 350 if (paren && *regparse++ != ')') { 351 FAIL("unmatched ()"); 352 } else if (!paren && *regparse != '\0') { 353 if (*regparse == ')') { 354 FAIL("unmatched ()"); 355 } else 356 FAIL("junk on end"); /* "Can't happen". */ 357 /* NOTREACHED */ 358 } 359 360 return(ret); 361 } 362 363 /* 364 - regbranch - one alternative of an | operator 365 * 366 * Implements the concatenation operator. 367 */ 368 static char * 369 regbranch(flagp) 370 int *flagp; 371 { 372 register char *ret; 373 register char *chain; 374 register char *latest; 375 int flags; 376 377 *flagp = WORST; /* Tentatively. */ 378 379 ret = regnode(BRANCH); 380 chain = NULL; 381 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { 382 latest = regpiece(&flags); 383 if (latest == NULL) 384 return(NULL); 385 *flagp |= flags&HASWIDTH; 386 if (chain == NULL) /* First piece. */ 387 *flagp |= flags&SPSTART; 388 else 389 regtail(chain, latest); 390 chain = latest; 391 } 392 if (chain == NULL) /* Loop ran zero times. */ 393 (void) regnode(NOTHING); 394 395 return(ret); 396 } 397 398 /* 399 - regpiece - something followed by possible [*+?] 400 * 401 * Note that the branching code sequences used for ? and the general cases 402 * of * and + are somewhat optimized: they use the same NOTHING node as 403 * both the endmarker for their branch list and the body of the last branch. 404 * It might seem that this node could be dispensed with entirely, but the 405 * endmarker role is not redundant. 406 */ 407 static char * 408 regpiece(flagp) 409 int *flagp; 410 { 411 register char *ret; 412 register char op; 413 register char *next; 414 int flags; 415 416 ret = regatom(&flags); 417 if (ret == NULL) 418 return(NULL); 419 420 op = *regparse; 421 if (!ISMULT(op)) { 422 *flagp = flags; 423 return(ret); 424 } 425 426 if (!(flags&HASWIDTH) && op != '?') 427 FAIL("*+ operand could be empty"); 428 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 429 430 if (op == '*' && (flags&SIMPLE)) 431 reginsert(STAR, ret); 432 else if (op == '*') { 433 /* Emit x* as (x&|), where & means "self". */ 434 reginsert(BRANCH, ret); /* Either x */ 435 regoptail(ret, regnode(BACK)); /* and loop */ 436 regoptail(ret, ret); /* back */ 437 regtail(ret, regnode(BRANCH)); /* or */ 438 regtail(ret, regnode(NOTHING)); /* null. */ 439 } else if (op == '+' && (flags&SIMPLE)) 440 reginsert(PLUS, ret); 441 else if (op == '+') { 442 /* Emit x+ as x(&|), where & means "self". */ 443 next = regnode(BRANCH); /* Either */ 444 regtail(ret, next); 445 regtail(regnode(BACK), ret); /* loop back */ 446 regtail(next, regnode(BRANCH)); /* or */ 447 regtail(ret, regnode(NOTHING)); /* null. */ 448 } else if (op == '?') { 449 /* Emit x? as (x|) */ 450 reginsert(BRANCH, ret); /* Either x */ 451 regtail(ret, regnode(BRANCH)); /* or */ 452 next = regnode(NOTHING); /* null. */ 453 regtail(ret, next); 454 regoptail(ret, next); 455 } 456 regparse++; 457 if (ISMULT(*regparse)) 458 FAIL("nested *?+"); 459 460 return(ret); 461 } 462 463 /* 464 - regatom - the lowest level 465 * 466 * Optimization: gobbles an entire sequence of ordinary characters so that 467 * it can turn them into a single node, which is smaller to store and 468 * faster to run. Backslashed characters are exceptions, each becoming a 469 * separate node; the code is simpler that way and it's not worth fixing. 470 */ 471 static char * 472 regatom(flagp) 473 int *flagp; 474 { 475 register char *ret; 476 int flags; 477 478 *flagp = WORST; /* Tentatively. */ 479 480 switch (*regparse++) { 481 case '^': 482 ret = regnode(BOL); 483 break; 484 case '$': 485 ret = regnode(EOL); 486 break; 487 case '.': 488 ret = regnode(ANY); 489 *flagp |= HASWIDTH|SIMPLE; 490 break; 491 case '[': { 492 register int clss; 493 register int classend; 494 495 if (*regparse == '^') { /* Complement of range. */ 496 ret = regnode(ANYBUT); 497 regparse++; 498 } else 499 ret = regnode(ANYOF); 500 if (*regparse == ']' || *regparse == '-') 501 regc(*regparse++); 502 while (*regparse != '\0' && *regparse != ']') { 503 if (*regparse == '-') { 504 regparse++; 505 if (*regparse == ']' || *regparse == '\0') 506 regc('-'); 507 else { 508 clss = UCHARAT(regparse-2)+1; 509 classend = UCHARAT(regparse); 510 if (clss > classend+1) 511 FAIL("invalid [] range"); 512 for (; clss <= classend; clss++) 513 regc(clss); 514 regparse++; 515 } 516 } else 517 regc(*regparse++); 518 } 519 regc('\0'); 520 if (*regparse != ']') 521 FAIL("unmatched []"); 522 regparse++; 523 *flagp |= HASWIDTH|SIMPLE; 524 } 525 break; 526 case '(': 527 ret = reg(1, &flags); 528 if (ret == NULL) 529 return(NULL); 530 *flagp |= flags&(HASWIDTH|SPSTART); 531 break; 532 case '\0': 533 case '|': 534 case ')': 535 FAIL("internal urp"); /* Supposed to be caught earlier. */ 536 /* NOTREACHED */ 537 break; 538 case '?': 539 case '+': 540 case '*': 541 FAIL("?+* follows nothing"); 542 /* NOTREACHED */ 543 break; 544 case '\\': 545 if (*regparse == '\0') 546 FAIL("trailing \\"); 547 ret = regnode(EXACTLY); 548 regc(*regparse++); 549 regc('\0'); 550 *flagp |= HASWIDTH|SIMPLE; 551 break; 552 default: { 553 register int len; 554 register char ender; 555 556 regparse--; 557 len = strcspn(regparse, META); 558 if (len <= 0) 559 FAIL("internal disaster"); 560 ender = *(regparse+len); 561 if (len > 1 && ISMULT(ender)) 562 len--; /* Back off clear of ?+* operand. */ 563 *flagp |= HASWIDTH; 564 if (len == 1) 565 *flagp |= SIMPLE; 566 ret = regnode(EXACTLY); 567 while (len > 0) { 568 regc(*regparse++); 569 len--; 570 } 571 regc('\0'); 572 } 573 break; 574 } 575 576 return(ret); 577 } 578 579 /* 580 - regnode - emit a node 581 */ 582 static char * /* Location. */ 583 regnode(op) 584 char op; 585 { 586 register char *ret; 587 register char *ptr; 588 589 ret = regcode; 590 if (ret == ®dummy) { 591 regsize += 3; 592 return(ret); 593 } 594 595 ptr = ret; 596 *ptr++ = op; 597 *ptr++ = '\0'; /* Null "next" pointer. */ 598 *ptr++ = '\0'; 599 regcode = ptr; 600 601 return(ret); 602 } 603 604 /* 605 - regc - emit (if appropriate) a byte of code 606 */ 607 static void 608 regc(b) 609 char b; 610 { 611 if (regcode != ®dummy) 612 *regcode++ = b; 613 else 614 regsize++; 615 } 616 617 /* 618 - reginsert - insert an operator in front of already-emitted operand 619 * 620 * Means relocating the operand. 621 */ 622 static void 623 reginsert(op, opnd) 624 char op; 625 char *opnd; 626 { 627 register char *src; 628 register char *dst; 629 register char *place; 630 631 if (regcode == ®dummy) { 632 regsize += 3; 633 return; 634 } 635 636 src = regcode; 637 regcode += 3; 638 dst = regcode; 639 while (src > opnd) 640 *--dst = *--src; 641 642 place = opnd; /* Op node, where operand used to be. */ 643 *place++ = op; 644 *place++ = '\0'; 645 *place++ = '\0'; 646 } 647 648 /* 649 - regtail - set the next-pointer at the end of a node chain 650 */ 651 static void 652 regtail(p, val) 653 char *p; 654 char *val; 655 { 656 register char *scan; 657 register char *temp; 658 register int offset; 659 660 if (p == ®dummy) 661 return; 662 663 /* Find last node. */ 664 scan = p; 665 for (;;) { 666 temp = regnext(scan); 667 if (temp == NULL) 668 break; 669 scan = temp; 670 } 671 672 if (OP(scan) == BACK) 673 offset = scan - val; 674 else 675 offset = val - scan; 676 *(scan+1) = (offset>>8)&0377; 677 *(scan+2) = offset&0377; 678 } 679 680 /* 681 - regoptail - regtail on operand of first argument; nop if operandless 682 */ 683 static void 684 regoptail(p, val) 685 char *p; 686 char *val; 687 { 688 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 689 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 690 return; 691 regtail(OPERAND(p), val); 692 } 693 694 /* 695 * regexec and friends 696 */ 697 698 /* 699 * Global work variables for regexec(). 700 */ 701 static char *reginput; /* String-input pointer. */ 702 static char *regbol; /* Beginning of input, for ^ check. */ 703 static char **regstartp; /* Pointer to startp array. */ 704 static char **regendp; /* Ditto for endp. */ 705 706 /* 707 * Forwards. 708 */ 709 STATIC int regtry(); 710 STATIC int regmatch(); 711 STATIC int regrepeat(); 712 713 #ifdef DEBUG 714 int regnarrate = 0; 715 void regdump(); 716 STATIC char *regprop(); 717 #endif 718 719 /* 720 - regexec - match a regexp against a string 721 */ 722 int 723 regexec2(prog, string, notbol) 724 register regexp *prog; 725 register char *string; 726 int notbol; 727 { 728 register char *s; 729 730 /* Be paranoid... */ 731 if (prog == NULL || string == NULL) { 732 regerror("NULL parameter"); 733 return(0); 734 } 735 736 /* Check validity of program. */ 737 if (UCHARAT(prog->program) != MAGIC) { 738 regerror("corrupted program"); 739 return(0); 740 } 741 742 /* If there is a "must appear" string, look for it. */ 743 if (prog->regmust != NULL) { 744 s = string; 745 while ((s = strchr(s, prog->regmust[0])) != NULL) { 746 if (strncmp(s, prog->regmust, prog->regmlen) == 0) 747 break; /* Found it. */ 748 s++; 749 } 750 if (s == NULL) /* Not present. */ 751 return(0); 752 } 753 754 /* Mark beginning of line for ^ . */ 755 if (notbol) 756 regbol = NULL; 757 else 758 regbol = string; 759 760 /* Simplest case: anchored match need be tried only once. */ 761 if (prog->reganch) 762 return(regtry(prog, string)); 763 764 /* Messy cases: unanchored match. */ 765 s = string; 766 if (prog->regstart != '\0') 767 /* We know what char it must start with. */ 768 while ((s = strchr(s, prog->regstart)) != NULL) { 769 if (regtry(prog, s)) 770 return(1); 771 s++; 772 } 773 else 774 /* We don't -- general case. */ 775 do { 776 if (regtry(prog, s)) 777 return(1); 778 } while (*s++ != '\0'); 779 780 /* Failure. */ 781 return(0); 782 } 783 784 int 785 regexec(prog, string) 786 register regexp *prog; 787 register char *string; 788 { 789 return regexec2(prog, string, 0); 790 } 791 792 /* 793 - regtry - try match at specific point 794 */ 795 static int /* 0 failure, 1 success */ 796 regtry(prog, string) 797 regexp *prog; 798 char *string; 799 { 800 register int i; 801 register char **sp; 802 register char **ep; 803 804 reginput = string; 805 regstartp = prog->startp; 806 regendp = prog->endp; 807 808 sp = prog->startp; 809 ep = prog->endp; 810 for (i = NSUBEXP; i > 0; i--) { 811 *sp++ = NULL; 812 *ep++ = NULL; 813 } 814 if (regmatch(prog->program + 1)) { 815 prog->startp[0] = string; 816 prog->endp[0] = reginput; 817 return(1); 818 } else 819 return(0); 820 } 821 822 /* 823 - regmatch - main matching routine 824 * 825 * Conceptually the strategy is simple: check to see whether the current 826 * node matches, call self recursively to see whether the rest matches, 827 * and then act accordingly. In practice we make some effort to avoid 828 * recursion, in particular by going through "ordinary" nodes (that don't 829 * need to know whether the rest of the match failed) by a loop instead of 830 * by recursion. 831 */ 832 static int /* 0 failure, 1 success */ 833 regmatch(prog) 834 char *prog; 835 { 836 register char *scan; /* Current node. */ 837 char *next; /* Next node. */ 838 839 scan = prog; 840 #ifdef DEBUG 841 if (scan != NULL && regnarrate) 842 fprintf(stderr, "%s(\n", regprop(scan)); 843 #endif 844 while (scan != NULL) { 845 #ifdef DEBUG 846 if (regnarrate) 847 fprintf(stderr, "%s...\n", regprop(scan)); 848 #endif 849 next = regnext(scan); 850 851 switch (OP(scan)) { 852 case BOL: 853 if (reginput != regbol) 854 return(0); 855 break; 856 case EOL: 857 if (*reginput != '\0') 858 return(0); 859 break; 860 case ANY: 861 if (*reginput == '\0') 862 return(0); 863 reginput++; 864 break; 865 case EXACTLY: { 866 register int len; 867 register char *opnd; 868 869 opnd = OPERAND(scan); 870 /* Inline the first character, for speed. */ 871 if (*opnd != *reginput) 872 return(0); 873 len = strlen(opnd); 874 if (len > 1 && strncmp(opnd, reginput, len) != 0) 875 return(0); 876 reginput += len; 877 } 878 break; 879 case ANYOF: 880 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 881 return(0); 882 reginput++; 883 break; 884 case ANYBUT: 885 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 886 return(0); 887 reginput++; 888 break; 889 case NOTHING: 890 break; 891 case BACK: 892 break; 893 case OPEN+1: 894 case OPEN+2: 895 case OPEN+3: 896 case OPEN+4: 897 case OPEN+5: 898 case OPEN+6: 899 case OPEN+7: 900 case OPEN+8: 901 case OPEN+9: { 902 register int no; 903 register char *save; 904 905 no = OP(scan) - OPEN; 906 save = reginput; 907 908 if (regmatch(next)) { 909 /* 910 * Don't set startp if some later 911 * invocation of the same parentheses 912 * already has. 913 */ 914 if (regstartp[no] == NULL) 915 regstartp[no] = save; 916 return(1); 917 } else 918 return(0); 919 } 920 /* NOTREACHED */ 921 break; 922 case CLOSE+1: 923 case CLOSE+2: 924 case CLOSE+3: 925 case CLOSE+4: 926 case CLOSE+5: 927 case CLOSE+6: 928 case CLOSE+7: 929 case CLOSE+8: 930 case CLOSE+9: { 931 register int no; 932 register char *save; 933 934 no = OP(scan) - CLOSE; 935 save = reginput; 936 937 if (regmatch(next)) { 938 /* 939 * Don't set endp if some later 940 * invocation of the same parentheses 941 * already has. 942 */ 943 if (regendp[no] == NULL) 944 regendp[no] = save; 945 return(1); 946 } else 947 return(0); 948 } 949 /* NOTREACHED */ 950 break; 951 case BRANCH: { 952 register char *save; 953 954 if (OP(next) != BRANCH) /* No choice. */ 955 next = OPERAND(scan); /* Avoid recursion. */ 956 else { 957 do { 958 save = reginput; 959 if (regmatch(OPERAND(scan))) 960 return(1); 961 reginput = save; 962 scan = regnext(scan); 963 } while (scan != NULL && OP(scan) == BRANCH); 964 return(0); 965 /* NOTREACHED */ 966 } 967 } 968 /* NOTREACHED */ 969 break; 970 case STAR: 971 case PLUS: { 972 register char nextch; 973 register int no; 974 register char *save; 975 register int min; 976 977 /* 978 * Lookahead to avoid useless match attempts 979 * when we know what character comes next. 980 */ 981 nextch = '\0'; 982 if (OP(next) == EXACTLY) 983 nextch = *OPERAND(next); 984 min = (OP(scan) == STAR) ? 0 : 1; 985 save = reginput; 986 no = regrepeat(OPERAND(scan)); 987 while (no >= min) { 988 /* If it could work, try it. */ 989 if (nextch == '\0' || *reginput == nextch) 990 if (regmatch(next)) 991 return(1); 992 /* Couldn't or didn't -- back up. */ 993 no--; 994 reginput = save + no; 995 } 996 return(0); 997 } 998 /* NOTREACHED */ 999 break; 1000 case END: 1001 return(1); /* Success! */ 1002 /* NOTREACHED */ 1003 break; 1004 default: 1005 regerror("memory corruption"); 1006 return(0); 1007 /* NOTREACHED */ 1008 break; 1009 } 1010 1011 scan = next; 1012 } 1013 1014 /* 1015 * We get here only if there's trouble -- normally "case END" is 1016 * the terminating point. 1017 */ 1018 regerror("corrupted pointers"); 1019 return(0); 1020 } 1021 1022 /* 1023 - regrepeat - repeatedly match something simple, report how many 1024 */ 1025 static int 1026 regrepeat(p) 1027 char *p; 1028 { 1029 register int count = 0; 1030 register char *scan; 1031 register char *opnd; 1032 1033 scan = reginput; 1034 opnd = OPERAND(p); 1035 switch (OP(p)) { 1036 case ANY: 1037 count = strlen(scan); 1038 scan += count; 1039 break; 1040 case EXACTLY: 1041 while (*opnd == *scan) { 1042 count++; 1043 scan++; 1044 } 1045 break; 1046 case ANYOF: 1047 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1048 count++; 1049 scan++; 1050 } 1051 break; 1052 case ANYBUT: 1053 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1054 count++; 1055 scan++; 1056 } 1057 break; 1058 default: /* Oh dear. Called inappropriately. */ 1059 regerror("internal foulup"); 1060 count = 0; /* Best compromise. */ 1061 break; 1062 } 1063 reginput = scan; 1064 1065 return(count); 1066 } 1067 1068 /* 1069 - regnext - dig the "next" pointer out of a node 1070 */ 1071 static char * 1072 regnext(p) 1073 register char *p; 1074 { 1075 register int offset; 1076 1077 if (p == ®dummy) 1078 return(NULL); 1079 1080 offset = NEXT(p); 1081 if (offset == 0) 1082 return(NULL); 1083 1084 if (OP(p) == BACK) 1085 return(p-offset); 1086 else 1087 return(p+offset); 1088 } 1089 1090 #ifdef DEBUG 1091 1092 STATIC char *regprop(); 1093 1094 /* 1095 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1096 */ 1097 void 1098 regdump(r) 1099 regexp *r; 1100 { 1101 register char *s; 1102 register char op = EXACTLY; /* Arbitrary non-END op. */ 1103 register char *next; 1104 1105 1106 s = r->program + 1; 1107 while (op != END) { /* While that wasn't END last time... */ 1108 op = OP(s); 1109 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1110 next = regnext(s); 1111 if (next == NULL) /* Next ptr. */ 1112 printf("(0)"); 1113 else 1114 printf("(%d)", (s-r->program)+(next-s)); 1115 s += 3; 1116 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1117 /* Literal string, where present. */ 1118 while (*s != '\0') { 1119 putchar(*s); 1120 s++; 1121 } 1122 s++; 1123 } 1124 putchar('\n'); 1125 } 1126 1127 /* Header fields of interest. */ 1128 if (r->regstart != '\0') 1129 printf("start `%c' ", r->regstart); 1130 if (r->reganch) 1131 printf("anchored "); 1132 if (r->regmust != NULL) 1133 printf("must have \"%s\"", r->regmust); 1134 printf("\n"); 1135 } 1136 1137 /* 1138 - regprop - printable representation of opcode 1139 */ 1140 static char * 1141 regprop(op) 1142 char *op; 1143 { 1144 register char *p; 1145 static char buf[50]; 1146 1147 (void) strcpy(buf, ":"); 1148 1149 switch (OP(op)) { 1150 case BOL: 1151 p = "BOL"; 1152 break; 1153 case EOL: 1154 p = "EOL"; 1155 break; 1156 case ANY: 1157 p = "ANY"; 1158 break; 1159 case ANYOF: 1160 p = "ANYOF"; 1161 break; 1162 case ANYBUT: 1163 p = "ANYBUT"; 1164 break; 1165 case BRANCH: 1166 p = "BRANCH"; 1167 break; 1168 case EXACTLY: 1169 p = "EXACTLY"; 1170 break; 1171 case NOTHING: 1172 p = "NOTHING"; 1173 break; 1174 case BACK: 1175 p = "BACK"; 1176 break; 1177 case END: 1178 p = "END"; 1179 break; 1180 case OPEN+1: 1181 case OPEN+2: 1182 case OPEN+3: 1183 case OPEN+4: 1184 case OPEN+5: 1185 case OPEN+6: 1186 case OPEN+7: 1187 case OPEN+8: 1188 case OPEN+9: 1189 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 1190 p = NULL; 1191 break; 1192 case CLOSE+1: 1193 case CLOSE+2: 1194 case CLOSE+3: 1195 case CLOSE+4: 1196 case CLOSE+5: 1197 case CLOSE+6: 1198 case CLOSE+7: 1199 case CLOSE+8: 1200 case CLOSE+9: 1201 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1202 p = NULL; 1203 break; 1204 case STAR: 1205 p = "STAR"; 1206 break; 1207 case PLUS: 1208 p = "PLUS"; 1209 break; 1210 default: 1211 regerror("corrupted opcode"); 1212 break; 1213 } 1214 if (p != NULL) 1215 (void) strcat(buf, p); 1216 return(buf); 1217 } 1218 #endif 1219 1220 /* 1221 * The following is provided for those people who do not have strcspn() in 1222 * their C libraries. They should get off their butts and do something 1223 * about it; at least one public-domain implementation of those (highly 1224 * useful) string routines has been published on Usenet. 1225 */ 1226 #ifdef STRCSPN 1227 /* 1228 * strcspn - find length of initial segment of s1 consisting entirely 1229 * of characters not from s2 1230 */ 1231 1232 static int 1233 strcspn(s1, s2) 1234 char *s1; 1235 char *s2; 1236 { 1237 register char *scan1; 1238 register char *scan2; 1239 register int count; 1240 1241 count = 0; 1242 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1243 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1244 if (*scan1 == *scan2++) 1245 return(count); 1246 count++; 1247 } 1248 return(count); 1249 } 1250 #endif 1251