1 /* 2 * Copyright (C) Lucent Technologies 1997 3 * All Rights Reserved 4 * 5 * Permission to use, copy, modify, and distribute this software and 6 * its documentation for any purpose and without fee is hereby 7 * granted, provided that the above copyright notice appear in all 8 * copies and that both that the copyright notice and this 9 * permission notice and warranty disclaimer appear in supporting 10 * documentation, and that the name Lucent Technologies or any of 11 * its entities not be used in advertising or publicity pertaining 12 * to distribution of the software without specific, written prior 13 * permission. 14 * 15 * LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 * IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 * THIS SOFTWARE. 23 */ 24 25 /* 26 * CDDL HEADER START 27 * 28 * The contents of this file are subject to the terms of the 29 * Common Development and Distribution License, Version 1.0 only 30 * (the "License"). You may not use this file except in compliance 31 * with the License. 32 * 33 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 34 * or http://www.opensolaris.org/os/licensing. 35 * See the License for the specific language governing permissions 36 * and limitations under the License. 37 * 38 * When distributing Covered Code, include this CDDL HEADER in each 39 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 40 * If applicable, add the following below this CDDL HEADER, with the 41 * fields enclosed by brackets "[]" replaced with your own identifying 42 * information: Portions Copyright [yyyy] [name of copyright owner] 43 * 44 * CDDL HEADER END 45 */ 46 47 /* 48 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 49 * Use is subject to license terms. 50 */ 51 52 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 53 /* All Rights Reserved */ 54 55 /* lasciate ogne speranza, voi ch'intrate. */ 56 57 #define DEBUG 58 59 #include "awk.h" 60 #include "y.tab.h" 61 62 #define HAT (NCHARS-1) /* matches ^ in regular expr */ 63 /* NCHARS is 2**n */ 64 #define MAXLIN (3 * LINE_MAX) 65 66 #define type(v) (v)->nobj /* badly overloaded here */ 67 #define info(v) (v)->ntype /* badly overloaded here */ 68 #define left(v) (v)->narg[0] 69 #define right(v) (v)->narg[1] 70 #define parent(v) (v)->nnext 71 72 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 73 #define ELEAF case EMPTYRE: /* empty string in regexp */ 74 #define UNARY case STAR: case PLUS: case QUEST: 75 76 /* 77 * encoding in tree Nodes: 78 * leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 79 * left is index, right contains value or pointer to value 80 * unary (STAR, PLUS, QUEST): left is child, right is null 81 * binary (CAT, OR): left and right are children 82 * parent contains pointer to parent 83 */ 84 85 int *setvec; 86 int *tmpset; 87 int maxsetvec = 0; 88 89 int rtok; /* next token in current re */ 90 int rlxval; 91 static uschar *rlxstr; 92 static uschar *prestr; /* current position in current re */ 93 static uschar *lastre; /* origin of last re */ 94 95 static int setcnt; 96 static int poscnt; 97 98 char *patbeg; 99 int patlen; 100 101 #define NFA 20 /* cache this many dynamic fa's */ 102 fa *fatab[NFA]; 103 int nfatab = 0; /* entries in fatab */ 104 105 static fa *mkdfa(const char *, int); 106 static int makeinit(fa *, int); 107 static void penter(Node *); 108 static void freetr(Node *); 109 static void overflo(const char *); 110 static void growvec(const char *); 111 static void cfoll(fa *, Node *); 112 static void follow(Node *); 113 static Node *reparse(const char *); 114 static int relex(void); 115 static void freefa(fa *); 116 static int cgoto(fa *, int, int); 117 118 fa * 119 makedfa(const char *s, int anchor) /* returns dfa for reg expr s */ 120 { 121 int i, use, nuse; 122 fa *pfa; 123 static int now = 1; 124 125 if (setvec == NULL) { /* first time through any RE */ 126 maxsetvec = MAXLIN; 127 setvec = (int *)malloc(maxsetvec * sizeof (int)); 128 tmpset = (int *)malloc(maxsetvec * sizeof (int)); 129 if (setvec == NULL || tmpset == NULL) 130 overflo("out of space initializing makedfa"); 131 } 132 133 if (compile_time) /* a constant for sure */ 134 return (mkdfa(s, anchor)); 135 for (i = 0; i < nfatab; i++) { /* is it there already? */ 136 if (fatab[i]->anchor == anchor && 137 strcmp((const char *)fatab[i]->restr, s) == 0) { 138 fatab[i]->use = now++; 139 return (fatab[i]); 140 } 141 } 142 pfa = mkdfa(s, anchor); 143 if (nfatab < NFA) { /* room for another */ 144 fatab[nfatab] = pfa; 145 fatab[nfatab]->use = now++; 146 nfatab++; 147 return (pfa); 148 } 149 use = fatab[0]->use; /* replace least-recently used */ 150 nuse = 0; 151 for (i = 1; i < nfatab; i++) 152 if (fatab[i]->use < use) { 153 use = fatab[i]->use; 154 nuse = i; 155 } 156 freefa(fatab[nuse]); 157 fatab[nuse] = pfa; 158 pfa->use = now++; 159 return (pfa); 160 } 161 162 /* 163 * does the real work of making a dfa 164 * anchor = 1 for anchored matches, else 0 165 */ 166 fa * 167 mkdfa(const char *s, int anchor) 168 { 169 Node *p, *p1; 170 fa *f; 171 172 p = reparse(s); 173 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 174 /* put ALL STAR in front of reg. exp. */ 175 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 176 /* put FINAL after reg. exp. */ 177 178 poscnt = 0; 179 penter(p1); /* enter parent pointers and leaf indices */ 180 if ((f = (fa *)calloc(1, sizeof (fa) + poscnt * sizeof (rrow))) == NULL) 181 overflo("out of space for fa"); 182 /* penter has computed number of positions in re */ 183 f->accept = poscnt-1; 184 cfoll(f, p1); /* set up follow sets */ 185 freetr(p1); 186 if ((f->posns[0] = 187 (int *)calloc(1, *(f->re[0].lfollow) * sizeof (int))) == NULL) { 188 overflo("out of space in makedfa"); 189 } 190 if ((f->posns[1] = (int *)calloc(1, sizeof (int))) == NULL) 191 overflo("out of space in makedfa"); 192 *f->posns[1] = 0; 193 f->initstat = makeinit(f, anchor); 194 f->anchor = anchor; 195 f->restr = (uschar *)tostring(s); 196 return (f); 197 } 198 199 static int 200 makeinit(fa *f, int anchor) 201 { 202 int i, k; 203 204 f->curstat = 2; 205 f->out[2] = 0; 206 f->reset = 0; 207 k = *(f->re[0].lfollow); 208 xfree(f->posns[2]); 209 if ((f->posns[2] = (int *)calloc(1, (k+1) * sizeof (int))) == NULL) 210 overflo("out of space in makeinit"); 211 for (i = 0; i <= k; i++) { 212 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 213 } 214 if ((f->posns[2])[1] == f->accept) 215 f->out[2] = 1; 216 for (i = 0; i < NCHARS; i++) 217 f->gototab[2][i] = 0; 218 f->curstat = cgoto(f, 2, HAT); 219 if (anchor) { 220 *f->posns[2] = k-1; /* leave out position 0 */ 221 for (i = 0; i < k; i++) { 222 (f->posns[0])[i] = (f->posns[2])[i]; 223 } 224 225 f->out[0] = f->out[2]; 226 if (f->curstat != 2) 227 --(*f->posns[f->curstat]); 228 } 229 return (f->curstat); 230 } 231 232 void 233 penter(Node *p) /* set up parent pointers and leaf indices */ 234 { 235 switch (type(p)) { 236 ELEAF 237 LEAF 238 info(p) = poscnt; 239 poscnt++; 240 break; 241 UNARY 242 penter(left(p)); 243 parent(left(p)) = p; 244 break; 245 case CAT: 246 case OR: 247 penter(left(p)); 248 penter(right(p)); 249 parent(left(p)) = p; 250 parent(right(p)) = p; 251 break; 252 default: /* can't happen */ 253 FATAL("can't happen: unknown type %d in penter", type(p)); 254 break; 255 } 256 } 257 258 static void 259 freetr(Node *p) /* free parse tree */ 260 { 261 switch (type(p)) { 262 ELEAF 263 LEAF 264 xfree(p); 265 break; 266 UNARY 267 freetr(left(p)); 268 xfree(p); 269 break; 270 case CAT: 271 case OR: 272 freetr(left(p)); 273 freetr(right(p)); 274 xfree(p); 275 break; 276 default: /* can't happen */ 277 FATAL("can't happen: unknown type %d in freetr", type(p)); 278 break; 279 } 280 } 281 282 static void 283 growvec(const char *msg) 284 { 285 maxsetvec *= 4; 286 setvec = (int *)realloc(setvec, maxsetvec * sizeof (int)); 287 tmpset = (int *)realloc(tmpset, maxsetvec * sizeof (int)); 288 if (setvec == NULL || tmpset == NULL) 289 overflo(msg); 290 } 291 292 /* 293 * in the parsing of regular expressions, metacharacters like . have 294 * to be seen literally; \056 is not a metacharacter. 295 */ 296 297 /* 298 * find and eval hex string at pp, return new p; only pick up one 8-bit 299 * byte (2 chars). 300 */ 301 int 302 hexstr(uschar **pp) 303 { 304 uschar *p; 305 int n = 0; 306 int i; 307 308 for (i = 0, p = (uschar *)*pp; i < 2 && isxdigit(*p); i++, p++) { 309 if (isdigit(*p)) 310 n = 16 * n + *p - '0'; 311 else if (*p >= 'a' && *p <= 'f') 312 n = 16 * n + *p - 'a' + 10; 313 else if (*p >= 'A' && *p <= 'F') 314 n = 16 * n + *p - 'A' + 10; 315 } 316 *pp = (uschar *)p; 317 return (n); 318 } 319 320 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') 321 322 /* pick up next thing after a \\ and increment *pp */ 323 int 324 quoted(uschar **pp) 325 { 326 uschar *p = *pp; 327 int c; 328 329 if ((c = *p++) == 't') 330 c = '\t'; 331 else if (c == 'n') 332 c = '\n'; 333 else if (c == 'f') 334 c = '\f'; 335 else if (c == 'r') 336 c = '\r'; 337 else if (c == 'b') 338 c = '\b'; 339 else if (c == '\\') 340 c = '\\'; 341 else if (c == 'x') { /* hexadecimal goo follows */ 342 c = hexstr(&p); /* this adds a null if number is invalid */ 343 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 344 int n = c - '0'; 345 if (isoctdigit(*p)) { 346 n = 8 * n + *p++ - '0'; 347 if (isoctdigit(*p)) 348 n = 8 * n + *p++ - '0'; 349 } 350 c = n; 351 } /* else */ 352 /* c = c; */ 353 *pp = p; 354 return (c); 355 } 356 357 char * 358 cclenter(const char *argp) /* add a character class */ 359 { 360 int i, c, c2; 361 uschar *p = (uschar *)argp; 362 uschar *op, *bp; 363 static uschar *buf = NULL; 364 static size_t bufsz = 100; 365 366 op = p; 367 if (buf == NULL && (buf = (uschar *)malloc(bufsz)) == NULL) 368 FATAL("out of space for character class [%.10s...] 1", p); 369 bp = buf; 370 for (i = 0; (c = *p++) != 0; ) { 371 if (c == '\\') { 372 c = quoted(&p); 373 } else if (c == '-' && i > 0 && bp[-1] != 0) { 374 if (*p != 0) { 375 c = bp[-1]; 376 c2 = *p++; 377 if (c2 == '\\') 378 c2 = quoted(&p); 379 if (c > c2) { /* empty; ignore */ 380 bp--; 381 i--; 382 continue; 383 } 384 while (c < c2) { 385 if (!adjbuf((char **)&buf, &bufsz, 386 bp-buf+2, 100, (char **)&bp, 387 "cclenter1")) { 388 FATAL( 389 "out of space for character class [%.10s...] 2", p); 390 } 391 *bp++ = ++c; 392 i++; 393 } 394 continue; 395 } 396 } 397 if (!adjbuf((char **)&buf, &bufsz, bp-buf+2, 100, (char **)&bp, 398 "cclenter2")) 399 FATAL( 400 "out of space for character class [%.10s...] 3", p); 401 *bp++ = c; 402 i++; 403 } 404 *bp = '\0'; 405 dprintf(("cclenter: in = |%s|, out = |%s|\n", op, buf)); 406 xfree(op); 407 return ((char *)tostring((char *)buf)); 408 } 409 410 static void 411 overflo(const char *s) 412 { 413 FATAL("regular expression too big: %.30s...", gettext((char *)s)); 414 } 415 416 /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 417 static void 418 cfoll(fa *f, Node *v) 419 { 420 int i; 421 int *p; 422 423 switch (type(v)) { 424 ELEAF 425 LEAF 426 f->re[info(v)].ltype = type(v); 427 f->re[info(v)].lval.np = right(v); 428 while (f->accept >= maxsetvec) { /* guessing here! */ 429 growvec("out of space in cfoll()"); 430 } 431 for (i = 0; i <= f->accept; i++) 432 setvec[i] = 0; 433 setcnt = 0; 434 follow(v); /* computes setvec and setcnt */ 435 if ((p = (int *)calloc(1, (setcnt+1) * sizeof (int))) == NULL) 436 overflo("out of space building follow set"); 437 f->re[info(v)].lfollow = p; 438 *p = setcnt; 439 for (i = f->accept; i >= 0; i--) { 440 if (setvec[i] == 1) 441 *++p = i; 442 } 443 break; 444 UNARY 445 cfoll(f, left(v)); 446 break; 447 case CAT: 448 case OR: 449 cfoll(f, left(v)); 450 cfoll(f, right(v)); 451 break; 452 default: /* can't happen */ 453 FATAL("can't happen: unknown type %d in cfoll", type(v)); 454 } 455 } 456 457 /* 458 * collects initially active leaves of p into setvec 459 * returns 0 or 1 depending on whether p matches empty string 460 */ 461 static int 462 first(Node *p) 463 { 464 int b, lp; 465 466 switch (type(p)) { 467 ELEAF 468 LEAF 469 lp = info(p); /* look for high-water mark of subscripts */ 470 while (setcnt >= maxsetvec || lp >= maxsetvec) { 471 /* guessing here! */ 472 growvec("out of space in first()"); 473 } 474 if (type(p) == EMPTYRE) { 475 setvec[lp] = 0; 476 return (0); 477 } 478 if (setvec[lp] != 1) { 479 setvec[lp] = 1; 480 setcnt++; 481 } 482 if (type(p) == CCL && (*(char *)right(p)) == '\0') 483 return (0); /* empty CCL */ 484 else 485 return (1); 486 case PLUS: 487 if (first(left(p)) == 0) 488 return (0); 489 return (1); 490 case STAR: 491 case QUEST: 492 (void) first(left(p)); 493 return (0); 494 case CAT: 495 if (first(left(p)) == 0 && first(right(p)) == 0) 496 return (0); 497 return (1); 498 case OR: 499 b = first(right(p)); 500 if (first(left(p)) == 0 || b == 0) 501 return (0); 502 return (1); 503 } 504 FATAL("can't happen: unknown type %d in first", type(p)); 505 } 506 507 /* collects leaves that can follow v into setvec */ 508 static void 509 follow(Node *v) 510 { 511 Node *p; 512 513 if (type(v) == FINAL) 514 return; 515 p = parent(v); 516 switch (type(p)) { 517 case STAR: 518 case PLUS: 519 (void) first(v); 520 follow(p); 521 return; 522 523 case OR: 524 case QUEST: 525 follow(p); 526 return; 527 528 case CAT: 529 if (v == left(p)) { /* v is left child of p */ 530 if (first(right(p)) == 0) { 531 follow(p); 532 return; 533 } 534 } else /* v is right child */ 535 follow(p); 536 return; 537 default: 538 FATAL("unknown type %d in follow", type(p)); 539 break; 540 } 541 } 542 543 static int 544 member(int c, const char *sarg) /* is c in s? */ 545 { 546 uschar *s = (uschar *)sarg; 547 548 while (*s) 549 if (c == *s++) 550 return (1); 551 return (0); 552 } 553 554 555 int 556 match(fa *f, const char *p0) /* shortest match ? */ 557 { 558 int s, ns; 559 uschar *p = (uschar *)p0; 560 561 s = f->reset ? makeinit(f, 0) : f->initstat; 562 if (f->out[s]) 563 return (1); 564 do { 565 if ((ns = f->gototab[s][*p]) != 0) 566 s = ns; 567 else 568 s = cgoto(f, s, *p); 569 if (f->out[s]) 570 return (1); 571 } while (*p++ != 0); 572 return (0); 573 } 574 575 int 576 pmatch(fa *f, const char *p0) /* longest match, for sub */ 577 { 578 int s, ns; 579 uschar *p = (uschar *)p0; 580 uschar *q; 581 int i, k; 582 583 if (f->reset) { 584 f->initstat = s = makeinit(f, 1); 585 } else { 586 s = f->initstat; 587 } 588 patbeg = (char *)p; 589 patlen = -1; 590 do { 591 q = p; 592 do { 593 if (f->out[s]) /* final state */ 594 patlen = q-p; 595 if ((ns = f->gototab[s][*q]) != 0) 596 s = ns; 597 else 598 s = cgoto(f, s, *q); 599 if (s == 1) { /* no transition */ 600 if (patlen >= 0) { 601 patbeg = (char *)p; 602 return (1); 603 } else { 604 goto nextin; /* no match */ 605 } 606 } 607 } while (*q++ != 0); 608 if (f->out[s]) 609 patlen = q - p - 1; /* don't count $ */ 610 if (patlen >= 0) { 611 patbeg = (char *)p; 612 return (1); 613 } 614 nextin: 615 s = 2; 616 if (f->reset) { 617 for (i = 2; i <= f->curstat; i++) 618 xfree(f->posns[i]); 619 k = *f->posns[0]; 620 if ((f->posns[2] = 621 (int *)calloc(k + 1, sizeof (int))) == NULL) { 622 overflo("out of space in pmatch"); 623 } 624 for (i = 0; i <= k; i++) 625 (f->posns[2])[i] = (f->posns[0])[i]; 626 f->initstat = f->curstat = 2; 627 f->out[2] = f->out[0]; 628 for (i = 0; i < NCHARS; i++) 629 f->gototab[2][i] = 0; 630 } 631 } while (*p++ != 0); 632 return (0); 633 } 634 635 int 636 nematch(fa *f, const char *p0) /* non-empty match, for sub */ 637 { 638 int s, ns; 639 uschar *p = (uschar *)p0; 640 uschar *q; 641 int i, k; 642 643 if (f->reset) { 644 f->initstat = s = makeinit(f, 1); 645 } else { 646 s = f->initstat; 647 } 648 patlen = -1; 649 while (*p) { 650 q = p; 651 do { 652 if (f->out[s]) /* final state */ 653 patlen = q-p; 654 if ((ns = f->gototab[s][*q]) != 0) 655 s = ns; 656 else 657 s = cgoto(f, s, *q); 658 if (s == 1) { /* no transition */ 659 if (patlen > 0) { 660 patbeg = (char *)p; 661 return (1); 662 } else 663 goto nnextin; /* no nonempty match */ 664 } 665 } while (*q++ != 0); 666 if (f->out[s]) 667 patlen = q - p - 1; /* don't count $ */ 668 if (patlen > 0) { 669 patbeg = (char *)p; 670 return (1); 671 } 672 nnextin: 673 s = 2; 674 if (f->reset) { 675 for (i = 2; i <= f->curstat; i++) 676 xfree(f->posns[i]); 677 k = *f->posns[0]; 678 if ((f->posns[2] = 679 (int *)calloc(k + 1, sizeof (int))) == NULL) { 680 overflo("out of state space"); 681 } 682 for (i = 0; i <= k; i++) 683 (f->posns[2])[i] = (f->posns[0])[i]; 684 f->initstat = f->curstat = 2; 685 f->out[2] = f->out[0]; 686 for (i = 0; i < NCHARS; i++) 687 f->gototab[2][i] = 0; 688 } 689 p++; 690 } 691 return (0); 692 } 693 694 static Node *regexp(void), *primary(void), *concat(Node *); 695 static Node *alt(Node *), *unary(Node *); 696 697 /* parses regular expression pointed to by p */ 698 /* uses relex() to scan regular expression */ 699 static Node * 700 reparse(const char *p) 701 { 702 Node *np; 703 704 dprintf(("reparse <%s>\n", p)); 705 706 /* prestr points to string to be parsed */ 707 lastre = prestr = (uschar *)p; 708 rtok = relex(); 709 /* GNU compatibility: an empty regexp matches anything */ 710 if (rtok == '\0') { 711 return (op2(EMPTYRE, NIL, NIL)); 712 } 713 np = regexp(); 714 if (rtok != '\0') 715 FATAL("syntax error in regular expression %s at %s", 716 lastre, prestr); 717 return (np); 718 } 719 720 static Node * 721 regexp(void) /* top-level parse of reg expr */ 722 { 723 return (alt(concat(primary()))); 724 } 725 726 static Node * 727 primary(void) 728 { 729 Node *np; 730 731 switch (rtok) { 732 case CHAR: 733 np = op2(CHAR, NIL, itonp(rlxval)); 734 rtok = relex(); 735 return (unary(np)); 736 case ALL: 737 rtok = relex(); 738 return (unary(op2(ALL, NIL, NIL))); 739 case EMPTYRE: 740 rtok = relex(); 741 return (unary(op2(ALL, NIL, NIL))); 742 case DOT: 743 rtok = relex(); 744 return (unary(op2(DOT, NIL, NIL))); 745 case CCL: 746 /*LINTED align*/ 747 np = op2(CCL, NIL, (Node *)cclenter((char *)rlxstr)); 748 rtok = relex(); 749 return (unary(np)); 750 case NCCL: 751 /*LINTED align*/ 752 np = op2(NCCL, NIL, (Node *)cclenter((char *)rlxstr)); 753 rtok = relex(); 754 return (unary(np)); 755 case '^': 756 rtok = relex(); 757 return (unary(op2(CHAR, NIL, itonp(HAT)))); 758 case '$': 759 rtok = relex(); 760 return (unary(op2(CHAR, NIL, NIL))); 761 case '(': 762 rtok = relex(); 763 if (rtok == ')') { /* special pleading for () */ 764 rtok = relex(); 765 return (unary(op2(CCL, NIL, 766 /*LINTED align*/ 767 (Node *)tostring("")))); 768 } 769 np = regexp(); 770 if (rtok == ')') { 771 rtok = relex(); 772 return (unary(np)); 773 } else { 774 FATAL("syntax error in regular expression %s at %s", 775 lastre, prestr); 776 } 777 /* FALLTHROUGH */ 778 default: 779 FATAL("illegal primary in regular expression %s at %s", 780 lastre, prestr); 781 } 782 /*NOTREACHED*/ 783 return (NULL); 784 } 785 786 static Node * 787 concat(Node *np) 788 { 789 switch (rtok) { 790 case EMPTYRE: 791 case CHAR: 792 case DOT: 793 case ALL: 794 case CCL: 795 case NCCL: 796 case '$': 797 case '(': 798 return (concat(op2(CAT, np, primary()))); 799 default: 800 return (np); 801 } 802 } 803 804 static Node * 805 alt(Node *np) 806 { 807 if (rtok == OR) { 808 rtok = relex(); 809 return (alt(op2(OR, np, concat(primary())))); 810 } 811 return (np); 812 } 813 814 static Node * 815 unary(Node *np) 816 { 817 switch (rtok) { 818 case STAR: 819 rtok = relex(); 820 return (unary(op2(STAR, np, NIL))); 821 case PLUS: 822 rtok = relex(); 823 return (unary(op2(PLUS, np, NIL))); 824 case QUEST: 825 rtok = relex(); 826 return (unary(op2(QUEST, np, NIL))); 827 default: 828 return (np); 829 } 830 } 831 832 /* 833 * Character class definitions conformant to the POSIX locale as 834 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 835 * and operating character sets are both ASCII (ISO646) or supersets 836 * thereof. 837 * 838 * Note that to avoid overflowing the temporary buffer used in 839 * relex(), the expanded character class (prior to range expansion) 840 * must be less than twice the size of their full name. 841 */ 842 843 struct charclass { 844 const char *cc_name; 845 int cc_namelen; 846 int (*cc_func)(int); 847 } charclasses[] = { 848 { "alnum", 5, isalnum }, 849 { "alpha", 5, isalpha }, 850 { "blank", 5, isblank }, 851 { "cntrl", 5, iscntrl }, 852 { "digit", 5, isdigit }, 853 { "graph", 5, isgraph }, 854 { "lower", 5, islower }, 855 { "print", 5, isprint }, 856 { "punct", 5, ispunct }, 857 { "space", 5, isspace }, 858 { "upper", 5, isupper }, 859 { "xdigit", 6, isxdigit }, 860 { NULL, 0, NULL }, 861 }; 862 863 864 static int 865 relex(void) /* lexical analyzer for reparse */ 866 { 867 int c, n; 868 int cflag; 869 static uschar *buf = 0; 870 static size_t bufsz = 100; 871 uschar *bp; 872 struct charclass *cc; 873 int i; 874 875 switch (c = *prestr++) { 876 case '|': return OR; 877 case '*': return STAR; 878 case '+': return PLUS; 879 case '?': return QUEST; 880 case '.': return DOT; 881 case '\0': prestr--; return '\0'; 882 case '^': 883 case '$': 884 case '(': 885 case ')': 886 return (c); 887 case '\\': 888 rlxval = quoted(&prestr); 889 return (CHAR); 890 default: 891 rlxval = c; 892 return (CHAR); 893 case '[': 894 if (buf == NULL && (buf = (uschar *)malloc(bufsz)) == NULL) 895 FATAL("out of space in reg expr %.10s..", lastre); 896 bp = buf; 897 if (*prestr == '^') { 898 cflag = 1; 899 prestr++; 900 } else 901 cflag = 0; 902 n = 2 * strlen((const char *)prestr) + 1; 903 if (!adjbuf((char **)&buf, &bufsz, n, n, (char **)&bp, 904 "relex1")) 905 FATAL("out of space for reg expr %.10s...", lastre); 906 for (;;) { 907 if ((c = *prestr++) == '\\') { 908 *bp++ = '\\'; 909 if ((c = *prestr++) == '\0') { 910 FATAL("nonterminated character class " 911 "%.20s...", lastre); 912 } 913 *bp++ = c; 914 } else if (c == '[' && *prestr == ':') { 915 /* 916 * Handle POSIX character class names. 917 * Dag-Erling Smorgrav, des@ofug.org 918 */ 919 for (cc = charclasses; cc->cc_name; cc++) 920 if (strncmp((const char *)prestr + 1, 921 (const char *)cc->cc_name, 922 cc->cc_namelen) == 0) 923 break; 924 925 if (cc->cc_name == NULL || 926 prestr[1 + cc->cc_namelen] != ':' || 927 prestr[2 + cc->cc_namelen] != ']') { 928 *bp++ = c; 929 continue; 930 } 931 932 prestr += cc->cc_namelen + 3; 933 /* 934 * BUG: We begin at 1, instead of 0, since we 935 * would otherwise prematurely terminate the 936 * string for classes like [[:cntrl:]]. This 937 * means that we can't match the NUL character, 938 * not without first adapting the entire 939 * program to track each string's length. 940 */ 941 for (i = 1; i < NCHARS; i++) { 942 (void) adjbuf((char **)&buf, &bufsz, 943 bp - buf + 1, 100, (char **)&bp, 944 "relex2"); 945 if (cc->cc_func(i)) { 946 *bp++ = i; 947 n++; 948 } 949 } 950 } else if (c == '\0') { 951 FATAL("nonterminated character class %.20s", 952 lastre); 953 } else if (bp == buf) { /* 1st char is special */ 954 *bp++ = c; 955 } else if (c == ']') { 956 *bp++ = '\0'; 957 rlxstr = (uschar *)tostring((char *)buf); 958 if (cflag == 0) 959 return (CCL); 960 else 961 return (NCCL); 962 } else 963 *bp++ = c; 964 } 965 /*NOTREACHED*/ 966 } 967 return (0); 968 } 969 970 static int 971 cgoto(fa *f, int s, int c) 972 { 973 int i, j, k; 974 int *p, *q; 975 976 assert(c == HAT || c < NCHARS); 977 while (f->accept >= maxsetvec) { /* guessing here! */ 978 growvec("out of space in cgoto()"); 979 } 980 for (i = 0; i <= f->accept; i++) 981 setvec[i] = 0; 982 setcnt = 0; 983 /* compute positions of gototab[s,c] into setvec */ 984 p = f->posns[s]; 985 for (i = 1; i <= *p; i++) { 986 if ((k = f->re[p[i]].ltype) != FINAL) { 987 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) || 988 (k == DOT && c != 0 && c != HAT) || 989 (k == ALL && c != 0) || 990 (k == EMPTYRE && c != 0) || 991 (k == CCL && 992 member(c, (char *)f->re[p[i]].lval.up)) || 993 (k == NCCL && 994 !member(c, (char *)f->re[p[i]].lval.up) && 995 c != 0 && c != HAT)) { 996 q = f->re[p[i]].lfollow; 997 for (j = 1; j <= *q; j++) { 998 if (q[j] >= maxsetvec) { 999 growvec("cgoto overflow"); 1000 } 1001 if (setvec[q[j]] == 0) { 1002 setcnt++; 1003 setvec[q[j]] = 1; 1004 } 1005 } 1006 } 1007 } 1008 } 1009 /* determine if setvec is a previous state */ 1010 tmpset[0] = setcnt; 1011 j = 1; 1012 for (i = f->accept; i >= 0; i--) 1013 if (setvec[i]) { 1014 tmpset[j++] = i; 1015 } 1016 /* tmpset == previous state? */ 1017 for (i = 1; i <= f->curstat; i++) { 1018 p = f->posns[i]; 1019 if ((k = tmpset[0]) != p[0]) 1020 goto different; 1021 for (j = 1; j <= k; j++) 1022 if (tmpset[j] != p[j]) 1023 goto different; 1024 /* setvec is state i */ 1025 f->gototab[s][c] = i; 1026 return (i); 1027 different:; 1028 } 1029 1030 /* add tmpset to current set of states */ 1031 if (f->curstat >= NSTATES-1) { 1032 f->curstat = 2; 1033 f->reset = 1; 1034 for (i = 2; i < NSTATES; i++) 1035 xfree(f->posns[i]); 1036 } else 1037 ++(f->curstat); 1038 for (i = 0; i < NCHARS; i++) 1039 f->gototab[f->curstat][i] = 0; 1040 xfree(f->posns[f->curstat]); 1041 if ((p = (int *)calloc(1, (setcnt + 1) * sizeof (int))) == NULL) 1042 overflo("out of space in cgoto"); 1043 1044 f->posns[f->curstat] = p; 1045 f->gototab[s][c] = f->curstat; 1046 for (i = 0; i <= setcnt; i++) 1047 p[i] = tmpset[i]; 1048 if (setvec[f->accept]) 1049 f->out[f->curstat] = 1; 1050 else 1051 f->out[f->curstat] = 0; 1052 return (f->curstat); 1053 } 1054 1055 static void 1056 freefa(fa *f) /* free a finite automaton */ 1057 { 1058 int i; 1059 1060 if (f == NULL) 1061 return; 1062 for (i = 0; i <= f->curstat; i++) 1063 xfree(f->posns[i]); 1064 for (i = 0; i <= f->accept; i++) { 1065 xfree(f->re[i].lfollow); 1066 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1067 xfree((f->re[i].lval.np)); 1068 } 1069 xfree(f->restr); 1070 xfree(f); 1071 } 1072