1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 /* lasciate ogne speranza, voi ch'intrate. */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #define DEBUG 31 32 #include <ctype.h> 33 #include <limits.h> 34 #include <stdio.h> 35 #include <string.h> 36 #include <stdlib.h> 37 #include "awk.h" 38 #include "awkgram.tab.h" 39 40 #define MAXLIN 22 41 42 #define type(v) (v)->nobj /* badly overloaded here */ 43 #define info(v) (v)->ntype /* badly overloaded here */ 44 #define left(v) (v)->narg[0] 45 #define right(v) (v)->narg[1] 46 #define parent(v) (v)->nnext 47 48 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 49 #define ELEAF case EMPTYRE: /* empty string in regexp */ 50 #define UNARY case STAR: case PLUS: case QUEST: 51 52 /* encoding in tree Nodes: 53 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 54 left is index, right contains value or pointer to value 55 unary (STAR, PLUS, QUEST): left is child, right is null 56 binary (CAT, OR): left and right are children 57 parent contains pointer to parent 58 */ 59 60 61 int *setvec; 62 int *tmpset; 63 int maxsetvec = 0; 64 65 int rtok; /* next token in current re */ 66 int rlxval; 67 static const uschar *rlxstr; 68 static const uschar *prestr; /* current position in current re */ 69 static const uschar *lastre; /* origin of last re */ 70 static const uschar *lastatom; /* origin of last Atom */ 71 static const uschar *starttok; 72 static const uschar *basestr; /* starts with original, replaced during 73 repetition processing */ 74 static const uschar *firstbasestr; 75 76 static int setcnt; 77 static int poscnt; 78 79 const char *patbeg; 80 int patlen; 81 82 #define NFA 128 /* cache this many dynamic fa's */ 83 fa *fatab[NFA]; 84 int nfatab = 0; /* entries in fatab */ 85 86 static int * 87 intalloc(size_t n, const char *f) 88 { 89 int *p = (int *) calloc(n, sizeof(int)); 90 if (p == NULL) 91 overflo(f); 92 return p; 93 } 94 95 static void 96 resizesetvec(const char *f) 97 { 98 if (maxsetvec == 0) 99 maxsetvec = MAXLIN; 100 else 101 maxsetvec *= 4; 102 setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec)); 103 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset)); 104 if (setvec == NULL || tmpset == NULL) 105 overflo(f); 106 } 107 108 static void 109 resize_state(fa *f, int state) 110 { 111 unsigned int **p; 112 uschar *p2; 113 int **p3; 114 int i, new_count; 115 116 if (++state < f->state_count) 117 return; 118 119 new_count = state + 10; /* needs to be tuned */ 120 121 p = (unsigned int **) realloc(f->gototab, new_count * sizeof(f->gototab[0])); 122 if (p == NULL) 123 goto out; 124 f->gototab = p; 125 126 p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0])); 127 if (p2 == NULL) 128 goto out; 129 f->out = p2; 130 131 p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0])); 132 if (p3 == NULL) 133 goto out; 134 f->posns = p3; 135 136 for (i = f->state_count; i < new_count; ++i) { 137 f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab)); 138 if (f->gototab[i] == NULL) 139 goto out; 140 f->out[i] = 0; 141 f->posns[i] = NULL; 142 } 143 f->state_count = new_count; 144 return; 145 out: 146 overflo(__func__); 147 } 148 149 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */ 150 { 151 int i, use, nuse; 152 fa *pfa; 153 static int now = 1; 154 155 if (setvec == NULL) { /* first time through any RE */ 156 resizesetvec(__func__); 157 } 158 159 if (compile_time != RUNNING) /* a constant for sure */ 160 return mkdfa(s, anchor); 161 for (i = 0; i < nfatab; i++) /* is it there already? */ 162 if (fatab[i]->anchor == anchor 163 && strcmp((const char *) fatab[i]->restr, s) == 0) { 164 fatab[i]->use = now++; 165 return fatab[i]; 166 } 167 pfa = mkdfa(s, anchor); 168 if (nfatab < NFA) { /* room for another */ 169 fatab[nfatab] = pfa; 170 fatab[nfatab]->use = now++; 171 nfatab++; 172 return pfa; 173 } 174 use = fatab[0]->use; /* replace least-recently used */ 175 nuse = 0; 176 for (i = 1; i < nfatab; i++) 177 if (fatab[i]->use < use) { 178 use = fatab[i]->use; 179 nuse = i; 180 } 181 freefa(fatab[nuse]); 182 fatab[nuse] = pfa; 183 pfa->use = now++; 184 return pfa; 185 } 186 187 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */ 188 /* anchor = true for anchored matches, else false */ 189 { 190 Node *p, *p1; 191 fa *f; 192 193 firstbasestr = (const uschar *) s; 194 basestr = firstbasestr; 195 p = reparse(s); 196 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 197 /* put ALL STAR in front of reg. exp. */ 198 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 199 /* put FINAL after reg. exp. */ 200 201 poscnt = 0; 202 penter(p1); /* enter parent pointers and leaf indices */ 203 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL) 204 overflo(__func__); 205 f->accept = poscnt-1; /* penter has computed number of positions in re */ 206 cfoll(f, p1); /* set up follow sets */ 207 freetr(p1); 208 resize_state(f, 1); 209 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__); 210 f->posns[1] = intalloc(1, __func__); 211 *f->posns[1] = 0; 212 f->initstat = makeinit(f, anchor); 213 f->anchor = anchor; 214 f->restr = (uschar *) tostring(s); 215 if (firstbasestr != basestr) { 216 if (basestr) 217 xfree(basestr); 218 } 219 return f; 220 } 221 222 int makeinit(fa *f, bool anchor) 223 { 224 int i, k; 225 226 f->curstat = 2; 227 f->out[2] = 0; 228 k = *(f->re[0].lfollow); 229 xfree(f->posns[2]); 230 f->posns[2] = intalloc(k + 1, __func__); 231 for (i = 0; i <= k; i++) { 232 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 233 } 234 if ((f->posns[2])[1] == f->accept) 235 f->out[2] = 1; 236 for (i = 0; i < NCHARS; i++) 237 f->gototab[2][i] = 0; 238 f->curstat = cgoto(f, 2, HAT); 239 if (anchor) { 240 *f->posns[2] = k-1; /* leave out position 0 */ 241 for (i = 0; i < k; i++) { 242 (f->posns[0])[i] = (f->posns[2])[i]; 243 } 244 245 f->out[0] = f->out[2]; 246 if (f->curstat != 2) 247 --(*f->posns[f->curstat]); 248 } 249 return f->curstat; 250 } 251 252 void penter(Node *p) /* set up parent pointers and leaf indices */ 253 { 254 switch (type(p)) { 255 ELEAF 256 LEAF 257 info(p) = poscnt; 258 poscnt++; 259 break; 260 UNARY 261 penter(left(p)); 262 parent(left(p)) = p; 263 break; 264 case CAT: 265 case OR: 266 penter(left(p)); 267 penter(right(p)); 268 parent(left(p)) = p; 269 parent(right(p)) = p; 270 break; 271 case ZERO: 272 break; 273 default: /* can't happen */ 274 FATAL("can't happen: unknown type %d in penter", type(p)); 275 break; 276 } 277 } 278 279 void freetr(Node *p) /* free parse tree */ 280 { 281 switch (type(p)) { 282 ELEAF 283 LEAF 284 xfree(p); 285 break; 286 UNARY 287 case ZERO: 288 freetr(left(p)); 289 xfree(p); 290 break; 291 case CAT: 292 case OR: 293 freetr(left(p)); 294 freetr(right(p)); 295 xfree(p); 296 break; 297 default: /* can't happen */ 298 FATAL("can't happen: unknown type %d in freetr", type(p)); 299 break; 300 } 301 } 302 303 /* in the parsing of regular expressions, metacharacters like . have */ 304 /* to be seen literally; \056 is not a metacharacter. */ 305 306 int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */ 307 { /* only pick up one 8-bit byte (2 chars) */ 308 const uschar *p; 309 int n = 0; 310 int i; 311 312 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) { 313 if (isdigit(*p)) 314 n = 16 * n + *p - '0'; 315 else if (*p >= 'a' && *p <= 'f') 316 n = 16 * n + *p - 'a' + 10; 317 else if (*p >= 'A' && *p <= 'F') 318 n = 16 * n + *p - 'A' + 10; 319 } 320 *pp = p; 321 return n; 322 } 323 324 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 325 326 int quoted(const uschar **pp) /* pick up next thing after a \\ */ 327 /* and increment *pp */ 328 { 329 const uschar *p = *pp; 330 int c; 331 332 if ((c = *p++) == 't') 333 c = '\t'; 334 else if (c == 'n') 335 c = '\n'; 336 else if (c == 'f') 337 c = '\f'; 338 else if (c == 'r') 339 c = '\r'; 340 else if (c == 'b') 341 c = '\b'; 342 else if (c == 'v') 343 c = '\v'; 344 else if (c == 'a') 345 c = '\a'; 346 else if (c == '\\') 347 c = '\\'; 348 else if (c == 'x') { /* hexadecimal goo follows */ 349 c = hexstr(&p); /* this adds a null if number is invalid */ 350 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 351 int n = c - '0'; 352 if (isoctdigit(*p)) { 353 n = 8 * n + *p++ - '0'; 354 if (isoctdigit(*p)) 355 n = 8 * n + *p++ - '0'; 356 } 357 c = n; 358 } /* else */ 359 /* c = c; */ 360 *pp = p; 361 return c; 362 } 363 364 static int collate_range_cmp(int a, int b) 365 { 366 static char s[2][2]; 367 368 if ((uschar)a == (uschar)b) 369 return 0; 370 s[0][0] = a; 371 s[1][0] = b; 372 return (strcoll(s[0], s[1])); 373 } 374 375 char *cclenter(const char *argp) /* add a character class */ 376 { 377 int i, c, c2, j; 378 const uschar *op, *p = (const uschar *) argp; 379 uschar *bp; 380 static uschar *buf = NULL; 381 static int bufsz = 100; 382 383 op = p; 384 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 385 FATAL("out of space for character class [%.10s...] 1", p); 386 bp = buf; 387 for (i = 0; (c = *p++) != 0; ) { 388 if (c == '\\') { 389 c = quoted(&p); 390 } else if (c == '-' && i > 0 && bp[-1] != 0) { 391 if (*p != 0) { 392 c = bp[-1]; 393 c2 = *p++; 394 if (c2 == '\\') 395 c2 = quoted(&p); 396 if (collate_range_cmp(c, c2) > 0) { 397 bp--; 398 i--; 399 continue; 400 } 401 for (j = 0; j < NCHARS; j++) { 402 if ((collate_range_cmp(c, j) > 0) || 403 collate_range_cmp(j, c2) > 0) 404 continue; 405 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) 406 FATAL("out of space for character class [%.10s...] 2", p); 407 *bp++ = j; 408 i++; 409 } 410 continue; 411 } 412 } 413 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) 414 FATAL("out of space for character class [%.10s...] 3", p); 415 *bp++ = c; 416 i++; 417 } 418 *bp = 0; 419 DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); 420 xfree(op); 421 return (char *) tostring((char *) buf); 422 } 423 424 void overflo(const char *s) 425 { 426 FATAL("regular expression too big: out of space in %.30s...", s); 427 } 428 429 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 430 { 431 int i; 432 int *p; 433 434 switch (type(v)) { 435 ELEAF 436 LEAF 437 f->re[info(v)].ltype = type(v); 438 f->re[info(v)].lval.np = right(v); 439 while (f->accept >= maxsetvec) { /* guessing here! */ 440 resizesetvec(__func__); 441 } 442 for (i = 0; i <= f->accept; i++) 443 setvec[i] = 0; 444 setcnt = 0; 445 follow(v); /* computes setvec and setcnt */ 446 p = intalloc(setcnt + 1, __func__); 447 f->re[info(v)].lfollow = p; 448 *p = setcnt; 449 for (i = f->accept; i >= 0; i--) 450 if (setvec[i] == 1) 451 *++p = i; 452 break; 453 UNARY 454 cfoll(f,left(v)); 455 break; 456 case CAT: 457 case OR: 458 cfoll(f,left(v)); 459 cfoll(f,right(v)); 460 break; 461 case ZERO: 462 break; 463 default: /* can't happen */ 464 FATAL("can't happen: unknown type %d in cfoll", type(v)); 465 } 466 } 467 468 int first(Node *p) /* collects initially active leaves of p into setvec */ 469 /* returns 0 if p matches empty string */ 470 { 471 int b, lp; 472 473 switch (type(p)) { 474 ELEAF 475 LEAF 476 lp = info(p); /* look for high-water mark of subscripts */ 477 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 478 resizesetvec(__func__); 479 } 480 if (type(p) == EMPTYRE) { 481 setvec[lp] = 0; 482 return(0); 483 } 484 if (setvec[lp] != 1) { 485 setvec[lp] = 1; 486 setcnt++; 487 } 488 if (type(p) == CCL && (*(char *) right(p)) == '\0') 489 return(0); /* empty CCL */ 490 return(1); 491 case PLUS: 492 if (first(left(p)) == 0) 493 return(0); 494 return(1); 495 case STAR: 496 case QUEST: 497 first(left(p)); 498 return(0); 499 case CAT: 500 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 501 return(1); 502 case OR: 503 b = first(right(p)); 504 if (first(left(p)) == 0 || b == 0) return(0); 505 return(1); 506 case ZERO: 507 return 0; 508 } 509 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 510 return(-1); 511 } 512 513 void follow(Node *v) /* collects leaves that can follow v into setvec */ 514 { 515 Node *p; 516 517 if (type(v) == FINAL) 518 return; 519 p = parent(v); 520 switch (type(p)) { 521 case STAR: 522 case PLUS: 523 first(v); 524 follow(p); 525 return; 526 527 case OR: 528 case QUEST: 529 follow(p); 530 return; 531 532 case CAT: 533 if (v == left(p)) { /* v is left child of p */ 534 if (first(right(p)) == 0) { 535 follow(p); 536 return; 537 } 538 } else /* v is right child */ 539 follow(p); 540 return; 541 } 542 } 543 544 int member(int c, const char *sarg) /* is c in s? */ 545 { 546 const uschar *s = (const uschar *) sarg; 547 548 while (*s) 549 if (c == *s++) 550 return(1); 551 return(0); 552 } 553 554 int match(fa *f, const char *p0) /* shortest match ? */ 555 { 556 int s, ns; 557 const uschar *p = (const uschar *) p0; 558 559 s = f->initstat; 560 assert (s < f->state_count); 561 562 if (f->out[s]) 563 return(1); 564 do { 565 /* assert(*p < NCHARS); */ 566 if ((ns = f->gototab[s][*p]) != 0) 567 s = ns; 568 else 569 s = cgoto(f, s, *p); 570 if (f->out[s]) 571 return(1); 572 } while (*p++ != 0); 573 return(0); 574 } 575 576 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 577 { 578 int s, ns; 579 const uschar *p = (const uschar *) p0; 580 const uschar *q; 581 582 s = f->initstat; 583 assert(s < f->state_count); 584 585 patbeg = (const char *)p; 586 patlen = -1; 587 do { 588 q = p; 589 do { 590 if (f->out[s]) /* final state */ 591 patlen = q-p; 592 /* assert(*q < NCHARS); */ 593 if ((ns = f->gototab[s][*q]) != 0) 594 s = ns; 595 else 596 s = cgoto(f, s, *q); 597 598 assert(s < f->state_count); 599 600 if (s == 1) { /* no transition */ 601 if (patlen >= 0) { 602 patbeg = (const char *) p; 603 return(1); 604 } 605 else 606 goto nextin; /* no match */ 607 } 608 } while (*q++ != 0); 609 if (f->out[s]) 610 patlen = q-p-1; /* don't count $ */ 611 if (patlen >= 0) { 612 patbeg = (const char *) p; 613 return(1); 614 } 615 nextin: 616 s = 2; 617 } while (*p++); 618 return (0); 619 } 620 621 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 622 { 623 int s, ns; 624 const uschar *p = (const uschar *) p0; 625 const uschar *q; 626 627 s = f->initstat; 628 assert(s < f->state_count); 629 630 patbeg = (const char *)p; 631 patlen = -1; 632 while (*p) { 633 q = p; 634 do { 635 if (f->out[s]) /* final state */ 636 patlen = q-p; 637 /* assert(*q < NCHARS); */ 638 if ((ns = f->gototab[s][*q]) != 0) 639 s = ns; 640 else 641 s = cgoto(f, s, *q); 642 if (s == 1) { /* no transition */ 643 if (patlen > 0) { 644 patbeg = (const char *) p; 645 return(1); 646 } else 647 goto nnextin; /* no nonempty match */ 648 } 649 } while (*q++ != 0); 650 if (f->out[s]) 651 patlen = q-p-1; /* don't count $ */ 652 if (patlen > 0 ) { 653 patbeg = (const char *) p; 654 return(1); 655 } 656 nnextin: 657 s = 2; 658 p++; 659 } 660 return (0); 661 } 662 663 664 /* 665 * NAME 666 * fnematch 667 * 668 * DESCRIPTION 669 * A stream-fed version of nematch which transfers characters to a 670 * null-terminated buffer. All characters up to and including the last 671 * character of the matching text or EOF are placed in the buffer. If 672 * a match is found, patbeg and patlen are set appropriately. 673 * 674 * RETURN VALUES 675 * false No match found. 676 * true Match found. 677 */ 678 679 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) 680 { 681 char *buf = *pbuf; 682 int bufsize = *pbufsize; 683 int c, i, j, k, ns, s; 684 685 s = pfa->initstat; 686 patlen = 0; 687 688 /* 689 * All indices relative to buf. 690 * i <= j <= k <= bufsize 691 * 692 * i: origin of active substring 693 * j: current character 694 * k: destination of next getc() 695 */ 696 i = -1, k = 0; 697 do { 698 j = i++; 699 do { 700 if (++j == k) { 701 if (k == bufsize) 702 if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) 703 FATAL("stream '%.30s...' too long", buf); 704 buf[k++] = (c = getc(f)) != EOF ? c : 0; 705 } 706 c = (uschar)buf[j]; 707 /* assert(c < NCHARS); */ 708 709 if ((ns = pfa->gototab[s][c]) != 0) 710 s = ns; 711 else 712 s = cgoto(pfa, s, c); 713 714 if (pfa->out[s]) { /* final state */ 715 patlen = j - i + 1; 716 if (c == 0) /* don't count $ */ 717 patlen--; 718 } 719 } while (buf[j] && s != 1); 720 s = 2; 721 } while (buf[i] && !patlen); 722 723 /* adjbuf() may have relocated a resized buffer. Inform the world. */ 724 *pbuf = buf; 725 *pbufsize = bufsize; 726 727 if (patlen) { 728 patbeg = (char *) buf + i; 729 /* 730 * Under no circumstances is the last character fed to 731 * the automaton part of the match. It is EOF's nullbyte, 732 * or it sent the automaton into a state with no further 733 * transitions available (s==1), or both. Room for a 734 * terminating nullbyte is guaranteed. 735 * 736 * ungetc any chars after the end of matching text 737 * (except for EOF's nullbyte, if present) and null 738 * terminate the buffer. 739 */ 740 do 741 if (buf[--k] && ungetc(buf[k], f) == EOF) 742 FATAL("unable to ungetc '%c'", buf[k]); 743 while (k > i + patlen); 744 buf[k] = '\0'; 745 return true; 746 } 747 else 748 return false; 749 } 750 751 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 752 { /* uses relex() to scan regular expression */ 753 Node *np; 754 755 DPRINTF("reparse <%s>\n", p); 756 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 757 rtok = relex(); 758 /* GNU compatibility: an empty regexp matches anything */ 759 if (rtok == '\0') { 760 /* FATAL("empty regular expression"); previous */ 761 return(op2(EMPTYRE, NIL, NIL)); 762 } 763 np = regexp(); 764 if (rtok != '\0') 765 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 766 return(np); 767 } 768 769 Node *regexp(void) /* top-level parse of reg expr */ 770 { 771 return (alt(concat(primary()))); 772 } 773 774 Node *primary(void) 775 { 776 Node *np; 777 int savelastatom; 778 779 switch (rtok) { 780 case CHAR: 781 lastatom = starttok; 782 np = op2(CHAR, NIL, itonp(rlxval)); 783 rtok = relex(); 784 return (unary(np)); 785 case ALL: 786 rtok = relex(); 787 return (unary(op2(ALL, NIL, NIL))); 788 case EMPTYRE: 789 rtok = relex(); 790 return (unary(op2(EMPTYRE, NIL, NIL))); 791 case DOT: 792 lastatom = starttok; 793 rtok = relex(); 794 return (unary(op2(DOT, NIL, NIL))); 795 case CCL: 796 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 797 lastatom = starttok; 798 rtok = relex(); 799 return (unary(np)); 800 case NCCL: 801 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 802 lastatom = starttok; 803 rtok = relex(); 804 return (unary(np)); 805 case '^': 806 rtok = relex(); 807 return (unary(op2(CHAR, NIL, itonp(HAT)))); 808 case '$': 809 rtok = relex(); 810 return (unary(op2(CHAR, NIL, NIL))); 811 case '(': 812 lastatom = starttok; 813 savelastatom = starttok - basestr; /* Retain over recursion */ 814 rtok = relex(); 815 if (rtok == ')') { /* special pleading for () */ 816 rtok = relex(); 817 return unary(op2(CCL, NIL, (Node *) tostring(""))); 818 } 819 np = regexp(); 820 if (rtok == ')') { 821 lastatom = basestr + savelastatom; /* Restore */ 822 rtok = relex(); 823 return (unary(np)); 824 } 825 else 826 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 827 default: 828 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 829 } 830 return 0; /*NOTREACHED*/ 831 } 832 833 Node *concat(Node *np) 834 { 835 switch (rtok) { 836 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 837 return (concat(op2(CAT, np, primary()))); 838 case EMPTYRE: 839 rtok = relex(); 840 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")), 841 primary()))); 842 } 843 return (np); 844 } 845 846 Node *alt(Node *np) 847 { 848 if (rtok == OR) { 849 rtok = relex(); 850 return (alt(op2(OR, np, concat(primary())))); 851 } 852 return (np); 853 } 854 855 Node *unary(Node *np) 856 { 857 switch (rtok) { 858 case STAR: 859 rtok = relex(); 860 return (unary(op2(STAR, np, NIL))); 861 case PLUS: 862 rtok = relex(); 863 return (unary(op2(PLUS, np, NIL))); 864 case QUEST: 865 rtok = relex(); 866 return (unary(op2(QUEST, np, NIL))); 867 case ZERO: 868 rtok = relex(); 869 return (unary(op2(ZERO, np, NIL))); 870 default: 871 return (np); 872 } 873 } 874 875 /* 876 * Character class definitions conformant to the POSIX locale as 877 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 878 * and operating character sets are both ASCII (ISO646) or supersets 879 * thereof. 880 * 881 * Note that to avoid overflowing the temporary buffer used in 882 * relex(), the expanded character class (prior to range expansion) 883 * must be less than twice the size of their full name. 884 */ 885 886 /* Because isblank doesn't show up in any of the header files on any 887 * system i use, it's defined here. if some other locale has a richer 888 * definition of "blank", define HAS_ISBLANK and provide your own 889 * version. 890 * the parentheses here are an attempt to find a path through the maze 891 * of macro definition and/or function and/or version provided. thanks 892 * to nelson beebe for the suggestion; let's see if it works everywhere. 893 */ 894 895 /* #define HAS_ISBLANK */ 896 #ifndef HAS_ISBLANK 897 898 int (xisblank)(int c) 899 { 900 return c==' ' || c=='\t'; 901 } 902 903 #endif 904 905 static const struct charclass { 906 const char *cc_name; 907 int cc_namelen; 908 int (*cc_func)(int); 909 } charclasses[] = { 910 { "alnum", 5, isalnum }, 911 { "alpha", 5, isalpha }, 912 #ifndef HAS_ISBLANK 913 { "blank", 5, xisblank }, 914 #else 915 { "blank", 5, isblank }, 916 #endif 917 { "cntrl", 5, iscntrl }, 918 { "digit", 5, isdigit }, 919 { "graph", 5, isgraph }, 920 { "lower", 5, islower }, 921 { "print", 5, isprint }, 922 { "punct", 5, ispunct }, 923 { "space", 5, isspace }, 924 { "upper", 5, isupper }, 925 { "xdigit", 6, isxdigit }, 926 { NULL, 0, NULL }, 927 }; 928 929 #define REPEAT_SIMPLE 0 930 #define REPEAT_PLUS_APPENDED 1 931 #define REPEAT_WITH_Q 2 932 #define REPEAT_ZERO 3 933 934 static int 935 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 936 int atomlen, int firstnum, int secondnum, int special_case) 937 { 938 int i, j; 939 uschar *buf = 0; 940 int ret = 1; 941 int init_q = (firstnum == 0); /* first added char will be ? */ 942 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 943 int prefix_length = reptok - basestr; /* prefix includes first rep */ 944 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */ 945 int size = prefix_length + suffix_length; 946 947 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 948 size += atomlen*(firstnum-1); 949 } 950 951 /* Adjust size of buffer for special cases */ 952 if (special_case == REPEAT_PLUS_APPENDED) { 953 size++; /* for the final + */ 954 } else if (special_case == REPEAT_WITH_Q) { 955 size += init_q + (atomlen+1)* n_q_reps; 956 } else if (special_case == REPEAT_ZERO) { 957 size += 2; /* just a null ERE: () */ 958 } 959 if ((buf = (uschar *) malloc(size + 1)) == NULL) 960 FATAL("out of space in reg expr %.10s..", lastre); 961 memcpy(buf, basestr, prefix_length); /* copy prefix */ 962 j = prefix_length; 963 if (special_case == REPEAT_ZERO) { 964 j -= atomlen; 965 buf[j++] = '('; 966 buf[j++] = ')'; 967 } 968 for (i = 1; i < firstnum; i++) { /* copy x reps */ 969 memcpy(&buf[j], atom, atomlen); 970 j += atomlen; 971 } 972 if (special_case == REPEAT_PLUS_APPENDED) { 973 buf[j++] = '+'; 974 } else if (special_case == REPEAT_WITH_Q) { 975 if (init_q) 976 buf[j++] = '?'; 977 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */ 978 memcpy(&buf[j], atom, atomlen); 979 j += atomlen; 980 buf[j++] = '?'; 981 } 982 } 983 memcpy(&buf[j], reptok+reptoklen, suffix_length); 984 if (special_case == REPEAT_ZERO) { 985 buf[j+suffix_length] = '\0'; 986 } else { 987 buf[size] = '\0'; 988 } 989 /* free old basestr */ 990 if (firstbasestr != basestr) { 991 if (basestr) 992 xfree(basestr); 993 } 994 basestr = buf; 995 prestr = buf + prefix_length; 996 if (special_case == REPEAT_ZERO) { 997 prestr -= atomlen; 998 ret++; 999 } 1000 return ret; 1001 } 1002 1003 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1004 int atomlen, int firstnum, int secondnum) 1005 { 1006 /* 1007 In general, the repetition specifier or "bound" is replaced here 1008 by an equivalent ERE string, repeating the immediately previous atom 1009 and appending ? and + as needed. Note that the first copy of the 1010 atom is left in place, except in the special_case of a zero-repeat 1011 (i.e., {0}). 1012 */ 1013 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 1014 if (firstnum < 2) { 1015 /* 0 or 1: should be handled before you get here */ 1016 FATAL("internal error"); 1017 } else { 1018 return replace_repeat(reptok, reptoklen, atom, atomlen, 1019 firstnum, secondnum, REPEAT_PLUS_APPENDED); 1020 } 1021 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 1022 if (firstnum == 0) { /* {0} or {0,0} */ 1023 /* This case is unusual because the resulting 1024 replacement string might actually be SMALLER than 1025 the original ERE */ 1026 return replace_repeat(reptok, reptoklen, atom, atomlen, 1027 firstnum, secondnum, REPEAT_ZERO); 1028 } else { /* (firstnum >= 1) */ 1029 return replace_repeat(reptok, reptoklen, atom, atomlen, 1030 firstnum, secondnum, REPEAT_SIMPLE); 1031 } 1032 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 1033 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 1034 return replace_repeat(reptok, reptoklen, atom, atomlen, 1035 firstnum, secondnum, REPEAT_WITH_Q); 1036 } else { /* Error - shouldn't be here (n>m) */ 1037 FATAL("internal error"); 1038 } 1039 return 0; 1040 } 1041 1042 int relex(void) /* lexical analyzer for reparse */ 1043 { 1044 int c, n; 1045 int cflag; 1046 static uschar *buf = NULL; 1047 static int bufsz = 100; 1048 uschar *bp; 1049 const struct charclass *cc; 1050 int i; 1051 int num, m; 1052 bool commafound, digitfound; 1053 const uschar *startreptok; 1054 static int parens = 0; 1055 1056 rescan: 1057 starttok = prestr; 1058 1059 switch (c = *prestr++) { 1060 case '|': return OR; 1061 case '*': return STAR; 1062 case '+': return PLUS; 1063 case '?': return QUEST; 1064 case '.': return DOT; 1065 case '\0': prestr--; return '\0'; 1066 case '^': 1067 case '$': 1068 return c; 1069 case '(': 1070 parens++; 1071 return c; 1072 case ')': 1073 if (parens) { 1074 parens--; 1075 return c; 1076 } 1077 /* unmatched close parenthesis; per POSIX, treat as literal */ 1078 rlxval = c; 1079 return CHAR; 1080 case '\\': 1081 rlxval = quoted(&prestr); 1082 return CHAR; 1083 default: 1084 rlxval = c; 1085 return CHAR; 1086 case '[': 1087 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 1088 FATAL("out of space in reg expr %.10s..", lastre); 1089 bp = buf; 1090 if (*prestr == '^') { 1091 cflag = 1; 1092 prestr++; 1093 } 1094 else 1095 cflag = 0; 1096 n = 2 * strlen((const char *) prestr)+1; 1097 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 1098 FATAL("out of space for reg expr %.10s...", lastre); 1099 for (; ; ) { 1100 if ((c = *prestr++) == '\\') { 1101 *bp++ = '\\'; 1102 if ((c = *prestr++) == '\0') 1103 FATAL("nonterminated character class %.20s...", lastre); 1104 *bp++ = c; 1105 /* } else if (c == '\n') { */ 1106 /* FATAL("newline in character class %.20s...", lastre); */ 1107 } else if (c == '[' && *prestr == ':') { 1108 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1109 for (cc = charclasses; cc->cc_name; cc++) 1110 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1111 break; 1112 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1113 prestr[2 + cc->cc_namelen] == ']') { 1114 prestr += cc->cc_namelen + 3; 1115 /* 1116 * BUG: We begin at 1, instead of 0, since we 1117 * would otherwise prematurely terminate the 1118 * string for classes like [[:cntrl:]]. This 1119 * means that we can't match the NUL character, 1120 * not without first adapting the entire 1121 * program to track each string's length. 1122 */ 1123 for (i = 1; i <= UCHAR_MAX; i++) { 1124 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) 1125 FATAL("out of space for reg expr %.10s...", lastre); 1126 if (cc->cc_func(i)) { 1127 /* escape backslash */ 1128 if (i == '\\') { 1129 *bp++ = '\\'; 1130 n++; 1131 } 1132 1133 *bp++ = i; 1134 n++; 1135 } 1136 } 1137 } else 1138 *bp++ = c; 1139 } else if (c == '[' && *prestr == '.') { 1140 char collate_char; 1141 prestr++; 1142 collate_char = *prestr++; 1143 if (*prestr == '.' && prestr[1] == ']') { 1144 prestr += 2; 1145 /* Found it: map via locale TBD: for 1146 now, simply return this char. This 1147 is sufficient to pass conformance 1148 test awk.ex 156 1149 */ 1150 if (*prestr == ']') { 1151 prestr++; 1152 rlxval = collate_char; 1153 return CHAR; 1154 } 1155 } 1156 } else if (c == '[' && *prestr == '=') { 1157 char equiv_char; 1158 prestr++; 1159 equiv_char = *prestr++; 1160 if (*prestr == '=' && prestr[1] == ']') { 1161 prestr += 2; 1162 /* Found it: map via locale TBD: for now 1163 simply return this char. This is 1164 sufficient to pass conformance test 1165 awk.ex 156 1166 */ 1167 if (*prestr == ']') { 1168 prestr++; 1169 rlxval = equiv_char; 1170 return CHAR; 1171 } 1172 } 1173 } else if (c == '\0') { 1174 FATAL("nonterminated character class %.20s", lastre); 1175 } else if (bp == buf) { /* 1st char is special */ 1176 *bp++ = c; 1177 } else if (c == ']') { 1178 *bp++ = 0; 1179 rlxstr = (uschar *) tostring((char *) buf); 1180 if (cflag == 0) 1181 return CCL; 1182 else 1183 return NCCL; 1184 } else 1185 *bp++ = c; 1186 } 1187 break; 1188 case '{': 1189 if (isdigit(*(prestr))) { 1190 num = 0; /* Process as a repetition */ 1191 n = -1; m = -1; 1192 commafound = false; 1193 digitfound = false; 1194 startreptok = prestr-1; 1195 /* Remember start of previous atom here ? */ 1196 } else { /* just a { char, not a repetition */ 1197 rlxval = c; 1198 return CHAR; 1199 } 1200 for (; ; ) { 1201 if ((c = *prestr++) == '}') { 1202 if (commafound) { 1203 if (digitfound) { /* {n,m} */ 1204 m = num; 1205 if (m < n) 1206 FATAL("illegal repetition expression: class %.20s", 1207 lastre); 1208 if (n == 0 && m == 1) { 1209 return QUEST; 1210 } 1211 } else { /* {n,} */ 1212 if (n == 0) 1213 return STAR; 1214 else if (n == 1) 1215 return PLUS; 1216 } 1217 } else { 1218 if (digitfound) { /* {n} same as {n,n} */ 1219 n = num; 1220 m = num; 1221 } else { /* {} */ 1222 FATAL("illegal repetition expression: class %.20s", 1223 lastre); 1224 } 1225 } 1226 if (repeat(starttok, prestr-starttok, lastatom, 1227 startreptok - lastatom, n, m) > 0) { 1228 if (n == 0 && m == 0) { 1229 return ZERO; 1230 } 1231 /* must rescan input for next token */ 1232 goto rescan; 1233 } 1234 /* Failed to replace: eat up {...} characters 1235 and treat like just PLUS */ 1236 return PLUS; 1237 } else if (c == '\0') { 1238 FATAL("nonterminated character class %.20s", 1239 lastre); 1240 } else if (isdigit(c)) { 1241 num = 10 * num + c - '0'; 1242 digitfound = true; 1243 } else if (c == ',') { 1244 if (commafound) 1245 FATAL("illegal repetition expression: class %.20s", 1246 lastre); 1247 /* looking for {n,} or {n,m} */ 1248 commafound = true; 1249 n = num; 1250 digitfound = false; /* reset */ 1251 num = 0; 1252 } else { 1253 FATAL("illegal repetition expression: class %.20s", 1254 lastre); 1255 } 1256 } 1257 break; 1258 } 1259 } 1260 1261 int cgoto(fa *f, int s, int c) 1262 { 1263 int *p, *q; 1264 int i, j, k; 1265 1266 assert(c == HAT || c < NCHARS); 1267 while (f->accept >= maxsetvec) { /* guessing here! */ 1268 resizesetvec(__func__); 1269 } 1270 for (i = 0; i <= f->accept; i++) 1271 setvec[i] = 0; 1272 setcnt = 0; 1273 resize_state(f, s); 1274 /* compute positions of gototab[s,c] into setvec */ 1275 p = f->posns[s]; 1276 for (i = 1; i <= *p; i++) { 1277 if ((k = f->re[p[i]].ltype) != FINAL) { 1278 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1279 || (k == DOT && c != 0 && c != HAT) 1280 || (k == ALL && c != 0) 1281 || (k == EMPTYRE && c != 0) 1282 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 1283 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 1284 q = f->re[p[i]].lfollow; 1285 for (j = 1; j <= *q; j++) { 1286 if (q[j] >= maxsetvec) { 1287 resizesetvec(__func__); 1288 } 1289 if (setvec[q[j]] == 0) { 1290 setcnt++; 1291 setvec[q[j]] = 1; 1292 } 1293 } 1294 } 1295 } 1296 } 1297 /* determine if setvec is a previous state */ 1298 tmpset[0] = setcnt; 1299 j = 1; 1300 for (i = f->accept; i >= 0; i--) 1301 if (setvec[i]) { 1302 tmpset[j++] = i; 1303 } 1304 resize_state(f, f->curstat > s ? f->curstat : s); 1305 /* tmpset == previous state? */ 1306 for (i = 1; i <= f->curstat; i++) { 1307 p = f->posns[i]; 1308 if ((k = tmpset[0]) != p[0]) 1309 goto different; 1310 for (j = 1; j <= k; j++) 1311 if (tmpset[j] != p[j]) 1312 goto different; 1313 /* setvec is state i */ 1314 if (c != HAT) 1315 f->gototab[s][c] = i; 1316 return i; 1317 different:; 1318 } 1319 1320 /* add tmpset to current set of states */ 1321 ++(f->curstat); 1322 resize_state(f, f->curstat); 1323 for (i = 0; i < NCHARS; i++) 1324 f->gototab[f->curstat][i] = 0; 1325 xfree(f->posns[f->curstat]); 1326 p = intalloc(setcnt + 1, __func__); 1327 1328 f->posns[f->curstat] = p; 1329 if (c != HAT) 1330 f->gototab[s][c] = f->curstat; 1331 for (i = 0; i <= setcnt; i++) 1332 p[i] = tmpset[i]; 1333 if (setvec[f->accept]) 1334 f->out[f->curstat] = 1; 1335 else 1336 f->out[f->curstat] = 0; 1337 return f->curstat; 1338 } 1339 1340 1341 void freefa(fa *f) /* free a finite automaton */ 1342 { 1343 int i; 1344 1345 if (f == NULL) 1346 return; 1347 for (i = 0; i < f->state_count; i++) 1348 xfree(f->gototab[i]) 1349 for (i = 0; i <= f->curstat; i++) 1350 xfree(f->posns[i]); 1351 for (i = 0; i <= f->accept; i++) { 1352 xfree(f->re[i].lfollow); 1353 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1354 xfree(f->re[i].lval.np); 1355 } 1356 xfree(f->restr); 1357 xfree(f->out); 1358 xfree(f->posns); 1359 xfree(f->gototab); 1360 xfree(f); 1361 } 1362