1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 /* lasciate ogne speranza, voi ch'intrate. */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #define DEBUG 31 32 #include <ctype.h> 33 #include <limits.h> 34 #include <stdio.h> 35 #include <string.h> 36 #include <stdlib.h> 37 #include "awk.h" 38 #include "awkgram.tab.h" 39 40 #define MAXLIN 22 41 42 #define type(v) (v)->nobj /* badly overloaded here */ 43 #define info(v) (v)->ntype /* badly overloaded here */ 44 #define left(v) (v)->narg[0] 45 #define right(v) (v)->narg[1] 46 #define parent(v) (v)->nnext 47 48 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 49 #define ELEAF case EMPTYRE: /* empty string in regexp */ 50 #define UNARY case STAR: case PLUS: case QUEST: 51 52 /* encoding in tree Nodes: 53 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 54 left is index, right contains value or pointer to value 55 unary (STAR, PLUS, QUEST): left is child, right is null 56 binary (CAT, OR): left and right are children 57 parent contains pointer to parent 58 */ 59 60 61 int *setvec; 62 int *tmpset; 63 int maxsetvec = 0; 64 65 int rtok; /* next token in current re */ 66 int rlxval; 67 static const uschar *rlxstr; 68 static const uschar *prestr; /* current position in current re */ 69 static const uschar *lastre; /* origin of last re */ 70 static const uschar *lastatom; /* origin of last Atom */ 71 static const uschar *starttok; 72 static const uschar *basestr; /* starts with original, replaced during 73 repetition processing */ 74 static const uschar *firstbasestr; 75 76 static int setcnt; 77 static int poscnt; 78 79 const char *patbeg; 80 int patlen; 81 82 #define NFA 128 /* cache this many dynamic fa's */ 83 fa *fatab[NFA]; 84 int nfatab = 0; /* entries in fatab */ 85 86 static int * 87 intalloc(size_t n, const char *f) 88 { 89 int *p = (int *) calloc(n, sizeof(int)); 90 if (p == NULL) 91 overflo(f); 92 return p; 93 } 94 95 static void 96 resizesetvec(const char *f) 97 { 98 if (maxsetvec == 0) 99 maxsetvec = MAXLIN; 100 else 101 maxsetvec *= 4; 102 setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec)); 103 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset)); 104 if (setvec == NULL || tmpset == NULL) 105 overflo(f); 106 } 107 108 static void 109 resize_state(fa *f, int state) 110 { 111 unsigned int **p; 112 uschar *p2; 113 int **p3; 114 int i, new_count; 115 116 if (++state < f->state_count) 117 return; 118 119 new_count = state + 10; /* needs to be tuned */ 120 121 p = (unsigned int **) realloc(f->gototab, new_count * sizeof(f->gototab[0])); 122 if (p == NULL) 123 goto out; 124 f->gototab = p; 125 126 p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0])); 127 if (p2 == NULL) 128 goto out; 129 f->out = p2; 130 131 p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0])); 132 if (p3 == NULL) 133 goto out; 134 f->posns = p3; 135 136 for (i = f->state_count; i < new_count; ++i) { 137 f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab)); 138 if (f->gototab[i] == NULL) 139 goto out; 140 f->out[i] = 0; 141 f->posns[i] = NULL; 142 } 143 f->state_count = new_count; 144 return; 145 out: 146 overflo(__func__); 147 } 148 149 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */ 150 { 151 int i, use, nuse; 152 fa *pfa; 153 static int now = 1; 154 155 if (setvec == NULL) { /* first time through any RE */ 156 resizesetvec(__func__); 157 } 158 159 if (compile_time != RUNNING) /* a constant for sure */ 160 return mkdfa(s, anchor); 161 for (i = 0; i < nfatab; i++) /* is it there already? */ 162 if (fatab[i]->anchor == anchor 163 && strcmp((const char *) fatab[i]->restr, s) == 0) { 164 fatab[i]->use = now++; 165 return fatab[i]; 166 } 167 pfa = mkdfa(s, anchor); 168 if (nfatab < NFA) { /* room for another */ 169 fatab[nfatab] = pfa; 170 fatab[nfatab]->use = now++; 171 nfatab++; 172 return pfa; 173 } 174 use = fatab[0]->use; /* replace least-recently used */ 175 nuse = 0; 176 for (i = 1; i < nfatab; i++) 177 if (fatab[i]->use < use) { 178 use = fatab[i]->use; 179 nuse = i; 180 } 181 freefa(fatab[nuse]); 182 fatab[nuse] = pfa; 183 pfa->use = now++; 184 return pfa; 185 } 186 187 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */ 188 /* anchor = true for anchored matches, else false */ 189 { 190 Node *p, *p1; 191 fa *f; 192 193 firstbasestr = (const uschar *) s; 194 basestr = firstbasestr; 195 p = reparse(s); 196 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 197 /* put ALL STAR in front of reg. exp. */ 198 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 199 /* put FINAL after reg. exp. */ 200 201 poscnt = 0; 202 penter(p1); /* enter parent pointers and leaf indices */ 203 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL) 204 overflo(__func__); 205 f->accept = poscnt-1; /* penter has computed number of positions in re */ 206 cfoll(f, p1); /* set up follow sets */ 207 freetr(p1); 208 resize_state(f, 1); 209 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__); 210 f->posns[1] = intalloc(1, __func__); 211 *f->posns[1] = 0; 212 f->initstat = makeinit(f, anchor); 213 f->anchor = anchor; 214 f->restr = (uschar *) tostring(s); 215 if (firstbasestr != basestr) { 216 if (basestr) 217 xfree(basestr); 218 } 219 return f; 220 } 221 222 int makeinit(fa *f, bool anchor) 223 { 224 int i, k; 225 226 f->curstat = 2; 227 f->out[2] = 0; 228 k = *(f->re[0].lfollow); 229 xfree(f->posns[2]); 230 f->posns[2] = intalloc(k + 1, __func__); 231 for (i = 0; i <= k; i++) { 232 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 233 } 234 if ((f->posns[2])[1] == f->accept) 235 f->out[2] = 1; 236 for (i = 0; i < NCHARS; i++) 237 f->gototab[2][i] = 0; 238 f->curstat = cgoto(f, 2, HAT); 239 if (anchor) { 240 *f->posns[2] = k-1; /* leave out position 0 */ 241 for (i = 0; i < k; i++) { 242 (f->posns[0])[i] = (f->posns[2])[i]; 243 } 244 245 f->out[0] = f->out[2]; 246 if (f->curstat != 2) 247 --(*f->posns[f->curstat]); 248 } 249 return f->curstat; 250 } 251 252 void penter(Node *p) /* set up parent pointers and leaf indices */ 253 { 254 switch (type(p)) { 255 ELEAF 256 LEAF 257 info(p) = poscnt; 258 poscnt++; 259 break; 260 UNARY 261 penter(left(p)); 262 parent(left(p)) = p; 263 break; 264 case CAT: 265 case OR: 266 penter(left(p)); 267 penter(right(p)); 268 parent(left(p)) = p; 269 parent(right(p)) = p; 270 break; 271 case ZERO: 272 break; 273 default: /* can't happen */ 274 FATAL("can't happen: unknown type %d in penter", type(p)); 275 break; 276 } 277 } 278 279 void freetr(Node *p) /* free parse tree */ 280 { 281 switch (type(p)) { 282 ELEAF 283 LEAF 284 xfree(p); 285 break; 286 UNARY 287 case ZERO: 288 freetr(left(p)); 289 xfree(p); 290 break; 291 case CAT: 292 case OR: 293 freetr(left(p)); 294 freetr(right(p)); 295 xfree(p); 296 break; 297 default: /* can't happen */ 298 FATAL("can't happen: unknown type %d in freetr", type(p)); 299 break; 300 } 301 } 302 303 /* in the parsing of regular expressions, metacharacters like . have */ 304 /* to be seen literally; \056 is not a metacharacter. */ 305 306 int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */ 307 { /* only pick up one 8-bit byte (2 chars) */ 308 const uschar *p; 309 int n = 0; 310 int i; 311 312 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) { 313 if (isdigit(*p)) 314 n = 16 * n + *p - '0'; 315 else if (*p >= 'a' && *p <= 'f') 316 n = 16 * n + *p - 'a' + 10; 317 else if (*p >= 'A' && *p <= 'F') 318 n = 16 * n + *p - 'A' + 10; 319 } 320 *pp = p; 321 return n; 322 } 323 324 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 325 326 int quoted(const uschar **pp) /* pick up next thing after a \\ */ 327 /* and increment *pp */ 328 { 329 const uschar *p = *pp; 330 int c; 331 332 if ((c = *p++) == 't') 333 c = '\t'; 334 else if (c == 'n') 335 c = '\n'; 336 else if (c == 'f') 337 c = '\f'; 338 else if (c == 'r') 339 c = '\r'; 340 else if (c == 'b') 341 c = '\b'; 342 else if (c == 'v') 343 c = '\v'; 344 else if (c == 'a') 345 c = '\a'; 346 else if (c == '\\') 347 c = '\\'; 348 else if (c == 'x') { /* hexadecimal goo follows */ 349 c = hexstr(&p); /* this adds a null if number is invalid */ 350 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 351 int n = c - '0'; 352 if (isoctdigit(*p)) { 353 n = 8 * n + *p++ - '0'; 354 if (isoctdigit(*p)) 355 n = 8 * n + *p++ - '0'; 356 } 357 c = n; 358 } /* else */ 359 /* c = c; */ 360 *pp = p; 361 return c; 362 } 363 364 static int collate_range_cmp(int a, int b) 365 { 366 static char s[2][2]; 367 368 if ((uschar)a == (uschar)b) 369 return 0; 370 s[0][0] = a; 371 s[1][0] = b; 372 return (strcoll(s[0], s[1])); 373 } 374 375 char *cclenter(const char *argp) /* add a character class */ 376 { 377 int i, c, c2, j; 378 const uschar *op, *p = (const uschar *) argp; 379 uschar *bp; 380 static uschar *buf = NULL; 381 static int bufsz = 100; 382 383 op = p; 384 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 385 FATAL("out of space for character class [%.10s...] 1", p); 386 bp = buf; 387 for (i = 0; (c = *p++) != 0; ) { 388 if (c == '\\') { 389 c = quoted(&p); 390 } else if (c == '-' && i > 0 && bp[-1] != 0) { 391 if (*p != 0) { 392 c = bp[-1]; 393 c2 = *p++; 394 if (c2 == '\\') 395 c2 = quoted(&p); 396 if (collate_range_cmp(c, c2) > 0) { 397 bp--; 398 i--; 399 continue; 400 } 401 for (j = 0; j < NCHARS; j++) { 402 if ((collate_range_cmp(c, j) > 0) || 403 collate_range_cmp(j, c2) > 0) 404 continue; 405 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) 406 FATAL("out of space for character class [%.10s...] 2", p); 407 *bp++ = j; 408 i++; 409 } 410 continue; 411 } 412 } 413 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) 414 FATAL("out of space for character class [%.10s...] 3", p); 415 *bp++ = c; 416 i++; 417 } 418 *bp = 0; 419 DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); 420 xfree(op); 421 return (char *) tostring((char *) buf); 422 } 423 424 void overflo(const char *s) 425 { 426 FATAL("regular expression too big: out of space in %.30s...", s); 427 } 428 429 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 430 { 431 int i; 432 int *p; 433 434 switch (type(v)) { 435 ELEAF 436 LEAF 437 f->re[info(v)].ltype = type(v); 438 f->re[info(v)].lval.np = right(v); 439 while (f->accept >= maxsetvec) { /* guessing here! */ 440 resizesetvec(__func__); 441 } 442 for (i = 0; i <= f->accept; i++) 443 setvec[i] = 0; 444 setcnt = 0; 445 follow(v); /* computes setvec and setcnt */ 446 p = intalloc(setcnt + 1, __func__); 447 f->re[info(v)].lfollow = p; 448 *p = setcnt; 449 for (i = f->accept; i >= 0; i--) 450 if (setvec[i] == 1) 451 *++p = i; 452 break; 453 UNARY 454 cfoll(f,left(v)); 455 break; 456 case CAT: 457 case OR: 458 cfoll(f,left(v)); 459 cfoll(f,right(v)); 460 break; 461 case ZERO: 462 break; 463 default: /* can't happen */ 464 FATAL("can't happen: unknown type %d in cfoll", type(v)); 465 } 466 } 467 468 int first(Node *p) /* collects initially active leaves of p into setvec */ 469 /* returns 0 if p matches empty string */ 470 { 471 int b, lp; 472 473 switch (type(p)) { 474 ELEAF 475 LEAF 476 lp = info(p); /* look for high-water mark of subscripts */ 477 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 478 resizesetvec(__func__); 479 } 480 if (type(p) == EMPTYRE) { 481 setvec[lp] = 0; 482 return(0); 483 } 484 if (setvec[lp] != 1) { 485 setvec[lp] = 1; 486 setcnt++; 487 } 488 if (type(p) == CCL && (*(char *) right(p)) == '\0') 489 return(0); /* empty CCL */ 490 return(1); 491 case PLUS: 492 if (first(left(p)) == 0) 493 return(0); 494 return(1); 495 case STAR: 496 case QUEST: 497 first(left(p)); 498 return(0); 499 case CAT: 500 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 501 return(1); 502 case OR: 503 b = first(right(p)); 504 if (first(left(p)) == 0 || b == 0) return(0); 505 return(1); 506 case ZERO: 507 return 0; 508 } 509 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 510 return(-1); 511 } 512 513 void follow(Node *v) /* collects leaves that can follow v into setvec */ 514 { 515 Node *p; 516 517 if (type(v) == FINAL) 518 return; 519 p = parent(v); 520 switch (type(p)) { 521 case STAR: 522 case PLUS: 523 first(v); 524 follow(p); 525 return; 526 527 case OR: 528 case QUEST: 529 follow(p); 530 return; 531 532 case CAT: 533 if (v == left(p)) { /* v is left child of p */ 534 if (first(right(p)) == 0) { 535 follow(p); 536 return; 537 } 538 } else /* v is right child */ 539 follow(p); 540 return; 541 } 542 } 543 544 int member(int c, const char *sarg) /* is c in s? */ 545 { 546 const uschar *s = (const uschar *) sarg; 547 548 while (*s) 549 if (c == *s++) 550 return(1); 551 return(0); 552 } 553 554 int match(fa *f, const char *p0) /* shortest match ? */ 555 { 556 int s, ns; 557 const uschar *p = (const uschar *) p0; 558 559 s = f->initstat; 560 assert (s < f->state_count); 561 562 if (f->out[s]) 563 return(1); 564 do { 565 /* assert(*p < NCHARS); */ 566 if ((ns = f->gototab[s][*p]) != 0) 567 s = ns; 568 else 569 s = cgoto(f, s, *p); 570 if (f->out[s]) 571 return(1); 572 } while (*p++ != 0); 573 return(0); 574 } 575 576 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 577 { 578 int s, ns; 579 const uschar *p = (const uschar *) p0; 580 const uschar *q; 581 582 s = f->initstat; 583 assert(s < f->state_count); 584 585 patbeg = (const char *)p; 586 patlen = -1; 587 do { 588 q = p; 589 do { 590 if (f->out[s]) /* final state */ 591 patlen = q-p; 592 /* assert(*q < NCHARS); */ 593 if ((ns = f->gototab[s][*q]) != 0) 594 s = ns; 595 else 596 s = cgoto(f, s, *q); 597 598 assert(s < f->state_count); 599 600 if (s == 1) { /* no transition */ 601 if (patlen >= 0) { 602 patbeg = (const char *) p; 603 return(1); 604 } 605 else 606 goto nextin; /* no match */ 607 } 608 } while (*q++ != 0); 609 if (f->out[s]) 610 patlen = q-p-1; /* don't count $ */ 611 if (patlen >= 0) { 612 patbeg = (const char *) p; 613 return(1); 614 } 615 nextin: 616 s = 2; 617 #if 0 /* XXX */ 618 if (f->reset) { 619 for (i = 2; i <= f->curstat; i++) 620 n xfree(f->posns[i]); 621 k = *f->posns[0]; 622 if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL) 623 overflo("out of space in pmatch"); 624 for (i = 0; i <= k; i++) 625 (f->posns[2])[i] = (f->posns[0])[i]; 626 f->initstat = f->curstat = 2; 627 f->out[2] = f->out[0]; 628 for (i = 0; i < NCHARS; i++) 629 f->gototab[2][i] = 0; 630 } 631 #endif 632 } while (*p++ != 0); 633 return (0); 634 } 635 636 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 637 { 638 int s, ns; 639 const uschar *p = (const uschar *) p0; 640 const uschar *q; 641 642 s = f->initstat; 643 assert(s < f->state_count); 644 645 patbeg = (const char *)p; 646 patlen = -1; 647 while (*p) { 648 q = p; 649 do { 650 if (f->out[s]) /* final state */ 651 patlen = q-p; 652 /* assert(*q < NCHARS); */ 653 if ((ns = f->gototab[s][*q]) != 0) 654 s = ns; 655 else 656 s = cgoto(f, s, *q); 657 if (s == 1) { /* no transition */ 658 if (patlen > 0) { 659 patbeg = (const char *) p; 660 return(1); 661 } else 662 goto nnextin; /* no nonempty match */ 663 } 664 } while (*q++ != 0); 665 if (f->out[s]) 666 patlen = q-p-1; /* don't count $ */ 667 if (patlen > 0 ) { 668 patbeg = (const char *) p; 669 return(1); 670 } 671 nnextin: 672 s = 2; 673 #if 0 /* XXX */ 674 if (f->reset) { 675 for (i = 2; i <= f->curstat; i++) 676 xfree(f->posns[i]); 677 k = *f->posns[0]; 678 if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL) 679 overflo("out of state space"); 680 for (i = 0; i <= k; i++) 681 (f->posns[2])[i] = (f->posns[0])[i]; 682 f->initstat = f->curstat = 2; 683 f->out[2] = f->out[0]; 684 for (i = 0; i < NCHARS; i++) 685 f->gototab[2][i] = 0; 686 } 687 #endif 688 p++; 689 } 690 return (0); 691 } 692 693 694 /* 695 * NAME 696 * fnematch 697 * 698 * DESCRIPTION 699 * A stream-fed version of nematch which transfers characters to a 700 * null-terminated buffer. All characters up to and including the last 701 * character of the matching text or EOF are placed in the buffer. If 702 * a match is found, patbeg and patlen are set appropriately. 703 * 704 * RETURN VALUES 705 * false No match found. 706 * true Match found. 707 */ 708 709 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) 710 { 711 char *buf = *pbuf; 712 int bufsize = *pbufsize; 713 int c, i, j, k, ns, s; 714 715 s = pfa->initstat; 716 patlen = 0; 717 718 /* 719 * All indices relative to buf. 720 * i <= j <= k <= bufsize 721 * 722 * i: origin of active substring 723 * j: current character 724 * k: destination of next getc() 725 */ 726 i = -1, k = 0; 727 do { 728 j = i++; 729 do { 730 if (++j == k) { 731 if (k == bufsize) 732 if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) 733 FATAL("stream '%.30s...' too long", buf); 734 buf[k++] = (c = getc(f)) != EOF ? c : 0; 735 } 736 c = (uschar)buf[j]; 737 /* assert(c < NCHARS); */ 738 739 if ((ns = pfa->gototab[s][c]) != 0) 740 s = ns; 741 else 742 s = cgoto(pfa, s, c); 743 744 if (pfa->out[s]) { /* final state */ 745 patlen = j - i + 1; 746 if (c == 0) /* don't count $ */ 747 patlen--; 748 } 749 } while (buf[j] && s != 1); 750 s = 2; 751 } while (buf[i] && !patlen); 752 753 /* adjbuf() may have relocated a resized buffer. Inform the world. */ 754 *pbuf = buf; 755 *pbufsize = bufsize; 756 757 if (patlen) { 758 patbeg = (char *) buf + i; 759 /* 760 * Under no circumstances is the last character fed to 761 * the automaton part of the match. It is EOF's nullbyte, 762 * or it sent the automaton into a state with no further 763 * transitions available (s==1), or both. Room for a 764 * terminating nullbyte is guaranteed. 765 * 766 * ungetc any chars after the end of matching text 767 * (except for EOF's nullbyte, if present) and null 768 * terminate the buffer. 769 */ 770 do 771 if (buf[--k] && ungetc(buf[k], f) == EOF) 772 FATAL("unable to ungetc '%c'", buf[k]); 773 while (k > i + patlen); 774 buf[k] = '\0'; 775 return true; 776 } 777 else 778 return false; 779 } 780 781 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 782 { /* uses relex() to scan regular expression */ 783 Node *np; 784 785 DPRINTF("reparse <%s>\n", p); 786 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 787 rtok = relex(); 788 /* GNU compatibility: an empty regexp matches anything */ 789 if (rtok == '\0') { 790 /* FATAL("empty regular expression"); previous */ 791 return(op2(EMPTYRE, NIL, NIL)); 792 } 793 np = regexp(); 794 if (rtok != '\0') 795 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 796 return(np); 797 } 798 799 Node *regexp(void) /* top-level parse of reg expr */ 800 { 801 return (alt(concat(primary()))); 802 } 803 804 Node *primary(void) 805 { 806 Node *np; 807 int savelastatom; 808 809 switch (rtok) { 810 case CHAR: 811 lastatom = starttok; 812 np = op2(CHAR, NIL, itonp(rlxval)); 813 rtok = relex(); 814 return (unary(np)); 815 case ALL: 816 rtok = relex(); 817 return (unary(op2(ALL, NIL, NIL))); 818 case EMPTYRE: 819 rtok = relex(); 820 return (unary(op2(EMPTYRE, NIL, NIL))); 821 case DOT: 822 lastatom = starttok; 823 rtok = relex(); 824 return (unary(op2(DOT, NIL, NIL))); 825 case CCL: 826 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 827 lastatom = starttok; 828 rtok = relex(); 829 return (unary(np)); 830 case NCCL: 831 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 832 lastatom = starttok; 833 rtok = relex(); 834 return (unary(np)); 835 case '^': 836 rtok = relex(); 837 return (unary(op2(CHAR, NIL, itonp(HAT)))); 838 case '$': 839 rtok = relex(); 840 return (unary(op2(CHAR, NIL, NIL))); 841 case '(': 842 lastatom = starttok; 843 savelastatom = starttok - basestr; /* Retain over recursion */ 844 rtok = relex(); 845 if (rtok == ')') { /* special pleading for () */ 846 rtok = relex(); 847 return unary(op2(CCL, NIL, (Node *) tostring(""))); 848 } 849 np = regexp(); 850 if (rtok == ')') { 851 lastatom = basestr + savelastatom; /* Restore */ 852 rtok = relex(); 853 return (unary(np)); 854 } 855 else 856 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 857 default: 858 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 859 } 860 return 0; /*NOTREACHED*/ 861 } 862 863 Node *concat(Node *np) 864 { 865 switch (rtok) { 866 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 867 return (concat(op2(CAT, np, primary()))); 868 case EMPTYRE: 869 rtok = relex(); 870 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")), 871 primary()))); 872 } 873 return (np); 874 } 875 876 Node *alt(Node *np) 877 { 878 if (rtok == OR) { 879 rtok = relex(); 880 return (alt(op2(OR, np, concat(primary())))); 881 } 882 return (np); 883 } 884 885 Node *unary(Node *np) 886 { 887 switch (rtok) { 888 case STAR: 889 rtok = relex(); 890 return (unary(op2(STAR, np, NIL))); 891 case PLUS: 892 rtok = relex(); 893 return (unary(op2(PLUS, np, NIL))); 894 case QUEST: 895 rtok = relex(); 896 return (unary(op2(QUEST, np, NIL))); 897 case ZERO: 898 rtok = relex(); 899 return (unary(op2(ZERO, np, NIL))); 900 default: 901 return (np); 902 } 903 } 904 905 /* 906 * Character class definitions conformant to the POSIX locale as 907 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 908 * and operating character sets are both ASCII (ISO646) or supersets 909 * thereof. 910 * 911 * Note that to avoid overflowing the temporary buffer used in 912 * relex(), the expanded character class (prior to range expansion) 913 * must be less than twice the size of their full name. 914 */ 915 916 /* Because isblank doesn't show up in any of the header files on any 917 * system i use, it's defined here. if some other locale has a richer 918 * definition of "blank", define HAS_ISBLANK and provide your own 919 * version. 920 * the parentheses here are an attempt to find a path through the maze 921 * of macro definition and/or function and/or version provided. thanks 922 * to nelson beebe for the suggestion; let's see if it works everywhere. 923 */ 924 925 /* #define HAS_ISBLANK */ 926 #ifndef HAS_ISBLANK 927 928 int (xisblank)(int c) 929 { 930 return c==' ' || c=='\t'; 931 } 932 933 #endif 934 935 static const struct charclass { 936 const char *cc_name; 937 int cc_namelen; 938 int (*cc_func)(int); 939 } charclasses[] = { 940 { "alnum", 5, isalnum }, 941 { "alpha", 5, isalpha }, 942 #ifndef HAS_ISBLANK 943 { "blank", 5, xisblank }, 944 #else 945 { "blank", 5, isblank }, 946 #endif 947 { "cntrl", 5, iscntrl }, 948 { "digit", 5, isdigit }, 949 { "graph", 5, isgraph }, 950 { "lower", 5, islower }, 951 { "print", 5, isprint }, 952 { "punct", 5, ispunct }, 953 { "space", 5, isspace }, 954 { "upper", 5, isupper }, 955 { "xdigit", 6, isxdigit }, 956 { NULL, 0, NULL }, 957 }; 958 959 #define REPEAT_SIMPLE 0 960 #define REPEAT_PLUS_APPENDED 1 961 #define REPEAT_WITH_Q 2 962 #define REPEAT_ZERO 3 963 964 static int 965 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 966 int atomlen, int firstnum, int secondnum, int special_case) 967 { 968 int i, j; 969 uschar *buf = 0; 970 int ret = 1; 971 int init_q = (firstnum == 0); /* first added char will be ? */ 972 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 973 int prefix_length = reptok - basestr; /* prefix includes first rep */ 974 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */ 975 int size = prefix_length + suffix_length; 976 977 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 978 size += atomlen*(firstnum-1); 979 } 980 981 /* Adjust size of buffer for special cases */ 982 if (special_case == REPEAT_PLUS_APPENDED) { 983 size++; /* for the final + */ 984 } else if (special_case == REPEAT_WITH_Q) { 985 size += init_q + (atomlen+1)* n_q_reps; 986 } else if (special_case == REPEAT_ZERO) { 987 size += 2; /* just a null ERE: () */ 988 } 989 if ((buf = (uschar *) malloc(size + 1)) == NULL) 990 FATAL("out of space in reg expr %.10s..", lastre); 991 memcpy(buf, basestr, prefix_length); /* copy prefix */ 992 j = prefix_length; 993 if (special_case == REPEAT_ZERO) { 994 j -= atomlen; 995 buf[j++] = '('; 996 buf[j++] = ')'; 997 } 998 for (i = 1; i < firstnum; i++) { /* copy x reps */ 999 memcpy(&buf[j], atom, atomlen); 1000 j += atomlen; 1001 } 1002 if (special_case == REPEAT_PLUS_APPENDED) { 1003 buf[j++] = '+'; 1004 } else if (special_case == REPEAT_WITH_Q) { 1005 if (init_q) 1006 buf[j++] = '?'; 1007 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */ 1008 memcpy(&buf[j], atom, atomlen); 1009 j += atomlen; 1010 buf[j++] = '?'; 1011 } 1012 } 1013 memcpy(&buf[j], reptok+reptoklen, suffix_length); 1014 if (special_case == REPEAT_ZERO) { 1015 buf[j+suffix_length] = '\0'; 1016 } else { 1017 buf[size] = '\0'; 1018 } 1019 /* free old basestr */ 1020 if (firstbasestr != basestr) { 1021 if (basestr) 1022 xfree(basestr); 1023 } 1024 basestr = buf; 1025 prestr = buf + prefix_length; 1026 if (special_case == REPEAT_ZERO) { 1027 prestr -= atomlen; 1028 ret++; 1029 } 1030 return ret; 1031 } 1032 1033 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1034 int atomlen, int firstnum, int secondnum) 1035 { 1036 /* 1037 In general, the repetition specifier or "bound" is replaced here 1038 by an equivalent ERE string, repeating the immediately previous atom 1039 and appending ? and + as needed. Note that the first copy of the 1040 atom is left in place, except in the special_case of a zero-repeat 1041 (i.e., {0}). 1042 */ 1043 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 1044 if (firstnum < 2) { 1045 /* 0 or 1: should be handled before you get here */ 1046 FATAL("internal error"); 1047 } else { 1048 return replace_repeat(reptok, reptoklen, atom, atomlen, 1049 firstnum, secondnum, REPEAT_PLUS_APPENDED); 1050 } 1051 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 1052 if (firstnum == 0) { /* {0} or {0,0} */ 1053 /* This case is unusual because the resulting 1054 replacement string might actually be SMALLER than 1055 the original ERE */ 1056 return replace_repeat(reptok, reptoklen, atom, atomlen, 1057 firstnum, secondnum, REPEAT_ZERO); 1058 } else { /* (firstnum >= 1) */ 1059 return replace_repeat(reptok, reptoklen, atom, atomlen, 1060 firstnum, secondnum, REPEAT_SIMPLE); 1061 } 1062 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 1063 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 1064 return replace_repeat(reptok, reptoklen, atom, atomlen, 1065 firstnum, secondnum, REPEAT_WITH_Q); 1066 } else { /* Error - shouldn't be here (n>m) */ 1067 FATAL("internal error"); 1068 } 1069 return 0; 1070 } 1071 1072 int relex(void) /* lexical analyzer for reparse */ 1073 { 1074 int c, n; 1075 int cflag; 1076 static uschar *buf = NULL; 1077 static int bufsz = 100; 1078 uschar *bp; 1079 const struct charclass *cc; 1080 int i; 1081 int num, m; 1082 bool commafound, digitfound; 1083 const uschar *startreptok; 1084 static int parens = 0; 1085 1086 rescan: 1087 starttok = prestr; 1088 1089 switch (c = *prestr++) { 1090 case '|': return OR; 1091 case '*': return STAR; 1092 case '+': return PLUS; 1093 case '?': return QUEST; 1094 case '.': return DOT; 1095 case '\0': prestr--; return '\0'; 1096 case '^': 1097 case '$': 1098 return c; 1099 case '(': 1100 parens++; 1101 return c; 1102 case ')': 1103 if (parens) { 1104 parens--; 1105 return c; 1106 } 1107 /* unmatched close parenthesis; per POSIX, treat as literal */ 1108 rlxval = c; 1109 return CHAR; 1110 case '\\': 1111 rlxval = quoted(&prestr); 1112 return CHAR; 1113 default: 1114 rlxval = c; 1115 return CHAR; 1116 case '[': 1117 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 1118 FATAL("out of space in reg expr %.10s..", lastre); 1119 bp = buf; 1120 if (*prestr == '^') { 1121 cflag = 1; 1122 prestr++; 1123 } 1124 else 1125 cflag = 0; 1126 n = 2 * strlen((const char *) prestr)+1; 1127 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 1128 FATAL("out of space for reg expr %.10s...", lastre); 1129 for (; ; ) { 1130 if ((c = *prestr++) == '\\') { 1131 *bp++ = '\\'; 1132 if ((c = *prestr++) == '\0') 1133 FATAL("nonterminated character class %.20s...", lastre); 1134 *bp++ = c; 1135 /* } else if (c == '\n') { */ 1136 /* FATAL("newline in character class %.20s...", lastre); */ 1137 } else if (c == '[' && *prestr == ':') { 1138 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1139 for (cc = charclasses; cc->cc_name; cc++) 1140 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1141 break; 1142 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1143 prestr[2 + cc->cc_namelen] == ']') { 1144 prestr += cc->cc_namelen + 3; 1145 /* 1146 * BUG: We begin at 1, instead of 0, since we 1147 * would otherwise prematurely terminate the 1148 * string for classes like [[:cntrl:]]. This 1149 * means that we can't match the NUL character, 1150 * not without first adapting the entire 1151 * program to track each string's length. 1152 */ 1153 for (i = 1; i <= UCHAR_MAX; i++) { 1154 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) 1155 FATAL("out of space for reg expr %.10s...", lastre); 1156 if (cc->cc_func(i)) { 1157 /* escape backslash */ 1158 if (i == '\\') { 1159 *bp++ = '\\'; 1160 n++; 1161 } 1162 1163 *bp++ = i; 1164 n++; 1165 } 1166 } 1167 } else 1168 *bp++ = c; 1169 } else if (c == '[' && *prestr == '.') { 1170 char collate_char; 1171 prestr++; 1172 collate_char = *prestr++; 1173 if (*prestr == '.' && prestr[1] == ']') { 1174 prestr += 2; 1175 /* Found it: map via locale TBD: for 1176 now, simply return this char. This 1177 is sufficient to pass conformance 1178 test awk.ex 156 1179 */ 1180 if (*prestr == ']') { 1181 prestr++; 1182 rlxval = collate_char; 1183 return CHAR; 1184 } 1185 } 1186 } else if (c == '[' && *prestr == '=') { 1187 char equiv_char; 1188 prestr++; 1189 equiv_char = *prestr++; 1190 if (*prestr == '=' && prestr[1] == ']') { 1191 prestr += 2; 1192 /* Found it: map via locale TBD: for now 1193 simply return this char. This is 1194 sufficient to pass conformance test 1195 awk.ex 156 1196 */ 1197 if (*prestr == ']') { 1198 prestr++; 1199 rlxval = equiv_char; 1200 return CHAR; 1201 } 1202 } 1203 } else if (c == '\0') { 1204 FATAL("nonterminated character class %.20s", lastre); 1205 } else if (bp == buf) { /* 1st char is special */ 1206 *bp++ = c; 1207 } else if (c == ']') { 1208 *bp++ = 0; 1209 rlxstr = (uschar *) tostring((char *) buf); 1210 if (cflag == 0) 1211 return CCL; 1212 else 1213 return NCCL; 1214 } else 1215 *bp++ = c; 1216 } 1217 break; 1218 case '{': 1219 if (isdigit(*(prestr))) { 1220 num = 0; /* Process as a repetition */ 1221 n = -1; m = -1; 1222 commafound = false; 1223 digitfound = false; 1224 startreptok = prestr-1; 1225 /* Remember start of previous atom here ? */ 1226 } else { /* just a { char, not a repetition */ 1227 rlxval = c; 1228 return CHAR; 1229 } 1230 for (; ; ) { 1231 if ((c = *prestr++) == '}') { 1232 if (commafound) { 1233 if (digitfound) { /* {n,m} */ 1234 m = num; 1235 if (m < n) 1236 FATAL("illegal repetition expression: class %.20s", 1237 lastre); 1238 if (n == 0 && m == 1) { 1239 return QUEST; 1240 } 1241 } else { /* {n,} */ 1242 if (n == 0) 1243 return STAR; 1244 else if (n == 1) 1245 return PLUS; 1246 } 1247 } else { 1248 if (digitfound) { /* {n} same as {n,n} */ 1249 n = num; 1250 m = num; 1251 } else { /* {} */ 1252 FATAL("illegal repetition expression: class %.20s", 1253 lastre); 1254 } 1255 } 1256 if (repeat(starttok, prestr-starttok, lastatom, 1257 startreptok - lastatom, n, m) > 0) { 1258 if (n == 0 && m == 0) { 1259 return ZERO; 1260 } 1261 /* must rescan input for next token */ 1262 goto rescan; 1263 } 1264 /* Failed to replace: eat up {...} characters 1265 and treat like just PLUS */ 1266 return PLUS; 1267 } else if (c == '\0') { 1268 FATAL("nonterminated character class %.20s", 1269 lastre); 1270 } else if (isdigit(c)) { 1271 num = 10 * num + c - '0'; 1272 digitfound = true; 1273 } else if (c == ',') { 1274 if (commafound) 1275 FATAL("illegal repetition expression: class %.20s", 1276 lastre); 1277 /* looking for {n,} or {n,m} */ 1278 commafound = true; 1279 n = num; 1280 digitfound = false; /* reset */ 1281 num = 0; 1282 } else { 1283 FATAL("illegal repetition expression: class %.20s", 1284 lastre); 1285 } 1286 } 1287 break; 1288 } 1289 } 1290 1291 int cgoto(fa *f, int s, int c) 1292 { 1293 int *p, *q; 1294 int i, j, k; 1295 1296 assert(c == HAT || c < NCHARS); 1297 while (f->accept >= maxsetvec) { /* guessing here! */ 1298 resizesetvec(__func__); 1299 } 1300 for (i = 0; i <= f->accept; i++) 1301 setvec[i] = 0; 1302 setcnt = 0; 1303 resize_state(f, s); 1304 /* compute positions of gototab[s,c] into setvec */ 1305 p = f->posns[s]; 1306 for (i = 1; i <= *p; i++) { 1307 if ((k = f->re[p[i]].ltype) != FINAL) { 1308 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1309 || (k == DOT && c != 0 && c != HAT) 1310 || (k == ALL && c != 0) 1311 || (k == EMPTYRE && c != 0) 1312 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 1313 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 1314 q = f->re[p[i]].lfollow; 1315 for (j = 1; j <= *q; j++) { 1316 if (q[j] >= maxsetvec) { 1317 resizesetvec(__func__); 1318 } 1319 if (setvec[q[j]] == 0) { 1320 setcnt++; 1321 setvec[q[j]] = 1; 1322 } 1323 } 1324 } 1325 } 1326 } 1327 /* determine if setvec is a previous state */ 1328 tmpset[0] = setcnt; 1329 j = 1; 1330 for (i = f->accept; i >= 0; i--) 1331 if (setvec[i]) { 1332 tmpset[j++] = i; 1333 } 1334 resize_state(f, f->curstat > s ? f->curstat : s); 1335 /* tmpset == previous state? */ 1336 for (i = 1; i <= f->curstat; i++) { 1337 p = f->posns[i]; 1338 if ((k = tmpset[0]) != p[0]) 1339 goto different; 1340 for (j = 1; j <= k; j++) 1341 if (tmpset[j] != p[j]) 1342 goto different; 1343 /* setvec is state i */ 1344 if (c != HAT) 1345 f->gototab[s][c] = i; 1346 return i; 1347 different:; 1348 } 1349 1350 /* add tmpset to current set of states */ 1351 ++(f->curstat); 1352 resize_state(f, f->curstat); 1353 for (i = 0; i < NCHARS; i++) 1354 f->gototab[f->curstat][i] = 0; 1355 xfree(f->posns[f->curstat]); 1356 p = intalloc(setcnt + 1, __func__); 1357 1358 f->posns[f->curstat] = p; 1359 if (c != HAT) 1360 f->gototab[s][c] = f->curstat; 1361 for (i = 0; i <= setcnt; i++) 1362 p[i] = tmpset[i]; 1363 if (setvec[f->accept]) 1364 f->out[f->curstat] = 1; 1365 else 1366 f->out[f->curstat] = 0; 1367 return f->curstat; 1368 } 1369 1370 1371 void freefa(fa *f) /* free a finite automaton */ 1372 { 1373 int i; 1374 1375 if (f == NULL) 1376 return; 1377 for (i = 0; i < f->state_count; i++) 1378 xfree(f->gototab[i]) 1379 for (i = 0; i <= f->curstat; i++) 1380 xfree(f->posns[i]); 1381 for (i = 0; i <= f->accept; i++) { 1382 xfree(f->re[i].lfollow); 1383 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1384 xfree(f->re[i].lval.np); 1385 } 1386 xfree(f->restr); 1387 xfree(f->out); 1388 xfree(f->posns); 1389 xfree(f->gototab); 1390 xfree(f); 1391 } 1392