1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 /* lasciate ogne speranza, voi ch'intrate. */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #define DEBUG 31 32 #include <ctype.h> 33 #include <limits.h> 34 #include <stdio.h> 35 #include <string.h> 36 #include <stdlib.h> 37 #include "awk.h" 38 #include "ytab.h" 39 40 #define MAXLIN 22 41 42 #define type(v) (v)->nobj /* badly overloaded here */ 43 #define info(v) (v)->ntype /* badly overloaded here */ 44 #define left(v) (v)->narg[0] 45 #define right(v) (v)->narg[1] 46 #define parent(v) (v)->nnext 47 48 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 49 #define ELEAF case EMPTYRE: /* empty string in regexp */ 50 #define UNARY case STAR: case PLUS: case QUEST: 51 52 /* encoding in tree Nodes: 53 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 54 left is index, right contains value or pointer to value 55 unary (STAR, PLUS, QUEST): left is child, right is null 56 binary (CAT, OR): left and right are children 57 parent contains pointer to parent 58 */ 59 60 61 int *setvec; 62 int *tmpset; 63 int maxsetvec = 0; 64 65 int rtok; /* next token in current re */ 66 int rlxval; 67 static uschar *rlxstr; 68 static uschar *prestr; /* current position in current re */ 69 static uschar *lastre; /* origin of last re */ 70 static uschar *lastatom; /* origin of last Atom */ 71 static uschar *starttok; 72 static uschar *basestr; /* starts with original, replaced during 73 repetition processing */ 74 static uschar *firstbasestr; 75 76 static int setcnt; 77 static int poscnt; 78 79 char *patbeg; 80 int patlen; 81 82 #define NFA 20 /* cache this many dynamic fa's */ 83 fa *fatab[NFA]; 84 int nfatab = 0; /* entries in fatab */ 85 86 fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */ 87 { 88 int i, use, nuse; 89 fa *pfa; 90 static int now = 1; 91 92 if (setvec == NULL) { /* first time through any RE */ 93 maxsetvec = MAXLIN; 94 setvec = (int *) malloc(maxsetvec * sizeof(int)); 95 tmpset = (int *) malloc(maxsetvec * sizeof(int)); 96 if (setvec == NULL || tmpset == NULL) 97 overflo("out of space initializing makedfa"); 98 } 99 100 if (compile_time) /* a constant for sure */ 101 return mkdfa(s, anchor); 102 for (i = 0; i < nfatab; i++) /* is it there already? */ 103 if (fatab[i]->anchor == anchor 104 && strcmp((const char *) fatab[i]->restr, s) == 0) { 105 fatab[i]->use = now++; 106 return fatab[i]; 107 } 108 pfa = mkdfa(s, anchor); 109 if (nfatab < NFA) { /* room for another */ 110 fatab[nfatab] = pfa; 111 fatab[nfatab]->use = now++; 112 nfatab++; 113 return pfa; 114 } 115 use = fatab[0]->use; /* replace least-recently used */ 116 nuse = 0; 117 for (i = 1; i < nfatab; i++) 118 if (fatab[i]->use < use) { 119 use = fatab[i]->use; 120 nuse = i; 121 } 122 freefa(fatab[nuse]); 123 fatab[nuse] = pfa; 124 pfa->use = now++; 125 return pfa; 126 } 127 128 fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */ 129 /* anchor = 1 for anchored matches, else 0 */ 130 { 131 Node *p, *p1; 132 fa *f; 133 134 firstbasestr = (uschar *) s; 135 basestr = firstbasestr; 136 p = reparse(s); 137 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 138 /* put ALL STAR in front of reg. exp. */ 139 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 140 /* put FINAL after reg. exp. */ 141 142 poscnt = 0; 143 penter(p1); /* enter parent pointers and leaf indices */ 144 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL) 145 overflo("out of space for fa"); 146 f->accept = poscnt-1; /* penter has computed number of positions in re */ 147 cfoll(f, p1); /* set up follow sets */ 148 freetr(p1); 149 if ((f->posns[0] = (int *) calloc(*(f->re[0].lfollow), sizeof(int))) == NULL) 150 overflo("out of space in makedfa"); 151 if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL) 152 overflo("out of space in makedfa"); 153 *f->posns[1] = 0; 154 f->initstat = makeinit(f, anchor); 155 f->anchor = anchor; 156 f->restr = (uschar *) tostring(s); 157 if (firstbasestr != basestr) { 158 if (basestr) 159 xfree(basestr); 160 } 161 return f; 162 } 163 164 int makeinit(fa *f, int anchor) 165 { 166 int i, k; 167 168 f->curstat = 2; 169 f->out[2] = 0; 170 f->reset = 0; 171 k = *(f->re[0].lfollow); 172 xfree(f->posns[2]); 173 if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL) 174 overflo("out of space in makeinit"); 175 for (i=0; i <= k; i++) { 176 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 177 } 178 if ((f->posns[2])[1] == f->accept) 179 f->out[2] = 1; 180 for (i=0; i < NCHARS; i++) 181 f->gototab[2][i] = 0; 182 f->curstat = cgoto(f, 2, HAT); 183 if (anchor) { 184 *f->posns[2] = k-1; /* leave out position 0 */ 185 for (i=0; i < k; i++) { 186 (f->posns[0])[i] = (f->posns[2])[i]; 187 } 188 189 f->out[0] = f->out[2]; 190 if (f->curstat != 2) 191 --(*f->posns[f->curstat]); 192 } 193 return f->curstat; 194 } 195 196 void penter(Node *p) /* set up parent pointers and leaf indices */ 197 { 198 switch (type(p)) { 199 ELEAF 200 LEAF 201 info(p) = poscnt; 202 poscnt++; 203 break; 204 UNARY 205 penter(left(p)); 206 parent(left(p)) = p; 207 break; 208 case CAT: 209 case OR: 210 penter(left(p)); 211 penter(right(p)); 212 parent(left(p)) = p; 213 parent(right(p)) = p; 214 break; 215 default: /* can't happen */ 216 FATAL("can't happen: unknown type %d in penter", type(p)); 217 break; 218 } 219 } 220 221 void freetr(Node *p) /* free parse tree */ 222 { 223 switch (type(p)) { 224 ELEAF 225 LEAF 226 xfree(p); 227 break; 228 UNARY 229 freetr(left(p)); 230 xfree(p); 231 break; 232 case CAT: 233 case OR: 234 freetr(left(p)); 235 freetr(right(p)); 236 xfree(p); 237 break; 238 default: /* can't happen */ 239 FATAL("can't happen: unknown type %d in freetr", type(p)); 240 break; 241 } 242 } 243 244 /* in the parsing of regular expressions, metacharacters like . have */ 245 /* to be seen literally; \056 is not a metacharacter. */ 246 247 int hexstr(uschar **pp) /* find and eval hex string at pp, return new p */ 248 { /* only pick up one 8-bit byte (2 chars) */ 249 uschar *p; 250 int n = 0; 251 int i; 252 253 for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) { 254 if (isdigit(*p)) 255 n = 16 * n + *p - '0'; 256 else if (*p >= 'a' && *p <= 'f') 257 n = 16 * n + *p - 'a' + 10; 258 else if (*p >= 'A' && *p <= 'F') 259 n = 16 * n + *p - 'A' + 10; 260 } 261 *pp = (uschar *) p; 262 return n; 263 } 264 265 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 266 267 int quoted(uschar **pp) /* pick up next thing after a \\ */ 268 /* and increment *pp */ 269 { 270 uschar *p = *pp; 271 int c; 272 273 if ((c = *p++) == 't') 274 c = '\t'; 275 else if (c == 'n') 276 c = '\n'; 277 else if (c == 'f') 278 c = '\f'; 279 else if (c == 'r') 280 c = '\r'; 281 else if (c == 'b') 282 c = '\b'; 283 else if (c == '\\') 284 c = '\\'; 285 else if (c == 'x') { /* hexadecimal goo follows */ 286 c = hexstr(&p); /* this adds a null if number is invalid */ 287 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 288 int n = c - '0'; 289 if (isoctdigit(*p)) { 290 n = 8 * n + *p++ - '0'; 291 if (isoctdigit(*p)) 292 n = 8 * n + *p++ - '0'; 293 } 294 c = n; 295 } /* else */ 296 /* c = c; */ 297 *pp = p; 298 return c; 299 } 300 301 static int collate_range_cmp(int a, int b) 302 { 303 static char s[2][2]; 304 305 if ((uschar)a == (uschar)b) 306 return 0; 307 s[0][0] = a; 308 s[1][0] = b; 309 return (strcoll(s[0], s[1])); 310 } 311 312 char *cclenter(const char *argp) /* add a character class */ 313 { 314 int i, c, c2; 315 int j; 316 uschar *p = (uschar *) argp; 317 uschar *op, *bp; 318 static uschar *buf = NULL; 319 static int bufsz = 100; 320 321 op = p; 322 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 323 FATAL("out of space for character class [%.10s...] 1", p); 324 bp = buf; 325 for (i = 0; (c = *p++) != 0; ) { 326 if (c == '\\') { 327 c = quoted(&p); 328 } else if (c == '-' && i > 0 && bp[-1] != 0) { 329 if (*p != 0) { 330 c = bp[-1]; 331 c2 = *p++; 332 if (c2 == '\\') 333 c2 = quoted(&p); 334 if (collate_range_cmp(c, c2) > 0) { 335 bp--; 336 i--; 337 continue; 338 } 339 for (j = 0; j < NCHARS; j++) { 340 if ((collate_range_cmp(c, j) > 0) || 341 collate_range_cmp(j, c2) > 0) 342 continue; 343 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) 344 FATAL("out of space for character class [%.10s...] 2", p); 345 *bp++ = j; 346 i++; 347 } 348 continue; 349 } 350 } 351 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) 352 FATAL("out of space for character class [%.10s...] 3", p); 353 *bp++ = c; 354 i++; 355 } 356 *bp = 0; 357 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) ); 358 xfree(op); 359 return (char *) tostring((char *) buf); 360 } 361 362 void overflo(const char *s) 363 { 364 FATAL("regular expression too big: %.30s...", s); 365 } 366 367 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 368 { 369 int i; 370 int *p; 371 372 switch (type(v)) { 373 ELEAF 374 LEAF 375 f->re[info(v)].ltype = type(v); 376 f->re[info(v)].lval.np = right(v); 377 while (f->accept >= maxsetvec) { /* guessing here! */ 378 maxsetvec *= 4; 379 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 380 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 381 if (setvec == NULL || tmpset == NULL) 382 overflo("out of space in cfoll()"); 383 } 384 for (i = 0; i <= f->accept; i++) 385 setvec[i] = 0; 386 setcnt = 0; 387 follow(v); /* computes setvec and setcnt */ 388 if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL) 389 overflo("out of space building follow set"); 390 f->re[info(v)].lfollow = p; 391 *p = setcnt; 392 for (i = f->accept; i >= 0; i--) 393 if (setvec[i] == 1) 394 *++p = i; 395 break; 396 UNARY 397 cfoll(f,left(v)); 398 break; 399 case CAT: 400 case OR: 401 cfoll(f,left(v)); 402 cfoll(f,right(v)); 403 break; 404 default: /* can't happen */ 405 FATAL("can't happen: unknown type %d in cfoll", type(v)); 406 } 407 } 408 409 int first(Node *p) /* collects initially active leaves of p into setvec */ 410 /* returns 0 if p matches empty string */ 411 { 412 int b, lp; 413 414 switch (type(p)) { 415 ELEAF 416 LEAF 417 lp = info(p); /* look for high-water mark of subscripts */ 418 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 419 maxsetvec *= 4; 420 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 421 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 422 if (setvec == NULL || tmpset == NULL) 423 overflo("out of space in first()"); 424 } 425 if (type(p) == EMPTYRE) { 426 setvec[lp] = 0; 427 return(0); 428 } 429 if (setvec[lp] != 1) { 430 setvec[lp] = 1; 431 setcnt++; 432 } 433 if (type(p) == CCL && (*(char *) right(p)) == '\0') 434 return(0); /* empty CCL */ 435 else return(1); 436 case PLUS: 437 if (first(left(p)) == 0) return(0); 438 return(1); 439 case STAR: 440 case QUEST: 441 first(left(p)); 442 return(0); 443 case CAT: 444 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 445 return(1); 446 case OR: 447 b = first(right(p)); 448 if (first(left(p)) == 0 || b == 0) return(0); 449 return(1); 450 } 451 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 452 return(-1); 453 } 454 455 void follow(Node *v) /* collects leaves that can follow v into setvec */ 456 { 457 Node *p; 458 459 if (type(v) == FINAL) 460 return; 461 p = parent(v); 462 switch (type(p)) { 463 case STAR: 464 case PLUS: 465 first(v); 466 follow(p); 467 return; 468 469 case OR: 470 case QUEST: 471 follow(p); 472 return; 473 474 case CAT: 475 if (v == left(p)) { /* v is left child of p */ 476 if (first(right(p)) == 0) { 477 follow(p); 478 return; 479 } 480 } else /* v is right child */ 481 follow(p); 482 return; 483 } 484 } 485 486 int member(int c, const char *sarg) /* is c in s? */ 487 { 488 uschar *s = (uschar *) sarg; 489 490 while (*s) 491 if (c == *s++) 492 return(1); 493 return(0); 494 } 495 496 int match(fa *f, const char *p0) /* shortest match ? */ 497 { 498 int s, ns; 499 uschar *p = (uschar *) p0; 500 501 s = f->reset ? makeinit(f,0) : f->initstat; 502 if (f->out[s]) 503 return(1); 504 do { 505 /* assert(*p < NCHARS); */ 506 if ((ns = f->gototab[s][*p]) != 0) 507 s = ns; 508 else 509 s = cgoto(f, s, *p); 510 if (f->out[s]) 511 return(1); 512 } while (*p++ != 0); 513 return(0); 514 } 515 516 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 517 { 518 int s, ns; 519 uschar *p = (uschar *) p0; 520 uschar *q; 521 int i, k; 522 523 /* s = f->reset ? makeinit(f,1) : f->initstat; */ 524 if (f->reset) { 525 f->initstat = s = makeinit(f,1); 526 } else { 527 s = f->initstat; 528 } 529 patbeg = (char *) p; 530 patlen = -1; 531 do { 532 q = p; 533 do { 534 if (f->out[s]) /* final state */ 535 patlen = q-p; 536 /* assert(*q < NCHARS); */ 537 if ((ns = f->gototab[s][*q]) != 0) 538 s = ns; 539 else 540 s = cgoto(f, s, *q); 541 if (s == 1) { /* no transition */ 542 if (patlen >= 0) { 543 patbeg = (char *) p; 544 return(1); 545 } 546 else 547 goto nextin; /* no match */ 548 } 549 } while (*q++ != 0); 550 if (f->out[s]) 551 patlen = q-p-1; /* don't count $ */ 552 if (patlen >= 0) { 553 patbeg = (char *) p; 554 return(1); 555 } 556 nextin: 557 s = 2; 558 if (f->reset) { 559 for (i = 2; i <= f->curstat; i++) 560 xfree(f->posns[i]); 561 k = *f->posns[0]; 562 if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL) 563 overflo("out of space in pmatch"); 564 for (i = 0; i <= k; i++) 565 (f->posns[2])[i] = (f->posns[0])[i]; 566 f->initstat = f->curstat = 2; 567 f->out[2] = f->out[0]; 568 for (i = 0; i < NCHARS; i++) 569 f->gototab[2][i] = 0; 570 } 571 } while (*p++ != 0); 572 return (0); 573 } 574 575 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 576 { 577 int s, ns; 578 uschar *p = (uschar *) p0; 579 uschar *q; 580 int i, k; 581 582 /* s = f->reset ? makeinit(f,1) : f->initstat; */ 583 if (f->reset) { 584 f->initstat = s = makeinit(f,1); 585 } else { 586 s = f->initstat; 587 } 588 patlen = -1; 589 while (*p) { 590 q = p; 591 do { 592 if (f->out[s]) /* final state */ 593 patlen = q-p; 594 /* assert(*q < NCHARS); */ 595 if ((ns = f->gototab[s][*q]) != 0) 596 s = ns; 597 else 598 s = cgoto(f, s, *q); 599 if (s == 1) { /* no transition */ 600 if (patlen > 0) { 601 patbeg = (char *) p; 602 return(1); 603 } else 604 goto nnextin; /* no nonempty match */ 605 } 606 } while (*q++ != 0); 607 if (f->out[s]) 608 patlen = q-p-1; /* don't count $ */ 609 if (patlen > 0 ) { 610 patbeg = (char *) p; 611 return(1); 612 } 613 nnextin: 614 s = 2; 615 if (f->reset) { 616 for (i = 2; i <= f->curstat; i++) 617 xfree(f->posns[i]); 618 k = *f->posns[0]; 619 if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL) 620 overflo("out of state space"); 621 for (i = 0; i <= k; i++) 622 (f->posns[2])[i] = (f->posns[0])[i]; 623 f->initstat = f->curstat = 2; 624 f->out[2] = f->out[0]; 625 for (i = 0; i < NCHARS; i++) 626 f->gototab[2][i] = 0; 627 } 628 p++; 629 } 630 return (0); 631 } 632 633 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 634 { /* uses relex() to scan regular expression */ 635 Node *np; 636 637 dprintf( ("reparse <%s>\n", p) ); 638 lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */ 639 rtok = relex(); 640 /* GNU compatibility: an empty regexp matches anything */ 641 if (rtok == '\0') { 642 /* FATAL("empty regular expression"); previous */ 643 return(op2(EMPTYRE, NIL, NIL)); 644 } 645 np = regexp(); 646 if (rtok != '\0') 647 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 648 return(np); 649 } 650 651 Node *regexp(void) /* top-level parse of reg expr */ 652 { 653 return (alt(concat(primary()))); 654 } 655 656 Node *primary(void) 657 { 658 Node *np; 659 int savelastatom; 660 661 switch (rtok) { 662 case CHAR: 663 lastatom = starttok; 664 np = op2(CHAR, NIL, itonp(rlxval)); 665 rtok = relex(); 666 return (unary(np)); 667 case ALL: 668 rtok = relex(); 669 return (unary(op2(ALL, NIL, NIL))); 670 case EMPTYRE: 671 rtok = relex(); 672 return (unary(op2(EMPTYRE, NIL, NIL))); 673 case DOT: 674 lastatom = starttok; 675 rtok = relex(); 676 return (unary(op2(DOT, NIL, NIL))); 677 case CCL: 678 np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); 679 lastatom = starttok; 680 rtok = relex(); 681 return (unary(np)); 682 case NCCL: 683 np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); 684 lastatom = starttok; 685 rtok = relex(); 686 return (unary(np)); 687 case '^': 688 rtok = relex(); 689 return (unary(op2(CHAR, NIL, itonp(HAT)))); 690 case '$': 691 rtok = relex(); 692 return (unary(op2(CHAR, NIL, NIL))); 693 case '(': 694 lastatom = starttok; 695 savelastatom = starttok - basestr; /* Retain over recursion */ 696 rtok = relex(); 697 if (rtok == ')') { /* special pleading for () */ 698 rtok = relex(); 699 return unary(op2(CCL, NIL, (Node *) tostring(""))); 700 } 701 np = regexp(); 702 if (rtok == ')') { 703 lastatom = basestr + savelastatom; /* Restore */ 704 rtok = relex(); 705 return (unary(np)); 706 } 707 else 708 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 709 default: 710 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 711 } 712 return 0; /*NOTREACHED*/ 713 } 714 715 Node *concat(Node *np) 716 { 717 switch (rtok) { 718 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 719 return (concat(op2(CAT, np, primary()))); 720 case EMPTYRE: 721 rtok = relex(); 722 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")), 723 primary()))); 724 } 725 return (np); 726 } 727 728 Node *alt(Node *np) 729 { 730 if (rtok == OR) { 731 rtok = relex(); 732 return (alt(op2(OR, np, concat(primary())))); 733 } 734 return (np); 735 } 736 737 Node *unary(Node *np) 738 { 739 switch (rtok) { 740 case STAR: 741 rtok = relex(); 742 return (unary(op2(STAR, np, NIL))); 743 case PLUS: 744 rtok = relex(); 745 return (unary(op2(PLUS, np, NIL))); 746 case QUEST: 747 rtok = relex(); 748 return (unary(op2(QUEST, np, NIL))); 749 default: 750 return (np); 751 } 752 } 753 754 /* 755 * Character class definitions conformant to the POSIX locale as 756 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 757 * and operating character sets are both ASCII (ISO646) or supersets 758 * thereof. 759 * 760 * Note that to avoid overflowing the temporary buffer used in 761 * relex(), the expanded character class (prior to range expansion) 762 * must be less than twice the size of their full name. 763 */ 764 765 /* Because isblank doesn't show up in any of the header files on any 766 * system i use, it's defined here. if some other locale has a richer 767 * definition of "blank", define HAS_ISBLANK and provide your own 768 * version. 769 * the parentheses here are an attempt to find a path through the maze 770 * of macro definition and/or function and/or version provided. thanks 771 * to nelson beebe for the suggestion; let's see if it works everywhere. 772 */ 773 774 /* #define HAS_ISBLANK */ 775 #ifndef HAS_ISBLANK 776 777 int (xisblank)(int c) 778 { 779 return c==' ' || c=='\t'; 780 } 781 782 #endif 783 784 struct charclass { 785 const char *cc_name; 786 int cc_namelen; 787 int (*cc_func)(int); 788 } charclasses[] = { 789 { "alnum", 5, isalnum }, 790 { "alpha", 5, isalpha }, 791 #ifndef HAS_ISBLANK 792 { "blank", 5, xisblank }, 793 #else 794 { "blank", 5, isblank }, 795 #endif 796 { "cntrl", 5, iscntrl }, 797 { "digit", 5, isdigit }, 798 { "graph", 5, isgraph }, 799 { "lower", 5, islower }, 800 { "print", 5, isprint }, 801 { "punct", 5, ispunct }, 802 { "space", 5, isspace }, 803 { "upper", 5, isupper }, 804 { "xdigit", 6, isxdigit }, 805 { NULL, 0, NULL }, 806 }; 807 808 #define REPEAT_SIMPLE 0 809 #define REPEAT_PLUS_APPENDED 1 810 #define REPEAT_WITH_Q 2 811 #define REPEAT_ZERO 3 812 813 static int 814 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 815 int atomlen, int firstnum, int secondnum, int special_case) 816 { 817 int i, j; 818 uschar *buf = 0; 819 int ret = 1; 820 int init_q = (firstnum==0); /* first added char will be ? */ 821 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 822 int prefix_length = reptok - basestr; /* prefix includes first rep */ 823 int suffix_length = strlen((char *) reptok) - reptoklen; /* string after rep specifier */ 824 int size = prefix_length + suffix_length; 825 826 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 827 size += atomlen*(firstnum-1); 828 } 829 830 /* Adjust size of buffer for special cases */ 831 if (special_case == REPEAT_PLUS_APPENDED) { 832 size++; /* for the final + */ 833 } else if (special_case == REPEAT_WITH_Q) { 834 size += init_q + (atomlen+1)* n_q_reps; 835 } else if (special_case == REPEAT_ZERO) { 836 size += 2; /* just a null ERE: () */ 837 } 838 if ((buf = (uschar *) malloc(size+1)) == NULL) 839 FATAL("out of space in reg expr %.10s..", lastre); 840 memcpy(buf, basestr, prefix_length); /* copy prefix */ 841 j = prefix_length; 842 if (special_case == REPEAT_ZERO) { 843 j -= atomlen; 844 buf[j++] = '('; 845 buf[j++] = ')'; 846 } 847 for (i=1; i < firstnum; i++) { /* copy x reps */ 848 memcpy(&buf[j], atom, atomlen); 849 j += atomlen; 850 } 851 if (special_case == REPEAT_PLUS_APPENDED) { 852 buf[j++] = '+'; 853 } else if (special_case == REPEAT_WITH_Q) { 854 if (init_q) buf[j++] = '?'; 855 for (i=0; i < n_q_reps; i++) { /* copy x? reps */ 856 memcpy(&buf[j], atom, atomlen); 857 j += atomlen; 858 buf[j++] = '?'; 859 } 860 } 861 memcpy(&buf[j], reptok+reptoklen, suffix_length); 862 if (special_case == REPEAT_ZERO) { 863 buf[j+suffix_length] = '\0'; 864 } else { 865 buf[size] = '\0'; 866 } 867 /* free old basestr */ 868 if (firstbasestr != basestr) { 869 if (basestr) 870 xfree(basestr); 871 } 872 basestr = buf; 873 prestr = buf + prefix_length; 874 if (special_case == REPEAT_ZERO) { 875 prestr -= atomlen; 876 ret++; 877 } 878 return ret; 879 } 880 881 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 882 int atomlen, int firstnum, int secondnum) 883 { 884 /* 885 In general, the repetition specifier or "bound" is replaced here 886 by an equivalent ERE string, repeating the immediately previous atom 887 and appending ? and + as needed. Note that the first copy of the 888 atom is left in place, except in the special_case of a zero-repeat 889 (i.e., {0}). 890 */ 891 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 892 if (firstnum < 2) { 893 /* 0 or 1: should be handled before you get here */ 894 FATAL("internal error"); 895 } else { 896 return replace_repeat(reptok, reptoklen, atom, atomlen, 897 firstnum, secondnum, REPEAT_PLUS_APPENDED); 898 } 899 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 900 if (firstnum == 0) { /* {0} or {0,0} */ 901 /* This case is unusual because the resulting 902 replacement string might actually be SMALLER than 903 the original ERE */ 904 return replace_repeat(reptok, reptoklen, atom, atomlen, 905 firstnum, secondnum, REPEAT_ZERO); 906 } else { /* (firstnum >= 1) */ 907 return replace_repeat(reptok, reptoklen, atom, atomlen, 908 firstnum, secondnum, REPEAT_SIMPLE); 909 } 910 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 911 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 912 return replace_repeat(reptok, reptoklen, atom, atomlen, 913 firstnum, secondnum, REPEAT_WITH_Q); 914 } else { /* Error - shouldn't be here (n>m) */ 915 FATAL("internal error"); 916 } 917 return 0; 918 } 919 920 int relex(void) /* lexical analyzer for reparse */ 921 { 922 int c, n; 923 int cflag; 924 static uschar *buf = NULL; 925 static int bufsz = 100; 926 uschar *bp; 927 struct charclass *cc; 928 int i; 929 int num, m, commafound, digitfound; 930 const uschar *startreptok; 931 932 rescan: 933 starttok = prestr; 934 935 switch (c = *prestr++) { 936 case '|': return OR; 937 case '*': return STAR; 938 case '+': return PLUS; 939 case '?': return QUEST; 940 case '.': return DOT; 941 case '\0': prestr--; return '\0'; 942 case '^': 943 case '$': 944 case '(': 945 case ')': 946 return c; 947 case '\\': 948 rlxval = quoted(&prestr); 949 return CHAR; 950 default: 951 rlxval = c; 952 return CHAR; 953 case '[': 954 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 955 FATAL("out of space in reg expr %.10s..", lastre); 956 bp = buf; 957 if (*prestr == '^') { 958 cflag = 1; 959 prestr++; 960 } 961 else 962 cflag = 0; 963 n = 2 * strlen((const char *) prestr)+1; 964 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 965 FATAL("out of space for reg expr %.10s...", lastre); 966 for (; ; ) { 967 if ((c = *prestr++) == '\\') { 968 *bp++ = '\\'; 969 if ((c = *prestr++) == '\0') 970 FATAL("nonterminated character class %.20s...", lastre); 971 *bp++ = c; 972 /* } else if (c == '\n') { */ 973 /* FATAL("newline in character class %.20s...", lastre); */ 974 } else if (c == '[' && *prestr == ':') { 975 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 976 for (cc = charclasses; cc->cc_name; cc++) 977 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 978 break; 979 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 980 prestr[2 + cc->cc_namelen] == ']') { 981 prestr += cc->cc_namelen + 3; 982 /* 983 * BUG: We begin at 1, instead of 0, since we 984 * would otherwise prematurely terminate the 985 * string for classes like [[:cntrl:]]. This 986 * means that we can't match the NUL character, 987 * not without first adapting the entire 988 * program to track each string's length. 989 */ 990 for (i = 1; i <= UCHAR_MAX; i++) { 991 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) 992 FATAL("out of space for reg expr %.10s...", lastre); 993 if (cc->cc_func(i)) { 994 *bp++ = i; 995 n++; 996 } 997 } 998 } else 999 *bp++ = c; 1000 } else if (c == '[' && *prestr == '.') { 1001 char collate_char; 1002 prestr++; 1003 collate_char = *prestr++; 1004 if (*prestr == '.' && prestr[1] == ']') { 1005 prestr += 2; 1006 /* Found it: map via locale TBD: for 1007 now, simply return this char. This 1008 is sufficient to pass conformance 1009 test awk.ex 156 1010 */ 1011 if (*prestr == ']') { 1012 prestr++; 1013 rlxval = collate_char; 1014 return CHAR; 1015 } 1016 } 1017 } else if (c == '[' && *prestr == '=') { 1018 char equiv_char; 1019 prestr++; 1020 equiv_char = *prestr++; 1021 if (*prestr == '=' && prestr[1] == ']') { 1022 prestr += 2; 1023 /* Found it: map via locale TBD: for now 1024 simply return this char. This is 1025 sufficient to pass conformance test 1026 awk.ex 156 1027 */ 1028 if (*prestr == ']') { 1029 prestr++; 1030 rlxval = equiv_char; 1031 return CHAR; 1032 } 1033 } 1034 } else if (c == '\0') { 1035 FATAL("nonterminated character class %.20s", lastre); 1036 } else if (bp == buf) { /* 1st char is special */ 1037 *bp++ = c; 1038 } else if (c == ']') { 1039 *bp++ = 0; 1040 rlxstr = (uschar *) tostring((char *) buf); 1041 if (cflag == 0) 1042 return CCL; 1043 else 1044 return NCCL; 1045 } else 1046 *bp++ = c; 1047 } 1048 break; 1049 case '{': 1050 if (isdigit(*(prestr))) { 1051 num = 0; /* Process as a repetition */ 1052 n = -1; m = -1; 1053 commafound = 0; 1054 digitfound = 0; 1055 startreptok = prestr-1; 1056 /* Remember start of previous atom here ? */ 1057 } else { /* just a { char, not a repetition */ 1058 rlxval = c; 1059 return CHAR; 1060 } 1061 for (; ; ) { 1062 if ((c = *prestr++) == '}') { 1063 if (commafound) { 1064 if (digitfound) { /* {n,m} */ 1065 m = num; 1066 if (m<n) 1067 FATAL("illegal repetition expression: class %.20s", 1068 lastre); 1069 if ((n==0) && (m==1)) { 1070 return QUEST; 1071 } 1072 } else { /* {n,} */ 1073 if (n==0) return STAR; 1074 if (n==1) return PLUS; 1075 } 1076 } else { 1077 if (digitfound) { /* {n} same as {n,n} */ 1078 n = num; 1079 m = num; 1080 } else { /* {} */ 1081 FATAL("illegal repetition expression: class %.20s", 1082 lastre); 1083 } 1084 } 1085 if (repeat(starttok, prestr-starttok, lastatom, 1086 startreptok - lastatom, n, m) > 0) { 1087 if ((n==0) && (m==0)) { 1088 return EMPTYRE; 1089 } 1090 /* must rescan input for next token */ 1091 goto rescan; 1092 } 1093 /* Failed to replace: eat up {...} characters 1094 and treat like just PLUS */ 1095 return PLUS; 1096 } else if (c == '\0') { 1097 FATAL("nonterminated character class %.20s", 1098 lastre); 1099 } else if (isdigit(c)) { 1100 num = 10 * num + c - '0'; 1101 digitfound = 1; 1102 } else if (c == ',') { 1103 if (commafound) 1104 FATAL("illegal repetition expression: class %.20s", 1105 lastre); 1106 /* looking for {n,} or {n,m} */ 1107 commafound = 1; 1108 n = num; 1109 digitfound = 0; /* reset */ 1110 num = 0; 1111 } else { 1112 FATAL("illegal repetition expression: class %.20s", 1113 lastre); 1114 } 1115 } 1116 break; 1117 } 1118 } 1119 1120 int cgoto(fa *f, int s, int c) 1121 { 1122 int i, j, k; 1123 int *p, *q; 1124 1125 assert(c == HAT || c < NCHARS); 1126 while (f->accept >= maxsetvec) { /* guessing here! */ 1127 maxsetvec *= 4; 1128 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 1129 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 1130 if (setvec == NULL || tmpset == NULL) 1131 overflo("out of space in cgoto()"); 1132 } 1133 for (i = 0; i <= f->accept; i++) 1134 setvec[i] = 0; 1135 setcnt = 0; 1136 /* compute positions of gototab[s,c] into setvec */ 1137 p = f->posns[s]; 1138 for (i = 1; i <= *p; i++) { 1139 if ((k = f->re[p[i]].ltype) != FINAL) { 1140 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1141 || (k == DOT && c != 0 && c != HAT) 1142 || (k == ALL && c != 0) 1143 || (k == EMPTYRE && c != 0) 1144 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 1145 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 1146 q = f->re[p[i]].lfollow; 1147 for (j = 1; j <= *q; j++) { 1148 if (q[j] >= maxsetvec) { 1149 maxsetvec *= 4; 1150 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 1151 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 1152 if (setvec == NULL || tmpset == NULL) 1153 overflo("cgoto overflow"); 1154 } 1155 if (setvec[q[j]] == 0) { 1156 setcnt++; 1157 setvec[q[j]] = 1; 1158 } 1159 } 1160 } 1161 } 1162 } 1163 /* determine if setvec is a previous state */ 1164 tmpset[0] = setcnt; 1165 j = 1; 1166 for (i = f->accept; i >= 0; i--) 1167 if (setvec[i]) { 1168 tmpset[j++] = i; 1169 } 1170 /* tmpset == previous state? */ 1171 for (i = 1; i <= f->curstat; i++) { 1172 p = f->posns[i]; 1173 if ((k = tmpset[0]) != p[0]) 1174 goto different; 1175 for (j = 1; j <= k; j++) 1176 if (tmpset[j] != p[j]) 1177 goto different; 1178 /* setvec is state i */ 1179 f->gototab[s][c] = i; 1180 return i; 1181 different:; 1182 } 1183 1184 /* add tmpset to current set of states */ 1185 if (f->curstat >= NSTATES-1) { 1186 f->curstat = 2; 1187 f->reset = 1; 1188 for (i = 2; i < NSTATES; i++) 1189 xfree(f->posns[i]); 1190 } else 1191 ++(f->curstat); 1192 for (i = 0; i < NCHARS; i++) 1193 f->gototab[f->curstat][i] = 0; 1194 xfree(f->posns[f->curstat]); 1195 if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL) 1196 overflo("out of space in cgoto"); 1197 1198 f->posns[f->curstat] = p; 1199 f->gototab[s][c] = f->curstat; 1200 for (i = 0; i <= setcnt; i++) 1201 p[i] = tmpset[i]; 1202 if (setvec[f->accept]) 1203 f->out[f->curstat] = 1; 1204 else 1205 f->out[f->curstat] = 0; 1206 return f->curstat; 1207 } 1208 1209 1210 void freefa(fa *f) /* free a finite automaton */ 1211 { 1212 int i; 1213 1214 if (f == NULL) 1215 return; 1216 for (i = 0; i <= f->curstat; i++) 1217 xfree(f->posns[i]); 1218 for (i = 0; i <= f->accept; i++) { 1219 xfree(f->re[i].lfollow); 1220 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1221 xfree((f->re[i].lval.np)); 1222 } 1223 xfree(f->restr); 1224 xfree(f); 1225 } 1226