1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 /* lasciate ogne speranza, voi ch'intrate. */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #define DEBUG 31 32 #include <ctype.h> 33 #include <limits.h> 34 #include <stdio.h> 35 #include <string.h> 36 #include <stdlib.h> 37 #include "awk.h" 38 #include "ytab.h" 39 40 #define HAT (NCHARS+2) /* matches ^ in regular expr */ 41 /* NCHARS is 2**n */ 42 #define MAXLIN 22 43 44 #define type(v) (v)->nobj /* badly overloaded here */ 45 #define info(v) (v)->ntype /* badly overloaded here */ 46 #define left(v) (v)->narg[0] 47 #define right(v) (v)->narg[1] 48 #define parent(v) (v)->nnext 49 50 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 51 #define ELEAF case EMPTYRE: /* empty string in regexp */ 52 #define UNARY case STAR: case PLUS: case QUEST: 53 54 /* encoding in tree Nodes: 55 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 56 left is index, right contains value or pointer to value 57 unary (STAR, PLUS, QUEST): left is child, right is null 58 binary (CAT, OR): left and right are children 59 parent contains pointer to parent 60 */ 61 62 63 int *setvec; 64 int *tmpset; 65 int maxsetvec = 0; 66 67 int rtok; /* next token in current re */ 68 int rlxval; 69 static uschar *rlxstr; 70 static uschar *prestr; /* current position in current re */ 71 static uschar *lastre; /* origin of last re */ 72 static uschar *lastatom; /* origin of last Atom */ 73 static uschar *starttok; 74 static uschar *basestr; /* starts with original, replaced during 75 repetition processing */ 76 static uschar *firstbasestr; 77 78 static int setcnt; 79 static int poscnt; 80 81 char *patbeg; 82 int patlen; 83 84 #define NFA 20 /* cache this many dynamic fa's */ 85 fa *fatab[NFA]; 86 int nfatab = 0; /* entries in fatab */ 87 88 fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */ 89 { 90 int i, use, nuse; 91 fa *pfa; 92 static int now = 1; 93 94 if (setvec == NULL) { /* first time through any RE */ 95 maxsetvec = MAXLIN; 96 setvec = (int *) malloc(maxsetvec * sizeof(int)); 97 tmpset = (int *) malloc(maxsetvec * sizeof(int)); 98 if (setvec == NULL || tmpset == NULL) 99 overflo("out of space initializing makedfa"); 100 } 101 102 if (compile_time) /* a constant for sure */ 103 return mkdfa(s, anchor); 104 for (i = 0; i < nfatab; i++) /* is it there already? */ 105 if (fatab[i]->anchor == anchor 106 && strcmp((const char *) fatab[i]->restr, s) == 0) { 107 fatab[i]->use = now++; 108 return fatab[i]; 109 } 110 pfa = mkdfa(s, anchor); 111 if (nfatab < NFA) { /* room for another */ 112 fatab[nfatab] = pfa; 113 fatab[nfatab]->use = now++; 114 nfatab++; 115 return pfa; 116 } 117 use = fatab[0]->use; /* replace least-recently used */ 118 nuse = 0; 119 for (i = 1; i < nfatab; i++) 120 if (fatab[i]->use < use) { 121 use = fatab[i]->use; 122 nuse = i; 123 } 124 freefa(fatab[nuse]); 125 fatab[nuse] = pfa; 126 pfa->use = now++; 127 return pfa; 128 } 129 130 fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */ 131 /* anchor = 1 for anchored matches, else 0 */ 132 { 133 Node *p, *p1; 134 fa *f; 135 136 firstbasestr = (uschar *) s; 137 basestr = firstbasestr; 138 p = reparse(s); 139 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 140 /* put ALL STAR in front of reg. exp. */ 141 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 142 /* put FINAL after reg. exp. */ 143 144 poscnt = 0; 145 penter(p1); /* enter parent pointers and leaf indices */ 146 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL) 147 overflo("out of space for fa"); 148 f->accept = poscnt-1; /* penter has computed number of positions in re */ 149 cfoll(f, p1); /* set up follow sets */ 150 freetr(p1); 151 if ((f->posns[0] = (int *) calloc(*(f->re[0].lfollow), sizeof(int))) == NULL) 152 overflo("out of space in makedfa"); 153 if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL) 154 overflo("out of space in makedfa"); 155 *f->posns[1] = 0; 156 f->initstat = makeinit(f, anchor); 157 f->anchor = anchor; 158 f->restr = (uschar *) tostring(s); 159 if (firstbasestr != basestr) { 160 if (basestr) 161 xfree(basestr); 162 } 163 return f; 164 } 165 166 int makeinit(fa *f, int anchor) 167 { 168 int i, k; 169 170 f->curstat = 2; 171 f->out[2] = 0; 172 f->reset = 0; 173 k = *(f->re[0].lfollow); 174 xfree(f->posns[2]); 175 if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL) 176 overflo("out of space in makeinit"); 177 for (i=0; i <= k; i++) { 178 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 179 } 180 if ((f->posns[2])[1] == f->accept) 181 f->out[2] = 1; 182 for (i=0; i < NCHARS; i++) 183 f->gototab[2][i] = 0; 184 f->curstat = cgoto(f, 2, HAT); 185 if (anchor) { 186 *f->posns[2] = k-1; /* leave out position 0 */ 187 for (i=0; i < k; i++) { 188 (f->posns[0])[i] = (f->posns[2])[i]; 189 } 190 191 f->out[0] = f->out[2]; 192 if (f->curstat != 2) 193 --(*f->posns[f->curstat]); 194 } 195 return f->curstat; 196 } 197 198 void penter(Node *p) /* set up parent pointers and leaf indices */ 199 { 200 switch (type(p)) { 201 ELEAF 202 LEAF 203 info(p) = poscnt; 204 poscnt++; 205 break; 206 UNARY 207 penter(left(p)); 208 parent(left(p)) = p; 209 break; 210 case CAT: 211 case OR: 212 penter(left(p)); 213 penter(right(p)); 214 parent(left(p)) = p; 215 parent(right(p)) = p; 216 break; 217 default: /* can't happen */ 218 FATAL("can't happen: unknown type %d in penter", type(p)); 219 break; 220 } 221 } 222 223 void freetr(Node *p) /* free parse tree */ 224 { 225 switch (type(p)) { 226 ELEAF 227 LEAF 228 xfree(p); 229 break; 230 UNARY 231 freetr(left(p)); 232 xfree(p); 233 break; 234 case CAT: 235 case OR: 236 freetr(left(p)); 237 freetr(right(p)); 238 xfree(p); 239 break; 240 default: /* can't happen */ 241 FATAL("can't happen: unknown type %d in freetr", type(p)); 242 break; 243 } 244 } 245 246 /* in the parsing of regular expressions, metacharacters like . have */ 247 /* to be seen literally; \056 is not a metacharacter. */ 248 249 int hexstr(uschar **pp) /* find and eval hex string at pp, return new p */ 250 { /* only pick up one 8-bit byte (2 chars) */ 251 uschar *p; 252 int n = 0; 253 int i; 254 255 for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) { 256 if (isdigit(*p)) 257 n = 16 * n + *p - '0'; 258 else if (*p >= 'a' && *p <= 'f') 259 n = 16 * n + *p - 'a' + 10; 260 else if (*p >= 'A' && *p <= 'F') 261 n = 16 * n + *p - 'A' + 10; 262 } 263 *pp = (uschar *) p; 264 return n; 265 } 266 267 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 268 269 int quoted(uschar **pp) /* pick up next thing after a \\ */ 270 /* and increment *pp */ 271 { 272 uschar *p = *pp; 273 int c; 274 275 if ((c = *p++) == 't') 276 c = '\t'; 277 else if (c == 'n') 278 c = '\n'; 279 else if (c == 'f') 280 c = '\f'; 281 else if (c == 'r') 282 c = '\r'; 283 else if (c == 'b') 284 c = '\b'; 285 else if (c == '\\') 286 c = '\\'; 287 else if (c == 'x') { /* hexadecimal goo follows */ 288 c = hexstr(&p); /* this adds a null if number is invalid */ 289 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 290 int n = c - '0'; 291 if (isoctdigit(*p)) { 292 n = 8 * n + *p++ - '0'; 293 if (isoctdigit(*p)) 294 n = 8 * n + *p++ - '0'; 295 } 296 c = n; 297 } /* else */ 298 /* c = c; */ 299 *pp = p; 300 return c; 301 } 302 303 static int collate_range_cmp(int a, int b) 304 { 305 static char s[2][2]; 306 307 if ((uschar)a == (uschar)b) 308 return 0; 309 s[0][0] = a; 310 s[1][0] = b; 311 return (strcoll(s[0], s[1])); 312 } 313 314 char *cclenter(const char *argp) /* add a character class */ 315 { 316 int i, c, c2; 317 int j; 318 uschar *p = (uschar *) argp; 319 uschar *op, *bp; 320 static uschar *buf = NULL; 321 static int bufsz = 100; 322 323 op = p; 324 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 325 FATAL("out of space for character class [%.10s...] 1", p); 326 bp = buf; 327 for (i = 0; (c = *p++) != 0; ) { 328 if (c == '\\') { 329 c = quoted(&p); 330 } else if (c == '-' && i > 0 && bp[-1] != 0) { 331 if (*p != 0) { 332 c = bp[-1]; 333 c2 = *p++; 334 if (c2 == '\\') 335 c2 = quoted(&p); 336 if (collate_range_cmp(c, c2) > 0) { 337 bp--; 338 i--; 339 continue; 340 } 341 for (j = 0; j < NCHARS; j++) { 342 if ((collate_range_cmp(c, j) > 0) || 343 collate_range_cmp(j, c2) > 0) 344 continue; 345 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) 346 FATAL("out of space for character class [%.10s...] 2", p); 347 *bp++ = j; 348 i++; 349 } 350 continue; 351 } 352 } 353 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) 354 FATAL("out of space for character class [%.10s...] 3", p); 355 *bp++ = c; 356 i++; 357 } 358 *bp = 0; 359 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) ); 360 xfree(op); 361 return (char *) tostring((char *) buf); 362 } 363 364 void overflo(const char *s) 365 { 366 FATAL("regular expression too big: %.30s...", s); 367 } 368 369 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 370 { 371 int i; 372 int *p; 373 374 switch (type(v)) { 375 ELEAF 376 LEAF 377 f->re[info(v)].ltype = type(v); 378 f->re[info(v)].lval.np = right(v); 379 while (f->accept >= maxsetvec) { /* guessing here! */ 380 maxsetvec *= 4; 381 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 382 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 383 if (setvec == NULL || tmpset == NULL) 384 overflo("out of space in cfoll()"); 385 } 386 for (i = 0; i <= f->accept; i++) 387 setvec[i] = 0; 388 setcnt = 0; 389 follow(v); /* computes setvec and setcnt */ 390 if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL) 391 overflo("out of space building follow set"); 392 f->re[info(v)].lfollow = p; 393 *p = setcnt; 394 for (i = f->accept; i >= 0; i--) 395 if (setvec[i] == 1) 396 *++p = i; 397 break; 398 UNARY 399 cfoll(f,left(v)); 400 break; 401 case CAT: 402 case OR: 403 cfoll(f,left(v)); 404 cfoll(f,right(v)); 405 break; 406 default: /* can't happen */ 407 FATAL("can't happen: unknown type %d in cfoll", type(v)); 408 } 409 } 410 411 int first(Node *p) /* collects initially active leaves of p into setvec */ 412 /* returns 0 if p matches empty string */ 413 { 414 int b, lp; 415 416 switch (type(p)) { 417 ELEAF 418 LEAF 419 lp = info(p); /* look for high-water mark of subscripts */ 420 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 421 maxsetvec *= 4; 422 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 423 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 424 if (setvec == NULL || tmpset == NULL) 425 overflo("out of space in first()"); 426 } 427 if (type(p) == EMPTYRE) { 428 setvec[lp] = 0; 429 return(0); 430 } 431 if (setvec[lp] != 1) { 432 setvec[lp] = 1; 433 setcnt++; 434 } 435 if (type(p) == CCL && (*(char *) right(p)) == '\0') 436 return(0); /* empty CCL */ 437 else return(1); 438 case PLUS: 439 if (first(left(p)) == 0) return(0); 440 return(1); 441 case STAR: 442 case QUEST: 443 first(left(p)); 444 return(0); 445 case CAT: 446 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 447 return(1); 448 case OR: 449 b = first(right(p)); 450 if (first(left(p)) == 0 || b == 0) return(0); 451 return(1); 452 } 453 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 454 return(-1); 455 } 456 457 void follow(Node *v) /* collects leaves that can follow v into setvec */ 458 { 459 Node *p; 460 461 if (type(v) == FINAL) 462 return; 463 p = parent(v); 464 switch (type(p)) { 465 case STAR: 466 case PLUS: 467 first(v); 468 follow(p); 469 return; 470 471 case OR: 472 case QUEST: 473 follow(p); 474 return; 475 476 case CAT: 477 if (v == left(p)) { /* v is left child of p */ 478 if (first(right(p)) == 0) { 479 follow(p); 480 return; 481 } 482 } else /* v is right child */ 483 follow(p); 484 return; 485 } 486 } 487 488 int member(int c, const char *sarg) /* is c in s? */ 489 { 490 uschar *s = (uschar *) sarg; 491 492 while (*s) 493 if (c == *s++) 494 return(1); 495 return(0); 496 } 497 498 int match(fa *f, const char *p0) /* shortest match ? */ 499 { 500 int s, ns; 501 uschar *p = (uschar *) p0; 502 503 s = f->reset ? makeinit(f,0) : f->initstat; 504 if (f->out[s]) 505 return(1); 506 do { 507 /* assert(*p < NCHARS); */ 508 if ((ns = f->gototab[s][*p]) != 0) 509 s = ns; 510 else 511 s = cgoto(f, s, *p); 512 if (f->out[s]) 513 return(1); 514 } while (*p++ != 0); 515 return(0); 516 } 517 518 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 519 { 520 int s, ns; 521 uschar *p = (uschar *) p0; 522 uschar *q; 523 int i, k; 524 525 /* s = f->reset ? makeinit(f,1) : f->initstat; */ 526 if (f->reset) { 527 f->initstat = s = makeinit(f,1); 528 } else { 529 s = f->initstat; 530 } 531 patbeg = (char *) p; 532 patlen = -1; 533 do { 534 q = p; 535 do { 536 if (f->out[s]) /* final state */ 537 patlen = q-p; 538 /* assert(*q < NCHARS); */ 539 if ((ns = f->gototab[s][*q]) != 0) 540 s = ns; 541 else 542 s = cgoto(f, s, *q); 543 if (s == 1) { /* no transition */ 544 if (patlen >= 0) { 545 patbeg = (char *) p; 546 return(1); 547 } 548 else 549 goto nextin; /* no match */ 550 } 551 } while (*q++ != 0); 552 if (f->out[s]) 553 patlen = q-p-1; /* don't count $ */ 554 if (patlen >= 0) { 555 patbeg = (char *) p; 556 return(1); 557 } 558 nextin: 559 s = 2; 560 if (f->reset) { 561 for (i = 2; i <= f->curstat; i++) 562 xfree(f->posns[i]); 563 k = *f->posns[0]; 564 if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL) 565 overflo("out of space in pmatch"); 566 for (i = 0; i <= k; i++) 567 (f->posns[2])[i] = (f->posns[0])[i]; 568 f->initstat = f->curstat = 2; 569 f->out[2] = f->out[0]; 570 for (i = 0; i < NCHARS; i++) 571 f->gototab[2][i] = 0; 572 } 573 } while (*p++ != 0); 574 return (0); 575 } 576 577 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 578 { 579 int s, ns; 580 uschar *p = (uschar *) p0; 581 uschar *q; 582 int i, k; 583 584 /* s = f->reset ? makeinit(f,1) : f->initstat; */ 585 if (f->reset) { 586 f->initstat = s = makeinit(f,1); 587 } else { 588 s = f->initstat; 589 } 590 patlen = -1; 591 while (*p) { 592 q = p; 593 do { 594 if (f->out[s]) /* final state */ 595 patlen = q-p; 596 /* assert(*q < NCHARS); */ 597 if ((ns = f->gototab[s][*q]) != 0) 598 s = ns; 599 else 600 s = cgoto(f, s, *q); 601 if (s == 1) { /* no transition */ 602 if (patlen > 0) { 603 patbeg = (char *) p; 604 return(1); 605 } else 606 goto nnextin; /* no nonempty match */ 607 } 608 } while (*q++ != 0); 609 if (f->out[s]) 610 patlen = q-p-1; /* don't count $ */ 611 if (patlen > 0 ) { 612 patbeg = (char *) p; 613 return(1); 614 } 615 nnextin: 616 s = 2; 617 if (f->reset) { 618 for (i = 2; i <= f->curstat; i++) 619 xfree(f->posns[i]); 620 k = *f->posns[0]; 621 if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL) 622 overflo("out of state space"); 623 for (i = 0; i <= k; i++) 624 (f->posns[2])[i] = (f->posns[0])[i]; 625 f->initstat = f->curstat = 2; 626 f->out[2] = f->out[0]; 627 for (i = 0; i < NCHARS; i++) 628 f->gototab[2][i] = 0; 629 } 630 p++; 631 } 632 return (0); 633 } 634 635 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 636 { /* uses relex() to scan regular expression */ 637 Node *np; 638 639 dprintf( ("reparse <%s>\n", p) ); 640 lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */ 641 rtok = relex(); 642 /* GNU compatibility: an empty regexp matches anything */ 643 if (rtok == '\0') { 644 /* FATAL("empty regular expression"); previous */ 645 return(op2(EMPTYRE, NIL, NIL)); 646 } 647 np = regexp(); 648 if (rtok != '\0') 649 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 650 return(np); 651 } 652 653 Node *regexp(void) /* top-level parse of reg expr */ 654 { 655 return (alt(concat(primary()))); 656 } 657 658 Node *primary(void) 659 { 660 Node *np; 661 int savelastatom; 662 663 switch (rtok) { 664 case CHAR: 665 lastatom = starttok; 666 np = op2(CHAR, NIL, itonp(rlxval)); 667 rtok = relex(); 668 return (unary(np)); 669 case ALL: 670 rtok = relex(); 671 return (unary(op2(ALL, NIL, NIL))); 672 case EMPTYRE: 673 rtok = relex(); 674 return (unary(op2(EMPTYRE, NIL, NIL))); 675 case DOT: 676 lastatom = starttok; 677 rtok = relex(); 678 return (unary(op2(DOT, NIL, NIL))); 679 case CCL: 680 np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); 681 lastatom = starttok; 682 rtok = relex(); 683 return (unary(np)); 684 case NCCL: 685 np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); 686 lastatom = starttok; 687 rtok = relex(); 688 return (unary(np)); 689 case '^': 690 rtok = relex(); 691 return (unary(op2(CHAR, NIL, itonp(HAT)))); 692 case '$': 693 rtok = relex(); 694 return (unary(op2(CHAR, NIL, NIL))); 695 case '(': 696 lastatom = starttok; 697 savelastatom = starttok - basestr; /* Retain over recursion */ 698 rtok = relex(); 699 if (rtok == ')') { /* special pleading for () */ 700 rtok = relex(); 701 return unary(op2(CCL, NIL, (Node *) tostring(""))); 702 } 703 np = regexp(); 704 if (rtok == ')') { 705 lastatom = basestr + savelastatom; /* Restore */ 706 rtok = relex(); 707 return (unary(np)); 708 } 709 else 710 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 711 default: 712 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 713 } 714 return 0; /*NOTREACHED*/ 715 } 716 717 Node *concat(Node *np) 718 { 719 switch (rtok) { 720 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 721 return (concat(op2(CAT, np, primary()))); 722 case EMPTYRE: 723 rtok = relex(); 724 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")), 725 primary()))); 726 } 727 return (np); 728 } 729 730 Node *alt(Node *np) 731 { 732 if (rtok == OR) { 733 rtok = relex(); 734 return (alt(op2(OR, np, concat(primary())))); 735 } 736 return (np); 737 } 738 739 Node *unary(Node *np) 740 { 741 switch (rtok) { 742 case STAR: 743 rtok = relex(); 744 return (unary(op2(STAR, np, NIL))); 745 case PLUS: 746 rtok = relex(); 747 return (unary(op2(PLUS, np, NIL))); 748 case QUEST: 749 rtok = relex(); 750 return (unary(op2(QUEST, np, NIL))); 751 default: 752 return (np); 753 } 754 } 755 756 /* 757 * Character class definitions conformant to the POSIX locale as 758 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 759 * and operating character sets are both ASCII (ISO646) or supersets 760 * thereof. 761 * 762 * Note that to avoid overflowing the temporary buffer used in 763 * relex(), the expanded character class (prior to range expansion) 764 * must be less than twice the size of their full name. 765 */ 766 767 /* Because isblank doesn't show up in any of the header files on any 768 * system i use, it's defined here. if some other locale has a richer 769 * definition of "blank", define HAS_ISBLANK and provide your own 770 * version. 771 * the parentheses here are an attempt to find a path through the maze 772 * of macro definition and/or function and/or version provided. thanks 773 * to nelson beebe for the suggestion; let's see if it works everywhere. 774 */ 775 776 /* #define HAS_ISBLANK */ 777 #ifndef HAS_ISBLANK 778 779 int (xisblank)(int c) 780 { 781 return c==' ' || c=='\t'; 782 } 783 784 #endif 785 786 struct charclass { 787 const char *cc_name; 788 int cc_namelen; 789 int (*cc_func)(int); 790 } charclasses[] = { 791 { "alnum", 5, isalnum }, 792 { "alpha", 5, isalpha }, 793 #ifndef HAS_ISBLANK 794 { "blank", 5, xisblank }, 795 #else 796 { "blank", 5, isblank }, 797 #endif 798 { "cntrl", 5, iscntrl }, 799 { "digit", 5, isdigit }, 800 { "graph", 5, isgraph }, 801 { "lower", 5, islower }, 802 { "print", 5, isprint }, 803 { "punct", 5, ispunct }, 804 { "space", 5, isspace }, 805 { "upper", 5, isupper }, 806 { "xdigit", 6, isxdigit }, 807 { NULL, 0, NULL }, 808 }; 809 810 #define REPEAT_SIMPLE 0 811 #define REPEAT_PLUS_APPENDED 1 812 #define REPEAT_WITH_Q 2 813 #define REPEAT_ZERO 3 814 815 static int 816 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 817 int atomlen, int firstnum, int secondnum, int special_case) 818 { 819 int i, j; 820 uschar *buf = 0; 821 int ret = 1; 822 int init_q = (firstnum==0); /* first added char will be ? */ 823 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 824 int prefix_length = reptok - basestr; /* prefix includes first rep */ 825 int suffix_length = strlen((char *) reptok) - reptoklen; /* string after rep specifier */ 826 int size = prefix_length + suffix_length; 827 828 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 829 size += atomlen*(firstnum-1); 830 } 831 832 /* Adjust size of buffer for special cases */ 833 if (special_case == REPEAT_PLUS_APPENDED) { 834 size++; /* for the final + */ 835 } else if (special_case == REPEAT_WITH_Q) { 836 size += init_q + (atomlen+1)* n_q_reps; 837 } else if (special_case == REPEAT_ZERO) { 838 size += 2; /* just a null ERE: () */ 839 } 840 if ((buf = (uschar *) malloc(size+1)) == NULL) 841 FATAL("out of space in reg expr %.10s..", lastre); 842 memcpy(buf, basestr, prefix_length); /* copy prefix */ 843 j = prefix_length; 844 if (special_case == REPEAT_ZERO) { 845 j -= atomlen; 846 buf[j++] = '('; 847 buf[j++] = ')'; 848 } 849 for (i=1; i < firstnum; i++) { /* copy x reps */ 850 memcpy(&buf[j], atom, atomlen); 851 j += atomlen; 852 } 853 if (special_case == REPEAT_PLUS_APPENDED) { 854 buf[j++] = '+'; 855 } else if (special_case == REPEAT_WITH_Q) { 856 if (init_q) buf[j++] = '?'; 857 for (i=0; i < n_q_reps; i++) { /* copy x? reps */ 858 memcpy(&buf[j], atom, atomlen); 859 j += atomlen; 860 buf[j++] = '?'; 861 } 862 } 863 memcpy(&buf[j], reptok+reptoklen, suffix_length); 864 if (special_case == REPEAT_ZERO) { 865 buf[j+suffix_length] = '\0'; 866 } else { 867 buf[size] = '\0'; 868 } 869 /* free old basestr */ 870 if (firstbasestr != basestr) { 871 if (basestr) 872 xfree(basestr); 873 } 874 basestr = buf; 875 prestr = buf + prefix_length; 876 if (special_case == REPEAT_ZERO) { 877 prestr -= atomlen; 878 ret++; 879 } 880 return ret; 881 } 882 883 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 884 int atomlen, int firstnum, int secondnum) 885 { 886 /* 887 In general, the repetition specifier or "bound" is replaced here 888 by an equivalent ERE string, repeating the immediately previous atom 889 and appending ? and + as needed. Note that the first copy of the 890 atom is left in place, except in the special_case of a zero-repeat 891 (i.e., {0}). 892 */ 893 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 894 if (firstnum < 2) { 895 /* 0 or 1: should be handled before you get here */ 896 FATAL("internal error"); 897 } else { 898 return replace_repeat(reptok, reptoklen, atom, atomlen, 899 firstnum, secondnum, REPEAT_PLUS_APPENDED); 900 } 901 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 902 if (firstnum == 0) { /* {0} or {0,0} */ 903 /* This case is unusual because the resulting 904 replacement string might actually be SMALLER than 905 the original ERE */ 906 return replace_repeat(reptok, reptoklen, atom, atomlen, 907 firstnum, secondnum, REPEAT_ZERO); 908 } else { /* (firstnum >= 1) */ 909 return replace_repeat(reptok, reptoklen, atom, atomlen, 910 firstnum, secondnum, REPEAT_SIMPLE); 911 } 912 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 913 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 914 return replace_repeat(reptok, reptoklen, atom, atomlen, 915 firstnum, secondnum, REPEAT_WITH_Q); 916 } else { /* Error - shouldn't be here (n>m) */ 917 FATAL("internal error"); 918 } 919 return 0; 920 } 921 922 int relex(void) /* lexical analyzer for reparse */ 923 { 924 int c, n; 925 int cflag; 926 static uschar *buf = NULL; 927 static int bufsz = 100; 928 uschar *bp; 929 struct charclass *cc; 930 int i; 931 int num, m, commafound, digitfound; 932 const uschar *startreptok; 933 934 rescan: 935 starttok = prestr; 936 937 switch (c = *prestr++) { 938 case '|': return OR; 939 case '*': return STAR; 940 case '+': return PLUS; 941 case '?': return QUEST; 942 case '.': return DOT; 943 case '\0': prestr--; return '\0'; 944 case '^': 945 case '$': 946 case '(': 947 case ')': 948 return c; 949 case '\\': 950 rlxval = quoted(&prestr); 951 return CHAR; 952 default: 953 rlxval = c; 954 return CHAR; 955 case '[': 956 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 957 FATAL("out of space in reg expr %.10s..", lastre); 958 bp = buf; 959 if (*prestr == '^') { 960 cflag = 1; 961 prestr++; 962 } 963 else 964 cflag = 0; 965 n = 2 * strlen((const char *) prestr)+1; 966 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 967 FATAL("out of space for reg expr %.10s...", lastre); 968 for (; ; ) { 969 if ((c = *prestr++) == '\\') { 970 *bp++ = '\\'; 971 if ((c = *prestr++) == '\0') 972 FATAL("nonterminated character class %.20s...", lastre); 973 *bp++ = c; 974 /* } else if (c == '\n') { */ 975 /* FATAL("newline in character class %.20s...", lastre); */ 976 } else if (c == '[' && *prestr == ':') { 977 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 978 for (cc = charclasses; cc->cc_name; cc++) 979 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 980 break; 981 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 982 prestr[2 + cc->cc_namelen] == ']') { 983 prestr += cc->cc_namelen + 3; 984 /* 985 * BUG: We begin at 1, instead of 0, since we 986 * would otherwise prematurely terminate the 987 * string for classes like [[:cntrl:]]. This 988 * means that we can't match the NUL character, 989 * not without first adapting the entire 990 * program to track each string's length. 991 */ 992 for (i = 1; i <= UCHAR_MAX; i++) { 993 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) 994 FATAL("out of space for reg expr %.10s...", lastre); 995 if (cc->cc_func(i)) { 996 *bp++ = i; 997 n++; 998 } 999 } 1000 } else 1001 *bp++ = c; 1002 } else if (c == '[' && *prestr == '.') { 1003 char collate_char; 1004 prestr++; 1005 collate_char = *prestr++; 1006 if (*prestr == '.' && prestr[1] == ']') { 1007 prestr += 2; 1008 /* Found it: map via locale TBD: for 1009 now, simply return this char. This 1010 is sufficient to pass conformance 1011 test awk.ex 156 1012 */ 1013 if (*prestr == ']') { 1014 prestr++; 1015 rlxval = collate_char; 1016 return CHAR; 1017 } 1018 } 1019 } else if (c == '[' && *prestr == '=') { 1020 char equiv_char; 1021 prestr++; 1022 equiv_char = *prestr++; 1023 if (*prestr == '=' && prestr[1] == ']') { 1024 prestr += 2; 1025 /* Found it: map via locale TBD: for now 1026 simply return this char. This is 1027 sufficient to pass conformance test 1028 awk.ex 156 1029 */ 1030 if (*prestr == ']') { 1031 prestr++; 1032 rlxval = equiv_char; 1033 return CHAR; 1034 } 1035 } 1036 } else if (c == '\0') { 1037 FATAL("nonterminated character class %.20s", lastre); 1038 } else if (bp == buf) { /* 1st char is special */ 1039 *bp++ = c; 1040 } else if (c == ']') { 1041 *bp++ = 0; 1042 rlxstr = (uschar *) tostring((char *) buf); 1043 if (cflag == 0) 1044 return CCL; 1045 else 1046 return NCCL; 1047 } else 1048 *bp++ = c; 1049 } 1050 break; 1051 case '{': 1052 if (isdigit(*(prestr))) { 1053 num = 0; /* Process as a repetition */ 1054 n = -1; m = -1; 1055 commafound = 0; 1056 digitfound = 0; 1057 startreptok = prestr-1; 1058 /* Remember start of previous atom here ? */ 1059 } else { /* just a { char, not a repetition */ 1060 rlxval = c; 1061 return CHAR; 1062 } 1063 for (; ; ) { 1064 if ((c = *prestr++) == '}') { 1065 if (commafound) { 1066 if (digitfound) { /* {n,m} */ 1067 m = num; 1068 if (m<n) 1069 FATAL("illegal repetition expression: class %.20s", 1070 lastre); 1071 if ((n==0) && (m==1)) { 1072 return QUEST; 1073 } 1074 } else { /* {n,} */ 1075 if (n==0) return STAR; 1076 if (n==1) return PLUS; 1077 } 1078 } else { 1079 if (digitfound) { /* {n} same as {n,n} */ 1080 n = num; 1081 m = num; 1082 } else { /* {} */ 1083 FATAL("illegal repetition expression: class %.20s", 1084 lastre); 1085 } 1086 } 1087 if (repeat(starttok, prestr-starttok, lastatom, 1088 startreptok - lastatom, n, m) > 0) { 1089 if ((n==0) && (m==0)) { 1090 return EMPTYRE; 1091 } 1092 /* must rescan input for next token */ 1093 goto rescan; 1094 } 1095 /* Failed to replace: eat up {...} characters 1096 and treat like just PLUS */ 1097 return PLUS; 1098 } else if (c == '\0') { 1099 FATAL("nonterminated character class %.20s", 1100 lastre); 1101 } else if (isdigit(c)) { 1102 num = 10 * num + c - '0'; 1103 digitfound = 1; 1104 } else if (c == ',') { 1105 if (commafound) 1106 FATAL("illegal repetition expression: class %.20s", 1107 lastre); 1108 /* looking for {n,} or {n,m} */ 1109 commafound = 1; 1110 n = num; 1111 digitfound = 0; /* reset */ 1112 num = 0; 1113 } else { 1114 FATAL("illegal repetition expression: class %.20s", 1115 lastre); 1116 } 1117 } 1118 break; 1119 } 1120 } 1121 1122 int cgoto(fa *f, int s, int c) 1123 { 1124 int i, j, k; 1125 int *p, *q; 1126 1127 assert(c == HAT || c < NCHARS); 1128 while (f->accept >= maxsetvec) { /* guessing here! */ 1129 maxsetvec *= 4; 1130 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 1131 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 1132 if (setvec == NULL || tmpset == NULL) 1133 overflo("out of space in cgoto()"); 1134 } 1135 for (i = 0; i <= f->accept; i++) 1136 setvec[i] = 0; 1137 setcnt = 0; 1138 /* compute positions of gototab[s,c] into setvec */ 1139 p = f->posns[s]; 1140 for (i = 1; i <= *p; i++) { 1141 if ((k = f->re[p[i]].ltype) != FINAL) { 1142 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1143 || (k == DOT && c != 0 && c != HAT) 1144 || (k == ALL && c != 0) 1145 || (k == EMPTYRE && c != 0) 1146 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) 1147 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { 1148 q = f->re[p[i]].lfollow; 1149 for (j = 1; j <= *q; j++) { 1150 if (q[j] >= maxsetvec) { 1151 maxsetvec *= 4; 1152 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); 1153 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); 1154 if (setvec == NULL || tmpset == NULL) 1155 overflo("cgoto overflow"); 1156 } 1157 if (setvec[q[j]] == 0) { 1158 setcnt++; 1159 setvec[q[j]] = 1; 1160 } 1161 } 1162 } 1163 } 1164 } 1165 /* determine if setvec is a previous state */ 1166 tmpset[0] = setcnt; 1167 j = 1; 1168 for (i = f->accept; i >= 0; i--) 1169 if (setvec[i]) { 1170 tmpset[j++] = i; 1171 } 1172 /* tmpset == previous state? */ 1173 for (i = 1; i <= f->curstat; i++) { 1174 p = f->posns[i]; 1175 if ((k = tmpset[0]) != p[0]) 1176 goto different; 1177 for (j = 1; j <= k; j++) 1178 if (tmpset[j] != p[j]) 1179 goto different; 1180 /* setvec is state i */ 1181 f->gototab[s][c] = i; 1182 return i; 1183 different:; 1184 } 1185 1186 /* add tmpset to current set of states */ 1187 if (f->curstat >= NSTATES-1) { 1188 f->curstat = 2; 1189 f->reset = 1; 1190 for (i = 2; i < NSTATES; i++) 1191 xfree(f->posns[i]); 1192 } else 1193 ++(f->curstat); 1194 for (i = 0; i < NCHARS; i++) 1195 f->gototab[f->curstat][i] = 0; 1196 xfree(f->posns[f->curstat]); 1197 if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL) 1198 overflo("out of space in cgoto"); 1199 1200 f->posns[f->curstat] = p; 1201 f->gototab[s][c] = f->curstat; 1202 for (i = 0; i <= setcnt; i++) 1203 p[i] = tmpset[i]; 1204 if (setvec[f->accept]) 1205 f->out[f->curstat] = 1; 1206 else 1207 f->out[f->curstat] = 0; 1208 return f->curstat; 1209 } 1210 1211 1212 void freefa(fa *f) /* free a finite automaton */ 1213 { 1214 int i; 1215 1216 if (f == NULL) 1217 return; 1218 for (i = 0; i <= f->curstat; i++) 1219 xfree(f->posns[i]); 1220 for (i = 0; i <= f->accept; i++) { 1221 xfree(f->re[i].lfollow); 1222 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1223 xfree((f->re[i].lval.np)); 1224 } 1225 xfree(f->restr); 1226 xfree(f); 1227 } 1228