1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 /* lasciate ogne speranza, voi ch'intrate. */ 26 27 #define DEBUG 28 29 #include <ctype.h> 30 #include <limits.h> 31 #include <stdio.h> 32 #include <string.h> 33 #include <stdlib.h> 34 #include "awk.h" 35 #include "awkgram.tab.h" 36 37 #define MAXLIN 22 38 39 #define type(v) (v)->nobj /* badly overloaded here */ 40 #define info(v) (v)->ntype /* badly overloaded here */ 41 #define left(v) (v)->narg[0] 42 #define right(v) (v)->narg[1] 43 #define parent(v) (v)->nnext 44 45 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 46 #define ELEAF case EMPTYRE: /* empty string in regexp */ 47 #define UNARY case STAR: case PLUS: case QUEST: 48 49 /* encoding in tree Nodes: 50 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 51 left is index, right contains value or pointer to value 52 unary (STAR, PLUS, QUEST): left is child, right is null 53 binary (CAT, OR): left and right are children 54 parent contains pointer to parent 55 */ 56 57 58 int *setvec; 59 int *tmpset; 60 int maxsetvec = 0; 61 62 int rtok; /* next token in current re */ 63 int rlxval; 64 static const uschar *rlxstr; 65 static const uschar *prestr; /* current position in current re */ 66 static const uschar *lastre; /* origin of last re */ 67 static const uschar *lastatom; /* origin of last Atom */ 68 static const uschar *starttok; 69 static const uschar *basestr; /* starts with original, replaced during 70 repetition processing */ 71 static const uschar *firstbasestr; 72 73 static int setcnt; 74 static int poscnt; 75 76 const char *patbeg; 77 int patlen; 78 79 #define NFA 128 /* cache this many dynamic fa's */ 80 fa *fatab[NFA]; 81 int nfatab = 0; /* entries in fatab */ 82 83 extern int u8_nextlen(const char *s); 84 85 86 /* utf-8 mechanism: 87 88 For most of Awk, utf-8 strings just "work", since they look like 89 null-terminated sequences of 8-bit bytes. 90 91 Functions like length(), index(), and substr() have to operate 92 in units of utf-8 characters. The u8_* functions in run.c 93 handle this. 94 95 Regular expressions are more complicated, since the basic 96 mechanism of the goto table used 8-bit byte indices into the 97 gototab entries to compute the next state. Unicode is a lot 98 bigger, so the gototab entries are now structs with a character 99 and a next state, and there is a linear search of the characters 100 to find the state. (Yes, this is slower, by a significant 101 amount. Tough.) 102 103 Throughout the RE mechanism in b.c, utf-8 characters are 104 converted to their utf-32 value. This mostly shows up in 105 cclenter, which expands character class ranges like a-z and now 106 alpha-omega. The size of a gototab array is still about 256. 107 This should be dynamic, but for now things work ok for a single 108 code page of Unicode, which is the most likely case. 109 110 The code changes are localized in run.c and b.c. I have added a 111 handful of functions to somewhat better hide the implementation, 112 but a lot more could be done. 113 114 */ 115 116 static int get_gototab(fa*, int, int); 117 static int set_gototab(fa*, int, int, int); 118 extern int u8_rune(int *, const uschar *); 119 120 static int * 121 intalloc(size_t n, const char *f) 122 { 123 int *p = (int *) calloc(n, sizeof(int)); 124 if (p == NULL) 125 overflo(f); 126 return p; 127 } 128 129 static void 130 resizesetvec(const char *f) 131 { 132 if (maxsetvec == 0) 133 maxsetvec = MAXLIN; 134 else 135 maxsetvec *= 4; 136 setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec)); 137 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset)); 138 if (setvec == NULL || tmpset == NULL) 139 overflo(f); 140 } 141 142 static void 143 resize_state(fa *f, int state) 144 { 145 gtt **p; 146 uschar *p2; 147 int **p3; 148 int i, new_count; 149 150 if (++state < f->state_count) 151 return; 152 153 new_count = state + 10; /* needs to be tuned */ 154 155 p = (gtt **) realloc(f->gototab, new_count * sizeof(f->gototab[0])); 156 if (p == NULL) 157 goto out; 158 f->gototab = p; 159 160 p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0])); 161 if (p2 == NULL) 162 goto out; 163 f->out = p2; 164 165 p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0])); 166 if (p3 == NULL) 167 goto out; 168 f->posns = p3; 169 170 for (i = f->state_count; i < new_count; ++i) { 171 f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab)); 172 if (f->gototab[i] == NULL) 173 goto out; 174 f->out[i] = 0; 175 f->posns[i] = NULL; 176 } 177 f->gototab_len = NCHARS; /* should be variable, growable */ 178 f->state_count = new_count; 179 return; 180 out: 181 overflo(__func__); 182 } 183 184 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */ 185 { 186 int i, use, nuse; 187 fa *pfa; 188 static int now = 1; 189 190 if (setvec == NULL) { /* first time through any RE */ 191 resizesetvec(__func__); 192 } 193 194 if (compile_time != RUNNING) /* a constant for sure */ 195 return mkdfa(s, anchor); 196 for (i = 0; i < nfatab; i++) /* is it there already? */ 197 if (fatab[i]->anchor == anchor 198 && strcmp((const char *) fatab[i]->restr, s) == 0) { 199 fatab[i]->use = now++; 200 return fatab[i]; 201 } 202 pfa = mkdfa(s, anchor); 203 if (nfatab < NFA) { /* room for another */ 204 fatab[nfatab] = pfa; 205 fatab[nfatab]->use = now++; 206 nfatab++; 207 return pfa; 208 } 209 use = fatab[0]->use; /* replace least-recently used */ 210 nuse = 0; 211 for (i = 1; i < nfatab; i++) 212 if (fatab[i]->use < use) { 213 use = fatab[i]->use; 214 nuse = i; 215 } 216 freefa(fatab[nuse]); 217 fatab[nuse] = pfa; 218 pfa->use = now++; 219 return pfa; 220 } 221 222 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */ 223 /* anchor = true for anchored matches, else false */ 224 { 225 Node *p, *p1; 226 fa *f; 227 228 firstbasestr = (const uschar *) s; 229 basestr = firstbasestr; 230 p = reparse(s); 231 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 232 /* put ALL STAR in front of reg. exp. */ 233 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 234 /* put FINAL after reg. exp. */ 235 236 poscnt = 0; 237 penter(p1); /* enter parent pointers and leaf indices */ 238 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL) 239 overflo(__func__); 240 f->accept = poscnt-1; /* penter has computed number of positions in re */ 241 cfoll(f, p1); /* set up follow sets */ 242 freetr(p1); 243 resize_state(f, 1); 244 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__); 245 f->posns[1] = intalloc(1, __func__); 246 *f->posns[1] = 0; 247 f->initstat = makeinit(f, anchor); 248 f->anchor = anchor; 249 f->restr = (uschar *) tostring(s); 250 if (firstbasestr != basestr) { 251 if (basestr) 252 xfree(basestr); 253 } 254 return f; 255 } 256 257 int makeinit(fa *f, bool anchor) 258 { 259 int i, k; 260 261 f->curstat = 2; 262 f->out[2] = 0; 263 k = *(f->re[0].lfollow); 264 xfree(f->posns[2]); 265 f->posns[2] = intalloc(k + 1, __func__); 266 for (i = 0; i <= k; i++) { 267 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 268 } 269 if ((f->posns[2])[1] == f->accept) 270 f->out[2] = 1; 271 for (i = 0; i < NCHARS; i++) 272 set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */ 273 f->curstat = cgoto(f, 2, HAT); 274 if (anchor) { 275 *f->posns[2] = k-1; /* leave out position 0 */ 276 for (i = 0; i < k; i++) { 277 (f->posns[0])[i] = (f->posns[2])[i]; 278 } 279 280 f->out[0] = f->out[2]; 281 if (f->curstat != 2) 282 --(*f->posns[f->curstat]); 283 } 284 return f->curstat; 285 } 286 287 void penter(Node *p) /* set up parent pointers and leaf indices */ 288 { 289 switch (type(p)) { 290 ELEAF 291 LEAF 292 info(p) = poscnt; 293 poscnt++; 294 break; 295 UNARY 296 penter(left(p)); 297 parent(left(p)) = p; 298 break; 299 case CAT: 300 case OR: 301 penter(left(p)); 302 penter(right(p)); 303 parent(left(p)) = p; 304 parent(right(p)) = p; 305 break; 306 case ZERO: 307 break; 308 default: /* can't happen */ 309 FATAL("can't happen: unknown type %d in penter", type(p)); 310 break; 311 } 312 } 313 314 void freetr(Node *p) /* free parse tree */ 315 { 316 switch (type(p)) { 317 ELEAF 318 LEAF 319 xfree(p); 320 break; 321 UNARY 322 case ZERO: 323 freetr(left(p)); 324 xfree(p); 325 break; 326 case CAT: 327 case OR: 328 freetr(left(p)); 329 freetr(right(p)); 330 xfree(p); 331 break; 332 default: /* can't happen */ 333 FATAL("can't happen: unknown type %d in freetr", type(p)); 334 break; 335 } 336 } 337 338 /* in the parsing of regular expressions, metacharacters like . have */ 339 /* to be seen literally; \056 is not a metacharacter. */ 340 341 int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */ 342 { /* only pick up one 8-bit byte (2 chars) */ 343 const uschar *p; 344 int n = 0; 345 int i; 346 347 for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) { 348 if (isdigit(*p)) 349 n = 16 * n + *p - '0'; 350 else if (*p >= 'a' && *p <= 'f') 351 n = 16 * n + *p - 'a' + 10; 352 else if (*p >= 'A' && *p <= 'F') 353 n = 16 * n + *p - 'A' + 10; 354 } 355 *pp = p; 356 return n; 357 } 358 359 360 361 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 362 363 int quoted(const uschar **pp) /* pick up next thing after a \\ */ 364 /* and increment *pp */ 365 { 366 const uschar *p = *pp; 367 int c; 368 369 /* BUG: should advance by utf-8 char even if makes no sense */ 370 371 if ((c = *p++) == 't') { 372 c = '\t'; 373 } else if (c == 'n') { 374 c = '\n'; 375 } else if (c == 'f') { 376 c = '\f'; 377 } else if (c == 'r') { 378 c = '\r'; 379 } else if (c == 'b') { 380 c = '\b'; 381 } else if (c == 'v') { 382 c = '\v'; 383 } else if (c == 'a') { 384 c = '\a'; 385 } else if (c == '\\') { 386 c = '\\'; 387 } else if (c == 'x') { /* 2 hex digits follow */ 388 c = hexstr(&p, 2); /* this adds a null if number is invalid */ 389 } else if (c == 'u') { /* unicode char number up to 8 hex digits */ 390 c = hexstr(&p, 8); 391 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 392 int n = c - '0'; 393 if (isoctdigit(*p)) { 394 n = 8 * n + *p++ - '0'; 395 if (isoctdigit(*p)) 396 n = 8 * n + *p++ - '0'; 397 } 398 c = n; 399 } /* else */ 400 /* c = c; */ 401 *pp = p; 402 return c; 403 } 404 405 int *cclenter(const char *argp) /* add a character class */ 406 { 407 int i, c, c2; 408 int n; 409 const uschar *p = (const uschar *) argp; 410 int *bp, *retp; 411 static int *buf = NULL; 412 static int bufsz = 100; 413 414 if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL) 415 FATAL("out of space for character class [%.10s...] 1", p); 416 bp = buf; 417 for (i = 0; *p != 0; ) { 418 n = u8_rune(&c, p); 419 p += n; 420 if (c == '\\') { 421 c = quoted(&p); 422 } else if (c == '-' && i > 0 && bp[-1] != 0) { 423 if (*p != 0) { 424 c = bp[-1]; 425 /* c2 = *p++; */ 426 n = u8_rune(&c2, p); 427 p += n; 428 if (c2 == '\\') 429 c2 = quoted(&p); /* BUG: sets p, has to be u8 size */ 430 if (c > c2) { /* empty; ignore */ 431 bp--; 432 i--; 433 continue; 434 } 435 while (c < c2) { 436 if (i >= bufsz) { 437 bufsz *= 2; 438 buf = (int *) realloc(buf, bufsz * sizeof(int)); 439 if (buf == NULL) 440 FATAL("out of space for character class [%.10s...] 2", p); 441 bp = buf + i; 442 } 443 *bp++ = ++c; 444 i++; 445 } 446 continue; 447 } 448 } 449 if (i >= bufsz) { 450 bufsz *= 2; 451 buf = (int *) realloc(buf, bufsz * sizeof(int)); 452 if (buf == NULL) 453 FATAL("out of space for character class [%.10s...] 2", p); 454 bp = buf + i; 455 } 456 *bp++ = c; 457 i++; 458 } 459 *bp = 0; 460 /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */ 461 /* xfree(op); BUG: what are we freeing here? */ 462 retp = (int *) calloc(bp-buf+1, sizeof(int)); 463 for (i = 0; i < bp-buf+1; i++) 464 retp[i] = buf[i]; 465 return retp; 466 } 467 468 void overflo(const char *s) 469 { 470 FATAL("regular expression too big: out of space in %.30s...", s); 471 } 472 473 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 474 { 475 int i; 476 int *p; 477 478 switch (type(v)) { 479 ELEAF 480 LEAF 481 f->re[info(v)].ltype = type(v); 482 f->re[info(v)].lval.np = right(v); 483 while (f->accept >= maxsetvec) { /* guessing here! */ 484 resizesetvec(__func__); 485 } 486 for (i = 0; i <= f->accept; i++) 487 setvec[i] = 0; 488 setcnt = 0; 489 follow(v); /* computes setvec and setcnt */ 490 p = intalloc(setcnt + 1, __func__); 491 f->re[info(v)].lfollow = p; 492 *p = setcnt; 493 for (i = f->accept; i >= 0; i--) 494 if (setvec[i] == 1) 495 *++p = i; 496 break; 497 UNARY 498 cfoll(f,left(v)); 499 break; 500 case CAT: 501 case OR: 502 cfoll(f,left(v)); 503 cfoll(f,right(v)); 504 break; 505 case ZERO: 506 break; 507 default: /* can't happen */ 508 FATAL("can't happen: unknown type %d in cfoll", type(v)); 509 } 510 } 511 512 int first(Node *p) /* collects initially active leaves of p into setvec */ 513 /* returns 0 if p matches empty string */ 514 { 515 int b, lp; 516 517 switch (type(p)) { 518 ELEAF 519 LEAF 520 lp = info(p); /* look for high-water mark of subscripts */ 521 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 522 resizesetvec(__func__); 523 } 524 if (type(p) == EMPTYRE) { 525 setvec[lp] = 0; 526 return(0); 527 } 528 if (setvec[lp] != 1) { 529 setvec[lp] = 1; 530 setcnt++; 531 } 532 if (type(p) == CCL && (*(int *) right(p)) == 0) 533 return(0); /* empty CCL */ 534 return(1); 535 case PLUS: 536 if (first(left(p)) == 0) 537 return(0); 538 return(1); 539 case STAR: 540 case QUEST: 541 first(left(p)); 542 return(0); 543 case CAT: 544 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 545 return(1); 546 case OR: 547 b = first(right(p)); 548 if (first(left(p)) == 0 || b == 0) return(0); 549 return(1); 550 case ZERO: 551 return 0; 552 } 553 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 554 return(-1); 555 } 556 557 void follow(Node *v) /* collects leaves that can follow v into setvec */ 558 { 559 Node *p; 560 561 if (type(v) == FINAL) 562 return; 563 p = parent(v); 564 switch (type(p)) { 565 case STAR: 566 case PLUS: 567 first(v); 568 follow(p); 569 return; 570 571 case OR: 572 case QUEST: 573 follow(p); 574 return; 575 576 case CAT: 577 if (v == left(p)) { /* v is left child of p */ 578 if (first(right(p)) == 0) { 579 follow(p); 580 return; 581 } 582 } else /* v is right child */ 583 follow(p); 584 return; 585 } 586 } 587 588 int member(int c, int *sarg) /* is c in s? */ 589 { 590 int *s = (int *) sarg; 591 592 while (*s) 593 if (c == *s++) 594 return(1); 595 return(0); 596 } 597 598 static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */ 599 { 600 int i; 601 for (i = 0; i < f->gototab_len; i++) { 602 if (f->gototab[state][i].ch == 0) 603 break; 604 if (f->gototab[state][i].ch == ch) 605 return f->gototab[state][i].state; 606 } 607 return 0; 608 } 609 610 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */ 611 { 612 int i; 613 for (i = 0; i < f->gototab_len; i++) { 614 if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) { 615 f->gototab[state][i].ch = ch; 616 f->gototab[state][i].state = val; 617 return val; 618 } 619 } 620 overflo(__func__); 621 return val; /* not used anywhere at the moment */ 622 } 623 624 int match(fa *f, const char *p0) /* shortest match ? */ 625 { 626 int s, ns; 627 int n; 628 int rune; 629 const uschar *p = (const uschar *) p0; 630 631 /* return pmatch(f, p0); does it matter whether longest or shortest? */ 632 633 s = f->initstat; 634 assert (s < f->state_count); 635 636 if (f->out[s]) 637 return(1); 638 do { 639 /* assert(*p < NCHARS); */ 640 n = u8_rune(&rune, p); 641 if ((ns = get_gototab(f, s, rune)) != 0) 642 s = ns; 643 else 644 s = cgoto(f, s, rune); 645 if (f->out[s]) 646 return(1); 647 if (*p == 0) 648 break; 649 p += n; 650 } while (1); /* was *p++ != 0 */ 651 return(0); 652 } 653 654 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 655 { 656 int s, ns; 657 int n; 658 int rune; 659 const uschar *p = (const uschar *) p0; 660 const uschar *q; 661 662 s = f->initstat; 663 assert(s < f->state_count); 664 665 patbeg = (const char *)p; 666 patlen = -1; 667 do { 668 q = p; 669 do { 670 if (f->out[s]) /* final state */ 671 patlen = q-p; 672 /* assert(*q < NCHARS); */ 673 n = u8_rune(&rune, q); 674 if ((ns = get_gototab(f, s, rune)) != 0) 675 s = ns; 676 else 677 s = cgoto(f, s, rune); 678 679 assert(s < f->state_count); 680 681 if (s == 1) { /* no transition */ 682 if (patlen >= 0) { 683 patbeg = (const char *) p; 684 return(1); 685 } 686 else 687 goto nextin; /* no match */ 688 } 689 if (*q == 0) 690 break; 691 q += n; 692 } while (1); 693 q++; /* was *q++ */ 694 if (f->out[s]) 695 patlen = q-p-1; /* don't count $ */ 696 if (patlen >= 0) { 697 patbeg = (const char *) p; 698 return(1); 699 } 700 nextin: 701 s = 2; 702 if (*p == 0) 703 break; 704 n = u8_rune(&rune, p); 705 p += n; 706 } while (1); /* was *p++ */ 707 return (0); 708 } 709 710 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 711 { 712 int s, ns; 713 int n; 714 int rune; 715 const uschar *p = (const uschar *) p0; 716 const uschar *q; 717 718 s = f->initstat; 719 assert(s < f->state_count); 720 721 patbeg = (const char *)p; 722 patlen = -1; 723 while (*p) { 724 q = p; 725 do { 726 if (f->out[s]) /* final state */ 727 patlen = q-p; 728 /* assert(*q < NCHARS); */ 729 n = u8_rune(&rune, q); 730 if ((ns = get_gototab(f, s, rune)) != 0) 731 s = ns; 732 else 733 s = cgoto(f, s, rune); 734 if (s == 1) { /* no transition */ 735 if (patlen > 0) { 736 patbeg = (const char *) p; 737 return(1); 738 } else 739 goto nnextin; /* no nonempty match */ 740 } 741 if (*q == 0) 742 break; 743 q += n; 744 } while (1); 745 q++; 746 if (f->out[s]) 747 patlen = q-p-1; /* don't count $ */ 748 if (patlen > 0 ) { 749 patbeg = (const char *) p; 750 return(1); 751 } 752 nnextin: 753 s = 2; 754 p++; 755 } 756 return (0); 757 } 758 759 760 #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long 761 762 // Read one rune at a time from the given FILE*. Return both 763 // the bytes and the actual rune. 764 765 struct runedata { 766 int rune; 767 size_t len; 768 char bytes[6]; 769 }; 770 771 struct runedata getrune(FILE *fp) 772 { 773 struct runedata result; 774 int c, next; 775 776 memset(&result, 0, sizeof(result)); 777 778 c = getc(fp); 779 if (c == EOF) 780 return result; // result.rune == 0 --> EOF 781 else if (c < 128 || awk_mb_cur_max == 1) { 782 result.bytes[0] = c; 783 result.len = 1; 784 result.rune = c; 785 786 return result; 787 } 788 789 // need to get bytes and fill things in 790 result.bytes[0] = c; 791 result.len = 1; 792 793 next = 1; 794 for (int i = 1; i < MAX_UTF_BYTES; i++) { 795 c = getc(fp); 796 if (c == EOF) 797 break; 798 result.bytes[next++] = c; 799 result.len++; 800 } 801 802 // put back any extra input bytes 803 int actual_len = u8_nextlen(result.bytes); 804 while (result.len > actual_len) { 805 ungetc(result.bytes[--result.len], fp); 806 } 807 808 result.bytes[result.len] = '\0'; 809 (void) u8_rune(& result.rune, (uschar *) result.bytes); 810 811 return result; 812 } 813 814 815 /* 816 * NAME 817 * fnematch 818 * 819 * DESCRIPTION 820 * A stream-fed version of nematch which transfers characters to a 821 * null-terminated buffer. All characters up to and including the last 822 * character of the matching text or EOF are placed in the buffer. If 823 * a match is found, patbeg and patlen are set appropriately. 824 * 825 * RETURN VALUES 826 * false No match found. 827 * true Match found. 828 */ 829 830 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) 831 { 832 char *buf = *pbuf; 833 int bufsize = *pbufsize; 834 int i, j, k, ns, s; 835 struct runedata r; 836 837 s = pfa->initstat; 838 patlen = 0; 839 840 /* 841 * All indices relative to buf. 842 * i <= j <= k <= bufsize 843 * 844 * i: origin of active substring (first byte of first character) 845 * j: current character (last byte of current character) 846 * k: destination of next getc() 847 */ 848 i = -1, k = 0; 849 do { 850 j = i++; 851 do { 852 r = getrune(f); 853 if ((++j + r.len) >= k) { 854 if (k >= bufsize) 855 if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) 856 FATAL("stream '%.30s...' too long", buf); 857 } 858 memcpy(buf + k, r.bytes, r.len); 859 j += r.len - 1; // incremented next time around the loop 860 k += r.len; 861 862 if ((ns = get_gototab(pfa, s, r.rune)) != 0) 863 s = ns; 864 else 865 s = cgoto(pfa, s, r.rune); 866 867 if (pfa->out[s]) { /* final state */ 868 patlen = j - i + 1; 869 if (r.rune == 0) /* don't count $ */ 870 patlen--; 871 } 872 } while (buf[j] && s != 1); 873 s = 2; 874 if (r.len > 1) 875 i += r.len - 1; // i incremented around the loop 876 } while (buf[i] && !patlen); 877 878 /* adjbuf() may have relocated a resized buffer. Inform the world. */ 879 *pbuf = buf; 880 *pbufsize = bufsize; 881 882 if (patlen) { 883 patbeg = (char *) buf + i; 884 /* 885 * Under no circumstances is the last character fed to 886 * the automaton part of the match. It is EOF's nullbyte, 887 * or it sent the automaton into a state with no further 888 * transitions available (s==1), or both. Room for a 889 * terminating nullbyte is guaranteed. 890 * 891 * ungetc any chars after the end of matching text 892 * (except for EOF's nullbyte, if present) and null 893 * terminate the buffer. 894 */ 895 do 896 for (int ii = r.len; ii > 0; ii--) 897 if (buf[--k] && ungetc(buf[k], f) == EOF) 898 FATAL("unable to ungetc '%c'", buf[k]); 899 while (k > i + patlen); 900 buf[k] = '\0'; 901 return true; 902 } 903 else 904 return false; 905 } 906 907 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 908 { /* uses relex() to scan regular expression */ 909 Node *np; 910 911 DPRINTF("reparse <%s>\n", p); 912 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 913 rtok = relex(); 914 /* GNU compatibility: an empty regexp matches anything */ 915 if (rtok == '\0') { 916 /* FATAL("empty regular expression"); previous */ 917 return(op2(EMPTYRE, NIL, NIL)); 918 } 919 np = regexp(); 920 if (rtok != '\0') 921 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 922 return(np); 923 } 924 925 Node *regexp(void) /* top-level parse of reg expr */ 926 { 927 return (alt(concat(primary()))); 928 } 929 930 Node *primary(void) 931 { 932 Node *np; 933 int savelastatom; 934 935 switch (rtok) { 936 case CHAR: 937 lastatom = starttok; 938 np = op2(CHAR, NIL, itonp(rlxval)); 939 rtok = relex(); 940 return (unary(np)); 941 case ALL: 942 rtok = relex(); 943 return (unary(op2(ALL, NIL, NIL))); 944 case EMPTYRE: 945 rtok = relex(); 946 return (unary(op2(EMPTYRE, NIL, NIL))); 947 case DOT: 948 lastatom = starttok; 949 rtok = relex(); 950 return (unary(op2(DOT, NIL, NIL))); 951 case CCL: 952 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 953 lastatom = starttok; 954 rtok = relex(); 955 return (unary(np)); 956 case NCCL: 957 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 958 lastatom = starttok; 959 rtok = relex(); 960 return (unary(np)); 961 case '^': 962 rtok = relex(); 963 return (unary(op2(CHAR, NIL, itonp(HAT)))); 964 case '$': 965 rtok = relex(); 966 return (unary(op2(CHAR, NIL, NIL))); 967 case '(': 968 lastatom = starttok; 969 savelastatom = starttok - basestr; /* Retain over recursion */ 970 rtok = relex(); 971 if (rtok == ')') { /* special pleading for () */ 972 rtok = relex(); 973 return unary(op2(CCL, NIL, (Node *) cclenter(""))); 974 } 975 np = regexp(); 976 if (rtok == ')') { 977 lastatom = basestr + savelastatom; /* Restore */ 978 rtok = relex(); 979 return (unary(np)); 980 } 981 else 982 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 983 default: 984 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 985 } 986 return 0; /*NOTREACHED*/ 987 } 988 989 Node *concat(Node *np) 990 { 991 switch (rtok) { 992 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 993 return (concat(op2(CAT, np, primary()))); 994 case EMPTYRE: 995 rtok = relex(); 996 return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")), 997 primary()))); 998 } 999 return (np); 1000 } 1001 1002 Node *alt(Node *np) 1003 { 1004 if (rtok == OR) { 1005 rtok = relex(); 1006 return (alt(op2(OR, np, concat(primary())))); 1007 } 1008 return (np); 1009 } 1010 1011 Node *unary(Node *np) 1012 { 1013 switch (rtok) { 1014 case STAR: 1015 rtok = relex(); 1016 return (unary(op2(STAR, np, NIL))); 1017 case PLUS: 1018 rtok = relex(); 1019 return (unary(op2(PLUS, np, NIL))); 1020 case QUEST: 1021 rtok = relex(); 1022 return (unary(op2(QUEST, np, NIL))); 1023 case ZERO: 1024 rtok = relex(); 1025 return (unary(op2(ZERO, np, NIL))); 1026 default: 1027 return (np); 1028 } 1029 } 1030 1031 /* 1032 * Character class definitions conformant to the POSIX locale as 1033 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 1034 * and operating character sets are both ASCII (ISO646) or supersets 1035 * thereof. 1036 * 1037 * Note that to avoid overflowing the temporary buffer used in 1038 * relex(), the expanded character class (prior to range expansion) 1039 * must be less than twice the size of their full name. 1040 */ 1041 1042 /* Because isblank doesn't show up in any of the header files on any 1043 * system i use, it's defined here. if some other locale has a richer 1044 * definition of "blank", define HAS_ISBLANK and provide your own 1045 * version. 1046 * the parentheses here are an attempt to find a path through the maze 1047 * of macro definition and/or function and/or version provided. thanks 1048 * to nelson beebe for the suggestion; let's see if it works everywhere. 1049 */ 1050 1051 /* #define HAS_ISBLANK */ 1052 #ifndef HAS_ISBLANK 1053 1054 int (xisblank)(int c) 1055 { 1056 return c==' ' || c=='\t'; 1057 } 1058 1059 #endif 1060 1061 static const struct charclass { 1062 const char *cc_name; 1063 int cc_namelen; 1064 int (*cc_func)(int); 1065 } charclasses[] = { 1066 { "alnum", 5, isalnum }, 1067 { "alpha", 5, isalpha }, 1068 #ifndef HAS_ISBLANK 1069 { "blank", 5, xisblank }, 1070 #else 1071 { "blank", 5, isblank }, 1072 #endif 1073 { "cntrl", 5, iscntrl }, 1074 { "digit", 5, isdigit }, 1075 { "graph", 5, isgraph }, 1076 { "lower", 5, islower }, 1077 { "print", 5, isprint }, 1078 { "punct", 5, ispunct }, 1079 { "space", 5, isspace }, 1080 { "upper", 5, isupper }, 1081 { "xdigit", 6, isxdigit }, 1082 { NULL, 0, NULL }, 1083 }; 1084 1085 #define REPEAT_SIMPLE 0 1086 #define REPEAT_PLUS_APPENDED 1 1087 #define REPEAT_WITH_Q 2 1088 #define REPEAT_ZERO 3 1089 1090 static int 1091 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1092 int atomlen, int firstnum, int secondnum, int special_case) 1093 { 1094 int i, j; 1095 uschar *buf = 0; 1096 int ret = 1; 1097 int init_q = (firstnum == 0); /* first added char will be ? */ 1098 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 1099 int prefix_length = reptok - basestr; /* prefix includes first rep */ 1100 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */ 1101 int size = prefix_length + suffix_length; 1102 1103 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 1104 size += atomlen*(firstnum-1); 1105 } 1106 1107 /* Adjust size of buffer for special cases */ 1108 if (special_case == REPEAT_PLUS_APPENDED) { 1109 size++; /* for the final + */ 1110 } else if (special_case == REPEAT_WITH_Q) { 1111 size += init_q + (atomlen+1)* (n_q_reps-init_q); 1112 } else if (special_case == REPEAT_ZERO) { 1113 size += 2; /* just a null ERE: () */ 1114 } 1115 if ((buf = (uschar *) malloc(size + 1)) == NULL) 1116 FATAL("out of space in reg expr %.10s..", lastre); 1117 memcpy(buf, basestr, prefix_length); /* copy prefix */ 1118 j = prefix_length; 1119 if (special_case == REPEAT_ZERO) { 1120 j -= atomlen; 1121 buf[j++] = '('; 1122 buf[j++] = ')'; 1123 } 1124 for (i = 1; i < firstnum; i++) { /* copy x reps */ 1125 memcpy(&buf[j], atom, atomlen); 1126 j += atomlen; 1127 } 1128 if (special_case == REPEAT_PLUS_APPENDED) { 1129 buf[j++] = '+'; 1130 } else if (special_case == REPEAT_WITH_Q) { 1131 if (init_q) 1132 buf[j++] = '?'; 1133 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */ 1134 memcpy(&buf[j], atom, atomlen); 1135 j += atomlen; 1136 buf[j++] = '?'; 1137 } 1138 } 1139 memcpy(&buf[j], reptok+reptoklen, suffix_length); 1140 j += suffix_length; 1141 buf[j] = '\0'; 1142 /* free old basestr */ 1143 if (firstbasestr != basestr) { 1144 if (basestr) 1145 xfree(basestr); 1146 } 1147 basestr = buf; 1148 prestr = buf + prefix_length; 1149 if (special_case == REPEAT_ZERO) { 1150 prestr -= atomlen; 1151 ret++; 1152 } 1153 return ret; 1154 } 1155 1156 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1157 int atomlen, int firstnum, int secondnum) 1158 { 1159 /* 1160 In general, the repetition specifier or "bound" is replaced here 1161 by an equivalent ERE string, repeating the immediately previous atom 1162 and appending ? and + as needed. Note that the first copy of the 1163 atom is left in place, except in the special_case of a zero-repeat 1164 (i.e., {0}). 1165 */ 1166 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 1167 if (firstnum < 2) { 1168 /* 0 or 1: should be handled before you get here */ 1169 FATAL("internal error"); 1170 } else { 1171 return replace_repeat(reptok, reptoklen, atom, atomlen, 1172 firstnum, secondnum, REPEAT_PLUS_APPENDED); 1173 } 1174 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 1175 if (firstnum == 0) { /* {0} or {0,0} */ 1176 /* This case is unusual because the resulting 1177 replacement string might actually be SMALLER than 1178 the original ERE */ 1179 return replace_repeat(reptok, reptoklen, atom, atomlen, 1180 firstnum, secondnum, REPEAT_ZERO); 1181 } else { /* (firstnum >= 1) */ 1182 return replace_repeat(reptok, reptoklen, atom, atomlen, 1183 firstnum, secondnum, REPEAT_SIMPLE); 1184 } 1185 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 1186 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 1187 return replace_repeat(reptok, reptoklen, atom, atomlen, 1188 firstnum, secondnum, REPEAT_WITH_Q); 1189 } else { /* Error - shouldn't be here (n>m) */ 1190 FATAL("internal error"); 1191 } 1192 return 0; 1193 } 1194 1195 extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */ 1196 1197 int relex(void) /* lexical analyzer for reparse */ 1198 { 1199 int c, n; 1200 int cflag; 1201 static uschar *buf = NULL; 1202 static int bufsz = 100; 1203 uschar *bp; 1204 const struct charclass *cc; 1205 int i; 1206 int num, m; 1207 bool commafound, digitfound; 1208 const uschar *startreptok; 1209 static int parens = 0; 1210 1211 rescan: 1212 starttok = prestr; 1213 1214 if ((n = u8_rune(&rlxval, prestr)) > 1) { 1215 prestr += n; 1216 starttok = prestr; 1217 return CHAR; 1218 } 1219 1220 switch (c = *prestr++) { 1221 case '|': return OR; 1222 case '*': return STAR; 1223 case '+': return PLUS; 1224 case '?': return QUEST; 1225 case '.': return DOT; 1226 case '\0': prestr--; return '\0'; 1227 case '^': 1228 case '$': 1229 return c; 1230 case '(': 1231 parens++; 1232 return c; 1233 case ')': 1234 if (parens) { 1235 parens--; 1236 return c; 1237 } 1238 /* unmatched close parenthesis; per POSIX, treat as literal */ 1239 rlxval = c; 1240 return CHAR; 1241 case '\\': 1242 rlxval = quoted(&prestr); 1243 return CHAR; 1244 default: 1245 rlxval = c; 1246 return CHAR; 1247 case '[': 1248 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 1249 FATAL("out of space in reg expr %.10s..", lastre); 1250 bp = buf; 1251 if (*prestr == '^') { 1252 cflag = 1; 1253 prestr++; 1254 } 1255 else 1256 cflag = 0; 1257 n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */ 1258 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 1259 FATAL("out of space for reg expr %.10s...", lastre); 1260 for (; ; ) { 1261 if ((n = u8_rune(&rlxval, prestr)) > 1) { 1262 for (i = 0; i < n; i++) 1263 *bp++ = *prestr++; 1264 continue; 1265 } 1266 if ((c = *prestr++) == '\\') { 1267 *bp++ = '\\'; 1268 if ((c = *prestr++) == '\0') 1269 FATAL("nonterminated character class %.20s...", lastre); 1270 *bp++ = c; 1271 /* } else if (c == '\n') { */ 1272 /* FATAL("newline in character class %.20s...", lastre); */ 1273 } else if (c == '[' && *prestr == ':') { 1274 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1275 for (cc = charclasses; cc->cc_name; cc++) 1276 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1277 break; 1278 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1279 prestr[2 + cc->cc_namelen] == ']') { 1280 prestr += cc->cc_namelen + 3; 1281 /* 1282 * BUG: We begin at 1, instead of 0, since we 1283 * would otherwise prematurely terminate the 1284 * string for classes like [[:cntrl:]]. This 1285 * means that we can't match the NUL character, 1286 * not without first adapting the entire 1287 * program to track each string's length. 1288 */ 1289 for (i = 1; i <= UCHAR_MAX; i++) { 1290 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2")) 1291 FATAL("out of space for reg expr %.10s...", lastre); 1292 if (cc->cc_func(i)) { 1293 /* escape backslash */ 1294 if (i == '\\') { 1295 *bp++ = '\\'; 1296 n++; 1297 } 1298 1299 *bp++ = i; 1300 n++; 1301 } 1302 } 1303 } else 1304 *bp++ = c; 1305 } else if (c == '[' && *prestr == '.') { 1306 char collate_char; 1307 prestr++; 1308 collate_char = *prestr++; 1309 if (*prestr == '.' && prestr[1] == ']') { 1310 prestr += 2; 1311 /* Found it: map via locale TBD: for 1312 now, simply return this char. This 1313 is sufficient to pass conformance 1314 test awk.ex 156 1315 */ 1316 if (*prestr == ']') { 1317 prestr++; 1318 rlxval = collate_char; 1319 return CHAR; 1320 } 1321 } 1322 } else if (c == '[' && *prestr == '=') { 1323 char equiv_char; 1324 prestr++; 1325 equiv_char = *prestr++; 1326 if (*prestr == '=' && prestr[1] == ']') { 1327 prestr += 2; 1328 /* Found it: map via locale TBD: for now 1329 simply return this char. This is 1330 sufficient to pass conformance test 1331 awk.ex 156 1332 */ 1333 if (*prestr == ']') { 1334 prestr++; 1335 rlxval = equiv_char; 1336 return CHAR; 1337 } 1338 } 1339 } else if (c == '\0') { 1340 FATAL("nonterminated character class %.20s", lastre); 1341 } else if (bp == buf) { /* 1st char is special */ 1342 *bp++ = c; 1343 } else if (c == ']') { 1344 *bp++ = 0; 1345 rlxstr = (uschar *) tostring((char *) buf); 1346 if (cflag == 0) 1347 return CCL; 1348 else 1349 return NCCL; 1350 } else 1351 *bp++ = c; 1352 } 1353 break; 1354 case '{': 1355 if (isdigit(*(prestr))) { 1356 num = 0; /* Process as a repetition */ 1357 n = -1; m = -1; 1358 commafound = false; 1359 digitfound = false; 1360 startreptok = prestr-1; 1361 /* Remember start of previous atom here ? */ 1362 } else { /* just a { char, not a repetition */ 1363 rlxval = c; 1364 return CHAR; 1365 } 1366 for (; ; ) { 1367 if ((c = *prestr++) == '}') { 1368 if (commafound) { 1369 if (digitfound) { /* {n,m} */ 1370 m = num; 1371 if (m < n) 1372 FATAL("illegal repetition expression: class %.20s", 1373 lastre); 1374 if (n == 0 && m == 1) { 1375 return QUEST; 1376 } 1377 } else { /* {n,} */ 1378 if (n == 0) 1379 return STAR; 1380 else if (n == 1) 1381 return PLUS; 1382 } 1383 } else { 1384 if (digitfound) { /* {n} same as {n,n} */ 1385 n = num; 1386 m = num; 1387 } else { /* {} */ 1388 FATAL("illegal repetition expression: class %.20s", 1389 lastre); 1390 } 1391 } 1392 if (repeat(starttok, prestr-starttok, lastatom, 1393 startreptok - lastatom, n, m) > 0) { 1394 if (n == 0 && m == 0) { 1395 return ZERO; 1396 } 1397 /* must rescan input for next token */ 1398 goto rescan; 1399 } 1400 /* Failed to replace: eat up {...} characters 1401 and treat like just PLUS */ 1402 return PLUS; 1403 } else if (c == '\0') { 1404 FATAL("nonterminated character class %.20s", 1405 lastre); 1406 } else if (isdigit(c)) { 1407 num = 10 * num + c - '0'; 1408 digitfound = true; 1409 } else if (c == ',') { 1410 if (commafound) 1411 FATAL("illegal repetition expression: class %.20s", 1412 lastre); 1413 /* looking for {n,} or {n,m} */ 1414 commafound = true; 1415 n = num; 1416 digitfound = false; /* reset */ 1417 num = 0; 1418 } else { 1419 FATAL("illegal repetition expression: class %.20s", 1420 lastre); 1421 } 1422 } 1423 break; 1424 } 1425 } 1426 1427 int cgoto(fa *f, int s, int c) 1428 { 1429 int *p, *q; 1430 int i, j, k; 1431 1432 /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */ 1433 while (f->accept >= maxsetvec) { /* guessing here! */ 1434 resizesetvec(__func__); 1435 } 1436 for (i = 0; i <= f->accept; i++) 1437 setvec[i] = 0; 1438 setcnt = 0; 1439 resize_state(f, s); 1440 /* compute positions of gototab[s,c] into setvec */ 1441 p = f->posns[s]; 1442 for (i = 1; i <= *p; i++) { 1443 if ((k = f->re[p[i]].ltype) != FINAL) { 1444 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1445 || (k == DOT && c != 0 && c != HAT) 1446 || (k == ALL && c != 0) 1447 || (k == EMPTYRE && c != 0) 1448 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp)) 1449 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) { 1450 q = f->re[p[i]].lfollow; 1451 for (j = 1; j <= *q; j++) { 1452 if (q[j] >= maxsetvec) { 1453 resizesetvec(__func__); 1454 } 1455 if (setvec[q[j]] == 0) { 1456 setcnt++; 1457 setvec[q[j]] = 1; 1458 } 1459 } 1460 } 1461 } 1462 } 1463 /* determine if setvec is a previous state */ 1464 tmpset[0] = setcnt; 1465 j = 1; 1466 for (i = f->accept; i >= 0; i--) 1467 if (setvec[i]) { 1468 tmpset[j++] = i; 1469 } 1470 resize_state(f, f->curstat > s ? f->curstat : s); 1471 /* tmpset == previous state? */ 1472 for (i = 1; i <= f->curstat; i++) { 1473 p = f->posns[i]; 1474 if ((k = tmpset[0]) != p[0]) 1475 goto different; 1476 for (j = 1; j <= k; j++) 1477 if (tmpset[j] != p[j]) 1478 goto different; 1479 /* setvec is state i */ 1480 if (c != HAT) 1481 set_gototab(f, s, c, i); 1482 return i; 1483 different:; 1484 } 1485 1486 /* add tmpset to current set of states */ 1487 ++(f->curstat); 1488 resize_state(f, f->curstat); 1489 for (i = 0; i < NCHARS; i++) 1490 set_gototab(f, f->curstat, 0, 0); 1491 xfree(f->posns[f->curstat]); 1492 p = intalloc(setcnt + 1, __func__); 1493 1494 f->posns[f->curstat] = p; 1495 if (c != HAT) 1496 set_gototab(f, s, c, f->curstat); 1497 for (i = 0; i <= setcnt; i++) 1498 p[i] = tmpset[i]; 1499 if (setvec[f->accept]) 1500 f->out[f->curstat] = 1; 1501 else 1502 f->out[f->curstat] = 0; 1503 return f->curstat; 1504 } 1505 1506 1507 void freefa(fa *f) /* free a finite automaton */ 1508 { 1509 int i; 1510 1511 if (f == NULL) 1512 return; 1513 for (i = 0; i < f->state_count; i++) 1514 xfree(f->gototab[i]) 1515 for (i = 0; i <= f->curstat; i++) 1516 xfree(f->posns[i]); 1517 for (i = 0; i <= f->accept; i++) { 1518 xfree(f->re[i].lfollow); 1519 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1520 xfree(f->re[i].lval.np); 1521 } 1522 xfree(f->restr); 1523 xfree(f->out); 1524 xfree(f->posns); 1525 xfree(f->gototab); 1526 xfree(f); 1527 } 1528