1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 /* lasciate ogne speranza, voi ch'intrate. */ 26 27 #define DEBUG 28 29 #include <ctype.h> 30 #include <limits.h> 31 #include <stdio.h> 32 #include <string.h> 33 #include <stdlib.h> 34 #include "awk.h" 35 #include "awkgram.tab.h" 36 37 #define MAXLIN 22 38 39 #define type(v) (v)->nobj /* badly overloaded here */ 40 #define info(v) (v)->ntype /* badly overloaded here */ 41 #define left(v) (v)->narg[0] 42 #define right(v) (v)->narg[1] 43 #define parent(v) (v)->nnext 44 45 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 46 #define ELEAF case EMPTYRE: /* empty string in regexp */ 47 #define UNARY case STAR: case PLUS: case QUEST: 48 49 /* encoding in tree Nodes: 50 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 51 left is index, right contains value or pointer to value 52 unary (STAR, PLUS, QUEST): left is child, right is null 53 binary (CAT, OR): left and right are children 54 parent contains pointer to parent 55 */ 56 57 58 int *setvec; 59 int *tmpset; 60 int maxsetvec = 0; 61 62 int rtok; /* next token in current re */ 63 int rlxval; 64 static const uschar *rlxstr; 65 static const uschar *prestr; /* current position in current re */ 66 static const uschar *lastre; /* origin of last re */ 67 static const uschar *lastatom; /* origin of last Atom */ 68 static const uschar *starttok; 69 static const uschar *basestr; /* starts with original, replaced during 70 repetition processing */ 71 static const uschar *firstbasestr; 72 73 static int setcnt; 74 static int poscnt; 75 76 const char *patbeg; 77 int patlen; 78 79 #define NFA 128 /* cache this many dynamic fa's */ 80 fa *fatab[NFA]; 81 int nfatab = 0; /* entries in fatab */ 82 83 extern int u8_nextlen(const char *s); 84 85 86 /* utf-8 mechanism: 87 88 For most of Awk, utf-8 strings just "work", since they look like 89 null-terminated sequences of 8-bit bytes. 90 91 Functions like length(), index(), and substr() have to operate 92 in units of utf-8 characters. The u8_* functions in run.c 93 handle this. 94 95 Regular expressions are more complicated, since the basic 96 mechanism of the goto table used 8-bit byte indices into the 97 gototab entries to compute the next state. Unicode is a lot 98 bigger, so the gototab entries are now structs with a character 99 and a next state. These are sorted by code point and binary 100 searched. 101 102 Throughout the RE mechanism in b.c, utf-8 characters are 103 converted to their utf-32 value. This mostly shows up in 104 cclenter, which expands character class ranges like a-z and now 105 alpha-omega. The size of a gototab array is still about 256. 106 This should be dynamic, but for now things work ok for a single 107 code page of Unicode, which is the most likely case. 108 109 The code changes are localized in run.c and b.c. I have added a 110 handful of functions to somewhat better hide the implementation, 111 but a lot more could be done. 112 113 */ 114 115 static int entry_cmp(const void *l, const void *r); 116 static int get_gototab(fa*, int, int); 117 static int set_gototab(fa*, int, int, int); 118 static void clear_gototab(fa*, int); 119 extern int u8_rune(int *, const char *); 120 121 static int * 122 intalloc(size_t n, const char *f) 123 { 124 int *p = (int *) calloc(n, sizeof(int)); 125 if (p == NULL) 126 overflo(f); 127 return p; 128 } 129 130 static void 131 resizesetvec(const char *f) 132 { 133 if (maxsetvec == 0) 134 maxsetvec = MAXLIN; 135 else 136 maxsetvec *= 4; 137 setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec)); 138 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset)); 139 if (setvec == NULL || tmpset == NULL) 140 overflo(f); 141 } 142 143 static void 144 resize_state(fa *f, int state) 145 { 146 gtt *p; 147 uschar *p2; 148 int **p3; 149 int i, new_count; 150 151 if (++state < f->state_count) 152 return; 153 154 new_count = state + 10; /* needs to be tuned */ 155 156 p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt)); 157 if (p == NULL) 158 goto out; 159 f->gototab = p; 160 161 p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0])); 162 if (p2 == NULL) 163 goto out; 164 f->out = p2; 165 166 p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0])); 167 if (p3 == NULL) 168 goto out; 169 f->posns = p3; 170 171 for (i = f->state_count; i < new_count; ++i) { 172 f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte)); 173 if (f->gototab[i].entries == NULL) 174 goto out; 175 f->gototab[i].allocated = NCHARS; 176 f->gototab[i].inuse = 0; 177 f->out[i] = 0; 178 f->posns[i] = NULL; 179 } 180 f->state_count = new_count; 181 return; 182 out: 183 overflo(__func__); 184 } 185 186 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */ 187 { 188 int i, use, nuse; 189 fa *pfa; 190 static int now = 1; 191 192 if (setvec == NULL) { /* first time through any RE */ 193 resizesetvec(__func__); 194 } 195 196 if (compile_time != RUNNING) /* a constant for sure */ 197 return mkdfa(s, anchor); 198 for (i = 0; i < nfatab; i++) /* is it there already? */ 199 if (fatab[i]->anchor == anchor 200 && strcmp((const char *) fatab[i]->restr, s) == 0) { 201 fatab[i]->use = now++; 202 return fatab[i]; 203 } 204 pfa = mkdfa(s, anchor); 205 if (nfatab < NFA) { /* room for another */ 206 fatab[nfatab] = pfa; 207 fatab[nfatab]->use = now++; 208 nfatab++; 209 return pfa; 210 } 211 use = fatab[0]->use; /* replace least-recently used */ 212 nuse = 0; 213 for (i = 1; i < nfatab; i++) 214 if (fatab[i]->use < use) { 215 use = fatab[i]->use; 216 nuse = i; 217 } 218 freefa(fatab[nuse]); 219 fatab[nuse] = pfa; 220 pfa->use = now++; 221 return pfa; 222 } 223 224 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */ 225 /* anchor = true for anchored matches, else false */ 226 { 227 Node *p, *p1; 228 fa *f; 229 230 firstbasestr = (const uschar *) s; 231 basestr = firstbasestr; 232 p = reparse(s); 233 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 234 /* put ALL STAR in front of reg. exp. */ 235 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 236 /* put FINAL after reg. exp. */ 237 238 poscnt = 0; 239 penter(p1); /* enter parent pointers and leaf indices */ 240 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL) 241 overflo(__func__); 242 f->accept = poscnt-1; /* penter has computed number of positions in re */ 243 cfoll(f, p1); /* set up follow sets */ 244 freetr(p1); 245 resize_state(f, 1); 246 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__); 247 f->posns[1] = intalloc(1, __func__); 248 *f->posns[1] = 0; 249 f->initstat = makeinit(f, anchor); 250 f->anchor = anchor; 251 f->restr = (uschar *) tostring(s); 252 if (firstbasestr != basestr) { 253 if (basestr) 254 xfree(basestr); 255 } 256 return f; 257 } 258 259 int makeinit(fa *f, bool anchor) 260 { 261 int i, k; 262 263 f->curstat = 2; 264 f->out[2] = 0; 265 k = *(f->re[0].lfollow); 266 xfree(f->posns[2]); 267 f->posns[2] = intalloc(k + 1, __func__); 268 for (i = 0; i <= k; i++) { 269 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 270 } 271 if ((f->posns[2])[1] == f->accept) 272 f->out[2] = 1; 273 clear_gototab(f, 2); 274 f->curstat = cgoto(f, 2, HAT); 275 if (anchor) { 276 *f->posns[2] = k-1; /* leave out position 0 */ 277 for (i = 0; i < k; i++) { 278 (f->posns[0])[i] = (f->posns[2])[i]; 279 } 280 281 f->out[0] = f->out[2]; 282 if (f->curstat != 2) 283 --(*f->posns[f->curstat]); 284 } 285 return f->curstat; 286 } 287 288 void penter(Node *p) /* set up parent pointers and leaf indices */ 289 { 290 switch (type(p)) { 291 ELEAF 292 LEAF 293 info(p) = poscnt; 294 poscnt++; 295 break; 296 UNARY 297 penter(left(p)); 298 parent(left(p)) = p; 299 break; 300 case CAT: 301 case OR: 302 penter(left(p)); 303 penter(right(p)); 304 parent(left(p)) = p; 305 parent(right(p)) = p; 306 break; 307 case ZERO: 308 break; 309 default: /* can't happen */ 310 FATAL("can't happen: unknown type %d in penter", type(p)); 311 break; 312 } 313 } 314 315 void freetr(Node *p) /* free parse tree */ 316 { 317 switch (type(p)) { 318 ELEAF 319 LEAF 320 xfree(p); 321 break; 322 UNARY 323 case ZERO: 324 freetr(left(p)); 325 xfree(p); 326 break; 327 case CAT: 328 case OR: 329 freetr(left(p)); 330 freetr(right(p)); 331 xfree(p); 332 break; 333 default: /* can't happen */ 334 FATAL("can't happen: unknown type %d in freetr", type(p)); 335 break; 336 } 337 } 338 339 /* in the parsing of regular expressions, metacharacters like . have */ 340 /* to be seen literally; \056 is not a metacharacter. */ 341 342 int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */ 343 { /* only pick up one 8-bit byte (2 chars) */ 344 const uschar *p; 345 int n = 0; 346 int i; 347 348 for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) { 349 if (isdigit((int) *p)) 350 n = 16 * n + *p - '0'; 351 else if (*p >= 'a' && *p <= 'f') 352 n = 16 * n + *p - 'a' + 10; 353 else if (*p >= 'A' && *p <= 'F') 354 n = 16 * n + *p - 'A' + 10; 355 } 356 *pp = p; 357 return n; 358 } 359 360 361 362 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 363 364 int quoted(const uschar **pp) /* pick up next thing after a \\ */ 365 /* and increment *pp */ 366 { 367 const uschar *p = *pp; 368 int c; 369 370 /* BUG: should advance by utf-8 char even if makes no sense */ 371 372 switch ((c = *p++)) { 373 case 't': 374 c = '\t'; 375 break; 376 case 'n': 377 c = '\n'; 378 break; 379 case 'f': 380 c = '\f'; 381 break; 382 case 'r': 383 c = '\r'; 384 break; 385 case 'b': 386 c = '\b'; 387 break; 388 case 'v': 389 c = '\v'; 390 break; 391 case 'a': 392 c = '\a'; 393 break; 394 case '\\': 395 c = '\\'; 396 break; 397 case 'x': /* 2 hex digits follow */ 398 c = hexstr(&p, 2); /* this adds a null if number is invalid */ 399 break; 400 case 'u': /* unicode char number up to 8 hex digits */ 401 c = hexstr(&p, 8); 402 break; 403 default: 404 if (isoctdigit(c)) { /* \d \dd \ddd */ 405 int n = c - '0'; 406 if (isoctdigit(*p)) { 407 n = 8 * n + *p++ - '0'; 408 if (isoctdigit(*p)) 409 n = 8 * n + *p++ - '0'; 410 } 411 c = n; 412 } 413 } 414 415 *pp = p; 416 return c; 417 } 418 419 int *cclenter(const char *argp) /* add a character class */ 420 { 421 int i, c, c2; 422 int n; 423 const uschar *p = (const uschar *) argp; 424 int *bp, *retp; 425 static int *buf = NULL; 426 static int bufsz = 100; 427 428 if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL) 429 FATAL("out of space for character class [%.10s...] 1", p); 430 bp = buf; 431 for (i = 0; *p != 0; ) { 432 n = u8_rune(&c, (const char *) p); 433 p += n; 434 if (c == '\\') { 435 c = quoted(&p); 436 } else if (c == '-' && i > 0 && bp[-1] != 0) { 437 if (*p != 0) { 438 c = bp[-1]; 439 /* c2 = *p++; */ 440 n = u8_rune(&c2, (const char *) p); 441 p += n; 442 if (c2 == '\\') 443 c2 = quoted(&p); /* BUG: sets p, has to be u8 size */ 444 if (c > c2) { /* empty; ignore */ 445 bp--; 446 i--; 447 continue; 448 } 449 while (c < c2) { 450 if (i >= bufsz) { 451 bufsz *= 2; 452 buf = (int *) realloc(buf, bufsz * sizeof(int)); 453 if (buf == NULL) 454 FATAL("out of space for character class [%.10s...] 2", p); 455 bp = buf + i; 456 } 457 *bp++ = ++c; 458 i++; 459 } 460 continue; 461 } 462 } 463 if (i >= bufsz) { 464 bufsz *= 2; 465 buf = (int *) realloc(buf, bufsz * sizeof(int)); 466 if (buf == NULL) 467 FATAL("out of space for character class [%.10s...] 2", p); 468 bp = buf + i; 469 } 470 *bp++ = c; 471 i++; 472 } 473 *bp = 0; 474 /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */ 475 /* xfree(op); BUG: what are we freeing here? */ 476 retp = (int *) calloc(bp-buf+1, sizeof(int)); 477 for (i = 0; i < bp-buf+1; i++) 478 retp[i] = buf[i]; 479 return retp; 480 } 481 482 void overflo(const char *s) 483 { 484 FATAL("regular expression too big: out of space in %.30s...", s); 485 } 486 487 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 488 { 489 int i; 490 int *p; 491 492 switch (type(v)) { 493 ELEAF 494 LEAF 495 f->re[info(v)].ltype = type(v); 496 f->re[info(v)].lval.np = right(v); 497 while (f->accept >= maxsetvec) { /* guessing here! */ 498 resizesetvec(__func__); 499 } 500 for (i = 0; i <= f->accept; i++) 501 setvec[i] = 0; 502 setcnt = 0; 503 follow(v); /* computes setvec and setcnt */ 504 p = intalloc(setcnt + 1, __func__); 505 f->re[info(v)].lfollow = p; 506 *p = setcnt; 507 for (i = f->accept; i >= 0; i--) 508 if (setvec[i] == 1) 509 *++p = i; 510 break; 511 UNARY 512 cfoll(f,left(v)); 513 break; 514 case CAT: 515 case OR: 516 cfoll(f,left(v)); 517 cfoll(f,right(v)); 518 break; 519 case ZERO: 520 break; 521 default: /* can't happen */ 522 FATAL("can't happen: unknown type %d in cfoll", type(v)); 523 } 524 } 525 526 int first(Node *p) /* collects initially active leaves of p into setvec */ 527 /* returns 0 if p matches empty string */ 528 { 529 int b, lp; 530 531 switch (type(p)) { 532 ELEAF 533 LEAF 534 lp = info(p); /* look for high-water mark of subscripts */ 535 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 536 resizesetvec(__func__); 537 } 538 if (type(p) == EMPTYRE) { 539 setvec[lp] = 0; 540 return(0); 541 } 542 if (setvec[lp] != 1) { 543 setvec[lp] = 1; 544 setcnt++; 545 } 546 if (type(p) == CCL && (*(int *) right(p)) == 0) 547 return(0); /* empty CCL */ 548 return(1); 549 case PLUS: 550 if (first(left(p)) == 0) 551 return(0); 552 return(1); 553 case STAR: 554 case QUEST: 555 first(left(p)); 556 return(0); 557 case CAT: 558 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 559 return(1); 560 case OR: 561 b = first(right(p)); 562 if (first(left(p)) == 0 || b == 0) return(0); 563 return(1); 564 case ZERO: 565 return 0; 566 } 567 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 568 return(-1); 569 } 570 571 void follow(Node *v) /* collects leaves that can follow v into setvec */ 572 { 573 Node *p; 574 575 if (type(v) == FINAL) 576 return; 577 p = parent(v); 578 switch (type(p)) { 579 case STAR: 580 case PLUS: 581 first(v); 582 follow(p); 583 return; 584 585 case OR: 586 case QUEST: 587 follow(p); 588 return; 589 590 case CAT: 591 if (v == left(p)) { /* v is left child of p */ 592 if (first(right(p)) == 0) { 593 follow(p); 594 return; 595 } 596 } else /* v is right child */ 597 follow(p); 598 return; 599 } 600 } 601 602 int member(int c, int *sarg) /* is c in s? */ 603 { 604 int *s = (int *) sarg; 605 606 while (*s) 607 if (c == *s++) 608 return(1); 609 return(0); 610 } 611 612 static void resize_gototab(fa *f, int state) 613 { 614 size_t new_size = f->gototab[state].allocated * 2; 615 gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte)); 616 if (p == NULL) 617 overflo(__func__); 618 619 // need to initialized the new memory to zero 620 size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size 621 memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out 622 623 f->gototab[state].allocated = new_size; // update gototab info 624 f->gototab[state].entries = p; 625 } 626 627 static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */ 628 { 629 gtte key; 630 gtte *item; 631 632 key.ch = ch; 633 key.state = 0; /* irrelevant */ 634 item = (gtte *) bsearch(& key, f->gototab[state].entries, 635 f->gototab[state].inuse, sizeof(gtte), 636 entry_cmp); 637 638 if (item == NULL) 639 return 0; 640 else 641 return item->state; 642 } 643 644 static int entry_cmp(const void *l, const void *r) 645 { 646 const gtte *left, *right; 647 648 left = (const gtte *) l; 649 right = (const gtte *) r; 650 651 return left->ch - right->ch; 652 } 653 654 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */ 655 { 656 if (f->gototab[state].inuse == 0) { 657 f->gototab[state].entries[0].ch = ch; 658 f->gototab[state].entries[0].state = val; 659 f->gototab[state].inuse++; 660 return val; 661 } else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) { 662 // not seen yet, insert and return 663 gtt *tab = & f->gototab[state]; 664 if (tab->inuse + 1 >= tab->allocated) 665 resize_gototab(f, state); 666 667 f->gototab[state].entries[f->gototab[state].inuse].ch = ch; 668 f->gototab[state].entries[f->gototab[state].inuse].state = val; 669 f->gototab[state].inuse++; 670 return val; 671 } else { 672 // maybe we have it, maybe we don't 673 gtte key; 674 gtte *item; 675 676 key.ch = ch; 677 key.state = 0; /* irrelevant */ 678 item = (gtte *) bsearch(& key, f->gototab[state].entries, 679 f->gototab[state].inuse, sizeof(gtte), 680 entry_cmp); 681 682 if (item != NULL) { 683 // we have it, update state and return 684 item->state = val; 685 return item->state; 686 } 687 // otherwise, fall through to insert and reallocate. 688 } 689 690 gtt *tab = & f->gototab[state]; 691 if (tab->inuse + 1 >= tab->allocated) 692 resize_gototab(f, state); 693 f->gototab[state].entries[tab->inuse].ch = ch; 694 f->gototab[state].entries[tab->inuse].state = val; 695 ++tab->inuse; 696 697 qsort(f->gototab[state].entries, 698 f->gototab[state].inuse, sizeof(gtte), entry_cmp); 699 700 return val; /* not used anywhere at the moment */ 701 } 702 703 static void clear_gototab(fa *f, int state) 704 { 705 memset(f->gototab[state].entries, 0, 706 f->gototab[state].allocated * sizeof(gtte)); 707 f->gototab[state].inuse = 0; 708 } 709 710 int match(fa *f, const char *p0) /* shortest match ? */ 711 { 712 int s, ns; 713 int n; 714 int rune; 715 const uschar *p = (const uschar *) p0; 716 717 /* return pmatch(f, p0); does it matter whether longest or shortest? */ 718 719 s = f->initstat; 720 assert (s < f->state_count); 721 722 if (f->out[s]) 723 return(1); 724 do { 725 /* assert(*p < NCHARS); */ 726 n = u8_rune(&rune, (const char *) p); 727 if ((ns = get_gototab(f, s, rune)) != 0) 728 s = ns; 729 else 730 s = cgoto(f, s, rune); 731 if (f->out[s]) 732 return(1); 733 if (*p == 0) 734 break; 735 p += n; 736 } while (1); /* was *p++ != 0 */ 737 return(0); 738 } 739 740 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 741 { 742 int s, ns; 743 int n; 744 int rune; 745 const uschar *p = (const uschar *) p0; 746 const uschar *q; 747 748 s = f->initstat; 749 assert(s < f->state_count); 750 751 patbeg = (const char *)p; 752 patlen = -1; 753 do { 754 q = p; 755 do { 756 if (f->out[s]) /* final state */ 757 patlen = q-p; 758 /* assert(*q < NCHARS); */ 759 n = u8_rune(&rune, (const char *) q); 760 if ((ns = get_gototab(f, s, rune)) != 0) 761 s = ns; 762 else 763 s = cgoto(f, s, rune); 764 765 assert(s < f->state_count); 766 767 if (s == 1) { /* no transition */ 768 if (patlen >= 0) { 769 patbeg = (const char *) p; 770 return(1); 771 } 772 else 773 goto nextin; /* no match */ 774 } 775 if (*q == 0) 776 break; 777 q += n; 778 } while (1); 779 q++; /* was *q++ */ 780 if (f->out[s]) 781 patlen = q-p-1; /* don't count $ */ 782 if (patlen >= 0) { 783 patbeg = (const char *) p; 784 return(1); 785 } 786 nextin: 787 s = 2; 788 if (*p == 0) 789 break; 790 n = u8_rune(&rune, (const char *) p); 791 p += n; 792 } while (1); /* was *p++ */ 793 return (0); 794 } 795 796 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 797 { 798 int s, ns; 799 int n; 800 int rune; 801 const uschar *p = (const uschar *) p0; 802 const uschar *q; 803 804 s = f->initstat; 805 assert(s < f->state_count); 806 807 patbeg = (const char *)p; 808 patlen = -1; 809 while (*p) { 810 q = p; 811 do { 812 if (f->out[s]) /* final state */ 813 patlen = q-p; 814 /* assert(*q < NCHARS); */ 815 n = u8_rune(&rune, (const char *) q); 816 if ((ns = get_gototab(f, s, rune)) != 0) 817 s = ns; 818 else 819 s = cgoto(f, s, rune); 820 if (s == 1) { /* no transition */ 821 if (patlen > 0) { 822 patbeg = (const char *) p; 823 return(1); 824 } else 825 goto nnextin; /* no nonempty match */ 826 } 827 if (*q == 0) 828 break; 829 q += n; 830 } while (1); 831 q++; 832 if (f->out[s]) 833 patlen = q-p-1; /* don't count $ */ 834 if (patlen > 0 ) { 835 patbeg = (const char *) p; 836 return(1); 837 } 838 nnextin: 839 s = 2; 840 p++; 841 } 842 return (0); 843 } 844 845 846 /* 847 * NAME 848 * fnematch 849 * 850 * DESCRIPTION 851 * A stream-fed version of nematch which transfers characters to a 852 * null-terminated buffer. All characters up to and including the last 853 * character of the matching text or EOF are placed in the buffer. If 854 * a match is found, patbeg and patlen are set appropriately. 855 * 856 * RETURN VALUES 857 * false No match found. 858 * true Match found. 859 */ 860 861 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) 862 { 863 char *i, *j, *k, *buf = *pbuf; 864 int bufsize = *pbufsize; 865 int c, n, ns, s; 866 867 s = pfa->initstat; 868 patlen = 0; 869 870 /* 871 * buf <= i <= j <= k <= buf+bufsize 872 * 873 * i: origin of active substring 874 * j: current character 875 * k: destination of the next getc 876 */ 877 878 i = j = k = buf; 879 880 do { 881 /* 882 * Call u8_rune with at least awk_mb_cur_max ahead in 883 * the buffer until EOF interferes. 884 */ 885 if (k - j < (int)awk_mb_cur_max) { 886 if (k + awk_mb_cur_max > buf + bufsize) { 887 char *obuf = buf; 888 adjbuf((char **) &buf, &bufsize, 889 bufsize + awk_mb_cur_max, 890 quantum, 0, "fnematch"); 891 892 /* buf resized, maybe moved. update pointers */ 893 *pbufsize = bufsize; 894 if (obuf != buf) { 895 i = buf + (i - obuf); 896 j = buf + (j - obuf); 897 k = buf + (k - obuf); 898 *pbuf = buf; 899 if (patlen) 900 patbeg = buf + (patbeg - obuf); 901 } 902 } 903 for (n = awk_mb_cur_max ; n > 0; n--) { 904 *k++ = (c = getc(f)) != EOF ? c : 0; 905 if (c == EOF) { 906 if (ferror(f)) 907 FATAL("fnematch: getc error"); 908 break; 909 } 910 } 911 } 912 913 j += u8_rune(&c, j); 914 915 if ((ns = get_gototab(pfa, s, c)) != 0) 916 s = ns; 917 else 918 s = cgoto(pfa, s, c); 919 920 if (pfa->out[s]) { /* final state */ 921 patbeg = i; 922 patlen = j - i; 923 if (c == 0) /* don't count $ */ 924 patlen--; 925 } 926 927 if (c && s != 1) 928 continue; /* origin i still viable, next j */ 929 if (patlen) 930 break; /* best match found */ 931 932 /* no match at origin i, next i and start over */ 933 i += u8_rune(&c, i); 934 if (c == 0) 935 break; /* no match */ 936 j = i; 937 s = 2; 938 } while (1); 939 940 if (patlen) { 941 /* 942 * Under no circumstances is the last character fed to 943 * the automaton part of the match. It is EOF's nullbyte, 944 * or it sent the automaton into a state with no further 945 * transitions available (s==1), or both. Room for a 946 * terminating nullbyte is guaranteed. 947 * 948 * ungetc any chars after the end of matching text 949 * (except for EOF's nullbyte, if present) and null 950 * terminate the buffer. 951 */ 952 do 953 if (*--k && ungetc(*k, f) == EOF) 954 FATAL("unable to ungetc '%c'", *k); 955 while (k > patbeg + patlen); 956 *k = '\0'; 957 return true; 958 } 959 else 960 return false; 961 } 962 963 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 964 { /* uses relex() to scan regular expression */ 965 Node *np; 966 967 DPRINTF("reparse <%s>\n", p); 968 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 969 rtok = relex(); 970 /* GNU compatibility: an empty regexp matches anything */ 971 if (rtok == '\0') { 972 /* FATAL("empty regular expression"); previous */ 973 return(op2(EMPTYRE, NIL, NIL)); 974 } 975 np = regexp(); 976 if (rtok != '\0') 977 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 978 return(np); 979 } 980 981 Node *regexp(void) /* top-level parse of reg expr */ 982 { 983 return (alt(concat(primary()))); 984 } 985 986 Node *primary(void) 987 { 988 Node *np; 989 int savelastatom; 990 991 switch (rtok) { 992 case CHAR: 993 lastatom = starttok; 994 np = op2(CHAR, NIL, itonp(rlxval)); 995 rtok = relex(); 996 return (unary(np)); 997 case ALL: 998 rtok = relex(); 999 return (unary(op2(ALL, NIL, NIL))); 1000 case EMPTYRE: 1001 rtok = relex(); 1002 return (unary(op2(EMPTYRE, NIL, NIL))); 1003 case DOT: 1004 lastatom = starttok; 1005 rtok = relex(); 1006 return (unary(op2(DOT, NIL, NIL))); 1007 case CCL: 1008 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 1009 lastatom = starttok; 1010 rtok = relex(); 1011 return (unary(np)); 1012 case NCCL: 1013 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 1014 lastatom = starttok; 1015 rtok = relex(); 1016 return (unary(np)); 1017 case '^': 1018 rtok = relex(); 1019 return (unary(op2(CHAR, NIL, itonp(HAT)))); 1020 case '$': 1021 rtok = relex(); 1022 return (unary(op2(CHAR, NIL, NIL))); 1023 case '(': 1024 lastatom = starttok; 1025 savelastatom = starttok - basestr; /* Retain over recursion */ 1026 rtok = relex(); 1027 if (rtok == ')') { /* special pleading for () */ 1028 rtok = relex(); 1029 return unary(op2(CCL, NIL, (Node *) cclenter(""))); 1030 } 1031 np = regexp(); 1032 if (rtok == ')') { 1033 lastatom = basestr + savelastatom; /* Restore */ 1034 rtok = relex(); 1035 return (unary(np)); 1036 } 1037 else 1038 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 1039 default: 1040 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 1041 } 1042 return 0; /*NOTREACHED*/ 1043 } 1044 1045 Node *concat(Node *np) 1046 { 1047 switch (rtok) { 1048 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 1049 return (concat(op2(CAT, np, primary()))); 1050 case EMPTYRE: 1051 rtok = relex(); 1052 return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")), 1053 primary()))); 1054 } 1055 return (np); 1056 } 1057 1058 Node *alt(Node *np) 1059 { 1060 if (rtok == OR) { 1061 rtok = relex(); 1062 return (alt(op2(OR, np, concat(primary())))); 1063 } 1064 return (np); 1065 } 1066 1067 Node *unary(Node *np) 1068 { 1069 switch (rtok) { 1070 case STAR: 1071 rtok = relex(); 1072 return (unary(op2(STAR, np, NIL))); 1073 case PLUS: 1074 rtok = relex(); 1075 return (unary(op2(PLUS, np, NIL))); 1076 case QUEST: 1077 rtok = relex(); 1078 return (unary(op2(QUEST, np, NIL))); 1079 case ZERO: 1080 rtok = relex(); 1081 return (unary(op2(ZERO, np, NIL))); 1082 default: 1083 return (np); 1084 } 1085 } 1086 1087 /* 1088 * Character class definitions conformant to the POSIX locale as 1089 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 1090 * and operating character sets are both ASCII (ISO646) or supersets 1091 * thereof. 1092 * 1093 * Note that to avoid overflowing the temporary buffer used in 1094 * relex(), the expanded character class (prior to range expansion) 1095 * must be less than twice the size of their full name. 1096 */ 1097 1098 /* Because isblank doesn't show up in any of the header files on any 1099 * system i use, it's defined here. if some other locale has a richer 1100 * definition of "blank", define HAS_ISBLANK and provide your own 1101 * version. 1102 * the parentheses here are an attempt to find a path through the maze 1103 * of macro definition and/or function and/or version provided. thanks 1104 * to nelson beebe for the suggestion; let's see if it works everywhere. 1105 */ 1106 1107 /* #define HAS_ISBLANK */ 1108 #ifndef HAS_ISBLANK 1109 1110 int (xisblank)(int c) 1111 { 1112 return c==' ' || c=='\t'; 1113 } 1114 1115 #endif 1116 1117 static const struct charclass { 1118 const char *cc_name; 1119 int cc_namelen; 1120 int (*cc_func)(int); 1121 } charclasses[] = { 1122 { "alnum", 5, isalnum }, 1123 { "alpha", 5, isalpha }, 1124 #ifndef HAS_ISBLANK 1125 { "blank", 5, xisblank }, 1126 #else 1127 { "blank", 5, isblank }, 1128 #endif 1129 { "cntrl", 5, iscntrl }, 1130 { "digit", 5, isdigit }, 1131 { "graph", 5, isgraph }, 1132 { "lower", 5, islower }, 1133 { "print", 5, isprint }, 1134 { "punct", 5, ispunct }, 1135 { "space", 5, isspace }, 1136 { "upper", 5, isupper }, 1137 { "xdigit", 6, isxdigit }, 1138 { NULL, 0, NULL }, 1139 }; 1140 1141 #define REPEAT_SIMPLE 0 1142 #define REPEAT_PLUS_APPENDED 1 1143 #define REPEAT_WITH_Q 2 1144 #define REPEAT_ZERO 3 1145 1146 static int 1147 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1148 int atomlen, int firstnum, int secondnum, int special_case) 1149 { 1150 int i, j; 1151 uschar *buf = 0; 1152 int ret = 1; 1153 int init_q = (firstnum == 0); /* first added char will be ? */ 1154 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 1155 int prefix_length = reptok - basestr; /* prefix includes first rep */ 1156 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */ 1157 int size = prefix_length + suffix_length; 1158 1159 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 1160 size += atomlen*(firstnum-1); 1161 } 1162 1163 /* Adjust size of buffer for special cases */ 1164 if (special_case == REPEAT_PLUS_APPENDED) { 1165 size++; /* for the final + */ 1166 } else if (special_case == REPEAT_WITH_Q) { 1167 size += init_q + (atomlen+1)* (n_q_reps-init_q); 1168 } else if (special_case == REPEAT_ZERO) { 1169 size += 2; /* just a null ERE: () */ 1170 } 1171 if ((buf = (uschar *) malloc(size + 1)) == NULL) 1172 FATAL("out of space in reg expr %.10s..", lastre); 1173 memcpy(buf, basestr, prefix_length); /* copy prefix */ 1174 j = prefix_length; 1175 if (special_case == REPEAT_ZERO) { 1176 j -= atomlen; 1177 buf[j++] = '('; 1178 buf[j++] = ')'; 1179 } 1180 for (i = 1; i < firstnum; i++) { /* copy x reps */ 1181 memcpy(&buf[j], atom, atomlen); 1182 j += atomlen; 1183 } 1184 if (special_case == REPEAT_PLUS_APPENDED) { 1185 buf[j++] = '+'; 1186 } else if (special_case == REPEAT_WITH_Q) { 1187 if (init_q) 1188 buf[j++] = '?'; 1189 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */ 1190 memcpy(&buf[j], atom, atomlen); 1191 j += atomlen; 1192 buf[j++] = '?'; 1193 } 1194 } 1195 memcpy(&buf[j], reptok+reptoklen, suffix_length); 1196 j += suffix_length; 1197 buf[j] = '\0'; 1198 /* free old basestr */ 1199 if (firstbasestr != basestr) { 1200 if (basestr) 1201 xfree(basestr); 1202 } 1203 basestr = buf; 1204 prestr = buf + prefix_length; 1205 if (special_case == REPEAT_ZERO) { 1206 prestr -= atomlen; 1207 ret++; 1208 } 1209 return ret; 1210 } 1211 1212 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1213 int atomlen, int firstnum, int secondnum) 1214 { 1215 /* 1216 In general, the repetition specifier or "bound" is replaced here 1217 by an equivalent ERE string, repeating the immediately previous atom 1218 and appending ? and + as needed. Note that the first copy of the 1219 atom is left in place, except in the special_case of a zero-repeat 1220 (i.e., {0}). 1221 */ 1222 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 1223 if (firstnum < 2) { 1224 /* 0 or 1: should be handled before you get here */ 1225 FATAL("internal error"); 1226 } else { 1227 return replace_repeat(reptok, reptoklen, atom, atomlen, 1228 firstnum, secondnum, REPEAT_PLUS_APPENDED); 1229 } 1230 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 1231 if (firstnum == 0) { /* {0} or {0,0} */ 1232 /* This case is unusual because the resulting 1233 replacement string might actually be SMALLER than 1234 the original ERE */ 1235 return replace_repeat(reptok, reptoklen, atom, atomlen, 1236 firstnum, secondnum, REPEAT_ZERO); 1237 } else { /* (firstnum >= 1) */ 1238 return replace_repeat(reptok, reptoklen, atom, atomlen, 1239 firstnum, secondnum, REPEAT_SIMPLE); 1240 } 1241 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 1242 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 1243 return replace_repeat(reptok, reptoklen, atom, atomlen, 1244 firstnum, secondnum, REPEAT_WITH_Q); 1245 } else { /* Error - shouldn't be here (n>m) */ 1246 FATAL("internal error"); 1247 } 1248 return 0; 1249 } 1250 1251 int relex(void) /* lexical analyzer for reparse */ 1252 { 1253 int c, n; 1254 int cflag; 1255 static uschar *buf = NULL; 1256 static int bufsz = 100; 1257 uschar *bp; 1258 const struct charclass *cc; 1259 int i; 1260 int num, m; 1261 bool commafound, digitfound; 1262 const uschar *startreptok; 1263 static int parens = 0; 1264 1265 rescan: 1266 starttok = prestr; 1267 1268 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) { 1269 prestr += n; 1270 starttok = prestr; 1271 return CHAR; 1272 } 1273 1274 switch (c = *prestr++) { 1275 case '|': return OR; 1276 case '*': return STAR; 1277 case '+': return PLUS; 1278 case '?': return QUEST; 1279 case '.': return DOT; 1280 case '\0': prestr--; return '\0'; 1281 case '^': 1282 case '$': 1283 return c; 1284 case '(': 1285 parens++; 1286 return c; 1287 case ')': 1288 if (parens) { 1289 parens--; 1290 return c; 1291 } 1292 /* unmatched close parenthesis; per POSIX, treat as literal */ 1293 rlxval = c; 1294 return CHAR; 1295 case '\\': 1296 rlxval = quoted(&prestr); 1297 return CHAR; 1298 default: 1299 rlxval = c; 1300 return CHAR; 1301 case '[': 1302 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 1303 FATAL("out of space in reg expr %.10s..", lastre); 1304 bp = buf; 1305 if (*prestr == '^') { 1306 cflag = 1; 1307 prestr++; 1308 } 1309 else 1310 cflag = 0; 1311 n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */ 1312 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 1313 FATAL("out of space for reg expr %.10s...", lastre); 1314 for (; ; ) { 1315 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) { 1316 for (i = 0; i < n; i++) 1317 *bp++ = *prestr++; 1318 continue; 1319 } 1320 if ((c = *prestr++) == '\\') { 1321 *bp++ = '\\'; 1322 if ((c = *prestr++) == '\0') 1323 FATAL("nonterminated character class %.20s...", lastre); 1324 *bp++ = c; 1325 /* } else if (c == '\n') { */ 1326 /* FATAL("newline in character class %.20s...", lastre); */ 1327 } else if (c == '[' && *prestr == ':') { 1328 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1329 for (cc = charclasses; cc->cc_name; cc++) 1330 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1331 break; 1332 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1333 prestr[2 + cc->cc_namelen] == ']') { 1334 prestr += cc->cc_namelen + 3; 1335 /* 1336 * BUG: We begin at 1, instead of 0, since we 1337 * would otherwise prematurely terminate the 1338 * string for classes like [[:cntrl:]]. This 1339 * means that we can't match the NUL character, 1340 * not without first adapting the entire 1341 * program to track each string's length. 1342 */ 1343 for (i = 1; i <= UCHAR_MAX; i++) { 1344 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2")) 1345 FATAL("out of space for reg expr %.10s...", lastre); 1346 if (cc->cc_func(i)) { 1347 /* escape backslash */ 1348 if (i == '\\') { 1349 *bp++ = '\\'; 1350 n++; 1351 } 1352 1353 *bp++ = i; 1354 n++; 1355 } 1356 } 1357 } else 1358 *bp++ = c; 1359 } else if (c == '[' && *prestr == '.') { 1360 char collate_char; 1361 prestr++; 1362 collate_char = *prestr++; 1363 if (*prestr == '.' && prestr[1] == ']') { 1364 prestr += 2; 1365 /* Found it: map via locale TBD: for 1366 now, simply return this char. This 1367 is sufficient to pass conformance 1368 test awk.ex 156 1369 */ 1370 if (*prestr == ']') { 1371 prestr++; 1372 rlxval = collate_char; 1373 return CHAR; 1374 } 1375 } 1376 } else if (c == '[' && *prestr == '=') { 1377 char equiv_char; 1378 prestr++; 1379 equiv_char = *prestr++; 1380 if (*prestr == '=' && prestr[1] == ']') { 1381 prestr += 2; 1382 /* Found it: map via locale TBD: for now 1383 simply return this char. This is 1384 sufficient to pass conformance test 1385 awk.ex 156 1386 */ 1387 if (*prestr == ']') { 1388 prestr++; 1389 rlxval = equiv_char; 1390 return CHAR; 1391 } 1392 } 1393 } else if (c == '\0') { 1394 FATAL("nonterminated character class %.20s", lastre); 1395 } else if (bp == buf) { /* 1st char is special */ 1396 *bp++ = c; 1397 } else if (c == ']') { 1398 *bp++ = 0; 1399 rlxstr = (uschar *) tostring((char *) buf); 1400 if (cflag == 0) 1401 return CCL; 1402 else 1403 return NCCL; 1404 } else 1405 *bp++ = c; 1406 } 1407 break; 1408 case '{': 1409 if (isdigit((int) *(prestr))) { 1410 num = 0; /* Process as a repetition */ 1411 n = -1; m = -1; 1412 commafound = false; 1413 digitfound = false; 1414 startreptok = prestr-1; 1415 /* Remember start of previous atom here ? */ 1416 } else { /* just a { char, not a repetition */ 1417 rlxval = c; 1418 return CHAR; 1419 } 1420 for (; ; ) { 1421 if ((c = *prestr++) == '}') { 1422 if (commafound) { 1423 if (digitfound) { /* {n,m} */ 1424 m = num; 1425 if (m < n) 1426 FATAL("illegal repetition expression: class %.20s", 1427 lastre); 1428 if (n == 0 && m == 1) { 1429 return QUEST; 1430 } 1431 } else { /* {n,} */ 1432 if (n == 0) 1433 return STAR; 1434 else if (n == 1) 1435 return PLUS; 1436 } 1437 } else { 1438 if (digitfound) { /* {n} same as {n,n} */ 1439 n = num; 1440 m = num; 1441 } else { /* {} */ 1442 FATAL("illegal repetition expression: class %.20s", 1443 lastre); 1444 } 1445 } 1446 if (repeat(starttok, prestr-starttok, lastatom, 1447 startreptok - lastatom, n, m) > 0) { 1448 if (n == 0 && m == 0) { 1449 return ZERO; 1450 } 1451 /* must rescan input for next token */ 1452 goto rescan; 1453 } 1454 /* Failed to replace: eat up {...} characters 1455 and treat like just PLUS */ 1456 return PLUS; 1457 } else if (c == '\0') { 1458 FATAL("nonterminated character class %.20s", 1459 lastre); 1460 } else if (isdigit(c)) { 1461 num = 10 * num + c - '0'; 1462 digitfound = true; 1463 } else if (c == ',') { 1464 if (commafound) 1465 FATAL("illegal repetition expression: class %.20s", 1466 lastre); 1467 /* looking for {n,} or {n,m} */ 1468 commafound = true; 1469 n = num; 1470 digitfound = false; /* reset */ 1471 num = 0; 1472 } else { 1473 FATAL("illegal repetition expression: class %.20s", 1474 lastre); 1475 } 1476 } 1477 break; 1478 } 1479 } 1480 1481 int cgoto(fa *f, int s, int c) 1482 { 1483 int *p, *q; 1484 int i, j, k; 1485 1486 /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */ 1487 while (f->accept >= maxsetvec) { /* guessing here! */ 1488 resizesetvec(__func__); 1489 } 1490 for (i = 0; i <= f->accept; i++) 1491 setvec[i] = 0; 1492 setcnt = 0; 1493 resize_state(f, s); 1494 /* compute positions of gototab[s,c] into setvec */ 1495 p = f->posns[s]; 1496 for (i = 1; i <= *p; i++) { 1497 if ((k = f->re[p[i]].ltype) != FINAL) { 1498 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1499 || (k == DOT && c != 0 && c != HAT) 1500 || (k == ALL && c != 0) 1501 || (k == EMPTYRE && c != 0) 1502 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp)) 1503 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) { 1504 q = f->re[p[i]].lfollow; 1505 for (j = 1; j <= *q; j++) { 1506 if (q[j] >= maxsetvec) { 1507 resizesetvec(__func__); 1508 } 1509 if (setvec[q[j]] == 0) { 1510 setcnt++; 1511 setvec[q[j]] = 1; 1512 } 1513 } 1514 } 1515 } 1516 } 1517 /* determine if setvec is a previous state */ 1518 tmpset[0] = setcnt; 1519 j = 1; 1520 for (i = f->accept; i >= 0; i--) 1521 if (setvec[i]) { 1522 tmpset[j++] = i; 1523 } 1524 resize_state(f, f->curstat > s ? f->curstat : s); 1525 /* tmpset == previous state? */ 1526 for (i = 1; i <= f->curstat; i++) { 1527 p = f->posns[i]; 1528 if ((k = tmpset[0]) != p[0]) 1529 goto different; 1530 for (j = 1; j <= k; j++) 1531 if (tmpset[j] != p[j]) 1532 goto different; 1533 /* setvec is state i */ 1534 if (c != HAT) 1535 set_gototab(f, s, c, i); 1536 return i; 1537 different:; 1538 } 1539 1540 /* add tmpset to current set of states */ 1541 ++(f->curstat); 1542 resize_state(f, f->curstat); 1543 clear_gototab(f, f->curstat); 1544 xfree(f->posns[f->curstat]); 1545 p = intalloc(setcnt + 1, __func__); 1546 1547 f->posns[f->curstat] = p; 1548 if (c != HAT) 1549 set_gototab(f, s, c, f->curstat); 1550 for (i = 0; i <= setcnt; i++) 1551 p[i] = tmpset[i]; 1552 if (setvec[f->accept]) 1553 f->out[f->curstat] = 1; 1554 else 1555 f->out[f->curstat] = 0; 1556 return f->curstat; 1557 } 1558 1559 1560 void freefa(fa *f) /* free a finite automaton */ 1561 { 1562 int i; 1563 1564 if (f == NULL) 1565 return; 1566 for (i = 0; i < f->state_count; i++) 1567 xfree(f->gototab[i].entries); 1568 xfree(f->gototab); 1569 for (i = 0; i <= f->curstat; i++) 1570 xfree(f->posns[i]); 1571 for (i = 0; i <= f->accept; i++) { 1572 xfree(f->re[i].lfollow); 1573 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1574 xfree(f->re[i].lval.np); 1575 } 1576 xfree(f->restr); 1577 xfree(f->out); 1578 xfree(f->posns); 1579 xfree(f->gototab); 1580 xfree(f); 1581 } 1582