1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 /* lasciate ogne speranza, voi ch'intrate. */ 26 27 #define DEBUG 28 29 #include <ctype.h> 30 #include <limits.h> 31 #include <stdio.h> 32 #include <string.h> 33 #include <stdlib.h> 34 #include "awk.h" 35 #include "awkgram.tab.h" 36 37 #define MAXLIN 22 38 39 #define type(v) (v)->nobj /* badly overloaded here */ 40 #define info(v) (v)->ntype /* badly overloaded here */ 41 #define left(v) (v)->narg[0] 42 #define right(v) (v)->narg[1] 43 #define parent(v) (v)->nnext 44 45 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 46 #define ELEAF case EMPTYRE: /* empty string in regexp */ 47 #define UNARY case STAR: case PLUS: case QUEST: 48 49 /* encoding in tree Nodes: 50 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 51 left is index, right contains value or pointer to value 52 unary (STAR, PLUS, QUEST): left is child, right is null 53 binary (CAT, OR): left and right are children 54 parent contains pointer to parent 55 */ 56 57 58 int *setvec; 59 int *tmpset; 60 int maxsetvec = 0; 61 62 int rtok; /* next token in current re */ 63 int rlxval; 64 static const uschar *rlxstr; 65 static const uschar *prestr; /* current position in current re */ 66 static const uschar *lastre; /* origin of last re */ 67 static const uschar *lastatom; /* origin of last Atom */ 68 static const uschar *starttok; 69 static const uschar *basestr; /* starts with original, replaced during 70 repetition processing */ 71 static const uschar *firstbasestr; 72 73 static int setcnt; 74 static int poscnt; 75 76 const char *patbeg; 77 int patlen; 78 79 #define NFA 128 /* cache this many dynamic fa's */ 80 fa *fatab[NFA]; 81 int nfatab = 0; /* entries in fatab */ 82 83 extern int u8_nextlen(const char *s); 84 85 86 /* utf-8 mechanism: 87 88 For most of Awk, utf-8 strings just "work", since they look like 89 null-terminated sequences of 8-bit bytes. 90 91 Functions like length(), index(), and substr() have to operate 92 in units of utf-8 characters. The u8_* functions in run.c 93 handle this. 94 95 Regular expressions are more complicated, since the basic 96 mechanism of the goto table used 8-bit byte indices into the 97 gototab entries to compute the next state. Unicode is a lot 98 bigger, so the gototab entries are now structs with a character 99 and a next state. These are sorted by code point and binary 100 searched. 101 102 Throughout the RE mechanism in b.c, utf-8 characters are 103 converted to their utf-32 value. This mostly shows up in 104 cclenter, which expands character class ranges like a-z and now 105 alpha-omega. The size of a gototab array is still about 256. 106 This should be dynamic, but for now things work ok for a single 107 code page of Unicode, which is the most likely case. 108 109 The code changes are localized in run.c and b.c. I have added a 110 handful of functions to somewhat better hide the implementation, 111 but a lot more could be done. 112 113 */ 114 115 static int entry_cmp(const void *l, const void *r); 116 static int get_gototab(fa*, int, int); 117 static int set_gototab(fa*, int, int, int); 118 static void clear_gototab(fa*, int); 119 extern int u8_rune(int *, const char *); 120 121 static int * 122 intalloc(size_t n, const char *f) 123 { 124 int *p = (int *) calloc(n, sizeof(int)); 125 if (p == NULL) 126 overflo(f); 127 return p; 128 } 129 130 static void 131 resizesetvec(const char *f) 132 { 133 if (maxsetvec == 0) 134 maxsetvec = MAXLIN; 135 else 136 maxsetvec *= 4; 137 setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec)); 138 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset)); 139 if (setvec == NULL || tmpset == NULL) 140 overflo(f); 141 } 142 143 static void 144 resize_state(fa *f, int state) 145 { 146 gtt *p; 147 uschar *p2; 148 int **p3; 149 int i, new_count; 150 151 if (++state < f->state_count) 152 return; 153 154 new_count = state + 10; /* needs to be tuned */ 155 156 p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt)); 157 if (p == NULL) 158 goto out; 159 f->gototab = p; 160 161 p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0])); 162 if (p2 == NULL) 163 goto out; 164 f->out = p2; 165 166 p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0])); 167 if (p3 == NULL) 168 goto out; 169 f->posns = p3; 170 171 for (i = f->state_count; i < new_count; ++i) { 172 f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte)); 173 if (f->gototab[i].entries == NULL) 174 goto out; 175 f->gototab[i].allocated = NCHARS; 176 f->gototab[i].inuse = 0; 177 f->out[i] = 0; 178 f->posns[i] = NULL; 179 } 180 f->state_count = new_count; 181 return; 182 out: 183 overflo(__func__); 184 } 185 186 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */ 187 { 188 int i, use, nuse; 189 fa *pfa; 190 static int now = 1; 191 192 if (setvec == NULL) { /* first time through any RE */ 193 resizesetvec(__func__); 194 } 195 196 if (compile_time != RUNNING) /* a constant for sure */ 197 return mkdfa(s, anchor); 198 for (i = 0; i < nfatab; i++) /* is it there already? */ 199 if (fatab[i]->anchor == anchor 200 && strcmp((const char *) fatab[i]->restr, s) == 0) { 201 fatab[i]->use = now++; 202 return fatab[i]; 203 } 204 pfa = mkdfa(s, anchor); 205 if (nfatab < NFA) { /* room for another */ 206 fatab[nfatab] = pfa; 207 fatab[nfatab]->use = now++; 208 nfatab++; 209 return pfa; 210 } 211 use = fatab[0]->use; /* replace least-recently used */ 212 nuse = 0; 213 for (i = 1; i < nfatab; i++) 214 if (fatab[i]->use < use) { 215 use = fatab[i]->use; 216 nuse = i; 217 } 218 freefa(fatab[nuse]); 219 fatab[nuse] = pfa; 220 pfa->use = now++; 221 return pfa; 222 } 223 224 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */ 225 /* anchor = true for anchored matches, else false */ 226 { 227 Node *p, *p1; 228 fa *f; 229 230 firstbasestr = (const uschar *) s; 231 basestr = firstbasestr; 232 p = reparse(s); 233 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 234 /* put ALL STAR in front of reg. exp. */ 235 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 236 /* put FINAL after reg. exp. */ 237 238 poscnt = 0; 239 penter(p1); /* enter parent pointers and leaf indices */ 240 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL) 241 overflo(__func__); 242 f->accept = poscnt-1; /* penter has computed number of positions in re */ 243 cfoll(f, p1); /* set up follow sets */ 244 freetr(p1); 245 resize_state(f, 1); 246 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__); 247 f->posns[1] = intalloc(1, __func__); 248 *f->posns[1] = 0; 249 f->initstat = makeinit(f, anchor); 250 f->anchor = anchor; 251 f->restr = (uschar *) tostring(s); 252 if (firstbasestr != basestr) { 253 if (basestr) 254 xfree(basestr); 255 } 256 return f; 257 } 258 259 int makeinit(fa *f, bool anchor) 260 { 261 int i, k; 262 263 f->curstat = 2; 264 f->out[2] = 0; 265 k = *(f->re[0].lfollow); 266 xfree(f->posns[2]); 267 f->posns[2] = intalloc(k + 1, __func__); 268 for (i = 0; i <= k; i++) { 269 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 270 } 271 if ((f->posns[2])[1] == f->accept) 272 f->out[2] = 1; 273 clear_gototab(f, 2); 274 f->curstat = cgoto(f, 2, HAT); 275 if (anchor) { 276 *f->posns[2] = k-1; /* leave out position 0 */ 277 for (i = 0; i < k; i++) { 278 (f->posns[0])[i] = (f->posns[2])[i]; 279 } 280 281 f->out[0] = f->out[2]; 282 if (f->curstat != 2) 283 --(*f->posns[f->curstat]); 284 } 285 return f->curstat; 286 } 287 288 void penter(Node *p) /* set up parent pointers and leaf indices */ 289 { 290 switch (type(p)) { 291 ELEAF 292 LEAF 293 info(p) = poscnt; 294 poscnt++; 295 break; 296 UNARY 297 penter(left(p)); 298 parent(left(p)) = p; 299 break; 300 case CAT: 301 case OR: 302 penter(left(p)); 303 penter(right(p)); 304 parent(left(p)) = p; 305 parent(right(p)) = p; 306 break; 307 case ZERO: 308 break; 309 default: /* can't happen */ 310 FATAL("can't happen: unknown type %d in penter", type(p)); 311 break; 312 } 313 } 314 315 void freetr(Node *p) /* free parse tree */ 316 { 317 switch (type(p)) { 318 ELEAF 319 LEAF 320 xfree(p); 321 break; 322 UNARY 323 case ZERO: 324 freetr(left(p)); 325 xfree(p); 326 break; 327 case CAT: 328 case OR: 329 freetr(left(p)); 330 freetr(right(p)); 331 xfree(p); 332 break; 333 default: /* can't happen */ 334 FATAL("can't happen: unknown type %d in freetr", type(p)); 335 break; 336 } 337 } 338 339 /* in the parsing of regular expressions, metacharacters like . have */ 340 /* to be seen literally; \056 is not a metacharacter. */ 341 342 int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */ 343 { /* only pick up one 8-bit byte (2 chars) */ 344 const uschar *p; 345 int n = 0; 346 int i; 347 348 for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) { 349 if (isdigit((int) *p)) 350 n = 16 * n + *p - '0'; 351 else if (*p >= 'a' && *p <= 'f') 352 n = 16 * n + *p - 'a' + 10; 353 else if (*p >= 'A' && *p <= 'F') 354 n = 16 * n + *p - 'A' + 10; 355 } 356 *pp = p; 357 return n; 358 } 359 360 361 362 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 363 364 int quoted(const uschar **pp) /* pick up next thing after a \\ */ 365 /* and increment *pp */ 366 { 367 const uschar *p = *pp; 368 int c; 369 370 /* BUG: should advance by utf-8 char even if makes no sense */ 371 372 if ((c = *p++) == 't') { 373 c = '\t'; 374 } else if (c == 'n') { 375 c = '\n'; 376 } else if (c == 'f') { 377 c = '\f'; 378 } else if (c == 'r') { 379 c = '\r'; 380 } else if (c == 'b') { 381 c = '\b'; 382 } else if (c == 'v') { 383 c = '\v'; 384 } else if (c == 'a') { 385 c = '\a'; 386 } else if (c == '\\') { 387 c = '\\'; 388 } else if (c == 'x') { /* 2 hex digits follow */ 389 c = hexstr(&p, 2); /* this adds a null if number is invalid */ 390 } else if (c == 'u') { /* unicode char number up to 8 hex digits */ 391 c = hexstr(&p, 8); 392 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 393 int n = c - '0'; 394 if (isoctdigit(*p)) { 395 n = 8 * n + *p++ - '0'; 396 if (isoctdigit(*p)) 397 n = 8 * n + *p++ - '0'; 398 } 399 c = n; 400 } /* else */ 401 /* c = c; */ 402 *pp = p; 403 return c; 404 } 405 406 int *cclenter(const char *argp) /* add a character class */ 407 { 408 int i, c, c2; 409 int n; 410 const uschar *p = (const uschar *) argp; 411 int *bp, *retp; 412 static int *buf = NULL; 413 static int bufsz = 100; 414 415 if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL) 416 FATAL("out of space for character class [%.10s...] 1", p); 417 bp = buf; 418 for (i = 0; *p != 0; ) { 419 n = u8_rune(&c, (const char *) p); 420 p += n; 421 if (c == '\\') { 422 c = quoted(&p); 423 } else if (c == '-' && i > 0 && bp[-1] != 0) { 424 if (*p != 0) { 425 c = bp[-1]; 426 /* c2 = *p++; */ 427 n = u8_rune(&c2, (const char *) p); 428 p += n; 429 if (c2 == '\\') 430 c2 = quoted(&p); /* BUG: sets p, has to be u8 size */ 431 if (c > c2) { /* empty; ignore */ 432 bp--; 433 i--; 434 continue; 435 } 436 while (c < c2) { 437 if (i >= bufsz) { 438 bufsz *= 2; 439 buf = (int *) realloc(buf, bufsz * sizeof(int)); 440 if (buf == NULL) 441 FATAL("out of space for character class [%.10s...] 2", p); 442 bp = buf + i; 443 } 444 *bp++ = ++c; 445 i++; 446 } 447 continue; 448 } 449 } 450 if (i >= bufsz) { 451 bufsz *= 2; 452 buf = (int *) realloc(buf, bufsz * sizeof(int)); 453 if (buf == NULL) 454 FATAL("out of space for character class [%.10s...] 2", p); 455 bp = buf + i; 456 } 457 *bp++ = c; 458 i++; 459 } 460 *bp = 0; 461 /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */ 462 /* xfree(op); BUG: what are we freeing here? */ 463 retp = (int *) calloc(bp-buf+1, sizeof(int)); 464 for (i = 0; i < bp-buf+1; i++) 465 retp[i] = buf[i]; 466 return retp; 467 } 468 469 void overflo(const char *s) 470 { 471 FATAL("regular expression too big: out of space in %.30s...", s); 472 } 473 474 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 475 { 476 int i; 477 int *p; 478 479 switch (type(v)) { 480 ELEAF 481 LEAF 482 f->re[info(v)].ltype = type(v); 483 f->re[info(v)].lval.np = right(v); 484 while (f->accept >= maxsetvec) { /* guessing here! */ 485 resizesetvec(__func__); 486 } 487 for (i = 0; i <= f->accept; i++) 488 setvec[i] = 0; 489 setcnt = 0; 490 follow(v); /* computes setvec and setcnt */ 491 p = intalloc(setcnt + 1, __func__); 492 f->re[info(v)].lfollow = p; 493 *p = setcnt; 494 for (i = f->accept; i >= 0; i--) 495 if (setvec[i] == 1) 496 *++p = i; 497 break; 498 UNARY 499 cfoll(f,left(v)); 500 break; 501 case CAT: 502 case OR: 503 cfoll(f,left(v)); 504 cfoll(f,right(v)); 505 break; 506 case ZERO: 507 break; 508 default: /* can't happen */ 509 FATAL("can't happen: unknown type %d in cfoll", type(v)); 510 } 511 } 512 513 int first(Node *p) /* collects initially active leaves of p into setvec */ 514 /* returns 0 if p matches empty string */ 515 { 516 int b, lp; 517 518 switch (type(p)) { 519 ELEAF 520 LEAF 521 lp = info(p); /* look for high-water mark of subscripts */ 522 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 523 resizesetvec(__func__); 524 } 525 if (type(p) == EMPTYRE) { 526 setvec[lp] = 0; 527 return(0); 528 } 529 if (setvec[lp] != 1) { 530 setvec[lp] = 1; 531 setcnt++; 532 } 533 if (type(p) == CCL && (*(int *) right(p)) == 0) 534 return(0); /* empty CCL */ 535 return(1); 536 case PLUS: 537 if (first(left(p)) == 0) 538 return(0); 539 return(1); 540 case STAR: 541 case QUEST: 542 first(left(p)); 543 return(0); 544 case CAT: 545 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 546 return(1); 547 case OR: 548 b = first(right(p)); 549 if (first(left(p)) == 0 || b == 0) return(0); 550 return(1); 551 case ZERO: 552 return 0; 553 } 554 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 555 return(-1); 556 } 557 558 void follow(Node *v) /* collects leaves that can follow v into setvec */ 559 { 560 Node *p; 561 562 if (type(v) == FINAL) 563 return; 564 p = parent(v); 565 switch (type(p)) { 566 case STAR: 567 case PLUS: 568 first(v); 569 follow(p); 570 return; 571 572 case OR: 573 case QUEST: 574 follow(p); 575 return; 576 577 case CAT: 578 if (v == left(p)) { /* v is left child of p */ 579 if (first(right(p)) == 0) { 580 follow(p); 581 return; 582 } 583 } else /* v is right child */ 584 follow(p); 585 return; 586 } 587 } 588 589 int member(int c, int *sarg) /* is c in s? */ 590 { 591 int *s = (int *) sarg; 592 593 while (*s) 594 if (c == *s++) 595 return(1); 596 return(0); 597 } 598 599 static void resize_gototab(fa *f, int state) 600 { 601 size_t new_size = f->gototab[state].allocated * 2; 602 gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte)); 603 if (p == NULL) 604 overflo(__func__); 605 606 // need to initialized the new memory to zero 607 size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size 608 memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out 609 610 f->gototab[state].allocated = new_size; // update gototab info 611 f->gototab[state].entries = p; 612 } 613 614 static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */ 615 { 616 gtte key; 617 gtte *item; 618 619 key.ch = ch; 620 key.state = 0; /* irrelevant */ 621 item = (gtte *) bsearch(& key, f->gototab[state].entries, 622 f->gototab[state].inuse, sizeof(gtte), 623 entry_cmp); 624 625 if (item == NULL) 626 return 0; 627 else 628 return item->state; 629 } 630 631 static int entry_cmp(const void *l, const void *r) 632 { 633 const gtte *left, *right; 634 635 left = (const gtte *) l; 636 right = (const gtte *) r; 637 638 return left->ch - right->ch; 639 } 640 641 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */ 642 { 643 if (f->gototab[state].inuse == 0) { 644 f->gototab[state].entries[0].ch = ch; 645 f->gototab[state].entries[0].state = val; 646 f->gototab[state].inuse++; 647 return val; 648 } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) { 649 // not seen yet, insert and return 650 gtt *tab = & f->gototab[state]; 651 if (tab->inuse + 1 >= tab->allocated) 652 resize_gototab(f, state); 653 654 f->gototab[state].entries[f->gototab[state].inuse].ch = ch; 655 f->gototab[state].entries[f->gototab[state].inuse].state = val; 656 f->gototab[state].inuse++; 657 return val; 658 } else { 659 // maybe we have it, maybe we don't 660 gtte key; 661 gtte *item; 662 663 key.ch = ch; 664 key.state = 0; /* irrelevant */ 665 item = (gtte *) bsearch(& key, f->gototab[state].entries, 666 f->gototab[state].inuse, sizeof(gtte), 667 entry_cmp); 668 669 if (item != NULL) { 670 // we have it, update state and return 671 item->state = val; 672 return item->state; 673 } 674 // otherwise, fall through to insert and reallocate. 675 } 676 677 gtt *tab = & f->gototab[state]; 678 if (tab->inuse + 1 >= tab->allocated) 679 resize_gototab(f, state); 680 f->gototab[state].entries[tab->inuse].ch = ch; 681 f->gototab[state].entries[tab->inuse].state = val; 682 ++tab->inuse; 683 684 qsort(f->gototab[state].entries, 685 f->gototab[state].inuse, sizeof(gtte), entry_cmp); 686 687 return val; /* not used anywhere at the moment */ 688 } 689 690 static void clear_gototab(fa *f, int state) 691 { 692 memset(f->gototab[state].entries, 0, 693 f->gototab[state].allocated * sizeof(gtte)); 694 f->gototab[state].inuse = 0; 695 } 696 697 int match(fa *f, const char *p0) /* shortest match ? */ 698 { 699 int s, ns; 700 int n; 701 int rune; 702 const uschar *p = (const uschar *) p0; 703 704 /* return pmatch(f, p0); does it matter whether longest or shortest? */ 705 706 s = f->initstat; 707 assert (s < f->state_count); 708 709 if (f->out[s]) 710 return(1); 711 do { 712 /* assert(*p < NCHARS); */ 713 n = u8_rune(&rune, (const char *) p); 714 if ((ns = get_gototab(f, s, rune)) != 0) 715 s = ns; 716 else 717 s = cgoto(f, s, rune); 718 if (f->out[s]) 719 return(1); 720 if (*p == 0) 721 break; 722 p += n; 723 } while (1); /* was *p++ != 0 */ 724 return(0); 725 } 726 727 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 728 { 729 int s, ns; 730 int n; 731 int rune; 732 const uschar *p = (const uschar *) p0; 733 const uschar *q; 734 735 s = f->initstat; 736 assert(s < f->state_count); 737 738 patbeg = (const char *)p; 739 patlen = -1; 740 do { 741 q = p; 742 do { 743 if (f->out[s]) /* final state */ 744 patlen = q-p; 745 /* assert(*q < NCHARS); */ 746 n = u8_rune(&rune, (const char *) q); 747 if ((ns = get_gototab(f, s, rune)) != 0) 748 s = ns; 749 else 750 s = cgoto(f, s, rune); 751 752 assert(s < f->state_count); 753 754 if (s == 1) { /* no transition */ 755 if (patlen >= 0) { 756 patbeg = (const char *) p; 757 return(1); 758 } 759 else 760 goto nextin; /* no match */ 761 } 762 if (*q == 0) 763 break; 764 q += n; 765 } while (1); 766 q++; /* was *q++ */ 767 if (f->out[s]) 768 patlen = q-p-1; /* don't count $ */ 769 if (patlen >= 0) { 770 patbeg = (const char *) p; 771 return(1); 772 } 773 nextin: 774 s = 2; 775 if (*p == 0) 776 break; 777 n = u8_rune(&rune, (const char *) p); 778 p += n; 779 } while (1); /* was *p++ */ 780 return (0); 781 } 782 783 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 784 { 785 int s, ns; 786 int n; 787 int rune; 788 const uschar *p = (const uschar *) p0; 789 const uschar *q; 790 791 s = f->initstat; 792 assert(s < f->state_count); 793 794 patbeg = (const char *)p; 795 patlen = -1; 796 while (*p) { 797 q = p; 798 do { 799 if (f->out[s]) /* final state */ 800 patlen = q-p; 801 /* assert(*q < NCHARS); */ 802 n = u8_rune(&rune, (const char *) q); 803 if ((ns = get_gototab(f, s, rune)) != 0) 804 s = ns; 805 else 806 s = cgoto(f, s, rune); 807 if (s == 1) { /* no transition */ 808 if (patlen > 0) { 809 patbeg = (const char *) p; 810 return(1); 811 } else 812 goto nnextin; /* no nonempty match */ 813 } 814 if (*q == 0) 815 break; 816 q += n; 817 } while (1); 818 q++; 819 if (f->out[s]) 820 patlen = q-p-1; /* don't count $ */ 821 if (patlen > 0 ) { 822 patbeg = (const char *) p; 823 return(1); 824 } 825 nnextin: 826 s = 2; 827 p++; 828 } 829 return (0); 830 } 831 832 833 /* 834 * NAME 835 * fnematch 836 * 837 * DESCRIPTION 838 * A stream-fed version of nematch which transfers characters to a 839 * null-terminated buffer. All characters up to and including the last 840 * character of the matching text or EOF are placed in the buffer. If 841 * a match is found, patbeg and patlen are set appropriately. 842 * 843 * RETURN VALUES 844 * false No match found. 845 * true Match found. 846 */ 847 848 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) 849 { 850 char *i, *j, *k, *buf = *pbuf; 851 int bufsize = *pbufsize; 852 int c, n, ns, s; 853 854 s = pfa->initstat; 855 patlen = 0; 856 857 /* 858 * buf <= i <= j <= k <= buf+bufsize 859 * 860 * i: origin of active substring 861 * j: current character 862 * k: destination of the next getc 863 */ 864 865 i = j = k = buf; 866 867 do { 868 /* 869 * Call u8_rune with at least awk_mb_cur_max ahead in 870 * the buffer until EOF interferes. 871 */ 872 if (k - j < awk_mb_cur_max) { 873 if (k + awk_mb_cur_max > buf + bufsize) { 874 char *obuf = buf; 875 adjbuf((char **) &buf, &bufsize, 876 bufsize + awk_mb_cur_max, 877 quantum, 0, "fnematch"); 878 879 /* buf resized, maybe moved. update pointers */ 880 *pbufsize = bufsize; 881 if (obuf != buf) { 882 i = buf + (i - obuf); 883 j = buf + (j - obuf); 884 k = buf + (k - obuf); 885 *pbuf = buf; 886 if (patlen) 887 patbeg = buf + (patbeg - obuf); 888 } 889 } 890 for (n = awk_mb_cur_max ; n > 0; n--) { 891 *k++ = (c = getc(f)) != EOF ? c : 0; 892 if (c == EOF) { 893 if (ferror(f)) 894 FATAL("fnematch: getc error"); 895 break; 896 } 897 } 898 } 899 900 j += u8_rune(&c, j); 901 902 if ((ns = get_gototab(pfa, s, c)) != 0) 903 s = ns; 904 else 905 s = cgoto(pfa, s, c); 906 907 if (pfa->out[s]) { /* final state */ 908 patbeg = i; 909 patlen = j - i; 910 if (c == 0) /* don't count $ */ 911 patlen--; 912 } 913 914 if (c && s != 1) 915 continue; /* origin i still viable, next j */ 916 if (patlen) 917 break; /* best match found */ 918 919 /* no match at origin i, next i and start over */ 920 i += u8_rune(&c, i); 921 if (c == 0) 922 break; /* no match */ 923 j = i; 924 s = 2; 925 } while (1); 926 927 if (patlen) { 928 /* 929 * Under no circumstances is the last character fed to 930 * the automaton part of the match. It is EOF's nullbyte, 931 * or it sent the automaton into a state with no further 932 * transitions available (s==1), or both. Room for a 933 * terminating nullbyte is guaranteed. 934 * 935 * ungetc any chars after the end of matching text 936 * (except for EOF's nullbyte, if present) and null 937 * terminate the buffer. 938 */ 939 do 940 if (*--k && ungetc(*k, f) == EOF) 941 FATAL("unable to ungetc '%c'", *k); 942 while (k > patbeg + patlen); 943 *k = '\0'; 944 return true; 945 } 946 else 947 return false; 948 } 949 950 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 951 { /* uses relex() to scan regular expression */ 952 Node *np; 953 954 DPRINTF("reparse <%s>\n", p); 955 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 956 rtok = relex(); 957 /* GNU compatibility: an empty regexp matches anything */ 958 if (rtok == '\0') { 959 /* FATAL("empty regular expression"); previous */ 960 return(op2(EMPTYRE, NIL, NIL)); 961 } 962 np = regexp(); 963 if (rtok != '\0') 964 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 965 return(np); 966 } 967 968 Node *regexp(void) /* top-level parse of reg expr */ 969 { 970 return (alt(concat(primary()))); 971 } 972 973 Node *primary(void) 974 { 975 Node *np; 976 int savelastatom; 977 978 switch (rtok) { 979 case CHAR: 980 lastatom = starttok; 981 np = op2(CHAR, NIL, itonp(rlxval)); 982 rtok = relex(); 983 return (unary(np)); 984 case ALL: 985 rtok = relex(); 986 return (unary(op2(ALL, NIL, NIL))); 987 case EMPTYRE: 988 rtok = relex(); 989 return (unary(op2(EMPTYRE, NIL, NIL))); 990 case DOT: 991 lastatom = starttok; 992 rtok = relex(); 993 return (unary(op2(DOT, NIL, NIL))); 994 case CCL: 995 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 996 lastatom = starttok; 997 rtok = relex(); 998 return (unary(np)); 999 case NCCL: 1000 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 1001 lastatom = starttok; 1002 rtok = relex(); 1003 return (unary(np)); 1004 case '^': 1005 rtok = relex(); 1006 return (unary(op2(CHAR, NIL, itonp(HAT)))); 1007 case '$': 1008 rtok = relex(); 1009 return (unary(op2(CHAR, NIL, NIL))); 1010 case '(': 1011 lastatom = starttok; 1012 savelastatom = starttok - basestr; /* Retain over recursion */ 1013 rtok = relex(); 1014 if (rtok == ')') { /* special pleading for () */ 1015 rtok = relex(); 1016 return unary(op2(CCL, NIL, (Node *) cclenter(""))); 1017 } 1018 np = regexp(); 1019 if (rtok == ')') { 1020 lastatom = basestr + savelastatom; /* Restore */ 1021 rtok = relex(); 1022 return (unary(np)); 1023 } 1024 else 1025 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 1026 default: 1027 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 1028 } 1029 return 0; /*NOTREACHED*/ 1030 } 1031 1032 Node *concat(Node *np) 1033 { 1034 switch (rtok) { 1035 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 1036 return (concat(op2(CAT, np, primary()))); 1037 case EMPTYRE: 1038 rtok = relex(); 1039 return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")), 1040 primary()))); 1041 } 1042 return (np); 1043 } 1044 1045 Node *alt(Node *np) 1046 { 1047 if (rtok == OR) { 1048 rtok = relex(); 1049 return (alt(op2(OR, np, concat(primary())))); 1050 } 1051 return (np); 1052 } 1053 1054 Node *unary(Node *np) 1055 { 1056 switch (rtok) { 1057 case STAR: 1058 rtok = relex(); 1059 return (unary(op2(STAR, np, NIL))); 1060 case PLUS: 1061 rtok = relex(); 1062 return (unary(op2(PLUS, np, NIL))); 1063 case QUEST: 1064 rtok = relex(); 1065 return (unary(op2(QUEST, np, NIL))); 1066 case ZERO: 1067 rtok = relex(); 1068 return (unary(op2(ZERO, np, NIL))); 1069 default: 1070 return (np); 1071 } 1072 } 1073 1074 /* 1075 * Character class definitions conformant to the POSIX locale as 1076 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 1077 * and operating character sets are both ASCII (ISO646) or supersets 1078 * thereof. 1079 * 1080 * Note that to avoid overflowing the temporary buffer used in 1081 * relex(), the expanded character class (prior to range expansion) 1082 * must be less than twice the size of their full name. 1083 */ 1084 1085 /* Because isblank doesn't show up in any of the header files on any 1086 * system i use, it's defined here. if some other locale has a richer 1087 * definition of "blank", define HAS_ISBLANK and provide your own 1088 * version. 1089 * the parentheses here are an attempt to find a path through the maze 1090 * of macro definition and/or function and/or version provided. thanks 1091 * to nelson beebe for the suggestion; let's see if it works everywhere. 1092 */ 1093 1094 /* #define HAS_ISBLANK */ 1095 #ifndef HAS_ISBLANK 1096 1097 int (xisblank)(int c) 1098 { 1099 return c==' ' || c=='\t'; 1100 } 1101 1102 #endif 1103 1104 static const struct charclass { 1105 const char *cc_name; 1106 int cc_namelen; 1107 int (*cc_func)(int); 1108 } charclasses[] = { 1109 { "alnum", 5, isalnum }, 1110 { "alpha", 5, isalpha }, 1111 #ifndef HAS_ISBLANK 1112 { "blank", 5, xisblank }, 1113 #else 1114 { "blank", 5, isblank }, 1115 #endif 1116 { "cntrl", 5, iscntrl }, 1117 { "digit", 5, isdigit }, 1118 { "graph", 5, isgraph }, 1119 { "lower", 5, islower }, 1120 { "print", 5, isprint }, 1121 { "punct", 5, ispunct }, 1122 { "space", 5, isspace }, 1123 { "upper", 5, isupper }, 1124 { "xdigit", 6, isxdigit }, 1125 { NULL, 0, NULL }, 1126 }; 1127 1128 #define REPEAT_SIMPLE 0 1129 #define REPEAT_PLUS_APPENDED 1 1130 #define REPEAT_WITH_Q 2 1131 #define REPEAT_ZERO 3 1132 1133 static int 1134 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1135 int atomlen, int firstnum, int secondnum, int special_case) 1136 { 1137 int i, j; 1138 uschar *buf = 0; 1139 int ret = 1; 1140 int init_q = (firstnum == 0); /* first added char will be ? */ 1141 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 1142 int prefix_length = reptok - basestr; /* prefix includes first rep */ 1143 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */ 1144 int size = prefix_length + suffix_length; 1145 1146 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 1147 size += atomlen*(firstnum-1); 1148 } 1149 1150 /* Adjust size of buffer for special cases */ 1151 if (special_case == REPEAT_PLUS_APPENDED) { 1152 size++; /* for the final + */ 1153 } else if (special_case == REPEAT_WITH_Q) { 1154 size += init_q + (atomlen+1)* (n_q_reps-init_q); 1155 } else if (special_case == REPEAT_ZERO) { 1156 size += 2; /* just a null ERE: () */ 1157 } 1158 if ((buf = (uschar *) malloc(size + 1)) == NULL) 1159 FATAL("out of space in reg expr %.10s..", lastre); 1160 memcpy(buf, basestr, prefix_length); /* copy prefix */ 1161 j = prefix_length; 1162 if (special_case == REPEAT_ZERO) { 1163 j -= atomlen; 1164 buf[j++] = '('; 1165 buf[j++] = ')'; 1166 } 1167 for (i = 1; i < firstnum; i++) { /* copy x reps */ 1168 memcpy(&buf[j], atom, atomlen); 1169 j += atomlen; 1170 } 1171 if (special_case == REPEAT_PLUS_APPENDED) { 1172 buf[j++] = '+'; 1173 } else if (special_case == REPEAT_WITH_Q) { 1174 if (init_q) 1175 buf[j++] = '?'; 1176 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */ 1177 memcpy(&buf[j], atom, atomlen); 1178 j += atomlen; 1179 buf[j++] = '?'; 1180 } 1181 } 1182 memcpy(&buf[j], reptok+reptoklen, suffix_length); 1183 j += suffix_length; 1184 buf[j] = '\0'; 1185 /* free old basestr */ 1186 if (firstbasestr != basestr) { 1187 if (basestr) 1188 xfree(basestr); 1189 } 1190 basestr = buf; 1191 prestr = buf + prefix_length; 1192 if (special_case == REPEAT_ZERO) { 1193 prestr -= atomlen; 1194 ret++; 1195 } 1196 return ret; 1197 } 1198 1199 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1200 int atomlen, int firstnum, int secondnum) 1201 { 1202 /* 1203 In general, the repetition specifier or "bound" is replaced here 1204 by an equivalent ERE string, repeating the immediately previous atom 1205 and appending ? and + as needed. Note that the first copy of the 1206 atom is left in place, except in the special_case of a zero-repeat 1207 (i.e., {0}). 1208 */ 1209 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 1210 if (firstnum < 2) { 1211 /* 0 or 1: should be handled before you get here */ 1212 FATAL("internal error"); 1213 } else { 1214 return replace_repeat(reptok, reptoklen, atom, atomlen, 1215 firstnum, secondnum, REPEAT_PLUS_APPENDED); 1216 } 1217 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 1218 if (firstnum == 0) { /* {0} or {0,0} */ 1219 /* This case is unusual because the resulting 1220 replacement string might actually be SMALLER than 1221 the original ERE */ 1222 return replace_repeat(reptok, reptoklen, atom, atomlen, 1223 firstnum, secondnum, REPEAT_ZERO); 1224 } else { /* (firstnum >= 1) */ 1225 return replace_repeat(reptok, reptoklen, atom, atomlen, 1226 firstnum, secondnum, REPEAT_SIMPLE); 1227 } 1228 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 1229 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 1230 return replace_repeat(reptok, reptoklen, atom, atomlen, 1231 firstnum, secondnum, REPEAT_WITH_Q); 1232 } else { /* Error - shouldn't be here (n>m) */ 1233 FATAL("internal error"); 1234 } 1235 return 0; 1236 } 1237 1238 int relex(void) /* lexical analyzer for reparse */ 1239 { 1240 int c, n; 1241 int cflag; 1242 static uschar *buf = NULL; 1243 static int bufsz = 100; 1244 uschar *bp; 1245 const struct charclass *cc; 1246 int i; 1247 int num, m; 1248 bool commafound, digitfound; 1249 const uschar *startreptok; 1250 static int parens = 0; 1251 1252 rescan: 1253 starttok = prestr; 1254 1255 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) { 1256 prestr += n; 1257 starttok = prestr; 1258 return CHAR; 1259 } 1260 1261 switch (c = *prestr++) { 1262 case '|': return OR; 1263 case '*': return STAR; 1264 case '+': return PLUS; 1265 case '?': return QUEST; 1266 case '.': return DOT; 1267 case '\0': prestr--; return '\0'; 1268 case '^': 1269 case '$': 1270 return c; 1271 case '(': 1272 parens++; 1273 return c; 1274 case ')': 1275 if (parens) { 1276 parens--; 1277 return c; 1278 } 1279 /* unmatched close parenthesis; per POSIX, treat as literal */ 1280 rlxval = c; 1281 return CHAR; 1282 case '\\': 1283 rlxval = quoted(&prestr); 1284 return CHAR; 1285 default: 1286 rlxval = c; 1287 return CHAR; 1288 case '[': 1289 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 1290 FATAL("out of space in reg expr %.10s..", lastre); 1291 bp = buf; 1292 if (*prestr == '^') { 1293 cflag = 1; 1294 prestr++; 1295 } 1296 else 1297 cflag = 0; 1298 n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */ 1299 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 1300 FATAL("out of space for reg expr %.10s...", lastre); 1301 for (; ; ) { 1302 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) { 1303 for (i = 0; i < n; i++) 1304 *bp++ = *prestr++; 1305 continue; 1306 } 1307 if ((c = *prestr++) == '\\') { 1308 *bp++ = '\\'; 1309 if ((c = *prestr++) == '\0') 1310 FATAL("nonterminated character class %.20s...", lastre); 1311 *bp++ = c; 1312 /* } else if (c == '\n') { */ 1313 /* FATAL("newline in character class %.20s...", lastre); */ 1314 } else if (c == '[' && *prestr == ':') { 1315 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1316 for (cc = charclasses; cc->cc_name; cc++) 1317 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1318 break; 1319 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1320 prestr[2 + cc->cc_namelen] == ']') { 1321 prestr += cc->cc_namelen + 3; 1322 /* 1323 * BUG: We begin at 1, instead of 0, since we 1324 * would otherwise prematurely terminate the 1325 * string for classes like [[:cntrl:]]. This 1326 * means that we can't match the NUL character, 1327 * not without first adapting the entire 1328 * program to track each string's length. 1329 */ 1330 for (i = 1; i <= UCHAR_MAX; i++) { 1331 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2")) 1332 FATAL("out of space for reg expr %.10s...", lastre); 1333 if (cc->cc_func(i)) { 1334 /* escape backslash */ 1335 if (i == '\\') { 1336 *bp++ = '\\'; 1337 n++; 1338 } 1339 1340 *bp++ = i; 1341 n++; 1342 } 1343 } 1344 } else 1345 *bp++ = c; 1346 } else if (c == '[' && *prestr == '.') { 1347 char collate_char; 1348 prestr++; 1349 collate_char = *prestr++; 1350 if (*prestr == '.' && prestr[1] == ']') { 1351 prestr += 2; 1352 /* Found it: map via locale TBD: for 1353 now, simply return this char. This 1354 is sufficient to pass conformance 1355 test awk.ex 156 1356 */ 1357 if (*prestr == ']') { 1358 prestr++; 1359 rlxval = collate_char; 1360 return CHAR; 1361 } 1362 } 1363 } else if (c == '[' && *prestr == '=') { 1364 char equiv_char; 1365 prestr++; 1366 equiv_char = *prestr++; 1367 if (*prestr == '=' && prestr[1] == ']') { 1368 prestr += 2; 1369 /* Found it: map via locale TBD: for now 1370 simply return this char. This is 1371 sufficient to pass conformance test 1372 awk.ex 156 1373 */ 1374 if (*prestr == ']') { 1375 prestr++; 1376 rlxval = equiv_char; 1377 return CHAR; 1378 } 1379 } 1380 } else if (c == '\0') { 1381 FATAL("nonterminated character class %.20s", lastre); 1382 } else if (bp == buf) { /* 1st char is special */ 1383 *bp++ = c; 1384 } else if (c == ']') { 1385 *bp++ = 0; 1386 rlxstr = (uschar *) tostring((char *) buf); 1387 if (cflag == 0) 1388 return CCL; 1389 else 1390 return NCCL; 1391 } else 1392 *bp++ = c; 1393 } 1394 break; 1395 case '{': 1396 if (isdigit((int) *(prestr))) { 1397 num = 0; /* Process as a repetition */ 1398 n = -1; m = -1; 1399 commafound = false; 1400 digitfound = false; 1401 startreptok = prestr-1; 1402 /* Remember start of previous atom here ? */ 1403 } else { /* just a { char, not a repetition */ 1404 rlxval = c; 1405 return CHAR; 1406 } 1407 for (; ; ) { 1408 if ((c = *prestr++) == '}') { 1409 if (commafound) { 1410 if (digitfound) { /* {n,m} */ 1411 m = num; 1412 if (m < n) 1413 FATAL("illegal repetition expression: class %.20s", 1414 lastre); 1415 if (n == 0 && m == 1) { 1416 return QUEST; 1417 } 1418 } else { /* {n,} */ 1419 if (n == 0) 1420 return STAR; 1421 else if (n == 1) 1422 return PLUS; 1423 } 1424 } else { 1425 if (digitfound) { /* {n} same as {n,n} */ 1426 n = num; 1427 m = num; 1428 } else { /* {} */ 1429 FATAL("illegal repetition expression: class %.20s", 1430 lastre); 1431 } 1432 } 1433 if (repeat(starttok, prestr-starttok, lastatom, 1434 startreptok - lastatom, n, m) > 0) { 1435 if (n == 0 && m == 0) { 1436 return ZERO; 1437 } 1438 /* must rescan input for next token */ 1439 goto rescan; 1440 } 1441 /* Failed to replace: eat up {...} characters 1442 and treat like just PLUS */ 1443 return PLUS; 1444 } else if (c == '\0') { 1445 FATAL("nonterminated character class %.20s", 1446 lastre); 1447 } else if (isdigit(c)) { 1448 num = 10 * num + c - '0'; 1449 digitfound = true; 1450 } else if (c == ',') { 1451 if (commafound) 1452 FATAL("illegal repetition expression: class %.20s", 1453 lastre); 1454 /* looking for {n,} or {n,m} */ 1455 commafound = true; 1456 n = num; 1457 digitfound = false; /* reset */ 1458 num = 0; 1459 } else { 1460 FATAL("illegal repetition expression: class %.20s", 1461 lastre); 1462 } 1463 } 1464 break; 1465 } 1466 } 1467 1468 int cgoto(fa *f, int s, int c) 1469 { 1470 int *p, *q; 1471 int i, j, k; 1472 1473 /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */ 1474 while (f->accept >= maxsetvec) { /* guessing here! */ 1475 resizesetvec(__func__); 1476 } 1477 for (i = 0; i <= f->accept; i++) 1478 setvec[i] = 0; 1479 setcnt = 0; 1480 resize_state(f, s); 1481 /* compute positions of gototab[s,c] into setvec */ 1482 p = f->posns[s]; 1483 for (i = 1; i <= *p; i++) { 1484 if ((k = f->re[p[i]].ltype) != FINAL) { 1485 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1486 || (k == DOT && c != 0 && c != HAT) 1487 || (k == ALL && c != 0) 1488 || (k == EMPTYRE && c != 0) 1489 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp)) 1490 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) { 1491 q = f->re[p[i]].lfollow; 1492 for (j = 1; j <= *q; j++) { 1493 if (q[j] >= maxsetvec) { 1494 resizesetvec(__func__); 1495 } 1496 if (setvec[q[j]] == 0) { 1497 setcnt++; 1498 setvec[q[j]] = 1; 1499 } 1500 } 1501 } 1502 } 1503 } 1504 /* determine if setvec is a previous state */ 1505 tmpset[0] = setcnt; 1506 j = 1; 1507 for (i = f->accept; i >= 0; i--) 1508 if (setvec[i]) { 1509 tmpset[j++] = i; 1510 } 1511 resize_state(f, f->curstat > s ? f->curstat : s); 1512 /* tmpset == previous state? */ 1513 for (i = 1; i <= f->curstat; i++) { 1514 p = f->posns[i]; 1515 if ((k = tmpset[0]) != p[0]) 1516 goto different; 1517 for (j = 1; j <= k; j++) 1518 if (tmpset[j] != p[j]) 1519 goto different; 1520 /* setvec is state i */ 1521 if (c != HAT) 1522 set_gototab(f, s, c, i); 1523 return i; 1524 different:; 1525 } 1526 1527 /* add tmpset to current set of states */ 1528 ++(f->curstat); 1529 resize_state(f, f->curstat); 1530 clear_gototab(f, f->curstat); 1531 xfree(f->posns[f->curstat]); 1532 p = intalloc(setcnt + 1, __func__); 1533 1534 f->posns[f->curstat] = p; 1535 if (c != HAT) 1536 set_gototab(f, s, c, f->curstat); 1537 for (i = 0; i <= setcnt; i++) 1538 p[i] = tmpset[i]; 1539 if (setvec[f->accept]) 1540 f->out[f->curstat] = 1; 1541 else 1542 f->out[f->curstat] = 0; 1543 return f->curstat; 1544 } 1545 1546 1547 void freefa(fa *f) /* free a finite automaton */ 1548 { 1549 int i; 1550 1551 if (f == NULL) 1552 return; 1553 for (i = 0; i < f->state_count; i++) 1554 xfree(f->gototab[i].entries); 1555 xfree(f->gototab); 1556 for (i = 0; i <= f->curstat; i++) 1557 xfree(f->posns[i]); 1558 for (i = 0; i <= f->accept; i++) { 1559 xfree(f->re[i].lfollow); 1560 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1561 xfree(f->re[i].lval.np); 1562 } 1563 xfree(f->restr); 1564 xfree(f->out); 1565 xfree(f->posns); 1566 xfree(f->gototab); 1567 xfree(f); 1568 } 1569