1 /**************************************************************** 2 Copyright (C) Lucent Technologies 1997 3 All Rights Reserved 4 5 Permission to use, copy, modify, and distribute this software and 6 its documentation for any purpose and without fee is hereby 7 granted, provided that the above copyright notice appear in all 8 copies and that both that the copyright notice and this 9 permission notice and warranty disclaimer appear in supporting 10 documentation, and that the name Lucent Technologies or any of 11 its entities not be used in advertising or publicity pertaining 12 to distribution of the software without specific, written prior 13 permission. 14 15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22 THIS SOFTWARE. 23 ****************************************************************/ 24 25 /* lasciate ogne speranza, voi ch'intrate. */ 26 27 #define DEBUG 28 29 #include <ctype.h> 30 #include <limits.h> 31 #include <stdio.h> 32 #include <string.h> 33 #include <stdlib.h> 34 #include "awk.h" 35 #include "awkgram.tab.h" 36 37 #define MAXLIN 22 38 39 #define type(v) (v)->nobj /* badly overloaded here */ 40 #define info(v) (v)->ntype /* badly overloaded here */ 41 #define left(v) (v)->narg[0] 42 #define right(v) (v)->narg[1] 43 #define parent(v) (v)->nnext 44 45 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 46 #define ELEAF case EMPTYRE: /* empty string in regexp */ 47 #define UNARY case STAR: case PLUS: case QUEST: 48 49 /* encoding in tree Nodes: 50 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): 51 left is index, right contains value or pointer to value 52 unary (STAR, PLUS, QUEST): left is child, right is null 53 binary (CAT, OR): left and right are children 54 parent contains pointer to parent 55 */ 56 57 58 int *setvec; 59 int *tmpset; 60 int maxsetvec = 0; 61 62 int rtok; /* next token in current re */ 63 int rlxval; 64 static const uschar *rlxstr; 65 static const uschar *prestr; /* current position in current re */ 66 static const uschar *lastre; /* origin of last re */ 67 static const uschar *lastatom; /* origin of last Atom */ 68 static const uschar *starttok; 69 static const uschar *basestr; /* starts with original, replaced during 70 repetition processing */ 71 static const uschar *firstbasestr; 72 73 static int setcnt; 74 static int poscnt; 75 76 const char *patbeg; 77 int patlen; 78 79 #define NFA 128 /* cache this many dynamic fa's */ 80 fa *fatab[NFA]; 81 int nfatab = 0; /* entries in fatab */ 82 83 extern int u8_nextlen(const char *s); 84 85 86 /* utf-8 mechanism: 87 88 For most of Awk, utf-8 strings just "work", since they look like 89 null-terminated sequences of 8-bit bytes. 90 91 Functions like length(), index(), and substr() have to operate 92 in units of utf-8 characters. The u8_* functions in run.c 93 handle this. 94 95 Regular expressions are more complicated, since the basic 96 mechanism of the goto table used 8-bit byte indices into the 97 gototab entries to compute the next state. Unicode is a lot 98 bigger, so the gototab entries are now structs with a character 99 and a next state. These are sorted by code point and binary 100 searched. 101 102 Throughout the RE mechanism in b.c, utf-8 characters are 103 converted to their utf-32 value. This mostly shows up in 104 cclenter, which expands character class ranges like a-z and now 105 alpha-omega. The size of a gototab array is still about 256. 106 This should be dynamic, but for now things work ok for a single 107 code page of Unicode, which is the most likely case. 108 109 The code changes are localized in run.c and b.c. I have added a 110 handful of functions to somewhat better hide the implementation, 111 but a lot more could be done. 112 113 */ 114 115 static int entry_cmp(const void *l, const void *r); 116 static int get_gototab(fa*, int, int); 117 static int set_gototab(fa*, int, int, int); 118 static void clear_gototab(fa*, int); 119 extern int u8_rune(int *, const char *); 120 121 static int * 122 intalloc(size_t n, const char *f) 123 { 124 int *p = (int *) calloc(n, sizeof(int)); 125 if (p == NULL) 126 overflo(f); 127 return p; 128 } 129 130 static void 131 resizesetvec(const char *f) 132 { 133 if (maxsetvec == 0) 134 maxsetvec = MAXLIN; 135 else 136 maxsetvec *= 4; 137 setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec)); 138 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset)); 139 if (setvec == NULL || tmpset == NULL) 140 overflo(f); 141 } 142 143 static void 144 resize_state(fa *f, int state) 145 { 146 gtt *p; 147 uschar *p2; 148 int **p3; 149 int i, new_count; 150 151 if (++state < f->state_count) 152 return; 153 154 new_count = state + 10; /* needs to be tuned */ 155 156 p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt)); 157 if (p == NULL) 158 goto out; 159 f->gototab = p; 160 161 p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0])); 162 if (p2 == NULL) 163 goto out; 164 f->out = p2; 165 166 p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0])); 167 if (p3 == NULL) 168 goto out; 169 f->posns = p3; 170 171 for (i = f->state_count; i < new_count; ++i) { 172 f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte)); 173 if (f->gototab[i].entries == NULL) 174 goto out; 175 f->gototab[i].allocated = NCHARS; 176 f->gototab[i].inuse = 0; 177 f->out[i] = 0; 178 f->posns[i] = NULL; 179 } 180 f->state_count = new_count; 181 return; 182 out: 183 overflo(__func__); 184 } 185 186 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */ 187 { 188 int i, use, nuse; 189 fa *pfa; 190 static int now = 1; 191 192 if (setvec == NULL) { /* first time through any RE */ 193 resizesetvec(__func__); 194 } 195 196 if (compile_time != RUNNING) /* a constant for sure */ 197 return mkdfa(s, anchor); 198 for (i = 0; i < nfatab; i++) /* is it there already? */ 199 if (fatab[i]->anchor == anchor 200 && strcmp((const char *) fatab[i]->restr, s) == 0) { 201 fatab[i]->use = now++; 202 return fatab[i]; 203 } 204 pfa = mkdfa(s, anchor); 205 if (nfatab < NFA) { /* room for another */ 206 fatab[nfatab] = pfa; 207 fatab[nfatab]->use = now++; 208 nfatab++; 209 return pfa; 210 } 211 use = fatab[0]->use; /* replace least-recently used */ 212 nuse = 0; 213 for (i = 1; i < nfatab; i++) 214 if (fatab[i]->use < use) { 215 use = fatab[i]->use; 216 nuse = i; 217 } 218 freefa(fatab[nuse]); 219 fatab[nuse] = pfa; 220 pfa->use = now++; 221 return pfa; 222 } 223 224 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */ 225 /* anchor = true for anchored matches, else false */ 226 { 227 Node *p, *p1; 228 fa *f; 229 230 firstbasestr = (const uschar *) s; 231 basestr = firstbasestr; 232 p = reparse(s); 233 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 234 /* put ALL STAR in front of reg. exp. */ 235 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 236 /* put FINAL after reg. exp. */ 237 238 poscnt = 0; 239 penter(p1); /* enter parent pointers and leaf indices */ 240 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL) 241 overflo(__func__); 242 f->accept = poscnt-1; /* penter has computed number of positions in re */ 243 cfoll(f, p1); /* set up follow sets */ 244 freetr(p1); 245 resize_state(f, 1); 246 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__); 247 f->posns[1] = intalloc(1, __func__); 248 *f->posns[1] = 0; 249 f->initstat = makeinit(f, anchor); 250 f->anchor = anchor; 251 f->restr = (uschar *) tostring(s); 252 if (firstbasestr != basestr) { 253 if (basestr) 254 xfree(basestr); 255 } 256 return f; 257 } 258 259 int makeinit(fa *f, bool anchor) 260 { 261 int i, k; 262 263 f->curstat = 2; 264 f->out[2] = 0; 265 k = *(f->re[0].lfollow); 266 xfree(f->posns[2]); 267 f->posns[2] = intalloc(k + 1, __func__); 268 for (i = 0; i <= k; i++) { 269 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 270 } 271 if ((f->posns[2])[1] == f->accept) 272 f->out[2] = 1; 273 clear_gototab(f, 2); 274 f->curstat = cgoto(f, 2, HAT); 275 if (anchor) { 276 *f->posns[2] = k-1; /* leave out position 0 */ 277 for (i = 0; i < k; i++) { 278 (f->posns[0])[i] = (f->posns[2])[i]; 279 } 280 281 f->out[0] = f->out[2]; 282 if (f->curstat != 2) 283 --(*f->posns[f->curstat]); 284 } 285 return f->curstat; 286 } 287 288 void penter(Node *p) /* set up parent pointers and leaf indices */ 289 { 290 switch (type(p)) { 291 ELEAF 292 LEAF 293 info(p) = poscnt; 294 poscnt++; 295 break; 296 UNARY 297 penter(left(p)); 298 parent(left(p)) = p; 299 break; 300 case CAT: 301 case OR: 302 penter(left(p)); 303 penter(right(p)); 304 parent(left(p)) = p; 305 parent(right(p)) = p; 306 break; 307 case ZERO: 308 break; 309 default: /* can't happen */ 310 FATAL("can't happen: unknown type %d in penter", type(p)); 311 break; 312 } 313 } 314 315 void freetr(Node *p) /* free parse tree */ 316 { 317 switch (type(p)) { 318 ELEAF 319 LEAF 320 xfree(p); 321 break; 322 UNARY 323 case ZERO: 324 freetr(left(p)); 325 xfree(p); 326 break; 327 case CAT: 328 case OR: 329 freetr(left(p)); 330 freetr(right(p)); 331 xfree(p); 332 break; 333 default: /* can't happen */ 334 FATAL("can't happen: unknown type %d in freetr", type(p)); 335 break; 336 } 337 } 338 339 /* in the parsing of regular expressions, metacharacters like . have */ 340 /* to be seen literally; \056 is not a metacharacter. */ 341 342 int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */ 343 { /* only pick up one 8-bit byte (2 chars) */ 344 const uschar *p; 345 int n = 0; 346 int i; 347 348 for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) { 349 if (isdigit((int) *p)) 350 n = 16 * n + *p - '0'; 351 else if (*p >= 'a' && *p <= 'f') 352 n = 16 * n + *p - 'a' + 10; 353 else if (*p >= 'A' && *p <= 'F') 354 n = 16 * n + *p - 'A' + 10; 355 } 356 *pp = p; 357 return n; 358 } 359 360 361 362 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ 363 364 int quoted(const uschar **pp) /* pick up next thing after a \\ */ 365 /* and increment *pp */ 366 { 367 const uschar *p = *pp; 368 int c; 369 370 /* BUG: should advance by utf-8 char even if makes no sense */ 371 372 if ((c = *p++) == 't') { 373 c = '\t'; 374 } else if (c == 'n') { 375 c = '\n'; 376 } else if (c == 'f') { 377 c = '\f'; 378 } else if (c == 'r') { 379 c = '\r'; 380 } else if (c == 'b') { 381 c = '\b'; 382 } else if (c == 'v') { 383 c = '\v'; 384 } else if (c == 'a') { 385 c = '\a'; 386 } else if (c == '\\') { 387 c = '\\'; 388 } else if (c == 'x') { /* 2 hex digits follow */ 389 c = hexstr(&p, 2); /* this adds a null if number is invalid */ 390 } else if (c == 'u') { /* unicode char number up to 8 hex digits */ 391 c = hexstr(&p, 8); 392 } else if (isoctdigit(c)) { /* \d \dd \ddd */ 393 int n = c - '0'; 394 if (isoctdigit(*p)) { 395 n = 8 * n + *p++ - '0'; 396 if (isoctdigit(*p)) 397 n = 8 * n + *p++ - '0'; 398 } 399 c = n; 400 } /* else */ 401 /* c = c; */ 402 *pp = p; 403 return c; 404 } 405 406 int *cclenter(const char *argp) /* add a character class */ 407 { 408 int i, c, c2; 409 int n; 410 const uschar *p = (const uschar *) argp; 411 int *bp, *retp; 412 static int *buf = NULL; 413 static int bufsz = 100; 414 415 if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL) 416 FATAL("out of space for character class [%.10s...] 1", p); 417 bp = buf; 418 for (i = 0; *p != 0; ) { 419 n = u8_rune(&c, (const char *) p); 420 p += n; 421 if (c == '\\') { 422 c = quoted(&p); 423 } else if (c == '-' && i > 0 && bp[-1] != 0) { 424 if (*p != 0) { 425 c = bp[-1]; 426 /* c2 = *p++; */ 427 n = u8_rune(&c2, (const char *) p); 428 p += n; 429 if (c2 == '\\') 430 c2 = quoted(&p); /* BUG: sets p, has to be u8 size */ 431 if (c > c2) { /* empty; ignore */ 432 bp--; 433 i--; 434 continue; 435 } 436 while (c < c2) { 437 if (i >= bufsz) { 438 bufsz *= 2; 439 buf = (int *) realloc(buf, bufsz * sizeof(int)); 440 if (buf == NULL) 441 FATAL("out of space for character class [%.10s...] 2", p); 442 bp = buf + i; 443 } 444 *bp++ = ++c; 445 i++; 446 } 447 continue; 448 } 449 } 450 if (i >= bufsz) { 451 bufsz *= 2; 452 buf = (int *) realloc(buf, bufsz * sizeof(int)); 453 if (buf == NULL) 454 FATAL("out of space for character class [%.10s...] 2", p); 455 bp = buf + i; 456 } 457 *bp++ = c; 458 i++; 459 } 460 *bp = 0; 461 /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */ 462 /* xfree(op); BUG: what are we freeing here? */ 463 retp = (int *) calloc(bp-buf+1, sizeof(int)); 464 for (i = 0; i < bp-buf+1; i++) 465 retp[i] = buf[i]; 466 return retp; 467 } 468 469 void overflo(const char *s) 470 { 471 FATAL("regular expression too big: out of space in %.30s...", s); 472 } 473 474 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 475 { 476 int i; 477 int *p; 478 479 switch (type(v)) { 480 ELEAF 481 LEAF 482 f->re[info(v)].ltype = type(v); 483 f->re[info(v)].lval.np = right(v); 484 while (f->accept >= maxsetvec) { /* guessing here! */ 485 resizesetvec(__func__); 486 } 487 for (i = 0; i <= f->accept; i++) 488 setvec[i] = 0; 489 setcnt = 0; 490 follow(v); /* computes setvec and setcnt */ 491 p = intalloc(setcnt + 1, __func__); 492 f->re[info(v)].lfollow = p; 493 *p = setcnt; 494 for (i = f->accept; i >= 0; i--) 495 if (setvec[i] == 1) 496 *++p = i; 497 break; 498 UNARY 499 cfoll(f,left(v)); 500 break; 501 case CAT: 502 case OR: 503 cfoll(f,left(v)); 504 cfoll(f,right(v)); 505 break; 506 case ZERO: 507 break; 508 default: /* can't happen */ 509 FATAL("can't happen: unknown type %d in cfoll", type(v)); 510 } 511 } 512 513 int first(Node *p) /* collects initially active leaves of p into setvec */ 514 /* returns 0 if p matches empty string */ 515 { 516 int b, lp; 517 518 switch (type(p)) { 519 ELEAF 520 LEAF 521 lp = info(p); /* look for high-water mark of subscripts */ 522 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ 523 resizesetvec(__func__); 524 } 525 if (type(p) == EMPTYRE) { 526 setvec[lp] = 0; 527 return(0); 528 } 529 if (setvec[lp] != 1) { 530 setvec[lp] = 1; 531 setcnt++; 532 } 533 if (type(p) == CCL && (*(int *) right(p)) == 0) 534 return(0); /* empty CCL */ 535 return(1); 536 case PLUS: 537 if (first(left(p)) == 0) 538 return(0); 539 return(1); 540 case STAR: 541 case QUEST: 542 first(left(p)); 543 return(0); 544 case CAT: 545 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 546 return(1); 547 case OR: 548 b = first(right(p)); 549 if (first(left(p)) == 0 || b == 0) return(0); 550 return(1); 551 case ZERO: 552 return 0; 553 } 554 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ 555 return(-1); 556 } 557 558 void follow(Node *v) /* collects leaves that can follow v into setvec */ 559 { 560 Node *p; 561 562 if (type(v) == FINAL) 563 return; 564 p = parent(v); 565 switch (type(p)) { 566 case STAR: 567 case PLUS: 568 first(v); 569 follow(p); 570 return; 571 572 case OR: 573 case QUEST: 574 follow(p); 575 return; 576 577 case CAT: 578 if (v == left(p)) { /* v is left child of p */ 579 if (first(right(p)) == 0) { 580 follow(p); 581 return; 582 } 583 } else /* v is right child */ 584 follow(p); 585 return; 586 } 587 } 588 589 int member(int c, int *sarg) /* is c in s? */ 590 { 591 int *s = (int *) sarg; 592 593 while (*s) 594 if (c == *s++) 595 return(1); 596 return(0); 597 } 598 599 static void resize_gototab(fa *f, int state) 600 { 601 size_t new_size = f->gototab[state].allocated * 2; 602 gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte)); 603 if (p == NULL) 604 overflo(__func__); 605 606 // need to initialized the new memory to zero 607 size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size 608 memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out 609 610 f->gototab[state].allocated = new_size; // update gototab info 611 f->gototab[state].entries = p; 612 } 613 614 static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */ 615 { 616 gtte key; 617 gtte *item; 618 619 key.ch = ch; 620 key.state = 0; /* irrelevant */ 621 item = (gtte *) bsearch(& key, f->gototab[state].entries, 622 f->gototab[state].inuse, sizeof(gtte), 623 entry_cmp); 624 625 if (item == NULL) 626 return 0; 627 else 628 return item->state; 629 } 630 631 static int entry_cmp(const void *l, const void *r) 632 { 633 const gtte *left, *right; 634 635 left = (const gtte *) l; 636 right = (const gtte *) r; 637 638 return left->ch - right->ch; 639 } 640 641 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */ 642 { 643 if (f->gototab[state].inuse == 0) { 644 f->gototab[state].entries[0].ch = ch; 645 f->gototab[state].entries[0].state = val; 646 f->gototab[state].inuse++; 647 return val; 648 } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) { 649 // not seen yet, insert and return 650 gtt *tab = & f->gototab[state]; 651 if (tab->inuse + 1 >= tab->allocated) 652 resize_gototab(f, state); 653 654 f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch; 655 f->gototab[state].entries[f->gototab[state].inuse-1].state = val; 656 f->gototab[state].inuse++; 657 return val; 658 } else { 659 // maybe we have it, maybe we don't 660 gtte key; 661 gtte *item; 662 663 key.ch = ch; 664 key.state = 0; /* irrelevant */ 665 item = (gtte *) bsearch(& key, f->gototab[state].entries, 666 f->gototab[state].inuse, sizeof(gtte), 667 entry_cmp); 668 669 if (item != NULL) { 670 // we have it, update state and return 671 item->state = val; 672 return item->state; 673 } 674 // otherwise, fall through to insert and reallocate. 675 } 676 677 gtt *tab = & f->gototab[state]; 678 if (tab->inuse + 1 >= tab->allocated) 679 resize_gototab(f, state); 680 ++tab->inuse; 681 f->gototab[state].entries[tab->inuse].ch = ch; 682 f->gototab[state].entries[tab->inuse].state = val; 683 684 qsort(f->gototab[state].entries, 685 f->gototab[state].inuse, sizeof(gtte), entry_cmp); 686 687 return val; /* not used anywhere at the moment */ 688 } 689 690 static void clear_gototab(fa *f, int state) 691 { 692 memset(f->gototab[state].entries, 0, 693 f->gototab[state].allocated * sizeof(gtte)); 694 f->gototab[state].inuse = 0; 695 } 696 697 int match(fa *f, const char *p0) /* shortest match ? */ 698 { 699 int s, ns; 700 int n; 701 int rune; 702 const uschar *p = (const uschar *) p0; 703 704 /* return pmatch(f, p0); does it matter whether longest or shortest? */ 705 706 s = f->initstat; 707 assert (s < f->state_count); 708 709 if (f->out[s]) 710 return(1); 711 do { 712 /* assert(*p < NCHARS); */ 713 n = u8_rune(&rune, (const char *) p); 714 if ((ns = get_gototab(f, s, rune)) != 0) 715 s = ns; 716 else 717 s = cgoto(f, s, rune); 718 if (f->out[s]) 719 return(1); 720 if (*p == 0) 721 break; 722 p += n; 723 } while (1); /* was *p++ != 0 */ 724 return(0); 725 } 726 727 int pmatch(fa *f, const char *p0) /* longest match, for sub */ 728 { 729 int s, ns; 730 int n; 731 int rune; 732 const uschar *p = (const uschar *) p0; 733 const uschar *q; 734 735 s = f->initstat; 736 assert(s < f->state_count); 737 738 patbeg = (const char *)p; 739 patlen = -1; 740 do { 741 q = p; 742 do { 743 if (f->out[s]) /* final state */ 744 patlen = q-p; 745 /* assert(*q < NCHARS); */ 746 n = u8_rune(&rune, (const char *) q); 747 if ((ns = get_gototab(f, s, rune)) != 0) 748 s = ns; 749 else 750 s = cgoto(f, s, rune); 751 752 assert(s < f->state_count); 753 754 if (s == 1) { /* no transition */ 755 if (patlen >= 0) { 756 patbeg = (const char *) p; 757 return(1); 758 } 759 else 760 goto nextin; /* no match */ 761 } 762 if (*q == 0) 763 break; 764 q += n; 765 } while (1); 766 q++; /* was *q++ */ 767 if (f->out[s]) 768 patlen = q-p-1; /* don't count $ */ 769 if (patlen >= 0) { 770 patbeg = (const char *) p; 771 return(1); 772 } 773 nextin: 774 s = 2; 775 if (*p == 0) 776 break; 777 n = u8_rune(&rune, (const char *) p); 778 p += n; 779 } while (1); /* was *p++ */ 780 return (0); 781 } 782 783 int nematch(fa *f, const char *p0) /* non-empty match, for sub */ 784 { 785 int s, ns; 786 int n; 787 int rune; 788 const uschar *p = (const uschar *) p0; 789 const uschar *q; 790 791 s = f->initstat; 792 assert(s < f->state_count); 793 794 patbeg = (const char *)p; 795 patlen = -1; 796 while (*p) { 797 q = p; 798 do { 799 if (f->out[s]) /* final state */ 800 patlen = q-p; 801 /* assert(*q < NCHARS); */ 802 n = u8_rune(&rune, (const char *) q); 803 if ((ns = get_gototab(f, s, rune)) != 0) 804 s = ns; 805 else 806 s = cgoto(f, s, rune); 807 if (s == 1) { /* no transition */ 808 if (patlen > 0) { 809 patbeg = (const char *) p; 810 return(1); 811 } else 812 goto nnextin; /* no nonempty match */ 813 } 814 if (*q == 0) 815 break; 816 q += n; 817 } while (1); 818 q++; 819 if (f->out[s]) 820 patlen = q-p-1; /* don't count $ */ 821 if (patlen > 0 ) { 822 patbeg = (const char *) p; 823 return(1); 824 } 825 nnextin: 826 s = 2; 827 p++; 828 } 829 return (0); 830 } 831 832 833 #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long 834 835 /* 836 * NAME 837 * fnematch 838 * 839 * DESCRIPTION 840 * A stream-fed version of nematch which transfers characters to a 841 * null-terminated buffer. All characters up to and including the last 842 * character of the matching text or EOF are placed in the buffer. If 843 * a match is found, patbeg and patlen are set appropriately. 844 * 845 * RETURN VALUES 846 * false No match found. 847 * true Match found. 848 */ 849 850 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) 851 { 852 char *i, *j, *k, *buf = *pbuf; 853 int bufsize = *pbufsize; 854 int c, n, ns, s; 855 856 s = pfa->initstat; 857 patlen = 0; 858 859 /* 860 * buf <= i <= j <= k <= buf+bufsize 861 * 862 * i: origin of active substring 863 * j: current character 864 * k: destination of the next getc 865 */ 866 867 i = j = k = buf; 868 869 do { 870 /* 871 * Call u8_rune with at least MAX_UTF_BYTES ahead in 872 * the buffer until EOF interferes. 873 */ 874 if (k - j < MAX_UTF_BYTES) { 875 if (k + MAX_UTF_BYTES > buf + bufsize) { 876 adjbuf((char **) &buf, &bufsize, 877 bufsize + MAX_UTF_BYTES, 878 quantum, 0, "fnematch"); 879 } 880 for (n = MAX_UTF_BYTES ; n > 0; n--) { 881 *k++ = (c = getc(f)) != EOF ? c : 0; 882 if (c == EOF) { 883 if (ferror(f)) 884 FATAL("fnematch: getc error"); 885 break; 886 } 887 } 888 } 889 890 j += u8_rune(&c, j); 891 892 if ((ns = get_gototab(pfa, s, c)) != 0) 893 s = ns; 894 else 895 s = cgoto(pfa, s, c); 896 897 if (pfa->out[s]) { /* final state */ 898 patbeg = i; 899 patlen = j - i; 900 if (c == 0) /* don't count $ */ 901 patlen--; 902 } 903 904 if (c && s != 1) 905 continue; /* origin i still viable, next j */ 906 if (patlen) 907 break; /* best match found */ 908 909 /* no match at origin i, next i and start over */ 910 i += u8_rune(&c, i); 911 if (c == 0) 912 break; /* no match */ 913 j = i; 914 s = 2; 915 } while (1); 916 917 /* adjbuf() may have relocated a resized buffer. Inform the world. */ 918 *pbuf = buf; 919 *pbufsize = bufsize; 920 921 if (patlen) { 922 /* 923 * Under no circumstances is the last character fed to 924 * the automaton part of the match. It is EOF's nullbyte, 925 * or it sent the automaton into a state with no further 926 * transitions available (s==1), or both. Room for a 927 * terminating nullbyte is guaranteed. 928 * 929 * ungetc any chars after the end of matching text 930 * (except for EOF's nullbyte, if present) and null 931 * terminate the buffer. 932 */ 933 do 934 if (*--k && ungetc(*k, f) == EOF) 935 FATAL("unable to ungetc '%c'", *k); 936 while (k > patbeg + patlen); 937 *k = '\0'; 938 return true; 939 } 940 else 941 return false; 942 } 943 944 Node *reparse(const char *p) /* parses regular expression pointed to by p */ 945 { /* uses relex() to scan regular expression */ 946 Node *np; 947 948 DPRINTF("reparse <%s>\n", p); 949 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */ 950 rtok = relex(); 951 /* GNU compatibility: an empty regexp matches anything */ 952 if (rtok == '\0') { 953 /* FATAL("empty regular expression"); previous */ 954 return(op2(EMPTYRE, NIL, NIL)); 955 } 956 np = regexp(); 957 if (rtok != '\0') 958 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 959 return(np); 960 } 961 962 Node *regexp(void) /* top-level parse of reg expr */ 963 { 964 return (alt(concat(primary()))); 965 } 966 967 Node *primary(void) 968 { 969 Node *np; 970 int savelastatom; 971 972 switch (rtok) { 973 case CHAR: 974 lastatom = starttok; 975 np = op2(CHAR, NIL, itonp(rlxval)); 976 rtok = relex(); 977 return (unary(np)); 978 case ALL: 979 rtok = relex(); 980 return (unary(op2(ALL, NIL, NIL))); 981 case EMPTYRE: 982 rtok = relex(); 983 return (unary(op2(EMPTYRE, NIL, NIL))); 984 case DOT: 985 lastatom = starttok; 986 rtok = relex(); 987 return (unary(op2(DOT, NIL, NIL))); 988 case CCL: 989 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr)); 990 lastatom = starttok; 991 rtok = relex(); 992 return (unary(np)); 993 case NCCL: 994 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr)); 995 lastatom = starttok; 996 rtok = relex(); 997 return (unary(np)); 998 case '^': 999 rtok = relex(); 1000 return (unary(op2(CHAR, NIL, itonp(HAT)))); 1001 case '$': 1002 rtok = relex(); 1003 return (unary(op2(CHAR, NIL, NIL))); 1004 case '(': 1005 lastatom = starttok; 1006 savelastatom = starttok - basestr; /* Retain over recursion */ 1007 rtok = relex(); 1008 if (rtok == ')') { /* special pleading for () */ 1009 rtok = relex(); 1010 return unary(op2(CCL, NIL, (Node *) cclenter(""))); 1011 } 1012 np = regexp(); 1013 if (rtok == ')') { 1014 lastatom = basestr + savelastatom; /* Restore */ 1015 rtok = relex(); 1016 return (unary(np)); 1017 } 1018 else 1019 FATAL("syntax error in regular expression %s at %s", lastre, prestr); 1020 default: 1021 FATAL("illegal primary in regular expression %s at %s", lastre, prestr); 1022 } 1023 return 0; /*NOTREACHED*/ 1024 } 1025 1026 Node *concat(Node *np) 1027 { 1028 switch (rtok) { 1029 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 1030 return (concat(op2(CAT, np, primary()))); 1031 case EMPTYRE: 1032 rtok = relex(); 1033 return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")), 1034 primary()))); 1035 } 1036 return (np); 1037 } 1038 1039 Node *alt(Node *np) 1040 { 1041 if (rtok == OR) { 1042 rtok = relex(); 1043 return (alt(op2(OR, np, concat(primary())))); 1044 } 1045 return (np); 1046 } 1047 1048 Node *unary(Node *np) 1049 { 1050 switch (rtok) { 1051 case STAR: 1052 rtok = relex(); 1053 return (unary(op2(STAR, np, NIL))); 1054 case PLUS: 1055 rtok = relex(); 1056 return (unary(op2(PLUS, np, NIL))); 1057 case QUEST: 1058 rtok = relex(); 1059 return (unary(op2(QUEST, np, NIL))); 1060 case ZERO: 1061 rtok = relex(); 1062 return (unary(op2(ZERO, np, NIL))); 1063 default: 1064 return (np); 1065 } 1066 } 1067 1068 /* 1069 * Character class definitions conformant to the POSIX locale as 1070 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source 1071 * and operating character sets are both ASCII (ISO646) or supersets 1072 * thereof. 1073 * 1074 * Note that to avoid overflowing the temporary buffer used in 1075 * relex(), the expanded character class (prior to range expansion) 1076 * must be less than twice the size of their full name. 1077 */ 1078 1079 /* Because isblank doesn't show up in any of the header files on any 1080 * system i use, it's defined here. if some other locale has a richer 1081 * definition of "blank", define HAS_ISBLANK and provide your own 1082 * version. 1083 * the parentheses here are an attempt to find a path through the maze 1084 * of macro definition and/or function and/or version provided. thanks 1085 * to nelson beebe for the suggestion; let's see if it works everywhere. 1086 */ 1087 1088 /* #define HAS_ISBLANK */ 1089 #ifndef HAS_ISBLANK 1090 1091 int (xisblank)(int c) 1092 { 1093 return c==' ' || c=='\t'; 1094 } 1095 1096 #endif 1097 1098 static const struct charclass { 1099 const char *cc_name; 1100 int cc_namelen; 1101 int (*cc_func)(int); 1102 } charclasses[] = { 1103 { "alnum", 5, isalnum }, 1104 { "alpha", 5, isalpha }, 1105 #ifndef HAS_ISBLANK 1106 { "blank", 5, xisblank }, 1107 #else 1108 { "blank", 5, isblank }, 1109 #endif 1110 { "cntrl", 5, iscntrl }, 1111 { "digit", 5, isdigit }, 1112 { "graph", 5, isgraph }, 1113 { "lower", 5, islower }, 1114 { "print", 5, isprint }, 1115 { "punct", 5, ispunct }, 1116 { "space", 5, isspace }, 1117 { "upper", 5, isupper }, 1118 { "xdigit", 6, isxdigit }, 1119 { NULL, 0, NULL }, 1120 }; 1121 1122 #define REPEAT_SIMPLE 0 1123 #define REPEAT_PLUS_APPENDED 1 1124 #define REPEAT_WITH_Q 2 1125 #define REPEAT_ZERO 3 1126 1127 static int 1128 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1129 int atomlen, int firstnum, int secondnum, int special_case) 1130 { 1131 int i, j; 1132 uschar *buf = 0; 1133 int ret = 1; 1134 int init_q = (firstnum == 0); /* first added char will be ? */ 1135 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ 1136 int prefix_length = reptok - basestr; /* prefix includes first rep */ 1137 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */ 1138 int size = prefix_length + suffix_length; 1139 1140 if (firstnum > 1) { /* add room for reps 2 through firstnum */ 1141 size += atomlen*(firstnum-1); 1142 } 1143 1144 /* Adjust size of buffer for special cases */ 1145 if (special_case == REPEAT_PLUS_APPENDED) { 1146 size++; /* for the final + */ 1147 } else if (special_case == REPEAT_WITH_Q) { 1148 size += init_q + (atomlen+1)* (n_q_reps-init_q); 1149 } else if (special_case == REPEAT_ZERO) { 1150 size += 2; /* just a null ERE: () */ 1151 } 1152 if ((buf = (uschar *) malloc(size + 1)) == NULL) 1153 FATAL("out of space in reg expr %.10s..", lastre); 1154 memcpy(buf, basestr, prefix_length); /* copy prefix */ 1155 j = prefix_length; 1156 if (special_case == REPEAT_ZERO) { 1157 j -= atomlen; 1158 buf[j++] = '('; 1159 buf[j++] = ')'; 1160 } 1161 for (i = 1; i < firstnum; i++) { /* copy x reps */ 1162 memcpy(&buf[j], atom, atomlen); 1163 j += atomlen; 1164 } 1165 if (special_case == REPEAT_PLUS_APPENDED) { 1166 buf[j++] = '+'; 1167 } else if (special_case == REPEAT_WITH_Q) { 1168 if (init_q) 1169 buf[j++] = '?'; 1170 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */ 1171 memcpy(&buf[j], atom, atomlen); 1172 j += atomlen; 1173 buf[j++] = '?'; 1174 } 1175 } 1176 memcpy(&buf[j], reptok+reptoklen, suffix_length); 1177 j += suffix_length; 1178 buf[j] = '\0'; 1179 /* free old basestr */ 1180 if (firstbasestr != basestr) { 1181 if (basestr) 1182 xfree(basestr); 1183 } 1184 basestr = buf; 1185 prestr = buf + prefix_length; 1186 if (special_case == REPEAT_ZERO) { 1187 prestr -= atomlen; 1188 ret++; 1189 } 1190 return ret; 1191 } 1192 1193 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, 1194 int atomlen, int firstnum, int secondnum) 1195 { 1196 /* 1197 In general, the repetition specifier or "bound" is replaced here 1198 by an equivalent ERE string, repeating the immediately previous atom 1199 and appending ? and + as needed. Note that the first copy of the 1200 atom is left in place, except in the special_case of a zero-repeat 1201 (i.e., {0}). 1202 */ 1203 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ 1204 if (firstnum < 2) { 1205 /* 0 or 1: should be handled before you get here */ 1206 FATAL("internal error"); 1207 } else { 1208 return replace_repeat(reptok, reptoklen, atom, atomlen, 1209 firstnum, secondnum, REPEAT_PLUS_APPENDED); 1210 } 1211 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ 1212 if (firstnum == 0) { /* {0} or {0,0} */ 1213 /* This case is unusual because the resulting 1214 replacement string might actually be SMALLER than 1215 the original ERE */ 1216 return replace_repeat(reptok, reptoklen, atom, atomlen, 1217 firstnum, secondnum, REPEAT_ZERO); 1218 } else { /* (firstnum >= 1) */ 1219 return replace_repeat(reptok, reptoklen, atom, atomlen, 1220 firstnum, secondnum, REPEAT_SIMPLE); 1221 } 1222 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ 1223 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */ 1224 return replace_repeat(reptok, reptoklen, atom, atomlen, 1225 firstnum, secondnum, REPEAT_WITH_Q); 1226 } else { /* Error - shouldn't be here (n>m) */ 1227 FATAL("internal error"); 1228 } 1229 return 0; 1230 } 1231 1232 int relex(void) /* lexical analyzer for reparse */ 1233 { 1234 int c, n; 1235 int cflag; 1236 static uschar *buf = NULL; 1237 static int bufsz = 100; 1238 uschar *bp; 1239 const struct charclass *cc; 1240 int i; 1241 int num, m; 1242 bool commafound, digitfound; 1243 const uschar *startreptok; 1244 static int parens = 0; 1245 1246 rescan: 1247 starttok = prestr; 1248 1249 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) { 1250 prestr += n; 1251 starttok = prestr; 1252 return CHAR; 1253 } 1254 1255 switch (c = *prestr++) { 1256 case '|': return OR; 1257 case '*': return STAR; 1258 case '+': return PLUS; 1259 case '?': return QUEST; 1260 case '.': return DOT; 1261 case '\0': prestr--; return '\0'; 1262 case '^': 1263 case '$': 1264 return c; 1265 case '(': 1266 parens++; 1267 return c; 1268 case ')': 1269 if (parens) { 1270 parens--; 1271 return c; 1272 } 1273 /* unmatched close parenthesis; per POSIX, treat as literal */ 1274 rlxval = c; 1275 return CHAR; 1276 case '\\': 1277 rlxval = quoted(&prestr); 1278 return CHAR; 1279 default: 1280 rlxval = c; 1281 return CHAR; 1282 case '[': 1283 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) 1284 FATAL("out of space in reg expr %.10s..", lastre); 1285 bp = buf; 1286 if (*prestr == '^') { 1287 cflag = 1; 1288 prestr++; 1289 } 1290 else 1291 cflag = 0; 1292 n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */ 1293 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) 1294 FATAL("out of space for reg expr %.10s...", lastre); 1295 for (; ; ) { 1296 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) { 1297 for (i = 0; i < n; i++) 1298 *bp++ = *prestr++; 1299 continue; 1300 } 1301 if ((c = *prestr++) == '\\') { 1302 *bp++ = '\\'; 1303 if ((c = *prestr++) == '\0') 1304 FATAL("nonterminated character class %.20s...", lastre); 1305 *bp++ = c; 1306 /* } else if (c == '\n') { */ 1307 /* FATAL("newline in character class %.20s...", lastre); */ 1308 } else if (c == '[' && *prestr == ':') { 1309 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ 1310 for (cc = charclasses; cc->cc_name; cc++) 1311 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) 1312 break; 1313 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && 1314 prestr[2 + cc->cc_namelen] == ']') { 1315 prestr += cc->cc_namelen + 3; 1316 /* 1317 * BUG: We begin at 1, instead of 0, since we 1318 * would otherwise prematurely terminate the 1319 * string for classes like [[:cntrl:]]. This 1320 * means that we can't match the NUL character, 1321 * not without first adapting the entire 1322 * program to track each string's length. 1323 */ 1324 for (i = 1; i <= UCHAR_MAX; i++) { 1325 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2")) 1326 FATAL("out of space for reg expr %.10s...", lastre); 1327 if (cc->cc_func(i)) { 1328 /* escape backslash */ 1329 if (i == '\\') { 1330 *bp++ = '\\'; 1331 n++; 1332 } 1333 1334 *bp++ = i; 1335 n++; 1336 } 1337 } 1338 } else 1339 *bp++ = c; 1340 } else if (c == '[' && *prestr == '.') { 1341 char collate_char; 1342 prestr++; 1343 collate_char = *prestr++; 1344 if (*prestr == '.' && prestr[1] == ']') { 1345 prestr += 2; 1346 /* Found it: map via locale TBD: for 1347 now, simply return this char. This 1348 is sufficient to pass conformance 1349 test awk.ex 156 1350 */ 1351 if (*prestr == ']') { 1352 prestr++; 1353 rlxval = collate_char; 1354 return CHAR; 1355 } 1356 } 1357 } else if (c == '[' && *prestr == '=') { 1358 char equiv_char; 1359 prestr++; 1360 equiv_char = *prestr++; 1361 if (*prestr == '=' && prestr[1] == ']') { 1362 prestr += 2; 1363 /* Found it: map via locale TBD: for now 1364 simply return this char. This is 1365 sufficient to pass conformance test 1366 awk.ex 156 1367 */ 1368 if (*prestr == ']') { 1369 prestr++; 1370 rlxval = equiv_char; 1371 return CHAR; 1372 } 1373 } 1374 } else if (c == '\0') { 1375 FATAL("nonterminated character class %.20s", lastre); 1376 } else if (bp == buf) { /* 1st char is special */ 1377 *bp++ = c; 1378 } else if (c == ']') { 1379 *bp++ = 0; 1380 rlxstr = (uschar *) tostring((char *) buf); 1381 if (cflag == 0) 1382 return CCL; 1383 else 1384 return NCCL; 1385 } else 1386 *bp++ = c; 1387 } 1388 break; 1389 case '{': 1390 if (isdigit((int) *(prestr))) { 1391 num = 0; /* Process as a repetition */ 1392 n = -1; m = -1; 1393 commafound = false; 1394 digitfound = false; 1395 startreptok = prestr-1; 1396 /* Remember start of previous atom here ? */ 1397 } else { /* just a { char, not a repetition */ 1398 rlxval = c; 1399 return CHAR; 1400 } 1401 for (; ; ) { 1402 if ((c = *prestr++) == '}') { 1403 if (commafound) { 1404 if (digitfound) { /* {n,m} */ 1405 m = num; 1406 if (m < n) 1407 FATAL("illegal repetition expression: class %.20s", 1408 lastre); 1409 if (n == 0 && m == 1) { 1410 return QUEST; 1411 } 1412 } else { /* {n,} */ 1413 if (n == 0) 1414 return STAR; 1415 else if (n == 1) 1416 return PLUS; 1417 } 1418 } else { 1419 if (digitfound) { /* {n} same as {n,n} */ 1420 n = num; 1421 m = num; 1422 } else { /* {} */ 1423 FATAL("illegal repetition expression: class %.20s", 1424 lastre); 1425 } 1426 } 1427 if (repeat(starttok, prestr-starttok, lastatom, 1428 startreptok - lastatom, n, m) > 0) { 1429 if (n == 0 && m == 0) { 1430 return ZERO; 1431 } 1432 /* must rescan input for next token */ 1433 goto rescan; 1434 } 1435 /* Failed to replace: eat up {...} characters 1436 and treat like just PLUS */ 1437 return PLUS; 1438 } else if (c == '\0') { 1439 FATAL("nonterminated character class %.20s", 1440 lastre); 1441 } else if (isdigit(c)) { 1442 num = 10 * num + c - '0'; 1443 digitfound = true; 1444 } else if (c == ',') { 1445 if (commafound) 1446 FATAL("illegal repetition expression: class %.20s", 1447 lastre); 1448 /* looking for {n,} or {n,m} */ 1449 commafound = true; 1450 n = num; 1451 digitfound = false; /* reset */ 1452 num = 0; 1453 } else { 1454 FATAL("illegal repetition expression: class %.20s", 1455 lastre); 1456 } 1457 } 1458 break; 1459 } 1460 } 1461 1462 int cgoto(fa *f, int s, int c) 1463 { 1464 int *p, *q; 1465 int i, j, k; 1466 1467 /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */ 1468 while (f->accept >= maxsetvec) { /* guessing here! */ 1469 resizesetvec(__func__); 1470 } 1471 for (i = 0; i <= f->accept; i++) 1472 setvec[i] = 0; 1473 setcnt = 0; 1474 resize_state(f, s); 1475 /* compute positions of gototab[s,c] into setvec */ 1476 p = f->posns[s]; 1477 for (i = 1; i <= *p; i++) { 1478 if ((k = f->re[p[i]].ltype) != FINAL) { 1479 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) 1480 || (k == DOT && c != 0 && c != HAT) 1481 || (k == ALL && c != 0) 1482 || (k == EMPTYRE && c != 0) 1483 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp)) 1484 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) { 1485 q = f->re[p[i]].lfollow; 1486 for (j = 1; j <= *q; j++) { 1487 if (q[j] >= maxsetvec) { 1488 resizesetvec(__func__); 1489 } 1490 if (setvec[q[j]] == 0) { 1491 setcnt++; 1492 setvec[q[j]] = 1; 1493 } 1494 } 1495 } 1496 } 1497 } 1498 /* determine if setvec is a previous state */ 1499 tmpset[0] = setcnt; 1500 j = 1; 1501 for (i = f->accept; i >= 0; i--) 1502 if (setvec[i]) { 1503 tmpset[j++] = i; 1504 } 1505 resize_state(f, f->curstat > s ? f->curstat : s); 1506 /* tmpset == previous state? */ 1507 for (i = 1; i <= f->curstat; i++) { 1508 p = f->posns[i]; 1509 if ((k = tmpset[0]) != p[0]) 1510 goto different; 1511 for (j = 1; j <= k; j++) 1512 if (tmpset[j] != p[j]) 1513 goto different; 1514 /* setvec is state i */ 1515 if (c != HAT) 1516 set_gototab(f, s, c, i); 1517 return i; 1518 different:; 1519 } 1520 1521 /* add tmpset to current set of states */ 1522 ++(f->curstat); 1523 resize_state(f, f->curstat); 1524 clear_gototab(f, f->curstat); 1525 xfree(f->posns[f->curstat]); 1526 p = intalloc(setcnt + 1, __func__); 1527 1528 f->posns[f->curstat] = p; 1529 if (c != HAT) 1530 set_gototab(f, s, c, f->curstat); 1531 for (i = 0; i <= setcnt; i++) 1532 p[i] = tmpset[i]; 1533 if (setvec[f->accept]) 1534 f->out[f->curstat] = 1; 1535 else 1536 f->out[f->curstat] = 0; 1537 return f->curstat; 1538 } 1539 1540 1541 void freefa(fa *f) /* free a finite automaton */ 1542 { 1543 int i; 1544 1545 if (f == NULL) 1546 return; 1547 for (i = 0; i < f->state_count; i++) 1548 xfree(f->gototab[i].entries); 1549 xfree(f->gototab); 1550 for (i = 0; i <= f->curstat; i++) 1551 xfree(f->posns[i]); 1552 for (i = 0; i <= f->accept; i++) { 1553 xfree(f->re[i].lfollow); 1554 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) 1555 xfree(f->re[i].lval.np); 1556 } 1557 xfree(f->restr); 1558 xfree(f->out); 1559 xfree(f->posns); 1560 xfree(f->gototab); 1561 xfree(f); 1562 } 1563