1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 23 /* All Rights Reserved */ 24 25 26 /* 27 * Copyright 2003 Sun Microsystems, Inc. All rights reserved. 28 * Use is subject to license terms. 29 */ 30 31 #ident "%Z%%M% %I% %E% SMI" /* SVr4.0 2.11 */ 32 33 #define DEBUG 34 35 #include "awk.h" 36 #include <ctype.h> 37 #include <stdio.h> 38 #include "y.tab.h" 39 40 #define HAT (NCHARS-1) /* matches ^ in regular expr */ 41 /* NCHARS is 2**n */ 42 43 #define type(v) (v)->nobj 44 #define left(v) (v)->narg[0] 45 #define right(v) (v)->narg[1] 46 #define parent(v) (v)->nnext 47 48 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: 49 #define UNARY case STAR: case PLUS: case QUEST: 50 51 /* encoding in tree Nodes: 52 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL): 53 left is index, right contains value or pointer to value 54 unary (STAR, PLUS, QUEST): left is child, right is null 55 binary (CAT, OR): left and right are children 56 parent contains pointer to parent 57 */ 58 59 60 uchar chars[RECSIZE]; 61 int setvec[RECSIZE]; 62 int tmpset[RECSIZE]; 63 Node *point[RECSIZE]; 64 65 int rtok; /* next token in current re */ 66 int rlxval; 67 uchar *rlxstr; 68 uchar *prestr; /* current position in current re */ 69 uchar *lastre; /* origin of last re */ 70 71 static int setcnt; 72 static int poscnt; 73 74 uchar *patbeg; 75 int patlen; 76 77 #define NFA 20 /* cache this many dynamic fa's */ 78 fa *fatab[NFA]; 79 int nfatab = 0; /* entries in fatab */ 80 fa *mkdfa(); 81 82 fa *makedfa(s, anchor) /* returns dfa for reg expr s */ 83 uchar *s; 84 int anchor; 85 { 86 int i, use, nuse; 87 fa *pfa; 88 89 if (compile_time) /* a constant for sure */ 90 return mkdfa(s, anchor); 91 for (i = 0; i < nfatab; i++) /* is it there already? */ 92 if (fatab[i]->anchor == anchor && strcmp(fatab[i]->restr,s) == 0) { 93 fatab[i]->use++; 94 return fatab[i]; 95 } 96 pfa = mkdfa(s, anchor); 97 if (nfatab < NFA) { /* room for another */ 98 fatab[nfatab] = pfa; 99 fatab[nfatab]->use = 1; 100 nfatab++; 101 return pfa; 102 } 103 use = fatab[0]->use; /* replace least-recently used */ 104 nuse = 0; 105 for (i = 1; i < nfatab; i++) 106 if (fatab[i]->use < use) { 107 use = fatab[i]->use; 108 nuse = i; 109 } 110 freefa(fatab[nuse]); 111 fatab[nuse] = pfa; 112 pfa->use = 1; 113 return pfa; 114 } 115 116 fa *mkdfa(s, anchor) /* does the real work of making a dfa */ 117 uchar *s; 118 int anchor; /* anchor = 1 for anchored matches, else 0 */ 119 { 120 Node *p, *p1, *reparse(); 121 fa *f; 122 123 p = reparse(s); 124 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); 125 /* put ALL STAR in front of reg. exp. */ 126 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); 127 /* put FINAL after reg. exp. */ 128 129 poscnt = 0; 130 penter(p1); /* enter parent pointers and leaf indices */ 131 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL) 132 overflo("no room for fa"); 133 f->accept = poscnt-1; /* penter has computed number of positions in re */ 134 cfoll(f, p1); /* set up follow sets */ 135 freetr(p1); 136 if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL) 137 overflo("out of space in makedfa"); 138 if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL) 139 overflo("out of space in makedfa"); 140 *f->posns[1] = 0; 141 f->initstat = makeinit(f, anchor); 142 f->anchor = anchor; 143 f->restr = tostring(s); 144 return f; 145 } 146 147 int makeinit(f, anchor) 148 fa *f; 149 int anchor; 150 { 151 register int i, k; 152 153 f->curstat = 2; 154 f->out[2] = 0; 155 f->reset = 0; 156 k = *(f->re[0].lfollow); 157 xfree(f->posns[2]); 158 if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 159 overflo("out of space in makeinit"); 160 for (i=0; i<=k; i++) { 161 (f->posns[2])[i] = (f->re[0].lfollow)[i]; 162 } 163 if ((f->posns[2])[1] == f->accept) 164 f->out[2] = 1; 165 for (i=0; i<NCHARS; i++) 166 f->gototab[2][i] = 0; 167 f->curstat = cgoto(f, 2, HAT); 168 if (anchor) { 169 *f->posns[2] = k-1; /* leave out position 0 */ 170 for (i=0; i<k; i++) { 171 (f->posns[0])[i] = (f->posns[2])[i]; 172 } 173 174 f->out[0] = f->out[2]; 175 if (f->curstat != 2) 176 --(*f->posns[f->curstat]); 177 } 178 return f->curstat; 179 } 180 181 penter(p) /* set up parent pointers and leaf indices */ 182 Node *p; 183 { 184 switch(type(p)) { 185 LEAF 186 left(p) = (Node *) poscnt; 187 point[poscnt++] = p; 188 break; 189 UNARY 190 penter(left(p)); 191 parent(left(p)) = p; 192 break; 193 case CAT: 194 case OR: 195 penter(left(p)); 196 penter(right(p)); 197 parent(left(p)) = p; 198 parent(right(p)) = p; 199 break; 200 default: 201 ERROR "unknown type %d in penter", type(p) FATAL; 202 break; 203 } 204 } 205 206 freetr(p) /* free parse tree */ 207 Node *p; 208 { 209 switch (type(p)) { 210 LEAF 211 xfree(p); 212 break; 213 UNARY 214 freetr(left(p)); 215 xfree(p); 216 break; 217 case CAT: 218 case OR: 219 freetr(left(p)); 220 freetr(right(p)); 221 xfree(p); 222 break; 223 default: 224 ERROR "unknown type %d in freetr", type(p) FATAL; 225 break; 226 } 227 } 228 229 uchar *cclenter(p) 230 register uchar *p; 231 { 232 register int i, c; 233 uchar *op; 234 235 op = p; 236 i = 0; 237 while ((c = *p++) != 0) { 238 if (c == '\\') { 239 if ((c = *p++) == 't') 240 c = '\t'; 241 else if (c == 'n') 242 c = '\n'; 243 else if (c == 'f') 244 c = '\f'; 245 else if (c == 'r') 246 c = '\r'; 247 else if (c == 'b') 248 c = '\b'; 249 else if (c == '\\') 250 c = '\\'; 251 else if (isdigit(c)) { 252 int n = c - '0'; 253 if (isdigit(*p)) { 254 n = 8 * n + *p++ - '0'; 255 if (isdigit(*p)) 256 n = 8 * n + *p++ - '0'; 257 } 258 c = n; 259 } /* else */ 260 /* c = c; */ 261 } else if (c == '-' && i > 0 && chars[i-1] != 0) { 262 if (*p != 0) { 263 c = chars[i-1]; 264 while ((uchar) c < *p) { /* fails if *p is \\ */ 265 if (i >= RECSIZE-1) 266 overflo("character class too big"); 267 chars[i++] = ++c; 268 } 269 p++; 270 continue; 271 } 272 } 273 if (i >= RECSIZE-1) 274 overflo("character class too big"); 275 chars[i++] = c; 276 } 277 chars[i++] = '\0'; 278 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, chars) ); 279 xfree(op); 280 return(tostring(chars)); 281 } 282 283 overflo(s) 284 uchar *s; 285 { 286 ERROR "regular expression too big: %s", gettext((char *) s) FATAL; 287 } 288 289 cfoll(f, v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ 290 fa *f; 291 register Node *v; 292 { 293 register int i; 294 register int *p; 295 296 switch(type(v)) { 297 LEAF 298 f->re[(int) left(v)].ltype = type(v); 299 f->re[(int) left(v)].lval = (int) right(v); 300 for (i=0; i<=f->accept; i++) 301 setvec[i] = 0; 302 setcnt = 0; 303 follow(v); /* computes setvec and setcnt */ 304 if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) 305 overflo("follow set overflow"); 306 f->re[(int) left(v)].lfollow = p; 307 *p = setcnt; 308 for (i = f->accept; i >= 0; i--) 309 if (setvec[i] == 1) *++p = i; 310 break; 311 UNARY 312 cfoll(f,left(v)); 313 break; 314 case CAT: 315 case OR: 316 cfoll(f,left(v)); 317 cfoll(f,right(v)); 318 break; 319 default: 320 ERROR "unknown type %d in cfoll", type(v) FATAL; 321 } 322 } 323 324 first(p) /* collects initially active leaves of p into setvec */ 325 register Node *p; /* returns 0 or 1 depending on whether p matches empty string */ 326 { 327 register int b; 328 329 switch(type(p)) { 330 LEAF 331 if (setvec[(int) left(p)] != 1) { 332 setvec[(int) left(p)] = 1; 333 setcnt++; 334 } 335 if (type(p) == CCL && (*(uchar *) right(p)) == '\0') 336 return(0); /* empty CCL */ 337 else return(1); 338 case PLUS: 339 if (first(left(p)) == 0) return(0); 340 return(1); 341 case STAR: 342 case QUEST: 343 first(left(p)); 344 return(0); 345 case CAT: 346 if (first(left(p)) == 0 && first(right(p)) == 0) return(0); 347 return(1); 348 case OR: 349 b = first(right(p)); 350 if (first(left(p)) == 0 || b == 0) return(0); 351 return(1); 352 } 353 ERROR "unknown type %d in first", type(p) FATAL; 354 return(-1); 355 } 356 357 follow(v) 358 Node *v; /* collects leaves that can follow v into setvec */ 359 { 360 Node *p; 361 362 if (type(v) == FINAL) 363 return; 364 p = parent(v); 365 switch (type(p)) { 366 case STAR: 367 case PLUS: 368 first(v); 369 follow(p); 370 return; 371 372 case OR: 373 case QUEST: 374 follow(p); 375 return; 376 377 case CAT: 378 if (v == left(p)) { /* v is left child of p */ 379 if (first(right(p)) == 0) { 380 follow(p); 381 return; 382 } 383 } 384 else /* v is right child */ 385 follow(p); 386 return; 387 default: 388 ERROR "unknown type %d in follow", type(p) FATAL; 389 break; 390 } 391 } 392 393 member(c, s) /* is c in s? */ 394 register uchar c, *s; 395 { 396 while (*s) 397 if (c == *s++) 398 return(1); 399 return(0); 400 } 401 402 403 match(f, p) 404 register fa *f; 405 register uchar *p; 406 { 407 register int s, ns; 408 409 s = f->reset?makeinit(f,0):f->initstat; 410 if (f->out[s]) 411 return(1); 412 do { 413 if (ns=f->gototab[s][*p]) 414 s=ns; 415 else 416 s=cgoto(f,s,*p); 417 if (f->out[s]) 418 return(1); 419 } while (*p++ != 0); 420 return(0); 421 } 422 423 pmatch(f, p) 424 register fa *f; 425 register uchar *p; 426 { 427 register int s, ns; 428 register uchar *q; 429 int i, k; 430 431 if (f->reset) { 432 f->initstat = s = makeinit(f,1); 433 } else { 434 s = f->initstat; 435 } 436 patbeg = p; 437 patlen = -1; 438 do { 439 q = p; 440 do { 441 if (f->out[s]) /* final state */ 442 patlen = q-p; 443 if (ns=f->gototab[s][*q]) 444 s=ns; 445 else 446 s=cgoto(f,s,*q); 447 if (s==1) /* no transition */ 448 if (patlen >= 0) { 449 patbeg = p; 450 return(1); 451 } 452 else 453 goto nextin; /* no match */ 454 } while (*q++ != 0); 455 if (f->out[s]) 456 patlen = q-p-1; /* don't count $ */ 457 if (patlen >= 0) { 458 patbeg = p; 459 return(1); 460 } 461 nextin: 462 s = 2; 463 if (f->reset) { 464 for (i=2; i<=f->curstat; i++) 465 xfree(f->posns[i]); 466 k = *f->posns[0]; 467 if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 468 overflo("out of space in pmatch"); 469 for (i=0; i<=k; i++) 470 (f->posns[2])[i] = (f->posns[0])[i]; 471 f->initstat = f->curstat = 2; 472 f->out[2] = f->out[0]; 473 for (i=0; i<NCHARS; i++) 474 f->gototab[2][i] = 0; 475 } 476 } while (*p++ != 0); 477 return (0); 478 } 479 480 nematch(f, p) 481 register fa *f; 482 register uchar *p; 483 { 484 register int s, ns; 485 register uchar *q; 486 int i, k; 487 488 if (f->reset) { 489 f->initstat = s = makeinit(f,1); 490 } else { 491 s = f->initstat; 492 } 493 patlen = -1; 494 while (*p) { 495 q = p; 496 do { 497 if (f->out[s]) /* final state */ 498 patlen = q-p; 499 if (ns=f->gototab[s][*q]) 500 s=ns; 501 else 502 s=cgoto(f,s,*q); 503 if (s==1) /* no transition */ 504 if (patlen > 0) { 505 patbeg = p; 506 return(1); 507 } 508 else 509 goto nnextin; /* no nonempty match */ 510 } while (*q++ != 0); 511 if (f->out[s]) 512 patlen = q-p-1; /* don't count $ */ 513 if (patlen > 0 ) { 514 patbeg = p; 515 return(1); 516 } 517 nnextin: 518 s = 2; 519 if (f->reset) { 520 for (i=2; i<=f->curstat; i++) 521 xfree(f->posns[i]); 522 k = *f->posns[0]; 523 if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL) 524 overflo("out of state space"); 525 for (i=0; i<=k; i++) 526 (f->posns[2])[i] = (f->posns[0])[i]; 527 f->initstat = f->curstat = 2; 528 f->out[2] = f->out[0]; 529 for (i=0; i<NCHARS; i++) 530 f->gototab[2][i] = 0; 531 } 532 p++; 533 } 534 return (0); 535 } 536 537 Node *regexp(), *primary(), *concat(), *alt(), *unary(); 538 539 Node *reparse(p) 540 uchar *p; 541 { 542 /* parses regular expression pointed to by p */ 543 /* uses relex() to scan regular expression */ 544 Node *np; 545 546 dprintf( ("reparse <%s>\n", p) ); 547 lastre = prestr = p; /* prestr points to string to be parsed */ 548 rtok = relex(); 549 if (rtok == '\0') 550 ERROR "empty regular expression" FATAL; 551 np = regexp(); 552 if (rtok == '\0') 553 return(np); 554 else 555 ERROR "syntax error in regular expression %s at %s", lastre, prestr FATAL; 556 /*NOTREACHED*/ 557 } 558 559 Node *regexp() 560 { 561 return (alt(concat(primary()))); 562 } 563 564 Node *primary() 565 { 566 Node *np; 567 568 switch (rtok) { 569 case CHAR: 570 np = op2(CHAR, NIL, (Node *) rlxval); 571 rtok = relex(); 572 return (unary(np)); 573 case ALL: 574 rtok = relex(); 575 return (unary(op2(ALL, NIL, NIL))); 576 case DOT: 577 rtok = relex(); 578 return (unary(op2(DOT, NIL, NIL))); 579 case CCL: 580 np = op2(CCL, NIL, (Node*) cclenter(rlxstr)); 581 rtok = relex(); 582 return (unary(np)); 583 case NCCL: 584 np = op2(NCCL, NIL, (Node *) cclenter(rlxstr)); 585 rtok = relex(); 586 return (unary(np)); 587 case '^': 588 rtok = relex(); 589 return (unary(op2(CHAR, NIL, (Node *) HAT))); 590 case '$': 591 rtok = relex(); 592 return (unary(op2(CHAR, NIL, NIL))); 593 case '(': 594 rtok = relex(); 595 if (rtok == ')') { /* special pleading for () */ 596 rtok = relex(); 597 return unary(op2(CCL, NIL, (Node *) tostring(""))); 598 } 599 np = regexp(); 600 if (rtok == ')') { 601 rtok = relex(); 602 return (unary(np)); 603 } 604 else 605 ERROR "syntax error in regular expression %s at %s", lastre, prestr FATAL; 606 default: 607 ERROR "illegal primary in regular expression %s at %s", lastre, prestr FATAL; 608 } 609 /*NOTREACHED*/ 610 } 611 612 Node *concat(np) 613 Node *np; 614 { 615 switch (rtok) { 616 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': 617 return (concat(op2(CAT, np, primary()))); 618 default: 619 return (np); 620 } 621 } 622 623 Node *alt(np) 624 Node *np; 625 { 626 if (rtok == OR) { 627 rtok = relex(); 628 return (alt(op2(OR, np, concat(primary())))); 629 } 630 return (np); 631 } 632 633 Node *unary(np) 634 Node *np; 635 { 636 switch (rtok) { 637 case STAR: 638 rtok = relex(); 639 return (unary(op2(STAR, np, NIL))); 640 case PLUS: 641 rtok = relex(); 642 return (unary(op2(PLUS, np, NIL))); 643 case QUEST: 644 rtok = relex(); 645 return (unary(op2(QUEST, np, NIL))); 646 default: 647 return (np); 648 } 649 } 650 651 relex() /* lexical analyzer for reparse */ 652 { 653 register int c; 654 uchar cbuf[RECSIZE]; 655 int clen, cflag; 656 657 switch (c = *prestr++) { 658 case '|': return OR; 659 case '*': return STAR; 660 case '+': return PLUS; 661 case '?': return QUEST; 662 case '.': return DOT; 663 case '\0': prestr--; return '\0'; 664 case '^': 665 case '$': 666 case '(': 667 case ')': 668 return c; 669 case '\\': 670 if ((c = *prestr++) == 't') 671 c = '\t'; 672 else if (c == 'n') 673 c = '\n'; 674 else if (c == 'f') 675 c = '\f'; 676 else if (c == 'r') 677 c = '\r'; 678 else if (c == 'b') 679 c = '\b'; 680 else if (c == '\\') 681 c = '\\'; 682 else if (isdigit(c)) { 683 int n = c - '0'; 684 if (isdigit(*prestr)) { 685 n = 8 * n + *prestr++ - '0'; 686 if (isdigit(*prestr)) 687 n = 8 * n + *prestr++ - '0'; 688 } 689 c = n; 690 } /* else it's now in c */ 691 rlxval = c; 692 return CHAR; 693 default: 694 rlxval = c; 695 return CHAR; 696 case '[': 697 clen = 0; 698 if (*prestr == '^') { 699 cflag = 1; 700 prestr++; 701 } 702 else 703 cflag = 0; 704 for (;;) { 705 if ((c = *prestr++) == '\\') { 706 cbuf[clen++] = '\\'; 707 if ((c = *prestr++) == '\0') 708 ERROR "nonterminated character class %s", lastre FATAL; 709 cbuf[clen++] = c; 710 } else if (c == ']') { 711 cbuf[clen] = 0; 712 rlxstr = tostring(cbuf); 713 if (cflag == 0) 714 return CCL; 715 else 716 return NCCL; 717 } else if (c == '\n') { 718 ERROR "newline in character class %s...", lastre FATAL; 719 } else if (c == '\0') { 720 ERROR "nonterminated character class %s", lastre FATAL; 721 } else 722 cbuf[clen++] = c; 723 } 724 } 725 } 726 727 int cgoto(f, s, c) 728 fa *f; 729 int s, c; 730 { 731 register int i, j, k; 732 register int *p, *q; 733 734 for (i=0; i<=f->accept; i++) 735 setvec[i] = 0; 736 setcnt = 0; 737 /* compute positions of gototab[s,c] into setvec */ 738 p = f->posns[s]; 739 for (i=1; i<=*p; i++) { 740 if ((k = f->re[p[i]].ltype) != FINAL) { 741 if (k == CHAR && c == f->re[p[i]].lval 742 || k == DOT && c != 0 && c != HAT 743 || k == ALL && c != 0 744 || k == CCL && member(c, (uchar *) f->re[p[i]].lval) 745 || k == NCCL && !member(c, (uchar *) f->re[p[i]].lval) && c != 0 && c != HAT) { 746 q = f->re[p[i]].lfollow; 747 for (j=1; j<=*q; j++) { 748 if (setvec[q[j]] == 0) { 749 setcnt++; 750 setvec[q[j]] = 1; 751 } 752 } 753 } 754 } 755 } 756 /* determine if setvec is a previous state */ 757 tmpset[0] = setcnt; 758 j = 1; 759 for (i = f->accept; i >= 0; i--) 760 if (setvec[i]) { 761 tmpset[j++] = i; 762 } 763 /* tmpset == previous state? */ 764 for (i=1; i<= f->curstat; i++) { 765 p = f->posns[i]; 766 if ((k = tmpset[0]) != p[0]) 767 goto different; 768 for (j = 1; j <= k; j++) 769 if (tmpset[j] != p[j]) 770 goto different; 771 /* setvec is state i */ 772 f->gototab[s][c] = i; 773 return i; 774 different:; 775 } 776 777 /* add tmpset to current set of states */ 778 if (f->curstat >= NSTATES-1) { 779 f->curstat = 2; 780 f->reset = 1; 781 for (i=2; i<NSTATES; i++) 782 xfree(f->posns[i]); 783 } 784 else 785 ++(f->curstat); 786 for (i=0; i<NCHARS; i++) 787 f->gototab[f->curstat][i] = 0; 788 xfree(f->posns[f->curstat]); 789 if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL) 790 overflo("out of space in cgoto"); 791 792 f->posns[f->curstat] = p; 793 f->gototab[s][c] = f->curstat; 794 for (i = 0; i <= setcnt; i++) 795 p[i] = tmpset[i]; 796 if (setvec[f->accept]) 797 f->out[f->curstat] = 1; 798 else 799 f->out[f->curstat] = 0; 800 return f->curstat; 801 } 802 803 804 freefa(f) 805 struct fa *f; 806 { 807 808 register int i; 809 810 if (f == NULL) 811 return; 812 for (i=0; i<=f->curstat; i++) 813 xfree(f->posns[i]); 814 for (i=0; i<=f->accept; i++) 815 xfree(f->re[i].lfollow); 816 xfree(f->restr); 817 xfree(f); 818 } 819