1 /* nfa - NFA construction routines */ 2 3 /* Copyright (c) 1990 The Regents of the University of California. */ 4 /* All rights reserved. */ 5 6 /* This code is derived from software contributed to Berkeley by */ 7 /* Vern Paxson. */ 8 9 /* The United States Government has rights in this work pursuant */ 10 /* to contract no. DE-AC03-76SF00098 between the United States */ 11 /* Department of Energy and the University of California. */ 12 13 /* This file is part of flex. */ 14 15 /* Redistribution and use in source and binary forms, with or without */ 16 /* modification, are permitted provided that the following conditions */ 17 /* are met: */ 18 19 /* 1. Redistributions of source code must retain the above copyright */ 20 /* notice, this list of conditions and the following disclaimer. */ 21 /* 2. Redistributions in binary form must reproduce the above copyright */ 22 /* notice, this list of conditions and the following disclaimer in the */ 23 /* documentation and/or other materials provided with the distribution. */ 24 25 /* Neither the name of the University nor the names of its contributors */ 26 /* may be used to endorse or promote products derived from this software */ 27 /* without specific prior written permission. */ 28 29 /* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */ 30 /* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */ 31 /* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */ 32 /* PURPOSE. */ 33 34 #include "flexdef.h" 35 36 37 /* declare functions that have forward references */ 38 39 int dupmachine(int); 40 void mkxtion(int, int); 41 42 43 /* add_accept - add an accepting state to a machine 44 * 45 * accepting_number becomes mach's accepting number. 46 */ 47 48 void add_accept (int mach, int accepting_number) 49 { 50 /* Hang the accepting number off an epsilon state. if it is associated 51 * with a state that has a non-epsilon out-transition, then the state 52 * will accept BEFORE it makes that transition, i.e., one character 53 * too soon. 54 */ 55 56 if (transchar[finalst[mach]] == SYM_EPSILON) 57 accptnum[finalst[mach]] = accepting_number; 58 59 else { 60 int astate = mkstate (SYM_EPSILON); 61 62 accptnum[astate] = accepting_number; 63 (void) link_machines (mach, astate); 64 } 65 } 66 67 68 /* copysingl - make a given number of copies of a singleton machine 69 * 70 * synopsis 71 * 72 * newsng = copysingl( singl, num ); 73 * 74 * newsng - a new singleton composed of num copies of singl 75 * singl - a singleton machine 76 * num - the number of copies of singl to be present in newsng 77 */ 78 79 int copysingl (int singl, int num) 80 { 81 int copy, i; 82 83 copy = mkstate (SYM_EPSILON); 84 85 for (i = 1; i <= num; ++i) 86 copy = link_machines (copy, dupmachine (singl)); 87 88 return copy; 89 } 90 91 92 /* dumpnfa - debugging routine to write out an nfa */ 93 94 void dumpnfa (int state1) 95 { 96 int sym, tsp1, tsp2, anum, ns; 97 98 fprintf (stderr, 99 _ 100 ("\n\n********** beginning dump of nfa with start state %d\n"), 101 state1); 102 103 /* We probably should loop starting at firstst[state1] and going to 104 * lastst[state1], but they're not maintained properly when we "or" 105 * all of the rules together. So we use our knowledge that the machine 106 * starts at state 1 and ends at lastnfa. 107 */ 108 109 /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */ 110 for (ns = 1; ns <= lastnfa; ++ns) { 111 fprintf (stderr, _("state # %4d\t"), ns); 112 113 sym = transchar[ns]; 114 tsp1 = trans1[ns]; 115 tsp2 = trans2[ns]; 116 anum = accptnum[ns]; 117 118 fprintf (stderr, "%3d: %4d, %4d", sym, tsp1, tsp2); 119 120 if (anum != NIL) 121 fprintf (stderr, " [%d]", anum); 122 123 fprintf (stderr, "\n"); 124 } 125 126 fprintf (stderr, _("********** end of dump\n")); 127 } 128 129 130 /* dupmachine - make a duplicate of a given machine 131 * 132 * synopsis 133 * 134 * copy = dupmachine( mach ); 135 * 136 * copy - holds duplicate of mach 137 * mach - machine to be duplicated 138 * 139 * note that the copy of mach is NOT an exact duplicate; rather, all the 140 * transition states values are adjusted so that the copy is self-contained, 141 * as the original should have been. 142 * 143 * also note that the original MUST be contiguous, with its low and high 144 * states accessible by the arrays firstst and lastst 145 */ 146 147 int dupmachine (int mach) 148 { 149 int i, init, state_offset; 150 int state = 0; 151 int last = lastst[mach]; 152 153 for (i = firstst[mach]; i <= last; ++i) { 154 state = mkstate (transchar[i]); 155 156 if (trans1[i] != NO_TRANSITION) { 157 mkxtion (finalst[state], trans1[i] + state - i); 158 159 if (transchar[i] == SYM_EPSILON && 160 trans2[i] != NO_TRANSITION) 161 mkxtion (finalst[state], 162 trans2[i] + state - i); 163 } 164 165 accptnum[state] = accptnum[i]; 166 } 167 168 if (state == 0) 169 flexfatal (_("empty machine in dupmachine()")); 170 171 state_offset = state - i + 1; 172 173 init = mach + state_offset; 174 firstst[init] = firstst[mach] + state_offset; 175 finalst[init] = finalst[mach] + state_offset; 176 lastst[init] = lastst[mach] + state_offset; 177 178 return init; 179 } 180 181 182 /* finish_rule - finish up the processing for a rule 183 * 184 * An accepting number is added to the given machine. If variable_trail_rule 185 * is true then the rule has trailing context and both the head and trail 186 * are variable size. Otherwise if headcnt or trailcnt is non-zero then 187 * the machine recognizes a pattern with trailing context and headcnt is 188 * the number of characters in the matched part of the pattern, or zero 189 * if the matched part has variable length. trailcnt is the number of 190 * trailing context characters in the pattern, or zero if the trailing 191 * context has variable length. 192 */ 193 194 void finish_rule (int mach, int variable_trail_rule, int headcnt, int trailcnt, 195 int pcont_act) 196 { 197 char action_text[MAXLINE]; 198 199 add_accept (mach, num_rules); 200 201 /* We did this in new_rule(), but it often gets the wrong 202 * number because we do it before we start parsing the current rule. 203 */ 204 rule_linenum[num_rules] = linenum; 205 206 /* If this is a continued action, then the line-number has already 207 * been updated, giving us the wrong number. 208 */ 209 if (continued_action) 210 --rule_linenum[num_rules]; 211 212 213 /* If the previous rule was continued action, then we inherit the 214 * previous newline flag, possibly overriding the current one. 215 */ 216 if (pcont_act && rule_has_nl[num_rules - 1]) 217 rule_has_nl[num_rules] = true; 218 219 snprintf (action_text, sizeof(action_text), "case %d:\n", num_rules); 220 add_action (action_text); 221 if (rule_has_nl[num_rules]) { 222 snprintf (action_text, sizeof(action_text), "/* rule %d can match eol */\n", 223 num_rules); 224 add_action (action_text); 225 } 226 227 228 if (variable_trail_rule) { 229 rule_type[num_rules] = RULE_VARIABLE; 230 231 if (performance_report > 0) 232 fprintf (stderr, 233 _ 234 ("Variable trailing context rule at line %d\n"), 235 rule_linenum[num_rules]); 236 237 variable_trailing_context_rules = true; 238 } 239 240 else { 241 rule_type[num_rules] = RULE_NORMAL; 242 243 if (headcnt > 0 || trailcnt > 0) { 244 /* Do trailing context magic to not match the trailing 245 * characters. 246 */ 247 char *scanner_cp = "YY_G(yy_c_buf_p) = yy_cp"; 248 char *scanner_bp = "yy_bp"; 249 250 add_action 251 ("*yy_cp = YY_G(yy_hold_char); /* undo effects of setting up yytext */\n"); 252 253 if (headcnt > 0) { 254 if (rule_has_nl[num_rules]) { 255 snprintf (action_text, sizeof(action_text), 256 "YY_LINENO_REWIND_TO(%s + %d);\n", scanner_bp, headcnt); 257 add_action (action_text); 258 } 259 snprintf (action_text, sizeof(action_text), "%s = %s + %d;\n", 260 scanner_cp, scanner_bp, headcnt); 261 add_action (action_text); 262 } 263 264 else { 265 if (rule_has_nl[num_rules]) { 266 snprintf (action_text, sizeof(action_text), 267 "YY_LINENO_REWIND_TO(yy_cp - %d);\n", trailcnt); 268 add_action (action_text); 269 } 270 271 snprintf (action_text, sizeof(action_text), "%s -= %d;\n", 272 scanner_cp, trailcnt); 273 add_action (action_text); 274 } 275 276 add_action 277 ("YY_DO_BEFORE_ACTION; /* set up yytext again */\n"); 278 } 279 } 280 281 /* Okay, in the action code at this point yytext and yyleng have 282 * their proper final values for this rule, so here's the point 283 * to do any user action. But don't do it for continued actions, 284 * as that'll result in multiple YY_RULE_SETUP's. 285 */ 286 if (!continued_action) 287 add_action ("YY_RULE_SETUP\n"); 288 289 line_directive_out(NULL, 1); 290 add_action("[["); 291 } 292 293 294 /* link_machines - connect two machines together 295 * 296 * synopsis 297 * 298 * new = link_machines( first, last ); 299 * 300 * new - a machine constructed by connecting first to last 301 * first - the machine whose successor is to be last 302 * last - the machine whose predecessor is to be first 303 * 304 * note: this routine concatenates the machine first with the machine 305 * last to produce a machine new which will pattern-match first first 306 * and then last, and will fail if either of the sub-patterns fails. 307 * FIRST is set to new by the operation. last is unmolested. 308 */ 309 310 int link_machines (int first, int last) 311 { 312 if (first == NIL) 313 return last; 314 315 else if (last == NIL) 316 return first; 317 318 else { 319 mkxtion (finalst[first], last); 320 finalst[first] = finalst[last]; 321 lastst[first] = MAX (lastst[first], lastst[last]); 322 firstst[first] = MIN (firstst[first], firstst[last]); 323 324 return first; 325 } 326 } 327 328 329 /* mark_beginning_as_normal - mark each "beginning" state in a machine 330 * as being a "normal" (i.e., not trailing context- 331 * associated) states 332 * 333 * The "beginning" states are the epsilon closure of the first state 334 */ 335 336 void mark_beginning_as_normal (int mach) 337 { 338 switch (state_type[mach]) { 339 case STATE_NORMAL: 340 /* Oh, we've already visited here. */ 341 return; 342 343 case STATE_TRAILING_CONTEXT: 344 state_type[mach] = STATE_NORMAL; 345 346 if (transchar[mach] == SYM_EPSILON) { 347 if (trans1[mach] != NO_TRANSITION) 348 mark_beginning_as_normal (trans1[mach]); 349 350 if (trans2[mach] != NO_TRANSITION) 351 mark_beginning_as_normal (trans2[mach]); 352 } 353 break; 354 355 default: 356 flexerror (_ 357 ("bad state type in mark_beginning_as_normal()")); 358 break; 359 } 360 } 361 362 363 /* mkbranch - make a machine that branches to two machines 364 * 365 * synopsis 366 * 367 * branch = mkbranch( first, second ); 368 * 369 * branch - a machine which matches either first's pattern or second's 370 * first, second - machines whose patterns are to be or'ed (the | operator) 371 * 372 * Note that first and second are NEITHER destroyed by the operation. Also, 373 * the resulting machine CANNOT be used with any other "mk" operation except 374 * more mkbranch's. Compare with mkor() 375 */ 376 377 int mkbranch (int first, int second) 378 { 379 int eps; 380 381 if (first == NO_TRANSITION) 382 return second; 383 384 else if (second == NO_TRANSITION) 385 return first; 386 387 eps = mkstate (SYM_EPSILON); 388 389 mkxtion (eps, first); 390 mkxtion (eps, second); 391 392 return eps; 393 } 394 395 396 /* mkclos - convert a machine into a closure 397 * 398 * synopsis 399 * new = mkclos( state ); 400 * 401 * new - a new state which matches the closure of "state" 402 */ 403 404 int mkclos (int state) 405 { 406 return mkopt (mkposcl (state)); 407 } 408 409 410 /* mkopt - make a machine optional 411 * 412 * synopsis 413 * 414 * new = mkopt( mach ); 415 * 416 * new - a machine which optionally matches whatever mach matched 417 * mach - the machine to make optional 418 * 419 * notes: 420 * 1. mach must be the last machine created 421 * 2. mach is destroyed by the call 422 */ 423 424 int mkopt (int mach) 425 { 426 int eps; 427 428 if (!SUPER_FREE_EPSILON (finalst[mach])) { 429 eps = mkstate (SYM_EPSILON); 430 mach = link_machines (mach, eps); 431 } 432 433 /* Can't skimp on the following if FREE_EPSILON(mach) is true because 434 * some state interior to "mach" might point back to the beginning 435 * for a closure. 436 */ 437 eps = mkstate (SYM_EPSILON); 438 mach = link_machines (eps, mach); 439 440 mkxtion (mach, finalst[mach]); 441 442 return mach; 443 } 444 445 446 /* mkor - make a machine that matches either one of two machines 447 * 448 * synopsis 449 * 450 * new = mkor( first, second ); 451 * 452 * new - a machine which matches either first's pattern or second's 453 * first, second - machines whose patterns are to be or'ed (the | operator) 454 * 455 * note that first and second are both destroyed by the operation 456 * the code is rather convoluted because an attempt is made to minimize 457 * the number of epsilon states needed 458 */ 459 460 int mkor (int first, int second) 461 { 462 int eps, orend; 463 464 if (first == NIL) 465 return second; 466 467 else if (second == NIL) 468 return first; 469 470 else { 471 /* See comment in mkopt() about why we can't use the first 472 * state of "first" or "second" if they satisfy "FREE_EPSILON". 473 */ 474 eps = mkstate (SYM_EPSILON); 475 476 first = link_machines (eps, first); 477 478 mkxtion (first, second); 479 480 if (SUPER_FREE_EPSILON (finalst[first]) && 481 accptnum[finalst[first]] == NIL) { 482 orend = finalst[first]; 483 mkxtion (finalst[second], orend); 484 } 485 486 else if (SUPER_FREE_EPSILON (finalst[second]) && 487 accptnum[finalst[second]] == NIL) { 488 orend = finalst[second]; 489 mkxtion (finalst[first], orend); 490 } 491 492 else { 493 eps = mkstate (SYM_EPSILON); 494 495 first = link_machines (first, eps); 496 orend = finalst[first]; 497 498 mkxtion (finalst[second], orend); 499 } 500 } 501 502 finalst[first] = orend; 503 return first; 504 } 505 506 507 /* mkposcl - convert a machine into a positive closure 508 * 509 * synopsis 510 * new = mkposcl( state ); 511 * 512 * new - a machine matching the positive closure of "state" 513 */ 514 515 int mkposcl (int state) 516 { 517 int eps; 518 519 if (SUPER_FREE_EPSILON (finalst[state])) { 520 mkxtion (finalst[state], state); 521 return state; 522 } 523 524 else { 525 eps = mkstate (SYM_EPSILON); 526 mkxtion (eps, state); 527 return link_machines (state, eps); 528 } 529 } 530 531 532 /* mkrep - make a replicated machine 533 * 534 * synopsis 535 * new = mkrep( mach, lb, ub ); 536 * 537 * new - a machine that matches whatever "mach" matched from "lb" 538 * number of times to "ub" number of times 539 * 540 * note 541 * if "ub" is INFINITE_REPEAT then "new" matches "lb" or more occurrences of "mach" 542 */ 543 544 int mkrep (int mach, int lb, int ub) 545 { 546 int base_mach, tail, copy, i; 547 548 base_mach = copysingl (mach, lb - 1); 549 550 if (ub == INFINITE_REPEAT) { 551 copy = dupmachine (mach); 552 mach = link_machines (mach, 553 link_machines (base_mach, 554 mkclos (copy))); 555 } 556 557 else { 558 tail = mkstate (SYM_EPSILON); 559 560 for (i = lb; i < ub; ++i) { 561 copy = dupmachine (mach); 562 tail = mkopt (link_machines (copy, tail)); 563 } 564 565 mach = 566 link_machines (mach, 567 link_machines (base_mach, tail)); 568 } 569 570 return mach; 571 } 572 573 574 /* mkstate - create a state with a transition on a given symbol 575 * 576 * synopsis 577 * 578 * state = mkstate( sym ); 579 * 580 * state - a new state matching sym 581 * sym - the symbol the new state is to have an out-transition on 582 * 583 * note that this routine makes new states in ascending order through the 584 * state array (and increments LASTNFA accordingly). The routine DUPMACHINE 585 * relies on machines being made in ascending order and that they are 586 * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge 587 * that it admittedly is) 588 */ 589 590 int mkstate (int sym) 591 { 592 if (++lastnfa >= current_mns) { 593 if ((current_mns += MNS_INCREMENT) >= maximum_mns) 594 lerr(_ 595 ("input rules are too complicated (>= %d NFA states)"), 596 current_mns); 597 598 ++num_reallocs; 599 600 firstst = reallocate_integer_array (firstst, current_mns); 601 lastst = reallocate_integer_array (lastst, current_mns); 602 finalst = reallocate_integer_array (finalst, current_mns); 603 transchar = 604 reallocate_integer_array (transchar, current_mns); 605 trans1 = reallocate_integer_array (trans1, current_mns); 606 trans2 = reallocate_integer_array (trans2, current_mns); 607 accptnum = 608 reallocate_integer_array (accptnum, current_mns); 609 assoc_rule = 610 reallocate_integer_array (assoc_rule, current_mns); 611 state_type = 612 reallocate_integer_array (state_type, current_mns); 613 } 614 615 firstst[lastnfa] = lastnfa; 616 finalst[lastnfa] = lastnfa; 617 lastst[lastnfa] = lastnfa; 618 transchar[lastnfa] = sym; 619 trans1[lastnfa] = NO_TRANSITION; 620 trans2[lastnfa] = NO_TRANSITION; 621 accptnum[lastnfa] = NIL; 622 assoc_rule[lastnfa] = num_rules; 623 state_type[lastnfa] = current_state_type; 624 625 /* Fix up equivalence classes base on this transition. Note that any 626 * character which has its own transition gets its own equivalence 627 * class. Thus only characters which are only in character classes 628 * have a chance at being in the same equivalence class. E.g. "a|b" 629 * puts 'a' and 'b' into two different equivalence classes. "[ab]" 630 * puts them in the same equivalence class (barring other differences 631 * elsewhere in the input). 632 */ 633 634 if (sym < 0) { 635 /* We don't have to update the equivalence classes since 636 * that was already done when the ccl was created for the 637 * first time. 638 */ 639 } 640 641 else if (sym == SYM_EPSILON) 642 ++numeps; 643 644 else { 645 check_char (sym); 646 647 if (useecs) 648 /* Map NUL's to csize. */ 649 mkechar (sym ? sym : csize, nextecm, ecgroup); 650 } 651 652 return lastnfa; 653 } 654 655 656 /* mkxtion - make a transition from one state to another 657 * 658 * synopsis 659 * 660 * mkxtion( statefrom, stateto ); 661 * 662 * statefrom - the state from which the transition is to be made 663 * stateto - the state to which the transition is to be made 664 */ 665 666 void mkxtion (int statefrom, int stateto) 667 { 668 if (trans1[statefrom] == NO_TRANSITION) 669 trans1[statefrom] = stateto; 670 671 else if ((transchar[statefrom] != SYM_EPSILON) || 672 (trans2[statefrom] != NO_TRANSITION)) 673 flexfatal (_("found too many transitions in mkxtion()")); 674 675 else { /* second out-transition for an epsilon state */ 676 ++eps2; 677 trans2[statefrom] = stateto; 678 } 679 } 680 681 /* new_rule - initialize for a new rule */ 682 683 void new_rule (void) 684 { 685 if (++num_rules >= current_max_rules) { 686 ++num_reallocs; 687 current_max_rules += MAX_RULES_INCREMENT; 688 rule_type = reallocate_integer_array (rule_type, 689 current_max_rules); 690 rule_linenum = reallocate_integer_array (rule_linenum, 691 current_max_rules); 692 rule_useful = reallocate_integer_array (rule_useful, 693 current_max_rules); 694 rule_has_nl = reallocate_bool_array (rule_has_nl, 695 current_max_rules); 696 } 697 698 if (num_rules > MAX_RULE) 699 lerr (_("too many rules (> %d)!"), MAX_RULE); 700 701 rule_linenum[num_rules] = linenum; 702 rule_useful[num_rules] = false; 703 rule_has_nl[num_rules] = false; 704 } 705