1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 /* lasciate ogne speranza, voi ch'intrate. */
26
27 #define DEBUG
28
29 #include <ctype.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include "awk.h"
35 #include "awkgram.tab.h"
36
37 #define MAXLIN 22
38
39 #define type(v) (v)->nobj /* badly overloaded here */
40 #define info(v) (v)->ntype /* badly overloaded here */
41 #define left(v) (v)->narg[0]
42 #define right(v) (v)->narg[1]
43 #define parent(v) (v)->nnext
44
45 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46 #define ELEAF case EMPTYRE: /* empty string in regexp */
47 #define UNARY case STAR: case PLUS: case QUEST:
48
49 /* encoding in tree Nodes:
50 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 left is index, right contains value or pointer to value
52 unary (STAR, PLUS, QUEST): left is child, right is null
53 binary (CAT, OR): left and right are children
54 parent contains pointer to parent
55 */
56
57
58 int *setvec;
59 int *tmpset;
60 int maxsetvec = 0;
61
62 int rtok; /* next token in current re */
63 int rlxval;
64 static const uschar *rlxstr;
65 static const uschar *prestr; /* current position in current re */
66 static const uschar *lastre; /* origin of last re */
67 static const uschar *lastatom; /* origin of last Atom */
68 static const uschar *starttok;
69 static const uschar *basestr; /* starts with original, replaced during
70 repetition processing */
71 static const uschar *firstbasestr;
72
73 static int setcnt;
74 static int poscnt;
75
76 const char *patbeg;
77 int patlen;
78
79 #define NFA 128 /* cache this many dynamic fa's */
80 fa *fatab[NFA];
81 int nfatab = 0; /* entries in fatab */
82
83 extern int u8_nextlen(const char *s);
84
85
86 /* utf-8 mechanism:
87
88 For most of Awk, utf-8 strings just "work", since they look like
89 null-terminated sequences of 8-bit bytes.
90
91 Functions like length(), index(), and substr() have to operate
92 in units of utf-8 characters. The u8_* functions in run.c
93 handle this.
94
95 Regular expressions are more complicated, since the basic
96 mechanism of the goto table used 8-bit byte indices into the
97 gototab entries to compute the next state. Unicode is a lot
98 bigger, so the gototab entries are now structs with a character
99 and a next state. These are sorted by code point and binary
100 searched.
101
102 Throughout the RE mechanism in b.c, utf-8 characters are
103 converted to their utf-32 value. This mostly shows up in
104 cclenter, which expands character class ranges like a-z and now
105 alpha-omega. The size of a gototab array is still about 256.
106 This should be dynamic, but for now things work ok for a single
107 code page of Unicode, which is the most likely case.
108
109 The code changes are localized in run.c and b.c. I have added a
110 handful of functions to somewhat better hide the implementation,
111 but a lot more could be done.
112
113 */
114
115 static int entry_cmp(const void *l, const void *r);
116 static int get_gototab(fa*, int, int);
117 static int set_gototab(fa*, int, int, int);
118 static void clear_gototab(fa*, int);
119 extern int u8_rune(int *, const char *);
120
121 static int *
intalloc(size_t n,const char * f)122 intalloc(size_t n, const char *f)
123 {
124 int *p = (int *) calloc(n, sizeof(int));
125 if (p == NULL)
126 overflo(f);
127 return p;
128 }
129
130 static void
resizesetvec(const char * f)131 resizesetvec(const char *f)
132 {
133 if (maxsetvec == 0)
134 maxsetvec = MAXLIN;
135 else
136 maxsetvec *= 4;
137 setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
138 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
139 if (setvec == NULL || tmpset == NULL)
140 overflo(f);
141 }
142
143 static void
resize_state(fa * f,int state)144 resize_state(fa *f, int state)
145 {
146 gtt *p;
147 uschar *p2;
148 int **p3;
149 int i, new_count;
150
151 if (++state < f->state_count)
152 return;
153
154 new_count = state + 10; /* needs to be tuned */
155
156 p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
157 if (p == NULL)
158 goto out;
159 f->gototab = p;
160
161 p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
162 if (p2 == NULL)
163 goto out;
164 f->out = p2;
165
166 p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
167 if (p3 == NULL)
168 goto out;
169 f->posns = p3;
170
171 for (i = f->state_count; i < new_count; ++i) {
172 f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
173 if (f->gototab[i].entries == NULL)
174 goto out;
175 f->gototab[i].allocated = NCHARS;
176 f->gototab[i].inuse = 0;
177 f->out[i] = 0;
178 f->posns[i] = NULL;
179 }
180 f->state_count = new_count;
181 return;
182 out:
183 overflo(__func__);
184 }
185
makedfa(const char * s,bool anchor)186 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
187 {
188 int i, use, nuse;
189 fa *pfa;
190 static int now = 1;
191
192 if (setvec == NULL) { /* first time through any RE */
193 resizesetvec(__func__);
194 }
195
196 if (compile_time != RUNNING) /* a constant for sure */
197 return mkdfa(s, anchor);
198 for (i = 0; i < nfatab; i++) /* is it there already? */
199 if (fatab[i]->anchor == anchor
200 && strcmp((const char *) fatab[i]->restr, s) == 0) {
201 fatab[i]->use = now++;
202 return fatab[i];
203 }
204 pfa = mkdfa(s, anchor);
205 if (nfatab < NFA) { /* room for another */
206 fatab[nfatab] = pfa;
207 fatab[nfatab]->use = now++;
208 nfatab++;
209 return pfa;
210 }
211 use = fatab[0]->use; /* replace least-recently used */
212 nuse = 0;
213 for (i = 1; i < nfatab; i++)
214 if (fatab[i]->use < use) {
215 use = fatab[i]->use;
216 nuse = i;
217 }
218 freefa(fatab[nuse]);
219 fatab[nuse] = pfa;
220 pfa->use = now++;
221 return pfa;
222 }
223
mkdfa(const char * s,bool anchor)224 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
225 /* anchor = true for anchored matches, else false */
226 {
227 Node *p, *p1;
228 fa *f;
229
230 firstbasestr = (const uschar *) s;
231 basestr = firstbasestr;
232 p = reparse(s);
233 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
234 /* put ALL STAR in front of reg. exp. */
235 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
236 /* put FINAL after reg. exp. */
237
238 poscnt = 0;
239 penter(p1); /* enter parent pointers and leaf indices */
240 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
241 overflo(__func__);
242 f->accept = poscnt-1; /* penter has computed number of positions in re */
243 cfoll(f, p1); /* set up follow sets */
244 freetr(p1);
245 resize_state(f, 1);
246 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
247 f->posns[1] = intalloc(1, __func__);
248 *f->posns[1] = 0;
249 f->initstat = makeinit(f, anchor);
250 f->anchor = anchor;
251 f->restr = (uschar *) tostring(s);
252 if (firstbasestr != basestr) {
253 if (basestr)
254 xfree(basestr);
255 }
256 return f;
257 }
258
makeinit(fa * f,bool anchor)259 int makeinit(fa *f, bool anchor)
260 {
261 int i, k;
262
263 f->curstat = 2;
264 f->out[2] = 0;
265 k = *(f->re[0].lfollow);
266 xfree(f->posns[2]);
267 f->posns[2] = intalloc(k + 1, __func__);
268 for (i = 0; i <= k; i++) {
269 (f->posns[2])[i] = (f->re[0].lfollow)[i];
270 }
271 if ((f->posns[2])[1] == f->accept)
272 f->out[2] = 1;
273 clear_gototab(f, 2);
274 f->curstat = cgoto(f, 2, HAT);
275 if (anchor) {
276 *f->posns[2] = k-1; /* leave out position 0 */
277 for (i = 0; i < k; i++) {
278 (f->posns[0])[i] = (f->posns[2])[i];
279 }
280
281 f->out[0] = f->out[2];
282 if (f->curstat != 2)
283 --(*f->posns[f->curstat]);
284 }
285 return f->curstat;
286 }
287
penter(Node * p)288 void penter(Node *p) /* set up parent pointers and leaf indices */
289 {
290 switch (type(p)) {
291 ELEAF
292 LEAF
293 info(p) = poscnt;
294 poscnt++;
295 break;
296 UNARY
297 penter(left(p));
298 parent(left(p)) = p;
299 break;
300 case CAT:
301 case OR:
302 penter(left(p));
303 penter(right(p));
304 parent(left(p)) = p;
305 parent(right(p)) = p;
306 break;
307 case ZERO:
308 break;
309 default: /* can't happen */
310 FATAL("can't happen: unknown type %d in penter", type(p));
311 break;
312 }
313 }
314
freetr(Node * p)315 void freetr(Node *p) /* free parse tree */
316 {
317 switch (type(p)) {
318 ELEAF
319 LEAF
320 xfree(p);
321 break;
322 UNARY
323 case ZERO:
324 freetr(left(p));
325 xfree(p);
326 break;
327 case CAT:
328 case OR:
329 freetr(left(p));
330 freetr(right(p));
331 xfree(p);
332 break;
333 default: /* can't happen */
334 FATAL("can't happen: unknown type %d in freetr", type(p));
335 break;
336 }
337 }
338
339 /* in the parsing of regular expressions, metacharacters like . have */
340 /* to be seen literally; \056 is not a metacharacter. */
341
hexstr(const uschar ** pp,int max)342 int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
343 { /* only pick up one 8-bit byte (2 chars) */
344 const uschar *p;
345 int n = 0;
346 int i;
347
348 for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
349 if (isdigit((int) *p))
350 n = 16 * n + *p - '0';
351 else if (*p >= 'a' && *p <= 'f')
352 n = 16 * n + *p - 'a' + 10;
353 else if (*p >= 'A' && *p <= 'F')
354 n = 16 * n + *p - 'A' + 10;
355 }
356 *pp = p;
357 return n;
358 }
359
360
361
362 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
363
quoted(const uschar ** pp)364 int quoted(const uschar **pp) /* pick up next thing after a \\ */
365 /* and increment *pp */
366 {
367 const uschar *p = *pp;
368 int c;
369
370 /* BUG: should advance by utf-8 char even if makes no sense */
371
372 switch ((c = *p++)) {
373 case 't':
374 c = '\t';
375 break;
376 case 'n':
377 c = '\n';
378 break;
379 case 'f':
380 c = '\f';
381 break;
382 case 'r':
383 c = '\r';
384 break;
385 case 'b':
386 c = '\b';
387 break;
388 case 'v':
389 c = '\v';
390 break;
391 case 'a':
392 c = '\a';
393 break;
394 case '\\':
395 c = '\\';
396 break;
397 case 'x': /* 2 hex digits follow */
398 c = hexstr(&p, 2); /* this adds a null if number is invalid */
399 break;
400 case 'u': /* unicode char number up to 8 hex digits */
401 c = hexstr(&p, 8);
402 break;
403 default:
404 if (isoctdigit(c)) { /* \d \dd \ddd */
405 int n = c - '0';
406 if (isoctdigit(*p)) {
407 n = 8 * n + *p++ - '0';
408 if (isoctdigit(*p))
409 n = 8 * n + *p++ - '0';
410 }
411 c = n;
412 }
413 }
414
415 *pp = p;
416 return c;
417 }
418
cclenter(const char * argp)419 int *cclenter(const char *argp) /* add a character class */
420 {
421 int i, c, c2;
422 int n;
423 const uschar *p = (const uschar *) argp;
424 int *bp, *retp;
425 static int *buf = NULL;
426 static int bufsz = 100;
427
428 if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
429 FATAL("out of space for character class [%.10s...] 1", p);
430 bp = buf;
431 for (i = 0; *p != 0; ) {
432 n = u8_rune(&c, (const char *) p);
433 p += n;
434 if (c == '\\') {
435 c = quoted(&p);
436 } else if (c == '-' && i > 0 && bp[-1] != 0) {
437 if (*p != 0) {
438 c = bp[-1];
439 /* c2 = *p++; */
440 n = u8_rune(&c2, (const char *) p);
441 p += n;
442 if (c2 == '\\')
443 c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
444 if (c > c2) { /* empty; ignore */
445 bp--;
446 i--;
447 continue;
448 }
449 while (c < c2) {
450 if (i >= bufsz) {
451 bufsz *= 2;
452 buf = (int *) realloc(buf, bufsz * sizeof(int));
453 if (buf == NULL)
454 FATAL("out of space for character class [%.10s...] 2", p);
455 bp = buf + i;
456 }
457 *bp++ = ++c;
458 i++;
459 }
460 continue;
461 }
462 }
463 if (i >= bufsz) {
464 bufsz *= 2;
465 buf = (int *) realloc(buf, bufsz * sizeof(int));
466 if (buf == NULL)
467 FATAL("out of space for character class [%.10s...] 2", p);
468 bp = buf + i;
469 }
470 *bp++ = c;
471 i++;
472 }
473 *bp = 0;
474 /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
475 /* xfree(op); BUG: what are we freeing here? */
476 retp = (int *) calloc(bp-buf+1, sizeof(int));
477 for (i = 0; i < bp-buf+1; i++)
478 retp[i] = buf[i];
479 return retp;
480 }
481
overflo(const char * s)482 void overflo(const char *s)
483 {
484 FATAL("regular expression too big: out of space in %.30s...", s);
485 }
486
cfoll(fa * f,Node * v)487 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
488 {
489 int i;
490 int *p;
491
492 switch (type(v)) {
493 ELEAF
494 LEAF
495 f->re[info(v)].ltype = type(v);
496 f->re[info(v)].lval.np = right(v);
497 while (f->accept >= maxsetvec) { /* guessing here! */
498 resizesetvec(__func__);
499 }
500 for (i = 0; i <= f->accept; i++)
501 setvec[i] = 0;
502 setcnt = 0;
503 follow(v); /* computes setvec and setcnt */
504 p = intalloc(setcnt + 1, __func__);
505 f->re[info(v)].lfollow = p;
506 *p = setcnt;
507 for (i = f->accept; i >= 0; i--)
508 if (setvec[i] == 1)
509 *++p = i;
510 break;
511 UNARY
512 cfoll(f,left(v));
513 break;
514 case CAT:
515 case OR:
516 cfoll(f,left(v));
517 cfoll(f,right(v));
518 break;
519 case ZERO:
520 break;
521 default: /* can't happen */
522 FATAL("can't happen: unknown type %d in cfoll", type(v));
523 }
524 }
525
first(Node * p)526 int first(Node *p) /* collects initially active leaves of p into setvec */
527 /* returns 0 if p matches empty string */
528 {
529 int b, lp;
530
531 switch (type(p)) {
532 ELEAF
533 LEAF
534 lp = info(p); /* look for high-water mark of subscripts */
535 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
536 resizesetvec(__func__);
537 }
538 if (type(p) == EMPTYRE) {
539 setvec[lp] = 0;
540 return(0);
541 }
542 if (setvec[lp] != 1) {
543 setvec[lp] = 1;
544 setcnt++;
545 }
546 if (type(p) == CCL && (*(int *) right(p)) == 0)
547 return(0); /* empty CCL */
548 return(1);
549 case PLUS:
550 if (first(left(p)) == 0)
551 return(0);
552 return(1);
553 case STAR:
554 case QUEST:
555 first(left(p));
556 return(0);
557 case CAT:
558 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
559 return(1);
560 case OR:
561 b = first(right(p));
562 if (first(left(p)) == 0 || b == 0) return(0);
563 return(1);
564 case ZERO:
565 return 0;
566 }
567 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
568 return(-1);
569 }
570
follow(Node * v)571 void follow(Node *v) /* collects leaves that can follow v into setvec */
572 {
573 Node *p;
574
575 if (type(v) == FINAL)
576 return;
577 p = parent(v);
578 switch (type(p)) {
579 case STAR:
580 case PLUS:
581 first(v);
582 follow(p);
583 return;
584
585 case OR:
586 case QUEST:
587 follow(p);
588 return;
589
590 case CAT:
591 if (v == left(p)) { /* v is left child of p */
592 if (first(right(p)) == 0) {
593 follow(p);
594 return;
595 }
596 } else /* v is right child */
597 follow(p);
598 return;
599 }
600 }
601
member(int c,int * sarg)602 int member(int c, int *sarg) /* is c in s? */
603 {
604 int *s = (int *) sarg;
605
606 while (*s)
607 if (c == *s++)
608 return(1);
609 return(0);
610 }
611
resize_gototab(fa * f,int state)612 static void resize_gototab(fa *f, int state)
613 {
614 size_t new_size = f->gototab[state].allocated * 2;
615 gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
616 if (p == NULL)
617 overflo(__func__);
618
619 // need to initialized the new memory to zero
620 size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size
621 memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out
622
623 f->gototab[state].allocated = new_size; // update gototab info
624 f->gototab[state].entries = p;
625 }
626
get_gototab(fa * f,int state,int ch)627 static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
628 {
629 gtte key;
630 gtte *item;
631
632 key.ch = ch;
633 key.state = 0; /* irrelevant */
634 item = (gtte *) bsearch(& key, f->gototab[state].entries,
635 f->gototab[state].inuse, sizeof(gtte),
636 entry_cmp);
637
638 if (item == NULL)
639 return 0;
640 else
641 return item->state;
642 }
643
entry_cmp(const void * l,const void * r)644 static int entry_cmp(const void *l, const void *r)
645 {
646 const gtte *left, *right;
647
648 left = (const gtte *) l;
649 right = (const gtte *) r;
650
651 return left->ch - right->ch;
652 }
653
set_gototab(fa * f,int state,int ch,int val)654 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
655 {
656 if (f->gototab[state].inuse == 0) {
657 f->gototab[state].entries[0].ch = ch;
658 f->gototab[state].entries[0].state = val;
659 f->gototab[state].inuse++;
660 return val;
661 } else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
662 // not seen yet, insert and return
663 gtt *tab = & f->gototab[state];
664 if (tab->inuse + 1 >= tab->allocated)
665 resize_gototab(f, state);
666
667 f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
668 f->gototab[state].entries[f->gototab[state].inuse].state = val;
669 f->gototab[state].inuse++;
670 return val;
671 } else {
672 // maybe we have it, maybe we don't
673 gtte key;
674 gtte *item;
675
676 key.ch = ch;
677 key.state = 0; /* irrelevant */
678 item = (gtte *) bsearch(& key, f->gototab[state].entries,
679 f->gototab[state].inuse, sizeof(gtte),
680 entry_cmp);
681
682 if (item != NULL) {
683 // we have it, update state and return
684 item->state = val;
685 return item->state;
686 }
687 // otherwise, fall through to insert and reallocate.
688 }
689
690 gtt *tab = & f->gototab[state];
691 if (tab->inuse + 1 >= tab->allocated)
692 resize_gototab(f, state);
693 f->gototab[state].entries[tab->inuse].ch = ch;
694 f->gototab[state].entries[tab->inuse].state = val;
695 ++tab->inuse;
696
697 qsort(f->gototab[state].entries,
698 f->gototab[state].inuse, sizeof(gtte), entry_cmp);
699
700 return val; /* not used anywhere at the moment */
701 }
702
clear_gototab(fa * f,int state)703 static void clear_gototab(fa *f, int state)
704 {
705 memset(f->gototab[state].entries, 0,
706 f->gototab[state].allocated * sizeof(gtte));
707 f->gototab[state].inuse = 0;
708 }
709
match(fa * f,const char * p0)710 int match(fa *f, const char *p0) /* shortest match ? */
711 {
712 int s, ns;
713 int n;
714 int rune;
715 const uschar *p = (const uschar *) p0;
716
717 /* return pmatch(f, p0); does it matter whether longest or shortest? */
718
719 s = f->initstat;
720 assert (s < f->state_count);
721
722 if (f->out[s])
723 return(1);
724 do {
725 /* assert(*p < NCHARS); */
726 n = u8_rune(&rune, (const char *) p);
727 if ((ns = get_gototab(f, s, rune)) != 0)
728 s = ns;
729 else
730 s = cgoto(f, s, rune);
731 if (f->out[s])
732 return(1);
733 if (*p == 0)
734 break;
735 p += n;
736 } while (1); /* was *p++ != 0 */
737 return(0);
738 }
739
pmatch(fa * f,const char * p0)740 int pmatch(fa *f, const char *p0) /* longest match, for sub */
741 {
742 int s, ns;
743 int n;
744 int rune;
745 const uschar *p = (const uschar *) p0;
746 const uschar *q;
747
748 s = f->initstat;
749 assert(s < f->state_count);
750
751 patbeg = (const char *)p;
752 patlen = -1;
753 do {
754 q = p;
755 do {
756 if (f->out[s]) /* final state */
757 patlen = q-p;
758 /* assert(*q < NCHARS); */
759 n = u8_rune(&rune, (const char *) q);
760 if ((ns = get_gototab(f, s, rune)) != 0)
761 s = ns;
762 else
763 s = cgoto(f, s, rune);
764
765 assert(s < f->state_count);
766
767 if (s == 1) { /* no transition */
768 if (patlen >= 0) {
769 patbeg = (const char *) p;
770 return(1);
771 }
772 else
773 goto nextin; /* no match */
774 }
775 if (*q == 0)
776 break;
777 q += n;
778 } while (1);
779 q++; /* was *q++ */
780 if (f->out[s])
781 patlen = q-p-1; /* don't count $ */
782 if (patlen >= 0) {
783 patbeg = (const char *) p;
784 return(1);
785 }
786 nextin:
787 s = 2;
788 if (*p == 0)
789 break;
790 n = u8_rune(&rune, (const char *) p);
791 p += n;
792 } while (1); /* was *p++ */
793 return (0);
794 }
795
nematch(fa * f,const char * p0)796 int nematch(fa *f, const char *p0) /* non-empty match, for sub */
797 {
798 int s, ns;
799 int n;
800 int rune;
801 const uschar *p = (const uschar *) p0;
802 const uschar *q;
803
804 s = f->initstat;
805 assert(s < f->state_count);
806
807 patbeg = (const char *)p;
808 patlen = -1;
809 while (*p) {
810 q = p;
811 do {
812 if (f->out[s]) /* final state */
813 patlen = q-p;
814 /* assert(*q < NCHARS); */
815 n = u8_rune(&rune, (const char *) q);
816 if ((ns = get_gototab(f, s, rune)) != 0)
817 s = ns;
818 else
819 s = cgoto(f, s, rune);
820 if (s == 1) { /* no transition */
821 if (patlen > 0) {
822 patbeg = (const char *) p;
823 return(1);
824 } else
825 goto nnextin; /* no nonempty match */
826 }
827 if (*q == 0)
828 break;
829 q += n;
830 } while (1);
831 q++;
832 if (f->out[s])
833 patlen = q-p-1; /* don't count $ */
834 if (patlen > 0 ) {
835 patbeg = (const char *) p;
836 return(1);
837 }
838 nnextin:
839 s = 2;
840 p++;
841 }
842 return (0);
843 }
844
845
846 /*
847 * NAME
848 * fnematch
849 *
850 * DESCRIPTION
851 * A stream-fed version of nematch which transfers characters to a
852 * null-terminated buffer. All characters up to and including the last
853 * character of the matching text or EOF are placed in the buffer. If
854 * a match is found, patbeg and patlen are set appropriately.
855 *
856 * RETURN VALUES
857 * false No match found.
858 * true Match found.
859 */
860
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)861 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
862 {
863 char *i, *j, *k, *buf = *pbuf;
864 int bufsize = *pbufsize;
865 int c, n, ns, s;
866
867 s = pfa->initstat;
868 patlen = 0;
869
870 /*
871 * buf <= i <= j <= k <= buf+bufsize
872 *
873 * i: origin of active substring
874 * j: current character
875 * k: destination of the next getc
876 */
877
878 i = j = k = buf;
879
880 do {
881 /*
882 * Call u8_rune with at least awk_mb_cur_max ahead in
883 * the buffer until EOF interferes.
884 */
885 if (k - j < (int)awk_mb_cur_max) {
886 if (k + awk_mb_cur_max > buf + bufsize) {
887 char *obuf = buf;
888 adjbuf((char **) &buf, &bufsize,
889 bufsize + awk_mb_cur_max,
890 quantum, 0, "fnematch");
891
892 /* buf resized, maybe moved. update pointers */
893 *pbufsize = bufsize;
894 if (obuf != buf) {
895 i = buf + (i - obuf);
896 j = buf + (j - obuf);
897 k = buf + (k - obuf);
898 *pbuf = buf;
899 if (patlen)
900 patbeg = buf + (patbeg - obuf);
901 }
902 }
903 for (n = awk_mb_cur_max ; n > 0; n--) {
904 *k++ = (c = getc(f)) != EOF ? c : 0;
905 if (c == EOF) {
906 if (ferror(f))
907 FATAL("fnematch: getc error");
908 break;
909 }
910 }
911 }
912
913 j += u8_rune(&c, j);
914
915 if ((ns = get_gototab(pfa, s, c)) != 0)
916 s = ns;
917 else
918 s = cgoto(pfa, s, c);
919
920 if (pfa->out[s]) { /* final state */
921 patbeg = i;
922 patlen = j - i;
923 if (c == 0) /* don't count $ */
924 patlen--;
925 }
926
927 if (c && s != 1)
928 continue; /* origin i still viable, next j */
929 if (patlen)
930 break; /* best match found */
931
932 /* no match at origin i, next i and start over */
933 i += u8_rune(&c, i);
934 if (c == 0)
935 break; /* no match */
936 j = i;
937 s = 2;
938 } while (1);
939
940 if (patlen) {
941 /*
942 * Under no circumstances is the last character fed to
943 * the automaton part of the match. It is EOF's nullbyte,
944 * or it sent the automaton into a state with no further
945 * transitions available (s==1), or both. Room for a
946 * terminating nullbyte is guaranteed.
947 *
948 * ungetc any chars after the end of matching text
949 * (except for EOF's nullbyte, if present) and null
950 * terminate the buffer.
951 */
952 do
953 if (*--k && ungetc(*k, f) == EOF)
954 FATAL("unable to ungetc '%c'", *k);
955 while (k > patbeg + patlen);
956 *k = '\0';
957 return true;
958 }
959 else
960 return false;
961 }
962
reparse(const char * p)963 Node *reparse(const char *p) /* parses regular expression pointed to by p */
964 { /* uses relex() to scan regular expression */
965 Node *np;
966
967 DPRINTF("reparse <%s>\n", p);
968 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
969 rtok = relex();
970 /* GNU compatibility: an empty regexp matches anything */
971 if (rtok == '\0') {
972 /* FATAL("empty regular expression"); previous */
973 return(op2(EMPTYRE, NIL, NIL));
974 }
975 np = regexp();
976 if (rtok != '\0')
977 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
978 return(np);
979 }
980
regexp(void)981 Node *regexp(void) /* top-level parse of reg expr */
982 {
983 return (alt(concat(primary())));
984 }
985
primary(void)986 Node *primary(void)
987 {
988 Node *np;
989 int savelastatom;
990
991 switch (rtok) {
992 case CHAR:
993 lastatom = starttok;
994 np = op2(CHAR, NIL, itonp(rlxval));
995 rtok = relex();
996 return (unary(np));
997 case ALL:
998 rtok = relex();
999 return (unary(op2(ALL, NIL, NIL)));
1000 case EMPTYRE:
1001 rtok = relex();
1002 return (unary(op2(EMPTYRE, NIL, NIL)));
1003 case DOT:
1004 lastatom = starttok;
1005 rtok = relex();
1006 return (unary(op2(DOT, NIL, NIL)));
1007 case CCL:
1008 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1009 lastatom = starttok;
1010 rtok = relex();
1011 return (unary(np));
1012 case NCCL:
1013 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1014 lastatom = starttok;
1015 rtok = relex();
1016 return (unary(np));
1017 case '^':
1018 rtok = relex();
1019 return (unary(op2(CHAR, NIL, itonp(HAT))));
1020 case '$':
1021 rtok = relex();
1022 return (unary(op2(CHAR, NIL, NIL)));
1023 case '(':
1024 lastatom = starttok;
1025 savelastatom = starttok - basestr; /* Retain over recursion */
1026 rtok = relex();
1027 if (rtok == ')') { /* special pleading for () */
1028 rtok = relex();
1029 return unary(op2(CCL, NIL, (Node *) cclenter("")));
1030 }
1031 np = regexp();
1032 if (rtok == ')') {
1033 lastatom = basestr + savelastatom; /* Restore */
1034 rtok = relex();
1035 return (unary(np));
1036 }
1037 else
1038 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1039 default:
1040 FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1041 }
1042 return 0; /*NOTREACHED*/
1043 }
1044
concat(Node * np)1045 Node *concat(Node *np)
1046 {
1047 switch (rtok) {
1048 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1049 return (concat(op2(CAT, np, primary())));
1050 case EMPTYRE:
1051 rtok = relex();
1052 return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1053 primary())));
1054 }
1055 return (np);
1056 }
1057
alt(Node * np)1058 Node *alt(Node *np)
1059 {
1060 if (rtok == OR) {
1061 rtok = relex();
1062 return (alt(op2(OR, np, concat(primary()))));
1063 }
1064 return (np);
1065 }
1066
unary(Node * np)1067 Node *unary(Node *np)
1068 {
1069 switch (rtok) {
1070 case STAR:
1071 rtok = relex();
1072 return (unary(op2(STAR, np, NIL)));
1073 case PLUS:
1074 rtok = relex();
1075 return (unary(op2(PLUS, np, NIL)));
1076 case QUEST:
1077 rtok = relex();
1078 return (unary(op2(QUEST, np, NIL)));
1079 case ZERO:
1080 rtok = relex();
1081 return (unary(op2(ZERO, np, NIL)));
1082 default:
1083 return (np);
1084 }
1085 }
1086
1087 /*
1088 * Character class definitions conformant to the POSIX locale as
1089 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1090 * and operating character sets are both ASCII (ISO646) or supersets
1091 * thereof.
1092 *
1093 * Note that to avoid overflowing the temporary buffer used in
1094 * relex(), the expanded character class (prior to range expansion)
1095 * must be less than twice the size of their full name.
1096 */
1097
1098 /* Because isblank doesn't show up in any of the header files on any
1099 * system i use, it's defined here. if some other locale has a richer
1100 * definition of "blank", define HAS_ISBLANK and provide your own
1101 * version.
1102 * the parentheses here are an attempt to find a path through the maze
1103 * of macro definition and/or function and/or version provided. thanks
1104 * to nelson beebe for the suggestion; let's see if it works everywhere.
1105 */
1106
1107 /* #define HAS_ISBLANK */
1108 #ifndef HAS_ISBLANK
1109
1110 int (xisblank)(int c)
1111 {
1112 return c==' ' || c=='\t';
1113 }
1114
1115 #endif
1116
1117 static const struct charclass {
1118 const char *cc_name;
1119 int cc_namelen;
1120 int (*cc_func)(int);
1121 } charclasses[] = {
1122 { "alnum", 5, isalnum },
1123 { "alpha", 5, isalpha },
1124 #ifndef HAS_ISBLANK
1125 { "blank", 5, xisblank },
1126 #else
1127 { "blank", 5, isblank },
1128 #endif
1129 { "cntrl", 5, iscntrl },
1130 { "digit", 5, isdigit },
1131 { "graph", 5, isgraph },
1132 { "lower", 5, islower },
1133 { "print", 5, isprint },
1134 { "punct", 5, ispunct },
1135 { "space", 5, isspace },
1136 { "upper", 5, isupper },
1137 { "xdigit", 6, isxdigit },
1138 { NULL, 0, NULL },
1139 };
1140
1141 #define REPEAT_SIMPLE 0
1142 #define REPEAT_PLUS_APPENDED 1
1143 #define REPEAT_WITH_Q 2
1144 #define REPEAT_ZERO 3
1145
1146 static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)1147 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1148 int atomlen, int firstnum, int secondnum, int special_case)
1149 {
1150 int i, j;
1151 uschar *buf = 0;
1152 int ret = 1;
1153 int init_q = (firstnum == 0); /* first added char will be ? */
1154 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
1155 int prefix_length = reptok - basestr; /* prefix includes first rep */
1156 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1157 int size = prefix_length + suffix_length;
1158
1159 if (firstnum > 1) { /* add room for reps 2 through firstnum */
1160 size += atomlen*(firstnum-1);
1161 }
1162
1163 /* Adjust size of buffer for special cases */
1164 if (special_case == REPEAT_PLUS_APPENDED) {
1165 size++; /* for the final + */
1166 } else if (special_case == REPEAT_WITH_Q) {
1167 size += init_q + (atomlen+1)* (n_q_reps-init_q);
1168 } else if (special_case == REPEAT_ZERO) {
1169 size += 2; /* just a null ERE: () */
1170 }
1171 if ((buf = (uschar *) malloc(size + 1)) == NULL)
1172 FATAL("out of space in reg expr %.10s..", lastre);
1173 memcpy(buf, basestr, prefix_length); /* copy prefix */
1174 j = prefix_length;
1175 if (special_case == REPEAT_ZERO) {
1176 j -= atomlen;
1177 buf[j++] = '(';
1178 buf[j++] = ')';
1179 }
1180 for (i = 1; i < firstnum; i++) { /* copy x reps */
1181 memcpy(&buf[j], atom, atomlen);
1182 j += atomlen;
1183 }
1184 if (special_case == REPEAT_PLUS_APPENDED) {
1185 buf[j++] = '+';
1186 } else if (special_case == REPEAT_WITH_Q) {
1187 if (init_q)
1188 buf[j++] = '?';
1189 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1190 memcpy(&buf[j], atom, atomlen);
1191 j += atomlen;
1192 buf[j++] = '?';
1193 }
1194 }
1195 memcpy(&buf[j], reptok+reptoklen, suffix_length);
1196 j += suffix_length;
1197 buf[j] = '\0';
1198 /* free old basestr */
1199 if (firstbasestr != basestr) {
1200 if (basestr)
1201 xfree(basestr);
1202 }
1203 basestr = buf;
1204 prestr = buf + prefix_length;
1205 if (special_case == REPEAT_ZERO) {
1206 prestr -= atomlen;
1207 ret++;
1208 }
1209 return ret;
1210 }
1211
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)1212 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1213 int atomlen, int firstnum, int secondnum)
1214 {
1215 /*
1216 In general, the repetition specifier or "bound" is replaced here
1217 by an equivalent ERE string, repeating the immediately previous atom
1218 and appending ? and + as needed. Note that the first copy of the
1219 atom is left in place, except in the special_case of a zero-repeat
1220 (i.e., {0}).
1221 */
1222 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1223 if (firstnum < 2) {
1224 /* 0 or 1: should be handled before you get here */
1225 FATAL("internal error");
1226 } else {
1227 return replace_repeat(reptok, reptoklen, atom, atomlen,
1228 firstnum, secondnum, REPEAT_PLUS_APPENDED);
1229 }
1230 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1231 if (firstnum == 0) { /* {0} or {0,0} */
1232 /* This case is unusual because the resulting
1233 replacement string might actually be SMALLER than
1234 the original ERE */
1235 return replace_repeat(reptok, reptoklen, atom, atomlen,
1236 firstnum, secondnum, REPEAT_ZERO);
1237 } else { /* (firstnum >= 1) */
1238 return replace_repeat(reptok, reptoklen, atom, atomlen,
1239 firstnum, secondnum, REPEAT_SIMPLE);
1240 }
1241 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1242 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1243 return replace_repeat(reptok, reptoklen, atom, atomlen,
1244 firstnum, secondnum, REPEAT_WITH_Q);
1245 } else { /* Error - shouldn't be here (n>m) */
1246 FATAL("internal error");
1247 }
1248 return 0;
1249 }
1250
relex(void)1251 int relex(void) /* lexical analyzer for reparse */
1252 {
1253 int c, n;
1254 int cflag;
1255 static uschar *buf = NULL;
1256 static int bufsz = 100;
1257 uschar *bp;
1258 const struct charclass *cc;
1259 int i;
1260 int num, m;
1261 bool commafound, digitfound;
1262 const uschar *startreptok;
1263 static int parens = 0;
1264
1265 rescan:
1266 starttok = prestr;
1267
1268 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1269 prestr += n;
1270 starttok = prestr;
1271 return CHAR;
1272 }
1273
1274 switch (c = *prestr++) {
1275 case '|': return OR;
1276 case '*': return STAR;
1277 case '+': return PLUS;
1278 case '?': return QUEST;
1279 case '.': return DOT;
1280 case '\0': prestr--; return '\0';
1281 case '^':
1282 case '$':
1283 return c;
1284 case '(':
1285 parens++;
1286 return c;
1287 case ')':
1288 if (parens) {
1289 parens--;
1290 return c;
1291 }
1292 /* unmatched close parenthesis; per POSIX, treat as literal */
1293 rlxval = c;
1294 return CHAR;
1295 case '\\':
1296 rlxval = quoted(&prestr);
1297 return CHAR;
1298 default:
1299 rlxval = c;
1300 return CHAR;
1301 case '[':
1302 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1303 FATAL("out of space in reg expr %.10s..", lastre);
1304 bp = buf;
1305 if (*prestr == '^') {
1306 cflag = 1;
1307 prestr++;
1308 }
1309 else
1310 cflag = 0;
1311 n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
1312 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1313 FATAL("out of space for reg expr %.10s...", lastre);
1314 for (; ; ) {
1315 if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1316 for (i = 0; i < n; i++)
1317 *bp++ = *prestr++;
1318 continue;
1319 }
1320 if ((c = *prestr++) == '\\') {
1321 *bp++ = '\\';
1322 if ((c = *prestr++) == '\0')
1323 FATAL("nonterminated character class %.20s...", lastre);
1324 *bp++ = c;
1325 /* } else if (c == '\n') { */
1326 /* FATAL("newline in character class %.20s...", lastre); */
1327 } else if (c == '[' && *prestr == ':') {
1328 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1329 for (cc = charclasses; cc->cc_name; cc++)
1330 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1331 break;
1332 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1333 prestr[2 + cc->cc_namelen] == ']') {
1334 prestr += cc->cc_namelen + 3;
1335 /*
1336 * BUG: We begin at 1, instead of 0, since we
1337 * would otherwise prematurely terminate the
1338 * string for classes like [[:cntrl:]]. This
1339 * means that we can't match the NUL character,
1340 * not without first adapting the entire
1341 * program to track each string's length.
1342 */
1343 for (i = 1; i <= UCHAR_MAX; i++) {
1344 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1345 FATAL("out of space for reg expr %.10s...", lastre);
1346 if (cc->cc_func(i)) {
1347 /* escape backslash */
1348 if (i == '\\') {
1349 *bp++ = '\\';
1350 n++;
1351 }
1352
1353 *bp++ = i;
1354 n++;
1355 }
1356 }
1357 } else
1358 *bp++ = c;
1359 } else if (c == '[' && *prestr == '.') {
1360 char collate_char;
1361 prestr++;
1362 collate_char = *prestr++;
1363 if (*prestr == '.' && prestr[1] == ']') {
1364 prestr += 2;
1365 /* Found it: map via locale TBD: for
1366 now, simply return this char. This
1367 is sufficient to pass conformance
1368 test awk.ex 156
1369 */
1370 if (*prestr == ']') {
1371 prestr++;
1372 rlxval = collate_char;
1373 return CHAR;
1374 }
1375 }
1376 } else if (c == '[' && *prestr == '=') {
1377 char equiv_char;
1378 prestr++;
1379 equiv_char = *prestr++;
1380 if (*prestr == '=' && prestr[1] == ']') {
1381 prestr += 2;
1382 /* Found it: map via locale TBD: for now
1383 simply return this char. This is
1384 sufficient to pass conformance test
1385 awk.ex 156
1386 */
1387 if (*prestr == ']') {
1388 prestr++;
1389 rlxval = equiv_char;
1390 return CHAR;
1391 }
1392 }
1393 } else if (c == '\0') {
1394 FATAL("nonterminated character class %.20s", lastre);
1395 } else if (bp == buf) { /* 1st char is special */
1396 *bp++ = c;
1397 } else if (c == ']') {
1398 *bp++ = 0;
1399 rlxstr = (uschar *) tostring((char *) buf);
1400 if (cflag == 0)
1401 return CCL;
1402 else
1403 return NCCL;
1404 } else
1405 *bp++ = c;
1406 }
1407 break;
1408 case '{':
1409 if (isdigit((int) *(prestr))) {
1410 num = 0; /* Process as a repetition */
1411 n = -1; m = -1;
1412 commafound = false;
1413 digitfound = false;
1414 startreptok = prestr-1;
1415 /* Remember start of previous atom here ? */
1416 } else { /* just a { char, not a repetition */
1417 rlxval = c;
1418 return CHAR;
1419 }
1420 for (; ; ) {
1421 if ((c = *prestr++) == '}') {
1422 if (commafound) {
1423 if (digitfound) { /* {n,m} */
1424 m = num;
1425 if (m < n)
1426 FATAL("illegal repetition expression: class %.20s",
1427 lastre);
1428 if (n == 0 && m == 1) {
1429 return QUEST;
1430 }
1431 } else { /* {n,} */
1432 if (n == 0)
1433 return STAR;
1434 else if (n == 1)
1435 return PLUS;
1436 }
1437 } else {
1438 if (digitfound) { /* {n} same as {n,n} */
1439 n = num;
1440 m = num;
1441 } else { /* {} */
1442 FATAL("illegal repetition expression: class %.20s",
1443 lastre);
1444 }
1445 }
1446 if (repeat(starttok, prestr-starttok, lastatom,
1447 startreptok - lastatom, n, m) > 0) {
1448 if (n == 0 && m == 0) {
1449 return ZERO;
1450 }
1451 /* must rescan input for next token */
1452 goto rescan;
1453 }
1454 /* Failed to replace: eat up {...} characters
1455 and treat like just PLUS */
1456 return PLUS;
1457 } else if (c == '\0') {
1458 FATAL("nonterminated character class %.20s",
1459 lastre);
1460 } else if (isdigit(c)) {
1461 num = 10 * num + c - '0';
1462 digitfound = true;
1463 } else if (c == ',') {
1464 if (commafound)
1465 FATAL("illegal repetition expression: class %.20s",
1466 lastre);
1467 /* looking for {n,} or {n,m} */
1468 commafound = true;
1469 n = num;
1470 digitfound = false; /* reset */
1471 num = 0;
1472 } else {
1473 FATAL("illegal repetition expression: class %.20s",
1474 lastre);
1475 }
1476 }
1477 break;
1478 }
1479 }
1480
cgoto(fa * f,int s,int c)1481 int cgoto(fa *f, int s, int c)
1482 {
1483 int *p, *q;
1484 int i, j, k;
1485
1486 /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
1487 while (f->accept >= maxsetvec) { /* guessing here! */
1488 resizesetvec(__func__);
1489 }
1490 for (i = 0; i <= f->accept; i++)
1491 setvec[i] = 0;
1492 setcnt = 0;
1493 resize_state(f, s);
1494 /* compute positions of gototab[s,c] into setvec */
1495 p = f->posns[s];
1496 for (i = 1; i <= *p; i++) {
1497 if ((k = f->re[p[i]].ltype) != FINAL) {
1498 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1499 || (k == DOT && c != 0 && c != HAT)
1500 || (k == ALL && c != 0)
1501 || (k == EMPTYRE && c != 0)
1502 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1503 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1504 q = f->re[p[i]].lfollow;
1505 for (j = 1; j <= *q; j++) {
1506 if (q[j] >= maxsetvec) {
1507 resizesetvec(__func__);
1508 }
1509 if (setvec[q[j]] == 0) {
1510 setcnt++;
1511 setvec[q[j]] = 1;
1512 }
1513 }
1514 }
1515 }
1516 }
1517 /* determine if setvec is a previous state */
1518 tmpset[0] = setcnt;
1519 j = 1;
1520 for (i = f->accept; i >= 0; i--)
1521 if (setvec[i]) {
1522 tmpset[j++] = i;
1523 }
1524 resize_state(f, f->curstat > s ? f->curstat : s);
1525 /* tmpset == previous state? */
1526 for (i = 1; i <= f->curstat; i++) {
1527 p = f->posns[i];
1528 if ((k = tmpset[0]) != p[0])
1529 goto different;
1530 for (j = 1; j <= k; j++)
1531 if (tmpset[j] != p[j])
1532 goto different;
1533 /* setvec is state i */
1534 if (c != HAT)
1535 set_gototab(f, s, c, i);
1536 return i;
1537 different:;
1538 }
1539
1540 /* add tmpset to current set of states */
1541 ++(f->curstat);
1542 resize_state(f, f->curstat);
1543 clear_gototab(f, f->curstat);
1544 xfree(f->posns[f->curstat]);
1545 p = intalloc(setcnt + 1, __func__);
1546
1547 f->posns[f->curstat] = p;
1548 if (c != HAT)
1549 set_gototab(f, s, c, f->curstat);
1550 for (i = 0; i <= setcnt; i++)
1551 p[i] = tmpset[i];
1552 if (setvec[f->accept])
1553 f->out[f->curstat] = 1;
1554 else
1555 f->out[f->curstat] = 0;
1556 return f->curstat;
1557 }
1558
1559
freefa(fa * f)1560 void freefa(fa *f) /* free a finite automaton */
1561 {
1562 int i;
1563
1564 if (f == NULL)
1565 return;
1566 for (i = 0; i < f->state_count; i++)
1567 xfree(f->gototab[i].entries);
1568 xfree(f->gototab);
1569 for (i = 0; i <= f->curstat; i++)
1570 xfree(f->posns[i]);
1571 for (i = 0; i <= f->accept; i++) {
1572 xfree(f->re[i].lfollow);
1573 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1574 xfree(f->re[i].lval.np);
1575 }
1576 xfree(f->restr);
1577 xfree(f->out);
1578 xfree(f->posns);
1579 xfree(f->gototab);
1580 xfree(f);
1581 }
1582