xref: /titanic_44/usr/src/tools/cscope-fast/cgrep.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*	Copyright (c) 1990 AT&T	*/
23 /*	  All Rights Reserved  	*/
24 
25 
26 /*
27  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
28  * Use is subject to license terms.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 /*
34  *	cscope - interactive C symbol or text cross-reference
35  *
36  *	text searching functions
37  */
38 
39 #include <fcntl.h>
40 #include <setjmp.h>
41 #include <stdio.h>
42 #include <string.h>
43 #include <memory.h>
44 #include <ctype.h>
45 #include <unistd.h>
46 #include "library.h"
47 
48 typedef enum {
49 	NO = 0,
50 	YES = 1
51 } BOOL;
52 
53 typedef struct re_bm {
54 	int delta0[256];
55 	int *delta2;
56 	uchar_t *cmap;
57 	uchar_t *bmpat;
58 	int patlen;
59 } re_bm;
60 
61 typedef struct Link {
62 	uchar_t lit;
63 	struct Node *node;
64 	struct Link *next;
65 } Link;
66 
67 typedef struct Node {
68 	short out;
69 	short d;
70 	short shift1;
71 	short shift2;
72 	long id;
73 	struct Node **tab;
74 	Link *alts;
75 	struct Node *fail;
76 } Node;
77 
78 typedef struct re_cw {
79 	int maxdepth, mindepth;
80 	long nodeid;
81 	int step[256];
82 	uchar_t *cmap;
83 	struct Node *root;
84 } re_cw;
85 
86 typedef enum {
87 	/* lit expression types */
88 	Literal, Dot, Charclass, EOP,
89 
90 	/* non-lit expression types */
91 	Cat, Alternate, Star, Plus, Quest,
92 
93 	/* not really expression types, just helping */
94 	Lpar, Rpar, Backslash
95 
96 } Exprtype;
97 
98 typedef int ID;
99 
100 typedef struct Expr {
101 	Exprtype type;	/* Type of expression (in grammar) */
102 	ID id;		/* unique ID of lit expression  */
103 	int lit;	/* Literal character or tag */
104 	int flen;	/* Number of following lit expressions */
105 	ID *follow;	/* Array of IDs of following lit expressions */
106 	struct Expr *l; /* pointer to Left child (or ccl count) */
107 	struct Expr *r; /* pointer to Right child (or ccl mask) */
108 	struct Expr *parent; /* pointer to Parent */
109 } Expr;
110 
111 typedef struct State {
112 	struct State *tab[256];
113 	int cnt; /* number of matched chars on full match, -1 otherwise  */
114 	int npos;	/* number of IDs in position set for this state. */
115 	int pos;	/* index into posbase for beginning of IDs */
116 } State;
117 
118 typedef struct FID {
119 	ID	id;	/* Lit Expression id */
120 	int	fcount; /* Number of Lit exp matches before this one. */
121 } FID;
122 
123 typedef struct Positionset {
124 	int count;	/* Number of lit exps in position set */
125 	ID last;	/* ID of last lit exp in position set */
126 	FID *base;	/* array of MAXID FIDS */
127 			/* 0 means not in position set */
128 			/* -1 means first in position set */
129 			/* n (>0) is ID of prev member of position set. */
130 } Positionset;
131 
132 typedef struct re_re {
133 	FID  *posbase;	/* Array of IDs from all states */
134 	int nposalloc;	/* Allocated size of posbase */
135 	int posnext;	/* Index into free space in posbase */
136 	int posreset;	/* Index into end of IDs for initial state in posbase */
137 	int maxid;	/* Number of (also maximum ID of) lit expressions */
138 	Expr *root;	/* Pointer to root (EOP) expression */
139 	Expr **ptr;	/* Pointer to array of ptrs to lit expressions. */
140 	uchar_t *cmap;	/* Character mapping array */
141 	Positionset firstpos;
142 	Positionset tmp;
143 	int nstates;	/* Number of current states defined */
144 	int statelim;	/* Limit on number of states before flushing */
145 	State *states;	/* Array of states */
146 	State istate;	/* Initial state */
147 } re_re;
148 
149 typedef struct {
150 	uchar_t *cmap;
151 	re_re *re_ptr;
152 	re_bm *bm_ptr;
153 	re_cw *cw_ptr;
154 	BOOL fullmatch;
155 	BOOL (*procfn)();
156 	BOOL (*succfn)();
157 	uchar_t *loc1;
158 	uchar_t *loc2;
159 	uchar_t *expression;
160 } PATTERN;
161 
162 typedef enum {
163 	BEGIN,		/* File is not yet in buffer at all */
164 	MORE,		/* File is partly in buffer */
165 	NO_MORE		/* File has been completely read into buffer */
166 } FILE_STAT;
167 
168 typedef struct {
169 	uchar_t	*prntbuf; /* current line of input from data file */
170 	uchar_t	*newline; /* end of line (real or sentinel \n) */
171 	long	ln;	/* line number */
172 } LINE;
173 
174 #define	NL '\n'
175 
176 #define	CCL_SIZ		32
177 #define	CCL_SET(a, c)	((a)[(c) >> 3] |= bittab[(c) & 07])
178 #define	CCL_CLR(a, c)	((a)[(c) >> 3] &= ~bittab[(c) & 07])
179 #define	CCL_CHK(a, c)	((a)[(c) >> 3] & bittab[(c) & 07])
180 
181 #define	ESIZE (BUFSIZ)
182 #define	MAXBUFSIZE (64*BUFSIZ)
183 
184 #define	MAXMALLOCS	1024
185 #define	MAXLIT	256	/* is plenty big enough */
186 #define	LARGE	MAXBUFSIZE+ESIZE+2
187 
188 #define	CLEAR(r, rps)	(void) memset((char *)(rps)->base, 0, \
189 			    (int)((r)->maxid * sizeof (FID))), \
190 			    (rps)->count = 0, (rps)->last = -1
191 #define	SET(rps, n, cnt) { \
192 	if ((rps)->base[n].id == 0) {\
193 		(rps)->count++;\
194 		(rps)->base[n].fcount = (cnt);\
195 		(rps)->base[n].id = (rps)->last;\
196 		(rps)->last = (n);\
197 	} else if ((cnt) > (rps)->base[n].fcount) {\
198 		(rps)->base[n].fcount = (cnt);\
199 	}}
200 
201 #define	START	{ _p = tmp; }
202 #define	ADDL(c)	{ if (_p >= &tmp[MAXLIT]) _p--; *_p++ = c; }
203 #define	FINISH	{ ADDL(0) if ((_p-tmp) > bestlen) \
204 		    (void) memmove(best, tmp, bestlen = _p-tmp); }
205 
206 
207 #define	NEW(N)	(froot?(t = froot, froot = froot->next, t->next = NULL, \
208 		    t->node = N, t): newlink(0, N))
209 #define	ADD(N)	if (qtail) qtail = qtail->next = NEW(N); \
210 			else qtail = qhead = NEW(N)
211 #define	DEL()	{ Link *_l = qhead; if ((qhead = qhead->next) == NULL) \
212 			qtail = NULL; _l->next = froot; froot = _l; }
213 
214 static uchar_t	*buffer;
215 static uchar_t	*bufend;
216 static FILE	*output;
217 static char	*format;
218 static char	*file;
219 static int	file_desc;
220 static FILE_STAT file_stat;
221 static PATTERN	match_pattern;
222 static uchar_t	char_map[2][256];
223 static int	iflag;
224 static Exprtype	toktype;
225 static int	toklit, parno, maxid;
226 static uchar_t	tmp[MAXLIT], best[MAXLIT];
227 static uchar_t	*_p;
228 static int	bestlen;
229 static Node	*next_node;
230 static Link	*froot, *next_link;
231 static jmp_buf	env;
232 
233 static int nmalloc;
234 static uchar_t	*mallocs[MAXMALLOCS];
235 
236 static uchar_t	bittab[] = { 1, 2, 4, 8, 16, 32, 64, 128 };
237 
238 #ifdef	DEBUG
239 #define		TRACE(n)	(n < debug)
240 #define		EPRINTSIZE	50000
241 static void spr(int c, int *p, uchar_t *buf);
242 void epr(Expr *e, uchar_t *res);
243 static int debug = 12;
244 #endif
245 
246 static void init_file(LINE *cur_ptr);
247 static void get_line(LINE *cur_ptr, uchar_t *s);
248 static void get_ncount(LINE *cur_ptr, uchar_t *s);
249 static int execute(void);
250 static State *startstate(re_re *r);
251 static State *stateof(re_re *r, Positionset *ps);
252 static State *nextstate(re_re *r, State *s, int a);
253 static State *addstate(re_re *r, Positionset *ps, int cnt);
254 static BOOL match(Expr *e, int a);
255 static BOOL first_lit(Positionset *fpos, Expr *e);
256 static void eptr(re_re *r, Expr *e);
257 static void efollow(re_re *r, Positionset *fpos, Expr *e);
258 static void follow(Positionset *fpos, Expr *e);
259 static void followstate(re_re *r, State *s, int a, Positionset *fpos);
260 static Expr *eall(re_re *r, PATTERN *pat);
261 static Expr *d0(re_re *r, PATTERN *pat);
262 static Expr *d1(re_re *r, PATTERN *pat);
263 static Expr *d2(re_re *r, PATTERN *pat);
264 static Expr *d3(re_re *r, PATTERN *pat);
265 static Expr *newexpr(Exprtype t, int lit, Expr *left, Expr *right);
266 static void lex(re_re *r, PATTERN *pat);
267 static int re_lit(PATTERN *pat, uchar_t **b, uchar_t **e);
268 static void traverse(PATTERN *pat, Expr *e);
269 static int ccl(PATTERN *pat, uchar_t *tab);
270 static BOOL altlist(), word();
271 static BOOL altlist(Expr *e, uchar_t *buf, re_cw *pat);
272 static Node *newnode(re_cw *c, int d);
273 static Link *newlink(uchar_t lit, Node *n);
274 static void fail(Node *root);
275 static void zeroroot(Node *root, Node *n);
276 static void shift(re_cw *c);
277 static void shifttab(Node *n);
278 static void shiftprop(re_cw *c, Node *n);
279 static void delta_2(re_bm *b);
280 static int getstate(re_re *r, Positionset *ps);
281 static void savestate(re_re *r);
282 static void stateinit(re_re *r);
283 static re_bm *re_bmcomp(uchar_t *pb, uchar_t *pe, uchar_t *cmap);
284 static re_cw *re_cwinit(uchar_t *cmap);
285 static re_cw *re_recw(re_re *r, uchar_t *map);
286 static re_re *egprep(PATTERN *pat);
287 static void re_cwadd(re_cw *c, uchar_t *s, uchar_t *e);
288 static void re_cwcomp(re_cw *c);
289 static void eginit(re_re *r);
290 static BOOL re_bmexec(PATTERN *pat, uchar_t *s, uchar_t *e, uchar_t **mb,
291     uchar_t **me);
292 static BOOL re_cwexec(PATTERN *pat, uchar_t *rs, uchar_t *re, uchar_t **mb,
293     uchar_t **me);
294 static BOOL re_reexec(PATTERN *pat, uchar_t *b, uchar_t *e, uchar_t **mb,
295     uchar_t **me);
296 static uchar_t *egmalloc(size_t n);
297 static void fgetfile(LINE *cur_ptr);
298 static void dogre(PATTERN *pat);
299 static BOOL pattern_match(PATTERN *pat, LINE *lptr);
300 static BOOL fixloc(uchar_t **mb, uchar_t **me);
301 static BOOL grepmatch(PATTERN *pat, uchar_t **mb, uchar_t **me);
302 static void err(char *s);
303 
304 static char *message;
305 
306 void
egrepcaseless(int i)307 egrepcaseless(int i)
308 {
309 	iflag = i;	/* simulate "egrep -i" */
310 }
311 
312 char *
egrepinit(char * expression)313 egrepinit(char *expression)
314 {
315 	static int firsttime = 1;
316 	int i;
317 
318 	if (firsttime) {
319 		firsttime = 0;
320 		for (i = 0; i < 256; i++) {
321 			char_map[0][i] = (uchar_t)i;
322 			char_map[1][i] = tolower(i);
323 		}
324 	}
325 	for (i = 0; i < nmalloc; i ++)
326 		free(mallocs[i]);
327 	nmalloc = 0;
328 	message = NULL;
329 
330 	match_pattern.expression = (uchar_t *)expression;
331 	match_pattern.cmap = char_map[iflag];
332 	if (setjmp(env) == 0) {
333 		dogre(&match_pattern);
334 #ifdef	DEBUG
335 		{
336 		PATTERN *p = match_pattern;
337 		if (p->procfn == re_bmexec)
338 			if (!p->fullmatch)
339 				if (p->succfn == re_reexec)
340 					printf("PARTIAL BOYER_MOORE\n");
341 				else
342 					printf("PARTIAL B_M with GREP\n");
343 			else
344 				printf("FULL BOYER_MOORE\n");
345 		else if (p->procfn == re_cwexec)
346 			printf("C_W\n");
347 		else
348 			printf("GENERAL\n");
349 		}
350 #endif
351 	}
352 	return (message);
353 }
354 
355 static void
dogre(PATTERN * pat)356 dogre(PATTERN *pat)
357 {
358 	uchar_t *lb, *le;
359 
360 #ifdef	DEBUG
361 	printf("PATTERN %s\n", pat->expression);
362 #endif
363 	pat->re_ptr = egprep(pat);
364 	bestlen = re_lit(pat, &lb, &le);
365 
366 	if (bestlen && pat->fullmatch) { /* Full Boyer Moore */
367 #ifdef	DEBUG
368 		printf("BESTLEN %d\n", bestlen);
369 		{
370 			uchar_t *p;
371 			for (p = lb; p < le; p++) printf("%c", *p);
372 			printf("\n");
373 		}
374 #endif
375 		pat->bm_ptr = re_bmcomp(lb, le, pat->cmap);
376 		pat->procfn = re_bmexec;
377 		return;
378 	}
379 	if (bestlen > 1) {
380 			/* Partial Boyer Moore */
381 		pat->bm_ptr = re_bmcomp(lb, le, pat->cmap);
382 		pat->procfn = re_bmexec;
383 		pat->fullmatch = NO;
384 	} else {
385 		pat->fullmatch = YES;
386 		if ((pat->cw_ptr = re_recw(pat->re_ptr, pat->cmap)) != NULL) {
387 			pat->procfn = re_cwexec; /* CW */
388 			return;
389 		}
390 	}
391 	/* general egrep regular expression */
392 	pat->succfn = re_reexec;
393 
394 	if (pat->fullmatch) {
395 		pat->procfn = pat->succfn;
396 		pat->succfn = NULL;
397 	}
398 }
399 
400 static BOOL
fixloc(uchar_t ** mb,uchar_t ** me)401 fixloc(uchar_t **mb, uchar_t **me)
402 {
403 	/* Handle match to null string */
404 
405 	while (*me <= *mb)
406 		(*me)++;
407 
408 	if (*(*me - 1) != NL)
409 		while (**me != NL)
410 			(*me)++;
411 
412 	/* Handle match to new-line only */
413 
414 	if (*mb == *me - 1 && **mb == NL) {
415 		(*me)++;
416 	}
417 
418 	/* Handle match including beginning or ending new-line */
419 
420 	if (**mb == NL)
421 		(*mb)++;
422 	if (*(*me - 1) == NL)
423 		(*me)--;
424 	return (YES);
425 }
426 
427 static BOOL
grepmatch(PATTERN * pat,uchar_t ** mb,uchar_t ** me)428 grepmatch(PATTERN *pat, uchar_t **mb, uchar_t **me)
429 {
430 	uchar_t *s, *f;
431 
432 	if (pat->fullmatch)
433 		return (fixloc(mb, me));
434 
435 	for (f = *me - 1; *f != NL; f++) {
436 	}
437 	f++;
438 	for (s = *mb; *s != NL; s--) {
439 	}
440 
441 	if ((*pat->succfn)(pat, s, f, mb, me)) {
442 		return (YES);
443 	} else {
444 		*mb = f;
445 		return (NO);
446 	}
447 }
448 
449 static void
eginit(re_re * r)450 eginit(re_re *r)
451 {
452 	unsigned int n;
453 
454 	r->ptr = (Expr **)egmalloc(r->maxid * sizeof (Expr *));
455 	eptr(r, r->root);
456 	n = r->maxid * sizeof (FID);
457 	r->firstpos.base = (FID *)egmalloc(n);
458 	r->tmp.base = (FID *)egmalloc(n);
459 	CLEAR(r, &r->firstpos);
460 	if (!first_lit(&r->firstpos, r->root->l)) {
461 		/*
462 		 * This expression matches the null string!!!!
463 		 * Add EOP to beginning position set.
464 		 */
465 		SET(&r->firstpos, r->root->id, 0)
466 		/* (void) printf("first of root->l == 0, b=%s\n", b); */
467 	}
468 	stateinit(r);
469 	(void) addstate(r, &r->firstpos, 0);
470 	savestate(r);
471 }
472 
473 static void
eptr(re_re * r,Expr * e)474 eptr(re_re *r, Expr *e)
475 {
476 	if ((e->id < 0) || (e->id >= r->maxid)) {
477 		err("internal error");
478 	}
479 	r->ptr[e->id] = e;
480 	if (e->type != Charclass) {
481 		if (e->l) eptr(r, e->l);
482 		if (e->r) eptr(r, e->r);
483 	}
484 }
485 
486 static BOOL
re_reexec(PATTERN * pat,uchar_t * b,uchar_t * e,uchar_t ** mb,uchar_t ** me)487 re_reexec(PATTERN *pat, uchar_t *b, uchar_t *e, uchar_t **mb, uchar_t **me)
488 {
489 	re_re *r = pat->re_ptr;
490 	State *s, *t;
491 
492 #ifdef	DEBUG
493 	if (TRACE(10)) {
494 		uchar_t buf[EPRINTSIZE];
495 		epr(r->root, buf);
496 		(void) printf("expr='%s'\n", buf);
497 	}
498 #endif
499 	s = startstate(r);
500 
501 	for (;;) {
502 		uchar_t c;
503 
504 		if (s->cnt >= 0) {
505 #ifdef	DEBUG
506 			if (TRACE(6))
507 				(void) printf("match at input '%s'\n", b);
508 #endif
509 			*mb = b - s->cnt;
510 			*me = b;
511 			if (fixloc(mb, me))
512 				return (YES);
513 		}
514 
515 		if (b >= e) break;
516 		c = pat->cmap[*b];
517 #ifdef	DEBUG
518 		if (TRACE(4))
519 			(void) printf("state %d: char '%c'\n", s-r->states, *b);
520 #endif
521 		if ((t = s->tab[c]) != NULL) s = t;
522 		else s = nextstate(r, s, (int)c);
523 		b++;
524 	}
525 #ifdef	DEBUG
526 	if (TRACE(3)) {
527 		uchar_t buf[EPRINTSIZE];
528 
529 		epr(r->root, buf);
530 		(void) printf("pat = %s\n", buf);
531 	}
532 #endif
533 	return (NO);
534 }
535 
536 static BOOL
match(Expr * e,int a)537 match(Expr *e, int a)
538 {
539 	switch (e->type) {
540 	case Dot:	return ((BOOL)(a != NL));
541 	case Literal:	return ((BOOL)(a == e->lit));
542 	case Charclass:	return ((BOOL)(CCL_CHK((uchar_t *)e->r, a)));
543 	default:	return (NO);
544 	}
545 }
546 
547 /*
548  * generates the followset for a node in fpos
549  */
550 static void
follow(Positionset * fpos,Expr * e)551 follow(Positionset *fpos, Expr *e)
552 {
553 	Expr *p;
554 
555 	if (e->type == EOP)
556 		return;
557 	else
558 		p = e->parent;
559 	switch (p->type) {
560 	case EOP:
561 		SET(fpos, p->id, 0)
562 		break;
563 	case Plus:
564 	case Star:
565 		(void) first_lit(fpos, e);
566 		follow(fpos, p);
567 		break;
568 	case Quest:
569 	case Alternate:
570 		follow(fpos, p);
571 		break;
572 	case Cat:
573 		if (e == p->r || !first_lit(fpos, p->r))
574 			follow(fpos, p);
575 		break;
576 	default:
577 		break;
578 	}
579 }
580 
581 /*
582  * first_lit returns NO if e is nullable and in the process,
583  * ets up fpos.
584  */
585 static BOOL
first_lit(Positionset * fpos,Expr * e)586 first_lit(Positionset *fpos, Expr *e)
587 {
588 	BOOL k;
589 
590 	switch (e->type) {
591 	case Literal:
592 	case Dot:
593 	case Charclass:
594 		SET(fpos, e->id, 0)
595 		return (YES);
596 	case EOP:
597 	case Star:
598 	case Quest:
599 		(void) first_lit(fpos, e->l);
600 		return (NO);
601 	case Plus:
602 		return (first_lit(fpos, e->l));
603 	case Cat:
604 		return ((BOOL)(first_lit(fpos, e->l) || first_lit(fpos, e->r)));
605 	case Alternate:
606 		k = first_lit(fpos, e->r);
607 		return ((BOOL)(first_lit(fpos, e->l) && k));
608 	default:
609 		err("internal error");
610 	}
611 	return (NO);
612 }
613 
614 static void
efollow(re_re * r,Positionset * fpos,Expr * e)615 efollow(re_re *r, Positionset *fpos, Expr *e)
616 {
617 	ID i, *p;
618 
619 	CLEAR(r, fpos);
620 	follow(fpos, e);
621 	e->flen = fpos->count;
622 	e->follow = (ID *)egmalloc(e->flen * sizeof (ID));
623 	p = e->follow;
624 #ifdef	DEBUG
625 	printf("ID = %d LIT %c FLEN = %d\n", e->id, e->lit, e->flen);
626 #endif
627 	for (i = fpos->last; i > 0; i = fpos->base[i].id) {
628 		*p++ = i;
629 #ifdef	DEBUG
630 	printf("FOLLOW ID = %d LIT %c\n", r->ptr[i]->id, r->ptr[i]->lit);
631 #endif
632 	}
633 	if (p != e->follow + e->flen) {
634 		err("internal error");
635 	}
636 }
637 
638 static State *
addstate(re_re * r,Positionset * ps,int cnt)639 addstate(re_re *r, Positionset *ps, int cnt)
640 {
641 	ID j;
642 	FID *p, *q;
643 	State *s;
644 
645 	if (cnt) {
646 		s = r->states + getstate(r, ps);
647 		(void) memset((char *)s->tab, 0, sizeof (s->tab));
648 		s->cnt = r->istate.cnt;
649 	} else {
650 		s = &r->istate;
651 		s->cnt = -1;
652 	}
653 	s->pos = r->posnext;
654 	r->posnext += ps->count;
655 	s->npos = ps->count;
656 	p = r->posbase + s->pos;
657 	for (j = ps->last; j > 0; p++, j = q->id) {
658 		q = &ps->base[j];
659 		p->id = j;
660 		p->fcount = q->fcount;
661 		if (p->id == r->root->id && s->cnt < p->fcount)
662 			s->cnt = p->fcount;
663 	}
664 #ifdef	DEBUG
665 	if (TRACE(3)) {
666 		uchar_t buf[2000];
667 		spr(s->npos, s->pos+r->posbase, buf);
668 		(void) printf("new state[%d] %s%s\n", s-r->states, buf,
669 		    s->cnt?" cnt":"");
670 	}
671 #endif
672 	return (s);
673 }
674 
675 static State *
nextstate(re_re * r,State * s,int a)676 nextstate(re_re *r, State *s, int a)
677 {
678 	State *news;
679 
680 	CLEAR(r, &r->tmp);
681 	followstate(r, s, a, &r->tmp);
682 	if (s != &r->istate) followstate(r, &r->istate, a, &r->tmp);
683 
684 #ifdef	DEBUG
685 	if (TRACE(5)) {
686 		uchar_t buf[2000];
687 		ppr(&r->tmp, buf);
688 		(void) printf("nextstate(%d, '%c'): found %s\n", s-r->states,
689 		    a, buf);
690 	}
691 #endif
692 	if (r->tmp.count == 0)
693 		news = &r->istate;
694 	else if ((news = stateof(r, &r->tmp)) == NULL)
695 		news = addstate(r, &r->tmp, 1);
696 	s->tab[a] = news;
697 #ifdef	DEBUG
698 	if (TRACE(5)) {
699 		(void) printf("nextstate(%d, '%c'): returning %ld\n",
700 		    s-r->states, a, news);
701 	}
702 #endif
703 	return (news);
704 }
705 
706 static void
followstate(re_re * r,State * s,int a,Positionset * fpos)707 followstate(re_re *r, State *s, int a, Positionset *fpos)
708 {
709 	int j;
710 	ID *q, *eq;
711 	Expr *e;
712 
713 	for (j = s->pos; j < (s->pos + s->npos); j++) {
714 		e = r->ptr[r->posbase[j].id];
715 		if (e->type == EOP) continue;
716 		if (match(e, a)) {
717 			if (e->follow == NULL) efollow(r, &r->firstpos, e);
718 			for (q = e->follow, eq = q + e->flen; q < eq; q++) {
719 				SET(fpos, *q, r->posbase[j].fcount + 1)
720 #ifdef	DEBUG
721 				printf("CHAR %c FC %c COUNT %d\n",
722 					a,
723 					r->ptr[*q]->lit,
724 					r->posbase[j].fcount+1);
725 #endif
726 			}
727 		}
728 	}
729 }
730 
731 static uchar_t *
egmalloc(size_t n)732 egmalloc(size_t n)
733 {
734 	uchar_t *x;
735 
736 	x = (uchar_t *)mymalloc(n);
737 	mallocs[nmalloc++] = x;
738 	if (nmalloc >= MAXMALLOCS)
739 		nmalloc = MAXMALLOCS - 1;
740 	return (x);
741 }
742 
743 #ifdef	DEBUG
744 void
ppr(Positionse * ps,char * p)745 ppr(Positionse *ps, char *p)
746 {
747 	ID n;
748 
749 	if (ps->count < 1) {
750 		(void) sprintf(p, "{}");
751 		return;
752 	}
753 	*p++ = '{';
754 	for (n = ps->last; n > 0; n = ps->base[n].id) {
755 		(void) sprintf(p, "%d,", n);
756 		p = strchr(p, 0);
757 	}
758 	p[-1] = '}';
759 }
760 
761 void
epr(Expr * e,uchar_t * res)762 epr(Expr *e, uchar_t *res)
763 {
764 	uchar_t r1[EPRINTSIZE], r2[EPRINTSIZE];
765 	int i;
766 
767 	if (e == NULL) {
768 		(void) sprintf(res, "!0!");
769 		return;
770 	}
771 	switch (e->type) {
772 	case Dot:
773 	case Literal:
774 		spr(e->flen, e->follow, r1);
775 		(void) sprintf(res, "%c%s", e->lit, r1);
776 		break;
777 	case Charclass:
778 		*res++ = '[';
779 		for (i = 0; i < 256; i++)
780 			if (CCL_CHK((uchar_t *)e->r, i)) {
781 				*res++ = i;
782 			}
783 		*res++ = ']';
784 		*res = '\0';
785 		break;
786 	case Cat:
787 		epr(e->l, r1);
788 		epr(e->r, r2);
789 		(void) sprintf(res, "%s%s", r1, r2);
790 		break;
791 	case Alternate:
792 		epr(e->l, r1);
793 		epr(e->r, r2);
794 		(void) sprintf(res, "(%s|%s)", r1, r2);
795 		break;
796 	case Star:
797 		epr(e->l, r1);
798 		(void) sprintf(res, "(%s)*", r1);
799 		break;
800 	case Plus:
801 		epr(e->l, r1);
802 		(void) sprintf(res, "(%s)+", r1);
803 		break;
804 	case Quest:
805 		epr(e->l, r1);
806 		(void) sprintf(res, "(%s)?", r1);
807 		break;
808 	case EOP:
809 		epr(e->l, r1);
810 		(void) sprintf(res, "%s<EOP>", r1);
811 		break;
812 	default:
813 		(void) sprintf(res, "<undef type %d>", e->type);
814 		err(res);
815 		break;
816 	}
817 }
818 
819 static void
spr(int c,int * p,uchar_t * buf)820 spr(int c, int *p, uchar_t *buf)
821 {
822 	if (c > 0) {
823 		*buf++ = '{';
824 		*buf = '\0';
825 		while (--c > 0) {
826 			(void) sprintf(buf, "%d,", *p++);
827 			buf = strchr(buf, 0);
828 		}
829 		(void) sprintf(buf, "%d}", *p);
830 	} else
831 		(void) sprintf(buf, "{}");
832 }
833 #endif
834 
835 static void
stateinit(re_re * r)836 stateinit(re_re *r)
837 {
838 	/* CONSTANTCONDITION */
839 	r->statelim = (sizeof (int) < 4 ? 32 : 128);
840 	r->states = (State *)egmalloc(r->statelim * sizeof (State));
841 
842 	/* CONSTANTCONDITION */
843 	r->nposalloc = (sizeof (int) < 4 ? 2048 : 8192);
844 	r->posbase = (FID *)egmalloc(r->nposalloc * sizeof (FID));
845 	r->nstates = r->posnext = 0;
846 }
847 
848 static void
clrstates(re_re * r)849 clrstates(re_re *r)
850 {
851 	r->nstates = 0;		/* reclaim space for states and positions */
852 	r->posnext = r->posreset;
853 	(void) memset((char *)r->istate.tab, 0, sizeof (r->istate.tab));
854 }
855 
856 static void
savestate(re_re * r)857 savestate(re_re *r)
858 {
859 	r->posreset = r->posnext;	/* save for reset */
860 }
861 
862 static State *
startstate(re_re * r)863 startstate(re_re *r)
864 {
865 	return (&r->istate);
866 }
867 
868 static int
getstate(re_re * r,Positionset * ps)869 getstate(re_re *r, Positionset *ps)
870 {
871 	if (r->nstates >= r->statelim ||
872 	    r->posnext + ps->count >= r->nposalloc) {
873 		clrstates(r);
874 #ifdef	DEBUG
875 		printf("%d STATES FLUSHED\n", r->statelim);
876 #endif
877 	}
878 	return (r->nstates++);
879 }
880 
881 static State *
stateof(re_re * r,Positionset * ps)882 stateof(re_re *r, Positionset *ps)
883 {
884 	State *s;
885 	int i;
886 	FID *p, *e;
887 
888 	for (i = 0, s = r->states; i < r->nstates; i++, s++) {
889 		if (s->npos == ps->count) {
890 			for (p = s->pos+r->posbase, e = p+s->npos; p < e; p++)
891 				if (ps->base[p->id].id == 0 ||
892 					ps->base[p->id].fcount != p->fcount)
893 						goto next;
894 			return (s);
895 		}
896 	next:;
897 	}
898 	return (NULL);
899 }
900 
901 static re_re *
egprep(PATTERN * pat)902 egprep(PATTERN *pat)
903 {
904 	re_re *r;
905 
906 	r = (re_re *)egmalloc(sizeof (re_re));
907 	(void) memset((char *)r, 0, sizeof (re_re));
908 
909 	pat->loc1 = pat->expression;
910 	pat->loc2 = pat->expression + strlen((char *)pat->expression);
911 
912 	parno = 0;
913 	maxid = 1;
914 	r->cmap = pat->cmap;
915 	lex(r, pat);
916 	r->root = newexpr(EOP, '#', eall(r, pat), (Expr *)NULL);
917 	r->maxid = maxid;
918 
919 	eginit(r);
920 	return (r);
921 }
922 
923 static Expr *
newexpr(Exprtype t,int lit,Expr * left,Expr * right)924 newexpr(Exprtype t, int lit, Expr *left, Expr *right)
925 {
926 	Expr *e = (Expr *)egmalloc(sizeof (Expr));
927 
928 	e->type = t;
929 	e->parent = NULL;
930 	e->lit = lit;
931 
932 	if (e->lit) e->id = maxid++;
933 	else e->id = 0;
934 
935 	if ((e->l = left) != NULL) {
936 		left->parent = e;
937 	}
938 	if ((e->r = right) != NULL) {
939 		right->parent = e;
940 	}
941 	e->follow = NULL;
942 	e->flen = 0;
943 	return (e);
944 }
945 
946 static void
lex(re_re * r,PATTERN * pat)947 lex(re_re *r, PATTERN *pat)
948 {
949 	if (pat->loc1 == pat->loc2) {
950 		toktype = EOP;
951 		toklit = -1;
952 	} else switch (toklit = *pat->loc1++) {
953 	case '.':	toktype = Dot; break;
954 	case '*':	toktype = Star; break;
955 	case '+':	toktype = Plus; break;
956 	case '?':	toktype = Quest; break;
957 	case '[':	toktype = Charclass; break;
958 	case '|':	toktype = Alternate; break;
959 	case '(':	toktype = Lpar; break;
960 	case ')':	toktype = Rpar; break;
961 	case '\\':	toktype = Backslash;
962 			if (pat->loc1 == pat->loc2) {
963 				err("syntax error - missing character "
964 				    "after \\");
965 			} else
966 			    toklit = r->cmap[*pat->loc1++];
967 			break;
968 	case '^': case '$':	toktype = Literal; toklit = NL; break;
969 	default:	toktype = Literal; toklit = r->cmap[toklit]; break;
970 	}
971 }
972 
973 static int
ccl(PATTERN * pat,uchar_t * tab)974 ccl(PATTERN *pat, uchar_t *tab)
975 {
976 	int i;
977 	int range = 0;
978 	int lastc = -1;
979 	int count = 0;
980 	BOOL comp = NO;
981 
982 	(void) memset((char *)tab, 0, CCL_SIZ * sizeof (uchar_t));
983 	if (*pat->loc1 == '^') {
984 		pat->loc1++;
985 		comp = YES;
986 	}
987 	if (*pat->loc1 == ']') {
988 		uchar_t c = pat->cmap[*pat->loc1];
989 		CCL_SET(tab, c);
990 		lastc = *pat->loc1++;
991 	}
992 	/* scan for chars */
993 	for (; (pat->loc1 < pat->loc2) && (*pat->loc1 != ']');
994 							pat->loc1++) {
995 		if (*pat->loc1 == '-') {
996 			if (lastc < 0) CCL_SET(tab, pat->cmap['-']);
997 			else range = 1;
998 			continue;
999 		}
1000 		if (range) {
1001 			for (i = *pat->loc1; i >= lastc; i--) {
1002 				CCL_SET(tab, pat->cmap[i]);
1003 			}
1004 		} else {
1005 			uchar_t c = pat->cmap[*pat->loc1];
1006 			CCL_SET(tab, c);
1007 		}
1008 
1009 		range = 0;
1010 
1011 		lastc = *pat->loc1;
1012 	}
1013 	if (range) CCL_SET(tab, pat->cmap['-']);
1014 
1015 	if (pat->loc1 < pat->loc2) pat->loc1++;
1016 	else err("syntax error - missing ]");
1017 
1018 	if (comp) {
1019 		CCL_SET(tab, pat->cmap[NL]);
1020 		for (i = 0; i < CCL_SIZ; i++) tab[i] ^= 0xff;
1021 	}
1022 	for (i = 0; i < 256; i++) {
1023 		if (pat->cmap[i] != i) CCL_CLR(tab, i);
1024 		if (CCL_CHK(tab, i)) {
1025 			lastc = i;
1026 			count++;
1027 		}
1028 	}
1029 	if (count == 1)
1030 		*tab = (char)lastc;
1031 	return (count);
1032 }
1033 
1034 /*
1035  * egrep patterns:
1036  *
1037  * Alternation:	d0:	d1 { '|' d1 }*
1038  * Concatenation:	d1:	d2 { d2 }*
1039  * Repetition:	d2:	d3 { '*' | '?' | '+' }
1040  * Literal:	d3:	lit | '.' | '[]' | '(' d0 ')'
1041  */
1042 
1043 static Expr *
d3(re_re * r,PATTERN * pat)1044 d3(re_re *r, PATTERN *pat)
1045 {
1046 	Expr *e;
1047 	uchar_t *tab;
1048 	int count;
1049 
1050 	switch (toktype) {
1051 	case Backslash:
1052 	case Literal:
1053 		e = newexpr(Literal, toklit, (Expr *)NULL, (Expr *)NULL);
1054 		lex(r, pat);
1055 		break;
1056 	case Dot:
1057 		e = newexpr(Dot, '.', (Expr *)NULL, (Expr *)NULL);
1058 		lex(r, pat);
1059 		break;
1060 	case Charclass:
1061 		tab = egmalloc(CCL_SIZ * sizeof (uchar_t));
1062 		count = ccl(pat, tab);
1063 		if (count == 1) {
1064 			toklit = *tab;
1065 			e = newexpr(Literal, toklit, (Expr *)NULL,
1066 			    (Expr *)NULL);
1067 		} else {
1068 			e = newexpr(Charclass, '[', (Expr *)NULL, (Expr *)NULL);
1069 			e->l = (Expr *)count;	/* number of chars */
1070 			e->r = (Expr *)tab;	/* bitmap of chars */
1071 		}
1072 		lex(r, pat);
1073 		break;
1074 	case Lpar:
1075 		lex(r, pat);
1076 		count = ++parno;
1077 		e = d0(r, pat);
1078 		if (toktype == Rpar)
1079 			lex(r, pat);
1080 		else
1081 			err("syntax error - missing )");
1082 		return (e);
1083 	default:
1084 		err("syntax error");
1085 		e = NULL;
1086 	}
1087 	return (e);
1088 }
1089 
1090 static Expr *
d2(re_re * r,PATTERN * pat)1091 d2(re_re *r, PATTERN *pat)
1092 {
1093 	Expr *e;
1094 	Exprtype t;
1095 
1096 	e = d3(r, pat);
1097 	while ((toktype == Star) || (toktype == Plus) || (toktype == Quest)) {
1098 		t = toktype;
1099 		lex(r, pat);
1100 		e = newexpr(t, 0, e, (Expr *)NULL);
1101 	}
1102 	return (e);
1103 }
1104 
1105 static Expr *
d1(re_re * r,PATTERN * pat)1106 d1(re_re *r, PATTERN *pat)
1107 {
1108 	Expr *e, *f;
1109 
1110 	e = d2(r, pat);
1111 	while ((toktype == Literal) || (toktype == Dot) || (toktype == Lpar) ||
1112 		(toktype == Backslash) || (toktype == Charclass)) {
1113 		f = d2(r, pat);
1114 		e = newexpr(Cat, 0, e, f);
1115 	}
1116 	return (e);
1117 }
1118 
1119 static Expr *
d0(re_re * r,PATTERN * pat)1120 d0(re_re *r, PATTERN *pat)
1121 {
1122 	Expr *e, *f;
1123 
1124 	e = d1(r, pat);
1125 	while (toktype == Alternate) {
1126 		lex(r, pat);
1127 		if (toktype == EOP)
1128 			continue;
1129 		f = d1(r, pat);
1130 		e = newexpr(Alternate, 0, e, f);
1131 	}
1132 	return (e);
1133 }
1134 
1135 static Expr *
eall(re_re * r,PATTERN * pat)1136 eall(re_re *r, PATTERN *pat)
1137 {
1138 	Expr *e;
1139 
1140 	while (toktype == Alternate)	/* bogus but user-friendly */
1141 		lex(r, pat);
1142 	e = d0(r, pat);
1143 	while (toktype == Alternate)	/* bogus but user-friendly */
1144 		lex(r, pat);
1145 	if (toktype != EOP)
1146 		err("syntax error");
1147 	return (e);
1148 }
1149 
1150 static void
err(char * s)1151 err(char *s)
1152 {
1153 	message = s;
1154 	longjmp(env, 1);
1155 }
1156 
1157 static int
re_lit(PATTERN * pat,uchar_t ** b,uchar_t ** e)1158 re_lit(PATTERN *pat, uchar_t **b, uchar_t **e)
1159 {
1160 	bestlen = 0;
1161 	pat->fullmatch = YES;
1162 	START
1163 	traverse(pat, pat->re_ptr->root->l);
1164 	FINISH
1165 	if (bestlen < 2)
1166 		return (0);
1167 	*b = egmalloc(bestlen * sizeof (uchar_t));
1168 	(void) memmove(*b, best, bestlen);
1169 	*e = *b + bestlen - 1;
1170 	return (bestlen - 1);
1171 }
1172 
1173 static void
traverse(PATTERN * pat,Expr * e)1174 traverse(PATTERN *pat, Expr *e)
1175 {
1176 	switch (e->type) {
1177 	case Literal:
1178 		ADDL(e->lit)
1179 		break;
1180 	case Cat:
1181 		traverse(pat, e->l);
1182 		traverse(pat, e->r);
1183 		break;
1184 	case Plus:
1185 		traverse(pat, e->l);
1186 		FINISH	/* can't go on past a + */
1187 		pat->fullmatch = NO;
1188 		START	/* but we can start with one! */
1189 		traverse(pat, e->l);
1190 		break;
1191 	default:
1192 		FINISH
1193 		pat->fullmatch = NO;
1194 		START
1195 		break;
1196 	}
1197 }
1198 
1199 static re_bm *
re_bmcomp(uchar_t * pb,uchar_t * pe,uchar_t * cmap)1200 re_bmcomp(uchar_t *pb, uchar_t *pe, uchar_t *cmap)
1201 {
1202 	int j;
1203 	int delta[256];
1204 	re_bm *b;
1205 
1206 	b = (re_bm *)egmalloc(sizeof (*b));
1207 
1208 	b->patlen = pe - pb;
1209 	b->cmap = cmap;
1210 	b->bmpat = pb;
1211 
1212 	delta_2(b);
1213 
1214 	for (j = 0; j < 256; j++)
1215 		delta[j] = b->patlen;
1216 
1217 	for (pe--; pb < pe; pb++)
1218 		delta[b->cmap[*pb]] = pe - pb;
1219 
1220 	delta[b->cmap[*pb]] = LARGE;
1221 
1222 	for (j = 0; j < 256; j++)
1223 		b->delta0[j] = delta[b->cmap[j]];
1224 	return (b);
1225 }
1226 
1227 static void
delta_2(re_bm * b)1228 delta_2(re_bm *b)
1229 {
1230 	int m = b->patlen;
1231 	int i, k, j;
1232 
1233 	b->delta2 = (int *)egmalloc(m * sizeof (int));
1234 
1235 	for (j = 0; j < m; j++) {
1236 		k = 1;
1237 again:
1238 		while (k <= j && b->bmpat[j-k] == b->bmpat[j]) k++;
1239 
1240 		for (i = j + 1; i < m; i++) {
1241 			if (k <= i && b->bmpat[i-k] != b->bmpat[i]) {
1242 				k++;
1243 				goto again;
1244 			}
1245 		}
1246 		b->delta2[j] = k + m - j - 1;
1247 	}
1248 }
1249 
1250 static BOOL
re_bmexec(PATTERN * pat,uchar_t * s,uchar_t * e,uchar_t ** mb,uchar_t ** me)1251 re_bmexec(PATTERN *pat, uchar_t *s, uchar_t *e, uchar_t **mb, uchar_t **me)
1252 {
1253 	re_bm *b = pat->bm_ptr;
1254 	uchar_t *sp;
1255 	int k;
1256 
1257 	s += b->patlen - 1;
1258 	while ((unsigned long)s < (unsigned long)e) {
1259 		while ((unsigned long)(s += b->delta0[*s]) < (unsigned long)e)
1260 			;
1261 		if ((unsigned long)s < (unsigned long)(e + b->patlen))
1262 			return (NO); /* no match */
1263 		s -= LARGE;
1264 		for (k = b->patlen-2, sp = s-1; k >= 0; k--) {
1265 			if (b->cmap[*sp--] != b->bmpat[k]) break;
1266 		}
1267 		if (k < 0) {
1268 			*mb = ++sp;
1269 			*me = s+1;
1270 			if (grepmatch(pat, mb, me))
1271 				return (YES);
1272 			s = *mb;
1273 		} else if (k < 0) {
1274 			s++;
1275 		} else {
1276 			int j;
1277 			j = b->delta2[k];
1278 			k = b->delta0[*++sp];
1279 			if ((j > k) || (k == (int)LARGE))
1280 				k = j;
1281 			s = sp + k;
1282 		}
1283 	}
1284 	return (NO);
1285 }
1286 
1287 static re_cw *
re_recw(re_re * r,uchar_t * map)1288 re_recw(re_re *r, uchar_t *map)
1289 {
1290 	Expr *e, *root = r->root;
1291 	re_cw *pat;
1292 	uchar_t *buf;
1293 
1294 	if (root->type != EOP)
1295 		return (NULL);
1296 	e = root->l;
1297 	pat = re_cwinit(map);
1298 	buf = (uchar_t *)egmalloc(20000 * sizeof (uchar_t));
1299 	if (!altlist(e, buf, pat)) {
1300 		return (NULL);
1301 	}
1302 	re_cwcomp(pat);
1303 	return (pat);
1304 }
1305 
1306 static BOOL
altlist(Expr * e,uchar_t * buf,re_cw * pat)1307 altlist(Expr *e, uchar_t *buf, re_cw *pat)
1308 {
1309 	if (e->type == Alternate)
1310 		return ((BOOL)(altlist(e->l, buf, pat) &&
1311 		    altlist(e->r, buf, pat)));
1312 	return (word(e, buf, pat));
1313 }
1314 
1315 static BOOL
word(Expr * e,uchar_t * buf,re_cw * pat)1316 word(Expr *e, uchar_t *buf, re_cw *pat)
1317 {
1318 	static uchar_t *p;
1319 
1320 	if (buf) p = buf;
1321 
1322 	if (e->type == Cat) {
1323 		if (!word(e->l, (uchar_t *)NULL, pat))
1324 			return (NO);
1325 		if (!word(e->r, (uchar_t *)NULL, pat))
1326 			return (NO);
1327 	} else if (e->type == Literal)
1328 		*p++ = e->lit;
1329 	else
1330 		return (NO);
1331 
1332 	if (buf)
1333 		re_cwadd(pat, buf, p);
1334 	return (YES);
1335 }
1336 
1337 static re_cw *
re_cwinit(uchar_t * cmap)1338 re_cwinit(uchar_t *cmap)
1339 {
1340 	re_cw *c;
1341 
1342 	froot = NULL;
1343 	next_node = NULL;
1344 	next_link = NULL;
1345 	c = (re_cw *)egmalloc(sizeof (*c));
1346 	c->nodeid = 0;
1347 	c->maxdepth = 0;
1348 	c->mindepth = 10000;
1349 	c->root = newnode(c, 0);
1350 	c->cmap = cmap;
1351 	return (c);
1352 }
1353 
1354 static void
re_cwadd(re_cw * c,uchar_t * s,uchar_t * e)1355 re_cwadd(re_cw *c, uchar_t *s, uchar_t *e)
1356 {
1357 	Node *p, *state;
1358 	Link *l;
1359 	int depth;
1360 
1361 	state = c->root;
1362 	while (s <= --e) {
1363 		for (l = state->alts; l; l = l->next)
1364 			if (l->lit == *e)
1365 				break;
1366 		if (l == NULL)
1367 			break;
1368 		else
1369 			state = l->node;
1370 	}
1371 	if (s <= e) {
1372 		depth = state->d+1;
1373 		l = newlink(*e--, p = newnode(c, depth++));
1374 		l->next = state->alts;
1375 		state->alts = l;
1376 		state = p;
1377 		while (s <= e) {
1378 			state->alts = newlink(*e--, p = newnode(c, depth++));
1379 			state = p;
1380 		}
1381 		if (c->mindepth >= depth) c->mindepth = depth-1;
1382 	}
1383 	state->out = 1;
1384 }
1385 
1386 #ifdef	DEBUG
1387 static
pr(Node * n)1388 pr(Node *n)
1389 {
1390 	Link *l;
1391 
1392 	printf("%d[%d,%d]: ", n->id, n->shift1, n->shift2);
1393 	for (l = n->alts; l; l = l->next) {
1394 		printf("edge from \"%d\" to \"%d\" label {\"%c\"};",
1395 		    n->id, l->node->id, l->lit);
1396 		if (l->node->out) {
1397 			printf(" draw \"%d\" as Doublecircle;", l->node->id);
1398 		}
1399 		if (l->node->fail) {
1400 			printf(" edge from \"%d\" to \"%d\" dashed;",
1401 			    l->node->id, l->node->fail->id);
1402 		}
1403 		printf("\n");
1404 		pr(l->node);
1405 	}
1406 }
1407 #endif
1408 
1409 static void
fail(Node * root)1410 fail(Node *root)
1411 {
1412 	Link *qhead = NULL, *qtail = NULL;
1413 	Link *l, *ll;
1414 	Link *t;
1415 	Node *state, *r, *s;
1416 	int a;
1417 
1418 	for (l = root->alts; l; l = l->next) {
1419 		ADD(l->node);
1420 		l->node->fail = root;
1421 	}
1422 	while (qhead) {
1423 		r = qhead->node;
1424 		DEL();
1425 		for (l = r->alts; l; l = l->next) {
1426 			s = l->node;
1427 			a = l->lit;
1428 			ADD(s);
1429 			state = r->fail;
1430 			while (state) {
1431 				for (ll = state->alts; ll; ll = ll->next)
1432 					if (ll->lit == a)
1433 						break;
1434 				if (ll || (state == root)) {
1435 					if (ll)
1436 						state = ll->node;
1437 					/*
1438 					 *	do it here as only other exit is
1439 					 *	state 0
1440 					 */
1441 					if (state->out) {
1442 						s->out = 1;
1443 					}
1444 					break;
1445 				} else
1446 					state = state->fail;
1447 			}
1448 			s->fail = state;
1449 		}
1450 	}
1451 	zeroroot(root, root);
1452 }
1453 
1454 static void
zeroroot(Node * root,Node * n)1455 zeroroot(Node *root, Node *n)
1456 {
1457 	Link *l;
1458 
1459 	if (n->fail == root)
1460 		n->fail = NULL;
1461 	for (l = n->alts; l; l = l->next)
1462 		zeroroot(root, l->node);
1463 }
1464 
1465 static void
shift(re_cw * c)1466 shift(re_cw *c)
1467 {
1468 	Link *qhead = NULL, *qtail = NULL;
1469 	Link *l;
1470 	Link *t;
1471 	Node *state, *r, *s;
1472 	int k;
1473 
1474 	for (k = 0; k < 256; k++)
1475 		c->step[k] = c->mindepth+1;
1476 	c->root->shift1 = 1;
1477 	c->root->shift2 = c->mindepth;
1478 	for (l = c->root->alts; l; l = l->next) {
1479 		l->node->shift2 = c->root->shift2;
1480 		ADD(l->node);
1481 		l->node->fail = NULL;
1482 	}
1483 	while (qhead) {
1484 		r = qhead->node;
1485 		DEL();
1486 		r->shift1 = c->mindepth;
1487 		r->shift2 = c->mindepth;
1488 		if ((state = r->fail) != NULL) {
1489 			do {
1490 				k = r->d - state->d;
1491 				if (k < state->shift1) {
1492 					state->shift1 = k;
1493 				}
1494 				if (r->out && (k < state->shift2)) {
1495 					state->shift2 = k;
1496 				}
1497 			} while ((state = state->fail) != NULL);
1498 		}
1499 		for (l = r->alts; l; l = l->next) {
1500 			s = l->node;
1501 			ADD(s);
1502 		}
1503 	}
1504 	shiftprop(c, c->root);
1505 	shifttab(c->root);
1506 	c->step[0] = 1;
1507 }
1508 
1509 static void
shifttab(Node * n)1510 shifttab(Node *n)
1511 {
1512 	Link *l;
1513 	Node **nn;
1514 
1515 	n->tab = nn = (Node **)egmalloc(256 * sizeof (Node *));
1516 	(void) memset((char *)n->tab, 0, 256 * sizeof (Node *));
1517 	for (l = n->alts; l; l = l->next)
1518 		nn[l->lit] = l->node;
1519 }
1520 
1521 static void
shiftprop(re_cw * c,Node * n)1522 shiftprop(re_cw *c, Node *n)
1523 {
1524 	Link *l;
1525 	Node *nn;
1526 
1527 	for (l = n->alts; l; l = l->next) {
1528 		if (c->step[l->lit] > l->node->d)
1529 			c->step[l->lit] = l->node->d;
1530 		nn = l->node;
1531 		if (n->shift2 < nn->shift2)
1532 			nn->shift2 = n->shift2;
1533 		shiftprop(c, nn);
1534 	}
1535 }
1536 
1537 static void
re_cwcomp(re_cw * c)1538 re_cwcomp(re_cw *c)
1539 {
1540 	fail(c->root);
1541 	shift(c);
1542 }
1543 
1544 static BOOL
re_cwexec(PATTERN * pat,uchar_t * rs,uchar_t * re,uchar_t ** mb,uchar_t ** me)1545 re_cwexec(PATTERN *pat, uchar_t *rs, uchar_t *re, uchar_t **mb, uchar_t **me)
1546 {
1547 	Node *state;
1548 	Link *l;
1549 	uchar_t *sp;
1550 	uchar_t *s;
1551 	uchar_t *e;
1552 	Node *ostate;
1553 	int k;
1554 	re_cw *c = pat->cw_ptr;
1555 	uchar_t fake[1];
1556 	uchar_t mappedsp;
1557 
1558 	fake[0] = 0;
1559 	state = c->root;
1560 
1561 	s = rs+c->mindepth-1;
1562 	e = re;
1563 	while (s < e) {
1564 		/* scan */
1565 		for (sp = s; (ostate = state) != NULL; ) {
1566 			mappedsp = c->cmap[*sp];
1567 			if (state->tab) {
1568 				if ((state = state->tab[mappedsp]) == NULL)
1569 					goto nomatch;
1570 			} else {
1571 				for (l = state->alts; ; l = l->next) {
1572 					if (l == NULL)
1573 						goto nomatch;
1574 					if (l->lit == mappedsp) {
1575 						state = l->node;
1576 						break;
1577 					}
1578 				}
1579 			}
1580 			if (state->out) {
1581 				*mb = sp;
1582 				*me = s+1;
1583 				if (fixloc(mb, me))
1584 					return (YES);
1585 			}
1586 			if (--sp < rs) {
1587 				sp = fake;
1588 				break;
1589 			}
1590 		}
1591 	nomatch:
1592 		k = c->step[c->cmap[*sp]] - ostate->d - 1;
1593 		if (k < ostate->shift1)
1594 			k = ostate->shift1;
1595 		if (k > ostate->shift2)
1596 			k = ostate->shift2;
1597 		s += k;
1598 		state = c->root;
1599 	}
1600 	return (NO);
1601 }
1602 
1603 static Node *
newnode(re_cw * c,int d)1604 newnode(re_cw *c, int d)
1605 {
1606 	static Node *lim = NULL;
1607 	static int incr = 256;
1608 
1609 	if (!next_node) lim = NULL;
1610 	if (next_node == lim) {
1611 		next_node = (Node *)egmalloc(incr * sizeof (Node));
1612 		lim = next_node + incr;
1613 	}
1614 	next_node->d = d;
1615 	if (d > c->maxdepth) c->maxdepth = d;
1616 	next_node->id = c->nodeid++;
1617 	next_node->alts = NULL;
1618 	next_node->tab = NULL;
1619 	next_node->out = 0;
1620 	return (next_node++);
1621 }
1622 
1623 static Link *
newlink(uchar_t lit,Node * n)1624 newlink(uchar_t lit, Node *n)
1625 {
1626 	static Link *lim = NULL;
1627 	static int incr = 256;
1628 
1629 	if (!next_link) lim = NULL;
1630 	if (next_link == lim) {
1631 		next_link = (Link *)egmalloc(incr * sizeof (Link));
1632 		lim = next_link + incr;
1633 	}
1634 	next_link->lit = lit;
1635 	next_link->node = n;
1636 	next_link->next = NULL;
1637 	return (next_link++);
1638 }
1639 
1640 int
egrep(char * f,FILE * o,char * fo)1641 egrep(char *f, FILE *o, char *fo)
1642 {
1643 	uchar_t		buff[MAXBUFSIZE + ESIZE];
1644 
1645 	buffer = buff;
1646 	*buffer++ = NL;  /* New line precedes buffer to prevent runover */
1647 	file = f;
1648 	output = o;
1649 	format = fo;
1650 	return (execute());
1651 }
1652 
1653 static int
execute(void)1654 execute(void)
1655 {
1656 	LINE		current;
1657 	BOOL		matched;
1658 	BOOL	all_searched; /* file being matched until end */
1659 
1660 	if ((file_desc = open(file, O_RDONLY)) < 0) {
1661 		return (-1);
1662 	}
1663 	init_file(&current);
1664 		/* while there is more get more text from the data stream */
1665 	for (;;) {
1666 		all_searched = NO;
1667 
1668 		/*
1669 		 * Find next new-line in buffer.
1670 		 * Begin after previous new line character
1671 		 */
1672 
1673 		current.prntbuf = current.newline + 1;
1674 		current.ln++; /* increment line number */
1675 
1676 		if (current.prntbuf < bufend) {
1677 			/* There is more text in buffer */
1678 
1679 			/*
1680 			 * Take our next
1681 			 * "line" as the entire remaining buffer.
1682 			 * However, if there is more of the file yet
1683 			 * to be read in, exclude any incomplete
1684 			 * line at end.
1685 			 */
1686 			if (file_stat == NO_MORE) {
1687 				all_searched = YES;
1688 				current.newline = bufend - 1;
1689 				if (*current.newline != NL) {
1690 					current.newline = bufend;
1691 				}
1692 			} else {
1693 				/*
1694 				 * Find end of the last
1695 				 * complete line in the buffer.
1696 				 */
1697 				current.newline = bufend;
1698 				while (*--current.newline != NL) {
1699 				}
1700 				if (current.newline < current.prntbuf) {
1701 					/* end not found */
1702 					current.newline = bufend;
1703 				}
1704 			}
1705 		} else {
1706 			/* There is no more text in the buffer. */
1707 			current.newline = bufend;
1708 		}
1709 		if (current.newline >= bufend) {
1710 			/*
1711 			 * There is no more text in the buffer,
1712 			 * or no new line was found.
1713 			 */
1714 			switch (file_stat) {
1715 			case MORE:	/* file partly unread */
1716 			case BEGIN:
1717 				fgetfile(&current);
1718 
1719 				current.ln--;
1720 				continue; /* with while loop */
1721 			case NO_MORE:
1722 				break;
1723 			}
1724 			/* Nothing more to read in for this file. */
1725 			if (current.newline <= current.prntbuf) {
1726 				/* Nothing in the buffer, either */
1727 				/* We are done with the file. */
1728 				current.ln--;
1729 				break; /* out of while loop */
1730 			}
1731 			/* There is no NL at the end of the file */
1732 		}
1733 
1734 		matched = pattern_match(&match_pattern, &current);
1735 		if (matched) {
1736 			int nc;
1737 
1738 			get_ncount(&current, match_pattern.loc1);
1739 			get_line(&current, match_pattern.loc2);
1740 
1741 			nc = current.newline + 1 - current.prntbuf;
1742 			(void) fprintf(output, format, file, current.ln);
1743 			(void) fwrite((char *)current.prntbuf, 1, nc, output);
1744 		} else {
1745 			if (all_searched)
1746 				break; /* out of while loop */
1747 
1748 			get_ncount(&current, current.newline + 1);
1749 		}
1750 	}
1751 
1752 	(void) close(file_desc);
1753 	return (0);
1754 }
1755 
1756 static void
init_file(LINE * cur_ptr)1757 init_file(LINE *cur_ptr)
1758 {
1759 	file_stat = BEGIN;
1760 	cur_ptr->ln = 0;
1761 	bufend = buffer;
1762 	cur_ptr->newline = buffer - 1;
1763 }
1764 
1765 static BOOL
pattern_match(PATTERN * pat,LINE * lptr)1766 pattern_match(PATTERN *pat, LINE *lptr)
1767 {
1768 	if ((*pat->procfn)(pat, lptr->prntbuf - 1, lptr->newline + 1,
1769 	    &pat->loc1, &pat->loc2)) {
1770 		return (YES);
1771 	} else {
1772 		pat->loc1 = lptr->prntbuf;
1773 		pat->loc2 = lptr->newline - 1;
1774 		return (NO);
1775 	}
1776 } /* end of function pattern_match() */
1777 
1778 static void
fgetfile(LINE * cur_ptr)1779 fgetfile(LINE *cur_ptr)
1780 {
1781 	/*
1782 	 * This function reads as much of the current file into the buffer
1783 	 * as will fit.
1784 	 */
1785 	int	bytes;	  /* bytes read in current buffer */
1786 	int	bufsize = MAXBUFSIZE; /* free space in data buffer */
1787 	int 	save_current;
1788 	uchar_t	*begin = cur_ptr->prntbuf;
1789 
1790 	/*
1791 	 * Bytes of current incomplete line, if any, save_current in buffer.
1792 	 * These must be saved.
1793 	 */
1794 	save_current = (int)(bufend - begin);
1795 
1796 	if (file_stat == MORE) {
1797 		/*
1798 		 * A portion of the file fills the buffer. We must clear
1799 		 * out the dead wood to make room for more of the file.
1800 		 */
1801 
1802 		int k = 0;
1803 
1804 		k = begin - buffer;
1805 		if (!k) {
1806 			/*
1807 			 * We have one humungous current line,
1808 			 * which fills the whole buffer.
1809 			 * Toss it.
1810 			 */
1811 			begin = bufend;
1812 			k = begin - buffer;
1813 			save_current = 0;
1814 		}
1815 
1816 		begin = buffer;
1817 		bufend = begin + save_current;
1818 
1819 		bufsize = MAXBUFSIZE - save_current;
1820 
1821 		if (save_current > 0) {
1822 			/*
1823 			 * Must save portion of current line.
1824 			 * Copy to beginning of buffer.
1825 			 */
1826 			(void) memmove(buffer, buffer + k, save_current);
1827 		}
1828 	}
1829 
1830 	/* Now read in the file. */
1831 
1832 	do {
1833 		bytes = read(file_desc, (char *)bufend, (unsigned int)bufsize);
1834 		if (bytes < 0) {
1835 			/* can't read any more of file */
1836 			bytes = 0;
1837 		}
1838 		bufend += bytes;
1839 		bufsize -= bytes;
1840 	} while (bytes > 0 && bufsize > 0);
1841 
1842 
1843 	if (begin >= bufend) {
1844 		/* No new lines or incomplete line in buffer */
1845 		file_stat = NO_MORE;
1846 	} else if (bufsize) {
1847 		/* Still space in the buffer, so we have read entire file */
1848 		file_stat = NO_MORE;
1849 	} else {
1850 		/* We filled entire buffer, so there may be more yet to read */
1851 		file_stat = MORE;
1852 	}
1853 		/* Note: bufend is 1 past last good char */
1854 	*bufend = NL;	/* Sentinel new-line character */
1855 	/* Set newline to character preceding the current line */
1856 	cur_ptr->newline = begin - 1;
1857 }
1858 
1859 static void
get_line(LINE * cur_ptr,uchar_t * s)1860 get_line(LINE *cur_ptr, uchar_t *s)
1861 {
1862 	while (*s++ != NL) {
1863 	}
1864 	cur_ptr->newline = --s;
1865 	cur_ptr->ln++;
1866 }
1867 
1868 static void
get_ncount(LINE * cur_ptr,uchar_t * s)1869 get_ncount(LINE *cur_ptr, uchar_t *s)
1870 {
1871 	uchar_t	*p = cur_ptr->prntbuf;
1872 
1873 	while (*--s != NL) {
1874 	}
1875 	cur_ptr->newline = s++;
1876 	while ((s > p) &&
1877 	    (p = (uchar_t *)memchr((char *)p, NL, s - p)) != NULL) {
1878 		cur_ptr->ln++;
1879 		++p;
1880 	}
1881 	cur_ptr->ln--;
1882 	cur_ptr->prntbuf = s;
1883 }
1884