xref: /freebsd/lib/libc/regex/engine.c (revision 69c9999d0ca45b210e75706ab4952ad5a33ce6ec)
1 /*-
2  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3  * Copyright (c) 1992, 1993, 1994
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * Henry Spencer.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed by the University of
20  *	California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *	@(#)engine.c	8.5 (Berkeley) 3/20/94
38  */
39 
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD$");
42 
43 /*
44  * The matching engine and friends.  This file is #included by regexec.c
45  * after suitable #defines of a variety of macros used herein, so that
46  * different state representations can be used without duplicating masses
47  * of code.
48  */
49 
50 #ifdef SNAMES
51 #define	matcher	smatcher
52 #define	fast	sfast
53 #define	slow	sslow
54 #define	dissect	sdissect
55 #define	backref	sbackref
56 #define	step	sstep
57 #define	print	sprint
58 #define	at	sat
59 #define	match	smat
60 #endif
61 #ifdef LNAMES
62 #define	matcher	lmatcher
63 #define	fast	lfast
64 #define	slow	lslow
65 #define	dissect	ldissect
66 #define	backref	lbackref
67 #define	step	lstep
68 #define	print	lprint
69 #define	at	lat
70 #define	match	lmat
71 #endif
72 
73 /* another structure passed up and down to avoid zillions of parameters */
74 struct match {
75 	struct re_guts *g;
76 	int eflags;
77 	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
78 	char *offp;		/* offsets work from here */
79 	char *beginp;		/* start of string -- virtual NUL precedes */
80 	char *endp;		/* end of string -- virtual NUL here */
81 	char *coldp;		/* can be no match starting before here */
82 	char **lastpos;		/* [nplus+1] */
83 	STATEVARS;
84 	states st;		/* current states */
85 	states fresh;		/* states for a fresh start */
86 	states tmp;		/* temporary */
87 	states empty;		/* empty set of states */
88 };
89 
90 /* ========= begin header generated by ./mkh ========= */
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
94 
95 /* === engine.c === */
96 static int matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
97 static char *dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
98 static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev);
99 static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
100 static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
101 static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft);
102 #define	BOL	(OUT+1)
103 #define	EOL	(BOL+1)
104 #define	BOLEOL	(BOL+2)
105 #define	NOTHING	(BOL+3)
106 #define	BOW	(BOL+4)
107 #define	EOW	(BOL+5)
108 #define	CODEMAX	(BOL+5)		/* highest code used */
109 #define	NONCHAR(c)	((c) > CHAR_MAX)
110 #define	NNONCHAR	(CODEMAX-CHAR_MAX)
111 #ifdef REDEBUG
112 static void print(struct match *m, char *caption, states st, int ch, FILE *d);
113 #endif
114 #ifdef REDEBUG
115 static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst);
116 #endif
117 #ifdef REDEBUG
118 static char *pchar(int ch);
119 #endif
120 
121 #ifdef __cplusplus
122 }
123 #endif
124 /* ========= end header generated by ./mkh ========= */
125 
126 #ifdef REDEBUG
127 #define	SP(t, s, c)	print(m, t, s, c, stdout)
128 #define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
129 #define	NOTE(str)	{ if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
130 #else
131 #define	SP(t, s, c)	/* nothing */
132 #define	AT(t, p1, p2, s1, s2)	/* nothing */
133 #define	NOTE(s)	/* nothing */
134 #endif
135 
136 /*
137  - matcher - the actual matching engine
138  == static int matcher(struct re_guts *g, char *string, \
139  ==	size_t nmatch, regmatch_t pmatch[], int eflags);
140  */
141 static int			/* 0 success, REG_NOMATCH failure */
142 matcher(g, string, nmatch, pmatch, eflags)
143 struct re_guts *g;
144 char *string;
145 size_t nmatch;
146 regmatch_t pmatch[];
147 int eflags;
148 {
149 	char *endp;
150 	int i;
151 	struct match mv;
152 	struct match *m = &mv;
153 	char *dp;
154 	const sopno gf = g->firststate+1;	/* +1 for OEND */
155 	const sopno gl = g->laststate;
156 	char *start;
157 	char *stop;
158 	/* Boyer-Moore algorithms variables */
159 	char *pp;
160 	int cj, mj;
161 	char *mustfirst;
162 	char *mustlast;
163 	int *matchjump;
164 	int *charjump;
165 
166 	/* simplify the situation where possible */
167 	if (g->cflags&REG_NOSUB)
168 		nmatch = 0;
169 	if (eflags&REG_STARTEND) {
170 		start = string + pmatch[0].rm_so;
171 		stop = string + pmatch[0].rm_eo;
172 	} else {
173 		start = string;
174 		stop = start + strlen(start);
175 	}
176 	if (stop < start)
177 		return(REG_INVARG);
178 
179 	/* prescreening; this does wonders for this rather slow code */
180 	if (g->must != NULL) {
181 		if (g->charjump != NULL && g->matchjump != NULL) {
182 			mustfirst = g->must;
183 			mustlast = g->must + g->mlen - 1;
184 			charjump = g->charjump;
185 			matchjump = g->matchjump;
186 			pp = mustlast;
187 			for (dp = start+g->mlen-1; dp < stop;) {
188 				/* Fast skip non-matches */
189 				while (dp < stop && charjump[*dp])
190 					dp += charjump[*dp];
191 
192 				if (dp >= stop)
193 					break;
194 
195 				/* Greedy matcher */
196 				/* We depend on not being used for
197 				 * for strings of length 1
198 				 */
199 				while (*--dp == *--pp && pp != mustfirst);
200 
201 				if (*dp == *pp)
202 					break;
203 
204 				/* Jump to next possible match */
205 				mj = matchjump[pp - mustfirst];
206 				cj = charjump[*dp];
207 				dp += (cj < mj ? mj : cj);
208 				pp = mustlast;
209 			}
210 			if (pp != mustfirst)
211 				return(REG_NOMATCH);
212 		} else {
213 			for (dp = start; dp < stop; dp++)
214 				if (*dp == g->must[0] &&
215 				    stop - dp >= g->mlen &&
216 				    memcmp(dp, g->must, (size_t)g->mlen) == 0)
217 					break;
218 			if (dp == stop)		/* we didn't find g->must */
219 				return(REG_NOMATCH);
220 		}
221 	}
222 
223 	/* match struct setup */
224 	m->g = g;
225 	m->eflags = eflags;
226 	m->pmatch = NULL;
227 	m->lastpos = NULL;
228 	m->offp = string;
229 	m->beginp = start;
230 	m->endp = stop;
231 	STATESETUP(m, 4);
232 	SETUP(m->st);
233 	SETUP(m->fresh);
234 	SETUP(m->tmp);
235 	SETUP(m->empty);
236 	CLEAR(m->empty);
237 
238 	/* Adjust start according to moffset, to speed things up */
239 	if (g->moffset > -1)
240 		start = ((dp - g->moffset) < start) ? start : dp - g->moffset;
241 
242 	/* this loop does only one repetition except for backrefs */
243 	for (;;) {
244 		endp = fast(m, start, stop, gf, gl);
245 		if (endp == NULL) {		/* a miss */
246 			STATETEARDOWN(m);
247 			return(REG_NOMATCH);
248 		}
249 		if (nmatch == 0 && !g->backrefs)
250 			break;		/* no further info needed */
251 
252 		/* where? */
253 		assert(m->coldp != NULL);
254 		for (;;) {
255 			NOTE("finding start");
256 			endp = slow(m, m->coldp, stop, gf, gl);
257 			if (endp != NULL)
258 				break;
259 			assert(m->coldp < m->endp);
260 			m->coldp++;
261 		}
262 		if (nmatch == 1 && !g->backrefs)
263 			break;		/* no further info needed */
264 
265 		/* oh my, he wants the subexpressions... */
266 		if (m->pmatch == NULL)
267 			m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
268 							sizeof(regmatch_t));
269 		if (m->pmatch == NULL) {
270 			STATETEARDOWN(m);
271 			return(REG_ESPACE);
272 		}
273 		for (i = 1; i <= m->g->nsub; i++)
274 			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
275 		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
276 			NOTE("dissecting");
277 			dp = dissect(m, m->coldp, endp, gf, gl);
278 		} else {
279 			if (g->nplus > 0 && m->lastpos == NULL)
280 				m->lastpos = (char **)malloc((g->nplus+1) *
281 							sizeof(char *));
282 			if (g->nplus > 0 && m->lastpos == NULL) {
283 				free(m->pmatch);
284 				STATETEARDOWN(m);
285 				return(REG_ESPACE);
286 			}
287 			NOTE("backref dissect");
288 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
289 		}
290 		if (dp != NULL)
291 			break;
292 
293 		/* uh-oh... we couldn't find a subexpression-level match */
294 		assert(g->backrefs);	/* must be back references doing it */
295 		assert(g->nplus == 0 || m->lastpos != NULL);
296 		for (;;) {
297 			if (dp != NULL || endp <= m->coldp)
298 				break;		/* defeat */
299 			NOTE("backoff");
300 			endp = slow(m, m->coldp, endp-1, gf, gl);
301 			if (endp == NULL)
302 				break;		/* defeat */
303 			/* try it on a shorter possibility */
304 #ifndef NDEBUG
305 			for (i = 1; i <= m->g->nsub; i++) {
306 				assert(m->pmatch[i].rm_so == -1);
307 				assert(m->pmatch[i].rm_eo == -1);
308 			}
309 #endif
310 			NOTE("backoff dissect");
311 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
312 		}
313 		assert(dp == NULL || dp == endp);
314 		if (dp != NULL)		/* found a shorter one */
315 			break;
316 
317 		/* despite initial appearances, there is no match here */
318 		NOTE("false alarm");
319 		start = m->coldp + 1;	/* recycle starting later */
320 		assert(start <= stop);
321 	}
322 
323 	/* fill in the details if requested */
324 	if (nmatch > 0) {
325 		pmatch[0].rm_so = m->coldp - m->offp;
326 		pmatch[0].rm_eo = endp - m->offp;
327 	}
328 	if (nmatch > 1) {
329 		assert(m->pmatch != NULL);
330 		for (i = 1; i < nmatch; i++)
331 			if (i <= m->g->nsub)
332 				pmatch[i] = m->pmatch[i];
333 			else {
334 				pmatch[i].rm_so = -1;
335 				pmatch[i].rm_eo = -1;
336 			}
337 	}
338 
339 	if (m->pmatch != NULL)
340 		free((char *)m->pmatch);
341 	if (m->lastpos != NULL)
342 		free((char *)m->lastpos);
343 	STATETEARDOWN(m);
344 	return(0);
345 }
346 
347 /*
348  - dissect - figure out what matched what, no back references
349  == static char *dissect(struct match *m, char *start, \
350  ==	char *stop, sopno startst, sopno stopst);
351  */
352 static char *			/* == stop (success) always */
353 dissect(m, start, stop, startst, stopst)
354 struct match *m;
355 char *start;
356 char *stop;
357 sopno startst;
358 sopno stopst;
359 {
360 	int i;
361 	sopno ss;		/* start sop of current subRE */
362 	sopno es;		/* end sop of current subRE */
363 	char *sp;		/* start of string matched by it */
364 	char *stp;		/* string matched by it cannot pass here */
365 	char *rest;		/* start of rest of string */
366 	char *tail;		/* string unmatched by rest of RE */
367 	sopno ssub;		/* start sop of subsubRE */
368 	sopno esub;		/* end sop of subsubRE */
369 	char *ssp;		/* start of string matched by subsubRE */
370 	char *sep;		/* end of string matched by subsubRE */
371 	char *oldssp;		/* previous ssp */
372 	char *dp;
373 
374 	AT("diss", start, stop, startst, stopst);
375 	sp = start;
376 	for (ss = startst; ss < stopst; ss = es) {
377 		/* identify end of subRE */
378 		es = ss;
379 		switch (OP(m->g->strip[es])) {
380 		case OPLUS_:
381 		case OQUEST_:
382 			es += OPND(m->g->strip[es]);
383 			break;
384 		case OCH_:
385 			while (OP(m->g->strip[es]) != O_CH)
386 				es += OPND(m->g->strip[es]);
387 			break;
388 		}
389 		es++;
390 
391 		/* figure out what it matched */
392 		switch (OP(m->g->strip[ss])) {
393 		case OEND:
394 			assert(nope);
395 			break;
396 		case OCHAR:
397 			sp++;
398 			break;
399 		case OBOL:
400 		case OEOL:
401 		case OBOW:
402 		case OEOW:
403 			break;
404 		case OANY:
405 		case OANYOF:
406 			sp++;
407 			break;
408 		case OBACK_:
409 		case O_BACK:
410 			assert(nope);
411 			break;
412 		/* cases where length of match is hard to find */
413 		case OQUEST_:
414 			stp = stop;
415 			for (;;) {
416 				/* how long could this one be? */
417 				rest = slow(m, sp, stp, ss, es);
418 				assert(rest != NULL);	/* it did match */
419 				/* could the rest match the rest? */
420 				tail = slow(m, rest, stop, es, stopst);
421 				if (tail == stop)
422 					break;		/* yes! */
423 				/* no -- try a shorter match for this one */
424 				stp = rest - 1;
425 				assert(stp >= sp);	/* it did work */
426 			}
427 			ssub = ss + 1;
428 			esub = es - 1;
429 			/* did innards match? */
430 			if (slow(m, sp, rest, ssub, esub) != NULL) {
431 				dp = dissect(m, sp, rest, ssub, esub);
432 				assert(dp == rest);
433 			} else		/* no */
434 				assert(sp == rest);
435 			sp = rest;
436 			break;
437 		case OPLUS_:
438 			stp = stop;
439 			for (;;) {
440 				/* how long could this one be? */
441 				rest = slow(m, sp, stp, ss, es);
442 				assert(rest != NULL);	/* it did match */
443 				/* could the rest match the rest? */
444 				tail = slow(m, rest, stop, es, stopst);
445 				if (tail == stop)
446 					break;		/* yes! */
447 				/* no -- try a shorter match for this one */
448 				stp = rest - 1;
449 				assert(stp >= sp);	/* it did work */
450 			}
451 			ssub = ss + 1;
452 			esub = es - 1;
453 			ssp = sp;
454 			oldssp = ssp;
455 			for (;;) {	/* find last match of innards */
456 				sep = slow(m, ssp, rest, ssub, esub);
457 				if (sep == NULL || sep == ssp)
458 					break;	/* failed or matched null */
459 				oldssp = ssp;	/* on to next try */
460 				ssp = sep;
461 			}
462 			if (sep == NULL) {
463 				/* last successful match */
464 				sep = ssp;
465 				ssp = oldssp;
466 			}
467 			assert(sep == rest);	/* must exhaust substring */
468 			assert(slow(m, ssp, sep, ssub, esub) == rest);
469 			dp = dissect(m, ssp, sep, ssub, esub);
470 			assert(dp == sep);
471 			sp = rest;
472 			break;
473 		case OCH_:
474 			stp = stop;
475 			for (;;) {
476 				/* how long could this one be? */
477 				rest = slow(m, sp, stp, ss, es);
478 				assert(rest != NULL);	/* it did match */
479 				/* could the rest match the rest? */
480 				tail = slow(m, rest, stop, es, stopst);
481 				if (tail == stop)
482 					break;		/* yes! */
483 				/* no -- try a shorter match for this one */
484 				stp = rest - 1;
485 				assert(stp >= sp);	/* it did work */
486 			}
487 			ssub = ss + 1;
488 			esub = ss + OPND(m->g->strip[ss]) - 1;
489 			assert(OP(m->g->strip[esub]) == OOR1);
490 			for (;;) {	/* find first matching branch */
491 				if (slow(m, sp, rest, ssub, esub) == rest)
492 					break;	/* it matched all of it */
493 				/* that one missed, try next one */
494 				assert(OP(m->g->strip[esub]) == OOR1);
495 				esub++;
496 				assert(OP(m->g->strip[esub]) == OOR2);
497 				ssub = esub + 1;
498 				esub += OPND(m->g->strip[esub]);
499 				if (OP(m->g->strip[esub]) == OOR2)
500 					esub--;
501 				else
502 					assert(OP(m->g->strip[esub]) == O_CH);
503 			}
504 			dp = dissect(m, sp, rest, ssub, esub);
505 			assert(dp == rest);
506 			sp = rest;
507 			break;
508 		case O_PLUS:
509 		case O_QUEST:
510 		case OOR1:
511 		case OOR2:
512 		case O_CH:
513 			assert(nope);
514 			break;
515 		case OLPAREN:
516 			i = OPND(m->g->strip[ss]);
517 			assert(0 < i && i <= m->g->nsub);
518 			m->pmatch[i].rm_so = sp - m->offp;
519 			break;
520 		case ORPAREN:
521 			i = OPND(m->g->strip[ss]);
522 			assert(0 < i && i <= m->g->nsub);
523 			m->pmatch[i].rm_eo = sp - m->offp;
524 			break;
525 		default:		/* uh oh */
526 			assert(nope);
527 			break;
528 		}
529 	}
530 
531 	assert(sp == stop);
532 	return(sp);
533 }
534 
535 /*
536  - backref - figure out what matched what, figuring in back references
537  == static char *backref(struct match *m, char *start, \
538  ==	char *stop, sopno startst, sopno stopst, sopno lev);
539  */
540 static char *			/* == stop (success) or NULL (failure) */
541 backref(m, start, stop, startst, stopst, lev)
542 struct match *m;
543 char *start;
544 char *stop;
545 sopno startst;
546 sopno stopst;
547 sopno lev;			/* PLUS nesting level */
548 {
549 	int i;
550 	sopno ss;		/* start sop of current subRE */
551 	char *sp;		/* start of string matched by it */
552 	sopno ssub;		/* start sop of subsubRE */
553 	sopno esub;		/* end sop of subsubRE */
554 	char *ssp;		/* start of string matched by subsubRE */
555 	char *dp;
556 	size_t len;
557 	int hard;
558 	sop s;
559 	regoff_t offsave;
560 	cset *cs;
561 
562 	AT("back", start, stop, startst, stopst);
563 	sp = start;
564 
565 	/* get as far as we can with easy stuff */
566 	hard = 0;
567 	for (ss = startst; !hard && ss < stopst; ss++)
568 		switch (OP(s = m->g->strip[ss])) {
569 		case OCHAR:
570 			if (sp == stop || *sp++ != (char)OPND(s))
571 				return(NULL);
572 			break;
573 		case OANY:
574 			if (sp == stop)
575 				return(NULL);
576 			sp++;
577 			break;
578 		case OANYOF:
579 			cs = &m->g->sets[OPND(s)];
580 			if (sp == stop || !CHIN(cs, *sp++))
581 				return(NULL);
582 			break;
583 		case OBOL:
584 			if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
585 					(sp < m->endp && *(sp-1) == '\n' &&
586 						(m->g->cflags&REG_NEWLINE)) )
587 				{ /* yes */ }
588 			else
589 				return(NULL);
590 			break;
591 		case OEOL:
592 			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
593 					(sp < m->endp && *sp == '\n' &&
594 						(m->g->cflags&REG_NEWLINE)) )
595 				{ /* yes */ }
596 			else
597 				return(NULL);
598 			break;
599 		case OBOW:
600 			if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
601 					(sp < m->endp && *(sp-1) == '\n' &&
602 						(m->g->cflags&REG_NEWLINE)) ||
603 					(sp > m->beginp &&
604 							!ISWORD(*(sp-1))) ) &&
605 					(sp < m->endp && ISWORD(*sp)) )
606 				{ /* yes */ }
607 			else
608 				return(NULL);
609 			break;
610 		case OEOW:
611 			if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
612 					(sp < m->endp && *sp == '\n' &&
613 						(m->g->cflags&REG_NEWLINE)) ||
614 					(sp < m->endp && !ISWORD(*sp)) ) &&
615 					(sp > m->beginp && ISWORD(*(sp-1))) )
616 				{ /* yes */ }
617 			else
618 				return(NULL);
619 			break;
620 		case O_QUEST:
621 			break;
622 		case OOR1:	/* matches null but needs to skip */
623 			ss++;
624 			s = m->g->strip[ss];
625 			do {
626 				assert(OP(s) == OOR2);
627 				ss += OPND(s);
628 			} while (OP(s = m->g->strip[ss]) != O_CH);
629 			/* note that the ss++ gets us past the O_CH */
630 			break;
631 		default:	/* have to make a choice */
632 			hard = 1;
633 			break;
634 		}
635 	if (!hard) {		/* that was it! */
636 		if (sp != stop)
637 			return(NULL);
638 		return(sp);
639 	}
640 	ss--;			/* adjust for the for's final increment */
641 
642 	/* the hard stuff */
643 	AT("hard", sp, stop, ss, stopst);
644 	s = m->g->strip[ss];
645 	switch (OP(s)) {
646 	case OBACK_:		/* the vilest depths */
647 		i = OPND(s);
648 		assert(0 < i && i <= m->g->nsub);
649 		if (m->pmatch[i].rm_eo == -1)
650 			return(NULL);
651 		assert(m->pmatch[i].rm_so != -1);
652 		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
653 		assert(stop - m->beginp >= len);
654 		if (sp > stop - len)
655 			return(NULL);	/* not enough left to match */
656 		ssp = m->offp + m->pmatch[i].rm_so;
657 		if (memcmp(sp, ssp, len) != 0)
658 			return(NULL);
659 		while (m->g->strip[ss] != SOP(O_BACK, i))
660 			ss++;
661 		return(backref(m, sp+len, stop, ss+1, stopst, lev));
662 		break;
663 	case OQUEST_:		/* to null or not */
664 		dp = backref(m, sp, stop, ss+1, stopst, lev);
665 		if (dp != NULL)
666 			return(dp);	/* not */
667 		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
668 		break;
669 	case OPLUS_:
670 		assert(m->lastpos != NULL);
671 		assert(lev+1 <= m->g->nplus);
672 		m->lastpos[lev+1] = sp;
673 		return(backref(m, sp, stop, ss+1, stopst, lev+1));
674 		break;
675 	case O_PLUS:
676 		if (sp == m->lastpos[lev])	/* last pass matched null */
677 			return(backref(m, sp, stop, ss+1, stopst, lev-1));
678 		/* try another pass */
679 		m->lastpos[lev] = sp;
680 		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
681 		if (dp == NULL)
682 			return(backref(m, sp, stop, ss+1, stopst, lev-1));
683 		else
684 			return(dp);
685 		break;
686 	case OCH_:		/* find the right one, if any */
687 		ssub = ss + 1;
688 		esub = ss + OPND(s) - 1;
689 		assert(OP(m->g->strip[esub]) == OOR1);
690 		for (;;) {	/* find first matching branch */
691 			dp = backref(m, sp, stop, ssub, esub, lev);
692 			if (dp != NULL)
693 				return(dp);
694 			/* that one missed, try next one */
695 			if (OP(m->g->strip[esub]) == O_CH)
696 				return(NULL);	/* there is none */
697 			esub++;
698 			assert(OP(m->g->strip[esub]) == OOR2);
699 			ssub = esub + 1;
700 			esub += OPND(m->g->strip[esub]);
701 			if (OP(m->g->strip[esub]) == OOR2)
702 				esub--;
703 			else
704 				assert(OP(m->g->strip[esub]) == O_CH);
705 		}
706 		break;
707 	case OLPAREN:		/* must undo assignment if rest fails */
708 		i = OPND(s);
709 		assert(0 < i && i <= m->g->nsub);
710 		offsave = m->pmatch[i].rm_so;
711 		m->pmatch[i].rm_so = sp - m->offp;
712 		dp = backref(m, sp, stop, ss+1, stopst, lev);
713 		if (dp != NULL)
714 			return(dp);
715 		m->pmatch[i].rm_so = offsave;
716 		return(NULL);
717 		break;
718 	case ORPAREN:		/* must undo assignment if rest fails */
719 		i = OPND(s);
720 		assert(0 < i && i <= m->g->nsub);
721 		offsave = m->pmatch[i].rm_eo;
722 		m->pmatch[i].rm_eo = sp - m->offp;
723 		dp = backref(m, sp, stop, ss+1, stopst, lev);
724 		if (dp != NULL)
725 			return(dp);
726 		m->pmatch[i].rm_eo = offsave;
727 		return(NULL);
728 		break;
729 	default:		/* uh oh */
730 		assert(nope);
731 		break;
732 	}
733 
734 	/* "can't happen" */
735 	assert(nope);
736 	/* NOTREACHED */
737 	return "shut up gcc";
738 }
739 
740 /*
741  - fast - step through the string at top speed
742  == static char *fast(struct match *m, char *start, \
743  ==	char *stop, sopno startst, sopno stopst);
744  */
745 static char *			/* where tentative match ended, or NULL */
746 fast(m, start, stop, startst, stopst)
747 struct match *m;
748 char *start;
749 char *stop;
750 sopno startst;
751 sopno stopst;
752 {
753 	states st = m->st;
754 	states fresh = m->fresh;
755 	states tmp = m->tmp;
756 	char *p = start;
757 	int c = (start == m->beginp) ? OUT : *(start-1);
758 	int lastc;		/* previous c */
759 	int flagch;
760 	int i;
761 	char *coldp;		/* last p after which no match was underway */
762 
763 	CLEAR(st);
764 	SET1(st, startst);
765 	st = step(m->g, startst, stopst, st, NOTHING, st);
766 	ASSIGN(fresh, st);
767 	SP("start", st, *p);
768 	coldp = NULL;
769 	for (;;) {
770 		/* next character */
771 		lastc = c;
772 		c = (p == m->endp) ? OUT : *p;
773 		if (EQ(st, fresh))
774 			coldp = p;
775 
776 		/* is there an EOL and/or BOL between lastc and c? */
777 		flagch = '\0';
778 		i = 0;
779 		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
780 				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
781 			flagch = BOL;
782 			i = m->g->nbol;
783 		}
784 		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
785 				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
786 			flagch = (flagch == BOL) ? BOLEOL : EOL;
787 			i += m->g->neol;
788 		}
789 		if (i != 0) {
790 			for (; i > 0; i--)
791 				st = step(m->g, startst, stopst, st, flagch, st);
792 			SP("boleol", st, c);
793 		}
794 
795 		/* how about a word boundary? */
796 		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
797 					(c != OUT && ISWORD(c)) ) {
798 			flagch = BOW;
799 		}
800 		if ( (lastc != OUT && ISWORD(lastc)) &&
801 				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
802 			flagch = EOW;
803 		}
804 		if (flagch == BOW || flagch == EOW) {
805 			st = step(m->g, startst, stopst, st, flagch, st);
806 			SP("boweow", st, c);
807 		}
808 
809 		/* are we done? */
810 		if (ISSET(st, stopst) || p == stop)
811 			break;		/* NOTE BREAK OUT */
812 
813 		/* no, we must deal with this character */
814 		ASSIGN(tmp, st);
815 		ASSIGN(st, fresh);
816 		assert(c != OUT);
817 		st = step(m->g, startst, stopst, tmp, c, st);
818 		SP("aft", st, c);
819 		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
820 		p++;
821 	}
822 
823 	assert(coldp != NULL);
824 	m->coldp = coldp;
825 	if (ISSET(st, stopst))
826 		return(p+1);
827 	else
828 		return(NULL);
829 }
830 
831 /*
832  - slow - step through the string more deliberately
833  == static char *slow(struct match *m, char *start, \
834  ==	char *stop, sopno startst, sopno stopst);
835  */
836 static char *			/* where it ended */
837 slow(m, start, stop, startst, stopst)
838 struct match *m;
839 char *start;
840 char *stop;
841 sopno startst;
842 sopno stopst;
843 {
844 	states st = m->st;
845 	states empty = m->empty;
846 	states tmp = m->tmp;
847 	char *p = start;
848 	int c = (start == m->beginp) ? OUT : *(start-1);
849 	int lastc;		/* previous c */
850 	int flagch;
851 	int i;
852 	char *matchp;		/* last p at which a match ended */
853 
854 	AT("slow", start, stop, startst, stopst);
855 	CLEAR(st);
856 	SET1(st, startst);
857 	SP("sstart", st, *p);
858 	st = step(m->g, startst, stopst, st, NOTHING, st);
859 	matchp = NULL;
860 	for (;;) {
861 		/* next character */
862 		lastc = c;
863 		c = (p == m->endp) ? OUT : *p;
864 
865 		/* is there an EOL and/or BOL between lastc and c? */
866 		flagch = '\0';
867 		i = 0;
868 		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
869 				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
870 			flagch = BOL;
871 			i = m->g->nbol;
872 		}
873 		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
874 				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
875 			flagch = (flagch == BOL) ? BOLEOL : EOL;
876 			i += m->g->neol;
877 		}
878 		if (i != 0) {
879 			for (; i > 0; i--)
880 				st = step(m->g, startst, stopst, st, flagch, st);
881 			SP("sboleol", st, c);
882 		}
883 
884 		/* how about a word boundary? */
885 		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
886 					(c != OUT && ISWORD(c)) ) {
887 			flagch = BOW;
888 		}
889 		if ( (lastc != OUT && ISWORD(lastc)) &&
890 				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
891 			flagch = EOW;
892 		}
893 		if (flagch == BOW || flagch == EOW) {
894 			st = step(m->g, startst, stopst, st, flagch, st);
895 			SP("sboweow", st, c);
896 		}
897 
898 		/* are we done? */
899 		if (ISSET(st, stopst))
900 			matchp = p;
901 		if (EQ(st, empty) || p == stop)
902 			break;		/* NOTE BREAK OUT */
903 
904 		/* no, we must deal with this character */
905 		ASSIGN(tmp, st);
906 		ASSIGN(st, empty);
907 		assert(c != OUT);
908 		st = step(m->g, startst, stopst, tmp, c, st);
909 		SP("saft", st, c);
910 		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
911 		p++;
912 	}
913 
914 	return(matchp);
915 }
916 
917 
918 /*
919  - step - map set of states reachable before char to set reachable after
920  == static states step(struct re_guts *g, sopno start, sopno stop, \
921  ==	states bef, int ch, states aft);
922  == #define	BOL	(OUT+1)
923  == #define	EOL	(BOL+1)
924  == #define	BOLEOL	(BOL+2)
925  == #define	NOTHING	(BOL+3)
926  == #define	BOW	(BOL+4)
927  == #define	EOW	(BOL+5)
928  == #define	CODEMAX	(BOL+5)		// highest code used
929  == #define	NONCHAR(c)	((c) > CHAR_MAX)
930  == #define	NNONCHAR	(CODEMAX-CHAR_MAX)
931  */
932 static states
933 step(g, start, stop, bef, ch, aft)
934 struct re_guts *g;
935 sopno start;			/* start state within strip */
936 sopno stop;			/* state after stop state within strip */
937 states bef;			/* states reachable before */
938 int ch;				/* character or NONCHAR code */
939 states aft;			/* states already known reachable after */
940 {
941 	cset *cs;
942 	sop s;
943 	sopno pc;
944 	onestate here;		/* note, macros know this name */
945 	sopno look;
946 	int i;
947 
948 	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
949 		s = g->strip[pc];
950 		switch (OP(s)) {
951 		case OEND:
952 			assert(pc == stop-1);
953 			break;
954 		case OCHAR:
955 			/* only characters can match */
956 			assert(!NONCHAR(ch) || ch != (char)OPND(s));
957 			if (ch == (char)OPND(s))
958 				FWD(aft, bef, 1);
959 			break;
960 		case OBOL:
961 			if (ch == BOL || ch == BOLEOL)
962 				FWD(aft, bef, 1);
963 			break;
964 		case OEOL:
965 			if (ch == EOL || ch == BOLEOL)
966 				FWD(aft, bef, 1);
967 			break;
968 		case OBOW:
969 			if (ch == BOW)
970 				FWD(aft, bef, 1);
971 			break;
972 		case OEOW:
973 			if (ch == EOW)
974 				FWD(aft, bef, 1);
975 			break;
976 		case OANY:
977 			if (!NONCHAR(ch))
978 				FWD(aft, bef, 1);
979 			break;
980 		case OANYOF:
981 			cs = &g->sets[OPND(s)];
982 			if (!NONCHAR(ch) && CHIN(cs, ch))
983 				FWD(aft, bef, 1);
984 			break;
985 		case OBACK_:		/* ignored here */
986 		case O_BACK:
987 			FWD(aft, aft, 1);
988 			break;
989 		case OPLUS_:		/* forward, this is just an empty */
990 			FWD(aft, aft, 1);
991 			break;
992 		case O_PLUS:		/* both forward and back */
993 			FWD(aft, aft, 1);
994 			i = ISSETBACK(aft, OPND(s));
995 			BACK(aft, aft, OPND(s));
996 			if (!i && ISSETBACK(aft, OPND(s))) {
997 				/* oho, must reconsider loop body */
998 				pc -= OPND(s) + 1;
999 				INIT(here, pc);
1000 			}
1001 			break;
1002 		case OQUEST_:		/* two branches, both forward */
1003 			FWD(aft, aft, 1);
1004 			FWD(aft, aft, OPND(s));
1005 			break;
1006 		case O_QUEST:		/* just an empty */
1007 			FWD(aft, aft, 1);
1008 			break;
1009 		case OLPAREN:		/* not significant here */
1010 		case ORPAREN:
1011 			FWD(aft, aft, 1);
1012 			break;
1013 		case OCH_:		/* mark the first two branches */
1014 			FWD(aft, aft, 1);
1015 			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1016 			FWD(aft, aft, OPND(s));
1017 			break;
1018 		case OOR1:		/* done a branch, find the O_CH */
1019 			if (ISSTATEIN(aft, here)) {
1020 				for (look = 1;
1021 						OP(s = g->strip[pc+look]) != O_CH;
1022 						look += OPND(s))
1023 					assert(OP(s) == OOR2);
1024 				FWD(aft, aft, look);
1025 			}
1026 			break;
1027 		case OOR2:		/* propagate OCH_'s marking */
1028 			FWD(aft, aft, 1);
1029 			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1030 				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1031 				FWD(aft, aft, OPND(s));
1032 			}
1033 			break;
1034 		case O_CH:		/* just empty */
1035 			FWD(aft, aft, 1);
1036 			break;
1037 		default:		/* ooooops... */
1038 			assert(nope);
1039 			break;
1040 		}
1041 	}
1042 
1043 	return(aft);
1044 }
1045 
1046 #ifdef REDEBUG
1047 /*
1048  - print - print a set of states
1049  == #ifdef REDEBUG
1050  == static void print(struct match *m, char *caption, states st, \
1051  ==	int ch, FILE *d);
1052  == #endif
1053  */
1054 static void
1055 print(m, caption, st, ch, d)
1056 struct match *m;
1057 char *caption;
1058 states st;
1059 int ch;
1060 FILE *d;
1061 {
1062 	struct re_guts *g = m->g;
1063 	int i;
1064 	int first = 1;
1065 
1066 	if (!(m->eflags&REG_TRACE))
1067 		return;
1068 
1069 	fprintf(d, "%s", caption);
1070 	if (ch != '\0')
1071 		fprintf(d, " %s", pchar(ch));
1072 	for (i = 0; i < g->nstates; i++)
1073 		if (ISSET(st, i)) {
1074 			fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1075 			first = 0;
1076 		}
1077 	fprintf(d, "\n");
1078 }
1079 
1080 /*
1081  - at - print current situation
1082  == #ifdef REDEBUG
1083  == static void at(struct match *m, char *title, char *start, char *stop, \
1084  ==						sopno startst, sopno stopst);
1085  == #endif
1086  */
1087 static void
1088 at(m, title, start, stop, startst, stopst)
1089 struct match *m;
1090 char *title;
1091 char *start;
1092 char *stop;
1093 sopno startst;
1094 sopno stopst;
1095 {
1096 	if (!(m->eflags&REG_TRACE))
1097 		return;
1098 
1099 	printf("%s %s-", title, pchar(*start));
1100 	printf("%s ", pchar(*stop));
1101 	printf("%ld-%ld\n", (long)startst, (long)stopst);
1102 }
1103 
1104 #ifndef PCHARDONE
1105 #define	PCHARDONE	/* never again */
1106 /*
1107  - pchar - make a character printable
1108  == #ifdef REDEBUG
1109  == static char *pchar(int ch);
1110  == #endif
1111  *
1112  * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1113  * duplicate here avoids having a debugging-capable regexec.o tied to
1114  * a matching debug.o, and this is convenient.  It all disappears in
1115  * the non-debug compilation anyway, so it doesn't matter much.
1116  */
1117 static char *			/* -> representation */
1118 pchar(ch)
1119 int ch;
1120 {
1121 	static char pbuf[10];
1122 
1123 	if (isprint((uch)ch) || ch == ' ')
1124 		sprintf(pbuf, "%c", ch);
1125 	else
1126 		sprintf(pbuf, "\\%o", ch);
1127 	return(pbuf);
1128 }
1129 #endif
1130 #endif
1131 
1132 #undef	matcher
1133 #undef	fast
1134 #undef	slow
1135 #undef	dissect
1136 #undef	backref
1137 #undef	step
1138 #undef	print
1139 #undef	at
1140 #undef	match
1141