xref: /freebsd/contrib/llvm-project/llvm/lib/Support/regengine.inc (revision bdd1243df58e60e85101c09001d9812a789b6bc4)
1/*-
2 * This code is derived from OpenBSD's libc/regex, original license follows:
3 *
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 * Copyright (c) 1992, 1993, 1994
6 *	The Regents of the University of California.  All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Henry Spencer.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 *	@(#)engine.c	8.5 (Berkeley) 3/20/94
36 */
37
38/*
39 * The matching engine and friends.  This file is #included by regexec.c
40 * after suitable #defines of a variety of macros used herein, so that
41 * different state representations can be used without duplicating masses
42 * of code.
43 */
44
45#ifdef SNAMES
46#define	matcher	smatcher
47#define	fast	sfast
48#define	slow	sslow
49#define	dissect	sdissect
50#define	backref	sbackref
51#define	step	sstep
52#define	print	sprint
53#define	at	sat
54#define	match	smat
55#define	nope	snope
56#define step_back	sstep_back
57#endif
58#ifdef LNAMES
59#define	matcher	lmatcher
60#define	fast	lfast
61#define	slow	lslow
62#define	dissect	ldissect
63#define	backref	lbackref
64#define	step	lstep
65#define	print	lprint
66#define	at	lat
67#define	match	lmat
68#define	nope	lnope
69#define step_back	lstep_back
70#endif
71
72/* another structure passed up and down to avoid zillions of parameters */
73struct match {
74	struct re_guts *g;
75	int eflags;
76	llvm_regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
77	const char *offp;		/* offsets work from here */
78	const char *beginp;		/* start of string -- virtual NUL precedes */
79	const char *endp;		/* end of string -- virtual NUL here */
80	const char *coldp;		/* can be no match starting before here */
81	const char **lastpos;		/* [nplus+1] */
82	STATEVARS;
83	states st;		/* current states */
84	states fresh;		/* states for a fresh start */
85	states tmp;		/* temporary */
86	states empty;		/* empty set of states */
87};
88
89static int matcher(struct re_guts *, const char *, size_t,
90                   llvm_regmatch_t[], int);
91static const char *dissect(struct match *, const char *, const char *, sopno,
92                           sopno);
93static const char *backref(struct match *, const char *, const char *, sopno,
94                           sopno, sopno, int);
95static const char *fast(struct match *, const char *, const char *, sopno, sopno);
96static const char *slow(struct match *, const char *, const char *, sopno, sopno);
97static states step(struct re_guts *, sopno, sopno, states, int, states);
98#define MAX_RECURSION	100
99#define	BOL	(OUT+1)
100#define	EOL	(BOL+1)
101#define	BOLEOL	(BOL+2)
102#define	NOTHING	(BOL+3)
103#define	BOW	(BOL+4)
104#define	EOW	(BOL+5)
105#define	CODEMAX	(BOL+5)		/* highest code used */
106#define	NONCHAR(c)	((c) > CHAR_MAX)
107#define	NNONCHAR	(CODEMAX-CHAR_MAX)
108#ifdef REDEBUG
109static void print(struct match *, const char *, states, int, FILE *);
110#endif
111#ifdef REDEBUG
112static void at(
113	struct match *, const char *, const char *, const char *, sopno, sopno);
114#endif
115#ifdef REDEBUG
116static char *pchar(int);
117#endif
118
119#ifdef REDEBUG
120#define	SP(t, s, c)	print(m, t, s, c, stdout)
121#define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
122#define	NOTE(str)	{ if (m->eflags&REG_TRACE) (void)printf("=%s\n", (str)); }
123static int nope = 0;
124#else
125#define	SP(t, s, c)	/* nothing */
126#define	AT(t, p1, p2, s1, s2)	/* nothing */
127#define	NOTE(s)	/* nothing */
128#endif
129
130/*
131 - matcher - the actual matching engine
132 */
133static int			/* 0 success, REG_NOMATCH failure */
134matcher(struct re_guts *g, const char *string, size_t nmatch,
135        llvm_regmatch_t pmatch[],
136    int eflags)
137{
138	const char *endp;
139	size_t i;
140	struct match mv;
141	struct match *m = &mv;
142	const char *dp;
143	const sopno gf = g->firststate+1;	/* +1 for OEND */
144	const sopno gl = g->laststate;
145	const char *start;
146	const char *stop;
147
148	/* simplify the situation where possible */
149	if (g->cflags&REG_NOSUB)
150		nmatch = 0;
151	if (eflags&REG_STARTEND) {
152		start = string + pmatch[0].rm_so;
153		stop = string + pmatch[0].rm_eo;
154	} else {
155		start = string;
156		stop = start + strlen(start);
157	}
158	if (stop < start)
159		return(REG_INVARG);
160
161	/* prescreening; this does wonders for this rather slow code */
162	if (g->must != NULL) {
163		for (dp = start; dp < stop; dp++)
164			if (*dp == g->must[0] && stop - dp >= g->mlen &&
165				memcmp(dp, g->must, (size_t)g->mlen) == 0)
166				break;
167		if (dp == stop)		/* we didn't find g->must */
168			return(REG_NOMATCH);
169	}
170
171	/* match struct setup */
172	m->g = g;
173	m->eflags = eflags;
174	m->pmatch = NULL;
175	m->lastpos = NULL;
176	m->offp = string;
177	m->beginp = start;
178	m->endp = stop;
179	STATESETUP(m, 4);
180	SETUP(m->st);
181	SETUP(m->fresh);
182	SETUP(m->tmp);
183	SETUP(m->empty);
184	CLEAR(m->empty);
185
186	/* this loop does only one repetition except for backrefs */
187	for (;;) {
188		endp = fast(m, start, stop, gf, gl);
189		if (endp == NULL) {		/* a miss */
190			free(m->pmatch);
191			free((void*)m->lastpos);
192			STATETEARDOWN(m);
193			return(REG_NOMATCH);
194		}
195		if (nmatch == 0 && !g->backrefs)
196			break;		/* no further info needed */
197
198		/* where? */
199		assert(m->coldp != NULL);
200		for (;;) {
201			NOTE("finding start");
202			endp = slow(m, m->coldp, stop, gf, gl);
203			if (endp != NULL)
204				break;
205			assert(m->coldp < m->endp);
206			m->coldp++;
207		}
208		if (nmatch == 1 && !g->backrefs)
209			break;		/* no further info needed */
210
211		/* oh my, they want the subexpressions... */
212		if (m->pmatch == NULL)
213			m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) *
214							sizeof(llvm_regmatch_t));
215		if (m->pmatch == NULL) {
216			STATETEARDOWN(m);
217			return(REG_ESPACE);
218		}
219		for (i = 1; i <= m->g->nsub; i++)
220			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
221		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
222			NOTE("dissecting");
223			dp = dissect(m, m->coldp, endp, gf, gl);
224		} else {
225			if (g->nplus > 0 && m->lastpos == NULL)
226				m->lastpos = (const char **)malloc((g->nplus+1) *
227							sizeof(char *));
228			if (g->nplus > 0 && m->lastpos == NULL) {
229				free(m->pmatch);
230				STATETEARDOWN(m);
231				return(REG_ESPACE);
232			}
233			NOTE("backref dissect");
234			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
235		}
236		if (dp != NULL)
237			break;
238
239		/* uh-oh... we couldn't find a subexpression-level match */
240		assert(g->backrefs);	/* must be back references doing it */
241		assert(g->nplus == 0 || m->lastpos != NULL);
242		for (;;) {
243			if (dp != NULL || endp <= m->coldp)
244				break;		/* defeat */
245			NOTE("backoff");
246			endp = slow(m, m->coldp, endp-1, gf, gl);
247			if (endp == NULL)
248				break;		/* defeat */
249			/* try it on a shorter possibility */
250#ifndef NDEBUG
251			for (i = 1; i <= m->g->nsub; i++) {
252				assert(m->pmatch[i].rm_so == -1);
253				assert(m->pmatch[i].rm_eo == -1);
254			}
255#endif
256			NOTE("backoff dissect");
257			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
258		}
259		assert(dp == NULL || dp == endp);
260		if (dp != NULL)		/* found a shorter one */
261			break;
262
263		/* despite initial appearances, there is no match here */
264		NOTE("false alarm");
265		if (m->coldp == stop)
266			break;
267		start = m->coldp + 1;	/* recycle starting later */
268	}
269
270	/* fill in the details if requested */
271	if (nmatch > 0) {
272		pmatch[0].rm_so = m->coldp - m->offp;
273		pmatch[0].rm_eo = endp - m->offp;
274	}
275	if (nmatch > 1) {
276		assert(m->pmatch != NULL);
277		for (i = 1; i < nmatch; i++)
278			if (i <= m->g->nsub)
279				pmatch[i] = m->pmatch[i];
280			else {
281				pmatch[i].rm_so = -1;
282				pmatch[i].rm_eo = -1;
283			}
284	}
285
286	if (m->pmatch != NULL)
287		free((char *)m->pmatch);
288	if (m->lastpos != NULL)
289		free((char *)m->lastpos);
290	STATETEARDOWN(m);
291	return(0);
292}
293
294/* Step back from "stop" to a position where the strip startst..stopst might
295 * match. This can always conservatively return "stop - 1", but may return an
296 * earlier position if matches at later positions are impossible. */
297static const char *
298step_back(struct re_guts *g, const char *start, const char *stop, sopno startst,
299          sopno stopst)
300{
301	/* Always step back at least one character. */
302	assert(stop > start);
303	const char *res = stop - 1;
304
305	/* Check whether the strip startst..stropst starts with a fixed character,
306	 * ignoring any closing parentheses. If not, return a conservative result. */
307	for (;;) {
308		if (startst >= stopst)
309			return res;
310		if (OP(g->strip[startst]) != ORPAREN)
311			break;
312		startst++;
313	}
314	if (OP(g->strip[startst]) != OCHAR)
315		return res;
316
317	/* Find the character that starts the following match. */
318	char ch = OPND(g->strip[startst]);
319	for (; res != start; --res) {
320		if (*res == ch) {
321			/* Try to check the next fixed character as well. */
322			sopno nextst = startst + 1;
323			const char *next = res + 1;
324			if (nextst >= stopst || OP(g->strip[nextst]) != OCHAR || next >= stop ||
325					*next == (char)OPND(g->strip[nextst]))
326				break;
327    }
328	}
329	return res;
330}
331
332/*
333 - dissect - figure out what matched what, no back references
334 */
335static const char *			/* == stop (success) always */
336dissect(struct match *m, const char *start, const char *stop, sopno startst,
337        sopno stopst)
338{
339	int i;
340	sopno ss;	/* start sop of current subRE */
341	sopno es;	/* end sop of current subRE */
342	const char *sp;	/* start of string matched by it */
343	const char *stp;	/* string matched by it cannot pass here */
344	const char *rest;	/* start of rest of string */
345	const char *tail;	/* string unmatched by rest of RE */
346	sopno ssub;	/* start sop of subsubRE */
347	sopno esub;	/* end sop of subsubRE */
348	const char *ssp;	/* start of string matched by subsubRE */
349	const char *sep;	/* end of string matched by subsubRE */
350	const char *oldssp;	/* previous ssp */
351
352	AT("diss", start, stop, startst, stopst);
353	sp = start;
354	for (ss = startst; ss < stopst; ss = es) {
355		/* identify end of subRE */
356		es = ss;
357		switch (OP(m->g->strip[es])) {
358		case OPLUS_:
359		case OQUEST_:
360			es += OPND(m->g->strip[es]);
361			break;
362		case OCH_:
363			while (OP(m->g->strip[es]) != O_CH)
364				es += OPND(m->g->strip[es]);
365			break;
366		}
367		es++;
368
369		/* figure out what it matched */
370		switch (OP(m->g->strip[ss])) {
371		case OEND:
372			assert(nope);
373			break;
374		case OCHAR:
375			sp++;
376			break;
377		case OBOL:
378		case OEOL:
379		case OBOW:
380		case OEOW:
381			break;
382		case OANY:
383		case OANYOF:
384			sp++;
385			break;
386		case OBACK_:
387		case O_BACK:
388			assert(nope);
389			break;
390		/* cases where length of match is hard to find */
391		case OQUEST_:
392			stp = stop;
393			for (;;) {
394				/* how long could this one be? */
395				rest = slow(m, sp, stp, ss, es);
396				assert(rest != NULL);	/* it did match */
397				/* could the rest match the rest? */
398				tail = slow(m, rest, stop, es, stopst);
399				if (tail == stop)
400					break;		/* yes! */
401				/* no -- try a shorter match for this one */
402				stp = step_back(m->g, sp, rest, es, stopst);
403				assert(stp >= sp);	/* it did work */
404			}
405			ssub = ss + 1;
406			esub = es - 1;
407			/* did innards match? */
408			if (slow(m, sp, rest, ssub, esub) != NULL) {
409				const char *dp = dissect(m, sp, rest, ssub, esub);
410				(void)dp; /* avoid warning if assertions off */
411				assert(dp == rest);
412			} else		/* no */
413				assert(sp == rest);
414			sp = rest;
415			break;
416		case OPLUS_:
417			stp = stop;
418			for (;;) {
419				/* how long could this one be? */
420				rest = slow(m, sp, stp, ss, es);
421				assert(rest != NULL);	/* it did match */
422				/* could the rest match the rest? */
423				tail = slow(m, rest, stop, es, stopst);
424				if (tail == stop)
425					break;		/* yes! */
426				/* no -- try a shorter match for this one */
427				stp = step_back(m->g, sp, rest, es, stopst);
428				assert(stp >= sp);	/* it did work */
429			}
430			ssub = ss + 1;
431			esub = es - 1;
432			ssp = sp;
433			oldssp = ssp;
434			for (;;) {	/* find last match of innards */
435				sep = slow(m, ssp, rest, ssub, esub);
436				if (sep == NULL || sep == ssp)
437					break;	/* failed or matched null */
438				oldssp = ssp;	/* on to next try */
439				ssp = sep;
440			}
441			if (sep == NULL) {
442				/* last successful match */
443				sep = ssp;
444				ssp = oldssp;
445			}
446			assert(sep == rest);	/* must exhaust substring */
447			assert(slow(m, ssp, sep, ssub, esub) == rest);
448			{
449				const char *dp = dissect(m, ssp, sep, ssub, esub);
450				(void)dp; /* avoid warning if assertions off */
451				assert(dp == sep);
452			}
453			sp = rest;
454			break;
455		case OCH_:
456			stp = stop;
457			for (;;) {
458				/* how long could this one be? */
459				rest = slow(m, sp, stp, ss, es);
460				assert(rest != NULL);	/* it did match */
461				/* could the rest match the rest? */
462				tail = slow(m, rest, stop, es, stopst);
463				if (tail == stop)
464					break;		/* yes! */
465				/* no -- try a shorter match for this one */
466				stp = rest - 1;
467				assert(stp >= sp);	/* it did work */
468			}
469			ssub = ss + 1;
470			esub = ss + OPND(m->g->strip[ss]) - 1;
471			assert(OP(m->g->strip[esub]) == OOR1);
472			for (;;) {	/* find first matching branch */
473				if (slow(m, sp, rest, ssub, esub) == rest)
474					break;	/* it matched all of it */
475				/* that one missed, try next one */
476				assert(OP(m->g->strip[esub]) == OOR1);
477				esub++;
478				assert(OP(m->g->strip[esub]) == OOR2);
479				ssub = esub + 1;
480				esub += OPND(m->g->strip[esub]);
481				if (OP(m->g->strip[esub]) == OOR2)
482					esub--;
483				else
484					assert(OP(m->g->strip[esub]) == O_CH);
485			}
486			{
487				const char *dp = dissect(m, sp, rest, ssub, esub);
488				(void)dp; /* avoid warning if assertions off */
489				assert(dp == rest);
490			}
491			sp = rest;
492			break;
493		case O_PLUS:
494		case O_QUEST:
495		case OOR1:
496		case OOR2:
497		case O_CH:
498			assert(nope);
499			break;
500		case OLPAREN:
501			i = OPND(m->g->strip[ss]);
502			assert(0 < i && i <= m->g->nsub);
503			m->pmatch[i].rm_so = sp - m->offp;
504			break;
505		case ORPAREN:
506			i = OPND(m->g->strip[ss]);
507			assert(0 < i && i <= m->g->nsub);
508			m->pmatch[i].rm_eo = sp - m->offp;
509			break;
510		default:		/* uh oh */
511			assert(nope);
512			break;
513		}
514	}
515
516	assert(sp == stop);
517	return(sp);
518}
519
520/*
521 - backref - figure out what matched what, figuring in back references
522 */
523static const char *			/* == stop (success) or NULL (failure) */
524backref(struct match *m, const char *start, const char *stop, sopno startst,
525        sopno stopst, sopno lev, int rec)			/* PLUS nesting level */
526{
527	int i;
528	sopno ss;	/* start sop of current subRE */
529	const char *sp;	/* start of string matched by it */
530	sopno ssub;	/* start sop of subsubRE */
531	sopno esub;	/* end sop of subsubRE */
532	const char *ssp;	/* start of string matched by subsubRE */
533	const char *dp;
534	size_t len;
535	int hard;
536	sop s;
537	llvm_regoff_t offsave;
538	cset *cs;
539
540	AT("back", start, stop, startst, stopst);
541	sp = start;
542
543	/* get as far as we can with easy stuff */
544	hard = 0;
545	for (ss = startst; !hard && ss < stopst; ss++)
546		switch (OP(s = m->g->strip[ss])) {
547		case OCHAR:
548			if (sp == stop || *sp++ != (char)OPND(s))
549				return(NULL);
550			break;
551		case OANY:
552			if (sp == stop)
553				return(NULL);
554			sp++;
555			break;
556		case OANYOF:
557			cs = &m->g->sets[OPND(s)];
558			if (sp == stop || !CHIN(cs, *sp++))
559				return(NULL);
560			break;
561		case OBOL:
562			if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
563					(sp < m->endp && *(sp-1) == '\n' &&
564						(m->g->cflags&REG_NEWLINE)) )
565				{ /* yes */ }
566			else
567				return(NULL);
568			break;
569		case OEOL:
570			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
571					(sp < m->endp && *sp == '\n' &&
572						(m->g->cflags&REG_NEWLINE)) )
573				{ /* yes */ }
574			else
575				return(NULL);
576			break;
577		case OBOW:
578			if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
579					(sp < m->endp && *(sp-1) == '\n' &&
580						(m->g->cflags&REG_NEWLINE)) ||
581					(sp > m->beginp &&
582							!ISWORD(*(sp-1))) ) &&
583					(sp < m->endp && ISWORD(*sp)) )
584				{ /* yes */ }
585			else
586				return(NULL);
587			break;
588		case OEOW:
589			if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
590					(sp < m->endp && *sp == '\n' &&
591						(m->g->cflags&REG_NEWLINE)) ||
592					(sp < m->endp && !ISWORD(*sp)) ) &&
593					(sp > m->beginp && ISWORD(*(sp-1))) )
594				{ /* yes */ }
595			else
596				return(NULL);
597			break;
598		case O_QUEST:
599		case O_CH:
600			break;
601		case OOR1:	/* matches null but needs to skip */
602			ss++;
603			s = m->g->strip[ss];
604			do {
605				assert(OP(s) == OOR2);
606				ss += OPND(s);
607			} while (OP(s = m->g->strip[ss]) != O_CH);
608			/* note that the ss++ gets us past the O_CH */
609			break;
610		default:	/* have to make a choice */
611			hard = 1;
612			break;
613		}
614	if (!hard) {		/* that was it! */
615		if (sp != stop)
616			return(NULL);
617		return(sp);
618	}
619	ss--;			/* adjust for the for's final increment */
620
621	/* the hard stuff */
622	AT("hard", sp, stop, ss, stopst);
623	s = m->g->strip[ss];
624	switch (OP(s)) {
625	case OBACK_:		/* the vilest depths */
626		i = OPND(s);
627		assert(0 < i && i <= m->g->nsub);
628		if (m->pmatch[i].rm_eo == -1)
629			return(NULL);
630		assert(m->pmatch[i].rm_so != -1);
631		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
632		if (len == 0 && rec++ > MAX_RECURSION)
633			return(NULL);
634		assert(stop - m->beginp >= len);
635		if (sp > stop - len)
636			return(NULL);	/* not enough left to match */
637		ssp = m->offp + m->pmatch[i].rm_so;
638		if (memcmp(sp, ssp, len) != 0)
639			return(NULL);
640		while (m->g->strip[ss] != SOP(O_BACK, i))
641			ss++;
642		return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
643		break;
644	case OQUEST_:		/* to null or not */
645		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
646		if (dp != NULL)
647			return(dp);	/* not */
648		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
649		break;
650	case OPLUS_:
651		assert(m->lastpos != NULL);
652		assert(lev+1 <= m->g->nplus);
653		m->lastpos[lev+1] = sp;
654		return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
655		break;
656	case O_PLUS:
657		if (sp == m->lastpos[lev])	/* last pass matched null */
658			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
659		/* try another pass */
660		m->lastpos[lev] = sp;
661		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
662		if (dp == NULL)
663			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
664		else
665			return(dp);
666		break;
667	case OCH_:		/* find the right one, if any */
668		ssub = ss + 1;
669		esub = ss + OPND(s) - 1;
670		assert(OP(m->g->strip[esub]) == OOR1);
671		for (;;) {	/* find first matching branch */
672			dp = backref(m, sp, stop, ssub, stopst, lev, rec);
673			if (dp != NULL)
674				return(dp);
675			/* that one missed, try next one */
676			if (OP(m->g->strip[esub]) == O_CH)
677				return(NULL);	/* there is none */
678			esub++;
679			assert(OP(m->g->strip[esub]) == OOR2);
680			ssub = esub + 1;
681			esub += OPND(m->g->strip[esub]);
682			if (OP(m->g->strip[esub]) == OOR2)
683				esub--;
684			else
685				assert(OP(m->g->strip[esub]) == O_CH);
686		}
687		break;
688	case OLPAREN:		/* must undo assignment if rest fails */
689		i = OPND(s);
690		assert(0 < i && i <= m->g->nsub);
691		offsave = m->pmatch[i].rm_so;
692		m->pmatch[i].rm_so = sp - m->offp;
693		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
694		if (dp != NULL)
695			return(dp);
696		m->pmatch[i].rm_so = offsave;
697		return(NULL);
698		break;
699	case ORPAREN:		/* must undo assignment if rest fails */
700		i = OPND(s);
701		assert(0 < i && i <= m->g->nsub);
702		offsave = m->pmatch[i].rm_eo;
703		m->pmatch[i].rm_eo = sp - m->offp;
704		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
705		if (dp != NULL)
706			return(dp);
707		m->pmatch[i].rm_eo = offsave;
708		return(NULL);
709		break;
710	default:		/* uh oh */
711		assert(nope);
712		break;
713	}
714
715	/* "can't happen" */
716	assert(nope);
717	/* NOTREACHED */
718        return NULL;
719}
720
721/*
722 - fast - step through the string at top speed
723 */
724static const char *			/* where tentative match ended, or NULL */
725fast(struct match *m, const char *start, const char *stop, sopno startst,
726     sopno stopst)
727{
728	states st = m->st;
729	states fresh = m->fresh;
730	states tmp = m->tmp;
731	const char *p = start;
732	int c = (start == m->beginp) ? OUT : *(start-1);
733	int lastc;	/* previous c */
734	int flagch;
735	int i;
736	const char *coldp;	/* last p after which no match was underway */
737
738	CLEAR(st);
739	SET1(st, startst);
740	st = step(m->g, startst, stopst, st, NOTHING, st);
741	ASSIGN(fresh, st);
742	SP("start", st, *p);
743	coldp = NULL;
744	for (;;) {
745		/* next character */
746		lastc = c;
747		c = (p == m->endp) ? OUT : *p;
748		if (EQ(st, fresh))
749			coldp = p;
750
751		/* is there an EOL and/or BOL between lastc and c? */
752		flagch = '\0';
753		i = 0;
754		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
755				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
756			flagch = BOL;
757			i = m->g->nbol;
758		}
759		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
760				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
761			flagch = (flagch == BOL) ? BOLEOL : EOL;
762			i += m->g->neol;
763		}
764		if (i != 0) {
765			for (; i > 0; i--)
766				st = step(m->g, startst, stopst, st, flagch, st);
767			SP("boleol", st, c);
768		}
769
770		/* how about a word boundary? */
771		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
772					(c != OUT && ISWORD(c)) ) {
773			flagch = BOW;
774		}
775		if ( (lastc != OUT && ISWORD(lastc)) &&
776				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
777			flagch = EOW;
778		}
779		if (flagch == BOW || flagch == EOW) {
780			st = step(m->g, startst, stopst, st, flagch, st);
781			SP("boweow", st, c);
782		}
783
784		/* are we done? */
785		if (ISSET(st, stopst) || p == stop)
786			break;		/* NOTE BREAK OUT */
787
788		/* no, we must deal with this character */
789		ASSIGN(tmp, st);
790		ASSIGN(st, fresh);
791		assert(c != OUT);
792		st = step(m->g, startst, stopst, tmp, c, st);
793		SP("aft", st, c);
794		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
795		p++;
796	}
797
798	assert(coldp != NULL);
799	m->coldp = coldp;
800	if (ISSET(st, stopst))
801		return(p+1);
802	else
803		return(NULL);
804}
805
806/*
807 - slow - step through the string more deliberately
808 */
809static const char *			/* where it ended */
810slow(struct match *m, const char *start, const char *stop, sopno startst,
811     sopno stopst)
812{
813	/* Quickly skip over fixed character matches at the start. */
814	const char *p = start;
815	for (; startst < stopst; ++startst) {
816		int hard = 0;
817		sop s = m->g->strip[startst];
818		switch (OP(s)) {
819		case OLPAREN:
820		case ORPAREN:
821			/* Not relevant here. */
822			break;
823		case OCHAR:
824			if (p == stop || *p != (char)OPND(s))
825				return NULL;
826			++p;
827			break;
828		default:
829			hard = 1;
830			break;
831		}
832		if (hard)
833			break;
834	}
835
836	states st = m->st;
837	states empty = m->empty;
838	states tmp = m->tmp;
839	int c = (p == m->beginp) ? OUT : *(p-1);
840	int lastc;	/* previous c */
841	int flagch;
842	int i;
843	const char *matchp;	/* last p at which a match ended */
844
845	AT("slow", start, stop, startst, stopst);
846	CLEAR(st);
847	SET1(st, startst);
848	SP("sstart", st, *p);
849	st = step(m->g, startst, stopst, st, NOTHING, st);
850	matchp = NULL;
851	for (;;) {
852		/* next character */
853		lastc = c;
854		c = (p == m->endp) ? OUT : *p;
855
856		/* is there an EOL and/or BOL between lastc and c? */
857		flagch = '\0';
858		i = 0;
859		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
860				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
861			flagch = BOL;
862			i = m->g->nbol;
863		}
864		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
865				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
866			flagch = (flagch == BOL) ? BOLEOL : EOL;
867			i += m->g->neol;
868		}
869		if (i != 0) {
870			for (; i > 0; i--)
871				st = step(m->g, startst, stopst, st, flagch, st);
872			SP("sboleol", st, c);
873		}
874
875		/* how about a word boundary? */
876		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
877					(c != OUT && ISWORD(c)) ) {
878			flagch = BOW;
879		}
880		if ( (lastc != OUT && ISWORD(lastc)) &&
881				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
882			flagch = EOW;
883		}
884		if (flagch == BOW || flagch == EOW) {
885			st = step(m->g, startst, stopst, st, flagch, st);
886			SP("sboweow", st, c);
887		}
888
889		/* are we done? */
890		if (ISSET(st, stopst))
891			matchp = p;
892		if (EQ(st, empty) || p == stop)
893			break;		/* NOTE BREAK OUT */
894
895		/* no, we must deal with this character */
896		ASSIGN(tmp, st);
897		ASSIGN(st, empty);
898		assert(c != OUT);
899		st = step(m->g, startst, stopst, tmp, c, st);
900		SP("saft", st, c);
901		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
902		p++;
903	}
904
905	return(matchp);
906}
907
908
909/*
910 - step - map set of states reachable before char to set reachable after
911 */
912static states
913step(struct re_guts *g,
914    sopno start,		/* start state within strip */
915    sopno stop,			/* state after stop state within strip */
916    states bef,			/* states reachable before */
917    int ch,			/* character or NONCHAR code */
918    states aft)			/* states already known reachable after */
919{
920	cset *cs;
921	sop s;
922	sopno pc;
923	onestate here;		/* note, macros know this name */
924	sopno look;
925	int i;
926
927	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
928		s = g->strip[pc];
929		switch (OP(s)) {
930		case OEND:
931			assert(pc == stop-1);
932			break;
933		case OCHAR:
934			/* only characters can match */
935			assert(!NONCHAR(ch) || ch != (char)OPND(s));
936			if (ch == (char)OPND(s))
937				FWD(aft, bef, 1);
938			break;
939		case OBOL:
940			if (ch == BOL || ch == BOLEOL)
941				FWD(aft, bef, 1);
942			break;
943		case OEOL:
944			if (ch == EOL || ch == BOLEOL)
945				FWD(aft, bef, 1);
946			break;
947		case OBOW:
948			if (ch == BOW)
949				FWD(aft, bef, 1);
950			break;
951		case OEOW:
952			if (ch == EOW)
953				FWD(aft, bef, 1);
954			break;
955		case OANY:
956			if (!NONCHAR(ch))
957				FWD(aft, bef, 1);
958			break;
959		case OANYOF:
960			cs = &g->sets[OPND(s)];
961			if (!NONCHAR(ch) && CHIN(cs, ch))
962				FWD(aft, bef, 1);
963			break;
964		case OBACK_:		/* ignored here */
965		case O_BACK:
966			FWD(aft, aft, 1);
967			break;
968		case OPLUS_:		/* forward, this is just an empty */
969			FWD(aft, aft, 1);
970			break;
971		case O_PLUS:		/* both forward and back */
972			FWD(aft, aft, 1);
973			i = ISSETBACK(aft, OPND(s));
974			BACK(aft, aft, OPND(s));
975			if (!i && ISSETBACK(aft, OPND(s))) {
976				/* oho, must reconsider loop body */
977				pc -= OPND(s) + 1;
978				INIT(here, pc);
979			}
980			break;
981		case OQUEST_:		/* two branches, both forward */
982			FWD(aft, aft, 1);
983			FWD(aft, aft, OPND(s));
984			break;
985		case O_QUEST:		/* just an empty */
986			FWD(aft, aft, 1);
987			break;
988		case OLPAREN:		/* not significant here */
989		case ORPAREN:
990			FWD(aft, aft, 1);
991			break;
992		case OCH_:		/* mark the first two branches */
993			FWD(aft, aft, 1);
994			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
995			FWD(aft, aft, OPND(s));
996			break;
997		case OOR1:		/* done a branch, find the O_CH */
998			if (ISSTATEIN(aft, here)) {
999				for (look = 1;
1000						OP(s = g->strip[pc+look]) != O_CH;
1001						look += OPND(s))
1002					assert(OP(s) == OOR2);
1003				FWD(aft, aft, look);
1004			}
1005			break;
1006		case OOR2:		/* propagate OCH_'s marking */
1007			FWD(aft, aft, 1);
1008			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1009				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1010				FWD(aft, aft, OPND(s));
1011			}
1012			break;
1013		case O_CH:		/* just empty */
1014			FWD(aft, aft, 1);
1015			break;
1016		default:		/* ooooops... */
1017			assert(nope);
1018			break;
1019		}
1020	}
1021
1022	return(aft);
1023}
1024
1025#ifdef REDEBUG
1026/*
1027 - print - print a set of states
1028 */
1029static void
1030print(struct match *m, const char *caption, states st, int ch, FILE *d)
1031{
1032	struct re_guts *g = m->g;
1033	int i;
1034	int first = 1;
1035
1036	if (!(m->eflags&REG_TRACE))
1037		return;
1038
1039	(void)fprintf(d, "%s", caption);
1040	if (ch != '\0')
1041		(void)fprintf(d, " %s", pchar(ch));
1042	for (i = 0; i < g->nstates; i++)
1043		if (ISSET(st, i)) {
1044			(void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1045			first = 0;
1046		}
1047	(void)fprintf(d, "\n");
1048}
1049
1050/*
1051 - at - print current situation
1052 */
1053static void
1054at(struct match *m, const char *title, const char *start, const char *stop,
1055	sopno startst, sopno stopst)
1056{
1057	if (!(m->eflags&REG_TRACE))
1058		return;
1059
1060	(void)printf("%s %s-", title, pchar(*start));
1061	(void)printf("%s ", pchar(*stop));
1062	(void)printf("%ld-%ld\n", (long)startst, (long)stopst);
1063}
1064
1065#ifndef PCHARDONE
1066#define	PCHARDONE	/* never again */
1067/*
1068 - pchar - make a character printable
1069 *
1070 * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1071 * duplicate here avoids having a debugging-capable regexec.o tied to
1072 * a matching debug.o, and this is convenient.  It all disappears in
1073 * the non-debug compilation anyway, so it doesn't matter much.
1074 */
1075static char *			/* -> representation */
1076pchar(int ch)
1077{
1078	static char pbuf[10];
1079
1080	if (isprint(ch) || ch == ' ')
1081		(void)snprintf(pbuf, sizeof pbuf, "%c", ch);
1082	else
1083		(void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
1084	return(pbuf);
1085}
1086#endif
1087#endif
1088
1089#undef	matcher
1090#undef	fast
1091#undef	slow
1092#undef	dissect
1093#undef	backref
1094#undef	step
1095#undef	print
1096#undef	at
1097#undef	match
1098#undef	nope
1099#undef	step_back
1100