xref: /freebsd/lib/libc/regex/engine.c (revision 22cf89c938886d14f5796fc49f9f020c23ea8eaf)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5  * Copyright (c) 1992, 1993, 1994
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Henry Spencer.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *	@(#)engine.c	8.5 (Berkeley) 3/20/94
36  */
37 
38 #include <sys/cdefs.h>
39 #include <stdbool.h>
40 
41 /*
42  * The matching engine and friends.  This file is #included by regexec.c
43  * after suitable #defines of a variety of macros used herein, so that
44  * different state representations can be used without duplicating masses
45  * of code.
46  */
47 
48 #ifdef SNAMES
49 #define	stepback sstepback
50 #define	matcher	smatcher
51 #define	walk	swalk
52 #define	dissect	sdissect
53 #define	backref	sbackref
54 #define	step	sstep
55 #define	print	sprint
56 #define	at	sat
57 #define	match	smat
58 #endif
59 #ifdef LNAMES
60 #define	stepback lstepback
61 #define	matcher	lmatcher
62 #define	walk	lwalk
63 #define	dissect	ldissect
64 #define	backref	lbackref
65 #define	step	lstep
66 #define	print	lprint
67 #define	at	lat
68 #define	match	lmat
69 #endif
70 #ifdef MNAMES
71 #define	stepback mstepback
72 #define	matcher	mmatcher
73 #define	walk	mwalk
74 #define	dissect	mdissect
75 #define	backref	mbackref
76 #define	step	mstep
77 #define	print	mprint
78 #define	at	mat
79 #define	match	mmat
80 #endif
81 
82 /* another structure passed up and down to avoid zillions of parameters */
83 struct match {
84 	struct re_guts *g;
85 	int eflags;
86 	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
87 	const char *offp;	/* offsets work from here */
88 	const char *beginp;	/* start of string -- virtual NUL precedes */
89 	const char *endp;	/* end of string -- virtual NUL here */
90 	const char *coldp;	/* can be no match starting before here */
91 	const char **lastpos;	/* [nplus+1] */
92 	STATEVARS;
93 	states st;		/* current states */
94 	states fresh;		/* states for a fresh start */
95 	states tmp;		/* temporary */
96 	states empty;		/* empty set of states */
97 	mbstate_t mbs;		/* multibyte conversion state */
98 };
99 
100 /* ========= begin header generated by ./mkh ========= */
101 #ifdef __cplusplus
102 extern "C" {
103 #endif
104 
105 /* === engine.c === */
106 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
107 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
108 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int);
109 static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast);
110 static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags);
111 #define MAX_RECURSION	100
112 #define	BOL	(OUT-1)
113 #define	EOL	(BOL-1)
114 #define	BOLEOL	(BOL-2)
115 #define	NOTHING	(BOL-3)
116 #define	BOW	(BOL-4)
117 #define	EOW	(BOL-5)
118 #define	BADCHAR	(BOL-6)
119 #define	NWBND	(BOL-7)
120 #define	NONCHAR(c)	((c) <= OUT)
121 /* sflags */
122 #define	SBOS	0x0001
123 #define	SEOS	0x0002
124 
125 #ifdef REDEBUG
126 static void print(struct match *m, const char *caption, states st, int ch, FILE *d);
127 #endif
128 #ifdef REDEBUG
129 static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst);
130 #endif
131 #ifdef REDEBUG
132 static const char *pchar(int ch);
133 #endif
134 
135 #ifdef __cplusplus
136 }
137 #endif
138 /* ========= end header generated by ./mkh ========= */
139 
140 #ifdef REDEBUG
141 #define	SP(t, s, c)	print(m, t, s, c, stdout)
142 #define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
143 #define	NOTE(str)	{ if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
144 #else
145 #define	SP(t, s, c)	/* nothing */
146 #define	AT(t, p1, p2, s1, s2)	/* nothing */
147 #define	NOTE(s)	/* nothing */
148 #endif
149 
150 /*
151  * Given a multibyte string pointed to by start, step back nchar characters
152  * from current position pointed to by cur.
153  */
154 static const char *
155 stepback(const char *start, const char *cur, int nchar)
156 {
157 	const char *ret;
158 	int wc, mbc;
159 	mbstate_t mbs;
160 	size_t clen;
161 
162 	if (MB_CUR_MAX == 1)
163 		return ((cur - nchar) > start ? cur - nchar : NULL);
164 
165 	ret = cur;
166 	for (wc = nchar; wc > 0; wc--) {
167 		for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) {
168 			if ((ret - mbc) < start)
169 				return (NULL);
170 			memset(&mbs, 0, sizeof(mbs));
171 			clen = mbrtowc(NULL, ret - mbc, mbc, &mbs);
172 			if (clen != (size_t)-1 && clen != (size_t)-2)
173 				break;
174 		}
175 		if (mbc > MB_CUR_MAX)
176 			return (NULL);
177 		ret -= mbc;
178 	}
179 
180 	return (ret);
181 }
182 
183 /*
184  - matcher - the actual matching engine
185  == static int matcher(struct re_guts *g, const char *string, \
186  ==	size_t nmatch, regmatch_t pmatch[], int eflags);
187  */
188 static int			/* 0 success, REG_NOMATCH failure */
189 matcher(struct re_guts *g,
190 	const char *string,
191 	size_t nmatch,
192 	regmatch_t pmatch[],
193 	int eflags)
194 {
195 	const char *endp;
196 	size_t i;
197 	struct match mv;
198 	struct match *m = &mv;
199 	const char *dp = NULL;
200 	const sopno gf = g->firststate+1;	/* +1 for OEND */
201 	const sopno gl = g->laststate;
202 	const char *start;
203 	const char *stop;
204 	/* Boyer-Moore algorithms variables */
205 	const char *pp;
206 	int cj, mj;
207 	const char *mustfirst;
208 	const char *mustlast;
209 	int *matchjump;
210 	int *charjump;
211 
212 	/* simplify the situation where possible */
213 	if (g->cflags&REG_NOSUB)
214 		nmatch = 0;
215 	if (eflags&REG_STARTEND) {
216 		start = string + pmatch[0].rm_so;
217 		stop = string + pmatch[0].rm_eo;
218 	} else {
219 		start = string;
220 		stop = start + strlen(start);
221 	}
222 	if (stop < start)
223 		return(REG_INVARG);
224 
225 	/* prescreening; this does wonders for this rather slow code */
226 	if (g->must != NULL) {
227 		if (g->charjump != NULL && g->matchjump != NULL) {
228 			mustfirst = g->must;
229 			mustlast = g->must + g->mlen - 1;
230 			charjump = g->charjump;
231 			matchjump = g->matchjump;
232 			pp = mustlast;
233 			for (dp = start+g->mlen-1; dp < stop;) {
234 				/* Fast skip non-matches */
235 				while (dp < stop && charjump[(int)*dp])
236 					dp += charjump[(int)*dp];
237 
238 				if (dp >= stop)
239 					break;
240 
241 				/* Greedy matcher */
242 				/* We depend on not being used for
243 				 * for strings of length 1
244 				 */
245 				while (*--dp == *--pp && pp != mustfirst);
246 
247 				if (*dp == *pp)
248 					break;
249 
250 				/* Jump to next possible match */
251 				mj = matchjump[pp - mustfirst];
252 				cj = charjump[(int)*dp];
253 				dp += (cj < mj ? mj : cj);
254 				pp = mustlast;
255 			}
256 			if (pp != mustfirst)
257 				return(REG_NOMATCH);
258 		} else {
259 			for (dp = start; dp < stop; dp++)
260 				if (*dp == g->must[0] &&
261 				    stop - dp >= g->mlen &&
262 				    memcmp(dp, g->must, (size_t)g->mlen) == 0)
263 					break;
264 			if (dp == stop)		/* we didn't find g->must */
265 				return(REG_NOMATCH);
266 		}
267 	}
268 
269 	/* match struct setup */
270 	m->g = g;
271 	m->eflags = eflags;
272 	m->pmatch = NULL;
273 	m->lastpos = NULL;
274 	m->offp = string;
275 	m->beginp = start;
276 	m->endp = stop;
277 	STATESETUP(m, 4);
278 	SETUP(m->st);
279 	SETUP(m->fresh);
280 	SETUP(m->tmp);
281 	SETUP(m->empty);
282 	CLEAR(m->empty);
283 	ZAPSTATE(&m->mbs);
284 
285 	/* Adjust start according to moffset, to speed things up */
286 	if (dp != NULL && g->moffset > -1) {
287 		const char *nstart;
288 
289 		nstart = stepback(start, dp, g->moffset);
290 		if (nstart != NULL)
291 			start = nstart;
292 	}
293 
294 	SP("mloop", m->st, *start);
295 
296 	/* this loop does only one repetition except for backrefs */
297 	for (;;) {
298 		endp = walk(m, start, stop, gf, gl, true);
299 		if (endp == NULL) {		/* a miss */
300 			if (m->pmatch != NULL)
301 				free((char *)m->pmatch);
302 			if (m->lastpos != NULL)
303 				free((char *)m->lastpos);
304 			STATETEARDOWN(m);
305 			return(REG_NOMATCH);
306 		}
307 		if (nmatch == 0 && !g->backrefs)
308 			break;		/* no further info needed */
309 
310 		/* where? */
311 		assert(m->coldp != NULL);
312 		for (;;) {
313 			NOTE("finding start");
314 			endp = walk(m, m->coldp, stop, gf, gl, false);
315 			if (endp != NULL)
316 				break;
317 			assert(m->coldp < m->endp);
318 			m->coldp += XMBRTOWC(NULL, m->coldp,
319 			    m->endp - m->coldp, &m->mbs, 0);
320 		}
321 		if (nmatch == 1 && !g->backrefs)
322 			break;		/* no further info needed */
323 
324 		/* oh my, he wants the subexpressions... */
325 		if (m->pmatch == NULL)
326 			m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
327 							sizeof(regmatch_t));
328 		if (m->pmatch == NULL) {
329 			STATETEARDOWN(m);
330 			return(REG_ESPACE);
331 		}
332 		for (i = 1; i <= m->g->nsub; i++)
333 			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
334 		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
335 			NOTE("dissecting");
336 			dp = dissect(m, m->coldp, endp, gf, gl);
337 		} else {
338 			if (g->nplus > 0 && m->lastpos == NULL)
339 				m->lastpos = malloc((g->nplus+1) *
340 						sizeof(const char *));
341 			if (g->nplus > 0 && m->lastpos == NULL) {
342 				free(m->pmatch);
343 				STATETEARDOWN(m);
344 				return(REG_ESPACE);
345 			}
346 			NOTE("backref dissect");
347 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
348 		}
349 		if (dp != NULL)
350 			break;
351 
352 		/* uh-oh... we couldn't find a subexpression-level match */
353 		assert(g->backrefs);	/* must be back references doing it */
354 		assert(g->nplus == 0 || m->lastpos != NULL);
355 		for (;;) {
356 			if (dp != NULL || endp <= m->coldp)
357 				break;		/* defeat */
358 			NOTE("backoff");
359 			endp = walk(m, m->coldp, endp-1, gf, gl, false);
360 			if (endp == NULL)
361 				break;		/* defeat */
362 			/* try it on a shorter possibility */
363 #ifndef NDEBUG
364 			for (i = 1; i <= m->g->nsub; i++) {
365 				assert(m->pmatch[i].rm_so == -1);
366 				assert(m->pmatch[i].rm_eo == -1);
367 			}
368 #endif
369 			NOTE("backoff dissect");
370 			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
371 		}
372 		assert(dp == NULL || dp == endp);
373 		if (dp != NULL)		/* found a shorter one */
374 			break;
375 
376 		/* despite initial appearances, there is no match here */
377 		NOTE("false alarm");
378 		/* recycle starting later */
379 		start = m->coldp + XMBRTOWC(NULL, m->coldp,
380 		    stop - m->coldp, &m->mbs, 0);
381 		assert(start <= stop);
382 	}
383 
384 	/* fill in the details if requested */
385 	if (nmatch > 0) {
386 		pmatch[0].rm_so = m->coldp - m->offp;
387 		pmatch[0].rm_eo = endp - m->offp;
388 	}
389 	if (nmatch > 1) {
390 		assert(m->pmatch != NULL);
391 		for (i = 1; i < nmatch; i++)
392 			if (i <= m->g->nsub)
393 				pmatch[i] = m->pmatch[i];
394 			else {
395 				pmatch[i].rm_so = -1;
396 				pmatch[i].rm_eo = -1;
397 			}
398 	}
399 
400 	if (m->pmatch != NULL)
401 		free((char *)m->pmatch);
402 	if (m->lastpos != NULL)
403 		free((char *)m->lastpos);
404 	STATETEARDOWN(m);
405 	return(0);
406 }
407 
408 /*
409  - dissect - figure out what matched what, no back references
410  == static const char *dissect(struct match *m, const char *start, \
411  ==	const char *stop, sopno startst, sopno stopst);
412  */
413 static const char *		/* == stop (success) always */
414 dissect(struct match *m,
415 	const char *start,
416 	const char *stop,
417 	sopno startst,
418 	sopno stopst)
419 {
420 	int i;
421 	sopno ss;		/* start sop of current subRE */
422 	sopno es;		/* end sop of current subRE */
423 	const char *sp;		/* start of string matched by it */
424 	const char *stp;	/* string matched by it cannot pass here */
425 	const char *rest;	/* start of rest of string */
426 	const char *tail;	/* string unmatched by rest of RE */
427 	sopno ssub;		/* start sop of subsubRE */
428 	sopno esub;		/* end sop of subsubRE */
429 	const char *ssp;	/* start of string matched by subsubRE */
430 	const char *sep;	/* end of string matched by subsubRE */
431 	const char *oldssp;	/* previous ssp */
432 	const char *dp __unused;
433 
434 	AT("diss", start, stop, startst, stopst);
435 	sp = start;
436 	for (ss = startst; ss < stopst; ss = es) {
437 		/* identify end of subRE */
438 		es = ss;
439 		switch (OP(m->g->strip[es])) {
440 		case OPLUS_:
441 		case OQUEST_:
442 			es += OPND(m->g->strip[es]);
443 			break;
444 		case OCH_:
445 			while (OP(m->g->strip[es]) != (sop)O_CH)
446 				es += OPND(m->g->strip[es]);
447 			break;
448 		}
449 		es++;
450 
451 		/* figure out what it matched */
452 		switch (OP(m->g->strip[ss])) {
453 		case OEND:
454 			assert(nope);
455 			break;
456 		case OCHAR:
457 			sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
458 			break;
459 		case OBOL:
460 		case OEOL:
461 		case OBOW:
462 		case OEOW:
463 		case OBOS:
464 		case OEOS:
465 		case OWBND:
466 		case ONWBND:
467 			break;
468 		case OANY:
469 		case OANYOF:
470 			sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
471 			break;
472 		case OBACK_:
473 		case O_BACK:
474 			assert(nope);
475 			break;
476 		/* cases where length of match is hard to find */
477 		case OQUEST_:
478 			stp = stop;
479 			for (;;) {
480 				/* how long could this one be? */
481 				rest = walk(m, sp, stp, ss, es, false);
482 				assert(rest != NULL);	/* it did match */
483 				/* could the rest match the rest? */
484 				tail = walk(m, rest, stop, es, stopst, false);
485 				if (tail == stop)
486 					break;		/* yes! */
487 				/* no -- try a shorter match for this one */
488 				stp = rest - 1;
489 				assert(stp >= sp);	/* it did work */
490 			}
491 			ssub = ss + 1;
492 			esub = es - 1;
493 			/* did innards match? */
494 			if (walk(m, sp, rest, ssub, esub, false) != NULL) {
495 				dp = dissect(m, sp, rest, ssub, esub);
496 				assert(dp == rest);
497 			} else		/* no */
498 				assert(sp == rest);
499 			sp = rest;
500 			break;
501 		case OPLUS_:
502 			stp = stop;
503 			for (;;) {
504 				/* how long could this one be? */
505 				rest = walk(m, sp, stp, ss, es, false);
506 				assert(rest != NULL);	/* it did match */
507 				/* could the rest match the rest? */
508 				tail = walk(m, rest, stop, es, stopst, false);
509 				if (tail == stop)
510 					break;		/* yes! */
511 				/* no -- try a shorter match for this one */
512 				stp = rest - 1;
513 				assert(stp >= sp);	/* it did work */
514 			}
515 			ssub = ss + 1;
516 			esub = es - 1;
517 			ssp = sp;
518 			oldssp = ssp;
519 			for (;;) {	/* find last match of innards */
520 				sep = walk(m, ssp, rest, ssub, esub, false);
521 				if (sep == NULL || sep == ssp)
522 					break;	/* failed or matched null */
523 				oldssp = ssp;	/* on to next try */
524 				ssp = sep;
525 			}
526 			if (sep == NULL) {
527 				/* last successful match */
528 				sep = ssp;
529 				ssp = oldssp;
530 			}
531 			assert(sep == rest);	/* must exhaust substring */
532 			assert(walk(m, ssp, sep, ssub, esub, false) == rest);
533 			dp = dissect(m, ssp, sep, ssub, esub);
534 			assert(dp == sep);
535 			sp = rest;
536 			break;
537 		case OCH_:
538 			stp = stop;
539 			for (;;) {
540 				/* how long could this one be? */
541 				rest = walk(m, sp, stp, ss, es, false);
542 				assert(rest != NULL);	/* it did match */
543 				/* could the rest match the rest? */
544 				tail = walk(m, rest, stop, es, stopst, false);
545 				if (tail == stop)
546 					break;		/* yes! */
547 				/* no -- try a shorter match for this one */
548 				stp = rest - 1;
549 				assert(stp >= sp);	/* it did work */
550 			}
551 			ssub = ss + 1;
552 			esub = ss + OPND(m->g->strip[ss]) - 1;
553 			assert(OP(m->g->strip[esub]) == OOR1);
554 			for (;;) {	/* find first matching branch */
555 				if (walk(m, sp, rest, ssub, esub, false) == rest)
556 					break;	/* it matched all of it */
557 				/* that one missed, try next one */
558 				assert(OP(m->g->strip[esub]) == OOR1);
559 				esub++;
560 				assert(OP(m->g->strip[esub]) == OOR2);
561 				ssub = esub + 1;
562 				esub += OPND(m->g->strip[esub]);
563 				if (OP(m->g->strip[esub]) == (sop)OOR2)
564 					esub--;
565 				else
566 					assert(OP(m->g->strip[esub]) == O_CH);
567 			}
568 			dp = dissect(m, sp, rest, ssub, esub);
569 			assert(dp == rest);
570 			sp = rest;
571 			break;
572 		case O_PLUS:
573 		case O_QUEST:
574 		case OOR1:
575 		case OOR2:
576 		case O_CH:
577 			assert(nope);
578 			break;
579 		case OLPAREN:
580 			i = OPND(m->g->strip[ss]);
581 			assert(0 < i && i <= m->g->nsub);
582 			m->pmatch[i].rm_so = sp - m->offp;
583 			break;
584 		case ORPAREN:
585 			i = OPND(m->g->strip[ss]);
586 			assert(0 < i && i <= m->g->nsub);
587 			m->pmatch[i].rm_eo = sp - m->offp;
588 			break;
589 		default:		/* uh oh */
590 			assert(nope);
591 			break;
592 		}
593 	}
594 
595 	assert(sp == stop);
596 	return(sp);
597 }
598 
599 #define	ISBOW(m, sp)					\
600     (sp < m->endp && ISWORD(*sp) &&			\
601     ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||	\
602     (sp > m->offp && !ISWORD(*(sp-1)))))
603 #define	ISEOW(m, sp)					\
604     (((sp == m->endp && !(m->eflags&REG_NOTEOL)) ||	\
605     (sp < m->endp && *sp == '\n' &&			\
606     (m->g->cflags&REG_NEWLINE)) ||			\
607     (sp < m->endp && !ISWORD(*sp)) ) &&			\
608     (sp > m->beginp && ISWORD(*(sp-1))))		\
609 
610 /*
611  - backref - figure out what matched what, figuring in back references
612  == static const char *backref(struct match *m, const char *start, \
613  ==	const char *stop, sopno startst, sopno stopst, sopno lev);
614  */
615 static const char *		/* == stop (success) or NULL (failure) */
616 backref(struct match *m,
617 	const char *start,
618 	const char *stop,
619 	sopno startst,
620 	sopno stopst,
621 	sopno lev,		/* PLUS nesting level */
622 	int rec)
623 {
624 	int i;
625 	sopno ss;		/* start sop of current subRE */
626 	const char *sp;		/* start of string matched by it */
627 	sopno ssub;		/* start sop of subsubRE */
628 	sopno esub;		/* end sop of subsubRE */
629 	const char *ssp;	/* start of string matched by subsubRE */
630 	const char *dp;
631 	size_t len;
632 	int hard;
633 	sop s;
634 	regoff_t offsave;
635 	cset *cs;
636 	wint_t wc;
637 
638 	AT("back", start, stop, startst, stopst);
639 	sp = start;
640 
641 	/* get as far as we can with easy stuff */
642 	hard = 0;
643 	for (ss = startst; !hard && ss < stopst; ss++)
644 		switch (OP(s = m->g->strip[ss])) {
645 		case OCHAR:
646 			if (sp == stop)
647 				return(NULL);
648 			sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
649 			if (wc != OPND(s))
650 				return(NULL);
651 			break;
652 		case OANY:
653 			if (sp == stop)
654 				return(NULL);
655 			sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
656 			if (wc == BADCHAR)
657 				return (NULL);
658 			break;
659 		case OANYOF:
660 			if (sp == stop)
661 				return (NULL);
662 			cs = &m->g->sets[OPND(s)];
663 			sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
664 			if (wc == BADCHAR || !CHIN(cs, wc))
665 				return(NULL);
666 			break;
667 		case OBOS:
668 			if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0)
669 				{ /* yes */ }
670 			else
671 				return(NULL);
672 			break;
673 		case OEOS:
674 			if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0)
675 				{ /* yes */ }
676 			else
677 				return(NULL);
678 			break;
679 		case OBOL:
680 			if ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
681 			    (sp > m->offp && sp < m->endp &&
682 			    *(sp-1) == '\n' && (m->g->cflags&REG_NEWLINE)))
683 				{ /* yes */ }
684 			else
685 				return(NULL);
686 			break;
687 		case OEOL:
688 			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
689 					(sp < m->endp && *sp == '\n' &&
690 						(m->g->cflags&REG_NEWLINE)) )
691 				{ /* yes */ }
692 			else
693 				return(NULL);
694 			break;
695 		case OWBND:
696 			if (ISBOW(m, sp) || ISEOW(m, sp))
697 				{ /* yes */ }
698 			else
699 				return(NULL);
700 			break;
701 		case ONWBND:
702 			if (((sp == m->beginp) && !ISWORD(*sp)) ||
703 			    (sp == m->endp && !ISWORD(*(sp - 1))))
704 				{ /* yes, beginning/end of subject */ }
705 			else if (ISWORD(*(sp - 1)) == ISWORD(*sp))
706 				{ /* yes, beginning/end of subject */ }
707 			else
708 				return(NULL);
709 			break;
710 		case OBOW:
711 			if (ISBOW(m, sp))
712 				{ /* yes */ }
713 			else
714 				return(NULL);
715 			break;
716 		case OEOW:
717 			if (ISEOW(m, sp))
718 				{ /* yes */ }
719 			else
720 				return(NULL);
721 			break;
722 		case O_QUEST:
723 			break;
724 		case OOR1:	/* matches null but needs to skip */
725 			ss++;
726 			s = m->g->strip[ss];
727 			do {
728 				assert(OP(s) == OOR2);
729 				ss += OPND(s);
730 			} while (OP(s = m->g->strip[ss]) != (sop)O_CH);
731 			/* note that the ss++ gets us past the O_CH */
732 			break;
733 		default:	/* have to make a choice */
734 			hard = 1;
735 			break;
736 		}
737 	if (!hard) {		/* that was it! */
738 		if (sp != stop)
739 			return(NULL);
740 		return(sp);
741 	}
742 	ss--;			/* adjust for the for's final increment */
743 
744 	/* the hard stuff */
745 	AT("hard", sp, stop, ss, stopst);
746 	s = m->g->strip[ss];
747 	switch (OP(s)) {
748 	case OBACK_:		/* the vilest depths */
749 		i = OPND(s);
750 		assert(0 < i && i <= m->g->nsub);
751 		if (m->pmatch[i].rm_eo == -1)
752 			return(NULL);
753 		assert(m->pmatch[i].rm_so != -1);
754 		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
755 		if (len == 0 && rec++ > MAX_RECURSION)
756 			return(NULL);
757 		assert(stop - m->beginp >= len);
758 		if (sp > stop - len)
759 			return(NULL);	/* not enough left to match */
760 		ssp = m->offp + m->pmatch[i].rm_so;
761 		if (memcmp(sp, ssp, len) != 0)
762 			return(NULL);
763 		while (m->g->strip[ss] != (sop)SOP(O_BACK, i))
764 			ss++;
765 		return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
766 	case OQUEST_:		/* to null or not */
767 		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
768 		if (dp != NULL)
769 			return(dp);	/* not */
770 		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
771 	case OPLUS_:
772 		assert(m->lastpos != NULL);
773 		assert(lev+1 <= m->g->nplus);
774 		m->lastpos[lev+1] = sp;
775 		return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
776 	case O_PLUS:
777 		if (sp == m->lastpos[lev])	/* last pass matched null */
778 			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
779 		/* try another pass */
780 		m->lastpos[lev] = sp;
781 		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
782 		if (dp == NULL)
783 			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
784 		else
785 			return(dp);
786 	case OCH_:		/* find the right one, if any */
787 		ssub = ss + 1;
788 		esub = ss + OPND(s) - 1;
789 		assert(OP(m->g->strip[esub]) == OOR1);
790 		for (;;) {	/* find first matching branch */
791 			dp = backref(m, sp, stop, ssub, esub, lev, rec);
792 			if (dp != NULL)
793 				return(dp);
794 			/* that one missed, try next one */
795 			if (OP(m->g->strip[esub]) == (sop)O_CH)
796 				return(NULL);	/* there is none */
797 			esub++;
798 			assert(OP(m->g->strip[esub]) == (sop)OOR2);
799 			ssub = esub + 1;
800 			esub += OPND(m->g->strip[esub]);
801 			if (OP(m->g->strip[esub]) == (sop)OOR2)
802 				esub--;
803 			else
804 				assert(OP(m->g->strip[esub]) == O_CH);
805 		}
806 		/* NOTREACHED */
807 		break;
808 	case OLPAREN:		/* must undo assignment if rest fails */
809 		i = OPND(s);
810 		assert(0 < i && i <= m->g->nsub);
811 		offsave = m->pmatch[i].rm_so;
812 		m->pmatch[i].rm_so = sp - m->offp;
813 		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
814 		if (dp != NULL)
815 			return(dp);
816 		m->pmatch[i].rm_so = offsave;
817 		return(NULL);
818 	case ORPAREN:		/* must undo assignment if rest fails */
819 		i = OPND(s);
820 		assert(0 < i && i <= m->g->nsub);
821 		offsave = m->pmatch[i].rm_eo;
822 		m->pmatch[i].rm_eo = sp - m->offp;
823 		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
824 		if (dp != NULL)
825 			return(dp);
826 		m->pmatch[i].rm_eo = offsave;
827 		return(NULL);
828 	default:		/* uh oh */
829 		assert(nope);
830 		break;
831 	}
832 
833 	/* "can't happen" */
834 	assert(nope);
835 	/* NOTREACHED */
836 	return "shut up gcc";
837 }
838 
839 /*
840  - walk - step through the string either quickly or slowly
841  == static const char *walk(struct match *m, const char *start, \
842  ==	const char *stop, sopno startst, sopno stopst, bool fast);
843  */
844 static const char * /* where it ended, or NULL */
845 walk(struct match *m, const char *start, const char *stop, sopno startst,
846 	sopno stopst, bool fast)
847 {
848 	states st = m->st;
849 	states fresh = m->fresh;
850 	states empty = m->empty;
851 	states tmp = m->tmp;
852 	const char *p = start;
853 	wint_t c;
854 	wint_t lastc;		/* previous c */
855 	wint_t flagch;
856 	int i, sflags;
857 	const char *matchp;	/* last p at which a match ended */
858 	size_t clen;
859 
860 	sflags = 0;
861 	AT("slow", start, stop, startst, stopst);
862 	CLEAR(st);
863 	SET1(st, startst);
864 	SP("sstart", st, *p);
865 	st = step(m->g, startst, stopst, st, NOTHING, st, sflags);
866 	if (fast)
867 		ASSIGN(fresh, st);
868 	matchp = NULL;
869 	if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL)))
870 		c = OUT;
871 	else {
872 		/*
873 		 * XXX Wrong if the previous character was multi-byte.
874 		 * Newline never is (in encodings supported by FreeBSD),
875 		 * so this only breaks the ISWORD tests below.
876 		 */
877 		c = (uch)*(start - 1);
878 	}
879 	for (;;) {
880 		/* next character */
881 		lastc = c;
882 		sflags = 0;
883 		if (p == m->endp) {
884 			c = OUT;
885 			clen = 0;
886 		} else
887 			clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
888 
889 		if (fast && EQ(st, fresh))
890 			matchp = p;
891 
892 		/* is there an EOL and/or BOL between lastc and c? */
893 		flagch = '\0';
894 		i = 0;
895 		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
896 				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
897 			flagch = BOL;
898 			i = m->g->nbol;
899 		}
900 		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
901 				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
902 			flagch = (flagch == BOL) ? BOLEOL : EOL;
903 			i += m->g->neol;
904 		}
905 		if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) {
906 			sflags |= SBOS;
907 			/* Step one more for BOS. */
908 			i++;
909 		}
910 		if (c == OUT && (m->eflags & REG_NOTEOL) == 0) {
911 			sflags |= SEOS;
912 			/* Step one more for EOS. */
913 			i++;
914 		}
915 		if (i != 0) {
916 			for (; i > 0; i--)
917 				st = step(m->g, startst, stopst, st, flagch, st,
918 				    sflags);
919 			SP("sboleol", st, c);
920 		}
921 
922 		/* how about a word boundary? */
923 		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
924 					(c != OUT && ISWORD(c)) ) {
925 			flagch = BOW;
926 		}
927 		if ( (lastc != OUT && ISWORD(lastc)) &&
928 				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
929 			flagch = EOW;
930 		}
931 		if (flagch == BOW || flagch == EOW) {
932 			st = step(m->g, startst, stopst, st, flagch, st, sflags);
933 			SP("sboweow", st, c);
934 		}
935 		if (lastc != OUT && c != OUT &&
936 		    ISWORD(lastc) == ISWORD(c)) {
937 			flagch = NWBND;
938 		} else if ((lastc == OUT && !ISWORD(c)) ||
939 		    (c == OUT && !ISWORD(lastc))) {
940 			flagch = NWBND;
941 		}
942 		if (flagch == NWBND) {
943 			st = step(m->g, startst, stopst, st, flagch, st, sflags);
944 			SP("snwbnd", st, c);
945 		}
946 
947 		/* are we done? */
948 		if (ISSET(st, stopst)) {
949 			if (fast)
950 				break;
951 			else
952 				matchp = p;
953 		}
954 		if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p))
955 			break;		/* NOTE BREAK OUT */
956 
957 		/* no, we must deal with this character */
958 		ASSIGN(tmp, st);
959 		if (fast)
960 			ASSIGN(st, fresh);
961 		else
962 			ASSIGN(st, empty);
963 		assert(c != OUT);
964 		st = step(m->g, startst, stopst, tmp, c, st, sflags);
965 		SP("saft", st, c);
966 		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags),
967 		    st));
968 		p += clen;
969 	}
970 
971 	if (fast) {
972 		assert(matchp != NULL);
973 		m->coldp = matchp;
974 		if (ISSET(st, stopst))
975 			return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0));
976 		else
977 			return (NULL);
978 	} else
979 		return (matchp);
980 }
981 
982 /*
983  - step - map set of states reachable before char to set reachable after
984  == static states step(struct re_guts *g, sopno start, sopno stop, \
985  ==	states bef, int ch, states aft);
986  == #define	BOL	(OUT-1)
987  == #define	EOL	(BOL-1)
988  == #define	BOLEOL	(BOL-2)
989  == #define	NOTHING	(BOL-3)
990  == #define	BOW	(BOL-4)
991  == #define	EOW	(BOL-5)
992  == #define	BADCHAR	(BOL-6)
993  == #define	NONCHAR(c)	((c) <= OUT)
994  */
995 static states
996 step(struct re_guts *g,
997 	sopno start,		/* start state within strip */
998 	sopno stop,		/* state after stop state within strip */
999 	states bef,		/* states reachable before */
1000 	wint_t ch,		/* character or NONCHAR code */
1001 	states aft,		/* states already known reachable after */
1002 	int sflags)		/* state flags */
1003 {
1004 	cset *cs;
1005 	sop s;
1006 	sopno pc;
1007 	onestate here;		/* note, macros know this name */
1008 	sopno look;
1009 	int i;
1010 
1011 	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
1012 		s = g->strip[pc];
1013 		switch (OP(s)) {
1014 		case OEND:
1015 			assert(pc == stop-1);
1016 			break;
1017 		case OCHAR:
1018 			/* only characters can match */
1019 			assert(!NONCHAR(ch) || ch != OPND(s));
1020 			if (ch == OPND(s))
1021 				FWD(aft, bef, 1);
1022 			break;
1023 		case OBOS:
1024 			if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0)
1025 				FWD(aft, bef, 1);
1026 			break;
1027 		case OEOS:
1028 			if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0)
1029 				FWD(aft, bef, 1);
1030 			break;
1031 		case OBOL:
1032 			if (ch == BOL || ch == BOLEOL)
1033 				FWD(aft, bef, 1);
1034 			break;
1035 		case OEOL:
1036 			if (ch == EOL || ch == BOLEOL)
1037 				FWD(aft, bef, 1);
1038 			break;
1039 		case OBOW:
1040 			if (ch == BOW)
1041 				FWD(aft, bef, 1);
1042 			break;
1043 		case OEOW:
1044 			if (ch == EOW)
1045 				FWD(aft, bef, 1);
1046 			break;
1047 		case OWBND:
1048 			if (ch == BOW || ch == EOW)
1049 				FWD(aft, bef, 1);
1050 			break;
1051 		case ONWBND:
1052 			if (ch == NWBND)
1053 				FWD(aft, aft, 1);
1054 			break;
1055 		case OANY:
1056 			if (!NONCHAR(ch))
1057 				FWD(aft, bef, 1);
1058 			break;
1059 		case OANYOF:
1060 			cs = &g->sets[OPND(s)];
1061 			if (!NONCHAR(ch) && CHIN(cs, ch))
1062 				FWD(aft, bef, 1);
1063 			break;
1064 		case OBACK_:		/* ignored here */
1065 		case O_BACK:
1066 			FWD(aft, aft, 1);
1067 			break;
1068 		case OPLUS_:		/* forward, this is just an empty */
1069 			FWD(aft, aft, 1);
1070 			break;
1071 		case O_PLUS:		/* both forward and back */
1072 			FWD(aft, aft, 1);
1073 			i = ISSETBACK(aft, OPND(s));
1074 			BACK(aft, aft, OPND(s));
1075 			if (!i && ISSETBACK(aft, OPND(s))) {
1076 				/* oho, must reconsider loop body */
1077 				pc -= OPND(s) + 1;
1078 				INIT(here, pc);
1079 			}
1080 			break;
1081 		case OQUEST_:		/* two branches, both forward */
1082 			FWD(aft, aft, 1);
1083 			FWD(aft, aft, OPND(s));
1084 			break;
1085 		case O_QUEST:		/* just an empty */
1086 			FWD(aft, aft, 1);
1087 			break;
1088 		case OLPAREN:		/* not significant here */
1089 		case ORPAREN:
1090 			FWD(aft, aft, 1);
1091 			break;
1092 		case OCH_:		/* mark the first two branches */
1093 			FWD(aft, aft, 1);
1094 			assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2);
1095 			FWD(aft, aft, OPND(s));
1096 			break;
1097 		case OOR1:		/* done a branch, find the O_CH */
1098 			if (ISSTATEIN(aft, here)) {
1099 				for (look = 1;
1100 				    OP(s = g->strip[pc+look]) != (sop)O_CH;
1101 				    look += OPND(s))
1102 					assert(OP(s) == (sop)OOR2);
1103 				FWD(aft, aft, look + 1);
1104 			}
1105 			break;
1106 		case OOR2:		/* propagate OCH_'s marking */
1107 			FWD(aft, aft, 1);
1108 			if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) {
1109 				assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2);
1110 				FWD(aft, aft, OPND(s));
1111 			}
1112 			break;
1113 		case O_CH:		/* just empty */
1114 			FWD(aft, aft, 1);
1115 			break;
1116 		default:		/* ooooops... */
1117 			assert(nope);
1118 			break;
1119 		}
1120 	}
1121 
1122 	return(aft);
1123 }
1124 
1125 #ifdef REDEBUG
1126 /*
1127  - print - print a set of states
1128  == #ifdef REDEBUG
1129  == static void print(struct match *m, const char *caption, states st, \
1130  ==	int ch, FILE *d);
1131  == #endif
1132  */
1133 static void
1134 print(struct match *m,
1135 	const char *caption,
1136 	states st,
1137 	int ch,
1138 	FILE *d)
1139 {
1140 	struct re_guts *g = m->g;
1141 	sopno i;
1142 	int first = 1;
1143 
1144 	if (!(m->eflags&REG_TRACE))
1145 		return;
1146 
1147 	fprintf(d, "%s", caption);
1148 	if (ch != '\0')
1149 		fprintf(d, " %s", pchar(ch));
1150 	for (i = 0; i < g->nstates; i++)
1151 		if (ISSET(st, i)) {
1152 			fprintf(d, "%s%lu", (first) ? "\t" : ", ", i);
1153 			first = 0;
1154 		}
1155 	fprintf(d, "\n");
1156 }
1157 
1158 /*
1159  - at - print current situation
1160  == #ifdef REDEBUG
1161  == static void at(struct match *m, const char *title, const char *start, \
1162  ==			 const char *stop, sopno startst, sopno stopst);
1163  == #endif
1164  */
1165 static void
1166 at(	struct match *m,
1167 	const char *title,
1168 	const char *start,
1169 	const char *stop,
1170 	sopno startst,
1171 	sopno stopst)
1172 {
1173 	if (!(m->eflags&REG_TRACE))
1174 		return;
1175 
1176 	printf("%s %s-", title, pchar(*start));
1177 	printf("%s ", pchar(*stop));
1178 	printf("%ld-%ld\n", (long)startst, (long)stopst);
1179 }
1180 
1181 #ifndef PCHARDONE
1182 #define	PCHARDONE	/* never again */
1183 /*
1184  - pchar - make a character printable
1185  == #ifdef REDEBUG
1186  == static const char *pchar(int ch);
1187  == #endif
1188  *
1189  * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1190  * duplicate here avoids having a debugging-capable regexec.o tied to
1191  * a matching debug.o, and this is convenient.  It all disappears in
1192  * the non-debug compilation anyway, so it doesn't matter much.
1193  */
1194 static const char *		/* -> representation */
1195 pchar(int ch)
1196 {
1197 	static char pbuf[10];
1198 
1199 	if (isprint((uch)ch) || ch == ' ')
1200 		sprintf(pbuf, "%c", ch);
1201 	else
1202 		sprintf(pbuf, "\\%o", ch);
1203 	return(pbuf);
1204 }
1205 #endif
1206 #endif
1207 
1208 #undef	stepback
1209 #undef	matcher
1210 #undef	walk
1211 #undef	dissect
1212 #undef	backref
1213 #undef	step
1214 #undef	print
1215 #undef	at
1216 #undef	match
1217