xref: /freebsd/lib/libc/regex/regcomp.c (revision 884d26c84cba3ffc3d4e626306098fcdfe6a0c2b)
1 /*-
2  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3  * Copyright (c) 1992, 1993, 1994
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * Copyright (c) 2011 The FreeBSD Foundation
7  * All rights reserved.
8  * Portions of this software were developed by David Chisnall
9  * under sponsorship from the FreeBSD Foundation.
10  *
11  * This code is derived from software contributed to Berkeley by
12  * Henry Spencer.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
39  */
40 
41 #if defined(LIBC_SCCS) && !defined(lint)
42 static char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
43 #endif /* LIBC_SCCS and not lint */
44 #include <sys/cdefs.h>
45 __FBSDID("$FreeBSD$");
46 
47 #include <sys/types.h>
48 #include <stdio.h>
49 #include <string.h>
50 #include <ctype.h>
51 #include <limits.h>
52 #include <stdlib.h>
53 #include <regex.h>
54 #include <wchar.h>
55 #include <wctype.h>
56 
57 #include "utils.h"
58 #include "regex2.h"
59 
60 #include "cname.h"
61 
62 /*
63  * parse structure, passed up and down to avoid global variables and
64  * other clumsinesses
65  */
66 struct parse {
67 	char *next;		/* next character in RE */
68 	char *end;		/* end of string (-> NUL normally) */
69 	int error;		/* has an error been seen? */
70 	sop *strip;		/* malloced strip */
71 	sopno ssize;		/* malloced strip size (allocated) */
72 	sopno slen;		/* malloced strip length (used) */
73 	int ncsalloc;		/* number of csets allocated */
74 	struct re_guts *g;
75 #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
76 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
77 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
78 };
79 
80 /* ========= begin header generated by ./mkh ========= */
81 #ifdef __cplusplus
82 extern "C" {
83 #endif
84 
85 /* === regcomp.c === */
86 static void p_ere(struct parse *p, int stop);
87 static void p_ere_exp(struct parse *p);
88 static void p_str(struct parse *p);
89 static void p_bre(struct parse *p, int end1, int end2);
90 static int p_simp_re(struct parse *p, int starordinary);
91 static int p_count(struct parse *p);
92 static void p_bracket(struct parse *p);
93 static void p_b_term(struct parse *p, cset *cs);
94 static void p_b_cclass(struct parse *p, cset *cs);
95 static void p_b_eclass(struct parse *p, cset *cs);
96 static wint_t p_b_symbol(struct parse *p);
97 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
98 static wint_t othercase(wint_t ch);
99 static void bothcases(struct parse *p, wint_t ch);
100 static void ordinary(struct parse *p, wint_t ch);
101 static void nonnewline(struct parse *p);
102 static void repeat(struct parse *p, sopno start, int from, int to);
103 static int seterr(struct parse *p, int e);
104 static cset *allocset(struct parse *p);
105 static void freeset(struct parse *p, cset *cs);
106 static void CHadd(struct parse *p, cset *cs, wint_t ch);
107 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
108 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
109 static wint_t singleton(cset *cs);
110 static sopno dupl(struct parse *p, sopno start, sopno finish);
111 static void doemit(struct parse *p, sop op, size_t opnd);
112 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
113 static void dofwd(struct parse *p, sopno pos, sop value);
114 static int enlarge(struct parse *p, sopno size);
115 static void stripsnug(struct parse *p, struct re_guts *g);
116 static void findmust(struct parse *p, struct re_guts *g);
117 static int altoffset(sop *scan, int offset);
118 static void computejumps(struct parse *p, struct re_guts *g);
119 static void computematchjumps(struct parse *p, struct re_guts *g);
120 static sopno pluscount(struct parse *p, struct re_guts *g);
121 static wint_t wgetnext(struct parse *p);
122 
123 #ifdef __cplusplus
124 }
125 #endif
126 /* ========= end header generated by ./mkh ========= */
127 
128 static char nuls[10];		/* place to point scanner in event of error */
129 
130 /*
131  * macros for use with parse structure
132  * BEWARE:  these know that the parse structure is named `p' !!!
133  */
134 #define	PEEK()	(*p->next)
135 #define	PEEK2()	(*(p->next+1))
136 #define	MORE()	(p->next < p->end)
137 #define	MORE2()	(p->next+1 < p->end)
138 #define	SEE(c)	(MORE() && PEEK() == (c))
139 #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
140 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
141 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
142 #define	NEXT()	(p->next++)
143 #define	NEXT2()	(p->next += 2)
144 #define	NEXTn(n)	(p->next += (n))
145 #define	GETNEXT()	(*p->next++)
146 #define	WGETNEXT()	wgetnext(p)
147 #define	SETERROR(e)	seterr(p, (e))
148 #define	REQUIRE(co, e)	((co) || SETERROR(e))
149 #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
150 #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
151 #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
152 #define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
153 #define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
154 #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
155 #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
156 #define	HERE()		(p->slen)
157 #define	THERE()		(p->slen - 1)
158 #define	THERETHERE()	(p->slen - 2)
159 #define	DROP(n)	(p->slen -= (n))
160 
161 #ifndef NDEBUG
162 static int never = 0;		/* for use in asserts; shuts lint up */
163 #else
164 #define	never	0		/* some <assert.h>s have bugs too */
165 #endif
166 
167 /* Macro used by computejump()/computematchjump() */
168 #define MIN(a,b)	((a)<(b)?(a):(b))
169 
170 /*
171  - regcomp - interface for parser and compilation
172  = extern int regcomp(regex_t *, const char *, int);
173  = #define	REG_BASIC	0000
174  = #define	REG_EXTENDED	0001
175  = #define	REG_ICASE	0002
176  = #define	REG_NOSUB	0004
177  = #define	REG_NEWLINE	0010
178  = #define	REG_NOSPEC	0020
179  = #define	REG_PEND	0040
180  = #define	REG_DUMP	0200
181  */
182 int				/* 0 success, otherwise REG_something */
183 regcomp(regex_t * __restrict preg,
184 	const char * __restrict pattern,
185 	int cflags)
186 {
187 	struct parse pa;
188 	struct re_guts *g;
189 	struct parse *p = &pa;
190 	int i;
191 	size_t len;
192 	size_t maxlen;
193 #ifdef REDEBUG
194 #	define	GOODFLAGS(f)	(f)
195 #else
196 #	define	GOODFLAGS(f)	((f)&~REG_DUMP)
197 #endif
198 
199 	cflags = GOODFLAGS(cflags);
200 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
201 		return(REG_INVARG);
202 
203 	if (cflags&REG_PEND) {
204 		if (preg->re_endp < pattern)
205 			return(REG_INVARG);
206 		len = preg->re_endp - pattern;
207 	} else
208 		len = strlen((char *)pattern);
209 
210 	/* do the mallocs early so failure handling is easy */
211 	g = (struct re_guts *)malloc(sizeof(struct re_guts));
212 	if (g == NULL)
213 		return(REG_ESPACE);
214 	/*
215 	 * Limit the pattern space to avoid a 32-bit overflow on buffer
216 	 * extension.  Also avoid any signed overflow in case of conversion
217 	 * so make the real limit based on a 31-bit overflow.
218 	 *
219 	 * Likely not applicable on 64-bit systems but handle the case
220 	 * generically (who are we to stop people from using ~715MB+
221 	 * patterns?).
222 	 */
223 	maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3;
224 	if (len >= maxlen) {
225 		free((char *)g);
226 		return(REG_ESPACE);
227 	}
228 	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
229 	assert(p->ssize >= len);
230 
231 	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
232 	p->slen = 0;
233 	if (p->strip == NULL) {
234 		free((char *)g);
235 		return(REG_ESPACE);
236 	}
237 
238 	/* set things up */
239 	p->g = g;
240 	p->next = (char *)pattern;	/* convenience; we do not modify it */
241 	p->end = p->next + len;
242 	p->error = 0;
243 	p->ncsalloc = 0;
244 	for (i = 0; i < NPAREN; i++) {
245 		p->pbegin[i] = 0;
246 		p->pend[i] = 0;
247 	}
248 	g->sets = NULL;
249 	g->ncsets = 0;
250 	g->cflags = cflags;
251 	g->iflags = 0;
252 	g->nbol = 0;
253 	g->neol = 0;
254 	g->must = NULL;
255 	g->moffset = -1;
256 	g->charjump = NULL;
257 	g->matchjump = NULL;
258 	g->mlen = 0;
259 	g->nsub = 0;
260 	g->backrefs = 0;
261 
262 	/* do it */
263 	EMIT(OEND, 0);
264 	g->firststate = THERE();
265 	if (cflags&REG_EXTENDED)
266 		p_ere(p, OUT);
267 	else if (cflags&REG_NOSPEC)
268 		p_str(p);
269 	else
270 		p_bre(p, OUT, OUT);
271 	EMIT(OEND, 0);
272 	g->laststate = THERE();
273 
274 	/* tidy up loose ends and fill things in */
275 	stripsnug(p, g);
276 	findmust(p, g);
277 	/* only use Boyer-Moore algorithm if the pattern is bigger
278 	 * than three characters
279 	 */
280 	if(g->mlen > 3) {
281 		computejumps(p, g);
282 		computematchjumps(p, g);
283 		if(g->matchjump == NULL && g->charjump != NULL) {
284 			free(g->charjump);
285 			g->charjump = NULL;
286 		}
287 	}
288 	g->nplus = pluscount(p, g);
289 	g->magic = MAGIC2;
290 	preg->re_nsub = g->nsub;
291 	preg->re_g = g;
292 	preg->re_magic = MAGIC1;
293 #ifndef REDEBUG
294 	/* not debugging, so can't rely on the assert() in regexec() */
295 	if (g->iflags&BAD)
296 		SETERROR(REG_ASSERT);
297 #endif
298 
299 	/* win or lose, we're done */
300 	if (p->error != 0)	/* lose */
301 		regfree(preg);
302 	return(p->error);
303 }
304 
305 /*
306  - p_ere - ERE parser top level, concatenation and alternation
307  == static void p_ere(struct parse *p, int_t stop);
308  */
309 static void
310 p_ere(struct parse *p,
311 	int stop)		/* character this ERE should end at */
312 {
313 	char c;
314 	sopno prevback;
315 	sopno prevfwd;
316 	sopno conc;
317 	int first = 1;		/* is this the first alternative? */
318 
319 	for (;;) {
320 		/* do a bunch of concatenated expressions */
321 		conc = HERE();
322 		while (MORE() && (c = PEEK()) != '|' && c != stop)
323 			p_ere_exp(p);
324 		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
325 
326 		if (!EAT('|'))
327 			break;		/* NOTE BREAK OUT */
328 
329 		if (first) {
330 			INSERT(OCH_, conc);	/* offset is wrong */
331 			prevfwd = conc;
332 			prevback = conc;
333 			first = 0;
334 		}
335 		ASTERN(OOR1, prevback);
336 		prevback = THERE();
337 		AHEAD(prevfwd);			/* fix previous offset */
338 		prevfwd = HERE();
339 		EMIT(OOR2, 0);			/* offset is very wrong */
340 	}
341 
342 	if (!first) {		/* tail-end fixups */
343 		AHEAD(prevfwd);
344 		ASTERN(O_CH, prevback);
345 	}
346 
347 	assert(!MORE() || SEE(stop));
348 }
349 
350 /*
351  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
352  == static void p_ere_exp(struct parse *p);
353  */
354 static void
355 p_ere_exp(struct parse *p)
356 {
357 	char c;
358 	wint_t wc;
359 	sopno pos;
360 	int count;
361 	int count2;
362 	sopno subno;
363 	int wascaret = 0;
364 
365 	assert(MORE());		/* caller should have ensured this */
366 	c = GETNEXT();
367 
368 	pos = HERE();
369 	switch (c) {
370 	case '(':
371 		(void)REQUIRE(MORE(), REG_EPAREN);
372 		p->g->nsub++;
373 		subno = p->g->nsub;
374 		if (subno < NPAREN)
375 			p->pbegin[subno] = HERE();
376 		EMIT(OLPAREN, subno);
377 		if (!SEE(')'))
378 			p_ere(p, ')');
379 		if (subno < NPAREN) {
380 			p->pend[subno] = HERE();
381 			assert(p->pend[subno] != 0);
382 		}
383 		EMIT(ORPAREN, subno);
384 		(void)MUSTEAT(')', REG_EPAREN);
385 		break;
386 #ifndef POSIX_MISTAKE
387 	case ')':		/* happens only if no current unmatched ( */
388 		/*
389 		 * You may ask, why the ifndef?  Because I didn't notice
390 		 * this until slightly too late for 1003.2, and none of the
391 		 * other 1003.2 regular-expression reviewers noticed it at
392 		 * all.  So an unmatched ) is legal POSIX, at least until
393 		 * we can get it fixed.
394 		 */
395 		SETERROR(REG_EPAREN);
396 		break;
397 #endif
398 	case '^':
399 		EMIT(OBOL, 0);
400 		p->g->iflags |= USEBOL;
401 		p->g->nbol++;
402 		wascaret = 1;
403 		break;
404 	case '$':
405 		EMIT(OEOL, 0);
406 		p->g->iflags |= USEEOL;
407 		p->g->neol++;
408 		break;
409 	case '|':
410 		SETERROR(REG_EMPTY);
411 		break;
412 	case '*':
413 	case '+':
414 	case '?':
415 		SETERROR(REG_BADRPT);
416 		break;
417 	case '.':
418 		if (p->g->cflags&REG_NEWLINE)
419 			nonnewline(p);
420 		else
421 			EMIT(OANY, 0);
422 		break;
423 	case '[':
424 		p_bracket(p);
425 		break;
426 	case '\\':
427 		(void)REQUIRE(MORE(), REG_EESCAPE);
428 		wc = WGETNEXT();
429 		switch (wc) {
430 		case '<':
431 			EMIT(OBOW, 0);
432 			break;
433 		case '>':
434 			EMIT(OEOW, 0);
435 			break;
436 		default:
437 			ordinary(p, wc);
438 			break;
439 		}
440 		break;
441 	case '{':		/* okay as ordinary except if digit follows */
442 		(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
443 		/* FALLTHROUGH */
444 	default:
445 		p->next--;
446 		wc = WGETNEXT();
447 		ordinary(p, wc);
448 		break;
449 	}
450 
451 	if (!MORE())
452 		return;
453 	c = PEEK();
454 	/* we call { a repetition if followed by a digit */
455 	if (!( c == '*' || c == '+' || c == '?' ||
456 				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
457 		return;		/* no repetition, we're done */
458 	NEXT();
459 
460 	(void)REQUIRE(!wascaret, REG_BADRPT);
461 	switch (c) {
462 	case '*':	/* implemented as +? */
463 		/* this case does not require the (y|) trick, noKLUDGE */
464 		INSERT(OPLUS_, pos);
465 		ASTERN(O_PLUS, pos);
466 		INSERT(OQUEST_, pos);
467 		ASTERN(O_QUEST, pos);
468 		break;
469 	case '+':
470 		INSERT(OPLUS_, pos);
471 		ASTERN(O_PLUS, pos);
472 		break;
473 	case '?':
474 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
475 		INSERT(OCH_, pos);		/* offset slightly wrong */
476 		ASTERN(OOR1, pos);		/* this one's right */
477 		AHEAD(pos);			/* fix the OCH_ */
478 		EMIT(OOR2, 0);			/* offset very wrong... */
479 		AHEAD(THERE());			/* ...so fix it */
480 		ASTERN(O_CH, THERETHERE());
481 		break;
482 	case '{':
483 		count = p_count(p);
484 		if (EAT(',')) {
485 			if (isdigit((uch)PEEK())) {
486 				count2 = p_count(p);
487 				(void)REQUIRE(count <= count2, REG_BADBR);
488 			} else		/* single number with comma */
489 				count2 = INFINITY;
490 		} else		/* just a single number */
491 			count2 = count;
492 		repeat(p, pos, count, count2);
493 		if (!EAT('}')) {	/* error heuristics */
494 			while (MORE() && PEEK() != '}')
495 				NEXT();
496 			(void)REQUIRE(MORE(), REG_EBRACE);
497 			SETERROR(REG_BADBR);
498 		}
499 		break;
500 	}
501 
502 	if (!MORE())
503 		return;
504 	c = PEEK();
505 	if (!( c == '*' || c == '+' || c == '?' ||
506 				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
507 		return;
508 	SETERROR(REG_BADRPT);
509 }
510 
511 /*
512  - p_str - string (no metacharacters) "parser"
513  == static void p_str(struct parse *p);
514  */
515 static void
516 p_str(struct parse *p)
517 {
518 	(void)REQUIRE(MORE(), REG_EMPTY);
519 	while (MORE())
520 		ordinary(p, WGETNEXT());
521 }
522 
523 /*
524  - p_bre - BRE parser top level, anchoring and concatenation
525  == static void p_bre(struct parse *p,  int end1, \
526  ==	int end2);
527  * Giving end1 as OUT essentially eliminates the end1/end2 check.
528  *
529  * This implementation is a bit of a kludge, in that a trailing $ is first
530  * taken as an ordinary character and then revised to be an anchor.
531  * The amount of lookahead needed to avoid this kludge is excessive.
532  */
533 static void
534 p_bre(struct parse *p,
535 	int end1,		/* first terminating character */
536 	int end2)		/* second terminating character */
537 {
538 	sopno start = HERE();
539 	int first = 1;			/* first subexpression? */
540 	int wasdollar = 0;
541 
542 	if (EAT('^')) {
543 		EMIT(OBOL, 0);
544 		p->g->iflags |= USEBOL;
545 		p->g->nbol++;
546 	}
547 	while (MORE() && !SEETWO(end1, end2)) {
548 		wasdollar = p_simp_re(p, first);
549 		first = 0;
550 	}
551 	if (wasdollar) {	/* oops, that was a trailing anchor */
552 		DROP(1);
553 		EMIT(OEOL, 0);
554 		p->g->iflags |= USEEOL;
555 		p->g->neol++;
556 	}
557 
558 	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
559 }
560 
561 /*
562  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
563  == static int p_simp_re(struct parse *p, int starordinary);
564  */
565 static int			/* was the simple RE an unbackslashed $? */
566 p_simp_re(struct parse *p,
567 	int starordinary)	/* is a leading * an ordinary character? */
568 {
569 	int c;
570 	int count;
571 	int count2;
572 	sopno pos;
573 	int i;
574 	wint_t wc;
575 	sopno subno;
576 #	define	BACKSL	(1<<CHAR_BIT)
577 
578 	pos = HERE();		/* repetition op, if any, covers from here */
579 
580 	assert(MORE());		/* caller should have ensured this */
581 	c = GETNEXT();
582 	if (c == '\\') {
583 		(void)REQUIRE(MORE(), REG_EESCAPE);
584 		c = BACKSL | GETNEXT();
585 	}
586 	switch (c) {
587 	case '.':
588 		if (p->g->cflags&REG_NEWLINE)
589 			nonnewline(p);
590 		else
591 			EMIT(OANY, 0);
592 		break;
593 	case '[':
594 		p_bracket(p);
595 		break;
596 	case BACKSL|'<':
597 		EMIT(OBOW, 0);
598 		break;
599 	case BACKSL|'>':
600 		EMIT(OEOW, 0);
601 		break;
602 	case BACKSL|'{':
603 		SETERROR(REG_BADRPT);
604 		break;
605 	case BACKSL|'(':
606 		p->g->nsub++;
607 		subno = p->g->nsub;
608 		if (subno < NPAREN)
609 			p->pbegin[subno] = HERE();
610 		EMIT(OLPAREN, subno);
611 		/* the MORE here is an error heuristic */
612 		if (MORE() && !SEETWO('\\', ')'))
613 			p_bre(p, '\\', ')');
614 		if (subno < NPAREN) {
615 			p->pend[subno] = HERE();
616 			assert(p->pend[subno] != 0);
617 		}
618 		EMIT(ORPAREN, subno);
619 		(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
620 		break;
621 	case BACKSL|')':	/* should not get here -- must be user */
622 	case BACKSL|'}':
623 		SETERROR(REG_EPAREN);
624 		break;
625 	case BACKSL|'1':
626 	case BACKSL|'2':
627 	case BACKSL|'3':
628 	case BACKSL|'4':
629 	case BACKSL|'5':
630 	case BACKSL|'6':
631 	case BACKSL|'7':
632 	case BACKSL|'8':
633 	case BACKSL|'9':
634 		i = (c&~BACKSL) - '0';
635 		assert(i < NPAREN);
636 		if (p->pend[i] != 0) {
637 			assert(i <= p->g->nsub);
638 			EMIT(OBACK_, i);
639 			assert(p->pbegin[i] != 0);
640 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
641 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
642 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
643 			EMIT(O_BACK, i);
644 		} else
645 			SETERROR(REG_ESUBREG);
646 		p->g->backrefs = 1;
647 		break;
648 	case '*':
649 		(void)REQUIRE(starordinary, REG_BADRPT);
650 		/* FALLTHROUGH */
651 	default:
652 		p->next--;
653 		wc = WGETNEXT();
654 		ordinary(p, wc);
655 		break;
656 	}
657 
658 	if (EAT('*')) {		/* implemented as +? */
659 		/* this case does not require the (y|) trick, noKLUDGE */
660 		INSERT(OPLUS_, pos);
661 		ASTERN(O_PLUS, pos);
662 		INSERT(OQUEST_, pos);
663 		ASTERN(O_QUEST, pos);
664 	} else if (EATTWO('\\', '{')) {
665 		count = p_count(p);
666 		if (EAT(',')) {
667 			if (MORE() && isdigit((uch)PEEK())) {
668 				count2 = p_count(p);
669 				(void)REQUIRE(count <= count2, REG_BADBR);
670 			} else		/* single number with comma */
671 				count2 = INFINITY;
672 		} else		/* just a single number */
673 			count2 = count;
674 		repeat(p, pos, count, count2);
675 		if (!EATTWO('\\', '}')) {	/* error heuristics */
676 			while (MORE() && !SEETWO('\\', '}'))
677 				NEXT();
678 			(void)REQUIRE(MORE(), REG_EBRACE);
679 			SETERROR(REG_BADBR);
680 		}
681 	} else if (c == '$')     /* $ (but not \$) ends it */
682 		return(1);
683 
684 	return(0);
685 }
686 
687 /*
688  - p_count - parse a repetition count
689  == static int p_count(struct parse *p);
690  */
691 static int			/* the value */
692 p_count(struct parse *p)
693 {
694 	int count = 0;
695 	int ndigits = 0;
696 
697 	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
698 		count = count*10 + (GETNEXT() - '0');
699 		ndigits++;
700 	}
701 
702 	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
703 	return(count);
704 }
705 
706 /*
707  - p_bracket - parse a bracketed character list
708  == static void p_bracket(struct parse *p);
709  */
710 static void
711 p_bracket(struct parse *p)
712 {
713 	cset *cs;
714 	wint_t ch;
715 
716 	/* Dept of Truly Sickening Special-Case Kludges */
717 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
718 		EMIT(OBOW, 0);
719 		NEXTn(6);
720 		return;
721 	}
722 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
723 		EMIT(OEOW, 0);
724 		NEXTn(6);
725 		return;
726 	}
727 
728 	if ((cs = allocset(p)) == NULL)
729 		return;
730 
731 	if (p->g->cflags&REG_ICASE)
732 		cs->icase = 1;
733 	if (EAT('^'))
734 		cs->invert = 1;
735 	if (EAT(']'))
736 		CHadd(p, cs, ']');
737 	else if (EAT('-'))
738 		CHadd(p, cs, '-');
739 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
740 		p_b_term(p, cs);
741 	if (EAT('-'))
742 		CHadd(p, cs, '-');
743 	(void)MUSTEAT(']', REG_EBRACK);
744 
745 	if (p->error != 0)	/* don't mess things up further */
746 		return;
747 
748 	if (cs->invert && p->g->cflags&REG_NEWLINE)
749 		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
750 
751 	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
752 		ordinary(p, ch);
753 		freeset(p, cs);
754 	} else
755 		EMIT(OANYOF, (int)(cs - p->g->sets));
756 }
757 
758 /*
759  - p_b_term - parse one term of a bracketed character list
760  == static void p_b_term(struct parse *p, cset *cs);
761  */
762 static void
763 p_b_term(struct parse *p, cset *cs)
764 {
765 	char c;
766 	wint_t start, finish;
767 
768 	/* classify what we've got */
769 	switch ((MORE()) ? PEEK() : '\0') {
770 	case '[':
771 		c = (MORE2()) ? PEEK2() : '\0';
772 		break;
773 	case '-':
774 		SETERROR(REG_ERANGE);
775 		return;			/* NOTE RETURN */
776 	default:
777 		c = '\0';
778 		break;
779 	}
780 
781 	switch (c) {
782 	case ':':		/* character class */
783 		NEXT2();
784 		(void)REQUIRE(MORE(), REG_EBRACK);
785 		c = PEEK();
786 		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
787 		p_b_cclass(p, cs);
788 		(void)REQUIRE(MORE(), REG_EBRACK);
789 		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
790 		break;
791 	case '=':		/* equivalence class */
792 		NEXT2();
793 		(void)REQUIRE(MORE(), REG_EBRACK);
794 		c = PEEK();
795 		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
796 		p_b_eclass(p, cs);
797 		(void)REQUIRE(MORE(), REG_EBRACK);
798 		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
799 		break;
800 	default:		/* symbol, ordinary character, or range */
801 		start = p_b_symbol(p);
802 		if (SEE('-') && MORE2() && PEEK2() != ']') {
803 			/* range */
804 			NEXT();
805 			if (EAT('-'))
806 				finish = '-';
807 			else
808 				finish = p_b_symbol(p);
809 		} else
810 			finish = start;
811 		if (start == finish)
812 			CHadd(p, cs, start);
813 		else {
814 			(void)REQUIRE(start <= finish, REG_ERANGE);
815 			CHaddrange(p, cs, start, finish);
816 		}
817 		break;
818 	}
819 }
820 
821 /*
822  - p_b_cclass - parse a character-class name and deal with it
823  == static void p_b_cclass(struct parse *p, cset *cs);
824  */
825 static void
826 p_b_cclass(struct parse *p, cset *cs)
827 {
828 	char *sp = p->next;
829 	size_t len;
830 	wctype_t wct;
831 	char clname[16];
832 
833 	while (MORE() && isalpha((uch)PEEK()))
834 		NEXT();
835 	len = p->next - sp;
836 	if (len >= sizeof(clname) - 1) {
837 		SETERROR(REG_ECTYPE);
838 		return;
839 	}
840 	memcpy(clname, sp, len);
841 	clname[len] = '\0';
842 	if ((wct = wctype(clname)) == 0) {
843 		SETERROR(REG_ECTYPE);
844 		return;
845 	}
846 	CHaddtype(p, cs, wct);
847 }
848 
849 /*
850  - p_b_eclass - parse an equivalence-class name and deal with it
851  == static void p_b_eclass(struct parse *p, cset *cs);
852  *
853  * This implementation is incomplete. xxx
854  */
855 static void
856 p_b_eclass(struct parse *p, cset *cs)
857 {
858 	wint_t c;
859 
860 	c = p_b_coll_elem(p, '=');
861 	CHadd(p, cs, c);
862 }
863 
864 /*
865  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
866  == static wint_t p_b_symbol(struct parse *p);
867  */
868 static wint_t			/* value of symbol */
869 p_b_symbol(struct parse *p)
870 {
871 	wint_t value;
872 
873 	(void)REQUIRE(MORE(), REG_EBRACK);
874 	if (!EATTWO('[', '.'))
875 		return(WGETNEXT());
876 
877 	/* collating symbol */
878 	value = p_b_coll_elem(p, '.');
879 	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
880 	return(value);
881 }
882 
883 /*
884  - p_b_coll_elem - parse a collating-element name and look it up
885  == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
886  */
887 static wint_t			/* value of collating element */
888 p_b_coll_elem(struct parse *p,
889 	wint_t endc)		/* name ended by endc,']' */
890 {
891 	char *sp = p->next;
892 	struct cname *cp;
893 	int len;
894 	mbstate_t mbs;
895 	wchar_t wc;
896 	size_t clen;
897 
898 	while (MORE() && !SEETWO(endc, ']'))
899 		NEXT();
900 	if (!MORE()) {
901 		SETERROR(REG_EBRACK);
902 		return(0);
903 	}
904 	len = p->next - sp;
905 	for (cp = cnames; cp->name != NULL; cp++)
906 		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
907 			return(cp->code);	/* known name */
908 	memset(&mbs, 0, sizeof(mbs));
909 	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
910 		return (wc);			/* single character */
911 	else if (clen == (size_t)-1 || clen == (size_t)-2)
912 		SETERROR(REG_ILLSEQ);
913 	else
914 		SETERROR(REG_ECOLLATE);		/* neither */
915 	return(0);
916 }
917 
918 /*
919  - othercase - return the case counterpart of an alphabetic
920  == static wint_t othercase(wint_t ch);
921  */
922 static wint_t			/* if no counterpart, return ch */
923 othercase(wint_t ch)
924 {
925 	assert(iswalpha(ch));
926 	if (iswupper(ch))
927 		return(towlower(ch));
928 	else if (iswlower(ch))
929 		return(towupper(ch));
930 	else			/* peculiar, but could happen */
931 		return(ch);
932 }
933 
934 /*
935  - bothcases - emit a dualcase version of a two-case character
936  == static void bothcases(struct parse *p, wint_t ch);
937  *
938  * Boy, is this implementation ever a kludge...
939  */
940 static void
941 bothcases(struct parse *p, wint_t ch)
942 {
943 	char *oldnext = p->next;
944 	char *oldend = p->end;
945 	char bracket[3 + MB_LEN_MAX];
946 	size_t n;
947 	mbstate_t mbs;
948 
949 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
950 	p->next = bracket;
951 	memset(&mbs, 0, sizeof(mbs));
952 	n = wcrtomb(bracket, ch, &mbs);
953 	assert(n != (size_t)-1);
954 	bracket[n] = ']';
955 	bracket[n + 1] = '\0';
956 	p->end = bracket+n+1;
957 	p_bracket(p);
958 	assert(p->next == p->end);
959 	p->next = oldnext;
960 	p->end = oldend;
961 }
962 
963 /*
964  - ordinary - emit an ordinary character
965  == static void ordinary(struct parse *p, wint_t ch);
966  */
967 static void
968 ordinary(struct parse *p, wint_t ch)
969 {
970 	cset *cs;
971 
972 	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
973 		bothcases(p, ch);
974 	else if ((ch & OPDMASK) == ch)
975 		EMIT(OCHAR, ch);
976 	else {
977 		/*
978 		 * Kludge: character is too big to fit into an OCHAR operand.
979 		 * Emit a singleton set.
980 		 */
981 		if ((cs = allocset(p)) == NULL)
982 			return;
983 		CHadd(p, cs, ch);
984 		EMIT(OANYOF, (int)(cs - p->g->sets));
985 	}
986 }
987 
988 /*
989  - nonnewline - emit REG_NEWLINE version of OANY
990  == static void nonnewline(struct parse *p);
991  *
992  * Boy, is this implementation ever a kludge...
993  */
994 static void
995 nonnewline(struct parse *p)
996 {
997 	char *oldnext = p->next;
998 	char *oldend = p->end;
999 	char bracket[4];
1000 
1001 	p->next = bracket;
1002 	p->end = bracket+3;
1003 	bracket[0] = '^';
1004 	bracket[1] = '\n';
1005 	bracket[2] = ']';
1006 	bracket[3] = '\0';
1007 	p_bracket(p);
1008 	assert(p->next == bracket+3);
1009 	p->next = oldnext;
1010 	p->end = oldend;
1011 }
1012 
1013 /*
1014  - repeat - generate code for a bounded repetition, recursively if needed
1015  == static void repeat(struct parse *p, sopno start, int from, int to);
1016  */
1017 static void
1018 repeat(struct parse *p,
1019 	sopno start,		/* operand from here to end of strip */
1020 	int from,		/* repeated from this number */
1021 	int to)			/* to this number of times (maybe INFINITY) */
1022 {
1023 	sopno finish = HERE();
1024 #	define	N	2
1025 #	define	INF	3
1026 #	define	REP(f, t)	((f)*8 + (t))
1027 #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1028 	sopno copy;
1029 
1030 	if (p->error != 0)	/* head off possible runaway recursion */
1031 		return;
1032 
1033 	assert(from <= to);
1034 
1035 	switch (REP(MAP(from), MAP(to))) {
1036 	case REP(0, 0):			/* must be user doing this */
1037 		DROP(finish-start);	/* drop the operand */
1038 		break;
1039 	case REP(0, 1):			/* as x{1,1}? */
1040 	case REP(0, N):			/* as x{1,n}? */
1041 	case REP(0, INF):		/* as x{1,}? */
1042 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1043 		INSERT(OCH_, start);		/* offset is wrong... */
1044 		repeat(p, start+1, 1, to);
1045 		ASTERN(OOR1, start);
1046 		AHEAD(start);			/* ... fix it */
1047 		EMIT(OOR2, 0);
1048 		AHEAD(THERE());
1049 		ASTERN(O_CH, THERETHERE());
1050 		break;
1051 	case REP(1, 1):			/* trivial case */
1052 		/* done */
1053 		break;
1054 	case REP(1, N):			/* as x?x{1,n-1} */
1055 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1056 		INSERT(OCH_, start);
1057 		ASTERN(OOR1, start);
1058 		AHEAD(start);
1059 		EMIT(OOR2, 0);			/* offset very wrong... */
1060 		AHEAD(THERE());			/* ...so fix it */
1061 		ASTERN(O_CH, THERETHERE());
1062 		copy = dupl(p, start+1, finish+1);
1063 		assert(copy == finish+4);
1064 		repeat(p, copy, 1, to-1);
1065 		break;
1066 	case REP(1, INF):		/* as x+ */
1067 		INSERT(OPLUS_, start);
1068 		ASTERN(O_PLUS, start);
1069 		break;
1070 	case REP(N, N):			/* as xx{m-1,n-1} */
1071 		copy = dupl(p, start, finish);
1072 		repeat(p, copy, from-1, to-1);
1073 		break;
1074 	case REP(N, INF):		/* as xx{n-1,INF} */
1075 		copy = dupl(p, start, finish);
1076 		repeat(p, copy, from-1, to);
1077 		break;
1078 	default:			/* "can't happen" */
1079 		SETERROR(REG_ASSERT);	/* just in case */
1080 		break;
1081 	}
1082 }
1083 
1084 /*
1085  - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1086  - character from the parse struct, signals a REG_ILLSEQ error if the
1087  - character can't be converted. Returns the number of bytes consumed.
1088  */
1089 static wint_t
1090 wgetnext(struct parse *p)
1091 {
1092 	mbstate_t mbs;
1093 	wchar_t wc;
1094 	size_t n;
1095 
1096 	memset(&mbs, 0, sizeof(mbs));
1097 	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1098 	if (n == (size_t)-1 || n == (size_t)-2) {
1099 		SETERROR(REG_ILLSEQ);
1100 		return (0);
1101 	}
1102 	if (n == 0)
1103 		n = 1;
1104 	p->next += n;
1105 	return (wc);
1106 }
1107 
1108 /*
1109  - seterr - set an error condition
1110  == static int seterr(struct parse *p, int e);
1111  */
1112 static int			/* useless but makes type checking happy */
1113 seterr(struct parse *p, int e)
1114 {
1115 	if (p->error == 0)	/* keep earliest error condition */
1116 		p->error = e;
1117 	p->next = nuls;		/* try to bring things to a halt */
1118 	p->end = nuls;
1119 	return(0);		/* make the return value well-defined */
1120 }
1121 
1122 /*
1123  - allocset - allocate a set of characters for []
1124  == static cset *allocset(struct parse *p);
1125  */
1126 static cset *
1127 allocset(struct parse *p)
1128 {
1129 	cset *cs, *ncs;
1130 
1131 	ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs));
1132 	if (ncs == NULL) {
1133 		SETERROR(REG_ESPACE);
1134 		return (NULL);
1135 	}
1136 	p->g->sets = ncs;
1137 	cs = &p->g->sets[p->g->ncsets++];
1138 	memset(cs, 0, sizeof(*cs));
1139 
1140 	return(cs);
1141 }
1142 
1143 /*
1144  - freeset - free a now-unused set
1145  == static void freeset(struct parse *p, cset *cs);
1146  */
1147 static void
1148 freeset(struct parse *p, cset *cs)
1149 {
1150 	cset *top = &p->g->sets[p->g->ncsets];
1151 
1152 	free(cs->wides);
1153 	free(cs->ranges);
1154 	free(cs->types);
1155 	memset(cs, 0, sizeof(*cs));
1156 	if (cs == top-1)	/* recover only the easy case */
1157 		p->g->ncsets--;
1158 }
1159 
1160 /*
1161  - singleton - Determine whether a set contains only one character,
1162  - returning it if so, otherwise returning OUT.
1163  */
1164 static wint_t
1165 singleton(cset *cs)
1166 {
1167 	wint_t i, s, n;
1168 
1169 	for (i = n = 0; i < NC; i++)
1170 		if (CHIN(cs, i)) {
1171 			n++;
1172 			s = i;
1173 		}
1174 	if (n == 1)
1175 		return (s);
1176 	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1177 	    cs->icase == 0)
1178 		return (cs->wides[0]);
1179 	/* Don't bother handling the other cases. */
1180 	return (OUT);
1181 }
1182 
1183 /*
1184  - CHadd - add character to character set.
1185  */
1186 static void
1187 CHadd(struct parse *p, cset *cs, wint_t ch)
1188 {
1189 	wint_t nch, *newwides;
1190 	assert(ch >= 0);
1191 	if (ch < NC)
1192 		cs->bmp[ch >> 3] |= 1 << (ch & 7);
1193 	else {
1194 		newwides = realloc(cs->wides, (cs->nwides + 1) *
1195 		    sizeof(*cs->wides));
1196 		if (newwides == NULL) {
1197 			SETERROR(REG_ESPACE);
1198 			return;
1199 		}
1200 		cs->wides = newwides;
1201 		cs->wides[cs->nwides++] = ch;
1202 	}
1203 	if (cs->icase) {
1204 		if ((nch = towlower(ch)) < NC)
1205 			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1206 		if ((nch = towupper(ch)) < NC)
1207 			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1208 	}
1209 }
1210 
1211 /*
1212  - CHaddrange - add all characters in the range [min,max] to a character set.
1213  */
1214 static void
1215 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1216 {
1217 	crange *newranges;
1218 
1219 	for (; min < NC && min <= max; min++)
1220 		CHadd(p, cs, min);
1221 	if (min >= max)
1222 		return;
1223 	newranges = realloc(cs->ranges, (cs->nranges + 1) *
1224 	    sizeof(*cs->ranges));
1225 	if (newranges == NULL) {
1226 		SETERROR(REG_ESPACE);
1227 		return;
1228 	}
1229 	cs->ranges = newranges;
1230 	cs->ranges[cs->nranges].min = min;
1231 	cs->ranges[cs->nranges].max = max;
1232 	cs->nranges++;
1233 }
1234 
1235 /*
1236  - CHaddtype - add all characters of a certain type to a character set.
1237  */
1238 static void
1239 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1240 {
1241 	wint_t i;
1242 	wctype_t *newtypes;
1243 
1244 	for (i = 0; i < NC; i++)
1245 		if (iswctype(i, wct))
1246 			CHadd(p, cs, i);
1247 	newtypes = realloc(cs->types, (cs->ntypes + 1) *
1248 	    sizeof(*cs->types));
1249 	if (newtypes == NULL) {
1250 		SETERROR(REG_ESPACE);
1251 		return;
1252 	}
1253 	cs->types = newtypes;
1254 	cs->types[cs->ntypes++] = wct;
1255 }
1256 
1257 /*
1258  - dupl - emit a duplicate of a bunch of sops
1259  == static sopno dupl(struct parse *p, sopno start, sopno finish);
1260  */
1261 static sopno			/* start of duplicate */
1262 dupl(struct parse *p,
1263 	sopno start,		/* from here */
1264 	sopno finish)		/* to this less one */
1265 {
1266 	sopno ret = HERE();
1267 	sopno len = finish - start;
1268 
1269 	assert(finish >= start);
1270 	if (len == 0)
1271 		return(ret);
1272 	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1273 		return(ret);
1274 	(void) memcpy((char *)(p->strip + p->slen),
1275 		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1276 	p->slen += len;
1277 	return(ret);
1278 }
1279 
1280 /*
1281  - doemit - emit a strip operator
1282  == static void doemit(struct parse *p, sop op, size_t opnd);
1283  *
1284  * It might seem better to implement this as a macro with a function as
1285  * hard-case backup, but it's just too big and messy unless there are
1286  * some changes to the data structures.  Maybe later.
1287  */
1288 static void
1289 doemit(struct parse *p, sop op, size_t opnd)
1290 {
1291 	/* avoid making error situations worse */
1292 	if (p->error != 0)
1293 		return;
1294 
1295 	/* deal with oversize operands ("can't happen", more or less) */
1296 	assert(opnd < 1<<OPSHIFT);
1297 
1298 	/* deal with undersized strip */
1299 	if (p->slen >= p->ssize)
1300 		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1301 			return;
1302 
1303 	/* finally, it's all reduced to the easy case */
1304 	p->strip[p->slen++] = SOP(op, opnd);
1305 }
1306 
1307 /*
1308  - doinsert - insert a sop into the strip
1309  == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1310  */
1311 static void
1312 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1313 {
1314 	sopno sn;
1315 	sop s;
1316 	int i;
1317 
1318 	/* avoid making error situations worse */
1319 	if (p->error != 0)
1320 		return;
1321 
1322 	sn = HERE();
1323 	EMIT(op, opnd);		/* do checks, ensure space */
1324 	assert(HERE() == sn+1);
1325 	s = p->strip[sn];
1326 
1327 	/* adjust paren pointers */
1328 	assert(pos > 0);
1329 	for (i = 1; i < NPAREN; i++) {
1330 		if (p->pbegin[i] >= pos) {
1331 			p->pbegin[i]++;
1332 		}
1333 		if (p->pend[i] >= pos) {
1334 			p->pend[i]++;
1335 		}
1336 	}
1337 
1338 	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1339 						(HERE()-pos-1)*sizeof(sop));
1340 	p->strip[pos] = s;
1341 }
1342 
1343 /*
1344  - dofwd - complete a forward reference
1345  == static void dofwd(struct parse *p, sopno pos, sop value);
1346  */
1347 static void
1348 dofwd(struct parse *p, sopno pos, sop value)
1349 {
1350 	/* avoid making error situations worse */
1351 	if (p->error != 0)
1352 		return;
1353 
1354 	assert(value < 1<<OPSHIFT);
1355 	p->strip[pos] = OP(p->strip[pos]) | value;
1356 }
1357 
1358 /*
1359  - enlarge - enlarge the strip
1360  == static int enlarge(struct parse *p, sopno size);
1361  */
1362 static int
1363 enlarge(struct parse *p, sopno size)
1364 {
1365 	sop *sp;
1366 
1367 	if (p->ssize >= size)
1368 		return 1;
1369 
1370 	sp = (sop *)realloc(p->strip, size*sizeof(sop));
1371 	if (sp == NULL) {
1372 		SETERROR(REG_ESPACE);
1373 		return 0;
1374 	}
1375 	p->strip = sp;
1376 	p->ssize = size;
1377 	return 1;
1378 }
1379 
1380 /*
1381  - stripsnug - compact the strip
1382  == static void stripsnug(struct parse *p, struct re_guts *g);
1383  */
1384 static void
1385 stripsnug(struct parse *p, struct re_guts *g)
1386 {
1387 	g->nstates = p->slen;
1388 	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1389 	if (g->strip == NULL) {
1390 		SETERROR(REG_ESPACE);
1391 		g->strip = p->strip;
1392 	}
1393 }
1394 
1395 /*
1396  - findmust - fill in must and mlen with longest mandatory literal string
1397  == static void findmust(struct parse *p, struct re_guts *g);
1398  *
1399  * This algorithm could do fancy things like analyzing the operands of |
1400  * for common subsequences.  Someday.  This code is simple and finds most
1401  * of the interesting cases.
1402  *
1403  * Note that must and mlen got initialized during setup.
1404  */
1405 static void
1406 findmust(struct parse *p, struct re_guts *g)
1407 {
1408 	sop *scan;
1409 	sop *start = NULL;
1410 	sop *newstart = NULL;
1411 	sopno newlen;
1412 	sop s;
1413 	char *cp;
1414 	int offset;
1415 	char buf[MB_LEN_MAX];
1416 	size_t clen;
1417 	mbstate_t mbs;
1418 
1419 	/* avoid making error situations worse */
1420 	if (p->error != 0)
1421 		return;
1422 
1423 	/*
1424 	 * It's not generally safe to do a ``char'' substring search on
1425 	 * multibyte character strings, but it's safe for at least
1426 	 * UTF-8 (see RFC 3629).
1427 	 */
1428 	if (MB_CUR_MAX > 1 &&
1429 	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1430 		return;
1431 
1432 	/* find the longest OCHAR sequence in strip */
1433 	newlen = 0;
1434 	offset = 0;
1435 	g->moffset = 0;
1436 	scan = g->strip + 1;
1437 	do {
1438 		s = *scan++;
1439 		switch (OP(s)) {
1440 		case OCHAR:		/* sequence member */
1441 			if (newlen == 0) {		/* new sequence */
1442 				memset(&mbs, 0, sizeof(mbs));
1443 				newstart = scan - 1;
1444 			}
1445 			clen = wcrtomb(buf, OPND(s), &mbs);
1446 			if (clen == (size_t)-1)
1447 				goto toohard;
1448 			newlen += clen;
1449 			break;
1450 		case OPLUS_:		/* things that don't break one */
1451 		case OLPAREN:
1452 		case ORPAREN:
1453 			break;
1454 		case OQUEST_:		/* things that must be skipped */
1455 		case OCH_:
1456 			offset = altoffset(scan, offset);
1457 			scan--;
1458 			do {
1459 				scan += OPND(s);
1460 				s = *scan;
1461 				/* assert() interferes w debug printouts */
1462 				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1463 							OP(s) != OOR2) {
1464 					g->iflags |= BAD;
1465 					return;
1466 				}
1467 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1468 			/* FALLTHROUGH */
1469 		case OBOW:		/* things that break a sequence */
1470 		case OEOW:
1471 		case OBOL:
1472 		case OEOL:
1473 		case O_QUEST:
1474 		case O_CH:
1475 		case OEND:
1476 			if (newlen > g->mlen) {		/* ends one */
1477 				start = newstart;
1478 				g->mlen = newlen;
1479 				if (offset > -1) {
1480 					g->moffset += offset;
1481 					offset = newlen;
1482 				} else
1483 					g->moffset = offset;
1484 			} else {
1485 				if (offset > -1)
1486 					offset += newlen;
1487 			}
1488 			newlen = 0;
1489 			break;
1490 		case OANY:
1491 			if (newlen > g->mlen) {		/* ends one */
1492 				start = newstart;
1493 				g->mlen = newlen;
1494 				if (offset > -1) {
1495 					g->moffset += offset;
1496 					offset = newlen;
1497 				} else
1498 					g->moffset = offset;
1499 			} else {
1500 				if (offset > -1)
1501 					offset += newlen;
1502 			}
1503 			if (offset > -1)
1504 				offset++;
1505 			newlen = 0;
1506 			break;
1507 		case OANYOF:		/* may or may not invalidate offset */
1508 			/* First, everything as OANY */
1509 			if (newlen > g->mlen) {		/* ends one */
1510 				start = newstart;
1511 				g->mlen = newlen;
1512 				if (offset > -1) {
1513 					g->moffset += offset;
1514 					offset = newlen;
1515 				} else
1516 					g->moffset = offset;
1517 			} else {
1518 				if (offset > -1)
1519 					offset += newlen;
1520 			}
1521 			if (offset > -1)
1522 				offset++;
1523 			newlen = 0;
1524 			break;
1525 		toohard:
1526 		default:
1527 			/* Anything here makes it impossible or too hard
1528 			 * to calculate the offset -- so we give up;
1529 			 * save the last known good offset, in case the
1530 			 * must sequence doesn't occur later.
1531 			 */
1532 			if (newlen > g->mlen) {		/* ends one */
1533 				start = newstart;
1534 				g->mlen = newlen;
1535 				if (offset > -1)
1536 					g->moffset += offset;
1537 				else
1538 					g->moffset = offset;
1539 			}
1540 			offset = -1;
1541 			newlen = 0;
1542 			break;
1543 		}
1544 	} while (OP(s) != OEND);
1545 
1546 	if (g->mlen == 0) {		/* there isn't one */
1547 		g->moffset = -1;
1548 		return;
1549 	}
1550 
1551 	/* turn it into a character string */
1552 	g->must = malloc((size_t)g->mlen + 1);
1553 	if (g->must == NULL) {		/* argh; just forget it */
1554 		g->mlen = 0;
1555 		g->moffset = -1;
1556 		return;
1557 	}
1558 	cp = g->must;
1559 	scan = start;
1560 	memset(&mbs, 0, sizeof(mbs));
1561 	while (cp < g->must + g->mlen) {
1562 		while (OP(s = *scan++) != OCHAR)
1563 			continue;
1564 		clen = wcrtomb(cp, OPND(s), &mbs);
1565 		assert(clen != (size_t)-1);
1566 		cp += clen;
1567 	}
1568 	assert(cp == g->must + g->mlen);
1569 	*cp++ = '\0';		/* just on general principles */
1570 }
1571 
1572 /*
1573  - altoffset - choose biggest offset among multiple choices
1574  == static int altoffset(sop *scan, int offset);
1575  *
1576  * Compute, recursively if necessary, the largest offset among multiple
1577  * re paths.
1578  */
1579 static int
1580 altoffset(sop *scan, int offset)
1581 {
1582 	int largest;
1583 	int try;
1584 	sop s;
1585 
1586 	/* If we gave up already on offsets, return */
1587 	if (offset == -1)
1588 		return -1;
1589 
1590 	largest = 0;
1591 	try = 0;
1592 	s = *scan++;
1593 	while (OP(s) != O_QUEST && OP(s) != O_CH) {
1594 		switch (OP(s)) {
1595 		case OOR1:
1596 			if (try > largest)
1597 				largest = try;
1598 			try = 0;
1599 			break;
1600 		case OQUEST_:
1601 		case OCH_:
1602 			try = altoffset(scan, try);
1603 			if (try == -1)
1604 				return -1;
1605 			scan--;
1606 			do {
1607 				scan += OPND(s);
1608 				s = *scan;
1609 				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1610 							OP(s) != OOR2)
1611 					return -1;
1612 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1613 			/* We must skip to the next position, or we'll
1614 			 * leave altoffset() too early.
1615 			 */
1616 			scan++;
1617 			break;
1618 		case OANYOF:
1619 		case OCHAR:
1620 		case OANY:
1621 			try++;
1622 		case OBOW:
1623 		case OEOW:
1624 		case OLPAREN:
1625 		case ORPAREN:
1626 		case OOR2:
1627 			break;
1628 		default:
1629 			try = -1;
1630 			break;
1631 		}
1632 		if (try == -1)
1633 			return -1;
1634 		s = *scan++;
1635 	}
1636 
1637 	if (try > largest)
1638 		largest = try;
1639 
1640 	return largest+offset;
1641 }
1642 
1643 /*
1644  - computejumps - compute char jumps for BM scan
1645  == static void computejumps(struct parse *p, struct re_guts *g);
1646  *
1647  * This algorithm assumes g->must exists and is has size greater than
1648  * zero. It's based on the algorithm found on Computer Algorithms by
1649  * Sara Baase.
1650  *
1651  * A char jump is the number of characters one needs to jump based on
1652  * the value of the character from the text that was mismatched.
1653  */
1654 static void
1655 computejumps(struct parse *p, struct re_guts *g)
1656 {
1657 	int ch;
1658 	int mindex;
1659 
1660 	/* Avoid making errors worse */
1661 	if (p->error != 0)
1662 		return;
1663 
1664 	g->charjump = (int*) malloc((NC + 1) * sizeof(int));
1665 	if (g->charjump == NULL)	/* Not a fatal error */
1666 		return;
1667 	/* Adjust for signed chars, if necessary */
1668 	g->charjump = &g->charjump[-(CHAR_MIN)];
1669 
1670 	/* If the character does not exist in the pattern, the jump
1671 	 * is equal to the number of characters in the pattern.
1672 	 */
1673 	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1674 		g->charjump[ch] = g->mlen;
1675 
1676 	/* If the character does exist, compute the jump that would
1677 	 * take us to the last character in the pattern equal to it
1678 	 * (notice that we match right to left, so that last character
1679 	 * is the first one that would be matched).
1680 	 */
1681 	for (mindex = 0; mindex < g->mlen; mindex++)
1682 		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1683 }
1684 
1685 /*
1686  - computematchjumps - compute match jumps for BM scan
1687  == static void computematchjumps(struct parse *p, struct re_guts *g);
1688  *
1689  * This algorithm assumes g->must exists and is has size greater than
1690  * zero. It's based on the algorithm found on Computer Algorithms by
1691  * Sara Baase.
1692  *
1693  * A match jump is the number of characters one needs to advance based
1694  * on the already-matched suffix.
1695  * Notice that all values here are minus (g->mlen-1), because of the way
1696  * the search algorithm works.
1697  */
1698 static void
1699 computematchjumps(struct parse *p, struct re_guts *g)
1700 {
1701 	int mindex;		/* General "must" iterator */
1702 	int suffix;		/* Keeps track of matching suffix */
1703 	int ssuffix;		/* Keeps track of suffixes' suffix */
1704 	int* pmatches;		/* pmatches[k] points to the next i
1705 				 * such that i+1...mlen is a substring
1706 				 * of k+1...k+mlen-i-1
1707 				 */
1708 
1709 	/* Avoid making errors worse */
1710 	if (p->error != 0)
1711 		return;
1712 
1713 	pmatches = (int*) malloc(g->mlen * sizeof(int));
1714 	if (pmatches == NULL) {
1715 		g->matchjump = NULL;
1716 		return;
1717 	}
1718 
1719 	g->matchjump = (int*) malloc(g->mlen * sizeof(int));
1720 	if (g->matchjump == NULL) {	/* Not a fatal error */
1721 		free(pmatches);
1722 		return;
1723 	}
1724 
1725 	/* Set maximum possible jump for each character in the pattern */
1726 	for (mindex = 0; mindex < g->mlen; mindex++)
1727 		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1728 
1729 	/* Compute pmatches[] */
1730 	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1731 	    mindex--, suffix--) {
1732 		pmatches[mindex] = suffix;
1733 
1734 		/* If a mismatch is found, interrupting the substring,
1735 		 * compute the matchjump for that position. If no
1736 		 * mismatch is found, then a text substring mismatched
1737 		 * against the suffix will also mismatch against the
1738 		 * substring.
1739 		 */
1740 		while (suffix < g->mlen
1741 		    && g->must[mindex] != g->must[suffix]) {
1742 			g->matchjump[suffix] = MIN(g->matchjump[suffix],
1743 			    g->mlen - mindex - 1);
1744 			suffix = pmatches[suffix];
1745 		}
1746 	}
1747 
1748 	/* Compute the matchjump up to the last substring found to jump
1749 	 * to the beginning of the largest must pattern prefix matching
1750 	 * it's own suffix.
1751 	 */
1752 	for (mindex = 0; mindex <= suffix; mindex++)
1753 		g->matchjump[mindex] = MIN(g->matchjump[mindex],
1754 		    g->mlen + suffix - mindex);
1755 
1756         ssuffix = pmatches[suffix];
1757         while (suffix < g->mlen) {
1758                 while (suffix <= ssuffix && suffix < g->mlen) {
1759                         g->matchjump[suffix] = MIN(g->matchjump[suffix],
1760 			    g->mlen + ssuffix - suffix);
1761                         suffix++;
1762                 }
1763 		if (suffix < g->mlen)
1764                 	ssuffix = pmatches[ssuffix];
1765         }
1766 
1767 	free(pmatches);
1768 }
1769 
1770 /*
1771  - pluscount - count + nesting
1772  == static sopno pluscount(struct parse *p, struct re_guts *g);
1773  */
1774 static sopno			/* nesting depth */
1775 pluscount(struct parse *p, struct re_guts *g)
1776 {
1777 	sop *scan;
1778 	sop s;
1779 	sopno plusnest = 0;
1780 	sopno maxnest = 0;
1781 
1782 	if (p->error != 0)
1783 		return(0);	/* there may not be an OEND */
1784 
1785 	scan = g->strip + 1;
1786 	do {
1787 		s = *scan++;
1788 		switch (OP(s)) {
1789 		case OPLUS_:
1790 			plusnest++;
1791 			break;
1792 		case O_PLUS:
1793 			if (plusnest > maxnest)
1794 				maxnest = plusnest;
1795 			plusnest--;
1796 			break;
1797 		}
1798 	} while (OP(s) != OEND);
1799 	if (plusnest != 0)
1800 		g->iflags |= BAD;
1801 	return(maxnest);
1802 }
1803