xref: /illumos-gate/usr/src/lib/libc/port/regex/regcomp.c (revision e3ae4b35c024af1196582063ecee3ab79367227d)
1 /*
2  * Copyright 2023 Bill Sommerfeld <sommerfeld@hamachi.org>
3  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
4  * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
5  * Copyright 2012 Milan Jurik. All rights reserved.
6  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
7  * Copyright (c) 1992, 1993, 1994
8  *	The Regents of the University of California.  All rights reserved.
9  *
10  * This code is derived from software contributed to Berkeley by
11  * Henry Spencer.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 #include "lint.h"
39 #include "file64.h"
40 #include <sys/types.h>
41 #include <stdio.h>
42 #include <string.h>
43 #include <ctype.h>
44 #include <limits.h>
45 #include <regex.h>
46 #include <stdlib.h>
47 #include <stdbool.h>
48 #include <wchar.h>
49 #include <wctype.h>
50 
51 #include "../locale/runetype.h"
52 #include "../locale/collate.h"
53 
54 #include "utils.h"
55 #include "regex2.h"
56 
57 #include "cname.h"
58 #include "../locale/mblocal.h"
59 
60 /*
61  * Branching context, used to keep track of branch state for all of the branch-
62  * aware functions. In addition to keeping track of branch positions for the
63  * p_branch_* functions, we use this to simplify some clumsiness in BREs for
64  * detection of whether ^ is acting as an anchor or being used erroneously and
65  * also for whether we're in a sub-expression or not.
66  */
67 struct branchc {
68 	sopno start;
69 	sopno back;
70 	sopno fwd;
71 
72 	int nbranch;
73 	int nchain;
74 	bool outer;
75 	bool terminate;
76 };
77 
78 /*
79  * parse structure, passed up and down to avoid global variables and
80  * other clumsinesses
81  */
82 struct parse {
83 	const char *next;	/* next character in RE */
84 	const char *end;	/* end of string (-> NUL normally) */
85 	int error;		/* has an error been seen? */
86 	sop *strip;		/* malloced strip */
87 	sopno ssize;		/* malloced strip size (allocated) */
88 	sopno slen;		/* malloced strip length (used) */
89 	int ncsalloc;		/* number of csets allocated */
90 	struct re_guts *g;
91 #define	NPAREN	10		/* we need to remember () 1-9 for back refs */
92 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
93 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
94 	bool allowbranch;	/* can this expression branch? */
95 	bool bre;		/* convenience; is this a BRE? */
96 	bool (*parse_expr)(struct parse *, struct branchc *);
97 	void (*pre_parse)(struct parse *, struct branchc *);
98 	void (*post_parse)(struct parse *, struct branchc *);
99 };
100 
101 /* ========= begin header generated by ./mkh ========= */
102 #ifdef __cplusplus
103 extern "C" {
104 #endif
105 
106 /* === regcomp.c === */
107 static bool p_ere_exp(struct parse *p, struct branchc *bc);
108 static void p_str(struct parse *p);
109 static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
110 static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
111 static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
112 static bool p_branch_empty(struct parse *p, struct branchc *bc);
113 static bool p_branch_do(struct parse *p, struct branchc *bc);
114 static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
115 static void p_bre_post_parse(struct parse *p, struct branchc *bc);
116 static void p_re(struct parse *p, int end1, int end2);
117 static bool p_simp_re(struct parse *p, struct branchc *bc);
118 static int p_count(struct parse *p);
119 static void p_bracket(struct parse *p);
120 static void p_b_term(struct parse *p, cset *cs);
121 static void p_b_cclass(struct parse *p, cset *cs);
122 static void p_b_eclass(struct parse *p, cset *cs);
123 static wint_t p_b_symbol(struct parse *p);
124 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
125 static wint_t othercase(wint_t ch);
126 static void bothcases(struct parse *p, wint_t ch);
127 static void ordinary(struct parse *p, wint_t ch);
128 static void nonnewline(struct parse *p);
129 static void repeat(struct parse *p, sopno start, int from, int to);
130 static int seterr(struct parse *p, int e);
131 static cset *allocset(struct parse *p);
132 static void freeset(struct parse *p, cset *cs);
133 static void CHadd(struct parse *p, cset *cs, wint_t ch);
134 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
135 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
136 static wint_t singleton(cset *cs);
137 static sopno dupl(struct parse *p, sopno start, sopno finish);
138 static void doemit(struct parse *p, sop op, size_t opnd);
139 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
140 static void dofwd(struct parse *p, sopno pos, sop value);
141 static int enlarge(struct parse *p, sopno size);
142 static void stripsnug(struct parse *p, struct re_guts *g);
143 static void findmust(struct parse *p, struct re_guts *g);
144 static int altoffset(sop *scan, int offset);
145 static void computejumps(struct parse *p, struct re_guts *g);
146 static void computematchjumps(struct parse *p, struct re_guts *g);
147 static sopno pluscount(struct parse *p, struct re_guts *g);
148 static wint_t wgetnext(struct parse *p);
149 
150 #ifdef __cplusplus
151 }
152 #endif
153 /* ========= end header generated by ./mkh ========= */
154 
155 static char nuls[10];		/* place to point scanner in event of error */
156 
157 /*
158  * macros for use with parse structure
159  * BEWARE:  these know that the parse structure is named `p' !!!
160  */
161 #define	PEEK()	(*p->next)
162 #define	PEEK2()	(*(p->next+1))
163 #define	MORE()	(p->next < p->end)
164 #define	MORE2()	(p->next+1 < p->end)
165 #define	SEE(c)	(MORE() && PEEK() == (c))
166 #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
167 #define	SEESPEC(a)	(p->bre ? SEETWO('\\', a) : SEE(a))
168 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
169 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
170 #define	NEXT()	(p->next++)
171 #define	NEXT2()	(p->next += 2)
172 #define	NEXTn(n)	(p->next += (n))
173 #define	GETNEXT()	(*p->next++)
174 #define	WGETNEXT()	wgetnext(p)
175 #define	SETERROR(e)	((void)seterr(p, (e)))
176 #define	REQUIRE(co, e)	((co) || seterr(p, e))
177 #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
178 #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
179 #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
180 #define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
181 #define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
182 #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
183 #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
184 #define	HERE()		(p->slen)
185 #define	THERE()		(p->slen - 1)
186 #define	THERETHERE()	(p->slen - 2)
187 #define	DROP(n)	(p->slen -= (n))
188 
189 #ifndef NDEBUG
190 static int never = 0;		/* for use in asserts; shuts lint up */
191 #else
192 #define	never	0		/* some <assert.h>s have bugs too */
193 #endif
194 
195 /*
196  * regcomp - interface for parser and compilation
197  */
198 int				/* 0 success, otherwise REG_something */
199 regcomp(regex_t *_RESTRICT_KYWD preg, const char *_RESTRICT_KYWD pattern,
200     int cflags)
201 {
202 	struct parse pa;
203 	struct re_guts *g;
204 	struct parse *p = &pa;
205 	int i;
206 	size_t len;
207 	size_t maxlen;
208 #ifdef REDEBUG
209 #define	GOODFLAGS(f)	(f)
210 #else
211 #define	GOODFLAGS(f)	((f)&~REG_DUMP)
212 #endif
213 
214 	/* We had REG_INVARG, but we don't have that on Solaris. */
215 	cflags = GOODFLAGS(cflags);
216 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
217 		return (REG_EFATAL);
218 
219 	if (cflags&REG_PEND) {
220 		if (preg->re_endp < pattern)
221 			return (REG_EFATAL);
222 		len = preg->re_endp - pattern;
223 	} else
224 		len = strlen(pattern);
225 
226 	/* do the mallocs early so failure handling is easy */
227 	g = (struct re_guts *)malloc(sizeof (struct re_guts));
228 	if (g == NULL)
229 		return (REG_ESPACE);
230 	/*
231 	 * Limit the pattern space to avoid a 32-bit overflow on buffer
232 	 * extension.  Also avoid any signed overflow in case of conversion
233 	 * so make the real limit based on a 31-bit overflow.
234 	 *
235 	 * Likely not applicable on 64-bit systems but handle the case
236 	 * generically (who are we to stop people from using ~715MB+
237 	 * patterns?).
238 	 */
239 	maxlen = ((size_t)-1 >> 1) / sizeof (sop) * 2 / 3;
240 	if (len >= maxlen) {
241 		free((char *)g);
242 		return (REG_ESPACE);
243 	}
244 	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
245 	assert(p->ssize >= len);
246 
247 	p->strip = (sop *)malloc(p->ssize * sizeof (sop));
248 	p->slen = 0;
249 	if (p->strip == NULL) {
250 		free((char *)g);
251 		return (REG_ESPACE);
252 	}
253 
254 	/* set things up */
255 	p->g = g;
256 	p->next = pattern;	/* convenience; we do not modify it */
257 	p->end = p->next + len;
258 	p->error = 0;
259 	p->ncsalloc = 0;
260 	for (i = 0; i < NPAREN; i++) {
261 		p->pbegin[i] = 0;
262 		p->pend[i] = 0;
263 	}
264 	if (cflags & REG_EXTENDED) {
265 		p->allowbranch = true;
266 		p->bre = false;
267 		p->parse_expr = p_ere_exp;
268 		p->pre_parse = NULL;
269 		p->post_parse = NULL;
270 	} else {
271 		p->allowbranch = false;
272 		p->bre = true;
273 		p->parse_expr = p_simp_re;
274 		p->pre_parse = p_bre_pre_parse;
275 		p->post_parse = p_bre_post_parse;
276 	}
277 	g->sets = NULL;
278 	g->ncsets = 0;
279 	g->cflags = cflags;
280 	g->iflags = 0;
281 	g->nbol = 0;
282 	g->neol = 0;
283 	g->must = NULL;
284 	g->moffset = -1;
285 	g->charjump = NULL;
286 	g->matchjump = NULL;
287 	g->mlen = 0;
288 	g->nsub = 0;
289 	g->backrefs = 0;
290 
291 	/* do it */
292 	EMIT(OEND, 0);
293 	g->firststate = THERE();
294 	if (cflags & REG_NOSPEC)
295 		p_str(p);
296 	else
297 		p_re(p, OUT, OUT);
298 	EMIT(OEND, 0);
299 	g->laststate = THERE();
300 
301 	/* tidy up loose ends and fill things in */
302 	stripsnug(p, g);
303 	findmust(p, g);
304 	/*
305 	 * only use Boyer-Moore algorithm if the pattern is bigger
306 	 * than three characters
307 	 */
308 	if (g->mlen > 3) {
309 		computejumps(p, g);
310 		computematchjumps(p, g);
311 		if (g->matchjump == NULL && g->charjump != NULL) {
312 			free(g->charjump);
313 			g->charjump = NULL;
314 		}
315 	}
316 	g->nplus = pluscount(p, g);
317 	g->magic = MAGIC2;
318 	preg->re_nsub = g->nsub;
319 	preg->re_g = g;
320 	preg->re_magic = MAGIC1;
321 #ifndef REDEBUG
322 	/* not debugging, so can't rely on the assert() in regexec() */
323 	if (g->iflags&BAD)
324 		SETERROR(REG_EFATAL);
325 #endif
326 
327 	/* win or lose, we're done */
328 	if (p->error != 0)	/* lose */
329 		regfree(preg);
330 	return (p->error);
331 }
332 
333 /*
334  * Parse one subERE, an atom possibly followed by a repetition op,
335  * return whether we should terminate or not.
336  */
337 static bool
338 p_ere_exp(struct parse *p, struct branchc *bc)
339 {
340 	char c;
341 	wint_t wc;
342 	sopno pos;
343 	int count;
344 	int count2;
345 	sopno subno;
346 	int wascaret = 0;
347 
348 	(void) bc;
349 	assert(MORE());		/* caller should have ensured this */
350 	c = GETNEXT();
351 
352 	pos = HERE();
353 	switch (c) {
354 	case '(':
355 		(void) REQUIRE(MORE(), REG_EPAREN);
356 		p->g->nsub++;
357 		subno = p->g->nsub;
358 		if (subno < NPAREN)
359 			p->pbegin[subno] = HERE();
360 		EMIT(OLPAREN, subno);
361 		if (!SEE(')'))
362 			p_re(p, ')', IGN);
363 		if (subno < NPAREN) {
364 			p->pend[subno] = HERE();
365 			assert(p->pend[subno] != 0);
366 		}
367 		EMIT(ORPAREN, subno);
368 		(void) MUSTEAT(')', REG_EPAREN);
369 		break;
370 #ifndef POSIX_MISTAKE
371 	case ')':		/* happens only if no current unmatched ( */
372 		/*
373 		 * You may ask, why the ifndef?  Because I didn't notice
374 		 * this until slightly too late for 1003.2, and none of the
375 		 * other 1003.2 regular-expression reviewers noticed it at
376 		 * all.  So an unmatched ) is legal POSIX, at least until
377 		 * we can get it fixed.
378 		 */
379 		SETERROR(REG_EPAREN);
380 		break;
381 #endif
382 	case '^':
383 		EMIT(OBOL, 0);
384 		p->g->iflags |= USEBOL;
385 		p->g->nbol++;
386 		wascaret = 1;
387 		break;
388 	case '$':
389 		EMIT(OEOL, 0);
390 		p->g->iflags |= USEEOL;
391 		p->g->neol++;
392 		break;
393 	case '|':
394 		SETERROR(REG_BADPAT);
395 		break;
396 	case '*':
397 	case '+':
398 	case '?':
399 	case '{':
400 		SETERROR(REG_BADRPT);
401 		break;
402 	case '.':
403 		if (p->g->cflags&REG_NEWLINE)
404 			nonnewline(p);
405 		else
406 			EMIT(OANY, 0);
407 		break;
408 	case '[':
409 		p_bracket(p);
410 		break;
411 	case '\\':
412 		(void) REQUIRE(MORE(), REG_EESCAPE);
413 		wc = WGETNEXT();
414 		switch (wc) {
415 		case '<':
416 			EMIT(OBOW, 0);
417 			break;
418 		case '>':
419 			EMIT(OEOW, 0);
420 			break;
421 		default:
422 			ordinary(p, wc);
423 			break;
424 		}
425 		break;
426 	default:
427 		if (p->error != 0)
428 			return (false);
429 		p->next--;
430 		wc = WGETNEXT();
431 		ordinary(p, wc);
432 		break;
433 	}
434 
435 	if (!MORE())
436 		return (false);
437 	c = PEEK();
438 	/* we call { a repetition if followed by a digit */
439 	if (!(c == '*' || c == '+' || c == '?' || c == '{'))
440 		return (false);		/* no repetition, we're done */
441 	else if (c == '{')
442 		(void) REQUIRE(MORE2() && \
443 		    (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
444 	NEXT();
445 
446 	(void) REQUIRE(!wascaret, REG_BADRPT);
447 	switch (c) {
448 	case '*':	/* implemented as +? */
449 		/* this case does not require the (y|) trick, noKLUDGE */
450 		INSERT(OPLUS_, pos);
451 		ASTERN(O_PLUS, pos);
452 		INSERT(OQUEST_, pos);
453 		ASTERN(O_QUEST, pos);
454 		break;
455 	case '+':
456 		INSERT(OPLUS_, pos);
457 		ASTERN(O_PLUS, pos);
458 		break;
459 	case '?':
460 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
461 		INSERT(OCH_, pos);		/* offset slightly wrong */
462 		ASTERN(OOR1, pos);		/* this one's right */
463 		AHEAD(pos);			/* fix the OCH_ */
464 		EMIT(OOR2, 0);			/* offset very wrong... */
465 		AHEAD(THERE());			/* ...so fix it */
466 		ASTERN(O_CH, THERETHERE());
467 		break;
468 	case '{':
469 		count = p_count(p);
470 		if (EAT(',')) {
471 			if (isdigit((uch)PEEK())) {
472 				count2 = p_count(p);
473 				(void) REQUIRE(count <= count2, REG_BADBR);
474 			} else		/* single number with comma */
475 				count2 = INFINITY;
476 		} else		/* just a single number */
477 			count2 = count;
478 		repeat(p, pos, count, count2);
479 		if (!EAT('}')) {	/* error heuristics */
480 			while (MORE() && PEEK() != '}')
481 				NEXT();
482 			(void) REQUIRE(MORE(), REG_EBRACE);
483 			SETERROR(REG_BADBR);
484 		}
485 		break;
486 	}
487 
488 	if (!MORE())
489 		return (false);
490 	c = PEEK();
491 	if (!(c == '*' || c == '+' || c == '?' ||
492 	    (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
493 		return (false);
494 	SETERROR(REG_BADRPT);
495 	return (false);
496 }
497 
498 /*
499  * p_str - string (no metacharacters) "parser"
500  */
501 static void
502 p_str(struct parse *p)
503 {
504 	(void) REQUIRE(MORE(), REG_BADPAT);
505 	while (MORE())
506 		ordinary(p, WGETNEXT());
507 }
508 
509 /*
510  * Eat consecutive branch delimiters for the kind of expression that we are
511  * parsing, return the number of delimiters that we ate.
512  */
513 static int
514 p_branch_eat_delim(struct parse *p, struct branchc *bc)
515 {
516 	int nskip;
517 
518 	(void) bc;
519 	nskip = 0;
520 	while (EAT('|'))
521 		++nskip;
522 	return (nskip);
523 }
524 
525 /*
526  * Insert necessary branch book-keeping operations. This emits a
527  * bogus 'next' offset, since we still have more to parse
528  */
529 static void
530 p_branch_ins_offset(struct parse *p, struct branchc *bc)
531 {
532 	if (bc->nbranch == 0) {
533 		INSERT(OCH_, bc->start);	/* offset is wrong */
534 		bc->fwd = bc->start;
535 		bc->back = bc->start;
536 	}
537 
538 	ASTERN(OOR1, bc->back);
539 	bc->back = THERE();
540 	AHEAD(bc->fwd);			/* fix previous offset */
541 	bc->fwd = HERE();
542 	EMIT(OOR2, 0);			/* offset is very wrong */
543 	++bc->nbranch;
544 }
545 
546 /*
547  * Fix the offset of the tail branch, if we actually had any branches.
548  * This is to correct the bogus placeholder offset that we use.
549  */
550 static void
551 p_branch_fix_tail(struct parse *p, struct branchc *bc)
552 {
553 	/* Fix bogus offset at the tail if we actually have branches */
554 	if (bc->nbranch > 0) {
555 		AHEAD(bc->fwd);
556 		ASTERN(O_CH, bc->back);
557 	}
558 }
559 
560 /*
561  * Signal to the parser that an empty branch has been encountered; this will,
562  * in the future, be used to allow for more permissive behavior with empty
563  * branches. The return value should indicate whether parsing may continue
564  * or not.
565  */
566 static bool
567 p_branch_empty(struct parse *p, struct branchc *bc)
568 {
569 	(void) bc;
570 	SETERROR(REG_BADPAT);
571 	return (false);
572 }
573 
574 /*
575  * Take care of any branching requirements. This includes inserting the
576  * appropriate branching instructions as well as eating all of the branch
577  * delimiters until we either run out of pattern or need to parse more pattern.
578  */
579 static bool
580 p_branch_do(struct parse *p, struct branchc *bc)
581 {
582 	int ate = 0;
583 
584 	ate = p_branch_eat_delim(p, bc);
585 	if (ate == 0)
586 		return (false);
587 	else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
588 		/*
589 		 * Halt parsing only if we have an empty branch and
590 		 * p_branch_empty indicates that we must not continue.
591 		 * In the future, this will not  necessarily be an error.
592 		 */
593 		return (false);
594 	p_branch_ins_offset(p, bc);
595 
596 	return (true);
597 }
598 
599 static void
600 p_bre_pre_parse(struct parse *p, struct branchc *bc)
601 {
602 	(void) bc;
603 	/*
604 	 * Does not move cleanly into expression parser because of
605 	 * ordinary interpration of * at the beginning position of
606 	 * an expression.
607 	 */
608 	if (EAT('^')) {
609 		EMIT(OBOL, 0);
610 		p->g->iflags |= USEBOL;
611 		p->g->nbol++;
612 	}
613 }
614 
615 static void
616 p_bre_post_parse(struct parse *p, struct branchc *bc)
617 {
618 	/* Expression is terminating due to EOL token */
619 	if (bc->terminate) {
620 		DROP(1);
621 		EMIT(OEOL, 0);
622 		p->g->iflags |= USEEOL;
623 		p->g->neol++;
624 	}
625 }
626 
627 /*
628  * Top level parser, concatenation and BRE anchoring.
629  * Giving end1 as OUT essentially eliminates the end1/end2 check.
630  *
631  * This implementation is a bit of a kludge, in that a trailing $ is first
632  * taken as an ordinary character and then revised to be an anchor.
633  * The amount of lookahead needed to avoid this kludge is excessive.
634  */
635 static void
636 p_re(struct parse *p,
637     int end1,	/* first terminating character */
638     int end2)	/* second terminating character; ignored for EREs */
639 {
640 	struct branchc bc;
641 
642 	bc.nbranch = 0;
643 	if (end1 == OUT && end2 == OUT)
644 		bc.outer = true;
645 	else
646 		bc.outer = false;
647 #define	SEEEND()	(!p->bre ? SEE(end1) : SEETWO(end1, end2))
648 	for (;;) {
649 		bc.start = HERE();
650 		bc.nchain = 0;
651 		bc.terminate = false;
652 		if (p->pre_parse != NULL)
653 			p->pre_parse(p, &bc);
654 		while (MORE() && (!p->allowbranch || !SEESPEC('|')) &&
655 		    !SEEEND()) {
656 			bc.terminate = p->parse_expr(p, &bc);
657 			++bc.nchain;
658 		}
659 		if (p->post_parse != NULL)
660 			p->post_parse(p, &bc);
661 		(void) REQUIRE(HERE() != bc.start, REG_BADPAT);
662 		if (!p->allowbranch)
663 			break;
664 		/*
665 		 * p_branch_do's return value indicates whether we should
666 		 * continue parsing or not. This is both for correctness and
667 		 * a slight optimization, because it will check if we've
668 		 * encountered an empty branch or the end of the string
669 		 * immediately following a branch delimiter.
670 		 */
671 		if (!p_branch_do(p, &bc))
672 			break;
673 	}
674 #undef SEE_END
675 	if (p->allowbranch)
676 		p_branch_fix_tail(p, &bc);
677 	assert(!MORE() || SEE(end1));
678 }
679 
680 /*
681  * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
682  */
683 static bool			/* was the simple RE an unbackslashed $? */
684 p_simp_re(struct parse *p, struct branchc *bc)
685 {
686 	int c;
687 	int count;
688 	int count2;
689 	sopno pos;
690 	int i;
691 	wint_t wc;
692 	sopno subno;
693 #define	BACKSL	(1<<CHAR_BIT)
694 
695 	pos = HERE();		/* repetition op, if any, covers from here */
696 
697 	assert(MORE());		/* caller should have ensured this */
698 	c = GETNEXT();
699 	if (c == '\\') {
700 		(void) REQUIRE(MORE(), REG_EESCAPE);
701 		c = BACKSL | GETNEXT();
702 	}
703 	switch (c) {
704 	case '.':
705 		if (p->g->cflags&REG_NEWLINE)
706 			nonnewline(p);
707 		else
708 			EMIT(OANY, 0);
709 		break;
710 	case '[':
711 		p_bracket(p);
712 		break;
713 	case BACKSL|'<':
714 		EMIT(OBOW, 0);
715 		break;
716 	case BACKSL|'>':
717 		EMIT(OEOW, 0);
718 		break;
719 	case BACKSL|'{':
720 		SETERROR(REG_BADRPT);
721 		break;
722 	case BACKSL|'(':
723 		p->g->nsub++;
724 		subno = p->g->nsub;
725 		if (subno < NPAREN)
726 			p->pbegin[subno] = HERE();
727 		EMIT(OLPAREN, subno);
728 		/* the MORE here is an error heuristic */
729 		if (MORE() && !SEETWO('\\', ')'))
730 			p_re(p, '\\', ')');
731 		if (subno < NPAREN) {
732 			p->pend[subno] = HERE();
733 			assert(p->pend[subno] != 0);
734 		}
735 		EMIT(ORPAREN, subno);
736 		(void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
737 		break;
738 	case BACKSL|')':	/* should not get here -- must be user */
739 		SETERROR(REG_EPAREN);
740 		break;
741 	case BACKSL|'1':
742 	case BACKSL|'2':
743 	case BACKSL|'3':
744 	case BACKSL|'4':
745 	case BACKSL|'5':
746 	case BACKSL|'6':
747 	case BACKSL|'7':
748 	case BACKSL|'8':
749 	case BACKSL|'9':
750 		i = (c&~BACKSL) - '0';
751 		assert(i < NPAREN);
752 		if (p->pend[i] != 0) {
753 			assert(i <= p->g->nsub);
754 			EMIT(OBACK_, i);
755 			assert(p->pbegin[i] != 0);
756 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
757 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
758 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
759 			EMIT(O_BACK, i);
760 		} else
761 			SETERROR(REG_ESUBREG);
762 		p->g->backrefs = 1;
763 		break;
764 	case '*':
765 		/*
766 		 * Ordinary if used as the first character beyond BOL anchor of
767 		 * a (sub-)expression, counts as a bad repetition operator if it
768 		 * appears otherwise.
769 		 */
770 		(void) REQUIRE(bc->nchain == 0, REG_BADRPT);
771 		/* FALLTHROUGH */
772 	default:
773 		if (p->error != 0)
774 			return (false);	/* Definitely not $... */
775 		p->next--;
776 		wc = WGETNEXT();
777 		ordinary(p, wc);
778 		break;
779 	}
780 
781 	if (EAT('*')) {		/* implemented as +? */
782 		/* this case does not require the (y|) trick, noKLUDGE */
783 		INSERT(OPLUS_, pos);
784 		ASTERN(O_PLUS, pos);
785 		INSERT(OQUEST_, pos);
786 		ASTERN(O_QUEST, pos);
787 	} else if (EATTWO('\\', '{')) {
788 		count = p_count(p);
789 		if (EAT(',')) {
790 			if (MORE() && isdigit((uch)PEEK())) {
791 				count2 = p_count(p);
792 				(void) REQUIRE(count <= count2, REG_BADBR);
793 			} else		/* single number with comma */
794 				count2 = INFINITY;
795 		} else		/* just a single number */
796 			count2 = count;
797 		repeat(p, pos, count, count2);
798 		if (!EATTWO('\\', '}')) {	/* error heuristics */
799 			while (MORE() && !SEETWO('\\', '}'))
800 				NEXT();
801 			(void) REQUIRE(MORE(), REG_EBRACE);
802 			SETERROR(REG_BADBR);
803 		}
804 	} else if (c == '$')	/* $ (but not \$) ends it */
805 		return (true);
806 
807 	return (false);
808 }
809 
810 /*
811  * p_count - parse a repetition count
812  */
813 static int			/* the value */
814 p_count(struct parse *p)
815 {
816 	int count = 0;
817 	int ndigits = 0;
818 
819 	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
820 		count = count*10 + (GETNEXT() - '0');
821 		ndigits++;
822 	}
823 
824 	(void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
825 	return (count);
826 }
827 
828 /*
829  * p_bracket - parse a bracketed character list
830  */
831 static void
832 p_bracket(struct parse *p)
833 {
834 	cset *cs;
835 	wint_t ch;
836 
837 	/* Dept of Truly Sickening Special-Case Kludges */
838 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
839 		EMIT(OBOW, 0);
840 		NEXTn(6);
841 		return;
842 	}
843 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
844 		EMIT(OEOW, 0);
845 		NEXTn(6);
846 		return;
847 	}
848 
849 	if ((cs = allocset(p)) == NULL)
850 		return;
851 
852 	if (p->g->cflags&REG_ICASE)
853 		cs->icase = 1;
854 	if (EAT('^'))
855 		cs->invert = 1;
856 	if (EAT(']'))
857 		CHadd(p, cs, ']');
858 	else if (EAT('-'))
859 		CHadd(p, cs, '-');
860 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
861 		p_b_term(p, cs);
862 	if (EAT('-'))
863 		CHadd(p, cs, '-');
864 	(void) MUSTEAT(']', REG_EBRACK);
865 
866 	if (p->error != 0)	/* don't mess things up further */
867 		return;
868 
869 	if (cs->invert && p->g->cflags&REG_NEWLINE)
870 		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
871 
872 	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
873 		ordinary(p, ch);
874 		freeset(p, cs);
875 	} else
876 		EMIT(OANYOF, (int)(cs - p->g->sets));
877 }
878 
879 /*
880  * p_b_term - parse one term of a bracketed character list
881  */
882 static void
883 p_b_term(struct parse *p, cset *cs)
884 {
885 	char c;
886 	wint_t start, finish;
887 	wint_t i;
888 	locale_t loc = uselocale(NULL);
889 
890 	/* classify what we've got */
891 	switch ((MORE()) ? PEEK() : '\0') {
892 	case '[':
893 		c = (MORE2()) ? PEEK2() : '\0';
894 		break;
895 	case '-':
896 		SETERROR(REG_ERANGE);
897 		return;			/* NOTE RETURN */
898 	default:
899 		c = '\0';
900 		break;
901 	}
902 
903 	switch (c) {
904 	case ':':		/* character class */
905 		NEXT2();
906 		(void) REQUIRE(MORE(), REG_EBRACK);
907 		c = PEEK();
908 		(void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
909 		p_b_cclass(p, cs);
910 		(void) REQUIRE(MORE(), REG_EBRACK);
911 		(void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
912 		break;
913 	case '=':		/* equivalence class */
914 		NEXT2();
915 		(void) REQUIRE(MORE(), REG_EBRACK);
916 		c = PEEK();
917 		(void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
918 		p_b_eclass(p, cs);
919 		(void) REQUIRE(MORE(), REG_EBRACK);
920 		(void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
921 		break;
922 	default:		/* symbol, ordinary character, or range */
923 		start = p_b_symbol(p);
924 		if (SEE('-') && MORE2() && PEEK2() != ']') {
925 			/* range */
926 			NEXT();
927 			if (EAT('-'))
928 				finish = '-';
929 			else
930 				finish = p_b_symbol(p);
931 		} else
932 			finish = start;
933 		if (start == finish)
934 			CHadd(p, cs, start);
935 		else {
936 			if (loc->collate->lc_is_posix) {
937 				(void) REQUIRE((uch)start <= (uch)finish,
938 				    REG_ERANGE);
939 				CHaddrange(p, cs, start, finish);
940 			} else {
941 				(void) REQUIRE(_collate_range_cmp(start,
942 				    finish, loc) <= 0, REG_ERANGE);
943 				for (i = 0; i <= UCHAR_MAX; i++) {
944 					if (_collate_range_cmp(start, i, loc)
945 					    <= 0 &&
946 					    _collate_range_cmp(i, finish, loc)
947 					    <= 0)
948 						CHadd(p, cs, i);
949 				}
950 			}
951 		}
952 		break;
953 	}
954 }
955 
956 /*
957  * p_b_cclass - parse a character-class name and deal with it
958  */
959 static void
960 p_b_cclass(struct parse *p, cset *cs)
961 {
962 	const char *sp = p->next;
963 	size_t len;
964 	wctype_t wct;
965 	char clname[16];
966 
967 	while (MORE() && isalpha((uch)PEEK()))
968 		NEXT();
969 	len = p->next - sp;
970 	if (len >= sizeof (clname) - 1) {
971 		SETERROR(REG_ECTYPE);
972 		return;
973 	}
974 	(void) memcpy(clname, sp, len);
975 	clname[len] = '\0';
976 	if ((wct = wctype(clname)) == 0) {
977 		SETERROR(REG_ECTYPE);
978 		return;
979 	}
980 	CHaddtype(p, cs, wct);
981 }
982 
983 /*
984  * p_b_eclass - parse an equivalence-class name and deal with it
985  *
986  * This implementation is incomplete. xxx
987  */
988 static void
989 p_b_eclass(struct parse *p, cset *cs)
990 {
991 	wint_t c;
992 
993 	c = p_b_coll_elem(p, '=');
994 	CHadd(p, cs, c);
995 }
996 
997 /*
998  * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
999  */
1000 static wint_t			/* value of symbol */
1001 p_b_symbol(struct parse *p)
1002 {
1003 	wint_t value;
1004 
1005 	(void) REQUIRE(MORE(), REG_EBRACK);
1006 	if (!EATTWO('[', '.'))
1007 		return (WGETNEXT());
1008 
1009 	/* collating symbol */
1010 	value = p_b_coll_elem(p, '.');
1011 	(void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1012 	return (value);
1013 }
1014 
1015 /*
1016  * p_b_coll_elem - parse a collating-element name and look it up
1017  */
1018 static wint_t			/* value of collating element */
1019 p_b_coll_elem(struct parse *p,
1020     wint_t endc)		/* name ended by endc,']' */
1021 {
1022 	const char *sp = p->next;
1023 	struct cname *cp;
1024 	mbstate_t mbs;
1025 	wchar_t wc;
1026 	size_t clen, len;
1027 
1028 	while (MORE() && !SEETWO(endc, ']'))
1029 		NEXT();
1030 	if (!MORE()) {
1031 		SETERROR(REG_EBRACK);
1032 		return (0);
1033 	}
1034 	len = p->next - sp;
1035 	for (cp = cnames; cp->name != NULL; cp++)
1036 		if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1037 			return (cp->code);	/* known name */
1038 	(void) memset(&mbs, 0, sizeof (mbs));
1039 	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1040 		return (wc);			/* single character */
1041 	else if (clen == (size_t)-1 || clen == (size_t)-2)
1042 		SETERROR(REG_ECHAR);
1043 	else
1044 		SETERROR(REG_ECOLLATE);		/* neither */
1045 	return (0);
1046 }
1047 
1048 /*
1049  * othercase - return the case counterpart of an alphabetic
1050  */
1051 static wint_t			/* if no counterpart, return ch */
1052 othercase(wint_t ch)
1053 {
1054 	assert(iswalpha(ch));
1055 	if (iswupper(ch))
1056 		return (towlower(ch));
1057 	else if (iswlower(ch))
1058 		return (towupper(ch));
1059 	else			/* peculiar, but could happen */
1060 		return (ch);
1061 }
1062 
1063 /*
1064  * bothcases - emit a dualcase version of a two-case character
1065  *
1066  * Boy, is this implementation ever a kludge...
1067  */
1068 static void
1069 bothcases(struct parse *p, wint_t ch)
1070 {
1071 	const char *oldnext = p->next;
1072 	const char *oldend = p->end;
1073 	char bracket[3 + MB_LEN_MAX];
1074 	size_t n;
1075 	mbstate_t mbs;
1076 
1077 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
1078 	p->next = bracket;
1079 	(void) memset(&mbs, 0, sizeof (mbs));
1080 	n = wcrtomb(bracket, ch, &mbs);
1081 	assert(n != (size_t)-1);
1082 	bracket[n] = ']';
1083 	bracket[n + 1] = '\0';
1084 	p->end = bracket+n+1;
1085 	p_bracket(p);
1086 	assert(p->next == p->end);
1087 	p->next = oldnext;
1088 	p->end = oldend;
1089 }
1090 
1091 /*
1092  * ordinary - emit an ordinary character
1093  */
1094 static void
1095 ordinary(struct parse *p, wint_t ch)
1096 {
1097 	cset *cs;
1098 
1099 	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1100 		bothcases(p, ch);
1101 	else if ((ch & OPDMASK) == ch)
1102 		EMIT(OCHAR, ch);
1103 	else {
1104 		/*
1105 		 * Kludge: character is too big to fit into an OCHAR operand.
1106 		 * Emit a singleton set.
1107 		 */
1108 		if ((cs = allocset(p)) == NULL)
1109 			return;
1110 		CHadd(p, cs, ch);
1111 		EMIT(OANYOF, (int)(cs - p->g->sets));
1112 	}
1113 }
1114 
1115 /*
1116  * nonnewline - emit REG_NEWLINE version of OANY
1117  *
1118  * Boy, is this implementation ever a kludge...
1119  */
1120 static void
1121 nonnewline(struct parse *p)
1122 {
1123 	const char *oldnext = p->next;
1124 	const char *oldend = p->end;
1125 	char bracket[4];
1126 
1127 	p->next = bracket;
1128 	p->end = bracket+3;
1129 	bracket[0] = '^';
1130 	bracket[1] = '\n';
1131 	bracket[2] = ']';
1132 	bracket[3] = '\0';
1133 	p_bracket(p);
1134 	assert(p->next == bracket+3);
1135 	p->next = oldnext;
1136 	p->end = oldend;
1137 }
1138 
1139 /*
1140  * repeat - generate code for a bounded repetition, recursively if needed
1141  */
1142 static void
1143 repeat(struct parse *p,
1144     sopno start,		/* operand from here to end of strip */
1145     int from,			/* repeated from this number */
1146     int to)			/* to this number of times (maybe INFINITY) */
1147 {
1148 	sopno finish = HERE();
1149 #define	N	2
1150 #define	INF	3
1151 #define	REP(f, t)	((f)*8 + (t))
1152 #define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1153 	sopno copy;
1154 
1155 	if (p->error != 0)	/* head off possible runaway recursion */
1156 		return;
1157 
1158 	assert(from <= to);
1159 
1160 	switch (REP(MAP(from), MAP(to))) {
1161 	case REP(0, 0):			/* must be user doing this */
1162 		DROP(finish-start);	/* drop the operand */
1163 		break;
1164 	case REP(0, 1):			/* as x{1,1}? */
1165 	case REP(0, N):			/* as x{1,n}? */
1166 	case REP(0, INF):		/* as x{1,}? */
1167 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1168 		INSERT(OCH_, start);		/* offset is wrong... */
1169 		repeat(p, start+1, 1, to);
1170 		ASTERN(OOR1, start);
1171 		AHEAD(start);			/* ... fix it */
1172 		EMIT(OOR2, 0);
1173 		AHEAD(THERE());
1174 		ASTERN(O_CH, THERETHERE());
1175 		break;
1176 	case REP(1, 1):			/* trivial case */
1177 		/* done */
1178 		break;
1179 	case REP(1, N):			/* as x?x{1,n-1} */
1180 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1181 		INSERT(OCH_, start);
1182 		ASTERN(OOR1, start);
1183 		AHEAD(start);
1184 		EMIT(OOR2, 0);			/* offset very wrong... */
1185 		AHEAD(THERE());			/* ...so fix it */
1186 		ASTERN(O_CH, THERETHERE());
1187 		copy = dupl(p, start+1, finish+1);
1188 		assert(copy == finish+4);
1189 		repeat(p, copy, 1, to-1);
1190 		break;
1191 	case REP(1, INF):		/* as x+ */
1192 		INSERT(OPLUS_, start);
1193 		ASTERN(O_PLUS, start);
1194 		break;
1195 	case REP(N, N):			/* as xx{m-1,n-1} */
1196 		copy = dupl(p, start, finish);
1197 		repeat(p, copy, from-1, to-1);
1198 		break;
1199 	case REP(N, INF):		/* as xx{n-1,INF} */
1200 		copy = dupl(p, start, finish);
1201 		repeat(p, copy, from-1, to);
1202 		break;
1203 	default:			/* "can't happen" */
1204 		SETERROR(REG_EFATAL);	/* just in case */
1205 		break;
1206 	}
1207 }
1208 
1209 /*
1210  * wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1211  * character from the parse struct, signals a REG_ILLSEQ error if the
1212  * character can't be converted. Returns the number of bytes consumed.
1213  */
1214 static wint_t
1215 wgetnext(struct parse *p)
1216 {
1217 	mbstate_t mbs;
1218 	wchar_t wc;
1219 	size_t n;
1220 
1221 	(void) memset(&mbs, 0, sizeof (mbs));
1222 	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1223 	if (n == (size_t)-1 || n == (size_t)-2) {
1224 		SETERROR(REG_ECHAR);
1225 		return (0);
1226 	}
1227 	if (n == 0)
1228 		n = 1;
1229 	p->next += n;
1230 	return (wc);
1231 }
1232 
1233 /*
1234  * seterr - set an error condition
1235  */
1236 static int			/* useless but makes type checking happy */
1237 seterr(struct parse *p, int e)
1238 {
1239 	if (p->error == 0)	/* keep earliest error condition */
1240 		p->error = e;
1241 	p->next = nuls;		/* try to bring things to a halt */
1242 	p->end = nuls;
1243 	return (0);		/* make the return value well-defined */
1244 }
1245 
1246 /*
1247  * allocset - allocate a set of characters for []
1248  */
1249 static cset *
1250 allocset(struct parse *p)
1251 {
1252 	cset *cs, *ncs;
1253 
1254 	ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs));
1255 	if (ncs == NULL) {
1256 		SETERROR(REG_ESPACE);
1257 		return (NULL);
1258 	}
1259 	p->g->sets = ncs;
1260 	cs = &p->g->sets[p->g->ncsets++];
1261 	(void) memset(cs, 0, sizeof (*cs));
1262 
1263 	return (cs);
1264 }
1265 
1266 /*
1267  * freeset - free a now-unused set
1268  */
1269 static void
1270 freeset(struct parse *p, cset *cs)
1271 {
1272 	cset *top = &p->g->sets[p->g->ncsets];
1273 
1274 	free(cs->wides);
1275 	free(cs->ranges);
1276 	free(cs->types);
1277 	(void) memset(cs, 0, sizeof (*cs));
1278 	if (cs == top-1)	/* recover only the easy case */
1279 		p->g->ncsets--;
1280 }
1281 
1282 /*
1283  * singleton - Determine whether a set contains only one character,
1284  * returning it if so, otherwise returning OUT.
1285  */
1286 static wint_t
1287 singleton(cset *cs)
1288 {
1289 	wint_t i, s, n;
1290 
1291 	/* Exclude the complicated cases we don't want to deal with */
1292 	if (cs->nranges != 0 || cs->ntypes != 0 || cs->icase != 0)
1293 		return (OUT);
1294 
1295 	if (cs->nwides > 1)
1296 		return (OUT);
1297 
1298 	/* Count the number of characters present in the bitmap */
1299 	for (i = n = 0; i < NC; i++)
1300 		if (CHIN(cs, i)) {
1301 			n++;
1302 			s = i;
1303 		}
1304 
1305 	if (n > 1)
1306 		return (OUT);
1307 
1308 	if (n == 1) {
1309 		if (cs->nwides == 0)
1310 			return (s);
1311 		else
1312 			return (OUT);
1313 	}
1314 	if (cs->nwides == 1)
1315 		return (cs->wides[0]);
1316 
1317 	return (OUT);
1318 }
1319 
1320 /*
1321  * CHadd - add character to character set.
1322  */
1323 static void
1324 CHadd(struct parse *p, cset *cs, wint_t ch)
1325 {
1326 	wint_t nch, *newwides;
1327 	assert(ch >= 0);
1328 	if (ch < NC)
1329 		cs->bmp[ch >> 3] |= 1 << (ch & 7);
1330 	else {
1331 		newwides = realloc(cs->wides, (cs->nwides + 1) *
1332 		    sizeof (*cs->wides));
1333 		if (newwides == NULL) {
1334 			SETERROR(REG_ESPACE);
1335 			return;
1336 		}
1337 		cs->wides = newwides;
1338 		cs->wides[cs->nwides++] = ch;
1339 	}
1340 	if (cs->icase) {
1341 		if ((nch = towlower(ch)) < NC)
1342 			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1343 		if ((nch = towupper(ch)) < NC)
1344 			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1345 	}
1346 }
1347 
1348 /*
1349  * CHaddrange - add all characters in the range [min,max] to a character set.
1350  */
1351 static void
1352 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1353 {
1354 	crange *newranges;
1355 
1356 	for (; min < NC && min <= max; min++)
1357 		CHadd(p, cs, min);
1358 	if (min >= max)
1359 		return;
1360 	newranges = realloc(cs->ranges, (cs->nranges + 1) *
1361 	    sizeof (*cs->ranges));
1362 	if (newranges == NULL) {
1363 		SETERROR(REG_ESPACE);
1364 		return;
1365 	}
1366 	cs->ranges = newranges;
1367 	cs->ranges[cs->nranges].min = min;
1368 	cs->ranges[cs->nranges].max = max;
1369 	cs->nranges++;
1370 }
1371 
1372 /*
1373  * CHaddtype - add all characters of a certain type to a character set.
1374  */
1375 static void
1376 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1377 {
1378 	wint_t i;
1379 	wctype_t *newtypes;
1380 
1381 	for (i = 0; i < NC; i++)
1382 		if (iswctype(i, wct))
1383 			CHadd(p, cs, i);
1384 	newtypes = realloc(cs->types, (cs->ntypes + 1) *
1385 	    sizeof (*cs->types));
1386 	if (newtypes == NULL) {
1387 		SETERROR(REG_ESPACE);
1388 		return;
1389 	}
1390 	cs->types = newtypes;
1391 	cs->types[cs->ntypes++] = wct;
1392 }
1393 
1394 /*
1395  * dupl - emit a duplicate of a bunch of sops
1396  */
1397 static sopno			/* start of duplicate */
1398 dupl(struct parse *p,
1399     sopno start,		/* from here */
1400     sopno finish)		/* to this less one */
1401 {
1402 	sopno ret = HERE();
1403 	sopno len = finish - start;
1404 
1405 	assert(finish >= start);
1406 	if (len == 0)
1407 		return (ret);
1408 	if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1409 		return (ret);
1410 	assert(p->ssize >= p->slen + len);
1411 	(void) memcpy((char *)(p->strip + p->slen),
1412 	    (char *)(p->strip + start), (size_t)len*sizeof (sop));
1413 	p->slen += len;
1414 	return (ret);
1415 }
1416 
1417 /*
1418  * doemit - emit a strip operator
1419  *
1420  * It might seem better to implement this as a macro with a function as
1421  * hard-case backup, but it's just too big and messy unless there are
1422  * some changes to the data structures.  Maybe later.
1423  */
1424 static void
1425 doemit(struct parse *p, sop op, size_t opnd)
1426 {
1427 	/* avoid making error situations worse */
1428 	if (p->error != 0)
1429 		return;
1430 
1431 	/* deal with oversize operands ("can't happen", more or less) */
1432 	assert(opnd < 1<<OPSHIFT);
1433 
1434 	/* deal with undersized strip */
1435 	if (p->slen >= p->ssize)
1436 		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1437 			return;
1438 
1439 	/* finally, it's all reduced to the easy case */
1440 	p->strip[p->slen++] = SOP(op, opnd);
1441 }
1442 
1443 /*
1444  * doinsert - insert a sop into the strip
1445  */
1446 static void
1447 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1448 {
1449 	sopno sn;
1450 	sop s;
1451 	int i;
1452 
1453 	/* avoid making error situations worse */
1454 	if (p->error != 0)
1455 		return;
1456 
1457 	sn = HERE();
1458 	EMIT(op, opnd);		/* do checks, ensure space */
1459 	assert(HERE() == sn+1);
1460 	s = p->strip[sn];
1461 
1462 	/* adjust paren pointers */
1463 	assert(pos > 0);
1464 	for (i = 1; i < NPAREN; i++) {
1465 		if (p->pbegin[i] >= pos) {
1466 			p->pbegin[i]++;
1467 		}
1468 		if (p->pend[i] >= pos) {
1469 			p->pend[i]++;
1470 		}
1471 	}
1472 
1473 	(void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1474 	    (HERE()-pos-1)*sizeof (sop));
1475 	p->strip[pos] = s;
1476 }
1477 
1478 /*
1479  * dofwd - complete a forward reference
1480  */
1481 static void
1482 dofwd(struct parse *p, sopno pos, sop value)
1483 {
1484 	/* avoid making error situations worse */
1485 	if (p->error != 0)
1486 		return;
1487 
1488 	assert(value < 1<<OPSHIFT);
1489 	p->strip[pos] = OP(p->strip[pos]) | value;
1490 }
1491 
1492 /*
1493  * enlarge - enlarge the strip
1494  */
1495 static int
1496 enlarge(struct parse *p, sopno size)
1497 {
1498 	sop *sp;
1499 
1500 	if (p->ssize >= size)
1501 		return (1);
1502 
1503 	sp = (sop *)realloc(p->strip, size*sizeof (sop));
1504 	if (sp == NULL) {
1505 		SETERROR(REG_ESPACE);
1506 		return (0);
1507 	}
1508 	p->strip = sp;
1509 	p->ssize = size;
1510 	return (1);
1511 }
1512 
1513 /*
1514  * stripsnug - compact the strip
1515  */
1516 static void
1517 stripsnug(struct parse *p, struct re_guts *g)
1518 {
1519 	g->nstates = p->slen;
1520 	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop));
1521 	if (g->strip == NULL) {
1522 		SETERROR(REG_ESPACE);
1523 		g->strip = p->strip;
1524 	}
1525 }
1526 
1527 /*
1528  * findmust - fill in must and mlen with longest mandatory literal string
1529  *
1530  * This algorithm could do fancy things like analyzing the operands of |
1531  * for common subsequences.  Someday.  This code is simple and finds most
1532  * of the interesting cases.
1533  *
1534  * Note that must and mlen got initialized during setup.
1535  */
1536 static void
1537 findmust(struct parse *p, struct re_guts *g)
1538 {
1539 	sop *scan;
1540 	sop *start = NULL;
1541 	sop *newstart = NULL;
1542 	sopno newlen;
1543 	sop s;
1544 	char *cp;
1545 	int offset;
1546 	char buf[MB_LEN_MAX];
1547 	size_t clen;
1548 	mbstate_t mbs;
1549 	locale_t loc = uselocale(NULL);
1550 
1551 	/* avoid making error situations worse */
1552 	if (p->error != 0)
1553 		return;
1554 
1555 	/*
1556 	 * It's not generally safe to do a ``char'' substring search on
1557 	 * multibyte character strings, but it's safe for at least
1558 	 * UTF-8 (see RFC 3629).
1559 	 */
1560 	if (MB_CUR_MAX > 1 &&
1561 	    strcmp(loc->runelocale->__encoding, "UTF-8") != 0)
1562 		return;
1563 
1564 	/* find the longest OCHAR sequence in strip */
1565 	newlen = 0;
1566 	offset = 0;
1567 	g->moffset = 0;
1568 	scan = g->strip + 1;
1569 	do {
1570 		s = *scan++;
1571 		switch (OP(s)) {
1572 		case OCHAR:		/* sequence member */
1573 			if (newlen == 0) {		/* new sequence */
1574 				(void) memset(&mbs, 0, sizeof (mbs));
1575 				newstart = scan - 1;
1576 			}
1577 			clen = wcrtomb(buf, OPND(s), &mbs);
1578 			if (clen == (size_t)-1)
1579 				goto toohard;
1580 			newlen += clen;
1581 			break;
1582 		case OPLUS_:		/* things that don't break one */
1583 		case OLPAREN:
1584 		case ORPAREN:
1585 			break;
1586 		case OQUEST_:		/* things that must be skipped */
1587 		case OCH_:
1588 			offset = altoffset(scan, offset);
1589 			scan--;
1590 			do {
1591 				scan += OPND(s);
1592 				s = *scan;
1593 				/* assert() interferes w debug printouts */
1594 				if (OP(s) != (sop)O_QUEST &&
1595 				    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2) {
1596 					g->iflags |= BAD;
1597 					return;
1598 				}
1599 			} while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1600 			/* FALLTHROUGH */
1601 		case OBOW:		/* things that break a sequence */
1602 		case OEOW:
1603 		case OBOL:
1604 		case OEOL:
1605 		case O_QUEST:
1606 		case O_CH:
1607 		case OEND:
1608 			if (newlen > (sopno)g->mlen) {		/* ends one */
1609 				start = newstart;
1610 				g->mlen = newlen;
1611 				if (offset > -1) {
1612 					g->moffset += offset;
1613 					offset = newlen;
1614 				} else
1615 					g->moffset = offset;
1616 			} else {
1617 				if (offset > -1)
1618 					offset += newlen;
1619 			}
1620 			newlen = 0;
1621 			break;
1622 		case OANY:
1623 			if (newlen > (sopno)g->mlen) {		/* ends one */
1624 				start = newstart;
1625 				g->mlen = newlen;
1626 				if (offset > -1) {
1627 					g->moffset += offset;
1628 					offset = newlen;
1629 				} else
1630 					g->moffset = offset;
1631 			} else {
1632 				if (offset > -1)
1633 					offset += newlen;
1634 			}
1635 			if (offset > -1)
1636 				offset++;
1637 			newlen = 0;
1638 			break;
1639 		case OANYOF:		/* may or may not invalidate offset */
1640 			/* First, everything as OANY */
1641 			if (newlen > (sopno)g->mlen) {		/* ends one */
1642 				start = newstart;
1643 				g->mlen = newlen;
1644 				if (offset > -1) {
1645 					g->moffset += offset;
1646 					offset = newlen;
1647 				} else
1648 					g->moffset = offset;
1649 			} else {
1650 				if (offset > -1)
1651 					offset += newlen;
1652 			}
1653 			if (offset > -1)
1654 				offset++;
1655 			newlen = 0;
1656 			break;
1657 		toohard:
1658 		default:
1659 			/*
1660 			 * Anything here makes it impossible or too hard
1661 			 * to calculate the offset -- so we give up;
1662 			 * save the last known good offset, in case the
1663 			 * must sequence doesn't occur later.
1664 			 */
1665 			if (newlen > (sopno)g->mlen) {		/* ends one */
1666 				start = newstart;
1667 				g->mlen = newlen;
1668 				if (offset > -1)
1669 					g->moffset += offset;
1670 				else
1671 					g->moffset = offset;
1672 			}
1673 			offset = -1;
1674 			newlen = 0;
1675 			break;
1676 		}
1677 	} while (OP(s) != OEND);
1678 
1679 	if (g->mlen == 0) {		/* there isn't one */
1680 		g->moffset = -1;
1681 		return;
1682 	}
1683 
1684 	/* turn it into a character string */
1685 	g->must = malloc((size_t)g->mlen + 1);
1686 	if (g->must == NULL) {		/* argh; just forget it */
1687 		g->mlen = 0;
1688 		g->moffset = -1;
1689 		return;
1690 	}
1691 	cp = g->must;
1692 	scan = start;
1693 	(void) memset(&mbs, 0, sizeof (mbs));
1694 	while (cp < g->must + g->mlen) {
1695 		while (OP(s = *scan++) != OCHAR)
1696 			continue;
1697 		clen = wcrtomb(cp, OPND(s), &mbs);
1698 		assert(clen != (size_t)-1);
1699 		cp += clen;
1700 	}
1701 	assert(cp == g->must + g->mlen);
1702 	*cp++ = '\0';		/* just on general principles */
1703 }
1704 
1705 /*
1706  * altoffset - choose biggest offset among multiple choices
1707  *
1708  * Compute, recursively if necessary, the largest offset among multiple
1709  * re paths.
1710  */
1711 static int
1712 altoffset(sop *scan, int offset)
1713 {
1714 	int largest;
1715 	int try;
1716 	sop s;
1717 
1718 	/* If we gave up already on offsets, return */
1719 	if (offset == -1)
1720 		return (-1);
1721 
1722 	largest = 0;
1723 	try = 0;
1724 	s = *scan++;
1725 	while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH) {
1726 		switch (OP(s)) {
1727 		case OOR1:
1728 			if (try > largest)
1729 				largest = try;
1730 			try = 0;
1731 			break;
1732 		case OQUEST_:
1733 		case OCH_:
1734 			try = altoffset(scan, try);
1735 			if (try == -1)
1736 				return (-1);
1737 			scan--;
1738 			do {
1739 				scan += OPND(s);
1740 				s = *scan;
1741 				if (OP(s) != (sop)O_QUEST &&
1742 				    OP(s) != (sop)O_CH && OP(s) != (sop)OOR2)
1743 					return (-1);
1744 			} while (OP(s) != (sop)O_QUEST && OP(s) != (sop)O_CH);
1745 			/*
1746 			 * We must skip to the next position, or we'll
1747 			 * leave altoffset() too early.
1748 			 */
1749 			scan++;
1750 			break;
1751 		case OANYOF:
1752 		case OCHAR:
1753 		case OANY:
1754 			try++;
1755 			/*FALLTHRU*/
1756 		case OBOW:
1757 		case OEOW:
1758 		case OLPAREN:
1759 		case ORPAREN:
1760 		case OOR2:
1761 			break;
1762 		default:
1763 			try = -1;
1764 			break;
1765 		}
1766 		if (try == -1)
1767 			return (-1);
1768 		s = *scan++;
1769 	}
1770 
1771 	if (try > largest)
1772 		largest = try;
1773 
1774 	return (largest+offset);
1775 }
1776 
1777 /*
1778  * computejumps - compute char jumps for BM scan
1779  *
1780  * This algorithm assumes g->must exists and is has size greater than
1781  * zero. It's based on the algorithm found on Computer Algorithms by
1782  * Sara Baase.
1783  *
1784  * A char jump is the number of characters one needs to jump based on
1785  * the value of the character from the text that was mismatched.
1786  */
1787 static void
1788 computejumps(struct parse *p, struct re_guts *g)
1789 {
1790 	int ch;
1791 	int mindex;
1792 
1793 	/* Avoid making errors worse */
1794 	if (p->error != 0)
1795 		return;
1796 
1797 	g->charjump = (int *)malloc((NC_MAX + 1) * sizeof (int));
1798 	if (g->charjump == NULL)	/* Not a fatal error */
1799 		return;
1800 	/* Adjust for signed chars, if necessary */
1801 	g->charjump = &g->charjump[-(CHAR_MIN)];
1802 
1803 	/*
1804 	 * If the character does not exist in the pattern, the jump
1805 	 * is equal to the number of characters in the pattern.
1806 	 */
1807 	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1808 		g->charjump[ch] = g->mlen;
1809 
1810 	/*
1811 	 * If the character does exist, compute the jump that would
1812 	 * take us to the last character in the pattern equal to it
1813 	 * (notice that we match right to left, so that last character
1814 	 * is the first one that would be matched).
1815 	 */
1816 	for (mindex = 0; mindex < g->mlen; mindex++)
1817 		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1818 }
1819 
1820 /*
1821  * computematchjumps - compute match jumps for BM scan
1822  *
1823  * This algorithm assumes g->must exists and is has size greater than
1824  * zero. It's based on the algorithm found on Computer Algorithms by
1825  * Sara Baase.
1826  *
1827  * A match jump is the number of characters one needs to advance based
1828  * on the already-matched suffix.
1829  * Notice that all values here are minus (g->mlen-1), because of the way
1830  * the search algorithm works.
1831  */
1832 static void
1833 computematchjumps(struct parse *p, struct re_guts *g)
1834 {
1835 	int mindex;		/* General "must" iterator */
1836 	int suffix;		/* Keeps track of matching suffix */
1837 	int ssuffix;		/* Keeps track of suffixes' suffix */
1838 	int *pmatches;
1839 				/*
1840 				 * pmatches[k] points to the next i
1841 				 * such that i+1...mlen is a substring
1842 				 * of k+1...k+mlen-i-1
1843 				 */
1844 
1845 	/* Avoid making errors worse */
1846 	if (p->error != 0)
1847 		return;
1848 
1849 	pmatches = (int *)malloc(g->mlen * sizeof (unsigned int));
1850 	if (pmatches == NULL) {
1851 		g->matchjump = NULL;
1852 		return;
1853 	}
1854 
1855 	g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int));
1856 	if (g->matchjump == NULL) {	/* Not a fatal error */
1857 		free(pmatches);
1858 		return;
1859 	}
1860 
1861 	/* Set maximum possible jump for each character in the pattern */
1862 	for (mindex = 0; mindex < g->mlen; mindex++)
1863 		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1864 
1865 	/* Compute pmatches[] */
1866 	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1867 	    mindex--, suffix--) {
1868 		pmatches[mindex] = suffix;
1869 
1870 		/*
1871 		 * If a mismatch is found, interrupting the substring,
1872 		 * compute the matchjump for that position. If no
1873 		 * mismatch is found, then a text substring mismatched
1874 		 * against the suffix will also mismatch against the
1875 		 * substring.
1876 		 */
1877 		while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) {
1878 			g->matchjump[suffix] = MIN(g->matchjump[suffix],
1879 			    g->mlen - mindex - 1);
1880 			suffix = pmatches[suffix];
1881 		}
1882 	}
1883 
1884 	/*
1885 	 * Compute the matchjump up to the last substring found to jump
1886 	 * to the beginning of the largest must pattern prefix matching
1887 	 * it's own suffix.
1888 	 */
1889 	for (mindex = 0; mindex <= suffix; mindex++)
1890 		g->matchjump[mindex] = MIN(g->matchjump[mindex],
1891 		    g->mlen + suffix - mindex);
1892 
1893 	ssuffix = pmatches[suffix];
1894 	while (suffix < g->mlen) {
1895 		while (suffix <= ssuffix && suffix < g->mlen) {
1896 			g->matchjump[suffix] = MIN(g->matchjump[suffix],
1897 			    g->mlen + ssuffix - suffix);
1898 			suffix++;
1899 		}
1900 		if (suffix < g->mlen)
1901 			ssuffix = pmatches[ssuffix];
1902 	}
1903 
1904 	free(pmatches);
1905 }
1906 
1907 /*
1908  * pluscount - count + nesting
1909  */
1910 static sopno			/* nesting depth */
1911 pluscount(struct parse *p, struct re_guts *g)
1912 {
1913 	sop *scan;
1914 	sop s;
1915 	sopno plusnest = 0;
1916 	sopno maxnest = 0;
1917 
1918 	if (p->error != 0)
1919 		return (0);	/* there may not be an OEND */
1920 
1921 	scan = g->strip + 1;
1922 	do {
1923 		s = *scan++;
1924 		switch (OP(s)) {
1925 		case OPLUS_:
1926 			plusnest++;
1927 			break;
1928 		case O_PLUS:
1929 			if (plusnest > maxnest)
1930 				maxnest = plusnest;
1931 			plusnest--;
1932 			break;
1933 		}
1934 	} while (OP(s) != OEND);
1935 	if (plusnest != 0)
1936 		g->iflags |= BAD;
1937 	return (maxnest);
1938 }
1939