xref: /titanic_51/usr/src/lib/libc/port/locale/regcomp.c (revision 5aec55eb0591d2fcdd38d7dd5408a6ff3456e596)
1 /*
2  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
4  * Copyright (c) 1992, 1993, 1994
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Henry Spencer.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "lint.h"
36 #include "file64.h"
37 #include <sys/types.h>
38 #include <stdio.h>
39 #include <string.h>
40 #include <ctype.h>
41 #include <limits.h>
42 #include <stdlib.h>
43 #include <regex.h>
44 #include <wchar.h>
45 #include <wctype.h>
46 
47 #include "runetype.h"
48 #include "collate.h"
49 
50 #include "utils.h"
51 #include "regex2.h"
52 
53 #include "cname.h"
54 #include "mblocal.h"
55 
56 /*
57  * parse structure, passed up and down to avoid global variables and
58  * other clumsinesses
59  */
60 struct parse {
61 	char *next;		/* next character in RE */
62 	char *end;		/* end of string (-> NUL normally) */
63 	int error;		/* has an error been seen? */
64 	sop *strip;		/* malloced strip */
65 	sopno ssize;		/* malloced strip size (allocated) */
66 	sopno slen;		/* malloced strip length (used) */
67 	int ncsalloc;		/* number of csets allocated */
68 	struct re_guts *g;
69 #define	NPAREN	10		/* we need to remember () 1-9 for back refs */
70 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
71 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
72 };
73 
74 /* ========= begin header generated by ./mkh ========= */
75 #ifdef __cplusplus
76 extern "C" {
77 #endif
78 
79 /* === regcomp.c === */
80 static void p_ere(struct parse *p, wint_t stop);
81 static void p_ere_exp(struct parse *p);
82 static void p_str(struct parse *p);
83 static void p_bre(struct parse *p, wint_t end1, wint_t end2);
84 static int p_simp_re(struct parse *p, int starordinary);
85 static int p_count(struct parse *p);
86 static void p_bracket(struct parse *p);
87 static void p_b_term(struct parse *p, cset *cs);
88 static void p_b_cclass(struct parse *p, cset *cs);
89 static void p_b_eclass(struct parse *p, cset *cs);
90 static wint_t p_b_symbol(struct parse *p);
91 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
92 static wint_t othercase(wint_t ch);
93 static void bothcases(struct parse *p, wint_t ch);
94 static void ordinary(struct parse *p, wint_t ch);
95 static void nonnewline(struct parse *p);
96 static void repeat(struct parse *p, sopno start, int from, int to);
97 static int seterr(struct parse *p, int e);
98 static cset *allocset(struct parse *p);
99 static void freeset(struct parse *p, cset *cs);
100 static void CHadd(struct parse *p, cset *cs, wint_t ch);
101 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
102 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
103 static wint_t singleton(cset *cs);
104 static sopno dupl(struct parse *p, sopno start, sopno finish);
105 static void doemit(struct parse *p, sop op, size_t opnd);
106 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
107 static void dofwd(struct parse *p, sopno pos, sop value);
108 static void enlarge(struct parse *p, sopno size);
109 static void stripsnug(struct parse *p, struct re_guts *g);
110 static void findmust(struct parse *p, struct re_guts *g);
111 static int altoffset(sop *scan, int offset);
112 static void computejumps(struct parse *p, struct re_guts *g);
113 static void computematchjumps(struct parse *p, struct re_guts *g);
114 static sopno pluscount(struct parse *p, struct re_guts *g);
115 static wint_t wgetnext(struct parse *p);
116 
117 #ifdef __cplusplus
118 }
119 #endif
120 /* ========= end header generated by ./mkh ========= */
121 
122 static char nuls[10];		/* place to point scanner in event of error */
123 
124 /*
125  * macros for use with parse structure
126  * BEWARE:  these know that the parse structure is named `p' !!!
127  */
128 #define	PEEK()	(*p->next)
129 #define	PEEK2()	(*(p->next+1))
130 #define	MORE()	(p->next < p->end)
131 #define	MORE2()	(p->next+1 < p->end)
132 #define	SEE(c)	(MORE() && PEEK() == (c))
133 #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
134 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
135 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
136 #define	NEXT()	(p->next++)
137 #define	NEXT2()	(p->next += 2)
138 #define	NEXTn(n)	(p->next += (n))
139 #define	GETNEXT()	(*p->next++)
140 #define	WGETNEXT()	wgetnext(p)
141 #define	SETERROR(e)	((void)seterr(p, (e)))
142 #define	REQUIRE(co, e)	((co) || seterr(p, e))
143 #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
144 #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
145 #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
146 #define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
147 #define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
148 #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
149 #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
150 #define	HERE()		(p->slen)
151 #define	THERE()		(p->slen - 1)
152 #define	THERETHERE()	(p->slen - 2)
153 #define	DROP(n)	(p->slen -= (n))
154 
155 #ifndef NDEBUG
156 static int never = 0;		/* for use in asserts; shuts lint up */
157 #else
158 #define	never	0		/* some <assert.h>s have bugs too */
159 #endif
160 
161 /*
162  * regcomp - interface for parser and compilation
163  */
164 int				/* 0 success, otherwise REG_something */
165 regcomp(regex_t *_RESTRICT_KYWD preg,
166 	const char *_RESTRICT_KYWD pattern,
167 	int cflags)
168 {
169 	struct parse pa;
170 	struct re_guts *g;
171 	struct parse *p = &pa;
172 	int i;
173 	size_t len;
174 #ifdef REDEBUG
175 #define	GOODFLAGS(f)	(f)
176 #else
177 #define	GOODFLAGS(f)	((f)&~REG_DUMP)
178 #endif
179 
180 	/* We had REG_INVARG, but we don't have that on Solaris. */
181 	cflags = GOODFLAGS(cflags);
182 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
183 		return (REG_EFATAL);
184 
185 	if (cflags&REG_PEND) {
186 		if (preg->re_endp < pattern)
187 			return (REG_EFATAL);
188 		len = preg->re_endp - pattern;
189 	} else
190 		len = strlen((char *)pattern);
191 
192 	/* do the mallocs early so failure handling is easy */
193 	g = (struct re_guts *)malloc(sizeof (struct re_guts));
194 	if (g == NULL)
195 		return (REG_ESPACE);
196 	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
197 	p->strip = (sop *)malloc(p->ssize * sizeof (sop));
198 	p->slen = 0;
199 	if (p->strip == NULL) {
200 		free((char *)g);
201 		return (REG_ESPACE);
202 	}
203 
204 	/* set things up */
205 	p->g = g;
206 	p->next = (char *)pattern;	/* convenience; we do not modify it */
207 	p->end = p->next + len;
208 	p->error = 0;
209 	p->ncsalloc = 0;
210 	for (i = 0; i < NPAREN; i++) {
211 		p->pbegin[i] = 0;
212 		p->pend[i] = 0;
213 	}
214 	g->sets = NULL;
215 	g->ncsets = 0;
216 	g->cflags = cflags;
217 	g->iflags = 0;
218 	g->nbol = 0;
219 	g->neol = 0;
220 	g->must = NULL;
221 	g->moffset = -1;
222 	g->charjump = NULL;
223 	g->matchjump = NULL;
224 	g->mlen = 0;
225 	g->nsub = 0;
226 	g->backrefs = 0;
227 
228 	/* do it */
229 	EMIT(OEND, 0);
230 	g->firststate = THERE();
231 	if (cflags&REG_EXTENDED)
232 		p_ere(p, OUT);
233 	else if (cflags&REG_NOSPEC)
234 		p_str(p);
235 	else
236 		p_bre(p, OUT, OUT);
237 	EMIT(OEND, 0);
238 	g->laststate = THERE();
239 
240 	/* tidy up loose ends and fill things in */
241 	stripsnug(p, g);
242 	findmust(p, g);
243 	/*
244 	 * only use Boyer-Moore algorithm if the pattern is bigger
245 	 * than three characters
246 	 */
247 	if (g->mlen > 3) {
248 		computejumps(p, g);
249 		computematchjumps(p, g);
250 		if (g->matchjump == NULL && g->charjump != NULL) {
251 			free(g->charjump);
252 			g->charjump = NULL;
253 		}
254 	}
255 	g->nplus = pluscount(p, g);
256 	g->magic = MAGIC2;
257 	preg->re_nsub = g->nsub;
258 	preg->re_g = g;
259 	preg->re_magic = MAGIC1;
260 #ifndef REDEBUG
261 	/* not debugging, so can't rely on the assert() in regexec() */
262 	if (g->iflags&BAD)
263 		SETERROR(REG_EFATAL);
264 #endif
265 
266 	/* win or lose, we're done */
267 	if (p->error != 0)	/* lose */
268 		regfree(preg);
269 	return (p->error);
270 }
271 
272 /*
273  * p_ere - ERE parser top level, concatenation and alternation
274  */
275 static void
276 p_ere(struct parse *p,
277     wint_t stop)		/* character this ERE should end at */
278 {
279 	char c;
280 	sopno prevback;
281 	sopno prevfwd;
282 	sopno conc;
283 	int first = 1;		/* is this the first alternative? */
284 
285 	for (;;) {
286 		/* do a bunch of concatenated expressions */
287 		conc = HERE();
288 		while (MORE() && (c = PEEK()) != '|' && c != stop)
289 			p_ere_exp(p);
290 		/* require nonempty */
291 		(void) REQUIRE(HERE() != conc, REG_BADPAT);
292 
293 		if (!EAT('|'))
294 			break;		/* NOTE BREAK OUT */
295 
296 		if (first) {
297 			INSERT(OCH_, conc);	/* offset is wrong */
298 			prevfwd = conc;
299 			prevback = conc;
300 			first = 0;
301 		}
302 		ASTERN(OOR1, prevback);
303 		prevback = THERE();
304 		AHEAD(prevfwd);			/* fix previous offset */
305 		prevfwd = HERE();
306 		EMIT(OOR2, 0);			/* offset is very wrong */
307 	}
308 
309 	if (!first) {		/* tail-end fixups */
310 		AHEAD(prevfwd);
311 		ASTERN(O_CH, prevback);
312 	}
313 
314 	assert(!MORE() || SEE(stop));
315 }
316 
317 /*
318  * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
319  */
320 static void
321 p_ere_exp(struct parse *p)
322 {
323 	char c;
324 	wint_t wc;
325 	sopno pos;
326 	int count;
327 	int count2;
328 	sopno subno;
329 	int wascaret = 0;
330 
331 	assert(MORE());		/* caller should have ensured this */
332 	c = GETNEXT();
333 
334 	pos = HERE();
335 	switch (c) {
336 	case '(':
337 		(void) REQUIRE(MORE(), REG_EPAREN);
338 		p->g->nsub++;
339 		subno = p->g->nsub;
340 		if (subno < NPAREN)
341 			p->pbegin[subno] = HERE();
342 		EMIT(OLPAREN, subno);
343 		if (!SEE(')'))
344 			p_ere(p, ')');
345 		if (subno < NPAREN) {
346 			p->pend[subno] = HERE();
347 			assert(p->pend[subno] != 0);
348 		}
349 		EMIT(ORPAREN, subno);
350 		(void) MUSTEAT(')', REG_EPAREN);
351 		break;
352 #ifndef POSIX_MISTAKE
353 	case ')':		/* happens only if no current unmatched ( */
354 		/*
355 		 * You may ask, why the ifndef?  Because I didn't notice
356 		 * this until slightly too late for 1003.2, and none of the
357 		 * other 1003.2 regular-expression reviewers noticed it at
358 		 * all.  So an unmatched ) is legal POSIX, at least until
359 		 * we can get it fixed.
360 		 */
361 		SETERROR(REG_EPAREN);
362 		break;
363 #endif
364 	case '^':
365 		EMIT(OBOL, 0);
366 		p->g->iflags |= USEBOL;
367 		p->g->nbol++;
368 		wascaret = 1;
369 		break;
370 	case '$':
371 		EMIT(OEOL, 0);
372 		p->g->iflags |= USEEOL;
373 		p->g->neol++;
374 		break;
375 	case '|':
376 		SETERROR(REG_BADPAT);
377 		break;
378 	case '*':
379 	case '+':
380 	case '?':
381 		SETERROR(REG_BADRPT);
382 		break;
383 	case '.':
384 		if (p->g->cflags&REG_NEWLINE)
385 			nonnewline(p);
386 		else
387 			EMIT(OANY, 0);
388 		break;
389 	case '[':
390 		p_bracket(p);
391 		break;
392 	case '\\':
393 		(void) REQUIRE(MORE(), REG_EESCAPE);
394 		wc = WGETNEXT();
395 		ordinary(p, wc);
396 		break;
397 	case '{':		/* okay as ordinary except if digit follows */
398 		(void) REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
399 		/* FALLTHROUGH */
400 	default:
401 		p->next--;
402 		wc = WGETNEXT();
403 		ordinary(p, wc);
404 		break;
405 	}
406 
407 	if (!MORE())
408 		return;
409 	c = PEEK();
410 	/* we call { a repetition if followed by a digit */
411 	if (!(c == '*' || c == '+' || c == '?' ||
412 	    (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
413 		return;		/* no repetition, we're done */
414 	NEXT();
415 
416 	(void) REQUIRE(!wascaret, REG_BADRPT);
417 	switch (c) {
418 	case '*':	/* implemented as +? */
419 		/* this case does not require the (y|) trick, noKLUDGE */
420 		INSERT(OPLUS_, pos);
421 		ASTERN(O_PLUS, pos);
422 		INSERT(OQUEST_, pos);
423 		ASTERN(O_QUEST, pos);
424 		break;
425 	case '+':
426 		INSERT(OPLUS_, pos);
427 		ASTERN(O_PLUS, pos);
428 		break;
429 	case '?':
430 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
431 		INSERT(OCH_, pos);		/* offset slightly wrong */
432 		ASTERN(OOR1, pos);		/* this one's right */
433 		AHEAD(pos);			/* fix the OCH_ */
434 		EMIT(OOR2, 0);			/* offset very wrong... */
435 		AHEAD(THERE());			/* ...so fix it */
436 		ASTERN(O_CH, THERETHERE());
437 		break;
438 	case '{':
439 		count = p_count(p);
440 		if (EAT(',')) {
441 			if (isdigit((uch)PEEK())) {
442 				count2 = p_count(p);
443 				(void) REQUIRE(count <= count2, REG_BADBR);
444 			} else		/* single number with comma */
445 				count2 = INFINITY;
446 		} else		/* just a single number */
447 			count2 = count;
448 		repeat(p, pos, count, count2);
449 		if (!EAT('}')) {	/* error heuristics */
450 			while (MORE() && PEEK() != '}')
451 				NEXT();
452 			(void) REQUIRE(MORE(), REG_EBRACE);
453 			SETERROR(REG_BADBR);
454 		}
455 		break;
456 	}
457 
458 	if (!MORE())
459 		return;
460 	c = PEEK();
461 	if (!(c == '*' || c == '+' || c == '?' ||
462 	    (c == '{' && MORE2() && isdigit((uch)PEEK2()))))
463 		return;
464 	SETERROR(REG_BADRPT);
465 }
466 
467 /*
468  * p_str - string (no metacharacters) "parser"
469  */
470 static void
471 p_str(struct parse *p)
472 {
473 	(void) REQUIRE(MORE(), REG_BADPAT);
474 	while (MORE())
475 		ordinary(p, WGETNEXT());
476 }
477 
478 /*
479  * p_bre - BRE parser top level, anchoring and concatenation
480  * Giving end1 as OUT essentially eliminates the end1/end2 check.
481  *
482  * This implementation is a bit of a kludge, in that a trailing $ is first
483  * taken as an ordinary character and then revised to be an anchor.
484  * The amount of lookahead needed to avoid this kludge is excessive.
485  */
486 static void
487 p_bre(struct parse *p,
488     wint_t end1,		/* first terminating character */
489     wint_t end2)		/* second terminating character */
490 {
491 	sopno start = HERE();
492 	int first = 1;			/* first subexpression? */
493 	int wasdollar = 0;
494 
495 	if (EAT('^')) {
496 		EMIT(OBOL, 0);
497 		p->g->iflags |= USEBOL;
498 		p->g->nbol++;
499 	}
500 	while (MORE() && !SEETWO(end1, end2)) {
501 		wasdollar = p_simp_re(p, first);
502 		first = 0;
503 	}
504 	if (wasdollar) {	/* oops, that was a trailing anchor */
505 		DROP(1);
506 		EMIT(OEOL, 0);
507 		p->g->iflags |= USEEOL;
508 		p->g->neol++;
509 	}
510 
511 	(void) REQUIRE(HERE() != start, REG_BADPAT);	/* require nonempty */
512 }
513 
514 /*
515  * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
516  */
517 static int			/* was the simple RE an unbackslashed $? */
518 p_simp_re(struct parse *p,
519 	int starordinary)	/* is a leading * an ordinary character? */
520 {
521 	int c;
522 	int count;
523 	int count2;
524 	sopno pos;
525 	int i;
526 	wint_t wc;
527 	sopno subno;
528 #define	BACKSL	(1<<CHAR_BIT)
529 
530 	pos = HERE();		/* repetion op, if any, covers from here */
531 
532 	assert(MORE());		/* caller should have ensured this */
533 	c = GETNEXT();
534 	if (c == '\\') {
535 		(void) REQUIRE(MORE(), REG_EESCAPE);
536 		c = BACKSL | GETNEXT();
537 	}
538 	switch (c) {
539 	case '.':
540 		if (p->g->cflags&REG_NEWLINE)
541 			nonnewline(p);
542 		else
543 			EMIT(OANY, 0);
544 		break;
545 	case '[':
546 		p_bracket(p);
547 		break;
548 	case BACKSL|'{':
549 		SETERROR(REG_BADRPT);
550 		break;
551 	case BACKSL|'(':
552 		p->g->nsub++;
553 		subno = p->g->nsub;
554 		if (subno < NPAREN)
555 			p->pbegin[subno] = HERE();
556 		EMIT(OLPAREN, subno);
557 		/* the MORE here is an error heuristic */
558 		if (MORE() && !SEETWO('\\', ')'))
559 			p_bre(p, '\\', ')');
560 		if (subno < NPAREN) {
561 			p->pend[subno] = HERE();
562 			assert(p->pend[subno] != 0);
563 		}
564 		EMIT(ORPAREN, subno);
565 		(void) REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
566 		break;
567 	case BACKSL|')':	/* should not get here -- must be user */
568 	case BACKSL|'}':
569 		SETERROR(REG_EPAREN);
570 		break;
571 	case BACKSL|'1':
572 	case BACKSL|'2':
573 	case BACKSL|'3':
574 	case BACKSL|'4':
575 	case BACKSL|'5':
576 	case BACKSL|'6':
577 	case BACKSL|'7':
578 	case BACKSL|'8':
579 	case BACKSL|'9':
580 		i = (c&~BACKSL) - '0';
581 		assert(i < NPAREN);
582 		if (p->pend[i] != 0) {
583 			assert(i <= p->g->nsub);
584 			EMIT(OBACK_, i);
585 			assert(p->pbegin[i] != 0);
586 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
587 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
588 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
589 			EMIT(O_BACK, i);
590 		} else
591 			SETERROR(REG_ESUBREG);
592 		p->g->backrefs = 1;
593 		break;
594 	case '*':
595 		(void) REQUIRE(starordinary, REG_BADRPT);
596 		/* FALLTHROUGH */
597 	default:
598 		p->next--;
599 		wc = WGETNEXT();
600 		ordinary(p, wc);
601 		break;
602 	}
603 
604 	if (EAT('*')) {		/* implemented as +? */
605 		/* this case does not require the (y|) trick, noKLUDGE */
606 		INSERT(OPLUS_, pos);
607 		ASTERN(O_PLUS, pos);
608 		INSERT(OQUEST_, pos);
609 		ASTERN(O_QUEST, pos);
610 	} else if (EATTWO('\\', '{')) {
611 		count = p_count(p);
612 		if (EAT(',')) {
613 			if (MORE() && isdigit((uch)PEEK())) {
614 				count2 = p_count(p);
615 				(void) REQUIRE(count <= count2, REG_BADBR);
616 			} else		/* single number with comma */
617 				count2 = INFINITY;
618 		} else		/* just a single number */
619 			count2 = count;
620 		repeat(p, pos, count, count2);
621 		if (!EATTWO('\\', '}')) {	/* error heuristics */
622 			while (MORE() && !SEETWO('\\', '}'))
623 				NEXT();
624 			(void) REQUIRE(MORE(), REG_EBRACE);
625 			SETERROR(REG_BADBR);
626 		}
627 	} else if (c == '$')	/* $ (but not \$) ends it */
628 		return (1);
629 
630 	return (0);
631 }
632 
633 /*
634  * p_count - parse a repetition count
635  */
636 static int			/* the value */
637 p_count(struct parse *p)
638 {
639 	int count = 0;
640 	int ndigits = 0;
641 
642 	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
643 		count = count*10 + (GETNEXT() - '0');
644 		ndigits++;
645 	}
646 
647 	(void) REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
648 	return (count);
649 }
650 
651 /*
652  * p_bracket - parse a bracketed character list
653  */
654 static void
655 p_bracket(struct parse *p)
656 {
657 	cset *cs;
658 	wint_t ch;
659 
660 	/* Dept of Truly Sickening Special-Case Kludges */
661 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
662 		EMIT(OBOW, 0);
663 		NEXTn(6);
664 		return;
665 	}
666 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
667 		EMIT(OEOW, 0);
668 		NEXTn(6);
669 		return;
670 	}
671 
672 	if ((cs = allocset(p)) == NULL)
673 		return;
674 
675 	if (p->g->cflags&REG_ICASE)
676 		cs->icase = 1;
677 	if (EAT('^'))
678 		cs->invert = 1;
679 	if (EAT(']'))
680 		CHadd(p, cs, ']');
681 	else if (EAT('-'))
682 		CHadd(p, cs, '-');
683 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
684 		p_b_term(p, cs);
685 	if (EAT('-'))
686 		CHadd(p, cs, '-');
687 	(void) MUSTEAT(']', REG_EBRACK);
688 
689 	if (p->error != 0)	/* don't mess things up further */
690 		return;
691 
692 	if (cs->invert && p->g->cflags&REG_NEWLINE)
693 		cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
694 
695 	if ((ch = singleton(cs)) != OUT) {	/* optimize singleton sets */
696 		ordinary(p, ch);
697 		freeset(p, cs);
698 	} else
699 		EMIT(OANYOF, (int)(cs - p->g->sets));
700 }
701 
702 /*
703  * p_b_term - parse one term of a bracketed character list
704  */
705 static void
706 p_b_term(struct parse *p, cset *cs)
707 {
708 	char c;
709 	wint_t start, finish;
710 	wint_t i;
711 
712 	/* classify what we've got */
713 	switch ((MORE()) ? PEEK() : '\0') {
714 	case '[':
715 		c = (MORE2()) ? PEEK2() : '\0';
716 		break;
717 	case '-':
718 		SETERROR(REG_ERANGE);
719 		return;			/* NOTE RETURN */
720 		break;
721 	default:
722 		c = '\0';
723 		break;
724 	}
725 
726 	switch (c) {
727 	case ':':		/* character class */
728 		NEXT2();
729 		(void) REQUIRE(MORE(), REG_EBRACK);
730 		c = PEEK();
731 		(void) REQUIRE(c != '-' && c != ']', REG_ECTYPE);
732 		p_b_cclass(p, cs);
733 		(void) REQUIRE(MORE(), REG_EBRACK);
734 		(void) REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
735 		break;
736 	case '=':		/* equivalence class */
737 		NEXT2();
738 		(void) REQUIRE(MORE(), REG_EBRACK);
739 		c = PEEK();
740 		(void) REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
741 		p_b_eclass(p, cs);
742 		(void) REQUIRE(MORE(), REG_EBRACK);
743 		(void) REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
744 		break;
745 	default:		/* symbol, ordinary character, or range */
746 		start = p_b_symbol(p);
747 		if (SEE('-') && MORE2() && PEEK2() != ']') {
748 			/* range */
749 			NEXT();
750 			if (EAT('-'))
751 				finish = '-';
752 			else
753 				finish = p_b_symbol(p);
754 		} else
755 			finish = start;
756 		if (start == finish)
757 			CHadd(p, cs, start);
758 		else {
759 			if (_collate_load_error) {
760 				(void) REQUIRE((uch)start <= (uch)finish,
761 				    REG_ERANGE);
762 				CHaddrange(p, cs, start, finish);
763 			} else {
764 				(void) REQUIRE(_collate_range_cmp(start,
765 				    finish) <= 0, REG_ERANGE);
766 				for (i = 0; i <= UCHAR_MAX; i++) {
767 					if (_collate_range_cmp(start, i) <= 0 &&
768 					    _collate_range_cmp(i, finish) <= 0)
769 						CHadd(p, cs, i);
770 				}
771 			}
772 		}
773 		break;
774 	}
775 }
776 
777 /*
778  * p_b_cclass - parse a character-class name and deal with it
779  */
780 static void
781 p_b_cclass(struct parse *p, cset *cs)
782 {
783 	char *sp = p->next;
784 	size_t len;
785 	wctype_t wct;
786 	char clname[16];
787 
788 	while (MORE() && isalpha((uch)PEEK()))
789 		NEXT();
790 	len = p->next - sp;
791 	if (len >= sizeof (clname) - 1) {
792 		SETERROR(REG_ECTYPE);
793 		return;
794 	}
795 	(void) memcpy(clname, sp, len);
796 	clname[len] = '\0';
797 	if ((wct = wctype(clname)) == 0) {
798 		SETERROR(REG_ECTYPE);
799 		return;
800 	}
801 	CHaddtype(p, cs, wct);
802 }
803 
804 /*
805  * p_b_eclass - parse an equivalence-class name and deal with it
806  *
807  * This implementation is incomplete. xxx
808  */
809 static void
810 p_b_eclass(struct parse *p, cset *cs)
811 {
812 	wint_t c;
813 
814 	c = p_b_coll_elem(p, '=');
815 	CHadd(p, cs, c);
816 }
817 
818 /*
819  * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
820  */
821 static wint_t			/* value of symbol */
822 p_b_symbol(struct parse *p)
823 {
824 	wint_t value;
825 
826 	(void) REQUIRE(MORE(), REG_EBRACK);
827 	if (!EATTWO('[', '.'))
828 		return (WGETNEXT());
829 
830 	/* collating symbol */
831 	value = p_b_coll_elem(p, '.');
832 	(void) REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
833 	return (value);
834 }
835 
836 /*
837  * p_b_coll_elem - parse a collating-element name and look it up
838  */
839 static wint_t			/* value of collating element */
840 p_b_coll_elem(struct parse *p,
841 	wint_t endc)		/* name ended by endc,']' */
842 {
843 	char *sp = p->next;
844 	struct cname *cp;
845 	int len;
846 	mbstate_t mbs;
847 	wchar_t wc;
848 	size_t clen;
849 
850 	while (MORE() && !SEETWO(endc, ']'))
851 		NEXT();
852 	if (!MORE()) {
853 		SETERROR(REG_EBRACK);
854 		return (0);
855 	}
856 	len = p->next - sp;
857 	for (cp = cnames; cp->name != NULL; cp++)
858 		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
859 			return (cp->code);	/* known name */
860 	(void) memset(&mbs, 0, sizeof (mbs));
861 	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
862 		return (wc);			/* single character */
863 	else if (clen == (size_t)-1 || clen == (size_t)-2)
864 		SETERROR(REG_ECHAR);
865 	else
866 		SETERROR(REG_ECOLLATE);		/* neither */
867 	return (0);
868 }
869 
870 /*
871  * othercase - return the case counterpart of an alphabetic
872  */
873 static wint_t			/* if no counterpart, return ch */
874 othercase(wint_t ch)
875 {
876 	assert(iswalpha(ch));
877 	if (iswupper(ch))
878 		return (towlower(ch));
879 	else if (iswlower(ch))
880 		return (towupper(ch));
881 	else			/* peculiar, but could happen */
882 		return (ch);
883 }
884 
885 /*
886  * bothcases - emit a dualcase version of a two-case character
887  *
888  * Boy, is this implementation ever a kludge...
889  */
890 static void
891 bothcases(struct parse *p, wint_t ch)
892 {
893 	char *oldnext = p->next;
894 	char *oldend = p->end;
895 	char bracket[3 + MB_LEN_MAX];
896 	size_t n;
897 	mbstate_t mbs;
898 
899 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
900 	p->next = bracket;
901 	(void) memset(&mbs, 0, sizeof (mbs));
902 	n = wcrtomb(bracket, ch, &mbs);
903 	assert(n != (size_t)-1);
904 	bracket[n] = ']';
905 	bracket[n + 1] = '\0';
906 	p->end = bracket+n+1;
907 	p_bracket(p);
908 	assert(p->next == p->end);
909 	p->next = oldnext;
910 	p->end = oldend;
911 }
912 
913 /*
914  * ordinary - emit an ordinary character
915  */
916 static void
917 ordinary(struct parse *p, wint_t ch)
918 {
919 	cset *cs;
920 
921 	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
922 		bothcases(p, ch);
923 	else if ((ch & OPDMASK) == ch)
924 		EMIT(OCHAR, ch);
925 	else {
926 		/*
927 		 * Kludge: character is too big to fit into an OCHAR operand.
928 		 * Emit a singleton set.
929 		 */
930 		if ((cs = allocset(p)) == NULL)
931 			return;
932 		CHadd(p, cs, ch);
933 		EMIT(OANYOF, (int)(cs - p->g->sets));
934 	}
935 }
936 
937 /*
938  * nonnewline - emit REG_NEWLINE version of OANY
939  *
940  * Boy, is this implementation ever a kludge...
941  */
942 static void
943 nonnewline(struct parse *p)
944 {
945 	char *oldnext = p->next;
946 	char *oldend = p->end;
947 	char bracket[4];
948 
949 	p->next = bracket;
950 	p->end = bracket+3;
951 	bracket[0] = '^';
952 	bracket[1] = '\n';
953 	bracket[2] = ']';
954 	bracket[3] = '\0';
955 	p_bracket(p);
956 	assert(p->next == bracket+3);
957 	p->next = oldnext;
958 	p->end = oldend;
959 }
960 
961 /*
962  * repeat - generate code for a bounded repetition, recursively if needed
963  */
964 static void
965 repeat(struct parse *p,
966     sopno start,		/* operand from here to end of strip */
967     int from,			/* repeated from this number */
968     int to)			/* to this number of times (maybe INFINITY) */
969 {
970 	sopno finish = HERE();
971 #define	N	2
972 #define	INF	3
973 #define	REP(f, t)	((f)*8 + (t))
974 #define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
975 	sopno copy;
976 
977 	if (p->error != 0)	/* head off possible runaway recursion */
978 		return;
979 
980 	assert(from <= to);
981 
982 	switch (REP(MAP(from), MAP(to))) {
983 	case REP(0, 0):			/* must be user doing this */
984 		DROP(finish-start);	/* drop the operand */
985 		break;
986 	case REP(0, 1):			/* as x{1,1}? */
987 	case REP(0, N):			/* as x{1,n}? */
988 	case REP(0, INF):		/* as x{1,}? */
989 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
990 		INSERT(OCH_, start);		/* offset is wrong... */
991 		repeat(p, start+1, 1, to);
992 		ASTERN(OOR1, start);
993 		AHEAD(start);			/* ... fix it */
994 		EMIT(OOR2, 0);
995 		AHEAD(THERE());
996 		ASTERN(O_CH, THERETHERE());
997 		break;
998 	case REP(1, 1):			/* trivial case */
999 		/* done */
1000 		break;
1001 	case REP(1, N):			/* as x?x{1,n-1} */
1002 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1003 		INSERT(OCH_, start);
1004 		ASTERN(OOR1, start);
1005 		AHEAD(start);
1006 		EMIT(OOR2, 0);			/* offset very wrong... */
1007 		AHEAD(THERE());			/* ...so fix it */
1008 		ASTERN(O_CH, THERETHERE());
1009 		copy = dupl(p, start+1, finish+1);
1010 		assert(copy == finish+4);
1011 		repeat(p, copy, 1, to-1);
1012 		break;
1013 	case REP(1, INF):		/* as x+ */
1014 		INSERT(OPLUS_, start);
1015 		ASTERN(O_PLUS, start);
1016 		break;
1017 	case REP(N, N):			/* as xx{m-1,n-1} */
1018 		copy = dupl(p, start, finish);
1019 		repeat(p, copy, from-1, to-1);
1020 		break;
1021 	case REP(N, INF):		/* as xx{n-1,INF} */
1022 		copy = dupl(p, start, finish);
1023 		repeat(p, copy, from-1, to);
1024 		break;
1025 	default:			/* "can't happen" */
1026 		SETERROR(REG_EFATAL);	/* just in case */
1027 		break;
1028 	}
1029 }
1030 
1031 /*
1032  * wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1033  * character from the parse struct, signals a REG_ILLSEQ error if the
1034  * character can't be converted. Returns the number of bytes consumed.
1035  */
1036 static wint_t
1037 wgetnext(struct parse *p)
1038 {
1039 	mbstate_t mbs;
1040 	wchar_t wc;
1041 	size_t n;
1042 
1043 	(void) memset(&mbs, 0, sizeof (mbs));
1044 	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1045 	if (n == (size_t)-1 || n == (size_t)-2) {
1046 		SETERROR(REG_ECHAR);
1047 		return (0);
1048 	}
1049 	if (n == 0)
1050 		n = 1;
1051 	p->next += n;
1052 	return (wc);
1053 }
1054 
1055 /*
1056  * seterr - set an error condition
1057  */
1058 static int			/* useless but makes type checking happy */
1059 seterr(struct parse *p, int e)
1060 {
1061 	if (p->error == 0)	/* keep earliest error condition */
1062 		p->error = e;
1063 	p->next = nuls;		/* try to bring things to a halt */
1064 	p->end = nuls;
1065 	return (0);		/* make the return value well-defined */
1066 }
1067 
1068 /*
1069  * allocset - allocate a set of characters for []
1070  */
1071 static cset *
1072 allocset(struct parse *p)
1073 {
1074 	cset *cs, *ncs;
1075 
1076 	ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof (*ncs));
1077 	if (ncs == NULL) {
1078 		SETERROR(REG_ESPACE);
1079 		return (NULL);
1080 	}
1081 	p->g->sets = ncs;
1082 	cs = &p->g->sets[p->g->ncsets++];
1083 	(void) memset(cs, 0, sizeof (*cs));
1084 
1085 	return (cs);
1086 }
1087 
1088 /*
1089  * freeset - free a now-unused set
1090  */
1091 static void
1092 freeset(struct parse *p, cset *cs)
1093 {
1094 	cset *top = &p->g->sets[p->g->ncsets];
1095 
1096 	free(cs->wides);
1097 	free(cs->ranges);
1098 	free(cs->types);
1099 	(void) memset(cs, 0, sizeof (*cs));
1100 	if (cs == top-1)	/* recover only the easy case */
1101 		p->g->ncsets--;
1102 }
1103 
1104 /*
1105  * singleton - Determine whether a set contains only one character,
1106  * returning it if so, otherwise returning OUT.
1107  */
1108 static wint_t
1109 singleton(cset *cs)
1110 {
1111 	wint_t i, s, n;
1112 
1113 	for (i = n = 0; i < NC; i++)
1114 		if (CHIN(cs, i)) {
1115 			n++;
1116 			s = i;
1117 		}
1118 	if (n == 1)
1119 		return (s);
1120 	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1121 	    cs->icase == 0)
1122 		return (cs->wides[0]);
1123 	/* Don't bother handling the other cases. */
1124 	return (OUT);
1125 }
1126 
1127 /*
1128  * CHadd - add character to character set.
1129  */
1130 static void
1131 CHadd(struct parse *p, cset *cs, wint_t ch)
1132 {
1133 	wint_t nch, *newwides;
1134 	assert(ch >= 0);
1135 	if (ch < NC)
1136 		cs->bmp[ch >> 3] |= 1 << (ch & 7);
1137 	else {
1138 		newwides = realloc(cs->wides, (cs->nwides + 1) *
1139 		    sizeof (*cs->wides));
1140 		if (newwides == NULL) {
1141 			SETERROR(REG_ESPACE);
1142 			return;
1143 		}
1144 		cs->wides = newwides;
1145 		cs->wides[cs->nwides++] = ch;
1146 	}
1147 	if (cs->icase) {
1148 		if ((nch = towlower(ch)) < NC)
1149 			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1150 		if ((nch = towupper(ch)) < NC)
1151 			cs->bmp[nch >> 3] |= 1 << (nch & 7);
1152 	}
1153 }
1154 
1155 /*
1156  * CHaddrange - add all characters in the range [min,max] to a character set.
1157  */
1158 static void
1159 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1160 {
1161 	crange *newranges;
1162 
1163 	for (; min < NC && min <= max; min++)
1164 		CHadd(p, cs, min);
1165 	if (min >= max)
1166 		return;
1167 	newranges = realloc(cs->ranges, (cs->nranges + 1) *
1168 	    sizeof (*cs->ranges));
1169 	if (newranges == NULL) {
1170 		SETERROR(REG_ESPACE);
1171 		return;
1172 	}
1173 	cs->ranges = newranges;
1174 	cs->ranges[cs->nranges].min = min;
1175 	cs->ranges[cs->nranges].min = max;
1176 	cs->nranges++;
1177 }
1178 
1179 /*
1180  * CHaddtype - add all characters of a certain type to a character set.
1181  */
1182 static void
1183 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1184 {
1185 	wint_t i;
1186 	wctype_t *newtypes;
1187 
1188 	for (i = 0; i < NC; i++)
1189 		if (iswctype(i, wct))
1190 			CHadd(p, cs, i);
1191 	newtypes = realloc(cs->types, (cs->ntypes + 1) *
1192 	    sizeof (*cs->types));
1193 	if (newtypes == NULL) {
1194 		SETERROR(REG_ESPACE);
1195 		return;
1196 	}
1197 	cs->types = newtypes;
1198 	cs->types[cs->ntypes++] = wct;
1199 }
1200 
1201 /*
1202  * dupl - emit a duplicate of a bunch of sops
1203  */
1204 static sopno			/* start of duplicate */
1205 dupl(struct parse *p,
1206 	sopno start,		/* from here */
1207 	sopno finish)		/* to this less one */
1208 {
1209 	sopno ret = HERE();
1210 	sopno len = finish - start;
1211 
1212 	assert(finish >= start);
1213 	if (len == 0)
1214 		return (ret);
1215 	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1216 	assert(p->ssize >= p->slen + len);
1217 	(void) memcpy((char *)(p->strip + p->slen),
1218 	    (char *)(p->strip + start), (size_t)len*sizeof (sop));
1219 	p->slen += len;
1220 	return (ret);
1221 }
1222 
1223 /*
1224  * doemit - emit a strip operator
1225  *
1226  * It might seem better to implement this as a macro with a function as
1227  * hard-case backup, but it's just too big and messy unless there are
1228  * some changes to the data structures.  Maybe later.
1229  */
1230 static void
1231 doemit(struct parse *p, sop op, size_t opnd)
1232 {
1233 	/* avoid making error situations worse */
1234 	if (p->error != 0)
1235 		return;
1236 
1237 	/* deal with oversize operands ("can't happen", more or less) */
1238 	assert(opnd < 1<<OPSHIFT);
1239 
1240 	/* deal with undersized strip */
1241 	if (p->slen >= p->ssize)
1242 		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1243 	assert(p->slen < p->ssize);
1244 
1245 	/* finally, it's all reduced to the easy case */
1246 	p->strip[p->slen++] = SOP(op, opnd);
1247 }
1248 
1249 /*
1250  * doinsert - insert a sop into the strip
1251  */
1252 static void
1253 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1254 {
1255 	sopno sn;
1256 	sop s;
1257 	int i;
1258 
1259 	/* avoid making error situations worse */
1260 	if (p->error != 0)
1261 		return;
1262 
1263 	sn = HERE();
1264 	EMIT(op, opnd);		/* do checks, ensure space */
1265 	assert(HERE() == sn+1);
1266 	s = p->strip[sn];
1267 
1268 	/* adjust paren pointers */
1269 	assert(pos > 0);
1270 	for (i = 1; i < NPAREN; i++) {
1271 		if (p->pbegin[i] >= pos) {
1272 			p->pbegin[i]++;
1273 		}
1274 		if (p->pend[i] >= pos) {
1275 			p->pend[i]++;
1276 		}
1277 	}
1278 
1279 	(void) memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1280 	    (HERE()-pos-1)*sizeof (sop));
1281 	p->strip[pos] = s;
1282 }
1283 
1284 /*
1285  * dofwd - complete a forward reference
1286  */
1287 static void
1288 dofwd(struct parse *p, sopno pos, sop value)
1289 {
1290 	/* avoid making error situations worse */
1291 	if (p->error != 0)
1292 		return;
1293 
1294 	assert(value < 1<<OPSHIFT);
1295 	p->strip[pos] = OP(p->strip[pos]) | value;
1296 }
1297 
1298 /*
1299  * enlarge - enlarge the strip
1300  */
1301 static void
1302 enlarge(struct parse *p, sopno size)
1303 {
1304 	sop *sp;
1305 
1306 	if (p->ssize >= size)
1307 		return;
1308 
1309 	sp = (sop *)realloc(p->strip, size*sizeof (sop));
1310 	if (sp == NULL) {
1311 		SETERROR(REG_ESPACE);
1312 		return;
1313 	}
1314 	p->strip = sp;
1315 	p->ssize = size;
1316 }
1317 
1318 /*
1319  * stripsnug - compact the strip
1320  */
1321 static void
1322 stripsnug(struct parse *p, struct re_guts *g)
1323 {
1324 	g->nstates = p->slen;
1325 	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof (sop));
1326 	if (g->strip == NULL) {
1327 		SETERROR(REG_ESPACE);
1328 		g->strip = p->strip;
1329 	}
1330 }
1331 
1332 /*
1333  * findmust - fill in must and mlen with longest mandatory literal string
1334  *
1335  * This algorithm could do fancy things like analyzing the operands of |
1336  * for common subsequences.  Someday.  This code is simple and finds most
1337  * of the interesting cases.
1338  *
1339  * Note that must and mlen got initialized during setup.
1340  */
1341 static void
1342 findmust(struct parse *p, struct re_guts *g)
1343 {
1344 	sop *scan;
1345 	sop *start;
1346 	sop *newstart;
1347 	sopno newlen;
1348 	sop s;
1349 	char *cp;
1350 	int offset;
1351 	char buf[MB_LEN_MAX];
1352 	size_t clen;
1353 	mbstate_t mbs;
1354 
1355 	/* avoid making error situations worse */
1356 	if (p->error != 0)
1357 		return;
1358 
1359 	/*
1360 	 * It's not generally safe to do a ``char'' substring search on
1361 	 * multibyte character strings, but it's safe for at least
1362 	 * UTF-8 (see RFC 3629).
1363 	 */
1364 	if (MB_CUR_MAX > 1 &&
1365 	    strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1366 		return;
1367 
1368 	/* find the longest OCHAR sequence in strip */
1369 	newlen = 0;
1370 	offset = 0;
1371 	g->moffset = 0;
1372 	scan = g->strip + 1;
1373 	do {
1374 		s = *scan++;
1375 		switch (OP(s)) {
1376 		case OCHAR:		/* sequence member */
1377 			if (newlen == 0) {		/* new sequence */
1378 				(void) memset(&mbs, 0, sizeof (mbs));
1379 				newstart = scan - 1;
1380 			}
1381 			clen = wcrtomb(buf, OPND(s), &mbs);
1382 			if (clen == (size_t)-1)
1383 				goto toohard;
1384 			newlen += clen;
1385 			break;
1386 		case OPLUS_:		/* things that don't break one */
1387 		case OLPAREN:
1388 		case ORPAREN:
1389 			break;
1390 		case OQUEST_:		/* things that must be skipped */
1391 		case OCH_:
1392 			offset = altoffset(scan, offset);
1393 			scan--;
1394 			do {
1395 				scan += OPND(s);
1396 				s = *scan;
1397 				/* assert() interferes w debug printouts */
1398 				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1399 				    OP(s) != OOR2) {
1400 					g->iflags |= BAD;
1401 					return;
1402 				}
1403 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1404 			/* FALLTHROUGH */
1405 		case OBOW:		/* things that break a sequence */
1406 		case OEOW:
1407 		case OBOL:
1408 		case OEOL:
1409 		case O_QUEST:
1410 		case O_CH:
1411 		case OEND:
1412 			if (newlen > g->mlen) {		/* ends one */
1413 				start = newstart;
1414 				g->mlen = newlen;
1415 				if (offset > -1) {
1416 					g->moffset += offset;
1417 					offset = newlen;
1418 				} else
1419 					g->moffset = offset;
1420 			} else {
1421 				if (offset > -1)
1422 					offset += newlen;
1423 			}
1424 			newlen = 0;
1425 			break;
1426 		case OANY:
1427 			if (newlen > g->mlen) {		/* ends one */
1428 				start = newstart;
1429 				g->mlen = newlen;
1430 				if (offset > -1) {
1431 					g->moffset += offset;
1432 					offset = newlen;
1433 				} else
1434 					g->moffset = offset;
1435 			} else {
1436 				if (offset > -1)
1437 					offset += newlen;
1438 			}
1439 			if (offset > -1)
1440 				offset++;
1441 			newlen = 0;
1442 			break;
1443 		case OANYOF:		/* may or may not invalidate offset */
1444 			/* First, everything as OANY */
1445 			if (newlen > g->mlen) {		/* ends one */
1446 				start = newstart;
1447 				g->mlen = newlen;
1448 				if (offset > -1) {
1449 					g->moffset += offset;
1450 					offset = newlen;
1451 				} else
1452 					g->moffset = offset;
1453 			} else {
1454 				if (offset > -1)
1455 					offset += newlen;
1456 			}
1457 			if (offset > -1)
1458 				offset++;
1459 			newlen = 0;
1460 			break;
1461 		toohard:
1462 		default:
1463 			/*
1464 			 * Anything here makes it impossible or too hard
1465 			 * to calculate the offset -- so we give up;
1466 			 * save the last known good offset, in case the
1467 			 * must sequence doesn't occur later.
1468 			 */
1469 			if (newlen > g->mlen) {		/* ends one */
1470 				start = newstart;
1471 				g->mlen = newlen;
1472 				if (offset > -1)
1473 					g->moffset += offset;
1474 				else
1475 					g->moffset = offset;
1476 			}
1477 			offset = -1;
1478 			newlen = 0;
1479 			break;
1480 		}
1481 	} while (OP(s) != OEND);
1482 
1483 	if (g->mlen == 0) {		/* there isn't one */
1484 		g->moffset = -1;
1485 		return;
1486 	}
1487 
1488 	/* turn it into a character string */
1489 	g->must = malloc((size_t)g->mlen + 1);
1490 	if (g->must == NULL) {		/* argh; just forget it */
1491 		g->mlen = 0;
1492 		g->moffset = -1;
1493 		return;
1494 	}
1495 	cp = g->must;
1496 	scan = start;
1497 	(void) memset(&mbs, 0, sizeof (mbs));
1498 	while (cp < g->must + g->mlen) {
1499 		while (OP(s = *scan++) != OCHAR)
1500 			continue;
1501 		clen = wcrtomb(cp, OPND(s), &mbs);
1502 		assert(clen != (size_t)-1);
1503 		cp += clen;
1504 	}
1505 	assert(cp == g->must + g->mlen);
1506 	*cp++ = '\0';		/* just on general principles */
1507 }
1508 
1509 /*
1510  * altoffset - choose biggest offset among multiple choices
1511  *
1512  * Compute, recursively if necessary, the largest offset among multiple
1513  * re paths.
1514  */
1515 static int
1516 altoffset(sop *scan, int offset)
1517 {
1518 	int largest;
1519 	int try;
1520 	sop s;
1521 
1522 	/* If we gave up already on offsets, return */
1523 	if (offset == -1)
1524 		return (-1);
1525 
1526 	largest = 0;
1527 	try = 0;
1528 	s = *scan++;
1529 	while (OP(s) != O_QUEST && OP(s) != O_CH) {
1530 		switch (OP(s)) {
1531 		case OOR1:
1532 			if (try > largest)
1533 				largest = try;
1534 			try = 0;
1535 			break;
1536 		case OQUEST_:
1537 		case OCH_:
1538 			try = altoffset(scan, try);
1539 			if (try == -1)
1540 				return (-1);
1541 			scan--;
1542 			do {
1543 				scan += OPND(s);
1544 				s = *scan;
1545 				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1546 				    OP(s) != OOR2)
1547 					return (-1);
1548 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1549 			/*
1550 			 * We must skip to the next position, or we'll
1551 			 * leave altoffset() too early.
1552 			 */
1553 			scan++;
1554 			break;
1555 		case OANYOF:
1556 		case OCHAR:
1557 		case OANY:
1558 			try++;
1559 			/*FALLTHRU*/
1560 		case OBOW:
1561 		case OEOW:
1562 		case OLPAREN:
1563 		case ORPAREN:
1564 		case OOR2:
1565 			break;
1566 		default:
1567 			try = -1;
1568 			break;
1569 		}
1570 		if (try == -1)
1571 			return (-1);
1572 		s = *scan++;
1573 	}
1574 
1575 	if (try > largest)
1576 		largest = try;
1577 
1578 	return (largest+offset);
1579 }
1580 
1581 /*
1582  * computejumps - compute char jumps for BM scan
1583  *
1584  * This algorithm assumes g->must exists and is has size greater than
1585  * zero. It's based on the algorithm found on Computer Algorithms by
1586  * Sara Baase.
1587  *
1588  * A char jump is the number of characters one needs to jump based on
1589  * the value of the character from the text that was mismatched.
1590  */
1591 static void
1592 computejumps(struct parse *p, struct re_guts *g)
1593 {
1594 	int ch;
1595 	int mindex;
1596 
1597 	/* Avoid making errors worse */
1598 	if (p->error != 0)
1599 		return;
1600 
1601 	g->charjump = (int *)malloc((NC + 1) * sizeof (int));
1602 	if (g->charjump == NULL)	/* Not a fatal error */
1603 		return;
1604 	/* Adjust for signed chars, if necessary */
1605 	g->charjump = &g->charjump[-(CHAR_MIN)];
1606 
1607 	/*
1608 	 * If the character does not exist in the pattern, the jump
1609 	 * is equal to the number of characters in the pattern.
1610 	 */
1611 	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1612 		g->charjump[ch] = g->mlen;
1613 
1614 	/*
1615 	 * If the character does exist, compute the jump that would
1616 	 * take us to the last character in the pattern equal to it
1617 	 * (notice that we match right to left, so that last character
1618 	 * is the first one that would be matched).
1619 	 */
1620 	for (mindex = 0; mindex < g->mlen; mindex++)
1621 		g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1622 }
1623 
1624 /*
1625  * computematchjumps - compute match jumps for BM scan
1626  *
1627  * This algorithm assumes g->must exists and is has size greater than
1628  * zero. It's based on the algorithm found on Computer Algorithms by
1629  * Sara Baase.
1630  *
1631  * A match jump is the number of characters one needs to advance based
1632  * on the already-matched suffix.
1633  * Notice that all values here are minus (g->mlen-1), because of the way
1634  * the search algorithm works.
1635  */
1636 static void
1637 computematchjumps(struct parse *p, struct re_guts *g)
1638 {
1639 	int mindex;		/* General "must" iterator */
1640 	int suffix;		/* Keeps track of matching suffix */
1641 	int ssuffix;		/* Keeps track of suffixes' suffix */
1642 	int *pmatches;
1643 				/*
1644 				 * pmatches[k] points to the next i
1645 				 * such that i+1...mlen is a substring
1646 				 * of k+1...k+mlen-i-1
1647 				 */
1648 
1649 	/* Avoid making errors worse */
1650 	if (p->error != 0)
1651 		return;
1652 
1653 	pmatches = (int *)malloc(g->mlen * sizeof (unsigned int));
1654 	if (pmatches == NULL) {
1655 		g->matchjump = NULL;
1656 		return;
1657 	}
1658 
1659 	g->matchjump = (int *)malloc(g->mlen * sizeof (unsigned int));
1660 	if (g->matchjump == NULL)	/* Not a fatal error */
1661 		return;
1662 
1663 	/* Set maximum possible jump for each character in the pattern */
1664 	for (mindex = 0; mindex < g->mlen; mindex++)
1665 		g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1666 
1667 	/* Compute pmatches[] */
1668 	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1669 	    mindex--, suffix--) {
1670 		pmatches[mindex] = suffix;
1671 
1672 		/*
1673 		 * If a mismatch is found, interrupting the substring,
1674 		 * compute the matchjump for that position. If no
1675 		 * mismatch is found, then a text substring mismatched
1676 		 * against the suffix will also mismatch against the
1677 		 * substring.
1678 		 */
1679 		while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) {
1680 			g->matchjump[suffix] = MIN(g->matchjump[suffix],
1681 			    g->mlen - mindex - 1);
1682 			suffix = pmatches[suffix];
1683 		}
1684 	}
1685 
1686 	/*
1687 	 * Compute the matchjump up to the last substring found to jump
1688 	 * to the beginning of the largest must pattern prefix matching
1689 	 * it's own suffix.
1690 	 */
1691 	for (mindex = 0; mindex <= suffix; mindex++)
1692 		g->matchjump[mindex] = MIN(g->matchjump[mindex],
1693 		    g->mlen + suffix - mindex);
1694 
1695 	ssuffix = pmatches[suffix];
1696 	while (suffix < g->mlen) {
1697 		while (suffix <= ssuffix && suffix < g->mlen) {
1698 			g->matchjump[suffix] = MIN(g->matchjump[suffix],
1699 			    g->mlen + ssuffix - suffix);
1700 			suffix++;
1701 		}
1702 		if (suffix < g->mlen)
1703 			ssuffix = pmatches[ssuffix];
1704 	}
1705 
1706 	free(pmatches);
1707 }
1708 
1709 /*
1710  * pluscount - count + nesting
1711  */
1712 static sopno			/* nesting depth */
1713 pluscount(struct parse *p, struct re_guts *g)
1714 {
1715 	sop *scan;
1716 	sop s;
1717 	sopno plusnest = 0;
1718 	sopno maxnest = 0;
1719 
1720 	if (p->error != 0)
1721 		return (0);	/* there may not be an OEND */
1722 
1723 	scan = g->strip + 1;
1724 	do {
1725 		s = *scan++;
1726 		switch (OP(s)) {
1727 		case OPLUS_:
1728 			plusnest++;
1729 			break;
1730 		case O_PLUS:
1731 			if (plusnest > maxnest)
1732 				maxnest = plusnest;
1733 			plusnest--;
1734 			break;
1735 		}
1736 	} while (OP(s) != OEND);
1737 	if (plusnest != 0)
1738 		g->iflags |= BAD;
1739 	return (maxnest);
1740 }
1741