xref: /freebsd/lib/libc/regex/regcomp.c (revision e627b39baccd1ec9129690167cf5e6d860509655)
1 /*-
2  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3  * Copyright (c) 1992, 1993, 1994
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * Henry Spencer.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed by the University of
20  *	California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
38  */
39 
40 #if defined(LIBC_SCCS) && !defined(lint)
41 static char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
42 #endif /* LIBC_SCCS and not lint */
43 
44 #include <sys/types.h>
45 #include <stdio.h>
46 #include <string.h>
47 #include <ctype.h>
48 #include <limits.h>
49 #include <locale.h>
50 #include <stdlib.h>
51 #include <regex.h>
52 
53 #include "utils.h"
54 #include "regex2.h"
55 
56 #include "cclass.h"
57 #include "cname.h"
58 
59 /*
60  * parse structure, passed up and down to avoid global variables and
61  * other clumsinesses
62  */
63 struct parse {
64 	char *next;		/* next character in RE */
65 	char *end;		/* end of string (-> NUL normally) */
66 	int error;		/* has an error been seen? */
67 	sop *strip;		/* malloced strip */
68 	sopno ssize;		/* malloced strip size (allocated) */
69 	sopno slen;		/* malloced strip length (used) */
70 	int ncsalloc;		/* number of csets allocated */
71 	struct re_guts *g;
72 #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
73 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
74 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
75 };
76 
77 /* ========= begin header generated by ./mkh ========= */
78 #ifdef __cplusplus
79 extern "C" {
80 #endif
81 
82 /* === regcomp.c === */
83 static void p_ere __P((struct parse *p, int stop));
84 static void p_ere_exp __P((struct parse *p));
85 static void p_str __P((struct parse *p));
86 static void p_bre __P((struct parse *p, int end1, int end2));
87 static int p_simp_re __P((struct parse *p, int starordinary));
88 static int p_count __P((struct parse *p));
89 static void p_bracket __P((struct parse *p));
90 static void p_b_term __P((struct parse *p, cset *cs));
91 static void p_b_cclass __P((struct parse *p, cset *cs));
92 static void p_b_eclass __P((struct parse *p, cset *cs));
93 static char p_b_symbol __P((struct parse *p));
94 static char p_b_coll_elem __P((struct parse *p, int endc));
95 static char othercase __P((int ch));
96 static void bothcases __P((struct parse *p, int ch));
97 static void ordinary __P((struct parse *p, int ch));
98 static void nonnewline __P((struct parse *p));
99 static void repeat __P((struct parse *p, sopno start, int from, int to));
100 static int seterr __P((struct parse *p, int e));
101 static cset *allocset __P((struct parse *p));
102 static void freeset __P((struct parse *p, cset *cs));
103 static int freezeset __P((struct parse *p, cset *cs));
104 static int firstch __P((struct parse *p, cset *cs));
105 static int nch __P((struct parse *p, cset *cs));
106 static void mcadd __P((struct parse *p, cset *cs, char *cp));
107 #if used
108 static void mcsub __P((cset *cs, char *cp));
109 static int mcin __P((cset *cs, char *cp));
110 static char *mcfind __P((cset *cs, char *cp));
111 #endif
112 static void mcinvert __P((struct parse *p, cset *cs));
113 static void mccase __P((struct parse *p, cset *cs));
114 static int isinsets __P((struct re_guts *g, int c));
115 static int samesets __P((struct re_guts *g, int c1, int c2));
116 static void categorize __P((struct parse *p, struct re_guts *g));
117 static sopno dupl __P((struct parse *p, sopno start, sopno finish));
118 static void doemit __P((struct parse *p, sop op, size_t opnd));
119 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
120 static void dofwd __P((struct parse *p, sopno pos, sop value));
121 static void enlarge __P((struct parse *p, sopno size));
122 static void stripsnug __P((struct parse *p, struct re_guts *g));
123 static void findmust __P((struct parse *p, struct re_guts *g));
124 static sopno pluscount __P((struct parse *p, struct re_guts *g));
125 
126 #ifdef __cplusplus
127 }
128 #endif
129 /* ========= end header generated by ./mkh ========= */
130 
131 static char nuls[10];		/* place to point scanner in event of error */
132 
133 /*
134  * macros for use with parse structure
135  * BEWARE:  these know that the parse structure is named `p' !!!
136  */
137 #define	PEEK()	(*p->next)
138 #define	PEEK2()	(*(p->next+1))
139 #define	MORE()	(p->next < p->end)
140 #define	MORE2()	(p->next+1 < p->end)
141 #define	SEE(c)	(MORE() && PEEK() == (c))
142 #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
143 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
144 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
145 #define	NEXT()	(p->next++)
146 #define	NEXT2()	(p->next += 2)
147 #define	NEXTn(n)	(p->next += (n))
148 #define	GETNEXT()	(*p->next++)
149 #define	SETERROR(e)	seterr(p, (e))
150 #define	REQUIRE(co, e)	((co) || SETERROR(e))
151 #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
152 #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
153 #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
154 #define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
155 #define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
156 #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
157 #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
158 #define	HERE()		(p->slen)
159 #define	THERE()		(p->slen - 1)
160 #define	THERETHERE()	(p->slen - 2)
161 #define	DROP(n)	(p->slen -= (n))
162 
163 #ifndef NDEBUG
164 static int never = 0;		/* for use in asserts; shuts lint up */
165 #else
166 #define	never	0		/* some <assert.h>s have bugs too */
167 #endif
168 
169 /*
170  - regcomp - interface for parser and compilation
171  = extern int regcomp(regex_t *, const char *, int);
172  = #define	REG_BASIC	0000
173  = #define	REG_EXTENDED	0001
174  = #define	REG_ICASE	0002
175  = #define	REG_NOSUB	0004
176  = #define	REG_NEWLINE	0010
177  = #define	REG_NOSPEC	0020
178  = #define	REG_PEND	0040
179  = #define	REG_DUMP	0200
180  */
181 int				/* 0 success, otherwise REG_something */
182 regcomp(preg, pattern, cflags)
183 regex_t *preg;
184 const char *pattern;
185 int cflags;
186 {
187 	struct parse pa;
188 	register struct re_guts *g;
189 	register struct parse *p = &pa;
190 	register int i;
191 	register size_t len;
192 #ifdef REDEBUG
193 #	define	GOODFLAGS(f)	(f)
194 #else
195 #	define	GOODFLAGS(f)	((f)&~REG_DUMP)
196 #endif
197 
198 	cflags = GOODFLAGS(cflags);
199 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
200 		return(REG_INVARG);
201 
202 	if (cflags&REG_PEND) {
203 		if (preg->re_endp < pattern)
204 			return(REG_INVARG);
205 		len = preg->re_endp - pattern;
206 	} else
207 		len = strlen((char *)pattern);
208 
209 	/* do the mallocs early so failure handling is easy */
210 	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
211 							(NC-1)*sizeof(cat_t));
212 	if (g == NULL)
213 		return(REG_ESPACE);
214 	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
215 	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
216 	p->slen = 0;
217 	if (p->strip == NULL) {
218 		free((char *)g);
219 		return(REG_ESPACE);
220 	}
221 
222 	/* set things up */
223 	p->g = g;
224 	p->next = (char *)pattern;	/* convenience; we do not modify it */
225 	p->end = p->next + len;
226 	p->error = 0;
227 	p->ncsalloc = 0;
228 	for (i = 0; i < NPAREN; i++) {
229 		p->pbegin[i] = 0;
230 		p->pend[i] = 0;
231 	}
232 	g->csetsize = NC;
233 	g->sets = NULL;
234 	g->setbits = NULL;
235 	g->ncsets = 0;
236 	g->cflags = cflags;
237 	g->iflags = 0;
238 	g->nbol = 0;
239 	g->neol = 0;
240 	g->must = NULL;
241 	g->mlen = 0;
242 	g->nsub = 0;
243 	g->ncategories = 1;	/* category 0 is "everything else" */
244 	g->categories = &g->catspace[-(CHAR_MIN)];
245 	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
246 	g->backrefs = 0;
247 
248 	/* do it */
249 	EMIT(OEND, 0);
250 	g->firststate = THERE();
251 	if (cflags&REG_EXTENDED)
252 		p_ere(p, OUT);
253 	else if (cflags&REG_NOSPEC)
254 		p_str(p);
255 	else
256 		p_bre(p, OUT, OUT);
257 	EMIT(OEND, 0);
258 	g->laststate = THERE();
259 
260 	/* tidy up loose ends and fill things in */
261 	categorize(p, g);
262 	stripsnug(p, g);
263 	findmust(p, g);
264 	g->nplus = pluscount(p, g);
265 	g->magic = MAGIC2;
266 	preg->re_nsub = g->nsub;
267 	preg->re_g = g;
268 	preg->re_magic = MAGIC1;
269 #ifndef REDEBUG
270 	/* not debugging, so can't rely on the assert() in regexec() */
271 	if (g->iflags&BAD)
272 		SETERROR(REG_ASSERT);
273 #endif
274 
275 	/* win or lose, we're done */
276 	if (p->error != 0)	/* lose */
277 		regfree(preg);
278 	return(p->error);
279 }
280 
281 /*
282  - p_ere - ERE parser top level, concatenation and alternation
283  == static void p_ere(register struct parse *p, int stop);
284  */
285 static void
286 p_ere(p, stop)
287 register struct parse *p;
288 int stop;			/* character this ERE should end at */
289 {
290 	register char c;
291 	register sopno prevback;
292 	register sopno prevfwd;
293 	register sopno conc;
294 	register int first = 1;		/* is this the first alternative? */
295 
296 	for (;;) {
297 		/* do a bunch of concatenated expressions */
298 		conc = HERE();
299 		while (MORE() && (c = PEEK()) != '|' && c != stop)
300 			p_ere_exp(p);
301 		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
302 
303 		if (!EAT('|'))
304 			break;		/* NOTE BREAK OUT */
305 
306 		if (first) {
307 			INSERT(OCH_, conc);	/* offset is wrong */
308 			prevfwd = conc;
309 			prevback = conc;
310 			first = 0;
311 		}
312 		ASTERN(OOR1, prevback);
313 		prevback = THERE();
314 		AHEAD(prevfwd);			/* fix previous offset */
315 		prevfwd = HERE();
316 		EMIT(OOR2, 0);			/* offset is very wrong */
317 	}
318 
319 	if (!first) {		/* tail-end fixups */
320 		AHEAD(prevfwd);
321 		ASTERN(O_CH, prevback);
322 	}
323 
324 	assert(!MORE() || SEE(stop));
325 }
326 
327 /*
328  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
329  == static void p_ere_exp(register struct parse *p);
330  */
331 static void
332 p_ere_exp(p)
333 register struct parse *p;
334 {
335 	register char c;
336 	register sopno pos;
337 	register int count;
338 	register int count2;
339 	register sopno subno;
340 	int wascaret = 0;
341 
342 	assert(MORE());		/* caller should have ensured this */
343 	c = GETNEXT();
344 
345 	pos = HERE();
346 	switch (c) {
347 	case '(':
348 		(void)REQUIRE(MORE(), REG_EPAREN);
349 		p->g->nsub++;
350 		subno = p->g->nsub;
351 		if (subno < NPAREN)
352 			p->pbegin[subno] = HERE();
353 		EMIT(OLPAREN, subno);
354 		if (!SEE(')'))
355 			p_ere(p, ')');
356 		if (subno < NPAREN) {
357 			p->pend[subno] = HERE();
358 			assert(p->pend[subno] != 0);
359 		}
360 		EMIT(ORPAREN, subno);
361 		(void)MUSTEAT(')', REG_EPAREN);
362 		break;
363 #ifndef POSIX_MISTAKE
364 	case ')':		/* happens only if no current unmatched ( */
365 		/*
366 		 * You may ask, why the ifndef?  Because I didn't notice
367 		 * this until slightly too late for 1003.2, and none of the
368 		 * other 1003.2 regular-expression reviewers noticed it at
369 		 * all.  So an unmatched ) is legal POSIX, at least until
370 		 * we can get it fixed.
371 		 */
372 		SETERROR(REG_EPAREN);
373 		break;
374 #endif
375 	case '^':
376 		EMIT(OBOL, 0);
377 		p->g->iflags |= USEBOL;
378 		p->g->nbol++;
379 		wascaret = 1;
380 		break;
381 	case '$':
382 		EMIT(OEOL, 0);
383 		p->g->iflags |= USEEOL;
384 		p->g->neol++;
385 		break;
386 	case '|':
387 		SETERROR(REG_EMPTY);
388 		break;
389 	case '*':
390 	case '+':
391 	case '?':
392 		SETERROR(REG_BADRPT);
393 		break;
394 	case '.':
395 		if (p->g->cflags&REG_NEWLINE)
396 			nonnewline(p);
397 		else
398 			EMIT(OANY, 0);
399 		break;
400 	case '[':
401 		p_bracket(p);
402 		break;
403 	case '\\':
404 		(void)REQUIRE(MORE(), REG_EESCAPE);
405 		c = GETNEXT();
406 		ordinary(p, c);
407 		break;
408 	case '{':		/* okay as ordinary except if digit follows */
409 		(void)REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
410 		/* FALLTHROUGH */
411 	default:
412 		ordinary(p, c);
413 		break;
414 	}
415 
416 	if (!MORE())
417 		return;
418 	c = PEEK();
419 	/* we call { a repetition if followed by a digit */
420 	if (!( c == '*' || c == '+' || c == '?' ||
421 				(c == '{' && MORE2() && isdigit(PEEK2())) ))
422 		return;		/* no repetition, we're done */
423 	NEXT();
424 
425 	(void)REQUIRE(!wascaret, REG_BADRPT);
426 	switch (c) {
427 	case '*':	/* implemented as +? */
428 		/* this case does not require the (y|) trick, noKLUDGE */
429 		INSERT(OPLUS_, pos);
430 		ASTERN(O_PLUS, pos);
431 		INSERT(OQUEST_, pos);
432 		ASTERN(O_QUEST, pos);
433 		break;
434 	case '+':
435 		INSERT(OPLUS_, pos);
436 		ASTERN(O_PLUS, pos);
437 		break;
438 	case '?':
439 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
440 		INSERT(OCH_, pos);		/* offset slightly wrong */
441 		ASTERN(OOR1, pos);		/* this one's right */
442 		AHEAD(pos);			/* fix the OCH_ */
443 		EMIT(OOR2, 0);			/* offset very wrong... */
444 		AHEAD(THERE());			/* ...so fix it */
445 		ASTERN(O_CH, THERETHERE());
446 		break;
447 	case '{':
448 		count = p_count(p);
449 		if (EAT(',')) {
450 			if (isdigit(PEEK())) {
451 				count2 = p_count(p);
452 				(void)REQUIRE(count <= count2, REG_BADBR);
453 			} else		/* single number with comma */
454 				count2 = INFINITY;
455 		} else		/* just a single number */
456 			count2 = count;
457 		repeat(p, pos, count, count2);
458 		if (!EAT('}')) {	/* error heuristics */
459 			while (MORE() && PEEK() != '}')
460 				NEXT();
461 			(void)REQUIRE(MORE(), REG_EBRACE);
462 			SETERROR(REG_BADBR);
463 		}
464 		break;
465 	}
466 
467 	if (!MORE())
468 		return;
469 	c = PEEK();
470 	if (!( c == '*' || c == '+' || c == '?' ||
471 				(c == '{' && MORE2() && isdigit(PEEK2())) ) )
472 		return;
473 	SETERROR(REG_BADRPT);
474 }
475 
476 /*
477  - p_str - string (no metacharacters) "parser"
478  == static void p_str(register struct parse *p);
479  */
480 static void
481 p_str(p)
482 register struct parse *p;
483 {
484 	(void)REQUIRE(MORE(), REG_EMPTY);
485 	while (MORE())
486 		ordinary(p, GETNEXT());
487 }
488 
489 /*
490  - p_bre - BRE parser top level, anchoring and concatenation
491  == static void p_bre(register struct parse *p, register int end1, \
492  ==	register int end2);
493  * Giving end1 as OUT essentially eliminates the end1/end2 check.
494  *
495  * This implementation is a bit of a kludge, in that a trailing $ is first
496  * taken as an ordinary character and then revised to be an anchor.  The
497  * only undesirable side effect is that '$' gets included as a character
498  * category in such cases.  This is fairly harmless; not worth fixing.
499  * The amount of lookahead needed to avoid this kludge is excessive.
500  */
501 static void
502 p_bre(p, end1, end2)
503 register struct parse *p;
504 register int end1;		/* first terminating character */
505 register int end2;		/* second terminating character */
506 {
507 	register sopno start = HERE();
508 	register int first = 1;			/* first subexpression? */
509 	register int wasdollar = 0;
510 
511 	if (EAT('^')) {
512 		EMIT(OBOL, 0);
513 		p->g->iflags |= USEBOL;
514 		p->g->nbol++;
515 	}
516 	while (MORE() && !SEETWO(end1, end2)) {
517 		wasdollar = p_simp_re(p, first);
518 		first = 0;
519 	}
520 	if (wasdollar) {	/* oops, that was a trailing anchor */
521 		DROP(1);
522 		EMIT(OEOL, 0);
523 		p->g->iflags |= USEEOL;
524 		p->g->neol++;
525 	}
526 
527 	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
528 }
529 
530 /*
531  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
532  == static int p_simp_re(register struct parse *p, int starordinary);
533  */
534 static int			/* was the simple RE an unbackslashed $? */
535 p_simp_re(p, starordinary)
536 register struct parse *p;
537 int starordinary;		/* is a leading * an ordinary character? */
538 {
539 	register int c;
540 	register int count;
541 	register int count2;
542 	register sopno pos;
543 	register int i;
544 	register sopno subno;
545 #	define	BACKSL	(1<<CHAR_BIT)
546 
547 	pos = HERE();		/* repetion op, if any, covers from here */
548 
549 	assert(MORE());		/* caller should have ensured this */
550 	c = GETNEXT();
551 	if (c == '\\') {
552 		(void)REQUIRE(MORE(), REG_EESCAPE);
553 		c = BACKSL | (unsigned char)GETNEXT();
554 	}
555 	switch (c) {
556 	case '.':
557 		if (p->g->cflags&REG_NEWLINE)
558 			nonnewline(p);
559 		else
560 			EMIT(OANY, 0);
561 		break;
562 	case '[':
563 		p_bracket(p);
564 		break;
565 	case BACKSL|'{':
566 		SETERROR(REG_BADRPT);
567 		break;
568 	case BACKSL|'(':
569 		p->g->nsub++;
570 		subno = p->g->nsub;
571 		if (subno < NPAREN)
572 			p->pbegin[subno] = HERE();
573 		EMIT(OLPAREN, subno);
574 		/* the MORE here is an error heuristic */
575 		if (MORE() && !SEETWO('\\', ')'))
576 			p_bre(p, '\\', ')');
577 		if (subno < NPAREN) {
578 			p->pend[subno] = HERE();
579 			assert(p->pend[subno] != 0);
580 		}
581 		EMIT(ORPAREN, subno);
582 		(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
583 		break;
584 	case BACKSL|')':	/* should not get here -- must be user */
585 	case BACKSL|'}':
586 		SETERROR(REG_EPAREN);
587 		break;
588 	case BACKSL|'1':
589 	case BACKSL|'2':
590 	case BACKSL|'3':
591 	case BACKSL|'4':
592 	case BACKSL|'5':
593 	case BACKSL|'6':
594 	case BACKSL|'7':
595 	case BACKSL|'8':
596 	case BACKSL|'9':
597 		i = (c&~BACKSL) - '0';
598 		assert(i < NPAREN);
599 		if (p->pend[i] != 0) {
600 			assert(i <= p->g->nsub);
601 			EMIT(OBACK_, i);
602 			assert(p->pbegin[i] != 0);
603 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
604 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
605 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
606 			EMIT(O_BACK, i);
607 		} else
608 			SETERROR(REG_ESUBREG);
609 		p->g->backrefs = 1;
610 		break;
611 	case '*':
612 		(void)REQUIRE(starordinary, REG_BADRPT);
613 		/* FALLTHROUGH */
614 	default:
615 		ordinary(p, c &~ BACKSL);
616 		break;
617 	}
618 
619 	if (EAT('*')) {		/* implemented as +? */
620 		/* this case does not require the (y|) trick, noKLUDGE */
621 		INSERT(OPLUS_, pos);
622 		ASTERN(O_PLUS, pos);
623 		INSERT(OQUEST_, pos);
624 		ASTERN(O_QUEST, pos);
625 	} else if (EATTWO('\\', '{')) {
626 		count = p_count(p);
627 		if (EAT(',')) {
628 			if (MORE() && isdigit(PEEK())) {
629 				count2 = p_count(p);
630 				(void)REQUIRE(count <= count2, REG_BADBR);
631 			} else		/* single number with comma */
632 				count2 = INFINITY;
633 		} else		/* just a single number */
634 			count2 = count;
635 		repeat(p, pos, count, count2);
636 		if (!EATTWO('\\', '}')) {	/* error heuristics */
637 			while (MORE() && !SEETWO('\\', '}'))
638 				NEXT();
639 			(void)REQUIRE(MORE(), REG_EBRACE);
640 			SETERROR(REG_BADBR);
641 		}
642 	} else if (c == (unsigned char)'$')	/* $ (but not \$) ends it */
643 		return(1);
644 
645 	return(0);
646 }
647 
648 /*
649  - p_count - parse a repetition count
650  == static int p_count(register struct parse *p);
651  */
652 static int			/* the value */
653 p_count(p)
654 register struct parse *p;
655 {
656 	register int count = 0;
657 	register int ndigits = 0;
658 
659 	while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
660 		count = count*10 + (GETNEXT() - '0');
661 		ndigits++;
662 	}
663 
664 	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
665 	return(count);
666 }
667 
668 /*
669  - p_bracket - parse a bracketed character list
670  == static void p_bracket(register struct parse *p);
671  *
672  * Note a significant property of this code:  if the allocset() did SETERROR,
673  * no set operations are done.
674  */
675 static void
676 p_bracket(p)
677 register struct parse *p;
678 {
679 	register cset *cs = allocset(p);
680 	register int invert = 0;
681 
682 	/* Dept of Truly Sickening Special-Case Kludges */
683 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
684 		EMIT(OBOW, 0);
685 		NEXTn(6);
686 		return;
687 	}
688 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
689 		EMIT(OEOW, 0);
690 		NEXTn(6);
691 		return;
692 	}
693 
694 	if (EAT('^'))
695 		invert++;	/* make note to invert set at end */
696 	if (EAT(']'))
697 		CHadd(cs, ']');
698 	else if (EAT('-'))
699 		CHadd(cs, '-');
700 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
701 		p_b_term(p, cs);
702 	if (EAT('-'))
703 		CHadd(cs, '-');
704 	(void)MUSTEAT(']', REG_EBRACK);
705 
706 	if (p->error != 0)	/* don't mess things up further */
707 		return;
708 
709 	if (p->g->cflags&REG_ICASE) {
710 		register int i;
711 		register int ci;
712 
713 		for (i = p->g->csetsize - 1; i >= 0; i--)
714 			if (CHIN(cs, i) && isalpha(i)) {
715 				ci = othercase(i);
716 				if (ci != i)
717 					CHadd(cs, ci);
718 			}
719 		if (cs->multis != NULL)
720 			mccase(p, cs);
721 	}
722 	if (invert) {
723 		register int i;
724 
725 		for (i = p->g->csetsize - 1; i >= 0; i--)
726 			if (CHIN(cs, i))
727 				CHsub(cs, i);
728 			else
729 				CHadd(cs, i);
730 		if (p->g->cflags&REG_NEWLINE)
731 			CHsub(cs, '\n');
732 		if (cs->multis != NULL)
733 			mcinvert(p, cs);
734 	}
735 
736 	assert(cs->multis == NULL);		/* xxx */
737 
738 	if (nch(p, cs) == 1) {		/* optimize singleton sets */
739 		ordinary(p, firstch(p, cs));
740 		freeset(p, cs);
741 	} else
742 		EMIT(OANYOF, freezeset(p, cs));
743 }
744 
745 /*
746  - p_b_term - parse one term of a bracketed character list
747  == static void p_b_term(register struct parse *p, register cset *cs);
748  */
749 static void
750 p_b_term(p, cs)
751 register struct parse *p;
752 register cset *cs;
753 {
754 	register char c;
755 	register char start, finish;
756 	register int i;
757 
758 	/* classify what we've got */
759 	switch ((MORE()) ? PEEK() : '\0') {
760 	case '[':
761 		c = (MORE2()) ? PEEK2() : '\0';
762 		break;
763 	case '-':
764 		SETERROR(REG_ERANGE);
765 		return;			/* NOTE RETURN */
766 		break;
767 	default:
768 		c = '\0';
769 		break;
770 	}
771 
772 	switch (c) {
773 	case ':':		/* character class */
774 		NEXT2();
775 		(void)REQUIRE(MORE(), REG_EBRACK);
776 		c = PEEK();
777 		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
778 		p_b_cclass(p, cs);
779 		(void)REQUIRE(MORE(), REG_EBRACK);
780 		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
781 		break;
782 	case '=':		/* equivalence class */
783 		NEXT2();
784 		(void)REQUIRE(MORE(), REG_EBRACK);
785 		c = PEEK();
786 		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
787 		p_b_eclass(p, cs);
788 		(void)REQUIRE(MORE(), REG_EBRACK);
789 		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
790 		break;
791 	default:		/* symbol, ordinary character, or range */
792 /* xxx revision needed for multichar stuff */
793 		start = p_b_symbol(p);
794 		if (SEE('-') && MORE2() && PEEK2() != ']') {
795 			/* range */
796 			NEXT();
797 			if (EAT('-'))
798 				finish = '-';
799 			else
800 				finish = p_b_symbol(p);
801 		} else
802 			finish = start;
803 		if (start == finish)
804 			CHadd(cs, start);
805 		else {
806 			(void)REQUIRE(collate_range_cmp(start, finish) <= 0, REG_ERANGE);
807 			for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
808 				if (   collate_range_cmp(start, i) <= 0
809 				    && collate_range_cmp(i, finish) <= 0
810 				   )
811 					CHadd(cs, i);
812 			}
813 		}
814 		break;
815 	}
816 }
817 
818 /*
819  - p_b_cclass - parse a character-class name and deal with it
820  == static void p_b_cclass(register struct parse *p, register cset *cs);
821  */
822 static void
823 p_b_cclass(p, cs)
824 register struct parse *p;
825 register cset *cs;
826 {
827 	register int c;
828 	register char *sp = p->next;
829 	register struct cclass *cp;
830 	register size_t len;
831 
832 	while (MORE() && isalpha((uch)PEEK()))
833 		NEXT();
834 	len = p->next - sp;
835 	for (cp = cclasses; cp->name != NULL; cp++)
836 		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
837 			break;
838 	if (cp->name == NULL) {
839 		/* oops, didn't find it */
840 		SETERROR(REG_ECTYPE);
841 		return;
842 	}
843 
844 	switch (cp->fidx) {
845 	case CALNUM:
846 		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
847 			if (isalnum((uch)c))
848 				CHadd(cs, c);
849 		break;
850 	case CALPHA:
851 		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
852 			if (isalpha((uch)c))
853 				CHadd(cs, c);
854 		break;
855 	case CBLANK:
856 		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
857 			if (isblank((uch)c))
858 				CHadd(cs, c);
859 		break;
860 	case CCNTRL:
861 		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
862 			if (iscntrl((uch)c))
863 				CHadd(cs, c);
864 		break;
865 	case CDIGIT:
866 		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
867 			if (isdigit((uch)c))
868 				CHadd(cs, c);
869 		break;
870 	case CGRAPH:
871 		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
872 			if (isgraph((uch)c))
873 				CHadd(cs, c);
874 		break;
875 	case CLOWER:
876 		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
877 			if (islower((uch)c))
878 				CHadd(cs, c);
879 		break;
880 	case CPRINT:
881 		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
882 			if (isprint((uch)c))
883 				CHadd(cs, c);
884 		break;
885 	case CPUNCT:
886 		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
887 			if (ispunct((uch)c))
888 				CHadd(cs, c);
889 		break;
890 	case CSPACE:
891 		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
892 			if (isspace((uch)c))
893 				CHadd(cs, c);
894 		break;
895 	case CUPPER:
896 		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
897 			if (isupper((uch)c))
898 				CHadd(cs, c);
899 		break;
900 	case CXDIGIT:
901 		for (c = CHAR_MIN; c <= CHAR_MAX; c++)
902 			if (isxdigit((uch)c))
903 				CHadd(cs, c);
904 		break;
905 	}
906 #if 0
907 	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
908 		MCadd(p, cs, u);
909 #endif
910 }
911 
912 /*
913  - p_b_eclass - parse an equivalence-class name and deal with it
914  == static void p_b_eclass(register struct parse *p, register cset *cs);
915  *
916  * This implementation is incomplete. xxx
917  */
918 static void
919 p_b_eclass(p, cs)
920 register struct parse *p;
921 register cset *cs;
922 {
923 	register char c;
924 
925 	c = p_b_coll_elem(p, '=');
926 	CHadd(cs, c);
927 }
928 
929 /*
930  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
931  == static char p_b_symbol(register struct parse *p);
932  */
933 static char			/* value of symbol */
934 p_b_symbol(p)
935 register struct parse *p;
936 {
937 	register char value;
938 
939 	(void)REQUIRE(MORE(), REG_EBRACK);
940 	if (!EATTWO('[', '.'))
941 		return(GETNEXT());
942 
943 	/* collating symbol */
944 	value = p_b_coll_elem(p, '.');
945 	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
946 	return(value);
947 }
948 
949 /*
950  - p_b_coll_elem - parse a collating-element name and look it up
951  == static char p_b_coll_elem(register struct parse *p, int endc);
952  */
953 static char			/* value of collating element */
954 p_b_coll_elem(p, endc)
955 register struct parse *p;
956 int endc;			/* name ended by endc,']' */
957 {
958 	register char *sp = p->next;
959 	register struct cname *cp;
960 	register int len;
961 
962 	while (MORE() && !SEETWO(endc, ']'))
963 		NEXT();
964 	if (!MORE()) {
965 		SETERROR(REG_EBRACK);
966 		return(0);
967 	}
968 	len = p->next - sp;
969 	for (cp = cnames; cp->name != NULL; cp++)
970 		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
971 			return(cp->code);	/* known name */
972 	if (len == 1)
973 		return(*sp);	/* single character */
974 	SETERROR(REG_ECOLLATE);			/* neither */
975 	return(0);
976 }
977 
978 /*
979  - othercase - return the case counterpart of an alphabetic
980  == static char othercase(int ch);
981  */
982 static char			/* if no counterpart, return ch */
983 othercase(ch)
984 int ch;
985 {
986 	ch = (unsigned char)ch;
987 	assert(isalpha(ch));
988 	if (isupper(ch))
989 		return(tolower(ch));
990 	else if (islower(ch))
991 		return(toupper(ch));
992 	else			/* peculiar, but could happen */
993 		return(ch);
994 }
995 
996 /*
997  - bothcases - emit a dualcase version of a two-case character
998  == static void bothcases(register struct parse *p, int ch);
999  *
1000  * Boy, is this implementation ever a kludge...
1001  */
1002 static void
1003 bothcases(p, ch)
1004 register struct parse *p;
1005 int ch;
1006 {
1007 	register char *oldnext = p->next;
1008 	register char *oldend = p->end;
1009 	char bracket[3];
1010 
1011 	ch = (unsigned char)ch;
1012 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
1013 	p->next = bracket;
1014 	p->end = bracket+2;
1015 	bracket[0] = ch;
1016 	bracket[1] = ']';
1017 	bracket[2] = '\0';
1018 	p_bracket(p);
1019 	assert(p->next == bracket+2);
1020 	p->next = oldnext;
1021 	p->end = oldend;
1022 }
1023 
1024 /*
1025  - ordinary - emit an ordinary character
1026  == static void ordinary(register struct parse *p, register int ch);
1027  */
1028 static void
1029 ordinary(p, ch)
1030 register struct parse *p;
1031 register int ch;
1032 {
1033 	register cat_t *cap = p->g->categories;
1034 
1035 	if ((p->g->cflags&REG_ICASE) && isalpha((unsigned char)ch) && othercase(ch) != ch)
1036 		bothcases(p, ch);
1037 	else {
1038 		EMIT(OCHAR, (unsigned char)ch);
1039 		if (cap[ch] == 0)
1040 			cap[ch] = p->g->ncategories++;
1041 	}
1042 }
1043 
1044 /*
1045  - nonnewline - emit REG_NEWLINE version of OANY
1046  == static void nonnewline(register struct parse *p);
1047  *
1048  * Boy, is this implementation ever a kludge...
1049  */
1050 static void
1051 nonnewline(p)
1052 register struct parse *p;
1053 {
1054 	register char *oldnext = p->next;
1055 	register char *oldend = p->end;
1056 	char bracket[4];
1057 
1058 	p->next = bracket;
1059 	p->end = bracket+3;
1060 	bracket[0] = '^';
1061 	bracket[1] = '\n';
1062 	bracket[2] = ']';
1063 	bracket[3] = '\0';
1064 	p_bracket(p);
1065 	assert(p->next == bracket+3);
1066 	p->next = oldnext;
1067 	p->end = oldend;
1068 }
1069 
1070 /*
1071  - repeat - generate code for a bounded repetition, recursively if needed
1072  == static void repeat(register struct parse *p, sopno start, int from, int to);
1073  */
1074 static void
1075 repeat(p, start, from, to)
1076 register struct parse *p;
1077 sopno start;			/* operand from here to end of strip */
1078 int from;			/* repeated from this number */
1079 int to;				/* to this number of times (maybe INFINITY) */
1080 {
1081 	register sopno finish = HERE();
1082 #	define	N	2
1083 #	define	INF	3
1084 #	define	REP(f, t)	((f)*8 + (t))
1085 #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1086 	register sopno copy;
1087 
1088 	if (p->error != 0)	/* head off possible runaway recursion */
1089 		return;
1090 
1091 	assert(from <= to);
1092 
1093 	switch (REP(MAP(from), MAP(to))) {
1094 	case REP(0, 0):			/* must be user doing this */
1095 		DROP(finish-start);	/* drop the operand */
1096 		break;
1097 	case REP(0, 1):			/* as x{1,1}? */
1098 	case REP(0, N):			/* as x{1,n}? */
1099 	case REP(0, INF):		/* as x{1,}? */
1100 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1101 		INSERT(OCH_, start);		/* offset is wrong... */
1102 		repeat(p, start+1, 1, to);
1103 		ASTERN(OOR1, start);
1104 		AHEAD(start);			/* ... fix it */
1105 		EMIT(OOR2, 0);
1106 		AHEAD(THERE());
1107 		ASTERN(O_CH, THERETHERE());
1108 		break;
1109 	case REP(1, 1):			/* trivial case */
1110 		/* done */
1111 		break;
1112 	case REP(1, N):			/* as x?x{1,n-1} */
1113 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1114 		INSERT(OCH_, start);
1115 		ASTERN(OOR1, start);
1116 		AHEAD(start);
1117 		EMIT(OOR2, 0);			/* offset very wrong... */
1118 		AHEAD(THERE());			/* ...so fix it */
1119 		ASTERN(O_CH, THERETHERE());
1120 		copy = dupl(p, start+1, finish+1);
1121 		assert(copy == finish+4);
1122 		repeat(p, copy, 1, to-1);
1123 		break;
1124 	case REP(1, INF):		/* as x+ */
1125 		INSERT(OPLUS_, start);
1126 		ASTERN(O_PLUS, start);
1127 		break;
1128 	case REP(N, N):			/* as xx{m-1,n-1} */
1129 		copy = dupl(p, start, finish);
1130 		repeat(p, copy, from-1, to-1);
1131 		break;
1132 	case REP(N, INF):		/* as xx{n-1,INF} */
1133 		copy = dupl(p, start, finish);
1134 		repeat(p, copy, from-1, to);
1135 		break;
1136 	default:			/* "can't happen" */
1137 		SETERROR(REG_ASSERT);	/* just in case */
1138 		break;
1139 	}
1140 }
1141 
1142 /*
1143  - seterr - set an error condition
1144  == static int seterr(register struct parse *p, int e);
1145  */
1146 static int			/* useless but makes type checking happy */
1147 seterr(p, e)
1148 register struct parse *p;
1149 int e;
1150 {
1151 	if (p->error == 0)	/* keep earliest error condition */
1152 		p->error = e;
1153 	p->next = nuls;		/* try to bring things to a halt */
1154 	p->end = nuls;
1155 	return(0);		/* make the return value well-defined */
1156 }
1157 
1158 /*
1159  - allocset - allocate a set of characters for []
1160  == static cset *allocset(register struct parse *p);
1161  */
1162 static cset *
1163 allocset(p)
1164 register struct parse *p;
1165 {
1166 	register int no = p->g->ncsets++;
1167 	register size_t nc;
1168 	register size_t nbytes;
1169 	register cset *cs;
1170 	register size_t css = (size_t)p->g->csetsize;
1171 	register int i;
1172 
1173 	if (no >= p->ncsalloc) {	/* need another column of space */
1174 		p->ncsalloc += CHAR_BIT;
1175 		nc = p->ncsalloc;
1176 		assert(nc % CHAR_BIT == 0);
1177 		nbytes = nc / CHAR_BIT * css;
1178 		if (p->g->sets == NULL)
1179 			p->g->sets = (cset *)malloc(nc * sizeof(cset));
1180 		else
1181 			p->g->sets = (cset *)realloc((char *)p->g->sets,
1182 							nc * sizeof(cset));
1183 		if (p->g->setbits == NULL)
1184 			p->g->setbits = (uch *)malloc(nbytes);
1185 		else {
1186 			p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1187 								nbytes);
1188 			/* xxx this isn't right if setbits is now NULL */
1189 			for (i = 0; i < no; i++)
1190 				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1191 		}
1192 		if (p->g->sets != NULL && p->g->setbits != NULL)
1193 			(void) memset((char *)p->g->setbits + (nbytes - css),
1194 								0, css);
1195 		else {
1196 			no = 0;
1197 			SETERROR(REG_ESPACE);
1198 			/* caller's responsibility not to do set ops */
1199 		}
1200 	}
1201 
1202 	assert(p->g->sets != NULL);	/* xxx */
1203 	cs = &p->g->sets[no];
1204 	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1205 	cs->mask = 1 << ((no) % CHAR_BIT);
1206 	cs->hash = 0;
1207 	cs->smultis = 0;
1208 	cs->multis = NULL;
1209 
1210 	return(cs);
1211 }
1212 
1213 /*
1214  - freeset - free a now-unused set
1215  == static void freeset(register struct parse *p, register cset *cs);
1216  */
1217 static void
1218 freeset(p, cs)
1219 register struct parse *p;
1220 register cset *cs;
1221 {
1222 	register int i;
1223 	register cset *top = &p->g->sets[p->g->ncsets];
1224 	register size_t css = (size_t)p->g->csetsize;
1225 
1226 	for (i = 0; i < css; i++)
1227 		CHsub(cs, i);
1228 	if (cs == top-1)	/* recover only the easy case */
1229 		p->g->ncsets--;
1230 }
1231 
1232 /*
1233  - freezeset - final processing on a set of characters
1234  == static int freezeset(register struct parse *p, register cset *cs);
1235  *
1236  * The main task here is merging identical sets.  This is usually a waste
1237  * of time (although the hash code minimizes the overhead), but can win
1238  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1239  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1240  * the same value!
1241  */
1242 static int			/* set number */
1243 freezeset(p, cs)
1244 register struct parse *p;
1245 register cset *cs;
1246 {
1247 	register short h = cs->hash;
1248 	register int i;
1249 	register cset *top = &p->g->sets[p->g->ncsets];
1250 	register cset *cs2;
1251 	register size_t css = (size_t)p->g->csetsize;
1252 
1253 	/* look for an earlier one which is the same */
1254 	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1255 		if (cs2->hash == h && cs2 != cs) {
1256 			/* maybe */
1257 			for (i = 0; i < css; i++)
1258 				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1259 					break;		/* no */
1260 			if (i == css)
1261 				break;			/* yes */
1262 		}
1263 
1264 	if (cs2 < top) {	/* found one */
1265 		freeset(p, cs);
1266 		cs = cs2;
1267 	}
1268 
1269 	return((int)(cs - p->g->sets));
1270 }
1271 
1272 /*
1273  - firstch - return first character in a set (which must have at least one)
1274  == static int firstch(register struct parse *p, register cset *cs);
1275  */
1276 static int			/* character; there is no "none" value */
1277 firstch(p, cs)
1278 register struct parse *p;
1279 register cset *cs;
1280 {
1281 	register int i;
1282 	register size_t css = (size_t)p->g->csetsize;
1283 
1284 	for (i = 0; i < css; i++)
1285 		if (CHIN(cs, i))
1286 			return((unsigned char)i);
1287 	assert(never);
1288 	return(0);		/* arbitrary */
1289 }
1290 
1291 /*
1292  - nch - number of characters in a set
1293  == static int nch(register struct parse *p, register cset *cs);
1294  */
1295 static int
1296 nch(p, cs)
1297 register struct parse *p;
1298 register cset *cs;
1299 {
1300 	register int i;
1301 	register size_t css = (size_t)p->g->csetsize;
1302 	register int n = 0;
1303 
1304 	for (i = 0; i < css; i++)
1305 		if (CHIN(cs, i))
1306 			n++;
1307 	return(n);
1308 }
1309 
1310 /*
1311  - mcadd - add a collating element to a cset
1312  == static void mcadd(register struct parse *p, register cset *cs, \
1313  ==	register char *cp);
1314  */
1315 static void
1316 mcadd(p, cs, cp)
1317 register struct parse *p;
1318 register cset *cs;
1319 register char *cp;
1320 {
1321 	register size_t oldend = cs->smultis;
1322 
1323 	cs->smultis += strlen(cp) + 1;
1324 	if (cs->multis == NULL)
1325 		cs->multis = malloc(cs->smultis);
1326 	else
1327 		cs->multis = realloc(cs->multis, cs->smultis);
1328 	if (cs->multis == NULL) {
1329 		SETERROR(REG_ESPACE);
1330 		return;
1331 	}
1332 
1333 	(void) strcpy(cs->multis + oldend - 1, cp);
1334 	cs->multis[cs->smultis - 1] = '\0';
1335 }
1336 
1337 #if used
1338 /*
1339  - mcsub - subtract a collating element from a cset
1340  == static void mcsub(register cset *cs, register char *cp);
1341  */
1342 static void
1343 mcsub(cs, cp)
1344 register cset *cs;
1345 register char *cp;
1346 {
1347 	register char *fp = mcfind(cs, cp);
1348 	register size_t len = strlen(fp);
1349 
1350 	assert(fp != NULL);
1351 	(void) memmove(fp, fp + len + 1,
1352 				cs->smultis - (fp + len + 1 - cs->multis));
1353 	cs->smultis -= len;
1354 
1355 	if (cs->smultis == 0) {
1356 		free(cs->multis);
1357 		cs->multis = NULL;
1358 		return;
1359 	}
1360 
1361 	cs->multis = realloc(cs->multis, cs->smultis);
1362 	assert(cs->multis != NULL);
1363 }
1364 
1365 /*
1366  - mcin - is a collating element in a cset?
1367  == static int mcin(register cset *cs, register char *cp);
1368  */
1369 static int
1370 mcin(cs, cp)
1371 register cset *cs;
1372 register char *cp;
1373 {
1374 	return(mcfind(cs, cp) != NULL);
1375 }
1376 
1377 /*
1378  - mcfind - find a collating element in a cset
1379  == static char *mcfind(register cset *cs, register char *cp);
1380  */
1381 static char *
1382 mcfind(cs, cp)
1383 register cset *cs;
1384 register char *cp;
1385 {
1386 	register char *p;
1387 
1388 	if (cs->multis == NULL)
1389 		return(NULL);
1390 	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1391 		if (strcmp(cp, p) == 0)
1392 			return(p);
1393 	return(NULL);
1394 }
1395 #endif
1396 
1397 /*
1398  - mcinvert - invert the list of collating elements in a cset
1399  == static void mcinvert(register struct parse *p, register cset *cs);
1400  *
1401  * This would have to know the set of possibilities.  Implementation
1402  * is deferred.
1403  */
1404 static void
1405 mcinvert(p, cs)
1406 register struct parse *p;
1407 register cset *cs;
1408 {
1409 	assert(cs->multis == NULL);	/* xxx */
1410 }
1411 
1412 /*
1413  - mccase - add case counterparts of the list of collating elements in a cset
1414  == static void mccase(register struct parse *p, register cset *cs);
1415  *
1416  * This would have to know the set of possibilities.  Implementation
1417  * is deferred.
1418  */
1419 static void
1420 mccase(p, cs)
1421 register struct parse *p;
1422 register cset *cs;
1423 {
1424 	assert(cs->multis == NULL);	/* xxx */
1425 }
1426 
1427 /*
1428  - isinsets - is this character in any sets?
1429  == static int isinsets(register struct re_guts *g, int c);
1430  */
1431 static int			/* predicate */
1432 isinsets(g, c)
1433 register struct re_guts *g;
1434 int c;
1435 {
1436 	register uch *col;
1437 	register int i;
1438 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1439 	register unsigned uc = (unsigned char)c;
1440 
1441 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1442 		if (col[uc] != 0)
1443 			return(1);
1444 	return(0);
1445 }
1446 
1447 /*
1448  - samesets - are these two characters in exactly the same sets?
1449  == static int samesets(register struct re_guts *g, int c1, int c2);
1450  */
1451 static int			/* predicate */
1452 samesets(g, c1, c2)
1453 register struct re_guts *g;
1454 int c1;
1455 int c2;
1456 {
1457 	register uch *col;
1458 	register int i;
1459 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1460 	register unsigned uc1 = (unsigned char)c1;
1461 	register unsigned uc2 = (unsigned char)c2;
1462 
1463 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1464 		if (col[uc1] != col[uc2])
1465 			return(0);
1466 	return(1);
1467 }
1468 
1469 /*
1470  - categorize - sort out character categories
1471  == static void categorize(struct parse *p, register struct re_guts *g);
1472  */
1473 static void
1474 categorize(p, g)
1475 struct parse *p;
1476 register struct re_guts *g;
1477 {
1478 	register cat_t *cats = g->categories;
1479 	register int c;
1480 	register int c2;
1481 	register cat_t cat;
1482 
1483 	/* avoid making error situations worse */
1484 	if (p->error != 0)
1485 		return;
1486 
1487 	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1488 		if (cats[c] == 0 && isinsets(g, c)) {
1489 			cat = g->ncategories++;
1490 			cats[c] = cat;
1491 			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1492 				if (cats[c2] == 0 && samesets(g, c, c2))
1493 					cats[c2] = cat;
1494 		}
1495 }
1496 
1497 /*
1498  - dupl - emit a duplicate of a bunch of sops
1499  == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1500  */
1501 static sopno			/* start of duplicate */
1502 dupl(p, start, finish)
1503 register struct parse *p;
1504 sopno start;			/* from here */
1505 sopno finish;			/* to this less one */
1506 {
1507 	register sopno ret = HERE();
1508 	register sopno len = finish - start;
1509 
1510 	assert(finish >= start);
1511 	if (len == 0)
1512 		return(ret);
1513 	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1514 	assert(p->ssize >= p->slen + len);
1515 	(void) memcpy((char *)(p->strip + p->slen),
1516 		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1517 	p->slen += len;
1518 	return(ret);
1519 }
1520 
1521 /*
1522  - doemit - emit a strip operator
1523  == static void doemit(register struct parse *p, sop op, size_t opnd);
1524  *
1525  * It might seem better to implement this as a macro with a function as
1526  * hard-case backup, but it's just too big and messy unless there are
1527  * some changes to the data structures.  Maybe later.
1528  */
1529 static void
1530 doemit(p, op, opnd)
1531 register struct parse *p;
1532 sop op;
1533 size_t opnd;
1534 {
1535 	/* avoid making error situations worse */
1536 	if (p->error != 0)
1537 		return;
1538 
1539 	/* deal with oversize operands ("can't happen", more or less) */
1540 	assert(opnd < 1<<OPSHIFT);
1541 
1542 	/* deal with undersized strip */
1543 	if (p->slen >= p->ssize)
1544 		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1545 	assert(p->slen < p->ssize);
1546 
1547 	/* finally, it's all reduced to the easy case */
1548 	p->strip[p->slen++] = SOP(op, opnd);
1549 }
1550 
1551 /*
1552  - doinsert - insert a sop into the strip
1553  == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1554  */
1555 static void
1556 doinsert(p, op, opnd, pos)
1557 register struct parse *p;
1558 sop op;
1559 size_t opnd;
1560 sopno pos;
1561 {
1562 	register sopno sn;
1563 	register sop s;
1564 	register int i;
1565 
1566 	/* avoid making error situations worse */
1567 	if (p->error != 0)
1568 		return;
1569 
1570 	sn = HERE();
1571 	EMIT(op, opnd);		/* do checks, ensure space */
1572 	assert(HERE() == sn+1);
1573 	s = p->strip[sn];
1574 
1575 	/* adjust paren pointers */
1576 	assert(pos > 0);
1577 	for (i = 1; i < NPAREN; i++) {
1578 		if (p->pbegin[i] >= pos) {
1579 			p->pbegin[i]++;
1580 		}
1581 		if (p->pend[i] >= pos) {
1582 			p->pend[i]++;
1583 		}
1584 	}
1585 
1586 	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1587 						(HERE()-pos-1)*sizeof(sop));
1588 	p->strip[pos] = s;
1589 }
1590 
1591 /*
1592  - dofwd - complete a forward reference
1593  == static void dofwd(register struct parse *p, sopno pos, sop value);
1594  */
1595 static void
1596 dofwd(p, pos, value)
1597 register struct parse *p;
1598 register sopno pos;
1599 sop value;
1600 {
1601 	/* avoid making error situations worse */
1602 	if (p->error != 0)
1603 		return;
1604 
1605 	assert(value < 1<<OPSHIFT);
1606 	p->strip[pos] = OP(p->strip[pos]) | value;
1607 }
1608 
1609 /*
1610  - enlarge - enlarge the strip
1611  == static void enlarge(register struct parse *p, sopno size);
1612  */
1613 static void
1614 enlarge(p, size)
1615 register struct parse *p;
1616 register sopno size;
1617 {
1618 	register sop *sp;
1619 
1620 	if (p->ssize >= size)
1621 		return;
1622 
1623 	sp = (sop *)realloc(p->strip, size*sizeof(sop));
1624 	if (sp == NULL) {
1625 		SETERROR(REG_ESPACE);
1626 		return;
1627 	}
1628 	p->strip = sp;
1629 	p->ssize = size;
1630 }
1631 
1632 /*
1633  - stripsnug - compact the strip
1634  == static void stripsnug(register struct parse *p, register struct re_guts *g);
1635  */
1636 static void
1637 stripsnug(p, g)
1638 register struct parse *p;
1639 register struct re_guts *g;
1640 {
1641 	g->nstates = p->slen;
1642 	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1643 	if (g->strip == NULL) {
1644 		SETERROR(REG_ESPACE);
1645 		g->strip = p->strip;
1646 	}
1647 }
1648 
1649 /*
1650  - findmust - fill in must and mlen with longest mandatory literal string
1651  == static void findmust(register struct parse *p, register struct re_guts *g);
1652  *
1653  * This algorithm could do fancy things like analyzing the operands of |
1654  * for common subsequences.  Someday.  This code is simple and finds most
1655  * of the interesting cases.
1656  *
1657  * Note that must and mlen got initialized during setup.
1658  */
1659 static void
1660 findmust(p, g)
1661 struct parse *p;
1662 register struct re_guts *g;
1663 {
1664 	register sop *scan;
1665 	sop *start;
1666 	register sop *newstart;
1667 	register sopno newlen;
1668 	register sop s;
1669 	register char *cp;
1670 	register sopno i;
1671 
1672 	/* avoid making error situations worse */
1673 	if (p->error != 0)
1674 		return;
1675 
1676 	/* find the longest OCHAR sequence in strip */
1677 	newlen = 0;
1678 	scan = g->strip + 1;
1679 	do {
1680 		s = *scan++;
1681 		switch (OP(s)) {
1682 		case OCHAR:		/* sequence member */
1683 			if (newlen == 0)		/* new sequence */
1684 				newstart = scan - 1;
1685 			newlen++;
1686 			break;
1687 		case OPLUS_:		/* things that don't break one */
1688 		case OLPAREN:
1689 		case ORPAREN:
1690 			break;
1691 		case OQUEST_:		/* things that must be skipped */
1692 		case OCH_:
1693 			scan--;
1694 			do {
1695 				scan += OPND(s);
1696 				s = *scan;
1697 				/* assert() interferes w debug printouts */
1698 				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1699 							OP(s) != OOR2) {
1700 					g->iflags |= BAD;
1701 					return;
1702 				}
1703 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1704 			/* fallthrough */
1705 		default:		/* things that break a sequence */
1706 			if (newlen > g->mlen) {		/* ends one */
1707 				start = newstart;
1708 				g->mlen = newlen;
1709 			}
1710 			newlen = 0;
1711 			break;
1712 		}
1713 	} while (OP(s) != OEND);
1714 
1715 	if (g->mlen == 0)		/* there isn't one */
1716 		return;
1717 
1718 	/* turn it into a character string */
1719 	g->must = malloc((size_t)g->mlen + 1);
1720 	if (g->must == NULL) {		/* argh; just forget it */
1721 		g->mlen = 0;
1722 		return;
1723 	}
1724 	cp = g->must;
1725 	scan = start;
1726 	for (i = g->mlen; i > 0; i--) {
1727 		while (OP(s = *scan++) != OCHAR)
1728 			continue;
1729 		assert(cp < g->must + g->mlen);
1730 		*cp++ = (char)OPND(s);
1731 	}
1732 	assert(cp == g->must + g->mlen);
1733 	*cp++ = '\0';		/* just on general principles */
1734 }
1735 
1736 /*
1737  - pluscount - count + nesting
1738  == static sopno pluscount(register struct parse *p, register struct re_guts *g);
1739  */
1740 static sopno			/* nesting depth */
1741 pluscount(p, g)
1742 struct parse *p;
1743 register struct re_guts *g;
1744 {
1745 	register sop *scan;
1746 	register sop s;
1747 	register sopno plusnest = 0;
1748 	register sopno maxnest = 0;
1749 
1750 	if (p->error != 0)
1751 		return(0);	/* there may not be an OEND */
1752 
1753 	scan = g->strip + 1;
1754 	do {
1755 		s = *scan++;
1756 		switch (OP(s)) {
1757 		case OPLUS_:
1758 			plusnest++;
1759 			break;
1760 		case O_PLUS:
1761 			if (plusnest > maxnest)
1762 				maxnest = plusnest;
1763 			plusnest--;
1764 			break;
1765 		}
1766 	} while (OP(s) != OEND);
1767 	if (plusnest != 0)
1768 		g->iflags |= BAD;
1769 	return(maxnest);
1770 }
1771