xref: /freebsd/contrib/less/regexp.c (revision 5477372324b92240a96310ef2d45fa44ce8d0a93)
1 /*
2  * regcomp and regexec -- regsub and regerror are elsewhere
3  *
4  *	Copyright (c) 1986 by University of Toronto.
5  *	Written by Henry Spencer.  Not derived from licensed software.
6  *
7  *	Permission is granted to anyone to use this software for any
8  *	purpose on any computer system, and to redistribute it freely,
9  *	subject to the following restrictions:
10  *
11  *	1. The author is not responsible for the consequences of use of
12  *		this software, no matter how awful, even if they arise
13  *		from defects in it.
14  *
15  *	2. The origin of this software must not be misrepresented, either
16  *		by explicit claim or by omission.
17  *
18  *	3. Altered versions must be plainly marked as such, and must not
19  *		be misrepresented as being the original software.
20  *
21  * Beware that some of this code is subtly aware of the way operator
22  * precedence is structured in regular expressions.  Serious changes in
23  * regular-expression syntax might require a total rethink.
24  *
25  * *** NOTE: this code has been altered slightly for use in Tcl. ***
26  * Slightly modified by David MacKenzie to undo most of the changes for TCL.
27  * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman
28  */
29 
30 #include "less.h"
31 #if HAVE_STDIO_H
32 #include <stdio.h>
33 #endif
34 #if HAVE_STDLIB_H
35 #include <stdlib.h>
36 #endif
37 #if HAVE_STRING_H
38 #include <string.h>
39 #endif
40 #include "regexp.h"
41 
42 /*
43  * The "internal use only" fields in regexp.h are present to pass info from
44  * compile to execute that permits the execute phase to run lots faster on
45  * simple cases.  They are:
46  *
47  * regstart	char that must begin a match; '\0' if none obvious
48  * reganch	is the match anchored (at beginning-of-line only)?
49  * regmust	string (pointer into program) that match must include, or NULL
50  * regmlen	length of regmust string
51  *
52  * Regstart and reganch permit very fast decisions on suitable starting points
53  * for a match, cutting down the work a lot.  Regmust permits fast rejection
54  * of lines that cannot possibly match.  The regmust tests are costly enough
55  * that regcomp() supplies a regmust only if the r.e. contains something
56  * potentially expensive (at present, the only such thing detected is * or +
57  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
58  * supplied because the test in regexec() needs it and regcomp() is
59  * computing it anyway.
60  */
61 
62 /*
63  * Structure for regexp "program".  This is essentially a linear encoding
64  * of a nondeterministic finite-state machine (aka syntax charts or
65  * "railroad normal form" in parsing technology).  Each node is an opcode
66  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
67  * all nodes except BRANCH implement concatenation; a "next" pointer with
68  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
69  * have one of the subtle syntax dependencies:  an individual BRANCH (as
70  * opposed to a collection of them) is never concatenated with anything
71  * because of operator precedence.)  The operand of some types of node is
72  * a literal string; for others, it is a node leading into a sub-FSM.  In
73  * particular, the operand of a BRANCH node is the first node of the branch.
74  * (NB this is *not* a tree structure:  the tail of the branch connects
75  * to the thing following the set of BRANCHes.)  The opcodes are:
76  */
77 
78 /* definition	number	opnd?	meaning */
79 #undef EOL
80 #define	END	0	/* no	End of program. */
81 #define	BOL	1	/* no	Match "" at beginning of line. */
82 #define	EOL	2	/* no	Match "" at end of line. */
83 #define	ANY	3	/* no	Match any one character. */
84 #define	ANYOF	4	/* str	Match any character in this string. */
85 #define	ANYBUT	5	/* str	Match any character not in this string. */
86 #define	BRANCH	6	/* node	Match this alternative, or the next... */
87 #define	BACK	7	/* no	Match "", "next" ptr points backward. */
88 #define	EXACTLY	8	/* str	Match this string. */
89 #define	NOTHING	9	/* no	Match empty string. */
90 #define	STAR	10	/* node	Match this (simple) thing 0 or more times. */
91 #define	PLUS	11	/* node	Match this (simple) thing 1 or more times. */
92 #define	OPEN	20	/* no	Mark this point in input as start of #n. */
93 			/*	OPEN+1 is number 1, etc. */
94 #define	CLOSE	30	/* no	Analogous to OPEN. */
95 
96 /*
97  * Opcode notes:
98  *
99  * BRANCH	The set of branches constituting a single choice are hooked
100  *		together with their "next" pointers, since precedence prevents
101  *		anything being concatenated to any individual branch.  The
102  *		"next" pointer of the last BRANCH in a choice points to the
103  *		thing following the whole choice.  This is also where the
104  *		final "next" pointer of each individual branch points; each
105  *		branch starts with the operand node of a BRANCH node.
106  *
107  * BACK		Normal "next" pointers all implicitly point forward; BACK
108  *		exists to make loop structures possible.
109  *
110  * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
111  *		BRANCH structures using BACK.  Simple cases (one character
112  *		per match) are implemented with STAR and PLUS for speed
113  *		and to minimize recursive plunges.
114  *
115  * OPEN,CLOSE	...are numbered at compile time.
116  */
117 
118 /*
119  * A node is one char of opcode followed by two chars of "next" pointer.
120  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
121  * value is a positive offset from the opcode of the node containing it.
122  * An operand, if any, simply follows the node.  (Note that much of the
123  * code generation knows about this implicit relationship.)
124  *
125  * Using two bytes for the "next" pointer is vast overkill for most things,
126  * but allows patterns to get big without disasters.
127  */
128 #define	OP(p)	(*(p))
129 #define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
130 #define	OPERAND(p)	((p) + 3)
131 
132 /*
133  * See regmagic.h for one further detail of program structure.
134  */
135 
136 
137 /*
138  * Utility definitions.
139  */
140 #ifndef CHARBITS
141 #define	UCHARAT(p)	((int)*(unsigned char *)(p))
142 #else
143 #define	UCHARAT(p)	((int)*(p)&CHARBITS)
144 #endif
145 
146 #define	FAIL(m)	{ regerror(m); return(NULL); }
147 #define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
148 #define	META	"^$.[()|?+*\\"
149 
150 /*
151  * Flags to be passed up and down.
152  */
153 #define	HASWIDTH	01	/* Known never to match null string. */
154 #define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
155 #define	SPSTART		04	/* Starts with * or +. */
156 #define	WORST		0	/* Worst case. */
157 
158 /*
159  * Global work variables for regcomp().
160  */
161 static char *regparse;		/* Input-scan pointer. */
162 static int regnpar;		/* () count. */
163 static char regdummy;
164 static char *regcode;		/* Code-emit pointer; &regdummy = don't. */
165 static long regsize;		/* Code size. */
166 
167 /*
168  * The first byte of the regexp internal "program" is actually this magic
169  * number; the start node begins in the second byte.
170  */
171 #define	MAGIC	0234
172 
173 
174 /*
175  * Forward declarations for regcomp()'s friends.
176  */
177 #ifndef STATIC
178 #define	STATIC	static
179 #endif
180 STATIC char *reg();
181 STATIC char *regbranch();
182 STATIC char *regpiece();
183 STATIC char *regatom();
184 STATIC char *regnode();
185 STATIC char *regnext();
186 STATIC void regc();
187 STATIC void reginsert();
188 STATIC void regtail();
189 STATIC void regoptail();
190 #ifdef STRCSPN
191 STATIC int strcspn();
192 #endif
193 
194 /*
195  - regcomp - compile a regular expression into internal code
196  *
197  * We can't allocate space until we know how big the compiled form will be,
198  * but we can't compile it (and thus know how big it is) until we've got a
199  * place to put the code.  So we cheat:  we compile it twice, once with code
200  * generation turned off and size counting turned on, and once "for real".
201  * This also means that we don't allocate space until we are sure that the
202  * thing really will compile successfully, and we never have to move the
203  * code and thus invalidate pointers into it.  (Note that it has to be in
204  * one piece because free() must be able to free it all.)
205  *
206  * Beware that the optimization-preparation code in here knows about some
207  * of the structure of the compiled regexp.
208  */
209 regexp *
210 regcomp(char *exp)
211 {
212 	regexp *r;
213 	char *scan;
214 	char *longest;
215 	int len;
216 	int flags;
217 
218 	if (exp == NULL)
219 		FAIL("NULL argument");
220 
221 	/* First pass: determine size, legality. */
222 	regparse = exp;
223 	regnpar = 1;
224 	regsize = 0L;
225 	regcode = &regdummy;
226 	regc(MAGIC);
227 	if (reg(0, &flags) == NULL)
228 		return(NULL);
229 
230 	/* Small enough for pointer-storage convention? */
231 	if (regsize >= 32767L)		/* Probably could be 65535L. */
232 		FAIL("regexp too big");
233 
234 	/* Allocate space. */
235 	r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
236 	if (r == NULL)
237 		FAIL("out of space");
238 
239 	/* Second pass: emit code. */
240 	regparse = exp;
241 	regnpar = 1;
242 	regcode = r->program;
243 	regc(MAGIC);
244 	if (reg(0, &flags) == NULL)
245 	{
246 		free(r);
247 		return(NULL);
248 	}
249 
250 	/* Dig out information for optimizations. */
251 	r->regstart = '\0';	/* Worst-case defaults. */
252 	r->reganch = 0;
253 	r->regmust = NULL;
254 	r->regmlen = 0;
255 	scan = r->program+1;			/* First BRANCH. */
256 	if (OP(regnext(scan)) == END) {		/* Only one top-level choice. */
257 		scan = OPERAND(scan);
258 
259 		/* Starting-point info. */
260 		if (OP(scan) == EXACTLY)
261 			r->regstart = *OPERAND(scan);
262 		else if (OP(scan) == BOL)
263 			r->reganch++;
264 
265 		/*
266 		 * If there's something expensive in the r.e., find the
267 		 * longest literal string that must appear and make it the
268 		 * regmust.  Resolve ties in favor of later strings, since
269 		 * the regstart check works with the beginning of the r.e.
270 		 * and avoiding duplication strengthens checking.  Not a
271 		 * strong reason, but sufficient in the absence of others.
272 		 */
273 		if (flags&SPSTART) {
274 			longest = NULL;
275 			len = 0;
276 			for (; scan != NULL; scan = regnext(scan))
277 				if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
278 					longest = OPERAND(scan);
279 					len = (int) strlen(OPERAND(scan));
280 				}
281 			r->regmust = longest;
282 			r->regmlen = len;
283 		}
284 	}
285 
286 	return(r);
287 }
288 
289 /*
290  - reg - regular expression, i.e. main body or parenthesized thing
291  *
292  * Caller must absorb opening parenthesis.
293  *
294  * Combining parenthesis handling with the base level of regular expression
295  * is a trifle forced, but the need to tie the tails of the branches to what
296  * follows makes it hard to avoid.
297  */
298 static char *
299 reg(int paren, int *flagp)
300 {
301 	char *ret;
302 	char *br;
303 	char *ender;
304 	int parno = 0;
305 	int flags;
306 
307 	*flagp = HASWIDTH;	/* Tentatively. */
308 
309 	/* Make an OPEN node, if parenthesized. */
310 	if (paren) {
311 		if (regnpar >= NSUBEXP)
312 			FAIL("too many ()");
313 		parno = regnpar;
314 		regnpar++;
315 		ret = regnode(OPEN+parno);
316 	} else
317 		ret = NULL;
318 
319 	/* Pick up the branches, linking them together. */
320 	br = regbranch(&flags);
321 	if (br == NULL)
322 		return(NULL);
323 	if (ret != NULL)
324 		regtail(ret, br);	/* OPEN -> first. */
325 	else
326 		ret = br;
327 	if (!(flags&HASWIDTH))
328 		*flagp &= ~HASWIDTH;
329 	*flagp |= flags&SPSTART;
330 	while (*regparse == '|') {
331 		regparse++;
332 		br = regbranch(&flags);
333 		if (br == NULL)
334 			return(NULL);
335 		regtail(ret, br);	/* BRANCH -> BRANCH. */
336 		if (!(flags&HASWIDTH))
337 			*flagp &= ~HASWIDTH;
338 		*flagp |= flags&SPSTART;
339 	}
340 
341 	/* Make a closing node, and hook it on the end. */
342 	ender = regnode((paren) ? CLOSE+parno : END);
343 	regtail(ret, ender);
344 
345 	/* Hook the tails of the branches to the closing node. */
346 	for (br = ret; br != NULL; br = regnext(br))
347 		regoptail(br, ender);
348 
349 	/* Check for proper termination. */
350 	if (paren && *regparse++ != ')') {
351 		FAIL("unmatched ()");
352 	} else if (!paren && *regparse != '\0') {
353 		if (*regparse == ')') {
354 			FAIL("unmatched ()");
355 		} else
356 			FAIL("junk on end");	/* "Can't happen". */
357 		/* NOTREACHED */
358 	}
359 
360 	return(ret);
361 }
362 
363 /*
364  - regbranch - one alternative of an | operator
365  *
366  * Implements the concatenation operator.
367  */
368 static char *
369 regbranch(int *flagp)
370 {
371 	char *ret;
372 	char *chain;
373 	char *latest;
374 	int flags;
375 
376 	*flagp = WORST;		/* Tentatively. */
377 
378 	ret = regnode(BRANCH);
379 	chain = NULL;
380 	while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
381 		latest = regpiece(&flags);
382 		if (latest == NULL)
383 			return(NULL);
384 		*flagp |= flags&HASWIDTH;
385 		if (chain == NULL)	/* First piece. */
386 			*flagp |= flags&SPSTART;
387 		else
388 			regtail(chain, latest);
389 		chain = latest;
390 	}
391 	if (chain == NULL)	/* Loop ran zero times. */
392 		(void) regnode(NOTHING);
393 
394 	return(ret);
395 }
396 
397 /*
398  - regpiece - something followed by possible [*+?]
399  *
400  * Note that the branching code sequences used for ? and the general cases
401  * of * and + are somewhat optimized:  they use the same NOTHING node as
402  * both the endmarker for their branch list and the body of the last branch.
403  * It might seem that this node could be dispensed with entirely, but the
404  * endmarker role is not redundant.
405  */
406 static char *
407 regpiece(int *flagp)
408 {
409 	char *ret;
410 	char op;
411 	char *next;
412 	int flags;
413 
414 	ret = regatom(&flags);
415 	if (ret == NULL)
416 		return(NULL);
417 
418 	op = *regparse;
419 	if (!ISMULT(op)) {
420 		*flagp = flags;
421 		return(ret);
422 	}
423 
424 	if (!(flags&HASWIDTH) && op != '?')
425 		FAIL("*+ operand could be empty");
426 	*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
427 
428 	if (op == '*' && (flags&SIMPLE))
429 		reginsert(STAR, ret);
430 	else if (op == '*') {
431 		/* Emit x* as (x&|), where & means "self". */
432 		reginsert(BRANCH, ret);			/* Either x */
433 		regoptail(ret, regnode(BACK));		/* and loop */
434 		regoptail(ret, ret);			/* back */
435 		regtail(ret, regnode(BRANCH));		/* or */
436 		regtail(ret, regnode(NOTHING));		/* null. */
437 	} else if (op == '+' && (flags&SIMPLE))
438 		reginsert(PLUS, ret);
439 	else if (op == '+') {
440 		/* Emit x+ as x(&|), where & means "self". */
441 		next = regnode(BRANCH);			/* Either */
442 		regtail(ret, next);
443 		regtail(regnode(BACK), ret);		/* loop back */
444 		regtail(next, regnode(BRANCH));		/* or */
445 		regtail(ret, regnode(NOTHING));		/* null. */
446 	} else if (op == '?') {
447 		/* Emit x? as (x|) */
448 		reginsert(BRANCH, ret);			/* Either x */
449 		regtail(ret, regnode(BRANCH));		/* or */
450 		next = regnode(NOTHING);		/* null. */
451 		regtail(ret, next);
452 		regoptail(ret, next);
453 	}
454 	regparse++;
455 	if (ISMULT(*regparse))
456 		FAIL("nested *?+");
457 
458 	return(ret);
459 }
460 
461 /*
462  - regatom - the lowest level
463  *
464  * Optimization:  gobbles an entire sequence of ordinary characters so that
465  * it can turn them into a single node, which is smaller to store and
466  * faster to run.  Backslashed characters are exceptions, each becoming a
467  * separate node; the code is simpler that way and it's not worth fixing.
468  */
469 static char *
470 regatom(int *flagp)
471 {
472 	char *ret;
473 	int flags;
474 
475 	*flagp = WORST;		/* Tentatively. */
476 
477 	switch (*regparse++) {
478 	case '^':
479 		ret = regnode(BOL);
480 		break;
481 	case '$':
482 		ret = regnode(EOL);
483 		break;
484 	case '.':
485 		ret = regnode(ANY);
486 		*flagp |= HASWIDTH|SIMPLE;
487 		break;
488 	case '[': {
489 			int clss;
490 			int classend;
491 
492 			if (*regparse == '^') {	/* Complement of range. */
493 				ret = regnode(ANYBUT);
494 				regparse++;
495 			} else
496 				ret = regnode(ANYOF);
497 			if (*regparse == ']' || *regparse == '-')
498 				regc(*regparse++);
499 			while (*regparse != '\0' && *regparse != ']') {
500 				if (*regparse == '-') {
501 					regparse++;
502 					if (*regparse == ']' || *regparse == '\0')
503 						regc('-');
504 					else {
505 						clss = UCHARAT(regparse-2)+1;
506 						classend = UCHARAT(regparse);
507 						if (clss > classend+1)
508 							FAIL("invalid [] range");
509 						for (; clss <= classend; clss++)
510 							regc(clss);
511 						regparse++;
512 					}
513 				} else
514 					regc(*regparse++);
515 			}
516 			regc('\0');
517 			if (*regparse != ']')
518 				FAIL("unmatched []");
519 			regparse++;
520 			*flagp |= HASWIDTH|SIMPLE;
521 		}
522 		break;
523 	case '(':
524 		ret = reg(1, &flags);
525 		if (ret == NULL)
526 			return(NULL);
527 		*flagp |= flags&(HASWIDTH|SPSTART);
528 		break;
529 	case '\0':
530 	case '|':
531 	case ')':
532 		FAIL("internal urp");	/* Supposed to be caught earlier. */
533 		/* NOTREACHED */
534 		break;
535 	case '?':
536 	case '+':
537 	case '*':
538 		FAIL("?+* follows nothing");
539 		/* NOTREACHED */
540 		break;
541 	case '\\':
542 		if (*regparse == '\0')
543 			FAIL("trailing \\");
544 		ret = regnode(EXACTLY);
545 		regc(*regparse++);
546 		regc('\0');
547 		*flagp |= HASWIDTH|SIMPLE;
548 		break;
549 	default: {
550 			int len;
551 			char ender;
552 
553 			regparse--;
554 			len = (int) strcspn(regparse, META);
555 			if (len <= 0)
556 				FAIL("internal disaster");
557 			ender = *(regparse+len);
558 			if (len > 1 && ISMULT(ender))
559 				len--;		/* Back off clear of ?+* operand. */
560 			*flagp |= HASWIDTH;
561 			if (len == 1)
562 				*flagp |= SIMPLE;
563 			ret = regnode(EXACTLY);
564 			while (len > 0) {
565 				regc(*regparse++);
566 				len--;
567 			}
568 			regc('\0');
569 		}
570 		break;
571 	}
572 
573 	return(ret);
574 }
575 
576 /*
577  - regnode - emit a node
578  */
579 static char *			/* Location. */
580 regnode(char op)
581 {
582 	char *ret;
583 	char *ptr;
584 
585 	ret = regcode;
586 	if (ret == &regdummy) {
587 		regsize += 3;
588 		return(ret);
589 	}
590 
591 	ptr = ret;
592 	*ptr++ = op;
593 	*ptr++ = '\0';		/* Null "next" pointer. */
594 	*ptr++ = '\0';
595 	regcode = ptr;
596 
597 	return(ret);
598 }
599 
600 /*
601  - regc - emit (if appropriate) a byte of code
602  */
603 static void
604 regc(char b)
605 {
606 	if (regcode != &regdummy)
607 		*regcode++ = b;
608 	else
609 		regsize++;
610 }
611 
612 /*
613  - reginsert - insert an operator in front of already-emitted operand
614  *
615  * Means relocating the operand.
616  */
617 static void
618 reginsert(char op, char *opnd)
619 {
620 	char *src;
621 	char *dst;
622 	char *place;
623 
624 	if (regcode == &regdummy) {
625 		regsize += 3;
626 		return;
627 	}
628 
629 	src = regcode;
630 	regcode += 3;
631 	dst = regcode;
632 	while (src > opnd)
633 		*--dst = *--src;
634 
635 	place = opnd;		/* Op node, where operand used to be. */
636 	*place++ = op;
637 	*place++ = '\0';
638 	*place++ = '\0';
639 }
640 
641 /*
642  - regtail - set the next-pointer at the end of a node chain
643  */
644 static void
645 regtail(char *p, char *val)
646 {
647 	char *scan;
648 	char *temp;
649 	int offset;
650 
651 	if (p == &regdummy)
652 		return;
653 
654 	/* Find last node. */
655 	scan = p;
656 	for (;;) {
657 		temp = regnext(scan);
658 		if (temp == NULL)
659 			break;
660 		scan = temp;
661 	}
662 
663 	if (OP(scan) == BACK)
664 		offset = (int) (scan - val);
665 	else
666 		offset = (int) (val - scan);
667 	*(scan+1) = (offset>>8)&0377;
668 	*(scan+2) = offset&0377;
669 }
670 
671 /*
672  - regoptail - regtail on operand of first argument; nop if operandless
673  */
674 static void
675 regoptail(char *p, char *val)
676 {
677 	/* "Operandless" and "op != BRANCH" are synonymous in practice. */
678 	if (p == NULL || p == &regdummy || OP(p) != BRANCH)
679 		return;
680 	regtail(OPERAND(p), val);
681 }
682 
683 /*
684  * regexec and friends
685  */
686 
687 /*
688  * Global work variables for regexec().
689  */
690 static char *reginput;		/* String-input pointer. */
691 static char *regbol;		/* Beginning of input, for ^ check. */
692 static char **regstartp;	/* Pointer to startp array. */
693 static char **regendp;		/* Ditto for endp. */
694 
695 /*
696  * Forwards.
697  */
698 STATIC int regtry();
699 STATIC int regmatch();
700 STATIC int regrepeat();
701 
702 #ifdef DEBUG
703 int regnarrate = 0;
704 void regdump();
705 STATIC char *regprop();
706 #endif
707 
708 /*
709  - regexec - match a regexp against a string
710  */
711 int
712 regexec2(regexp *prog, char *string, int notbol)
713 {
714 	char *s;
715 
716 	/* Be paranoid... */
717 	if (prog == NULL || string == NULL) {
718 		regerror("NULL parameter");
719 		return(0);
720 	}
721 
722 	/* Check validity of program. */
723 	if (UCHARAT(prog->program) != MAGIC) {
724 		regerror("corrupted program");
725 		return(0);
726 	}
727 
728 	/* If there is a "must appear" string, look for it. */
729 	if (prog->regmust != NULL) {
730 		s = string;
731 		while ((s = strchr(s, prog->regmust[0])) != NULL) {
732 			if (strncmp(s, prog->regmust, prog->regmlen) == 0)
733 				break;	/* Found it. */
734 			s++;
735 		}
736 		if (s == NULL)	/* Not present. */
737 			return(0);
738 	}
739 
740 	/* Mark beginning of line for ^ . */
741 	if (notbol)
742 		regbol = NULL;
743 	else
744 		regbol = string;
745 
746 	/* Simplest case:  anchored match need be tried only once. */
747 	if (prog->reganch)
748 		return(regtry(prog, string));
749 
750 	/* Messy cases:  unanchored match. */
751 	s = string;
752 	if (prog->regstart != '\0')
753 		/* We know what char it must start with. */
754 		while ((s = strchr(s, prog->regstart)) != NULL) {
755 			if (regtry(prog, s))
756 				return(1);
757 			s++;
758 		}
759 	else
760 		/* We don't -- general case. */
761 		do {
762 			if (regtry(prog, s))
763 				return(1);
764 		} while (*s++ != '\0');
765 
766 	/* Failure. */
767 	return(0);
768 }
769 
770 int
771 regexec(regexp *prog, char *string)
772 {
773 	return regexec2(prog, string, 0);
774 }
775 
776 /*
777  - regtry - try match at specific point
778  */
779 static int			/* 0 failure, 1 success */
780 regtry(regexp *prog, char *string)
781 {
782 	int i;
783 	char **sp;
784 	char **ep;
785 
786 	reginput = string;
787 	regstartp = prog->startp;
788 	regendp = prog->endp;
789 
790 	sp = prog->startp;
791 	ep = prog->endp;
792 	for (i = NSUBEXP; i > 0; i--) {
793 		*sp++ = NULL;
794 		*ep++ = NULL;
795 	}
796 	if (regmatch(prog->program + 1)) {
797 		prog->startp[0] = string;
798 		prog->endp[0] = reginput;
799 		return(1);
800 	} else
801 		return(0);
802 }
803 
804 /*
805  - regmatch - main matching routine
806  *
807  * Conceptually the strategy is simple:  check to see whether the current
808  * node matches, call self recursively to see whether the rest matches,
809  * and then act accordingly.  In practice we make some effort to avoid
810  * recursion, in particular by going through "ordinary" nodes (that don't
811  * need to know whether the rest of the match failed) by a loop instead of
812  * by recursion.
813  */
814 static int			/* 0 failure, 1 success */
815 regmatch(char *prog)
816 {
817 	char *scan;	/* Current node. */
818 	char *next;	/* Next node. */
819 
820 	scan = prog;
821 #ifdef DEBUG
822 	if (scan != NULL && regnarrate)
823 		fprintf(stderr, "%s(\n", regprop(scan));
824 #endif
825 	while (scan != NULL) {
826 #ifdef DEBUG
827 		if (regnarrate)
828 			fprintf(stderr, "%s...\n", regprop(scan));
829 #endif
830 		next = regnext(scan);
831 
832 		switch (OP(scan)) {
833 		case BOL:
834 			if (reginput != regbol)
835 				return(0);
836 			break;
837 		case EOL:
838 			if (*reginput != '\0')
839 				return(0);
840 			break;
841 		case ANY:
842 			if (*reginput == '\0')
843 				return(0);
844 			reginput++;
845 			break;
846 		case EXACTLY: {
847 				int len;
848 				char *opnd;
849 
850 				opnd = OPERAND(scan);
851 				/* Inline the first character, for speed. */
852 				if (*opnd != *reginput)
853 					return(0);
854 				len = (int) strlen(opnd);
855 				if (len > 1 && strncmp(opnd, reginput, len) != 0)
856 					return(0);
857 				reginput += len;
858 			}
859 			break;
860 		case ANYOF:
861  			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
862 				return(0);
863 			reginput++;
864 			break;
865 		case ANYBUT:
866  			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
867 				return(0);
868 			reginput++;
869 			break;
870 		case NOTHING:
871 			break;
872 		case BACK:
873 			break;
874 		case OPEN+1:
875 		case OPEN+2:
876 		case OPEN+3:
877 		case OPEN+4:
878 		case OPEN+5:
879 		case OPEN+6:
880 		case OPEN+7:
881 		case OPEN+8:
882 		case OPEN+9: {
883 				int no;
884 				char *save;
885 
886 				no = OP(scan) - OPEN;
887 				save = reginput;
888 
889 				if (regmatch(next)) {
890 					/*
891 					 * Don't set startp if some later
892 					 * invocation of the same parentheses
893 					 * already has.
894 					 */
895 					if (regstartp[no] == NULL)
896 						regstartp[no] = save;
897 					return(1);
898 				} else
899 					return(0);
900 			}
901 			/* NOTREACHED */
902 			break;
903 		case CLOSE+1:
904 		case CLOSE+2:
905 		case CLOSE+3:
906 		case CLOSE+4:
907 		case CLOSE+5:
908 		case CLOSE+6:
909 		case CLOSE+7:
910 		case CLOSE+8:
911 		case CLOSE+9: {
912 				int no;
913 				char *save;
914 
915 				no = OP(scan) - CLOSE;
916 				save = reginput;
917 
918 				if (regmatch(next)) {
919 					/*
920 					 * Don't set endp if some later
921 					 * invocation of the same parentheses
922 					 * already has.
923 					 */
924 					if (regendp[no] == NULL)
925 						regendp[no] = save;
926 					return(1);
927 				} else
928 					return(0);
929 			}
930 			/* NOTREACHED */
931 			break;
932 		case BRANCH: {
933 				char *save;
934 
935 				if (OP(next) != BRANCH)		/* No choice. */
936 					next = OPERAND(scan);	/* Avoid recursion. */
937 				else {
938 					do {
939 						save = reginput;
940 						if (regmatch(OPERAND(scan)))
941 							return(1);
942 						reginput = save;
943 						scan = regnext(scan);
944 					} while (scan != NULL && OP(scan) == BRANCH);
945 					return(0);
946 					/* NOTREACHED */
947 				}
948 			}
949 			/* NOTREACHED */
950 			break;
951 		case STAR:
952 		case PLUS: {
953 				char nextch;
954 				int no;
955 				char *save;
956 				int min;
957 
958 				/*
959 				 * Lookahead to avoid useless match attempts
960 				 * when we know what character comes next.
961 				 */
962 				nextch = '\0';
963 				if (OP(next) == EXACTLY)
964 					nextch = *OPERAND(next);
965 				min = (OP(scan) == STAR) ? 0 : 1;
966 				save = reginput;
967 				no = regrepeat(OPERAND(scan));
968 				while (no >= min) {
969 					/* If it could work, try it. */
970 					if (nextch == '\0' || *reginput == nextch)
971 						if (regmatch(next))
972 							return(1);
973 					/* Couldn't or didn't -- back up. */
974 					no--;
975 					reginput = save + no;
976 				}
977 				return(0);
978 			}
979 			/* NOTREACHED */
980 			break;
981 		case END:
982 			return(1);	/* Success! */
983 			/* NOTREACHED */
984 			break;
985 		default:
986 			regerror("memory corruption");
987 			return(0);
988 			/* NOTREACHED */
989 			break;
990 		}
991 
992 		scan = next;
993 	}
994 
995 	/*
996 	 * We get here only if there's trouble -- normally "case END" is
997 	 * the terminating point.
998 	 */
999 	regerror("corrupted pointers");
1000 	return(0);
1001 }
1002 
1003 /*
1004  - regrepeat - repeatedly match something simple, report how many
1005  */
1006 static int
1007 regrepeat(char *p)
1008 {
1009 	int count = 0;
1010 	char *scan;
1011 	char *opnd;
1012 
1013 	scan = reginput;
1014 	opnd = OPERAND(p);
1015 	switch (OP(p)) {
1016 	case ANY:
1017 		count = (int) strlen(scan);
1018 		scan += count;
1019 		break;
1020 	case EXACTLY:
1021 		while (*opnd == *scan) {
1022 			count++;
1023 			scan++;
1024 		}
1025 		break;
1026 	case ANYOF:
1027 		while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1028 			count++;
1029 			scan++;
1030 		}
1031 		break;
1032 	case ANYBUT:
1033 		while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1034 			count++;
1035 			scan++;
1036 		}
1037 		break;
1038 	default:		/* Oh dear.  Called inappropriately. */
1039 		regerror("internal foulup");
1040 		count = 0;	/* Best compromise. */
1041 		break;
1042 	}
1043 	reginput = scan;
1044 
1045 	return(count);
1046 }
1047 
1048 /*
1049  - regnext - dig the "next" pointer out of a node
1050  */
1051 static char *
1052 regnext(char *p)
1053 {
1054 	int offset;
1055 
1056 	if (p == &regdummy)
1057 		return(NULL);
1058 
1059 	offset = NEXT(p);
1060 	if (offset == 0)
1061 		return(NULL);
1062 
1063 	if (OP(p) == BACK)
1064 		return(p-offset);
1065 	else
1066 		return(p+offset);
1067 }
1068 
1069 #ifdef DEBUG
1070 
1071 STATIC char *regprop();
1072 
1073 /*
1074  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1075  */
1076 void
1077 regdump(regexp *r)
1078 {
1079 	char *s;
1080 	char op = EXACTLY;	/* Arbitrary non-END op. */
1081 	char *next;
1082 
1083 
1084 	s = r->program + 1;
1085 	while (op != END) {	/* While that wasn't END last time... */
1086 		op = OP(s);
1087 		printf("%2d%s", s-r->program, regprop(s));	/* Where, what. */
1088 		next = regnext(s);
1089 		if (next == NULL)		/* Next ptr. */
1090 			printf("(0)");
1091 		else
1092 			printf("(%d)", (s-r->program)+(next-s));
1093 		s += 3;
1094 		if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1095 			/* Literal string, where present. */
1096 			while (*s != '\0') {
1097 				putchar(*s);
1098 				s++;
1099 			}
1100 			s++;
1101 		}
1102 		putchar('\n');
1103 	}
1104 
1105 	/* Header fields of interest. */
1106 	if (r->regstart != '\0')
1107 		printf("start `%c' ", r->regstart);
1108 	if (r->reganch)
1109 		printf("anchored ");
1110 	if (r->regmust != NULL)
1111 		printf("must have \"%s\"", r->regmust);
1112 	printf("\n");
1113 }
1114 
1115 /*
1116  - regprop - printable representation of opcode
1117  */
1118 static char *
1119 regprop(char *op)
1120 {
1121 	char *p;
1122 	static char buf[50];
1123 
1124 	(void) strcpy(buf, ":");
1125 
1126 	switch (OP(op)) {
1127 	case BOL:
1128 		p = "BOL";
1129 		break;
1130 	case EOL:
1131 		p = "EOL";
1132 		break;
1133 	case ANY:
1134 		p = "ANY";
1135 		break;
1136 	case ANYOF:
1137 		p = "ANYOF";
1138 		break;
1139 	case ANYBUT:
1140 		p = "ANYBUT";
1141 		break;
1142 	case BRANCH:
1143 		p = "BRANCH";
1144 		break;
1145 	case EXACTLY:
1146 		p = "EXACTLY";
1147 		break;
1148 	case NOTHING:
1149 		p = "NOTHING";
1150 		break;
1151 	case BACK:
1152 		p = "BACK";
1153 		break;
1154 	case END:
1155 		p = "END";
1156 		break;
1157 	case OPEN+1:
1158 	case OPEN+2:
1159 	case OPEN+3:
1160 	case OPEN+4:
1161 	case OPEN+5:
1162 	case OPEN+6:
1163 	case OPEN+7:
1164 	case OPEN+8:
1165 	case OPEN+9:
1166 		sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1167 		p = NULL;
1168 		break;
1169 	case CLOSE+1:
1170 	case CLOSE+2:
1171 	case CLOSE+3:
1172 	case CLOSE+4:
1173 	case CLOSE+5:
1174 	case CLOSE+6:
1175 	case CLOSE+7:
1176 	case CLOSE+8:
1177 	case CLOSE+9:
1178 		sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1179 		p = NULL;
1180 		break;
1181 	case STAR:
1182 		p = "STAR";
1183 		break;
1184 	case PLUS:
1185 		p = "PLUS";
1186 		break;
1187 	default:
1188 		regerror("corrupted opcode");
1189 		break;
1190 	}
1191 	if (p != NULL)
1192 		(void) strcat(buf, p);
1193 	return(buf);
1194 }
1195 #endif
1196 
1197 /*
1198  * The following is provided for those people who do not have strcspn() in
1199  * their C libraries.  They should get off their butts and do something
1200  * about it; at least one public-domain implementation of those (highly
1201  * useful) string routines has been published on Usenet.
1202  */
1203 #ifdef STRCSPN
1204 /*
1205  * strcspn - find length of initial segment of s1 consisting entirely
1206  * of characters not from s2
1207  */
1208 
1209 static int
1210 strcspn(char *s1, char *s2)
1211 {
1212 	char *scan1;
1213 	char *scan2;
1214 	int count;
1215 
1216 	count = 0;
1217 	for (scan1 = s1; *scan1 != '\0'; scan1++) {
1218 		for (scan2 = s2; *scan2 != '\0';)	/* ++ moved down. */
1219 			if (*scan1 == *scan2++)
1220 				return(count);
1221 		count++;
1222 	}
1223 	return(count);
1224 }
1225 #endif
1226