xref: /freebsd/contrib/less/regexp.c (revision 0572ccaa4543b0abef8ef81e384c1d04de9f3da1)
1 /*
2  * regcomp and regexec -- regsub and regerror are elsewhere
3  *
4  *	Copyright (c) 1986 by University of Toronto.
5  *	Written by Henry Spencer.  Not derived from licensed software.
6  *
7  *	Permission is granted to anyone to use this software for any
8  *	purpose on any computer system, and to redistribute it freely,
9  *	subject to the following restrictions:
10  *
11  *	1. The author is not responsible for the consequences of use of
12  *		this software, no matter how awful, even if they arise
13  *		from defects in it.
14  *
15  *	2. The origin of this software must not be misrepresented, either
16  *		by explicit claim or by omission.
17  *
18  *	3. Altered versions must be plainly marked as such, and must not
19  *		be misrepresented as being the original software.
20  *
21  * Beware that some of this code is subtly aware of the way operator
22  * precedence is structured in regular expressions.  Serious changes in
23  * regular-expression syntax might require a total rethink.
24  *
25  * *** NOTE: this code has been altered slightly for use in Tcl. ***
26  * Slightly modified by David MacKenzie to undo most of the changes for TCL.
27  * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman
28  */
29 
30 #include "less.h"
31 #if HAVE_STDIO_H
32 #include <stdio.h>
33 #endif
34 #if HAVE_STDLIB_H
35 #include <stdlib.h>
36 #endif
37 #if HAVE_STRING_H
38 #include <string.h>
39 #endif
40 #include "regexp.h"
41 
42 /*
43  * The "internal use only" fields in regexp.h are present to pass info from
44  * compile to execute that permits the execute phase to run lots faster on
45  * simple cases.  They are:
46  *
47  * regstart	char that must begin a match; '\0' if none obvious
48  * reganch	is the match anchored (at beginning-of-line only)?
49  * regmust	string (pointer into program) that match must include, or NULL
50  * regmlen	length of regmust string
51  *
52  * Regstart and reganch permit very fast decisions on suitable starting points
53  * for a match, cutting down the work a lot.  Regmust permits fast rejection
54  * of lines that cannot possibly match.  The regmust tests are costly enough
55  * that regcomp() supplies a regmust only if the r.e. contains something
56  * potentially expensive (at present, the only such thing detected is * or +
57  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
58  * supplied because the test in regexec() needs it and regcomp() is
59  * computing it anyway.
60  */
61 
62 /*
63  * Structure for regexp "program".  This is essentially a linear encoding
64  * of a nondeterministic finite-state machine (aka syntax charts or
65  * "railroad normal form" in parsing technology).  Each node is an opcode
66  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
67  * all nodes except BRANCH implement concatenation; a "next" pointer with
68  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
69  * have one of the subtle syntax dependencies:  an individual BRANCH (as
70  * opposed to a collection of them) is never concatenated with anything
71  * because of operator precedence.)  The operand of some types of node is
72  * a literal string; for others, it is a node leading into a sub-FSM.  In
73  * particular, the operand of a BRANCH node is the first node of the branch.
74  * (NB this is *not* a tree structure:  the tail of the branch connects
75  * to the thing following the set of BRANCHes.)  The opcodes are:
76  */
77 
78 /* definition	number	opnd?	meaning */
79 #undef EOL
80 #define	END	0	/* no	End of program. */
81 #define	BOL	1	/* no	Match "" at beginning of line. */
82 #define	EOL	2	/* no	Match "" at end of line. */
83 #define	ANY	3	/* no	Match any one character. */
84 #define	ANYOF	4	/* str	Match any character in this string. */
85 #define	ANYBUT	5	/* str	Match any character not in this string. */
86 #define	BRANCH	6	/* node	Match this alternative, or the next... */
87 #define	BACK	7	/* no	Match "", "next" ptr points backward. */
88 #define	EXACTLY	8	/* str	Match this string. */
89 #define	NOTHING	9	/* no	Match empty string. */
90 #define	STAR	10	/* node	Match this (simple) thing 0 or more times. */
91 #define	PLUS	11	/* node	Match this (simple) thing 1 or more times. */
92 #define	OPEN	20	/* no	Mark this point in input as start of #n. */
93 			/*	OPEN+1 is number 1, etc. */
94 #define	CLOSE	30	/* no	Analogous to OPEN. */
95 
96 /*
97  * Opcode notes:
98  *
99  * BRANCH	The set of branches constituting a single choice are hooked
100  *		together with their "next" pointers, since precedence prevents
101  *		anything being concatenated to any individual branch.  The
102  *		"next" pointer of the last BRANCH in a choice points to the
103  *		thing following the whole choice.  This is also where the
104  *		final "next" pointer of each individual branch points; each
105  *		branch starts with the operand node of a BRANCH node.
106  *
107  * BACK		Normal "next" pointers all implicitly point forward; BACK
108  *		exists to make loop structures possible.
109  *
110  * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
111  *		BRANCH structures using BACK.  Simple cases (one character
112  *		per match) are implemented with STAR and PLUS for speed
113  *		and to minimize recursive plunges.
114  *
115  * OPEN,CLOSE	...are numbered at compile time.
116  */
117 
118 /*
119  * A node is one char of opcode followed by two chars of "next" pointer.
120  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
121  * value is a positive offset from the opcode of the node containing it.
122  * An operand, if any, simply follows the node.  (Note that much of the
123  * code generation knows about this implicit relationship.)
124  *
125  * Using two bytes for the "next" pointer is vast overkill for most things,
126  * but allows patterns to get big without disasters.
127  */
128 #define	OP(p)	(*(p))
129 #define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
130 #define	OPERAND(p)	((p) + 3)
131 
132 /*
133  * See regmagic.h for one further detail of program structure.
134  */
135 
136 
137 /*
138  * Utility definitions.
139  */
140 #ifndef CHARBITS
141 #define	UCHARAT(p)	((int)*(unsigned char *)(p))
142 #else
143 #define	UCHARAT(p)	((int)*(p)&CHARBITS)
144 #endif
145 
146 #define	FAIL(m)	{ regerror(m); return(NULL); }
147 #define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
148 #define	META	"^$.[()|?+*\\"
149 
150 /*
151  * Flags to be passed up and down.
152  */
153 #define	HASWIDTH	01	/* Known never to match null string. */
154 #define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
155 #define	SPSTART		04	/* Starts with * or +. */
156 #define	WORST		0	/* Worst case. */
157 
158 /*
159  * Global work variables for regcomp().
160  */
161 static char *regparse;		/* Input-scan pointer. */
162 static int regnpar;		/* () count. */
163 static char regdummy;
164 static char *regcode;		/* Code-emit pointer; &regdummy = don't. */
165 static long regsize;		/* Code size. */
166 
167 /*
168  * The first byte of the regexp internal "program" is actually this magic
169  * number; the start node begins in the second byte.
170  */
171 #define	MAGIC	0234
172 
173 
174 /*
175  * Forward declarations for regcomp()'s friends.
176  */
177 #ifndef STATIC
178 #define	STATIC	static
179 #endif
180 STATIC char *reg();
181 STATIC char *regbranch();
182 STATIC char *regpiece();
183 STATIC char *regatom();
184 STATIC char *regnode();
185 STATIC char *regnext();
186 STATIC void regc();
187 STATIC void reginsert();
188 STATIC void regtail();
189 STATIC void regoptail();
190 #ifdef STRCSPN
191 STATIC int strcspn();
192 #endif
193 
194 /*
195  - regcomp - compile a regular expression into internal code
196  *
197  * We can't allocate space until we know how big the compiled form will be,
198  * but we can't compile it (and thus know how big it is) until we've got a
199  * place to put the code.  So we cheat:  we compile it twice, once with code
200  * generation turned off and size counting turned on, and once "for real".
201  * This also means that we don't allocate space until we are sure that the
202  * thing really will compile successfully, and we never have to move the
203  * code and thus invalidate pointers into it.  (Note that it has to be in
204  * one piece because free() must be able to free it all.)
205  *
206  * Beware that the optimization-preparation code in here knows about some
207  * of the structure of the compiled regexp.
208  */
209 regexp *
210 regcomp(exp)
211 char *exp;
212 {
213 	register regexp *r;
214 	register char *scan;
215 	register char *longest;
216 	register int len;
217 	int flags;
218 
219 	if (exp == NULL)
220 		FAIL("NULL argument");
221 
222 	/* First pass: determine size, legality. */
223 	regparse = exp;
224 	regnpar = 1;
225 	regsize = 0L;
226 	regcode = &regdummy;
227 	regc(MAGIC);
228 	if (reg(0, &flags) == NULL)
229 		return(NULL);
230 
231 	/* Small enough for pointer-storage convention? */
232 	if (regsize >= 32767L)		/* Probably could be 65535L. */
233 		FAIL("regexp too big");
234 
235 	/* Allocate space. */
236 	r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
237 	if (r == NULL)
238 		FAIL("out of space");
239 
240 	/* Second pass: emit code. */
241 	regparse = exp;
242 	regnpar = 1;
243 	regcode = r->program;
244 	regc(MAGIC);
245 	if (reg(0, &flags) == NULL)
246 		return(NULL);
247 
248 	/* Dig out information for optimizations. */
249 	r->regstart = '\0';	/* Worst-case defaults. */
250 	r->reganch = 0;
251 	r->regmust = NULL;
252 	r->regmlen = 0;
253 	scan = r->program+1;			/* First BRANCH. */
254 	if (OP(regnext(scan)) == END) {		/* Only one top-level choice. */
255 		scan = OPERAND(scan);
256 
257 		/* Starting-point info. */
258 		if (OP(scan) == EXACTLY)
259 			r->regstart = *OPERAND(scan);
260 		else if (OP(scan) == BOL)
261 			r->reganch++;
262 
263 		/*
264 		 * If there's something expensive in the r.e., find the
265 		 * longest literal string that must appear and make it the
266 		 * regmust.  Resolve ties in favor of later strings, since
267 		 * the regstart check works with the beginning of the r.e.
268 		 * and avoiding duplication strengthens checking.  Not a
269 		 * strong reason, but sufficient in the absence of others.
270 		 */
271 		if (flags&SPSTART) {
272 			longest = NULL;
273 			len = 0;
274 			for (; scan != NULL; scan = regnext(scan))
275 				if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
276 					longest = OPERAND(scan);
277 					len = strlen(OPERAND(scan));
278 				}
279 			r->regmust = longest;
280 			r->regmlen = len;
281 		}
282 	}
283 
284 	return(r);
285 }
286 
287 /*
288  - reg - regular expression, i.e. main body or parenthesized thing
289  *
290  * Caller must absorb opening parenthesis.
291  *
292  * Combining parenthesis handling with the base level of regular expression
293  * is a trifle forced, but the need to tie the tails of the branches to what
294  * follows makes it hard to avoid.
295  */
296 static char *
297 reg(paren, flagp)
298 int paren;			/* Parenthesized? */
299 int *flagp;
300 {
301 	register char *ret;
302 	register char *br;
303 	register char *ender;
304 	register int parno = 0;
305 	int flags;
306 
307 	*flagp = HASWIDTH;	/* Tentatively. */
308 
309 	/* Make an OPEN node, if parenthesized. */
310 	if (paren) {
311 		if (regnpar >= NSUBEXP)
312 			FAIL("too many ()");
313 		parno = regnpar;
314 		regnpar++;
315 		ret = regnode(OPEN+parno);
316 	} else
317 		ret = NULL;
318 
319 	/* Pick up the branches, linking them together. */
320 	br = regbranch(&flags);
321 	if (br == NULL)
322 		return(NULL);
323 	if (ret != NULL)
324 		regtail(ret, br);	/* OPEN -> first. */
325 	else
326 		ret = br;
327 	if (!(flags&HASWIDTH))
328 		*flagp &= ~HASWIDTH;
329 	*flagp |= flags&SPSTART;
330 	while (*regparse == '|') {
331 		regparse++;
332 		br = regbranch(&flags);
333 		if (br == NULL)
334 			return(NULL);
335 		regtail(ret, br);	/* BRANCH -> BRANCH. */
336 		if (!(flags&HASWIDTH))
337 			*flagp &= ~HASWIDTH;
338 		*flagp |= flags&SPSTART;
339 	}
340 
341 	/* Make a closing node, and hook it on the end. */
342 	ender = regnode((paren) ? CLOSE+parno : END);
343 	regtail(ret, ender);
344 
345 	/* Hook the tails of the branches to the closing node. */
346 	for (br = ret; br != NULL; br = regnext(br))
347 		regoptail(br, ender);
348 
349 	/* Check for proper termination. */
350 	if (paren && *regparse++ != ')') {
351 		FAIL("unmatched ()");
352 	} else if (!paren && *regparse != '\0') {
353 		if (*regparse == ')') {
354 			FAIL("unmatched ()");
355 		} else
356 			FAIL("junk on end");	/* "Can't happen". */
357 		/* NOTREACHED */
358 	}
359 
360 	return(ret);
361 }
362 
363 /*
364  - regbranch - one alternative of an | operator
365  *
366  * Implements the concatenation operator.
367  */
368 static char *
369 regbranch(flagp)
370 int *flagp;
371 {
372 	register char *ret;
373 	register char *chain;
374 	register char *latest;
375 	int flags;
376 
377 	*flagp = WORST;		/* Tentatively. */
378 
379 	ret = regnode(BRANCH);
380 	chain = NULL;
381 	while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
382 		latest = regpiece(&flags);
383 		if (latest == NULL)
384 			return(NULL);
385 		*flagp |= flags&HASWIDTH;
386 		if (chain == NULL)	/* First piece. */
387 			*flagp |= flags&SPSTART;
388 		else
389 			regtail(chain, latest);
390 		chain = latest;
391 	}
392 	if (chain == NULL)	/* Loop ran zero times. */
393 		(void) regnode(NOTHING);
394 
395 	return(ret);
396 }
397 
398 /*
399  - regpiece - something followed by possible [*+?]
400  *
401  * Note that the branching code sequences used for ? and the general cases
402  * of * and + are somewhat optimized:  they use the same NOTHING node as
403  * both the endmarker for their branch list and the body of the last branch.
404  * It might seem that this node could be dispensed with entirely, but the
405  * endmarker role is not redundant.
406  */
407 static char *
408 regpiece(flagp)
409 int *flagp;
410 {
411 	register char *ret;
412 	register char op;
413 	register char *next;
414 	int flags;
415 
416 	ret = regatom(&flags);
417 	if (ret == NULL)
418 		return(NULL);
419 
420 	op = *regparse;
421 	if (!ISMULT(op)) {
422 		*flagp = flags;
423 		return(ret);
424 	}
425 
426 	if (!(flags&HASWIDTH) && op != '?')
427 		FAIL("*+ operand could be empty");
428 	*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
429 
430 	if (op == '*' && (flags&SIMPLE))
431 		reginsert(STAR, ret);
432 	else if (op == '*') {
433 		/* Emit x* as (x&|), where & means "self". */
434 		reginsert(BRANCH, ret);			/* Either x */
435 		regoptail(ret, regnode(BACK));		/* and loop */
436 		regoptail(ret, ret);			/* back */
437 		regtail(ret, regnode(BRANCH));		/* or */
438 		regtail(ret, regnode(NOTHING));		/* null. */
439 	} else if (op == '+' && (flags&SIMPLE))
440 		reginsert(PLUS, ret);
441 	else if (op == '+') {
442 		/* Emit x+ as x(&|), where & means "self". */
443 		next = regnode(BRANCH);			/* Either */
444 		regtail(ret, next);
445 		regtail(regnode(BACK), ret);		/* loop back */
446 		regtail(next, regnode(BRANCH));		/* or */
447 		regtail(ret, regnode(NOTHING));		/* null. */
448 	} else if (op == '?') {
449 		/* Emit x? as (x|) */
450 		reginsert(BRANCH, ret);			/* Either x */
451 		regtail(ret, regnode(BRANCH));		/* or */
452 		next = regnode(NOTHING);		/* null. */
453 		regtail(ret, next);
454 		regoptail(ret, next);
455 	}
456 	regparse++;
457 	if (ISMULT(*regparse))
458 		FAIL("nested *?+");
459 
460 	return(ret);
461 }
462 
463 /*
464  - regatom - the lowest level
465  *
466  * Optimization:  gobbles an entire sequence of ordinary characters so that
467  * it can turn them into a single node, which is smaller to store and
468  * faster to run.  Backslashed characters are exceptions, each becoming a
469  * separate node; the code is simpler that way and it's not worth fixing.
470  */
471 static char *
472 regatom(flagp)
473 int *flagp;
474 {
475 	register char *ret;
476 	int flags;
477 
478 	*flagp = WORST;		/* Tentatively. */
479 
480 	switch (*regparse++) {
481 	case '^':
482 		ret = regnode(BOL);
483 		break;
484 	case '$':
485 		ret = regnode(EOL);
486 		break;
487 	case '.':
488 		ret = regnode(ANY);
489 		*flagp |= HASWIDTH|SIMPLE;
490 		break;
491 	case '[': {
492 			register int clss;
493 			register int classend;
494 
495 			if (*regparse == '^') {	/* Complement of range. */
496 				ret = regnode(ANYBUT);
497 				regparse++;
498 			} else
499 				ret = regnode(ANYOF);
500 			if (*regparse == ']' || *regparse == '-')
501 				regc(*regparse++);
502 			while (*regparse != '\0' && *regparse != ']') {
503 				if (*regparse == '-') {
504 					regparse++;
505 					if (*regparse == ']' || *regparse == '\0')
506 						regc('-');
507 					else {
508 						clss = UCHARAT(regparse-2)+1;
509 						classend = UCHARAT(regparse);
510 						if (clss > classend+1)
511 							FAIL("invalid [] range");
512 						for (; clss <= classend; clss++)
513 							regc(clss);
514 						regparse++;
515 					}
516 				} else
517 					regc(*regparse++);
518 			}
519 			regc('\0');
520 			if (*regparse != ']')
521 				FAIL("unmatched []");
522 			regparse++;
523 			*flagp |= HASWIDTH|SIMPLE;
524 		}
525 		break;
526 	case '(':
527 		ret = reg(1, &flags);
528 		if (ret == NULL)
529 			return(NULL);
530 		*flagp |= flags&(HASWIDTH|SPSTART);
531 		break;
532 	case '\0':
533 	case '|':
534 	case ')':
535 		FAIL("internal urp");	/* Supposed to be caught earlier. */
536 		/* NOTREACHED */
537 		break;
538 	case '?':
539 	case '+':
540 	case '*':
541 		FAIL("?+* follows nothing");
542 		/* NOTREACHED */
543 		break;
544 	case '\\':
545 		if (*regparse == '\0')
546 			FAIL("trailing \\");
547 		ret = regnode(EXACTLY);
548 		regc(*regparse++);
549 		regc('\0');
550 		*flagp |= HASWIDTH|SIMPLE;
551 		break;
552 	default: {
553 			register int len;
554 			register char ender;
555 
556 			regparse--;
557 			len = strcspn(regparse, META);
558 			if (len <= 0)
559 				FAIL("internal disaster");
560 			ender = *(regparse+len);
561 			if (len > 1 && ISMULT(ender))
562 				len--;		/* Back off clear of ?+* operand. */
563 			*flagp |= HASWIDTH;
564 			if (len == 1)
565 				*flagp |= SIMPLE;
566 			ret = regnode(EXACTLY);
567 			while (len > 0) {
568 				regc(*regparse++);
569 				len--;
570 			}
571 			regc('\0');
572 		}
573 		break;
574 	}
575 
576 	return(ret);
577 }
578 
579 /*
580  - regnode - emit a node
581  */
582 static char *			/* Location. */
583 regnode(op)
584 char op;
585 {
586 	register char *ret;
587 	register char *ptr;
588 
589 	ret = regcode;
590 	if (ret == &regdummy) {
591 		regsize += 3;
592 		return(ret);
593 	}
594 
595 	ptr = ret;
596 	*ptr++ = op;
597 	*ptr++ = '\0';		/* Null "next" pointer. */
598 	*ptr++ = '\0';
599 	regcode = ptr;
600 
601 	return(ret);
602 }
603 
604 /*
605  - regc - emit (if appropriate) a byte of code
606  */
607 static void
608 regc(b)
609 char b;
610 {
611 	if (regcode != &regdummy)
612 		*regcode++ = b;
613 	else
614 		regsize++;
615 }
616 
617 /*
618  - reginsert - insert an operator in front of already-emitted operand
619  *
620  * Means relocating the operand.
621  */
622 static void
623 reginsert(op, opnd)
624 char op;
625 char *opnd;
626 {
627 	register char *src;
628 	register char *dst;
629 	register char *place;
630 
631 	if (regcode == &regdummy) {
632 		regsize += 3;
633 		return;
634 	}
635 
636 	src = regcode;
637 	regcode += 3;
638 	dst = regcode;
639 	while (src > opnd)
640 		*--dst = *--src;
641 
642 	place = opnd;		/* Op node, where operand used to be. */
643 	*place++ = op;
644 	*place++ = '\0';
645 	*place++ = '\0';
646 }
647 
648 /*
649  - regtail - set the next-pointer at the end of a node chain
650  */
651 static void
652 regtail(p, val)
653 char *p;
654 char *val;
655 {
656 	register char *scan;
657 	register char *temp;
658 	register int offset;
659 
660 	if (p == &regdummy)
661 		return;
662 
663 	/* Find last node. */
664 	scan = p;
665 	for (;;) {
666 		temp = regnext(scan);
667 		if (temp == NULL)
668 			break;
669 		scan = temp;
670 	}
671 
672 	if (OP(scan) == BACK)
673 		offset = scan - val;
674 	else
675 		offset = val - scan;
676 	*(scan+1) = (offset>>8)&0377;
677 	*(scan+2) = offset&0377;
678 }
679 
680 /*
681  - regoptail - regtail on operand of first argument; nop if operandless
682  */
683 static void
684 regoptail(p, val)
685 char *p;
686 char *val;
687 {
688 	/* "Operandless" and "op != BRANCH" are synonymous in practice. */
689 	if (p == NULL || p == &regdummy || OP(p) != BRANCH)
690 		return;
691 	regtail(OPERAND(p), val);
692 }
693 
694 /*
695  * regexec and friends
696  */
697 
698 /*
699  * Global work variables for regexec().
700  */
701 static char *reginput;		/* String-input pointer. */
702 static char *regbol;		/* Beginning of input, for ^ check. */
703 static char **regstartp;	/* Pointer to startp array. */
704 static char **regendp;		/* Ditto for endp. */
705 
706 /*
707  * Forwards.
708  */
709 STATIC int regtry();
710 STATIC int regmatch();
711 STATIC int regrepeat();
712 
713 #ifdef DEBUG
714 int regnarrate = 0;
715 void regdump();
716 STATIC char *regprop();
717 #endif
718 
719 /*
720  - regexec - match a regexp against a string
721  */
722 int
723 regexec2(prog, string, notbol)
724 register regexp *prog;
725 register char *string;
726 int notbol;
727 {
728 	register char *s;
729 
730 	/* Be paranoid... */
731 	if (prog == NULL || string == NULL) {
732 		regerror("NULL parameter");
733 		return(0);
734 	}
735 
736 	/* Check validity of program. */
737 	if (UCHARAT(prog->program) != MAGIC) {
738 		regerror("corrupted program");
739 		return(0);
740 	}
741 
742 	/* If there is a "must appear" string, look for it. */
743 	if (prog->regmust != NULL) {
744 		s = string;
745 		while ((s = strchr(s, prog->regmust[0])) != NULL) {
746 			if (strncmp(s, prog->regmust, prog->regmlen) == 0)
747 				break;	/* Found it. */
748 			s++;
749 		}
750 		if (s == NULL)	/* Not present. */
751 			return(0);
752 	}
753 
754 	/* Mark beginning of line for ^ . */
755 	if (notbol)
756 		regbol = NULL;
757 	else
758 		regbol = string;
759 
760 	/* Simplest case:  anchored match need be tried only once. */
761 	if (prog->reganch)
762 		return(regtry(prog, string));
763 
764 	/* Messy cases:  unanchored match. */
765 	s = string;
766 	if (prog->regstart != '\0')
767 		/* We know what char it must start with. */
768 		while ((s = strchr(s, prog->regstart)) != NULL) {
769 			if (regtry(prog, s))
770 				return(1);
771 			s++;
772 		}
773 	else
774 		/* We don't -- general case. */
775 		do {
776 			if (regtry(prog, s))
777 				return(1);
778 		} while (*s++ != '\0');
779 
780 	/* Failure. */
781 	return(0);
782 }
783 
784 int
785 regexec(prog, string)
786 register regexp *prog;
787 register char *string;
788 {
789 	return regexec2(prog, string, 0);
790 }
791 
792 /*
793  - regtry - try match at specific point
794  */
795 static int			/* 0 failure, 1 success */
796 regtry(prog, string)
797 regexp *prog;
798 char *string;
799 {
800 	register int i;
801 	register char **sp;
802 	register char **ep;
803 
804 	reginput = string;
805 	regstartp = prog->startp;
806 	regendp = prog->endp;
807 
808 	sp = prog->startp;
809 	ep = prog->endp;
810 	for (i = NSUBEXP; i > 0; i--) {
811 		*sp++ = NULL;
812 		*ep++ = NULL;
813 	}
814 	if (regmatch(prog->program + 1)) {
815 		prog->startp[0] = string;
816 		prog->endp[0] = reginput;
817 		return(1);
818 	} else
819 		return(0);
820 }
821 
822 /*
823  - regmatch - main matching routine
824  *
825  * Conceptually the strategy is simple:  check to see whether the current
826  * node matches, call self recursively to see whether the rest matches,
827  * and then act accordingly.  In practice we make some effort to avoid
828  * recursion, in particular by going through "ordinary" nodes (that don't
829  * need to know whether the rest of the match failed) by a loop instead of
830  * by recursion.
831  */
832 static int			/* 0 failure, 1 success */
833 regmatch(prog)
834 char *prog;
835 {
836 	register char *scan;	/* Current node. */
837 	char *next;		/* Next node. */
838 
839 	scan = prog;
840 #ifdef DEBUG
841 	if (scan != NULL && regnarrate)
842 		fprintf(stderr, "%s(\n", regprop(scan));
843 #endif
844 	while (scan != NULL) {
845 #ifdef DEBUG
846 		if (regnarrate)
847 			fprintf(stderr, "%s...\n", regprop(scan));
848 #endif
849 		next = regnext(scan);
850 
851 		switch (OP(scan)) {
852 		case BOL:
853 			if (reginput != regbol)
854 				return(0);
855 			break;
856 		case EOL:
857 			if (*reginput != '\0')
858 				return(0);
859 			break;
860 		case ANY:
861 			if (*reginput == '\0')
862 				return(0);
863 			reginput++;
864 			break;
865 		case EXACTLY: {
866 				register int len;
867 				register char *opnd;
868 
869 				opnd = OPERAND(scan);
870 				/* Inline the first character, for speed. */
871 				if (*opnd != *reginput)
872 					return(0);
873 				len = strlen(opnd);
874 				if (len > 1 && strncmp(opnd, reginput, len) != 0)
875 					return(0);
876 				reginput += len;
877 			}
878 			break;
879 		case ANYOF:
880  			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
881 				return(0);
882 			reginput++;
883 			break;
884 		case ANYBUT:
885  			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
886 				return(0);
887 			reginput++;
888 			break;
889 		case NOTHING:
890 			break;
891 		case BACK:
892 			break;
893 		case OPEN+1:
894 		case OPEN+2:
895 		case OPEN+3:
896 		case OPEN+4:
897 		case OPEN+5:
898 		case OPEN+6:
899 		case OPEN+7:
900 		case OPEN+8:
901 		case OPEN+9: {
902 				register int no;
903 				register char *save;
904 
905 				no = OP(scan) - OPEN;
906 				save = reginput;
907 
908 				if (regmatch(next)) {
909 					/*
910 					 * Don't set startp if some later
911 					 * invocation of the same parentheses
912 					 * already has.
913 					 */
914 					if (regstartp[no] == NULL)
915 						regstartp[no] = save;
916 					return(1);
917 				} else
918 					return(0);
919 			}
920 			/* NOTREACHED */
921 			break;
922 		case CLOSE+1:
923 		case CLOSE+2:
924 		case CLOSE+3:
925 		case CLOSE+4:
926 		case CLOSE+5:
927 		case CLOSE+6:
928 		case CLOSE+7:
929 		case CLOSE+8:
930 		case CLOSE+9: {
931 				register int no;
932 				register char *save;
933 
934 				no = OP(scan) - CLOSE;
935 				save = reginput;
936 
937 				if (regmatch(next)) {
938 					/*
939 					 * Don't set endp if some later
940 					 * invocation of the same parentheses
941 					 * already has.
942 					 */
943 					if (regendp[no] == NULL)
944 						regendp[no] = save;
945 					return(1);
946 				} else
947 					return(0);
948 			}
949 			/* NOTREACHED */
950 			break;
951 		case BRANCH: {
952 				register char *save;
953 
954 				if (OP(next) != BRANCH)		/* No choice. */
955 					next = OPERAND(scan);	/* Avoid recursion. */
956 				else {
957 					do {
958 						save = reginput;
959 						if (regmatch(OPERAND(scan)))
960 							return(1);
961 						reginput = save;
962 						scan = regnext(scan);
963 					} while (scan != NULL && OP(scan) == BRANCH);
964 					return(0);
965 					/* NOTREACHED */
966 				}
967 			}
968 			/* NOTREACHED */
969 			break;
970 		case STAR:
971 		case PLUS: {
972 				register char nextch;
973 				register int no;
974 				register char *save;
975 				register int min;
976 
977 				/*
978 				 * Lookahead to avoid useless match attempts
979 				 * when we know what character comes next.
980 				 */
981 				nextch = '\0';
982 				if (OP(next) == EXACTLY)
983 					nextch = *OPERAND(next);
984 				min = (OP(scan) == STAR) ? 0 : 1;
985 				save = reginput;
986 				no = regrepeat(OPERAND(scan));
987 				while (no >= min) {
988 					/* If it could work, try it. */
989 					if (nextch == '\0' || *reginput == nextch)
990 						if (regmatch(next))
991 							return(1);
992 					/* Couldn't or didn't -- back up. */
993 					no--;
994 					reginput = save + no;
995 				}
996 				return(0);
997 			}
998 			/* NOTREACHED */
999 			break;
1000 		case END:
1001 			return(1);	/* Success! */
1002 			/* NOTREACHED */
1003 			break;
1004 		default:
1005 			regerror("memory corruption");
1006 			return(0);
1007 			/* NOTREACHED */
1008 			break;
1009 		}
1010 
1011 		scan = next;
1012 	}
1013 
1014 	/*
1015 	 * We get here only if there's trouble -- normally "case END" is
1016 	 * the terminating point.
1017 	 */
1018 	regerror("corrupted pointers");
1019 	return(0);
1020 }
1021 
1022 /*
1023  - regrepeat - repeatedly match something simple, report how many
1024  */
1025 static int
1026 regrepeat(p)
1027 char *p;
1028 {
1029 	register int count = 0;
1030 	register char *scan;
1031 	register char *opnd;
1032 
1033 	scan = reginput;
1034 	opnd = OPERAND(p);
1035 	switch (OP(p)) {
1036 	case ANY:
1037 		count = strlen(scan);
1038 		scan += count;
1039 		break;
1040 	case EXACTLY:
1041 		while (*opnd == *scan) {
1042 			count++;
1043 			scan++;
1044 		}
1045 		break;
1046 	case ANYOF:
1047 		while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1048 			count++;
1049 			scan++;
1050 		}
1051 		break;
1052 	case ANYBUT:
1053 		while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1054 			count++;
1055 			scan++;
1056 		}
1057 		break;
1058 	default:		/* Oh dear.  Called inappropriately. */
1059 		regerror("internal foulup");
1060 		count = 0;	/* Best compromise. */
1061 		break;
1062 	}
1063 	reginput = scan;
1064 
1065 	return(count);
1066 }
1067 
1068 /*
1069  - regnext - dig the "next" pointer out of a node
1070  */
1071 static char *
1072 regnext(p)
1073 register char *p;
1074 {
1075 	register int offset;
1076 
1077 	if (p == &regdummy)
1078 		return(NULL);
1079 
1080 	offset = NEXT(p);
1081 	if (offset == 0)
1082 		return(NULL);
1083 
1084 	if (OP(p) == BACK)
1085 		return(p-offset);
1086 	else
1087 		return(p+offset);
1088 }
1089 
1090 #ifdef DEBUG
1091 
1092 STATIC char *regprop();
1093 
1094 /*
1095  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1096  */
1097 void
1098 regdump(r)
1099 regexp *r;
1100 {
1101 	register char *s;
1102 	register char op = EXACTLY;	/* Arbitrary non-END op. */
1103 	register char *next;
1104 
1105 
1106 	s = r->program + 1;
1107 	while (op != END) {	/* While that wasn't END last time... */
1108 		op = OP(s);
1109 		printf("%2d%s", s-r->program, regprop(s));	/* Where, what. */
1110 		next = regnext(s);
1111 		if (next == NULL)		/* Next ptr. */
1112 			printf("(0)");
1113 		else
1114 			printf("(%d)", (s-r->program)+(next-s));
1115 		s += 3;
1116 		if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1117 			/* Literal string, where present. */
1118 			while (*s != '\0') {
1119 				putchar(*s);
1120 				s++;
1121 			}
1122 			s++;
1123 		}
1124 		putchar('\n');
1125 	}
1126 
1127 	/* Header fields of interest. */
1128 	if (r->regstart != '\0')
1129 		printf("start `%c' ", r->regstart);
1130 	if (r->reganch)
1131 		printf("anchored ");
1132 	if (r->regmust != NULL)
1133 		printf("must have \"%s\"", r->regmust);
1134 	printf("\n");
1135 }
1136 
1137 /*
1138  - regprop - printable representation of opcode
1139  */
1140 static char *
1141 regprop(op)
1142 char *op;
1143 {
1144 	register char *p;
1145 	static char buf[50];
1146 
1147 	(void) strcpy(buf, ":");
1148 
1149 	switch (OP(op)) {
1150 	case BOL:
1151 		p = "BOL";
1152 		break;
1153 	case EOL:
1154 		p = "EOL";
1155 		break;
1156 	case ANY:
1157 		p = "ANY";
1158 		break;
1159 	case ANYOF:
1160 		p = "ANYOF";
1161 		break;
1162 	case ANYBUT:
1163 		p = "ANYBUT";
1164 		break;
1165 	case BRANCH:
1166 		p = "BRANCH";
1167 		break;
1168 	case EXACTLY:
1169 		p = "EXACTLY";
1170 		break;
1171 	case NOTHING:
1172 		p = "NOTHING";
1173 		break;
1174 	case BACK:
1175 		p = "BACK";
1176 		break;
1177 	case END:
1178 		p = "END";
1179 		break;
1180 	case OPEN+1:
1181 	case OPEN+2:
1182 	case OPEN+3:
1183 	case OPEN+4:
1184 	case OPEN+5:
1185 	case OPEN+6:
1186 	case OPEN+7:
1187 	case OPEN+8:
1188 	case OPEN+9:
1189 		sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1190 		p = NULL;
1191 		break;
1192 	case CLOSE+1:
1193 	case CLOSE+2:
1194 	case CLOSE+3:
1195 	case CLOSE+4:
1196 	case CLOSE+5:
1197 	case CLOSE+6:
1198 	case CLOSE+7:
1199 	case CLOSE+8:
1200 	case CLOSE+9:
1201 		sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1202 		p = NULL;
1203 		break;
1204 	case STAR:
1205 		p = "STAR";
1206 		break;
1207 	case PLUS:
1208 		p = "PLUS";
1209 		break;
1210 	default:
1211 		regerror("corrupted opcode");
1212 		break;
1213 	}
1214 	if (p != NULL)
1215 		(void) strcat(buf, p);
1216 	return(buf);
1217 }
1218 #endif
1219 
1220 /*
1221  * The following is provided for those people who do not have strcspn() in
1222  * their C libraries.  They should get off their butts and do something
1223  * about it; at least one public-domain implementation of those (highly
1224  * useful) string routines has been published on Usenet.
1225  */
1226 #ifdef STRCSPN
1227 /*
1228  * strcspn - find length of initial segment of s1 consisting entirely
1229  * of characters not from s2
1230  */
1231 
1232 static int
1233 strcspn(s1, s2)
1234 char *s1;
1235 char *s2;
1236 {
1237 	register char *scan1;
1238 	register char *scan2;
1239 	register int count;
1240 
1241 	count = 0;
1242 	for (scan1 = s1; *scan1 != '\0'; scan1++) {
1243 		for (scan2 = s2; *scan2 != '\0';)	/* ++ moved down. */
1244 			if (*scan1 == *scan2++)
1245 				return(count);
1246 		count++;
1247 	}
1248 	return(count);
1249 }
1250 #endif
1251