xref: /freebsd/usr.bin/vgrind/regexp.c (revision 8d20be1e22095c27faf8fe8b2f0d089739cc742e)
1 /*
2  * Copyright (c) 1980, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 4. Neither the name of the University nor the names of its contributors
15  *    may be used to endorse or promote products derived from this software
16  *    without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 
33 __FBSDID("$FreeBSD$");
34 
35 #ifndef lint
36 static const char copyright[] =
37 "@(#) Copyright (c) 1980, 1993\n\
38 	The Regents of the University of California.  All rights reserved.\n";
39 #endif
40 
41 #ifndef lint
42 static const char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
43 #endif
44 
45 #include <ctype.h>
46 #include <stdlib.h>
47 #include <string.h>
48 
49 #include "extern.h"
50 
51 #define FALSE	0
52 #define TRUE	!(FALSE)
53 #define NIL	0
54 
55 static void	expconv(void);
56 
57 boolean	 _escaped;	/* true if we are currently _escaped */
58 char	*s_start;	/* start of string */
59 boolean	 l_onecase;	/* true if upper and lower equivalent */
60 
61 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
62 
63 /*  STRNCMP -	like strncmp except that we convert the
64  *	 	first string to lower case before comparing
65  *		if l_onecase is set.
66  */
67 
68 int
69 STRNCMP(s1, s2, len)
70 	register char *s1,*s2;
71 	register int len;
72 {
73 	if (l_onecase) {
74 	    do
75 		if (*s2 - makelower(*s1))
76 			return (*s2 - makelower(*s1));
77 		else {
78 			s2++;
79 			s1++;
80 		}
81 	    while (--len);
82 	} else {
83 	    do
84 		if (*s2 - *s1)
85 			return (*s2 - *s1);
86 		else {
87 			s2++;
88 			s1++;
89 		}
90 	    while (--len);
91 	}
92 	return(0);
93 }
94 
95 /*	The following routine converts an irregular expression to
96  *	internal format.
97  *
98  *	Either meta symbols (\a \d or \p) or character strings or
99  *	operations ( alternation or perenthesizing ) can be
100  *	specified.  Each starts with a descriptor byte.  The descriptor
101  *	byte has STR set for strings, META set for meta symbols
102  *	and OPER set for operations.
103  *	The descriptor byte can also have the OPT bit set if the object
104  *	defined is optional.  Also ALT can be set to indicate an alternation.
105  *
106  *	For metasymbols the byte following the descriptor byte identities
107  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
108  *	strings the byte after the descriptor is a character count for
109  *	the string:
110  *
111  *		meta symbols := descriptor
112  *				symbol
113  *
114  *		strings :=	descriptor
115  *				character count
116  *				the string
117  *
118  *		operatins :=	descriptor
119  *				symbol
120  *				character count
121  */
122 
123 /*
124  *  handy macros for accessing parts of match blocks
125  */
126 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
127 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
128 
129 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
130 #define OCNT(A) (*(A+2))	/* character count */
131 #define ONEXT(A) (A+3)		/* next character after the operation */
132 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
133 
134 #define SCNT(A) (*(A+1))	/* byte count of a string */
135 #define SSTR(A) (A+2)		/* address of the string */
136 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
137 
138 /*
139  *  bit flags in the descriptor
140  */
141 #define OPT 1
142 #define STR 2
143 #define META 4
144 #define ALT 8
145 #define OPER 16
146 
147 static char *ccre;	/* pointer to current position in converted exp*/
148 static char *ure;	/* pointer current position in unconverted exp */
149 
150 char *
151 convexp(re)
152     char *re;		/* unconverted irregular expression */
153 {
154     register char *cre;		/* pointer to converted regular expression */
155 
156     /* allocate room for the converted expression */
157     if (re == NIL)
158 	return (NIL);
159     if (*re == '\0')
160 	return (NIL);
161     cre = malloc (4 * strlen(re) + 3);
162     ccre = cre;
163     ure = re;
164 
165     /* start the conversion with a \a */
166     *cre = META | OPT;
167     MSYM(cre) = 'a';
168     ccre = MNEXT(cre);
169 
170     /* start the conversion (its recursive) */
171     expconv ();
172     *ccre = 0;
173     return (cre);
174 }
175 
176 static void
177 expconv()
178 {
179     register char *cs;		/* pointer to current symbol in converted exp */
180     register char c;		/* character being processed */
181     register char *acs;		/* pinter to last alternate */
182     register int temp;
183 
184     /* let the conversion begin */
185     acs = NIL;
186     cs = NIL;
187     while (*ure != NIL) {
188 	switch (c = *ure++) {
189 
190 	case '\\':
191 	    switch (c = *ure++) {
192 
193 	    /* escaped characters are just characters */
194 	    default:
195 		if (cs == NIL || (*cs & STR) == 0) {
196 		    cs = ccre;
197 		    *cs = STR;
198 		    SCNT(cs) = 1;
199 		    ccre += 2;
200 		} else
201 		    SCNT(cs)++;
202 		*ccre++ = c;
203 		break;
204 
205 	    /* normal(?) metacharacters */
206 	    case 'a':
207 	    case 'd':
208 	    case 'e':
209 	    case 'p':
210 		if (acs != NIL && acs != cs) {
211 		    do {
212 			temp = OCNT(acs);
213 			OCNT(acs) = ccre - acs;
214 			acs -= temp;
215 		    } while (temp != 0);
216 		    acs = NIL;
217 		}
218 		cs = ccre;
219 		*cs = META;
220 		MSYM(cs) = c;
221 		ccre = MNEXT(cs);
222 		break;
223 	    }
224 	    break;
225 
226 	/* just put the symbol in */
227 	case '^':
228 	case '$':
229 	    if (acs != NIL && acs != cs) {
230 		do {
231 		    temp = OCNT(acs);
232 		    OCNT(acs) = ccre - acs;
233 		    acs -= temp;
234 		} while (temp != 0);
235 		acs = NIL;
236 	    }
237 	    cs = ccre;
238 	    *cs = META;
239 	    MSYM(cs) = c;
240 	    ccre = MNEXT(cs);
241 	    break;
242 
243 	/* mark the last match sequence as optional */
244 	case '?':
245 	    if (cs)
246 	    	*cs = *cs | OPT;
247 	    break;
248 
249 	/* recurse and define a subexpression */
250 	case '(':
251 	    if (acs != NIL && acs != cs) {
252 		do {
253 		    temp = OCNT(acs);
254 		    OCNT(acs) = ccre - acs;
255 		    acs -= temp;
256 		} while (temp != 0);
257 		acs = NIL;
258 	    }
259 	    cs = ccre;
260 	    *cs = OPER;
261 	    OSYM(cs) = '(';
262 	    ccre = ONEXT(cs);
263 	    expconv ();
264 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
265 	    break;
266 
267 	/* return from a recursion */
268 	case ')':
269 	    if (acs != NIL) {
270 		do {
271 		    temp = OCNT(acs);
272 		    OCNT(acs) = ccre - acs;
273 		    acs -= temp;
274 		} while (temp != 0);
275 		acs = NIL;
276 	    }
277 	    cs = ccre;
278 	    *cs = META;
279 	    MSYM(cs) = c;
280 	    ccre = MNEXT(cs);
281 	    return;
282 
283 	/* mark the last match sequence as having an alternate */
284 	/* the third byte will contain an offset to jump over the */
285 	/* alternate match in case the first did not fail */
286 	case '|':
287 	    if (acs != NIL && acs != cs)
288 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
289 	    else
290 		OCNT(ccre) = 0;
291 	    *cs |= ALT;
292 	    cs = ccre;
293 	    *cs = OPER;
294 	    OSYM(cs) = '|';
295 	    ccre = ONEXT(cs);
296 	    acs = cs;	/* remember that the pointer is to be filles */
297 	    break;
298 
299 	/* if its not a metasymbol just build a scharacter string */
300 	default:
301 	    if (cs == NIL || (*cs & STR) == 0) {
302 		cs = ccre;
303 		*cs = STR;
304 		SCNT(cs) = 1;
305 		ccre = SSTR(cs);
306 	    } else
307 		SCNT(cs)++;
308 	    *ccre++ = c;
309 	    break;
310 	}
311     }
312     if (acs != NIL) {
313 	do {
314 	    temp = OCNT(acs);
315 	    OCNT(acs) = ccre - acs;
316 	    acs -= temp;
317 	} while (temp != 0);
318 	acs = NIL;
319     }
320     return;
321 }
322 /* end of convertre */
323 
324 
325 /*
326  *	The following routine recognises an irregular expresion
327  *	with the following special characters:
328  *
329  *		\?	-	means last match was optional
330  *		\a	-	matches any number of characters
331  *		\d	-	matches any number of spaces and tabs
332  *		\p	-	matches any number of alphanumeric
333  *				characters. The
334  *				characters matched will be copied into
335  *				the area pointed to by 'name'.
336  *		\|	-	alternation
337  *		\( \)	-	grouping used mostly for alternation and
338  *				optionality
339  *
340  *	The irregular expression must be translated to internal form
341  *	prior to calling this routine
342  *
343  *	The value returned is the pointer to the first non \a
344  *	character matched.
345  */
346 
347 char *
348 expmatch (s, re, mstring)
349     register char *s;		/* string to check for a match in */
350     register char *re;		/* a converted irregular expression */
351     register char *mstring;	/* where to put whatever matches a \p */
352 {
353     register char *cs;		/* the current symbol */
354     register char *ptr,*s1;	/* temporary pointer */
355     boolean matched;		/* a temporary boolean */
356 
357     /* initial conditions */
358     if (re == NIL)
359 	return (NIL);
360     cs = re;
361     matched = FALSE;
362 
363     /* loop till expression string is exhausted (or at least pretty tired) */
364     while (*cs) {
365 	switch (*cs & (OPER | STR | META)) {
366 
367 	/* try to match a string */
368 	case STR:
369 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
370 	    if (matched) {
371 
372 		/* hoorah it matches */
373 		s += SCNT(cs);
374 		cs = SNEXT(cs);
375 	    } else if (*cs & ALT) {
376 
377 		/* alternation, skip to next expression */
378 		cs = SNEXT(cs);
379 	    } else if (*cs & OPT) {
380 
381 		/* the match is optional */
382 		cs = SNEXT(cs);
383 		matched = 1;		/* indicate a successful match */
384 	    } else {
385 
386 		/* no match, error return */
387 		return (NIL);
388 	    }
389 	    break;
390 
391 	/* an operator, do something fancy */
392 	case OPER:
393 	    switch (OSYM(cs)) {
394 
395 	    /* this is an alternation */
396 	    case '|':
397 		if (matched)
398 
399 		    /* last thing in the alternation was a match, skip ahead */
400 		    cs = OPTR(cs);
401 		else
402 
403 		    /* no match, keep trying */
404 		    cs = ONEXT(cs);
405 		break;
406 
407 	    /* this is a grouping, recurse */
408 	    case '(':
409 		ptr = expmatch (s, ONEXT(cs), mstring);
410 		if (ptr != NIL) {
411 
412 		    /* the subexpression matched */
413 		    matched = 1;
414 		    s = ptr;
415 		} else if (*cs & ALT) {
416 
417 		    /* alternation, skip to next expression */
418 		    matched = 0;
419 		} else if (*cs & OPT) {
420 
421 		    /* the match is optional */
422 		    matched = 1;	/* indicate a successful match */
423 		} else {
424 
425 		    /* no match, error return */
426 		    return (NIL);
427 		}
428 		cs = OPTR(cs);
429 		break;
430 	    }
431 	    break;
432 
433 	/* try to match a metasymbol */
434 	case META:
435 	    switch (MSYM(cs)) {
436 
437 	    /* try to match anything and remember what was matched */
438 	    case 'p':
439 		/*
440 		 *  This is really the same as trying the match the
441 		 *  remaining parts of the expression to any subset
442 		 *  of the string.
443 		 */
444 		s1 = s;
445 		do {
446 		    ptr = expmatch (s1, MNEXT(cs), mstring);
447 		    if (ptr != NIL && s1 != s) {
448 
449 			/* we have a match, remember the match */
450 			strncpy (mstring, s, s1 - s);
451 			mstring[s1 - s] = '\0';
452 			return (ptr);
453 		    } else if (ptr != NIL && (*cs & OPT)) {
454 
455 			/* it was aoptional so no match is ok */
456 			return (ptr);
457 		    } else if (ptr != NIL) {
458 
459 			/* not optional and we still matched */
460 			return (NIL);
461 		    }
462 		    if (!(isalnum(*s1) || *s1 == '_' ||
463 			  /* C++ destructor */
464 			  *s1 == '~' ||
465 			  /* C++ scope operator */
466 			  (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' &&
467 			   (s1++, TRUE))))
468 			return (NIL);
469 		    if (*s1 == '\\')
470 			_escaped = _escaped ? FALSE : TRUE;
471 		    else
472 			_escaped = FALSE;
473 		} while (*s1++);
474 		return (NIL);
475 
476 	    /* try to match anything */
477 	    case 'a':
478 		/*
479 		 *  This is really the same as trying the match the
480 		 *  remaining parts of the expression to any subset
481 		 *  of the string.
482 		 */
483 		s1 = s;
484 		do {
485 		    ptr = expmatch (s1, MNEXT(cs), mstring);
486 		    if (ptr != NIL && s1 != s) {
487 
488 			/* we have a match */
489 			return (ptr);
490 		    } else if (ptr != NIL && (*cs & OPT)) {
491 
492 			/* it was aoptional so no match is ok */
493 			return (ptr);
494 		    } else if (ptr != NIL) {
495 
496 			/* not optional and we still matched */
497 			return (NIL);
498 		    }
499 		    if (*s1 == '\\')
500 			_escaped = _escaped ? FALSE : TRUE;
501 		    else
502 			_escaped = FALSE;
503 		} while (*s1++);
504 		return (NIL);
505 
506 	    /* fail if we are currently _escaped */
507 	    case 'e':
508 		if (_escaped)
509 		    return(NIL);
510 		cs = MNEXT(cs);
511 		break;
512 
513 	    /* match any number of tabs and spaces */
514 	    case 'd':
515 		ptr = s;
516 		while (*s == ' ' || *s == '\t')
517 		    s++;
518 		if (s != ptr || s == s_start) {
519 
520 		    /* match, be happy */
521 		    matched = 1;
522 		    cs = MNEXT(cs);
523 		} else if (*s == '\n' || *s == '\0') {
524 
525 		    /* match, be happy */
526 		    matched = 1;
527 		    cs = MNEXT(cs);
528 		} else if (*cs & ALT) {
529 
530 		    /* try the next part */
531 		    matched = 0;
532 		    cs = MNEXT(cs);
533 		} else if (*cs & OPT) {
534 
535 		    /* doesn't matter */
536 		    matched = 1;
537 		    cs = MNEXT(cs);
538 		} else
539 
540 		    /* no match, error return */
541 		    return (NIL);
542 		break;
543 
544 	    /* check for end of line */
545 	    case '$':
546 		if (*s == '\0' || *s == '\n') {
547 
548 		    /* match, be happy */
549 		    s++;
550 		    matched = 1;
551 		    cs = MNEXT(cs);
552 		} else if (*cs & ALT) {
553 
554 		    /* try the next part */
555 		    matched = 0;
556 		    cs = MNEXT(cs);
557 		} else if (*cs & OPT) {
558 
559 		    /* doesn't matter */
560 		    matched = 1;
561 		    cs = MNEXT(cs);
562 		} else
563 
564 		    /* no match, error return */
565 		    return (NIL);
566 		break;
567 
568 	    /* check for start of line */
569 	    case '^':
570 		if (s == s_start) {
571 
572 		    /* match, be happy */
573 		    matched = 1;
574 		    cs = MNEXT(cs);
575 		} else if (*cs & ALT) {
576 
577 		    /* try the next part */
578 		    matched = 0;
579 		    cs = MNEXT(cs);
580 		} else if (*cs & OPT) {
581 
582 		    /* doesn't matter */
583 		    matched = 1;
584 		    cs = MNEXT(cs);
585 		} else
586 
587 		    /* no match, error return */
588 		    return (NIL);
589 		break;
590 
591 	    /* end of a subexpression, return success */
592 	    case ')':
593 		return (s);
594 	    }
595 	    break;
596 	}
597     }
598     return (s);
599 }
600