xref: /freebsd/usr.bin/vgrind/regexp.c (revision d1a0d267b78b542fbd7e6553af2493760f49bfa8)
1 /*
2  * Copyright (c) 1980, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 4. Neither the name of the University nor the names of its contributors
15  *    may be used to endorse or promote products derived from this software
16  *    without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 
33 __FBSDID("$FreeBSD$");
34 
35 #ifndef lint
36 static const char copyright[] =
37 "@(#) Copyright (c) 1980, 1993\n\
38 	The Regents of the University of California.  All rights reserved.\n";
39 #endif
40 
41 #ifndef lint
42 static const char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
43 #endif
44 
45 #include <ctype.h>
46 #include <stdlib.h>
47 #include <stdbool.h>
48 #include <string.h>
49 
50 #include "extern.h"
51 
52 static void	expconv(void);
53 
54 bool	 _escaped;	/* true if we are currently x_escaped */
55 char	*s_start;	/* start of string */
56 bool	 l_onecase;	/* true if upper and lower equivalent */
57 
58 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
59 
60 /*  STRNCMP -	like strncmp except that we convert the
61  *	 	first string to lower case before comparing
62  *		if l_onecase is set.
63  */
64 
65 int
66 STRNCMP(register char *s1, register char *s2, register int len)
67 {
68 	if (l_onecase) {
69 	    do
70 		if (*s2 - makelower(*s1))
71 			return (*s2 - makelower(*s1));
72 		else {
73 			s2++;
74 			s1++;
75 		}
76 	    while (--len);
77 	} else {
78 	    do
79 		if (*s2 - *s1)
80 			return (*s2 - *s1);
81 		else {
82 			s2++;
83 			s1++;
84 		}
85 	    while (--len);
86 	}
87 	return(0);
88 }
89 
90 /*	The following routine converts an irregular expression to
91  *	internal format.
92  *
93  *	Either meta symbols (\a \d or \p) or character strings or
94  *	operations ( alternation or parenthesizing ) can be
95  *	specified.  Each starts with a descriptor byte.  The descriptor
96  *	byte has STR set for strings, META set for meta symbols
97  *	and OPER set for operations.
98  *	The descriptor byte can also have the OPT bit set if the object
99  *	defined is optional.  Also ALT can be set to indicate an alternation.
100  *
101  *	For metasymbols the byte following the descriptor byte identities
102  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
103  *	strings the byte after the descriptor is a character count for
104  *	the string:
105  *
106  *		meta symbols := descriptor
107  *				symbol
108  *
109  *		strings :=	descriptor
110  *				character count
111  *				the string
112  *
113  *		operations :=	descriptor
114  *				symbol
115  *				character count
116  */
117 
118 /*
119  *  handy macros for accessing parts of match blocks
120  */
121 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
122 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
123 
124 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
125 #define OCNT(A) (*(A+2))	/* character count */
126 #define ONEXT(A) (A+3)		/* next character after the operation */
127 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
128 
129 #define SCNT(A) (*(A+1))	/* byte count of a string */
130 #define SSTR(A) (A+2)		/* address of the string */
131 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
132 
133 /*
134  *  bit flags in the descriptor
135  */
136 #define OPT 1
137 #define STR 2
138 #define META 4
139 #define ALT 8
140 #define OPER 16
141 
142 static char *ccre;	/* pointer to current position in converted exp*/
143 static char *ure;	/* pointer current position in unconverted exp */
144 
145 /* re: unconverted irregular expression */
146 char *
147 convexp(char *re)
148 {
149     register char *cre;		/* pointer to converted regular expression */
150 
151     /* allocate room for the converted expression */
152     if (re == NULL)
153 	return (NULL);
154     if (*re == '\0')
155 	return (NULL);
156     cre = malloc(4 * strlen(re) + 3);
157     ccre = cre;
158     ure = re;
159 
160     /* start the conversion with a \a */
161     *cre = META | OPT;
162     MSYM(cre) = 'a';
163     ccre = MNEXT(cre);
164 
165     /* start the conversion (its recursive) */
166     expconv ();
167     *ccre = 0;
168     return (cre);
169 }
170 
171 static void
172 expconv()
173 {
174     register char *cs;		/* pointer to current symbol in converted exp */
175     register char c;		/* character being processed */
176     register char *acs;		/* pinter to last alternate */
177     register int temp;
178 
179     /* let the conversion begin */
180     acs = NULL;
181     cs = NULL;
182     while (*ure) {
183 	switch (c = *ure++) {
184 
185 	case '\\':
186 	    switch (c = *ure++) {
187 
188 	    /* escaped characters are just characters */
189 	    default:
190 		if (cs == NULL || (*cs & STR) == 0) {
191 		    cs = ccre;
192 		    *cs = STR;
193 		    SCNT(cs) = 1;
194 		    ccre += 2;
195 		} else
196 		    SCNT(cs)++;
197 		*ccre++ = c;
198 		break;
199 
200 	    /* normal(?) metacharacters */
201 	    case 'a':
202 	    case 'd':
203 	    case 'e':
204 	    case 'p':
205 		if (acs != NULL && acs != cs) {
206 		    do {
207 			temp = OCNT(acs);
208 			OCNT(acs) = ccre - acs;
209 			acs -= temp;
210 		    } while (temp != 0);
211 		    acs = NULL;
212 		}
213 		cs = ccre;
214 		*cs = META;
215 		MSYM(cs) = c;
216 		ccre = MNEXT(cs);
217 		break;
218 	    }
219 	    break;
220 
221 	/* just put the symbol in */
222 	case '^':
223 	case '$':
224 	    if (acs != NULL && acs != cs) {
225 		do {
226 		    temp = OCNT(acs);
227 		    OCNT(acs) = ccre - acs;
228 		    acs -= temp;
229 		} while (temp != 0);
230 		acs = NULL;
231 	    }
232 	    cs = ccre;
233 	    *cs = META;
234 	    MSYM(cs) = c;
235 	    ccre = MNEXT(cs);
236 	    break;
237 
238 	/* mark the last match sequence as optional */
239 	case '?':
240 	    if (cs)
241 	    	*cs = *cs | OPT;
242 	    break;
243 
244 	/* recurse and define a subexpression */
245 	case '(':
246 	    if (acs != NULL && acs != cs) {
247 		do {
248 		    temp = OCNT(acs);
249 		    OCNT(acs) = ccre - acs;
250 		    acs -= temp;
251 		} while (temp != 0);
252 		acs = NULL;
253 	    }
254 	    cs = ccre;
255 	    *cs = OPER;
256 	    OSYM(cs) = '(';
257 	    ccre = ONEXT(cs);
258 	    expconv();
259 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
260 	    break;
261 
262 	/* return from a recursion */
263 	case ')':
264 	    if (acs != NULL) {
265 		do {
266 		    temp = OCNT(acs);
267 		    OCNT(acs) = ccre - acs;
268 		    acs -= temp;
269 		} while (temp != 0);
270 		acs = NULL;
271 	    }
272 	    cs = ccre;
273 	    *cs = META;
274 	    MSYM(cs) = c;
275 	    ccre = MNEXT(cs);
276 	    return;
277 
278 	/* mark the last match sequence as having an alternate */
279 	/* the third byte will contain an offset to jump over the */
280 	/* alternate match in case the first did not fail */
281 	case '|':
282 	    if (acs != NULL && acs != cs)
283 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
284 	    else
285 		OCNT(ccre) = 0;
286 	    *cs |= ALT;
287 	    cs = ccre;
288 	    *cs = OPER;
289 	    OSYM(cs) = '|';
290 	    ccre = ONEXT(cs);
291 	    acs = cs;	/* remember that the pointer is to be filles */
292 	    break;
293 
294 	/* if its not a metasymbol just build a scharacter string */
295 	default:
296 	    if (cs == NULL || (*cs & STR) == 0) {
297 		cs = ccre;
298 		*cs = STR;
299 		SCNT(cs) = 1;
300 		ccre = SSTR(cs);
301 	    } else
302 		SCNT(cs)++;
303 	    *ccre++ = c;
304 	    break;
305 	}
306     }
307     if (acs != NULL) {
308 	do {
309 	    temp = OCNT(acs);
310 	    OCNT(acs) = ccre - acs;
311 	    acs -= temp;
312 	} while (temp != 0);
313 	acs = NULL;
314     }
315     return;
316 }
317 /* end of convertre */
318 
319 
320 /*
321  *	The following routine recognises an irregular expresion
322  *	with the following special characters:
323  *
324  *		\?	-	means last match was optional
325  *		\a	-	matches any number of characters
326  *		\d	-	matches any number of spaces and tabs
327  *		\p	-	matches any number of alphanumeric
328  *				characters. The
329  *				characters matched will be copied into
330  *				the area pointed to by 'name'.
331  *		\|	-	alternation
332  *		\( \)	-	grouping used mostly for alternation and
333  *				optionality
334  *
335  *	The irregular expression must be translated to internal form
336  *	prior to calling this routine
337  *
338  *	The value returned is the pointer to the first non \a
339  *	character matched.
340  */
341 
342 /*
343  *  s: string to check for a match in
344  *  re: a converted irregular expression
345  *  mstring: where to put whatever matches a \p
346  */
347 char *
348 expmatch (register char *s, register char *re, register char *mstring)
349 {
350     register char *cs;		/* the current symbol */
351     register char *ptr,*s1;	/* temporary pointer */
352     bool matched;	/* a temporary bool */
353 
354     /* initial conditions */
355     if (re == NULL)
356 	return (NULL);
357     cs = re;
358     matched = false;
359 
360     /* loop till expression string is exhausted (or at least pretty tired) */
361     while (*cs) {
362 	switch (*cs & (OPER | STR | META)) {
363 
364 	/* try to match a string */
365 	case STR:
366 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
367 	    if (matched) {
368 
369 		/* hoorah it matches */
370 		s += SCNT(cs);
371 		cs = SNEXT(cs);
372 	    } else if (*cs & ALT) {
373 
374 		/* alternation, skip to next expression */
375 		cs = SNEXT(cs);
376 	    } else if (*cs & OPT) {
377 
378 		/* the match is optional */
379 		cs = SNEXT(cs);
380 		matched = 1;		/* indicate a successful match */
381 	    } else {
382 
383 		/* no match, error return */
384 		return (NULL);
385 	    }
386 	    break;
387 
388 	/* an operator, do something fancy */
389 	case OPER:
390 	    switch (OSYM(cs)) {
391 
392 	    /* this is an alternation */
393 	    case '|':
394 		if (matched)
395 
396 		    /* last thing in the alternation was a match, skip ahead */
397 		    cs = OPTR(cs);
398 		else
399 
400 		    /* no match, keep trying */
401 		    cs = ONEXT(cs);
402 		break;
403 
404 	    /* this is a grouping, recurse */
405 	    case '(':
406 		ptr = expmatch(s, ONEXT(cs), mstring);
407 		if (ptr != NULL) {
408 
409 		    /* the subexpression matched */
410 		    matched = 1;
411 		    s = ptr;
412 		} else if (*cs & ALT) {
413 
414 		    /* alternation, skip to next expression */
415 		    matched = 0;
416 		} else if (*cs & OPT) {
417 
418 		    /* the match is optional */
419 		    matched = 1;	/* indicate a successful match */
420 		} else {
421 
422 		    /* no match, error return */
423 		    return (NULL);
424 		}
425 		cs = OPTR(cs);
426 		break;
427 	    }
428 	    break;
429 
430 	/* try to match a metasymbol */
431 	case META:
432 	    switch (MSYM(cs)) {
433 
434 	    /* try to match anything and remember what was matched */
435 	    case 'p':
436 		/*
437 		 *  This is really the same as trying the match the
438 		 *  remaining parts of the expression to any subset
439 		 *  of the string.
440 		 */
441 		s1 = s;
442 		do {
443 		    ptr = expmatch(s1, MNEXT(cs), mstring);
444 		    if (ptr != NULL && s1 != s) {
445 
446 			/* we have a match, remember the match */
447 			strncpy (mstring, s, s1 - s);
448 			mstring[s1 - s] = '\0';
449 			return (ptr);
450 		    } else if (ptr != NULL && (*cs & OPT)) {
451 
452 			/* it was aoptional so no match is ok */
453 			return (ptr);
454 		    } else if (ptr != NULL) {
455 
456 			/* not optional and we still matched */
457 			return (NULL);
458 		    }
459 		    if (!(isalnum(*s1) || *s1 == '_' ||
460 			  /* C++ destructor */
461 			  *s1 == '~' ||
462 			  /* C++ scope operator */
463 			  (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' &&
464 			   (s1++, true))))
465 			return (NULL);
466 		    if (*s1 == '\\')
467 			_escaped = _escaped ? false : true;
468 		    else
469 			_escaped = false;
470 		} while (*s1++);
471 		return (NULL);
472 
473 	    /* try to match anything */
474 	    case 'a':
475 		/*
476 		 *  This is really the same as trying the match the
477 		 *  remaining parts of the expression to any subset
478 		 *  of the string.
479 		 */
480 		s1 = s;
481 		do {
482 		    ptr = expmatch(s1, MNEXT(cs), mstring);
483 		    if (ptr != NULL && s1 != s) {
484 
485 			/* we have a match */
486 			return (ptr);
487 		    } else if (ptr != NULL && (*cs & OPT)) {
488 
489 			/* it was aoptional so no match is ok */
490 			return (ptr);
491 		    } else if (ptr != NULL) {
492 
493 			/* not optional and we still matched */
494 			return (NULL);
495 		    }
496 		    if (*s1 == '\\')
497 			_escaped = _escaped ? false : true;
498 		    else
499 			_escaped = false;
500 		} while (*s1++);
501 		return (NULL);
502 
503 	    /* fail if we are currently _escaped */
504 	    case 'e':
505 		if (_escaped)
506 		    return(NULL);
507 		cs = MNEXT(cs);
508 		break;
509 
510 	    /* match any number of tabs and spaces */
511 	    case 'd':
512 		ptr = s;
513 		while (*s == ' ' || *s == '\t')
514 		    s++;
515 		if (s != ptr || s == s_start) {
516 
517 		    /* match, be happy */
518 		    matched = 1;
519 		    cs = MNEXT(cs);
520 		} else if (*s == '\n' || *s == '\0') {
521 
522 		    /* match, be happy */
523 		    matched = 1;
524 		    cs = MNEXT(cs);
525 		} else if (*cs & ALT) {
526 
527 		    /* try the next part */
528 		    matched = 0;
529 		    cs = MNEXT(cs);
530 		} else if (*cs & OPT) {
531 
532 		    /* doesn't matter */
533 		    matched = 1;
534 		    cs = MNEXT(cs);
535 		} else
536 
537 		    /* no match, error return */
538 		    return (NULL);
539 		break;
540 
541 	    /* check for end of line */
542 	    case '$':
543 		if (*s == '\0' || *s == '\n') {
544 
545 		    /* match, be happy */
546 		    s++;
547 		    matched = 1;
548 		    cs = MNEXT(cs);
549 		} else if (*cs & ALT) {
550 
551 		    /* try the next part */
552 		    matched = 0;
553 		    cs = MNEXT(cs);
554 		} else if (*cs & OPT) {
555 
556 		    /* doesn't matter */
557 		    matched = 1;
558 		    cs = MNEXT(cs);
559 		} else
560 
561 		    /* no match, error return */
562 		    return (NULL);
563 		break;
564 
565 	    /* check for start of line */
566 	    case '^':
567 		if (s == s_start) {
568 
569 		    /* match, be happy */
570 		    matched = 1;
571 		    cs = MNEXT(cs);
572 		} else if (*cs & ALT) {
573 
574 		    /* try the next part */
575 		    matched = 0;
576 		    cs = MNEXT(cs);
577 		} else if (*cs & OPT) {
578 
579 		    /* doesn't matter */
580 		    matched = 1;
581 		    cs = MNEXT(cs);
582 		} else
583 
584 		    /* no match, error return */
585 		    return (NULL);
586 		break;
587 
588 	    /* end of a subexpression, return success */
589 	    case ')':
590 		return (s);
591 	    }
592 	    break;
593 	}
594     }
595     return (s);
596 }
597