xref: /freebsd/usr.bin/vgrind/regexp.c (revision d0b2dbfa0ecf2bbc9709efc5e20baf8e4b44bbbf)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 
34 #ifndef lint
35 static const char copyright[] =
36 "@(#) Copyright (c) 1980, 1993\n\
37 	The Regents of the University of California.  All rights reserved.\n";
38 #endif
39 
40 #ifndef lint
41 static const char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
42 #endif
43 
44 #include <ctype.h>
45 #include <stdlib.h>
46 #include <stdbool.h>
47 #include <string.h>
48 
49 #include "extern.h"
50 
51 static void	expconv(void);
52 
53 bool	 _escaped;	/* true if we are currently x_escaped */
54 char	*s_start;	/* start of string */
55 bool	 l_onecase;	/* true if upper and lower equivalent */
56 
57 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
58 
59 /*  STRNCMP -	like strncmp except that we convert the
60  *	 	first string to lower case before comparing
61  *		if l_onecase is set.
62  */
63 
64 int
65 STRNCMP(register char *s1, register char *s2, register int len)
66 {
67 	if (l_onecase) {
68 	    do
69 		if (*s2 - makelower(*s1))
70 			return (*s2 - makelower(*s1));
71 		else {
72 			s2++;
73 			s1++;
74 		}
75 	    while (--len);
76 	} else {
77 	    do
78 		if (*s2 - *s1)
79 			return (*s2 - *s1);
80 		else {
81 			s2++;
82 			s1++;
83 		}
84 	    while (--len);
85 	}
86 	return(0);
87 }
88 
89 /*	The following routine converts an irregular expression to
90  *	internal format.
91  *
92  *	Either meta symbols (\a \d or \p) or character strings or
93  *	operations ( alternation or parenthesizing ) can be
94  *	specified.  Each starts with a descriptor byte.  The descriptor
95  *	byte has STR set for strings, META set for meta symbols
96  *	and OPER set for operations.
97  *	The descriptor byte can also have the OPT bit set if the object
98  *	defined is optional.  Also ALT can be set to indicate an alternation.
99  *
100  *	For metasymbols the byte following the descriptor byte identities
101  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
102  *	strings the byte after the descriptor is a character count for
103  *	the string:
104  *
105  *		meta symbols := descriptor
106  *				symbol
107  *
108  *		strings :=	descriptor
109  *				character count
110  *				the string
111  *
112  *		operations :=	descriptor
113  *				symbol
114  *				character count
115  */
116 
117 /*
118  *  handy macros for accessing parts of match blocks
119  */
120 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
121 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
122 
123 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
124 #define OCNT(A) (*(A+2))	/* character count */
125 #define ONEXT(A) (A+3)		/* next character after the operation */
126 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
127 
128 #define SCNT(A) (*(A+1))	/* byte count of a string */
129 #define SSTR(A) (A+2)		/* address of the string */
130 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
131 
132 /*
133  *  bit flags in the descriptor
134  */
135 #define OPT 1
136 #define STR 2
137 #define META 4
138 #define ALT 8
139 #define OPER 16
140 
141 static char *ccre;	/* pointer to current position in converted exp*/
142 static char *ure;	/* pointer current position in unconverted exp */
143 
144 /* re: unconverted irregular expression */
145 char *
146 convexp(char *re)
147 {
148     register char *cre;		/* pointer to converted regular expression */
149 
150     /* allocate room for the converted expression */
151     if (re == NULL)
152 	return (NULL);
153     if (*re == '\0')
154 	return (NULL);
155     cre = malloc(4 * strlen(re) + 3);
156     ccre = cre;
157     ure = re;
158 
159     /* start the conversion with a \a */
160     *cre = META | OPT;
161     MSYM(cre) = 'a';
162     ccre = MNEXT(cre);
163 
164     /* start the conversion (its recursive) */
165     expconv ();
166     *ccre = 0;
167     return (cre);
168 }
169 
170 static void
171 expconv()
172 {
173     register char *cs;		/* pointer to current symbol in converted exp */
174     register char c;		/* character being processed */
175     register char *acs;		/* pinter to last alternate */
176     register int temp;
177 
178     /* let the conversion begin */
179     acs = NULL;
180     cs = NULL;
181     while (*ure) {
182 	switch (c = *ure++) {
183 
184 	case '\\':
185 	    switch (c = *ure++) {
186 
187 	    /* escaped characters are just characters */
188 	    default:
189 		if (cs == NULL || (*cs & STR) == 0) {
190 		    cs = ccre;
191 		    *cs = STR;
192 		    SCNT(cs) = 1;
193 		    ccre += 2;
194 		} else
195 		    SCNT(cs)++;
196 		*ccre++ = c;
197 		break;
198 
199 	    /* normal(?) metacharacters */
200 	    case 'a':
201 	    case 'd':
202 	    case 'e':
203 	    case 'p':
204 		if (acs != NULL && acs != cs) {
205 		    do {
206 			temp = OCNT(acs);
207 			OCNT(acs) = ccre - acs;
208 			acs -= temp;
209 		    } while (temp != 0);
210 		    acs = NULL;
211 		}
212 		cs = ccre;
213 		*cs = META;
214 		MSYM(cs) = c;
215 		ccre = MNEXT(cs);
216 		break;
217 	    }
218 	    break;
219 
220 	/* just put the symbol in */
221 	case '^':
222 	case '$':
223 	    if (acs != NULL && acs != cs) {
224 		do {
225 		    temp = OCNT(acs);
226 		    OCNT(acs) = ccre - acs;
227 		    acs -= temp;
228 		} while (temp != 0);
229 		acs = NULL;
230 	    }
231 	    cs = ccre;
232 	    *cs = META;
233 	    MSYM(cs) = c;
234 	    ccre = MNEXT(cs);
235 	    break;
236 
237 	/* mark the last match sequence as optional */
238 	case '?':
239 	    if (cs)
240 	    	*cs = *cs | OPT;
241 	    break;
242 
243 	/* recurse and define a subexpression */
244 	case '(':
245 	    if (acs != NULL && acs != cs) {
246 		do {
247 		    temp = OCNT(acs);
248 		    OCNT(acs) = ccre - acs;
249 		    acs -= temp;
250 		} while (temp != 0);
251 		acs = NULL;
252 	    }
253 	    cs = ccre;
254 	    *cs = OPER;
255 	    OSYM(cs) = '(';
256 	    ccre = ONEXT(cs);
257 	    expconv();
258 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
259 	    break;
260 
261 	/* return from a recursion */
262 	case ')':
263 	    if (acs != NULL) {
264 		do {
265 		    temp = OCNT(acs);
266 		    OCNT(acs) = ccre - acs;
267 		    acs -= temp;
268 		} while (temp != 0);
269 		acs = NULL;
270 	    }
271 	    cs = ccre;
272 	    *cs = META;
273 	    MSYM(cs) = c;
274 	    ccre = MNEXT(cs);
275 	    return;
276 
277 	/* mark the last match sequence as having an alternate */
278 	/* the third byte will contain an offset to jump over the */
279 	/* alternate match in case the first did not fail */
280 	case '|':
281 	    if (acs != NULL && acs != cs)
282 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
283 	    else
284 		OCNT(ccre) = 0;
285 	    *cs |= ALT;
286 	    cs = ccre;
287 	    *cs = OPER;
288 	    OSYM(cs) = '|';
289 	    ccre = ONEXT(cs);
290 	    acs = cs;	/* remember that the pointer is to be filles */
291 	    break;
292 
293 	/* if its not a metasymbol just build a scharacter string */
294 	default:
295 	    if (cs == NULL || (*cs & STR) == 0) {
296 		cs = ccre;
297 		*cs = STR;
298 		SCNT(cs) = 1;
299 		ccre = SSTR(cs);
300 	    } else
301 		SCNT(cs)++;
302 	    *ccre++ = c;
303 	    break;
304 	}
305     }
306     if (acs != NULL) {
307 	do {
308 	    temp = OCNT(acs);
309 	    OCNT(acs) = ccre - acs;
310 	    acs -= temp;
311 	} while (temp != 0);
312 	acs = NULL;
313     }
314     return;
315 }
316 /* end of convertre */
317 
318 
319 /*
320  *	The following routine recognises an irregular expression
321  *	with the following special characters:
322  *
323  *		\?	-	means last match was optional
324  *		\a	-	matches any number of characters
325  *		\d	-	matches any number of spaces and tabs
326  *		\p	-	matches any number of alphanumeric
327  *				characters. The
328  *				characters matched will be copied into
329  *				the area pointed to by 'name'.
330  *		\|	-	alternation
331  *		\( \)	-	grouping used mostly for alternation and
332  *				optionality
333  *
334  *	The irregular expression must be translated to internal form
335  *	prior to calling this routine
336  *
337  *	The value returned is the pointer to the first non \a
338  *	character matched.
339  */
340 
341 /*
342  *  s: string to check for a match in
343  *  re: a converted irregular expression
344  *  mstring: where to put whatever matches a \p
345  */
346 char *
347 expmatch (register char *s, register char *re, register char *mstring)
348 {
349     register char *cs;		/* the current symbol */
350     register char *ptr,*s1;	/* temporary pointer */
351     bool matched;	/* a temporary bool */
352 
353     /* initial conditions */
354     if (re == NULL)
355 	return (NULL);
356     cs = re;
357     matched = false;
358 
359     /* loop till expression string is exhausted (or at least pretty tired) */
360     while (*cs) {
361 	switch (*cs & (OPER | STR | META)) {
362 
363 	/* try to match a string */
364 	case STR:
365 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
366 	    if (matched) {
367 
368 		/* hoorah it matches */
369 		s += SCNT(cs);
370 		cs = SNEXT(cs);
371 	    } else if (*cs & ALT) {
372 
373 		/* alternation, skip to next expression */
374 		cs = SNEXT(cs);
375 	    } else if (*cs & OPT) {
376 
377 		/* the match is optional */
378 		cs = SNEXT(cs);
379 		matched = 1;		/* indicate a successful match */
380 	    } else {
381 
382 		/* no match, error return */
383 		return (NULL);
384 	    }
385 	    break;
386 
387 	/* an operator, do something fancy */
388 	case OPER:
389 	    switch (OSYM(cs)) {
390 
391 	    /* this is an alternation */
392 	    case '|':
393 		if (matched)
394 
395 		    /* last thing in the alternation was a match, skip ahead */
396 		    cs = OPTR(cs);
397 		else
398 
399 		    /* no match, keep trying */
400 		    cs = ONEXT(cs);
401 		break;
402 
403 	    /* this is a grouping, recurse */
404 	    case '(':
405 		ptr = expmatch(s, ONEXT(cs), mstring);
406 		if (ptr != NULL) {
407 
408 		    /* the subexpression matched */
409 		    matched = 1;
410 		    s = ptr;
411 		} else if (*cs & ALT) {
412 
413 		    /* alternation, skip to next expression */
414 		    matched = 0;
415 		} else if (*cs & OPT) {
416 
417 		    /* the match is optional */
418 		    matched = 1;	/* indicate a successful match */
419 		} else {
420 
421 		    /* no match, error return */
422 		    return (NULL);
423 		}
424 		cs = OPTR(cs);
425 		break;
426 	    }
427 	    break;
428 
429 	/* try to match a metasymbol */
430 	case META:
431 	    switch (MSYM(cs)) {
432 
433 	    /* try to match anything and remember what was matched */
434 	    case 'p':
435 		/*
436 		 *  This is really the same as trying the match the
437 		 *  remaining parts of the expression to any subset
438 		 *  of the string.
439 		 */
440 		s1 = s;
441 		do {
442 		    ptr = expmatch(s1, MNEXT(cs), mstring);
443 		    if (ptr != NULL && s1 != s) {
444 
445 			/* we have a match, remember the match */
446 			strncpy (mstring, s, s1 - s);
447 			mstring[s1 - s] = '\0';
448 			return (ptr);
449 		    } else if (ptr != NULL && (*cs & OPT)) {
450 
451 			/* it was aoptional so no match is ok */
452 			return (ptr);
453 		    } else if (ptr != NULL) {
454 
455 			/* not optional and we still matched */
456 			return (NULL);
457 		    }
458 		    if (!(isalnum(*s1) || *s1 == '_' ||
459 			  /* C++ destructor */
460 			  *s1 == '~' ||
461 			  /* C++ scope operator */
462 			  (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' &&
463 			   (s1++, true))))
464 			return (NULL);
465 		    if (*s1 == '\\')
466 			_escaped = _escaped ? false : true;
467 		    else
468 			_escaped = false;
469 		} while (*s1++);
470 		return (NULL);
471 
472 	    /* try to match anything */
473 	    case 'a':
474 		/*
475 		 *  This is really the same as trying the match the
476 		 *  remaining parts of the expression to any subset
477 		 *  of the string.
478 		 */
479 		s1 = s;
480 		do {
481 		    ptr = expmatch(s1, MNEXT(cs), mstring);
482 		    if (ptr != NULL && s1 != s) {
483 
484 			/* we have a match */
485 			return (ptr);
486 		    } else if (ptr != NULL && (*cs & OPT)) {
487 
488 			/* it was aoptional so no match is ok */
489 			return (ptr);
490 		    } else if (ptr != NULL) {
491 
492 			/* not optional and we still matched */
493 			return (NULL);
494 		    }
495 		    if (*s1 == '\\')
496 			_escaped = _escaped ? false : true;
497 		    else
498 			_escaped = false;
499 		} while (*s1++);
500 		return (NULL);
501 
502 	    /* fail if we are currently _escaped */
503 	    case 'e':
504 		if (_escaped)
505 		    return(NULL);
506 		cs = MNEXT(cs);
507 		break;
508 
509 	    /* match any number of tabs and spaces */
510 	    case 'd':
511 		ptr = s;
512 		while (*s == ' ' || *s == '\t')
513 		    s++;
514 		if (s != ptr || s == s_start) {
515 
516 		    /* match, be happy */
517 		    matched = 1;
518 		    cs = MNEXT(cs);
519 		} else if (*s == '\n' || *s == '\0') {
520 
521 		    /* match, be happy */
522 		    matched = 1;
523 		    cs = MNEXT(cs);
524 		} else if (*cs & ALT) {
525 
526 		    /* try the next part */
527 		    matched = 0;
528 		    cs = MNEXT(cs);
529 		} else if (*cs & OPT) {
530 
531 		    /* doesn't matter */
532 		    matched = 1;
533 		    cs = MNEXT(cs);
534 		} else
535 
536 		    /* no match, error return */
537 		    return (NULL);
538 		break;
539 
540 	    /* check for end of line */
541 	    case '$':
542 		if (*s == '\0' || *s == '\n') {
543 
544 		    /* match, be happy */
545 		    s++;
546 		    matched = 1;
547 		    cs = MNEXT(cs);
548 		} else if (*cs & ALT) {
549 
550 		    /* try the next part */
551 		    matched = 0;
552 		    cs = MNEXT(cs);
553 		} else if (*cs & OPT) {
554 
555 		    /* doesn't matter */
556 		    matched = 1;
557 		    cs = MNEXT(cs);
558 		} else
559 
560 		    /* no match, error return */
561 		    return (NULL);
562 		break;
563 
564 	    /* check for start of line */
565 	    case '^':
566 		if (s == s_start) {
567 
568 		    /* match, be happy */
569 		    matched = 1;
570 		    cs = MNEXT(cs);
571 		} else if (*cs & ALT) {
572 
573 		    /* try the next part */
574 		    matched = 0;
575 		    cs = MNEXT(cs);
576 		} else if (*cs & OPT) {
577 
578 		    /* doesn't matter */
579 		    matched = 1;
580 		    cs = MNEXT(cs);
581 		} else
582 
583 		    /* no match, error return */
584 		    return (NULL);
585 		break;
586 
587 	    /* end of a subexpression, return success */
588 	    case ')':
589 		return (s);
590 	    }
591 	    break;
592 	}
593     }
594     return (s);
595 }
596