xref: /freebsd/usr.bin/vgrind/regexp.c (revision e64fe029e9d3ce476e77a478318e0c3cd201ff08)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 
35 __FBSDID("$FreeBSD$");
36 
37 #ifndef lint
38 static const char copyright[] =
39 "@(#) Copyright (c) 1980, 1993\n\
40 	The Regents of the University of California.  All rights reserved.\n";
41 #endif
42 
43 #ifndef lint
44 static const char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
45 #endif
46 
47 #include <ctype.h>
48 #include <stdlib.h>
49 #include <stdbool.h>
50 #include <string.h>
51 
52 #include "extern.h"
53 
54 static void	expconv(void);
55 
56 bool	 _escaped;	/* true if we are currently x_escaped */
57 char	*s_start;	/* start of string */
58 bool	 l_onecase;	/* true if upper and lower equivalent */
59 
60 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
61 
62 /*  STRNCMP -	like strncmp except that we convert the
63  *	 	first string to lower case before comparing
64  *		if l_onecase is set.
65  */
66 
67 int
68 STRNCMP(register char *s1, register char *s2, register int len)
69 {
70 	if (l_onecase) {
71 	    do
72 		if (*s2 - makelower(*s1))
73 			return (*s2 - makelower(*s1));
74 		else {
75 			s2++;
76 			s1++;
77 		}
78 	    while (--len);
79 	} else {
80 	    do
81 		if (*s2 - *s1)
82 			return (*s2 - *s1);
83 		else {
84 			s2++;
85 			s1++;
86 		}
87 	    while (--len);
88 	}
89 	return(0);
90 }
91 
92 /*	The following routine converts an irregular expression to
93  *	internal format.
94  *
95  *	Either meta symbols (\a \d or \p) or character strings or
96  *	operations ( alternation or parenthesizing ) can be
97  *	specified.  Each starts with a descriptor byte.  The descriptor
98  *	byte has STR set for strings, META set for meta symbols
99  *	and OPER set for operations.
100  *	The descriptor byte can also have the OPT bit set if the object
101  *	defined is optional.  Also ALT can be set to indicate an alternation.
102  *
103  *	For metasymbols the byte following the descriptor byte identities
104  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
105  *	strings the byte after the descriptor is a character count for
106  *	the string:
107  *
108  *		meta symbols := descriptor
109  *				symbol
110  *
111  *		strings :=	descriptor
112  *				character count
113  *				the string
114  *
115  *		operations :=	descriptor
116  *				symbol
117  *				character count
118  */
119 
120 /*
121  *  handy macros for accessing parts of match blocks
122  */
123 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
124 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
125 
126 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
127 #define OCNT(A) (*(A+2))	/* character count */
128 #define ONEXT(A) (A+3)		/* next character after the operation */
129 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
130 
131 #define SCNT(A) (*(A+1))	/* byte count of a string */
132 #define SSTR(A) (A+2)		/* address of the string */
133 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
134 
135 /*
136  *  bit flags in the descriptor
137  */
138 #define OPT 1
139 #define STR 2
140 #define META 4
141 #define ALT 8
142 #define OPER 16
143 
144 static char *ccre;	/* pointer to current position in converted exp*/
145 static char *ure;	/* pointer current position in unconverted exp */
146 
147 /* re: unconverted irregular expression */
148 char *
149 convexp(char *re)
150 {
151     register char *cre;		/* pointer to converted regular expression */
152 
153     /* allocate room for the converted expression */
154     if (re == NULL)
155 	return (NULL);
156     if (*re == '\0')
157 	return (NULL);
158     cre = malloc(4 * strlen(re) + 3);
159     ccre = cre;
160     ure = re;
161 
162     /* start the conversion with a \a */
163     *cre = META | OPT;
164     MSYM(cre) = 'a';
165     ccre = MNEXT(cre);
166 
167     /* start the conversion (its recursive) */
168     expconv ();
169     *ccre = 0;
170     return (cre);
171 }
172 
173 static void
174 expconv()
175 {
176     register char *cs;		/* pointer to current symbol in converted exp */
177     register char c;		/* character being processed */
178     register char *acs;		/* pinter to last alternate */
179     register int temp;
180 
181     /* let the conversion begin */
182     acs = NULL;
183     cs = NULL;
184     while (*ure) {
185 	switch (c = *ure++) {
186 
187 	case '\\':
188 	    switch (c = *ure++) {
189 
190 	    /* escaped characters are just characters */
191 	    default:
192 		if (cs == NULL || (*cs & STR) == 0) {
193 		    cs = ccre;
194 		    *cs = STR;
195 		    SCNT(cs) = 1;
196 		    ccre += 2;
197 		} else
198 		    SCNT(cs)++;
199 		*ccre++ = c;
200 		break;
201 
202 	    /* normal(?) metacharacters */
203 	    case 'a':
204 	    case 'd':
205 	    case 'e':
206 	    case 'p':
207 		if (acs != NULL && acs != cs) {
208 		    do {
209 			temp = OCNT(acs);
210 			OCNT(acs) = ccre - acs;
211 			acs -= temp;
212 		    } while (temp != 0);
213 		    acs = NULL;
214 		}
215 		cs = ccre;
216 		*cs = META;
217 		MSYM(cs) = c;
218 		ccre = MNEXT(cs);
219 		break;
220 	    }
221 	    break;
222 
223 	/* just put the symbol in */
224 	case '^':
225 	case '$':
226 	    if (acs != NULL && acs != cs) {
227 		do {
228 		    temp = OCNT(acs);
229 		    OCNT(acs) = ccre - acs;
230 		    acs -= temp;
231 		} while (temp != 0);
232 		acs = NULL;
233 	    }
234 	    cs = ccre;
235 	    *cs = META;
236 	    MSYM(cs) = c;
237 	    ccre = MNEXT(cs);
238 	    break;
239 
240 	/* mark the last match sequence as optional */
241 	case '?':
242 	    if (cs)
243 	    	*cs = *cs | OPT;
244 	    break;
245 
246 	/* recurse and define a subexpression */
247 	case '(':
248 	    if (acs != NULL && acs != cs) {
249 		do {
250 		    temp = OCNT(acs);
251 		    OCNT(acs) = ccre - acs;
252 		    acs -= temp;
253 		} while (temp != 0);
254 		acs = NULL;
255 	    }
256 	    cs = ccre;
257 	    *cs = OPER;
258 	    OSYM(cs) = '(';
259 	    ccre = ONEXT(cs);
260 	    expconv();
261 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
262 	    break;
263 
264 	/* return from a recursion */
265 	case ')':
266 	    if (acs != NULL) {
267 		do {
268 		    temp = OCNT(acs);
269 		    OCNT(acs) = ccre - acs;
270 		    acs -= temp;
271 		} while (temp != 0);
272 		acs = NULL;
273 	    }
274 	    cs = ccre;
275 	    *cs = META;
276 	    MSYM(cs) = c;
277 	    ccre = MNEXT(cs);
278 	    return;
279 
280 	/* mark the last match sequence as having an alternate */
281 	/* the third byte will contain an offset to jump over the */
282 	/* alternate match in case the first did not fail */
283 	case '|':
284 	    if (acs != NULL && acs != cs)
285 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
286 	    else
287 		OCNT(ccre) = 0;
288 	    *cs |= ALT;
289 	    cs = ccre;
290 	    *cs = OPER;
291 	    OSYM(cs) = '|';
292 	    ccre = ONEXT(cs);
293 	    acs = cs;	/* remember that the pointer is to be filles */
294 	    break;
295 
296 	/* if its not a metasymbol just build a scharacter string */
297 	default:
298 	    if (cs == NULL || (*cs & STR) == 0) {
299 		cs = ccre;
300 		*cs = STR;
301 		SCNT(cs) = 1;
302 		ccre = SSTR(cs);
303 	    } else
304 		SCNT(cs)++;
305 	    *ccre++ = c;
306 	    break;
307 	}
308     }
309     if (acs != NULL) {
310 	do {
311 	    temp = OCNT(acs);
312 	    OCNT(acs) = ccre - acs;
313 	    acs -= temp;
314 	} while (temp != 0);
315 	acs = NULL;
316     }
317     return;
318 }
319 /* end of convertre */
320 
321 
322 /*
323  *	The following routine recognises an irregular expression
324  *	with the following special characters:
325  *
326  *		\?	-	means last match was optional
327  *		\a	-	matches any number of characters
328  *		\d	-	matches any number of spaces and tabs
329  *		\p	-	matches any number of alphanumeric
330  *				characters. The
331  *				characters matched will be copied into
332  *				the area pointed to by 'name'.
333  *		\|	-	alternation
334  *		\( \)	-	grouping used mostly for alternation and
335  *				optionality
336  *
337  *	The irregular expression must be translated to internal form
338  *	prior to calling this routine
339  *
340  *	The value returned is the pointer to the first non \a
341  *	character matched.
342  */
343 
344 /*
345  *  s: string to check for a match in
346  *  re: a converted irregular expression
347  *  mstring: where to put whatever matches a \p
348  */
349 char *
350 expmatch (register char *s, register char *re, register char *mstring)
351 {
352     register char *cs;		/* the current symbol */
353     register char *ptr,*s1;	/* temporary pointer */
354     bool matched;	/* a temporary bool */
355 
356     /* initial conditions */
357     if (re == NULL)
358 	return (NULL);
359     cs = re;
360     matched = false;
361 
362     /* loop till expression string is exhausted (or at least pretty tired) */
363     while (*cs) {
364 	switch (*cs & (OPER | STR | META)) {
365 
366 	/* try to match a string */
367 	case STR:
368 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
369 	    if (matched) {
370 
371 		/* hoorah it matches */
372 		s += SCNT(cs);
373 		cs = SNEXT(cs);
374 	    } else if (*cs & ALT) {
375 
376 		/* alternation, skip to next expression */
377 		cs = SNEXT(cs);
378 	    } else if (*cs & OPT) {
379 
380 		/* the match is optional */
381 		cs = SNEXT(cs);
382 		matched = 1;		/* indicate a successful match */
383 	    } else {
384 
385 		/* no match, error return */
386 		return (NULL);
387 	    }
388 	    break;
389 
390 	/* an operator, do something fancy */
391 	case OPER:
392 	    switch (OSYM(cs)) {
393 
394 	    /* this is an alternation */
395 	    case '|':
396 		if (matched)
397 
398 		    /* last thing in the alternation was a match, skip ahead */
399 		    cs = OPTR(cs);
400 		else
401 
402 		    /* no match, keep trying */
403 		    cs = ONEXT(cs);
404 		break;
405 
406 	    /* this is a grouping, recurse */
407 	    case '(':
408 		ptr = expmatch(s, ONEXT(cs), mstring);
409 		if (ptr != NULL) {
410 
411 		    /* the subexpression matched */
412 		    matched = 1;
413 		    s = ptr;
414 		} else if (*cs & ALT) {
415 
416 		    /* alternation, skip to next expression */
417 		    matched = 0;
418 		} else if (*cs & OPT) {
419 
420 		    /* the match is optional */
421 		    matched = 1;	/* indicate a successful match */
422 		} else {
423 
424 		    /* no match, error return */
425 		    return (NULL);
426 		}
427 		cs = OPTR(cs);
428 		break;
429 	    }
430 	    break;
431 
432 	/* try to match a metasymbol */
433 	case META:
434 	    switch (MSYM(cs)) {
435 
436 	    /* try to match anything and remember what was matched */
437 	    case 'p':
438 		/*
439 		 *  This is really the same as trying the match the
440 		 *  remaining parts of the expression to any subset
441 		 *  of the string.
442 		 */
443 		s1 = s;
444 		do {
445 		    ptr = expmatch(s1, MNEXT(cs), mstring);
446 		    if (ptr != NULL && s1 != s) {
447 
448 			/* we have a match, remember the match */
449 			strncpy (mstring, s, s1 - s);
450 			mstring[s1 - s] = '\0';
451 			return (ptr);
452 		    } else if (ptr != NULL && (*cs & OPT)) {
453 
454 			/* it was aoptional so no match is ok */
455 			return (ptr);
456 		    } else if (ptr != NULL) {
457 
458 			/* not optional and we still matched */
459 			return (NULL);
460 		    }
461 		    if (!(isalnum(*s1) || *s1 == '_' ||
462 			  /* C++ destructor */
463 			  *s1 == '~' ||
464 			  /* C++ scope operator */
465 			  (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' &&
466 			   (s1++, true))))
467 			return (NULL);
468 		    if (*s1 == '\\')
469 			_escaped = _escaped ? false : true;
470 		    else
471 			_escaped = false;
472 		} while (*s1++);
473 		return (NULL);
474 
475 	    /* try to match anything */
476 	    case 'a':
477 		/*
478 		 *  This is really the same as trying the match the
479 		 *  remaining parts of the expression to any subset
480 		 *  of the string.
481 		 */
482 		s1 = s;
483 		do {
484 		    ptr = expmatch(s1, MNEXT(cs), mstring);
485 		    if (ptr != NULL && s1 != s) {
486 
487 			/* we have a match */
488 			return (ptr);
489 		    } else if (ptr != NULL && (*cs & OPT)) {
490 
491 			/* it was aoptional so no match is ok */
492 			return (ptr);
493 		    } else if (ptr != NULL) {
494 
495 			/* not optional and we still matched */
496 			return (NULL);
497 		    }
498 		    if (*s1 == '\\')
499 			_escaped = _escaped ? false : true;
500 		    else
501 			_escaped = false;
502 		} while (*s1++);
503 		return (NULL);
504 
505 	    /* fail if we are currently _escaped */
506 	    case 'e':
507 		if (_escaped)
508 		    return(NULL);
509 		cs = MNEXT(cs);
510 		break;
511 
512 	    /* match any number of tabs and spaces */
513 	    case 'd':
514 		ptr = s;
515 		while (*s == ' ' || *s == '\t')
516 		    s++;
517 		if (s != ptr || s == s_start) {
518 
519 		    /* match, be happy */
520 		    matched = 1;
521 		    cs = MNEXT(cs);
522 		} else if (*s == '\n' || *s == '\0') {
523 
524 		    /* match, be happy */
525 		    matched = 1;
526 		    cs = MNEXT(cs);
527 		} else if (*cs & ALT) {
528 
529 		    /* try the next part */
530 		    matched = 0;
531 		    cs = MNEXT(cs);
532 		} else if (*cs & OPT) {
533 
534 		    /* doesn't matter */
535 		    matched = 1;
536 		    cs = MNEXT(cs);
537 		} else
538 
539 		    /* no match, error return */
540 		    return (NULL);
541 		break;
542 
543 	    /* check for end of line */
544 	    case '$':
545 		if (*s == '\0' || *s == '\n') {
546 
547 		    /* match, be happy */
548 		    s++;
549 		    matched = 1;
550 		    cs = MNEXT(cs);
551 		} else if (*cs & ALT) {
552 
553 		    /* try the next part */
554 		    matched = 0;
555 		    cs = MNEXT(cs);
556 		} else if (*cs & OPT) {
557 
558 		    /* doesn't matter */
559 		    matched = 1;
560 		    cs = MNEXT(cs);
561 		} else
562 
563 		    /* no match, error return */
564 		    return (NULL);
565 		break;
566 
567 	    /* check for start of line */
568 	    case '^':
569 		if (s == s_start) {
570 
571 		    /* match, be happy */
572 		    matched = 1;
573 		    cs = MNEXT(cs);
574 		} else if (*cs & ALT) {
575 
576 		    /* try the next part */
577 		    matched = 0;
578 		    cs = MNEXT(cs);
579 		} else if (*cs & OPT) {
580 
581 		    /* doesn't matter */
582 		    matched = 1;
583 		    cs = MNEXT(cs);
584 		} else
585 
586 		    /* no match, error return */
587 		    return (NULL);
588 		break;
589 
590 	    /* end of a subexpression, return success */
591 	    case ')':
592 		return (s);
593 	    }
594 	    break;
595 	}
596     }
597     return (s);
598 }
599