xref: /freebsd/usr.bin/vgrind/regexp.c (revision 1da7f3f6f72b2245e458fc7195733268ae4a1136)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #include <ctype.h>
34 #include <stdlib.h>
35 #include <stdbool.h>
36 #include <string.h>
37 
38 #include "extern.h"
39 
40 static void	expconv(void);
41 
42 bool	 _escaped;	/* true if we are currently x_escaped */
43 char	*s_start;	/* start of string */
44 bool	 l_onecase;	/* true if upper and lower equivalent */
45 
46 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
47 
48 /*  STRNCMP -	like strncmp except that we convert the
49  *	 	first string to lower case before comparing
50  *		if l_onecase is set.
51  */
52 
53 int
54 STRNCMP(register char *s1, register char *s2, register int len)
55 {
56 	if (l_onecase) {
57 	    do
58 		if (*s2 - makelower(*s1))
59 			return (*s2 - makelower(*s1));
60 		else {
61 			s2++;
62 			s1++;
63 		}
64 	    while (--len);
65 	} else {
66 	    do
67 		if (*s2 - *s1)
68 			return (*s2 - *s1);
69 		else {
70 			s2++;
71 			s1++;
72 		}
73 	    while (--len);
74 	}
75 	return(0);
76 }
77 
78 /*	The following routine converts an irregular expression to
79  *	internal format.
80  *
81  *	Either meta symbols (\a \d or \p) or character strings or
82  *	operations ( alternation or parenthesizing ) can be
83  *	specified.  Each starts with a descriptor byte.  The descriptor
84  *	byte has STR set for strings, META set for meta symbols
85  *	and OPER set for operations.
86  *	The descriptor byte can also have the OPT bit set if the object
87  *	defined is optional.  Also ALT can be set to indicate an alternation.
88  *
89  *	For metasymbols the byte following the descriptor byte identities
90  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
91  *	strings the byte after the descriptor is a character count for
92  *	the string:
93  *
94  *		meta symbols := descriptor
95  *				symbol
96  *
97  *		strings :=	descriptor
98  *				character count
99  *				the string
100  *
101  *		operations :=	descriptor
102  *				symbol
103  *				character count
104  */
105 
106 /*
107  *  handy macros for accessing parts of match blocks
108  */
109 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
110 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
111 
112 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
113 #define OCNT(A) (*(A+2))	/* character count */
114 #define ONEXT(A) (A+3)		/* next character after the operation */
115 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
116 
117 #define SCNT(A) (*(A+1))	/* byte count of a string */
118 #define SSTR(A) (A+2)		/* address of the string */
119 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
120 
121 /*
122  *  bit flags in the descriptor
123  */
124 #define OPT 1
125 #define STR 2
126 #define META 4
127 #define ALT 8
128 #define OPER 16
129 
130 static char *ccre;	/* pointer to current position in converted exp*/
131 static char *ure;	/* pointer current position in unconverted exp */
132 
133 /* re: unconverted irregular expression */
134 char *
135 convexp(char *re)
136 {
137     register char *cre;		/* pointer to converted regular expression */
138 
139     /* allocate room for the converted expression */
140     if (re == NULL)
141 	return (NULL);
142     if (*re == '\0')
143 	return (NULL);
144     cre = malloc(4 * strlen(re) + 3);
145     ccre = cre;
146     ure = re;
147 
148     /* start the conversion with a \a */
149     *cre = META | OPT;
150     MSYM(cre) = 'a';
151     ccre = MNEXT(cre);
152 
153     /* start the conversion (its recursive) */
154     expconv ();
155     *ccre = 0;
156     return (cre);
157 }
158 
159 static void
160 expconv()
161 {
162     register char *cs;		/* pointer to current symbol in converted exp */
163     register char c;		/* character being processed */
164     register char *acs;		/* pinter to last alternate */
165     register int temp;
166 
167     /* let the conversion begin */
168     acs = NULL;
169     cs = NULL;
170     while (*ure) {
171 	switch (c = *ure++) {
172 
173 	case '\\':
174 	    switch (c = *ure++) {
175 
176 	    /* escaped characters are just characters */
177 	    default:
178 		if (cs == NULL || (*cs & STR) == 0) {
179 		    cs = ccre;
180 		    *cs = STR;
181 		    SCNT(cs) = 1;
182 		    ccre += 2;
183 		} else
184 		    SCNT(cs)++;
185 		*ccre++ = c;
186 		break;
187 
188 	    /* normal(?) metacharacters */
189 	    case 'a':
190 	    case 'd':
191 	    case 'e':
192 	    case 'p':
193 		if (acs != NULL && acs != cs) {
194 		    do {
195 			temp = OCNT(acs);
196 			OCNT(acs) = ccre - acs;
197 			acs -= temp;
198 		    } while (temp != 0);
199 		    acs = NULL;
200 		}
201 		cs = ccre;
202 		*cs = META;
203 		MSYM(cs) = c;
204 		ccre = MNEXT(cs);
205 		break;
206 	    }
207 	    break;
208 
209 	/* just put the symbol in */
210 	case '^':
211 	case '$':
212 	    if (acs != NULL && acs != cs) {
213 		do {
214 		    temp = OCNT(acs);
215 		    OCNT(acs) = ccre - acs;
216 		    acs -= temp;
217 		} while (temp != 0);
218 		acs = NULL;
219 	    }
220 	    cs = ccre;
221 	    *cs = META;
222 	    MSYM(cs) = c;
223 	    ccre = MNEXT(cs);
224 	    break;
225 
226 	/* mark the last match sequence as optional */
227 	case '?':
228 	    if (cs)
229 	    	*cs = *cs | OPT;
230 	    break;
231 
232 	/* recurse and define a subexpression */
233 	case '(':
234 	    if (acs != NULL && acs != cs) {
235 		do {
236 		    temp = OCNT(acs);
237 		    OCNT(acs) = ccre - acs;
238 		    acs -= temp;
239 		} while (temp != 0);
240 		acs = NULL;
241 	    }
242 	    cs = ccre;
243 	    *cs = OPER;
244 	    OSYM(cs) = '(';
245 	    ccre = ONEXT(cs);
246 	    expconv();
247 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
248 	    break;
249 
250 	/* return from a recursion */
251 	case ')':
252 	    if (acs != NULL) {
253 		do {
254 		    temp = OCNT(acs);
255 		    OCNT(acs) = ccre - acs;
256 		    acs -= temp;
257 		} while (temp != 0);
258 		acs = NULL;
259 	    }
260 	    cs = ccre;
261 	    *cs = META;
262 	    MSYM(cs) = c;
263 	    ccre = MNEXT(cs);
264 	    return;
265 
266 	/* mark the last match sequence as having an alternate */
267 	/* the third byte will contain an offset to jump over the */
268 	/* alternate match in case the first did not fail */
269 	case '|':
270 	    if (acs != NULL && acs != cs)
271 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
272 	    else
273 		OCNT(ccre) = 0;
274 	    *cs |= ALT;
275 	    cs = ccre;
276 	    *cs = OPER;
277 	    OSYM(cs) = '|';
278 	    ccre = ONEXT(cs);
279 	    acs = cs;	/* remember that the pointer is to be filles */
280 	    break;
281 
282 	/* if its not a metasymbol just build a scharacter string */
283 	default:
284 	    if (cs == NULL || (*cs & STR) == 0) {
285 		cs = ccre;
286 		*cs = STR;
287 		SCNT(cs) = 1;
288 		ccre = SSTR(cs);
289 	    } else
290 		SCNT(cs)++;
291 	    *ccre++ = c;
292 	    break;
293 	}
294     }
295     if (acs != NULL) {
296 	do {
297 	    temp = OCNT(acs);
298 	    OCNT(acs) = ccre - acs;
299 	    acs -= temp;
300 	} while (temp != 0);
301 	acs = NULL;
302     }
303     return;
304 }
305 /* end of convertre */
306 
307 
308 /*
309  *	The following routine recognises an irregular expression
310  *	with the following special characters:
311  *
312  *		\?	-	means last match was optional
313  *		\a	-	matches any number of characters
314  *		\d	-	matches any number of spaces and tabs
315  *		\p	-	matches any number of alphanumeric
316  *				characters. The
317  *				characters matched will be copied into
318  *				the area pointed to by 'name'.
319  *		\|	-	alternation
320  *		\( \)	-	grouping used mostly for alternation and
321  *				optionality
322  *
323  *	The irregular expression must be translated to internal form
324  *	prior to calling this routine
325  *
326  *	The value returned is the pointer to the first non \a
327  *	character matched.
328  */
329 
330 /*
331  *  s: string to check for a match in
332  *  re: a converted irregular expression
333  *  mstring: where to put whatever matches a \p
334  */
335 char *
336 expmatch (register char *s, register char *re, register char *mstring)
337 {
338     register char *cs;		/* the current symbol */
339     register char *ptr,*s1;	/* temporary pointer */
340     bool matched;	/* a temporary bool */
341 
342     /* initial conditions */
343     if (re == NULL)
344 	return (NULL);
345     cs = re;
346     matched = false;
347 
348     /* loop till expression string is exhausted (or at least pretty tired) */
349     while (*cs) {
350 	switch (*cs & (OPER | STR | META)) {
351 
352 	/* try to match a string */
353 	case STR:
354 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
355 	    if (matched) {
356 
357 		/* hoorah it matches */
358 		s += SCNT(cs);
359 		cs = SNEXT(cs);
360 	    } else if (*cs & ALT) {
361 
362 		/* alternation, skip to next expression */
363 		cs = SNEXT(cs);
364 	    } else if (*cs & OPT) {
365 
366 		/* the match is optional */
367 		cs = SNEXT(cs);
368 		matched = 1;		/* indicate a successful match */
369 	    } else {
370 
371 		/* no match, error return */
372 		return (NULL);
373 	    }
374 	    break;
375 
376 	/* an operator, do something fancy */
377 	case OPER:
378 	    switch (OSYM(cs)) {
379 
380 	    /* this is an alternation */
381 	    case '|':
382 		if (matched)
383 
384 		    /* last thing in the alternation was a match, skip ahead */
385 		    cs = OPTR(cs);
386 		else
387 
388 		    /* no match, keep trying */
389 		    cs = ONEXT(cs);
390 		break;
391 
392 	    /* this is a grouping, recurse */
393 	    case '(':
394 		ptr = expmatch(s, ONEXT(cs), mstring);
395 		if (ptr != NULL) {
396 
397 		    /* the subexpression matched */
398 		    matched = 1;
399 		    s = ptr;
400 		} else if (*cs & ALT) {
401 
402 		    /* alternation, skip to next expression */
403 		    matched = 0;
404 		} else if (*cs & OPT) {
405 
406 		    /* the match is optional */
407 		    matched = 1;	/* indicate a successful match */
408 		} else {
409 
410 		    /* no match, error return */
411 		    return (NULL);
412 		}
413 		cs = OPTR(cs);
414 		break;
415 	    }
416 	    break;
417 
418 	/* try to match a metasymbol */
419 	case META:
420 	    switch (MSYM(cs)) {
421 
422 	    /* try to match anything and remember what was matched */
423 	    case 'p':
424 		/*
425 		 *  This is really the same as trying the match the
426 		 *  remaining parts of the expression to any subset
427 		 *  of the string.
428 		 */
429 		s1 = s;
430 		do {
431 		    ptr = expmatch(s1, MNEXT(cs), mstring);
432 		    if (ptr != NULL && s1 != s) {
433 
434 			/* we have a match, remember the match */
435 			strncpy (mstring, s, s1 - s);
436 			mstring[s1 - s] = '\0';
437 			return (ptr);
438 		    } else if (ptr != NULL && (*cs & OPT)) {
439 
440 			/* it was aoptional so no match is ok */
441 			return (ptr);
442 		    } else if (ptr != NULL) {
443 
444 			/* not optional and we still matched */
445 			return (NULL);
446 		    }
447 		    if (!(isalnum(*s1) || *s1 == '_' ||
448 			  /* C++ destructor */
449 			  *s1 == '~' ||
450 			  /* C++ scope operator */
451 			  (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' &&
452 			   (s1++, true))))
453 			return (NULL);
454 		    if (*s1 == '\\')
455 			_escaped = _escaped ? false : true;
456 		    else
457 			_escaped = false;
458 		} while (*s1++);
459 		return (NULL);
460 
461 	    /* try to match anything */
462 	    case 'a':
463 		/*
464 		 *  This is really the same as trying the match the
465 		 *  remaining parts of the expression to any subset
466 		 *  of the string.
467 		 */
468 		s1 = s;
469 		do {
470 		    ptr = expmatch(s1, MNEXT(cs), mstring);
471 		    if (ptr != NULL && s1 != s) {
472 
473 			/* we have a match */
474 			return (ptr);
475 		    } else if (ptr != NULL && (*cs & OPT)) {
476 
477 			/* it was aoptional so no match is ok */
478 			return (ptr);
479 		    } else if (ptr != NULL) {
480 
481 			/* not optional and we still matched */
482 			return (NULL);
483 		    }
484 		    if (*s1 == '\\')
485 			_escaped = _escaped ? false : true;
486 		    else
487 			_escaped = false;
488 		} while (*s1++);
489 		return (NULL);
490 
491 	    /* fail if we are currently _escaped */
492 	    case 'e':
493 		if (_escaped)
494 		    return(NULL);
495 		cs = MNEXT(cs);
496 		break;
497 
498 	    /* match any number of tabs and spaces */
499 	    case 'd':
500 		ptr = s;
501 		while (*s == ' ' || *s == '\t')
502 		    s++;
503 		if (s != ptr || s == s_start) {
504 
505 		    /* match, be happy */
506 		    matched = 1;
507 		    cs = MNEXT(cs);
508 		} else if (*s == '\n' || *s == '\0') {
509 
510 		    /* match, be happy */
511 		    matched = 1;
512 		    cs = MNEXT(cs);
513 		} else if (*cs & ALT) {
514 
515 		    /* try the next part */
516 		    matched = 0;
517 		    cs = MNEXT(cs);
518 		} else if (*cs & OPT) {
519 
520 		    /* doesn't matter */
521 		    matched = 1;
522 		    cs = MNEXT(cs);
523 		} else
524 
525 		    /* no match, error return */
526 		    return (NULL);
527 		break;
528 
529 	    /* check for end of line */
530 	    case '$':
531 		if (*s == '\0' || *s == '\n') {
532 
533 		    /* match, be happy */
534 		    s++;
535 		    matched = 1;
536 		    cs = MNEXT(cs);
537 		} else if (*cs & ALT) {
538 
539 		    /* try the next part */
540 		    matched = 0;
541 		    cs = MNEXT(cs);
542 		} else if (*cs & OPT) {
543 
544 		    /* doesn't matter */
545 		    matched = 1;
546 		    cs = MNEXT(cs);
547 		} else
548 
549 		    /* no match, error return */
550 		    return (NULL);
551 		break;
552 
553 	    /* check for start of line */
554 	    case '^':
555 		if (s == s_start) {
556 
557 		    /* match, be happy */
558 		    matched = 1;
559 		    cs = MNEXT(cs);
560 		} else if (*cs & ALT) {
561 
562 		    /* try the next part */
563 		    matched = 0;
564 		    cs = MNEXT(cs);
565 		} else if (*cs & OPT) {
566 
567 		    /* doesn't matter */
568 		    matched = 1;
569 		    cs = MNEXT(cs);
570 		} else
571 
572 		    /* no match, error return */
573 		    return (NULL);
574 		break;
575 
576 	    /* end of a subexpression, return success */
577 	    case ')':
578 		return (s);
579 	    }
580 	    break;
581 	}
582     }
583     return (s);
584 }
585