xref: /illumos-gate/usr/src/cmd/vgrind/regexp.c (revision dd72704bd9e794056c558153663c739e2012d721)
1 /*
2  * Copyright (c) 1980 Regents of the University of California.
3  * All rights reserved.  The Berkeley software License Agreement
4  * specifies the terms and conditions for redistribution.
5  */
6 
7 #include <ctype.h>
8 
9 typedef int	boolean;
10 #define TRUE	1
11 #define FALSE	0
12 #define NIL	0
13 
14 extern boolean	l_onecase;	/* true if upper and lower equivalent */
15 extern char	*l_idchars;	/* set of characters legal in identifiers
16 				   in addition to letters and digits */
17 
18 extern char	*strchr();
19 static void	expconv(void);
20 
21 #define isidchr(c)	\
22 		(isalnum(c) || ((c) != NIL && strchr(l_idchars, (c)) != NIL))
23 #define makelower(c)	(isupper((c)) ? tolower((c)) : (c))
24 
25 /*  STRNCMP -	like strncmp except that we convert the
26  *		first string to lower case before comparing
27  *		if l_onecase is set.
28  */
29 
30 int
31 STRNCMP(char *s1, char *s2, int len)
32 {
33 	if (l_onecase) {
34 	    do
35 		if (*s2 - makelower(*s1))
36 			return (*s2 - makelower(*s1));
37 		else {
38 			s2++;
39 			s1++;
40 		}
41 	    while (--len);
42 	} else {
43 	    do
44 		if (*s2 - *s1)
45 			return (*s2 - *s1);
46 		else {
47 			s2++;
48 			s1++;
49 		}
50 	    while (--len);
51 	}
52 	return(0);
53 }
54 
55 /*	The following routine converts an irregular expression to
56  *	internal format.
57  *
58  *	Either meta symbols (\a \d or \p) or character strings or
59  *	operations ( alternation or parenthesizing ) can be
60  *	specified.  Each starts with a descriptor byte.  The descriptor
61  *	byte has STR set for strings, META set for meta symbols
62  *	and OPER set for operations.
63  *	The descriptor byte can also have the OPT bit set if the object
64  *	defined is optional.  Also ALT can be set to indicate an alternation.
65  *
66  *	For metasymbols the byte following the descriptor byte identities
67  *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
68  *	strings the byte after the descriptor is a character count for
69  *	the string:
70  *
71  *		meta symbols := descriptor
72  *				symbol
73  *
74  *		strings :=	descriptor
75  *				character count
76  *				the string
77  *
78  *		operations :=	descriptor
79  *				symbol
80  *				character count
81  */
82 
83 /*
84  *  handy macros for accessing parts of match blocks
85  */
86 #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
87 #define MNEXT(A) (A+2)		/* character following a metasymbol block */
88 
89 #define OSYM(A) (*(A+1))	/* symbol in an operation block */
90 #define OCNT(A) (*(A+2))	/* character count */
91 #define ONEXT(A) (A+3)		/* next character after the operation */
92 #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
93 
94 #define SCNT(A) (*(A+1))	/* byte count of a string */
95 #define SSTR(A) (A+2)		/* address of the string */
96 #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
97 
98 /*
99  * bit flags in the descriptor
100  */
101 #define OPT 1
102 #define STR 2
103 #define META 4
104 #define ALT 8
105 #define OPER 16
106 
107 char *ure;		/* pointer current position in unconverted exp */
108 char *ccre;		/* pointer to current position in converted exp*/
109 char *malloc();
110 
111 char *
112 convexp(char *re)
113 	/* re - unconverted irregular expression */
114 {
115     char *cre;		/* pointer to converted regular expression */
116 
117     /* allocate room for the converted expression */
118     if (re == NIL)
119 	return (NIL);
120     if (*re == '\0')
121 	return (NIL);
122     cre = malloc (4 * strlen(re) + 3);
123     ccre = cre;
124     ure = re;
125 
126     /* start the conversion with a \a */
127     *cre = META | OPT;
128     MSYM(cre) = 'a';
129     ccre = MNEXT(cre);
130 
131     /* start the conversion (its recursive) */
132     expconv ();
133     *ccre = 0;
134     return (cre);
135 }
136 
137 static void
138 expconv(void)
139 {
140     char *cs;		/* pointer to current symbol in converted exp */
141     char c;		/* character being processed */
142     char *acs;		/* pinter to last alternate */
143     int temp;
144 
145     /* let the conversion begin */
146     acs = NIL;
147     cs = NIL;
148     while (*ure != NIL) {
149 	switch (c = *ure++) {
150 
151 	case '\\':
152 	    switch (c = *ure++) {
153 
154 	    /* escaped characters are just characters */
155 	    default:
156 		if (cs == NIL || (*cs & STR) == 0) {
157 		    cs = ccre;
158 		    *cs = STR;
159 		    SCNT(cs) = 1;
160 		    ccre += 2;
161 		} else {
162 		    SCNT(cs)++;
163 		}
164 		*ccre++ = c;
165 		break;
166 
167 	    /* normal(?) metacharacters */
168 	    case 'a':
169 	    case 'd':
170 	    case 'e':
171 	    case 'p':
172 		if (acs != NIL && acs != cs) {
173 		    do {
174 			temp = OCNT(acs);
175 			OCNT(acs) = ccre - acs;
176 			acs -= temp;
177 		    } while (temp != 0);
178 		    acs = NIL;
179 		}
180 		cs = ccre;
181 		*cs = META;
182 		MSYM(cs) = c;
183 		ccre = MNEXT(cs);
184 		break;
185 	    }
186 	    break;
187 
188 	/* just put the symbol in */
189 	case '^':
190 	case '$':
191 	    if (acs != NIL && acs != cs) {
192 		do {
193 		    temp = OCNT(acs);
194 		    OCNT(acs) = ccre - acs;
195 		    acs -= temp;
196 		} while (temp != 0);
197 		acs = NIL;
198 	    }
199 	    cs = ccre;
200 	    *cs = META;
201 	    MSYM(cs) = c;
202 	    ccre = MNEXT(cs);
203 	    break;
204 
205 	/* mark the last match sequence as optional */
206 	case '?':
207 	    if (cs)
208 		*cs = *cs | OPT;
209 	    break;
210 
211 	/* recurse and define a subexpression */
212 	case '(':
213 	    if (acs != NIL && acs != cs) {
214 		do {
215 		    temp = OCNT(acs);
216 		    OCNT(acs) = ccre - acs;
217 		    acs -= temp;
218 		} while (temp != 0);
219 		acs = NIL;
220 	    }
221 	    cs = ccre;
222 	    *cs = OPER;
223 	    OSYM(cs) = '(';
224 	    ccre = ONEXT(cs);
225 	    expconv ();
226 	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
227 	    break;
228 
229 	/* return from a recursion */
230 	case ')':
231 	    if (acs != NIL) {
232 		do {
233 		    temp = OCNT(acs);
234 		    OCNT(acs) = ccre - acs;
235 		    acs -= temp;
236 		} while (temp != 0);
237 		acs = NIL;
238 	    }
239 	    cs = ccre;
240 	    *cs = META;
241 	    MSYM(cs) = c;
242 	    ccre = MNEXT(cs);
243 	    return;
244 
245 	/* mark the last match sequence as having an alternate */
246 	/* the third byte will contain an offset to jump over the */
247 	/* alternate match in case the first did not fail */
248 	case '|':
249 	    if (acs != NIL && acs != cs)
250 		OCNT(ccre) = ccre - acs;	/* make a back pointer */
251 	    else
252 		OCNT(ccre) = 0;
253 	    *cs |= ALT;
254 	    cs = ccre;
255 	    *cs = OPER;
256 	    OSYM(cs) = '|';
257 	    ccre = ONEXT(cs);
258 	    acs = cs;	/* remember that the pointer is to be filles */
259 	    break;
260 
261 	/* if its not a metasymbol just build a scharacter string */
262 	default:
263 	    if (cs == NIL || (*cs & STR) == 0) {
264 		cs = ccre;
265 		*cs = STR;
266 		SCNT(cs) = 1;
267 		ccre = SSTR(cs);
268 	    } else
269 		SCNT(cs)++;
270 	    *ccre++ = c;
271 	    break;
272 	}
273     }
274     if (acs != NIL) {
275 	do {
276 	    temp = OCNT(acs);
277 	    OCNT(acs) = ccre - acs;
278 	    acs -= temp;
279 	} while (temp != 0);
280 	acs = NIL;
281     }
282 }
283 /* end of convertre */
284 
285 
286 /*
287  *	The following routine recognises an irregular expresion
288  *	with the following special characters:
289  *
290  *		\?	-	means last match was optional
291  *		\a	-	matches any number of characters
292  *		\d	-	matches any number of spaces and tabs
293  *		\p	-	matches any number of alphanumeric
294  *				characters. The
295  *				characters matched will be copied into
296  *				the area pointed to by 'name'.
297  *		\|	-	alternation
298  *		\( \)	-	grouping used mostly for alternation and
299  *				optionality
300  *
301  *	The irregular expression must be translated to internal form
302  *	prior to calling this routine
303  *
304  *	The value returned is the pointer to the first non \a
305  *	character matched.
306  */
307 
308 extern boolean _escaped;	/* true if we are currently _escaped */
309 extern char *Start;		/* start of string */
310 
311 char *
312 expmatch(char *s, char *re, char *mstring)
313 	/* s - string to check for a match in */
314 	/* re - a converted irregular expression */
315 	/* mstring - where to put whatever matches a \p */
316 {
317     char *cs;		/* the current symbol */
318     char *ptr, *s1;	/* temporary pointer */
319     boolean matched;	/* a temporary boolean */
320 
321     /* initial conditions */
322     if (re == NIL)
323 	return (NIL);
324     cs = re;
325     matched = FALSE;
326 
327     /* loop till expression string is exhausted (or at least pretty tired) */
328     while (*cs) {
329 	switch (*cs & (OPER | STR | META)) {
330 
331 	/* try to match a string */
332 	case STR:
333 	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
334 	    if (matched) {
335 
336 		/* hoorah it matches */
337 		s += SCNT(cs);
338 		cs = SNEXT(cs);
339 	    } else if (*cs & ALT) {
340 
341 		/* alternation, skip to next expression */
342 		cs = SNEXT(cs);
343 	    } else if (*cs & OPT) {
344 
345 		/* the match is optional */
346 		cs = SNEXT(cs);
347 		matched = 1;		/* indicate a successful match */
348 	    } else {
349 
350 		/* no match, error return */
351 		return (NIL);
352 	    }
353 	    break;
354 
355 	/* an operator, do something fancy */
356 	case OPER:
357 	    switch (OSYM(cs)) {
358 
359 	    /* this is an alternation */
360 	    case '|':
361 		if (matched)
362 
363 		    /* last thing in the alternation was a match, skip ahead */
364 		    cs = OPTR(cs);
365 		else
366 
367 		    /* no match, keep trying */
368 		    cs = ONEXT(cs);
369 		break;
370 
371 	    /* this is a grouping, recurse */
372 	    case '(':
373 		ptr = expmatch (s, ONEXT(cs), mstring);
374 		if (ptr != NIL) {
375 
376 		    /* the subexpression matched */
377 		    matched = 1;
378 		    s = ptr;
379 		} else if (*cs & ALT) {
380 
381 		    /* alternation, skip to next expression */
382 		    matched = 0;
383 		} else if (*cs & OPT) {
384 
385 		    /* the match is optional */
386 		    matched = 1;	/* indicate a successful match */
387 		} else {
388 
389 		    /* no match, error return */
390 		    return (NIL);
391 		}
392 		cs = OPTR(cs);
393 		break;
394 	    }
395 	    break;
396 
397 	/* try to match a metasymbol */
398 	case META:
399 	    switch (MSYM(cs)) {
400 
401 	    /* try to match anything and remember what was matched */
402 	    case 'p':
403 		/*
404 		 *  This is really the same as trying the match the
405 		 *  remaining parts of the expression to any subset
406 		 *  of the string.
407 		 */
408 		s1 = s;
409 		do {
410 		    ptr = expmatch (s1, MNEXT(cs), mstring);
411 		    if (ptr != NIL && s1 != s) {
412 
413 			/* we have a match, remember the match */
414 			strncpy (mstring, s, s1 - s);
415 			mstring[s1 - s] = '\0';
416 			return (ptr);
417 		    } else if (ptr != NIL && (*cs & OPT)) {
418 
419 			/* it was aoptional so no match is ok */
420 			return (ptr);
421 		    } else if (ptr != NIL) {
422 
423 			/* not optional and we still matched */
424 			return (NIL);
425 		    }
426 		    if (!isidchr(*s1))
427 			return (NIL);
428 		    if (*s1 == '\\')
429 			_escaped = _escaped ? FALSE : TRUE;
430 		    else
431 			_escaped = FALSE;
432 		} while (*s1++);
433 		return (NIL);
434 
435 	    /* try to match anything */
436 	    case 'a':
437 		/*
438 		 *  This is really the same as trying the match the
439 		 *  remaining parts of the expression to any subset
440 		 *  of the string.
441 		 */
442 		s1 = s;
443 		do {
444 		    ptr = expmatch (s1, MNEXT(cs), mstring);
445 		    if (ptr != NIL && s1 != s) {
446 
447 			/* we have a match */
448 			return (ptr);
449 		    } else if (ptr != NIL && (*cs & OPT)) {
450 
451 			/* it was aoptional so no match is ok */
452 			return (ptr);
453 		    } else if (ptr != NIL) {
454 
455 			/* not optional and we still matched */
456 			return (NIL);
457 		    }
458 		    if (*s1 == '\\')
459 			_escaped = _escaped ? FALSE : TRUE;
460 		    else
461 			_escaped = FALSE;
462 		} while (*s1++);
463 		return (NIL);
464 
465 	    /* fail if we are currently _escaped */
466 	    case 'e':
467 		if (_escaped)
468 		    return(NIL);
469 		cs = MNEXT(cs);
470 		break;
471 
472 	    /* match any number of tabs and spaces */
473 	    case 'd':
474 		ptr = s;
475 		while (*s == ' ' || *s == '\t')
476 		    s++;
477 		if (s != ptr || s == Start) {
478 
479 		    /* match, be happy */
480 		    matched = 1;
481 		    cs = MNEXT(cs);
482 		} else if (*s == '\n' || *s == '\0') {
483 
484 		    /* match, be happy */
485 		    matched = 1;
486 		    cs = MNEXT(cs);
487 		} else if (*cs & ALT) {
488 
489 		    /* try the next part */
490 		    matched = 0;
491 		    cs = MNEXT(cs);
492 		} else if (*cs & OPT) {
493 
494 		    /* doesn't matter */
495 		    matched = 1;
496 		    cs = MNEXT(cs);
497 		} else
498 
499 		    /* no match, error return */
500 		    return (NIL);
501 		break;
502 
503 	    /* check for end of line */
504 	    case '$':
505 		if (*s == '\0' || *s == '\n') {
506 
507 		    /* match, be happy */
508 		    s++;
509 		    matched = 1;
510 		    cs = MNEXT(cs);
511 		} else if (*cs & ALT) {
512 
513 		    /* try the next part */
514 		    matched = 0;
515 		    cs = MNEXT(cs);
516 		} else if (*cs & OPT) {
517 
518 		    /* doesn't matter */
519 		    matched = 1;
520 		    cs = MNEXT(cs);
521 		} else
522 
523 		    /* no match, error return */
524 		    return (NIL);
525 		break;
526 
527 	    /* check for start of line */
528 	    case '^':
529 		if (s == Start) {
530 
531 		    /* match, be happy */
532 		    matched = 1;
533 		    cs = MNEXT(cs);
534 		} else if (*cs & ALT) {
535 
536 		    /* try the next part */
537 		    matched = 0;
538 		    cs = MNEXT(cs);
539 		} else if (*cs & OPT) {
540 
541 		    /* doesn't matter */
542 		    matched = 1;
543 		    cs = MNEXT(cs);
544 		} else
545 
546 		    /* no match, error return */
547 		    return (NIL);
548 		break;
549 
550 	    /* end of a subexpression, return success */
551 	    case ')':
552 		return (s);
553 	    }
554 	    break;
555 	}
556     }
557     return (s);
558 }
559