xref: /titanic_44/usr/src/cmd/vgrind/regexp.c (revision e5af7cce311c40ac0c6cf3c9d3e81b3f0a9c7c3a)
1  /*
2   * Copyright (c) 1980 Regents of the University of California.
3   * All rights reserved.  The Berkeley software License Agreement
4   * specifies the terms and conditions for redistribution.
5   */
6  
7  #pragma ident	"%Z%%M%	%I%	%E% SMI"
8  
9  #include <ctype.h>
10  
11  typedef int	boolean;
12  #define TRUE	1
13  #define FALSE	0
14  #define NIL	0
15  
16  extern boolean	l_onecase;	/* true if upper and lower equivalent */
17  extern char	*l_idchars;	/* set of characters legal in identifiers
18  				   in addition to letters and digits */
19  
20  extern char	*strchr();
21  static void	expconv(void);
22  
23  #define isidchr(c)	\
24  		(isalnum(c) || ((c) != NIL && strchr(l_idchars, (c)) != NIL))
25  #define makelower(c)	(isupper((c)) ? tolower((c)) : (c))
26  
27  /*  STRNCMP -	like strncmp except that we convert the
28   *	 	first string to lower case before comparing
29   *		if l_onecase is set.
30   */
31  
32  int
STRNCMP(char * s1,char * s2,int len)33  STRNCMP(char *s1, char *s2, int len)
34  {
35  	if (l_onecase) {
36  	    do
37  		if (*s2 - makelower(*s1))
38  			return (*s2 - makelower(*s1));
39  		else {
40  			s2++;
41  			s1++;
42  		}
43  	    while (--len);
44  	} else {
45  	    do
46  		if (*s2 - *s1)
47  			return (*s2 - *s1);
48  		else {
49  			s2++;
50  			s1++;
51  		}
52  	    while (--len);
53  	}
54  	return(0);
55  }
56  
57  /*	The following routine converts an irregular expression to
58   *	internal format.
59   *
60   *	Either meta symbols (\a \d or \p) or character strings or
61   *	operations ( alternation or parenthesizing ) can be
62   *	specified.  Each starts with a descriptor byte.  The descriptor
63   *	byte has STR set for strings, META set for meta symbols
64   *	and OPER set for operations.
65   *	The descriptor byte can also have the OPT bit set if the object
66   *	defined is optional.  Also ALT can be set to indicate an alternation.
67   *
68   *	For metasymbols the byte following the descriptor byte identities
69   *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
70   *	strings the byte after the descriptor is a character count for
71   *	the string:
72   *
73   *		meta symbols := descriptor
74   *				symbol
75   *
76   *		strings :=	descriptor
77   *				character count
78   *				the string
79   *
80   *		operations :=	descriptor
81   *				symbol
82   *				character count
83   */
84  
85  /*
86   *  handy macros for accessing parts of match blocks
87   */
88  #define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
89  #define MNEXT(A) (A+2)		/* character following a metasymbol block */
90  
91  #define OSYM(A) (*(A+1))	/* symbol in an operation block */
92  #define OCNT(A) (*(A+2))	/* character count */
93  #define ONEXT(A) (A+3)		/* next character after the operation */
94  #define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
95  
96  #define SCNT(A) (*(A+1))	/* byte count of a string */
97  #define SSTR(A) (A+2)		/* address of the string */
98  #define SNEXT(A) (A+2+*(A+1))	/* character following the string */
99  
100  /*
101   *  bit flags in the descriptor
102   */
103  #define OPT 1
104  #define STR 2
105  #define META 4
106  #define ALT 8
107  #define OPER 16
108  
109  char *ure;		/* pointer current position in unconverted exp */
110  char *ccre;		/* pointer to current position in converted exp*/
111  char *malloc();
112  
113  char *
convexp(char * re)114  convexp(char *re)
115  	/* re - unconverted irregular expression */
116  {
117      char *cre;		/* pointer to converted regular expression */
118  
119      /* allocate room for the converted expression */
120      if (re == NIL)
121  	return (NIL);
122      if (*re == '\0')
123  	return (NIL);
124      cre = malloc (4 * strlen(re) + 3);
125      ccre = cre;
126      ure = re;
127  
128      /* start the conversion with a \a */
129      *cre = META | OPT;
130      MSYM(cre) = 'a';
131      ccre = MNEXT(cre);
132  
133      /* start the conversion (its recursive) */
134      expconv ();
135      *ccre = 0;
136      return (cre);
137  }
138  
139  static void
expconv(void)140  expconv(void)
141  {
142      char *cs;		/* pointer to current symbol in converted exp */
143      char c;		/* character being processed */
144      char *acs;		/* pinter to last alternate */
145      int temp;
146  
147      /* let the conversion begin */
148      acs = NIL;
149      cs = NIL;
150      while (*ure != NIL) {
151  	switch (c = *ure++) {
152  
153  	case '\\':
154  	    switch (c = *ure++) {
155  
156  	    /* escaped characters are just characters */
157  	    default:
158  		if (cs == NIL || (*cs & STR) == 0) {
159  		    cs = ccre;
160  		    *cs = STR;
161  		    SCNT(cs) = 1;
162  		    ccre += 2;
163  		} else
164  		    SCNT(cs)++;
165  		*ccre++ = c;
166  		break;
167  
168  	    /* normal(?) metacharacters */
169  	    case 'a':
170  	    case 'd':
171  	    case 'e':
172  	    case 'p':
173  		if (acs != NIL && acs != cs) {
174  		    do {
175  			temp = OCNT(acs);
176  			OCNT(acs) = ccre - acs;
177  			acs -= temp;
178  		    } while (temp != 0);
179  		    acs = NIL;
180  		}
181  		cs = ccre;
182  		*cs = META;
183  		MSYM(cs) = c;
184  		ccre = MNEXT(cs);
185  		break;
186  	    }
187  	    break;
188  
189  	/* just put the symbol in */
190  	case '^':
191  	case '$':
192  	    if (acs != NIL && acs != cs) {
193  		do {
194  		    temp = OCNT(acs);
195  		    OCNT(acs) = ccre - acs;
196  		    acs -= temp;
197  		} while (temp != 0);
198  		acs = NIL;
199  	    }
200  	    cs = ccre;
201  	    *cs = META;
202  	    MSYM(cs) = c;
203  	    ccre = MNEXT(cs);
204  	    break;
205  
206  	/* mark the last match sequence as optional */
207  	case '?':
208  	    if (cs)
209  	    	*cs = *cs | OPT;
210  	    break;
211  
212  	/* recurse and define a subexpression */
213  	case '(':
214  	    if (acs != NIL && acs != cs) {
215  		do {
216  		    temp = OCNT(acs);
217  		    OCNT(acs) = ccre - acs;
218  		    acs -= temp;
219  		} while (temp != 0);
220  		acs = NIL;
221  	    }
222  	    cs = ccre;
223  	    *cs = OPER;
224  	    OSYM(cs) = '(';
225  	    ccre = ONEXT(cs);
226  	    expconv ();
227  	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
228  	    break;
229  
230  	/* return from a recursion */
231  	case ')':
232  	    if (acs != NIL) {
233  		do {
234  		    temp = OCNT(acs);
235  		    OCNT(acs) = ccre - acs;
236  		    acs -= temp;
237  		} while (temp != 0);
238  		acs = NIL;
239  	    }
240  	    cs = ccre;
241  	    *cs = META;
242  	    MSYM(cs) = c;
243  	    ccre = MNEXT(cs);
244  	    return;
245  
246  	/* mark the last match sequence as having an alternate */
247  	/* the third byte will contain an offset to jump over the */
248  	/* alternate match in case the first did not fail */
249  	case '|':
250  	    if (acs != NIL && acs != cs)
251  		OCNT(ccre) = ccre - acs;	/* make a back pointer */
252  	    else
253  		OCNT(ccre) = 0;
254  	    *cs |= ALT;
255  	    cs = ccre;
256  	    *cs = OPER;
257  	    OSYM(cs) = '|';
258  	    ccre = ONEXT(cs);
259  	    acs = cs;	/* remember that the pointer is to be filles */
260  	    break;
261  
262  	/* if its not a metasymbol just build a scharacter string */
263  	default:
264  	    if (cs == NIL || (*cs & STR) == 0) {
265  		cs = ccre;
266  		*cs = STR;
267  		SCNT(cs) = 1;
268  		ccre = SSTR(cs);
269  	    } else
270  		SCNT(cs)++;
271  	    *ccre++ = c;
272  	    break;
273  	}
274      }
275      if (acs != NIL) {
276  	do {
277  	    temp = OCNT(acs);
278  	    OCNT(acs) = ccre - acs;
279  	    acs -= temp;
280  	} while (temp != 0);
281  	acs = NIL;
282      }
283  }
284  /* end of convertre */
285  
286  
287  /*
288   *	The following routine recognises an irregular expresion
289   *	with the following special characters:
290   *
291   *		\?	-	means last match was optional
292   *		\a	-	matches any number of characters
293   *		\d	-	matches any number of spaces and tabs
294   *		\p	-	matches any number of alphanumeric
295   *				characters. The
296   *				characters matched will be copied into
297   *				the area pointed to by 'name'.
298   *		\|	-	alternation
299   *		\( \)	-	grouping used mostly for alternation and
300   *				optionality
301   *
302   *	The irregular expression must be translated to internal form
303   *	prior to calling this routine
304   *
305   *	The value returned is the pointer to the first non \a
306   *	character matched.
307   */
308  
309  boolean _escaped;		/* true if we are currently _escaped */
310  char *Start;			/* start of string */
311  
312  char *
expmatch(char * s,char * re,char * mstring)313  expmatch(char *s, char *re, char *mstring)
314  	/* s - string to check for a match in */
315  	/* re - a converted irregular expression */
316  	/* mstring - where to put whatever matches a \p */
317  {
318      char *cs;		/* the current symbol */
319      char *ptr, *s1;	/* temporary pointer */
320      boolean matched;	/* a temporary boolean */
321  
322      /* initial conditions */
323      if (re == NIL)
324  	return (NIL);
325      cs = re;
326      matched = FALSE;
327  
328      /* loop till expression string is exhausted (or at least pretty tired) */
329      while (*cs) {
330  	switch (*cs & (OPER | STR | META)) {
331  
332  	/* try to match a string */
333  	case STR:
334  	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
335  	    if (matched) {
336  
337  		/* hoorah it matches */
338  		s += SCNT(cs);
339  		cs = SNEXT(cs);
340  	    } else if (*cs & ALT) {
341  
342  		/* alternation, skip to next expression */
343  		cs = SNEXT(cs);
344  	    } else if (*cs & OPT) {
345  
346  		/* the match is optional */
347  		cs = SNEXT(cs);
348  		matched = 1;		/* indicate a successful match */
349  	    } else {
350  
351  		/* no match, error return */
352  		return (NIL);
353  	    }
354  	    break;
355  
356  	/* an operator, do something fancy */
357  	case OPER:
358  	    switch (OSYM(cs)) {
359  
360  	    /* this is an alternation */
361  	    case '|':
362  		if (matched)
363  
364  		    /* last thing in the alternation was a match, skip ahead */
365  		    cs = OPTR(cs);
366  		else
367  
368  		    /* no match, keep trying */
369  		    cs = ONEXT(cs);
370  		break;
371  
372  	    /* this is a grouping, recurse */
373  	    case '(':
374  		ptr = expmatch (s, ONEXT(cs), mstring);
375  		if (ptr != NIL) {
376  
377  		    /* the subexpression matched */
378  		    matched = 1;
379  		    s = ptr;
380  		} else if (*cs & ALT) {
381  
382  		    /* alternation, skip to next expression */
383  		    matched = 0;
384  		} else if (*cs & OPT) {
385  
386  		    /* the match is optional */
387  		    matched = 1;	/* indicate a successful match */
388  		} else {
389  
390  		    /* no match, error return */
391  		    return (NIL);
392  		}
393  		cs = OPTR(cs);
394  		break;
395  	    }
396  	    break;
397  
398  	/* try to match a metasymbol */
399  	case META:
400  	    switch (MSYM(cs)) {
401  
402  	    /* try to match anything and remember what was matched */
403  	    case 'p':
404  		/*
405  		 *  This is really the same as trying the match the
406  		 *  remaining parts of the expression to any subset
407  		 *  of the string.
408  		 */
409  		s1 = s;
410  		do {
411  		    ptr = expmatch (s1, MNEXT(cs), mstring);
412  		    if (ptr != NIL && s1 != s) {
413  
414  			/* we have a match, remember the match */
415  			strncpy (mstring, s, s1 - s);
416  			mstring[s1 - s] = '\0';
417  			return (ptr);
418  		    } else if (ptr != NIL && (*cs & OPT)) {
419  
420  			/* it was aoptional so no match is ok */
421  			return (ptr);
422  		    } else if (ptr != NIL) {
423  
424  			/* not optional and we still matched */
425  			return (NIL);
426  		    }
427  		    if (!isidchr(*s1))
428  			return (NIL);
429  		    if (*s1 == '\\')
430  			_escaped = _escaped ? FALSE : TRUE;
431  		    else
432  			_escaped = FALSE;
433  		} while (*s1++);
434  		return (NIL);
435  
436  	    /* try to match anything */
437  	    case 'a':
438  		/*
439  		 *  This is really the same as trying the match the
440  		 *  remaining parts of the expression to any subset
441  		 *  of the string.
442  		 */
443  		s1 = s;
444  		do {
445  		    ptr = expmatch (s1, MNEXT(cs), mstring);
446  		    if (ptr != NIL && s1 != s) {
447  
448  			/* we have a match */
449  			return (ptr);
450  		    } else if (ptr != NIL && (*cs & OPT)) {
451  
452  			/* it was aoptional so no match is ok */
453  			return (ptr);
454  		    } else if (ptr != NIL) {
455  
456  			/* not optional and we still matched */
457  			return (NIL);
458  		    }
459  		    if (*s1 == '\\')
460  			_escaped = _escaped ? FALSE : TRUE;
461  		    else
462  			_escaped = FALSE;
463  		} while (*s1++);
464  		return (NIL);
465  
466  	    /* fail if we are currently _escaped */
467  	    case 'e':
468  		if (_escaped)
469  		    return(NIL);
470  		cs = MNEXT(cs);
471  		break;
472  
473  	    /* match any number of tabs and spaces */
474  	    case 'd':
475  		ptr = s;
476  		while (*s == ' ' || *s == '\t')
477  		    s++;
478  		if (s != ptr || s == Start) {
479  
480  		    /* match, be happy */
481  		    matched = 1;
482  		    cs = MNEXT(cs);
483  		} else if (*s == '\n' || *s == '\0') {
484  
485  		    /* match, be happy */
486  		    matched = 1;
487  		    cs = MNEXT(cs);
488  		} else if (*cs & ALT) {
489  
490  		    /* try the next part */
491  		    matched = 0;
492  		    cs = MNEXT(cs);
493  		} else if (*cs & OPT) {
494  
495  		    /* doesn't matter */
496  		    matched = 1;
497  		    cs = MNEXT(cs);
498  		} else
499  
500  		    /* no match, error return */
501  		    return (NIL);
502  		break;
503  
504  	    /* check for end of line */
505  	    case '$':
506  		if (*s == '\0' || *s == '\n') {
507  
508  		    /* match, be happy */
509  		    s++;
510  		    matched = 1;
511  		    cs = MNEXT(cs);
512  		} else if (*cs & ALT) {
513  
514  		    /* try the next part */
515  		    matched = 0;
516  		    cs = MNEXT(cs);
517  		} else if (*cs & OPT) {
518  
519  		    /* doesn't matter */
520  		    matched = 1;
521  		    cs = MNEXT(cs);
522  		} else
523  
524  		    /* no match, error return */
525  		    return (NIL);
526  		break;
527  
528  	    /* check for start of line */
529  	    case '^':
530  		if (s == Start) {
531  
532  		    /* match, be happy */
533  		    matched = 1;
534  		    cs = MNEXT(cs);
535  		} else if (*cs & ALT) {
536  
537  		    /* try the next part */
538  		    matched = 0;
539  		    cs = MNEXT(cs);
540  		} else if (*cs & OPT) {
541  
542  		    /* doesn't matter */
543  		    matched = 1;
544  		    cs = MNEXT(cs);
545  		} else
546  
547  		    /* no match, error return */
548  		    return (NIL);
549  		break;
550  
551  	    /* end of a subexpression, return success */
552  	    case ')':
553  		return (s);
554  	    }
555  	    break;
556  	}
557      }
558      return (s);
559  }
560