1 /*
2 * Copyright (c) 1980 Regents of the University of California.
3 * All rights reserved. The Berkeley software License Agreement
4 * specifies the terms and conditions for redistribution.
5 */
6
7 #pragma ident "%Z%%M% %I% %E% SMI"
8
9 #include <ctype.h>
10
11 typedef int boolean;
12 #define TRUE 1
13 #define FALSE 0
14 #define NIL 0
15
16 extern boolean l_onecase; /* true if upper and lower equivalent */
17 extern char *l_idchars; /* set of characters legal in identifiers
18 in addition to letters and digits */
19
20 extern char *strchr();
21 static void expconv(void);
22
23 #define isidchr(c) \
24 (isalnum(c) || ((c) != NIL && strchr(l_idchars, (c)) != NIL))
25 #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
26
27 /* STRNCMP - like strncmp except that we convert the
28 * first string to lower case before comparing
29 * if l_onecase is set.
30 */
31
32 int
STRNCMP(char * s1,char * s2,int len)33 STRNCMP(char *s1, char *s2, int len)
34 {
35 if (l_onecase) {
36 do
37 if (*s2 - makelower(*s1))
38 return (*s2 - makelower(*s1));
39 else {
40 s2++;
41 s1++;
42 }
43 while (--len);
44 } else {
45 do
46 if (*s2 - *s1)
47 return (*s2 - *s1);
48 else {
49 s2++;
50 s1++;
51 }
52 while (--len);
53 }
54 return(0);
55 }
56
57 /* The following routine converts an irregular expression to
58 * internal format.
59 *
60 * Either meta symbols (\a \d or \p) or character strings or
61 * operations ( alternation or parenthesizing ) can be
62 * specified. Each starts with a descriptor byte. The descriptor
63 * byte has STR set for strings, META set for meta symbols
64 * and OPER set for operations.
65 * The descriptor byte can also have the OPT bit set if the object
66 * defined is optional. Also ALT can be set to indicate an alternation.
67 *
68 * For metasymbols the byte following the descriptor byte identities
69 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
70 * strings the byte after the descriptor is a character count for
71 * the string:
72 *
73 * meta symbols := descriptor
74 * symbol
75 *
76 * strings := descriptor
77 * character count
78 * the string
79 *
80 * operations := descriptor
81 * symbol
82 * character count
83 */
84
85 /*
86 * handy macros for accessing parts of match blocks
87 */
88 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
89 #define MNEXT(A) (A+2) /* character following a metasymbol block */
90
91 #define OSYM(A) (*(A+1)) /* symbol in an operation block */
92 #define OCNT(A) (*(A+2)) /* character count */
93 #define ONEXT(A) (A+3) /* next character after the operation */
94 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
95
96 #define SCNT(A) (*(A+1)) /* byte count of a string */
97 #define SSTR(A) (A+2) /* address of the string */
98 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
99
100 /*
101 * bit flags in the descriptor
102 */
103 #define OPT 1
104 #define STR 2
105 #define META 4
106 #define ALT 8
107 #define OPER 16
108
109 char *ure; /* pointer current position in unconverted exp */
110 char *ccre; /* pointer to current position in converted exp*/
111 char *malloc();
112
113 char *
convexp(char * re)114 convexp(char *re)
115 /* re - unconverted irregular expression */
116 {
117 char *cre; /* pointer to converted regular expression */
118
119 /* allocate room for the converted expression */
120 if (re == NIL)
121 return (NIL);
122 if (*re == '\0')
123 return (NIL);
124 cre = malloc (4 * strlen(re) + 3);
125 ccre = cre;
126 ure = re;
127
128 /* start the conversion with a \a */
129 *cre = META | OPT;
130 MSYM(cre) = 'a';
131 ccre = MNEXT(cre);
132
133 /* start the conversion (its recursive) */
134 expconv ();
135 *ccre = 0;
136 return (cre);
137 }
138
139 static void
expconv(void)140 expconv(void)
141 {
142 char *cs; /* pointer to current symbol in converted exp */
143 char c; /* character being processed */
144 char *acs; /* pinter to last alternate */
145 int temp;
146
147 /* let the conversion begin */
148 acs = NIL;
149 cs = NIL;
150 while (*ure != NIL) {
151 switch (c = *ure++) {
152
153 case '\\':
154 switch (c = *ure++) {
155
156 /* escaped characters are just characters */
157 default:
158 if (cs == NIL || (*cs & STR) == 0) {
159 cs = ccre;
160 *cs = STR;
161 SCNT(cs) = 1;
162 ccre += 2;
163 } else
164 SCNT(cs)++;
165 *ccre++ = c;
166 break;
167
168 /* normal(?) metacharacters */
169 case 'a':
170 case 'd':
171 case 'e':
172 case 'p':
173 if (acs != NIL && acs != cs) {
174 do {
175 temp = OCNT(acs);
176 OCNT(acs) = ccre - acs;
177 acs -= temp;
178 } while (temp != 0);
179 acs = NIL;
180 }
181 cs = ccre;
182 *cs = META;
183 MSYM(cs) = c;
184 ccre = MNEXT(cs);
185 break;
186 }
187 break;
188
189 /* just put the symbol in */
190 case '^':
191 case '$':
192 if (acs != NIL && acs != cs) {
193 do {
194 temp = OCNT(acs);
195 OCNT(acs) = ccre - acs;
196 acs -= temp;
197 } while (temp != 0);
198 acs = NIL;
199 }
200 cs = ccre;
201 *cs = META;
202 MSYM(cs) = c;
203 ccre = MNEXT(cs);
204 break;
205
206 /* mark the last match sequence as optional */
207 case '?':
208 if (cs)
209 *cs = *cs | OPT;
210 break;
211
212 /* recurse and define a subexpression */
213 case '(':
214 if (acs != NIL && acs != cs) {
215 do {
216 temp = OCNT(acs);
217 OCNT(acs) = ccre - acs;
218 acs -= temp;
219 } while (temp != 0);
220 acs = NIL;
221 }
222 cs = ccre;
223 *cs = OPER;
224 OSYM(cs) = '(';
225 ccre = ONEXT(cs);
226 expconv ();
227 OCNT(cs) = ccre - cs; /* offset to next symbol */
228 break;
229
230 /* return from a recursion */
231 case ')':
232 if (acs != NIL) {
233 do {
234 temp = OCNT(acs);
235 OCNT(acs) = ccre - acs;
236 acs -= temp;
237 } while (temp != 0);
238 acs = NIL;
239 }
240 cs = ccre;
241 *cs = META;
242 MSYM(cs) = c;
243 ccre = MNEXT(cs);
244 return;
245
246 /* mark the last match sequence as having an alternate */
247 /* the third byte will contain an offset to jump over the */
248 /* alternate match in case the first did not fail */
249 case '|':
250 if (acs != NIL && acs != cs)
251 OCNT(ccre) = ccre - acs; /* make a back pointer */
252 else
253 OCNT(ccre) = 0;
254 *cs |= ALT;
255 cs = ccre;
256 *cs = OPER;
257 OSYM(cs) = '|';
258 ccre = ONEXT(cs);
259 acs = cs; /* remember that the pointer is to be filles */
260 break;
261
262 /* if its not a metasymbol just build a scharacter string */
263 default:
264 if (cs == NIL || (*cs & STR) == 0) {
265 cs = ccre;
266 *cs = STR;
267 SCNT(cs) = 1;
268 ccre = SSTR(cs);
269 } else
270 SCNT(cs)++;
271 *ccre++ = c;
272 break;
273 }
274 }
275 if (acs != NIL) {
276 do {
277 temp = OCNT(acs);
278 OCNT(acs) = ccre - acs;
279 acs -= temp;
280 } while (temp != 0);
281 acs = NIL;
282 }
283 }
284 /* end of convertre */
285
286
287 /*
288 * The following routine recognises an irregular expresion
289 * with the following special characters:
290 *
291 * \? - means last match was optional
292 * \a - matches any number of characters
293 * \d - matches any number of spaces and tabs
294 * \p - matches any number of alphanumeric
295 * characters. The
296 * characters matched will be copied into
297 * the area pointed to by 'name'.
298 * \| - alternation
299 * \( \) - grouping used mostly for alternation and
300 * optionality
301 *
302 * The irregular expression must be translated to internal form
303 * prior to calling this routine
304 *
305 * The value returned is the pointer to the first non \a
306 * character matched.
307 */
308
309 boolean _escaped; /* true if we are currently _escaped */
310 char *Start; /* start of string */
311
312 char *
expmatch(char * s,char * re,char * mstring)313 expmatch(char *s, char *re, char *mstring)
314 /* s - string to check for a match in */
315 /* re - a converted irregular expression */
316 /* mstring - where to put whatever matches a \p */
317 {
318 char *cs; /* the current symbol */
319 char *ptr, *s1; /* temporary pointer */
320 boolean matched; /* a temporary boolean */
321
322 /* initial conditions */
323 if (re == NIL)
324 return (NIL);
325 cs = re;
326 matched = FALSE;
327
328 /* loop till expression string is exhausted (or at least pretty tired) */
329 while (*cs) {
330 switch (*cs & (OPER | STR | META)) {
331
332 /* try to match a string */
333 case STR:
334 matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
335 if (matched) {
336
337 /* hoorah it matches */
338 s += SCNT(cs);
339 cs = SNEXT(cs);
340 } else if (*cs & ALT) {
341
342 /* alternation, skip to next expression */
343 cs = SNEXT(cs);
344 } else if (*cs & OPT) {
345
346 /* the match is optional */
347 cs = SNEXT(cs);
348 matched = 1; /* indicate a successful match */
349 } else {
350
351 /* no match, error return */
352 return (NIL);
353 }
354 break;
355
356 /* an operator, do something fancy */
357 case OPER:
358 switch (OSYM(cs)) {
359
360 /* this is an alternation */
361 case '|':
362 if (matched)
363
364 /* last thing in the alternation was a match, skip ahead */
365 cs = OPTR(cs);
366 else
367
368 /* no match, keep trying */
369 cs = ONEXT(cs);
370 break;
371
372 /* this is a grouping, recurse */
373 case '(':
374 ptr = expmatch (s, ONEXT(cs), mstring);
375 if (ptr != NIL) {
376
377 /* the subexpression matched */
378 matched = 1;
379 s = ptr;
380 } else if (*cs & ALT) {
381
382 /* alternation, skip to next expression */
383 matched = 0;
384 } else if (*cs & OPT) {
385
386 /* the match is optional */
387 matched = 1; /* indicate a successful match */
388 } else {
389
390 /* no match, error return */
391 return (NIL);
392 }
393 cs = OPTR(cs);
394 break;
395 }
396 break;
397
398 /* try to match a metasymbol */
399 case META:
400 switch (MSYM(cs)) {
401
402 /* try to match anything and remember what was matched */
403 case 'p':
404 /*
405 * This is really the same as trying the match the
406 * remaining parts of the expression to any subset
407 * of the string.
408 */
409 s1 = s;
410 do {
411 ptr = expmatch (s1, MNEXT(cs), mstring);
412 if (ptr != NIL && s1 != s) {
413
414 /* we have a match, remember the match */
415 strncpy (mstring, s, s1 - s);
416 mstring[s1 - s] = '\0';
417 return (ptr);
418 } else if (ptr != NIL && (*cs & OPT)) {
419
420 /* it was aoptional so no match is ok */
421 return (ptr);
422 } else if (ptr != NIL) {
423
424 /* not optional and we still matched */
425 return (NIL);
426 }
427 if (!isidchr(*s1))
428 return (NIL);
429 if (*s1 == '\\')
430 _escaped = _escaped ? FALSE : TRUE;
431 else
432 _escaped = FALSE;
433 } while (*s1++);
434 return (NIL);
435
436 /* try to match anything */
437 case 'a':
438 /*
439 * This is really the same as trying the match the
440 * remaining parts of the expression to any subset
441 * of the string.
442 */
443 s1 = s;
444 do {
445 ptr = expmatch (s1, MNEXT(cs), mstring);
446 if (ptr != NIL && s1 != s) {
447
448 /* we have a match */
449 return (ptr);
450 } else if (ptr != NIL && (*cs & OPT)) {
451
452 /* it was aoptional so no match is ok */
453 return (ptr);
454 } else if (ptr != NIL) {
455
456 /* not optional and we still matched */
457 return (NIL);
458 }
459 if (*s1 == '\\')
460 _escaped = _escaped ? FALSE : TRUE;
461 else
462 _escaped = FALSE;
463 } while (*s1++);
464 return (NIL);
465
466 /* fail if we are currently _escaped */
467 case 'e':
468 if (_escaped)
469 return(NIL);
470 cs = MNEXT(cs);
471 break;
472
473 /* match any number of tabs and spaces */
474 case 'd':
475 ptr = s;
476 while (*s == ' ' || *s == '\t')
477 s++;
478 if (s != ptr || s == Start) {
479
480 /* match, be happy */
481 matched = 1;
482 cs = MNEXT(cs);
483 } else if (*s == '\n' || *s == '\0') {
484
485 /* match, be happy */
486 matched = 1;
487 cs = MNEXT(cs);
488 } else if (*cs & ALT) {
489
490 /* try the next part */
491 matched = 0;
492 cs = MNEXT(cs);
493 } else if (*cs & OPT) {
494
495 /* doesn't matter */
496 matched = 1;
497 cs = MNEXT(cs);
498 } else
499
500 /* no match, error return */
501 return (NIL);
502 break;
503
504 /* check for end of line */
505 case '$':
506 if (*s == '\0' || *s == '\n') {
507
508 /* match, be happy */
509 s++;
510 matched = 1;
511 cs = MNEXT(cs);
512 } else if (*cs & ALT) {
513
514 /* try the next part */
515 matched = 0;
516 cs = MNEXT(cs);
517 } else if (*cs & OPT) {
518
519 /* doesn't matter */
520 matched = 1;
521 cs = MNEXT(cs);
522 } else
523
524 /* no match, error return */
525 return (NIL);
526 break;
527
528 /* check for start of line */
529 case '^':
530 if (s == Start) {
531
532 /* match, be happy */
533 matched = 1;
534 cs = MNEXT(cs);
535 } else if (*cs & ALT) {
536
537 /* try the next part */
538 matched = 0;
539 cs = MNEXT(cs);
540 } else if (*cs & OPT) {
541
542 /* doesn't matter */
543 matched = 1;
544 cs = MNEXT(cs);
545 } else
546
547 /* no match, error return */
548 return (NIL);
549 break;
550
551 /* end of a subexpression, return success */
552 case ')':
553 return (s);
554 }
555 break;
556 }
557 }
558 return (s);
559 }
560