xref: /titanic_51/usr/src/lib/libbc/libc/gen/common/regex.c (revision 1a7c1b724419d3cb5fa6eea75123c6b2060ba31b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 1987 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 	  /* from UCB 4.1 80/12/21 */
29 
30 /*
31  * routines to do regular expression matching
32  *
33  * Entry points:
34  *
35  *	re_comp(s)
36  *		char *s;
37  *	 ... returns 0 if the string s was compiled successfully,
38  *		     a pointer to an error message otherwise.
39  *	     If passed 0 or a null string returns without changing
40  *           the currently compiled re (see note 11 below).
41  *
42  *	re_exec(s)
43  *		char *s;
44  *	 ... returns 1 if the string s matches the last compiled regular
45  *		       expression,
46  *		     0 if the string s failed to match the last compiled
47  *		       regular expression, and
48  *		    -1 if the compiled regular expression was invalid
49  *		       (indicating an internal error).
50  *
51  * The strings passed to both re_comp and re_exec may have trailing or
52  * embedded newline characters; they are terminated by nulls.
53  *
54  * The identity of the author of these routines is lost in antiquity;
55  * this is essentially the same as the re code in the original V6 ed.
56  *
57  * The regular expressions recognized are described below. This description
58  * is essentially the same as that for ed.
59  *
60  *	A regular expression specifies a set of strings of characters.
61  *	A member of this set of strings is said to be matched by
62  *	the regular expression.  In the following specification for
63  *	regular expressions the word `character' means any character but NUL.
64  *
65  *	1.  Any character except a special character matches itself.
66  *	    Special characters are the regular expression delimiter plus
67  *	    \ [ . and sometimes ^ * $.
68  *	2.  A . matches any character.
69  *	3.  A \ followed by any character except a digit or ( )
70  *	    matches that character.
71  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
72  *	    character in (or not in) s. In s, \ has no special meaning,
73  *	    and ] may only appear as the first letter. A substring
74  *	    a-b, with a and b in ascending ASCII order, stands for
75  *	    the inclusive range of ASCII characters.
76  *	5.  A regular expression of form 1-4 followed by * matches a
77  *	    sequence of 0 or more matches of the regular expression.
78  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
79  *	    matches what x matches.
80  *	7.  A \ followed by a digit n matches a copy of the string that the
81  *	    bracketed regular expression beginning with the nth \( matched.
82  *	8.  A regular expression of form 1-8, x, followed by a regular
83  *	    expression of form 1-7, y matches a match for x followed by
84  *	    a match for y, with the x match being as long as possible
85  *	    while still permitting a y match.
86  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
87  *	    by $), is constrained to matches that begin at the left
88  *	    (or end at the right) end of a line.
89  *	10. A regular expression of form 1-9 picks out the longest among
90  *	    the leftmost matches in a line.
91  *	11. An empty regular expression stands for a copy of the last
92  *	    regular expression encountered.
93  */
94 
95 /*
96  * constants for re's
97  */
98 #define	CBRA	1
99 #define	CCHR	2
100 #define	CDOT	4
101 #define	CCL	6
102 #define	NCCL	8
103 #define	CDOL	10
104 #define	CEOF	11
105 #define	CKET	12
106 #define	CBACK	18
107 
108 #define	CSTAR	01
109 
110 #define	ESIZE	512
111 #define	NBRA	9
112 
113 static struct re_globals {
114 	char	_expbuf[ESIZE];
115 	char	*_braslist[NBRA], *_braelist[NBRA];
116 	char	_circf;
117 } *re_globals;
118 #define	expbuf (_re->_expbuf)
119 #define	braslist (_re->_braslist)
120 #define	braelist (_re->_braelist)
121 #define	circf (_re->_circf)
122 
123 /*
124  * compile the regular expression argument into a dfa
125  */
126 char *
127 re_comp(sp)
128 	register char	*sp;
129 {
130 	register int	c;
131 	register struct re_globals *_re = re_globals;
132 	register char	*ep;
133 	int	cclcnt, numbra = 0;
134 	char	*lastep = 0;
135 	char	bracket[NBRA];
136 	char	*bracketp = &bracket[0];
137 	char	*retoolong = "Regular expression too long";
138 
139 	if (_re == 0) {
140 		_re = (struct re_globals *)calloc(1, sizeof (*_re));
141 		if (_re == 0)
142 			return ("Out of memory");
143 		re_globals = _re;
144 	}
145 	ep = expbuf;
146 
147 #define	comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
148 
149 	if (sp == 0 || *sp == '\0') {
150 		if (*ep == 0)
151 			return("No previous regular expression");
152 		return(0);
153 	}
154 	if (*sp == '^') {
155 		circf = 1;
156 		sp++;
157 	}
158 	else
159 		circf = 0;
160 	for (;;) {
161 		if (ep >= &expbuf[ESIZE])
162 			comerr(retoolong);
163 		if ((c = *sp++) == '\0') {
164 			if (bracketp != bracket)
165 				comerr("unmatched \\(");
166 			*ep++ = CEOF;
167 			*ep++ = 0;
168 			return(0);
169 		}
170 		if (c != '*')
171 			lastep = ep;
172 		switch (c) {
173 
174 		case '.':
175 			*ep++ = CDOT;
176 			continue;
177 
178 		case '*':
179 			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
180 				goto defchar;
181 			*lastep |= CSTAR;
182 			continue;
183 
184 		case '$':
185 			if (*sp != '\0')
186 				goto defchar;
187 			*ep++ = CDOL;
188 			continue;
189 
190 		case '[':
191 			*ep++ = CCL;
192 			*ep++ = 0;
193 			cclcnt = 1;
194 			if ((c = *sp++) == '^') {
195 				c = *sp++;
196 				ep[-2] = NCCL;
197 			}
198 			do {
199 				if (c == '\0')
200 					comerr("missing ]");
201 				if (c == '-' && ep [-1] != 0) {
202 					if ((c = *sp++) == ']') {
203 						*ep++ = '-';
204 						cclcnt++;
205 						break;
206 					}
207 					while (ep[-1] < c) {
208 						*ep = ep[-1] + 1;
209 						ep++;
210 						cclcnt++;
211 						if (ep >= &expbuf[ESIZE])
212 							comerr(retoolong);
213 					}
214 				}
215 				*ep++ = c;
216 				cclcnt++;
217 				if (ep >= &expbuf[ESIZE])
218 					comerr(retoolong);
219 			} while ((c = *sp++) != ']');
220 			lastep[1] = cclcnt;
221 			continue;
222 
223 		case '\\':
224 			if ((c = *sp++) == '(') {
225 				if (numbra >= NBRA)
226 					comerr("too many \\(\\) pairs");
227 				*bracketp++ = numbra;
228 				*ep++ = CBRA;
229 				*ep++ = numbra++;
230 				continue;
231 			}
232 			if (c == ')') {
233 				if (bracketp <= bracket)
234 					comerr("unmatched \\)");
235 				*ep++ = CKET;
236 				*ep++ = *--bracketp;
237 				continue;
238 			}
239 			if (c >= '1' && c < ('1' + NBRA)) {
240 				*ep++ = CBACK;
241 				*ep++ = c - '1';
242 				continue;
243 			}
244 			*ep++ = CCHR;
245 			*ep++ = c;
246 			continue;
247 
248 		defchar:
249 		default:
250 			*ep++ = CCHR;
251 			*ep++ = c;
252 		}
253 	}
254 }
255 
256 /*
257  * match the argument string against the compiled re
258  */
259 int
260 re_exec(p1)
261 	register char	*p1;
262 {
263 	register struct re_globals *_re = re_globals;
264 	register char	*p2;
265 	register int	c;
266 	int	rv;
267 
268 	if (_re == 0)
269 		return (0);
270 	p2 = expbuf;
271 	for (c = 0; c < NBRA; c++) {
272 		braslist[c] = 0;
273 		braelist[c] = 0;
274 	}
275 	if (circf)
276 		return((advance(p1, p2)));
277 	/*
278 	 * fast check for first character
279 	 */
280 	if (*p2 == CCHR) {
281 		c = p2[1];
282 		do {
283 			if (*p1 != c)
284 				continue;
285 			if (rv = advance(p1, p2))
286 				return(rv);
287 		} while (*p1++);
288 		return(0);
289 	}
290 	/*
291 	 * regular algorithm
292 	 */
293 	do
294 		if (rv = advance(p1, p2))
295 			return(rv);
296 	while (*p1++);
297 	return(0);
298 }
299 
300 /*
301  * try to match the next thing in the dfa
302  */
303 static	int
304 advance(lp, ep)
305 	register char	*lp, *ep;
306 {
307 	register char	*curlp;
308 	int	ct, i;
309 	int	rv;
310 	register struct re_globals *_re = re_globals;
311 
312 	for (;;)
313 		switch (*ep++) {
314 
315 		case CCHR:
316 			if (*ep++ == *lp++)
317 				continue;
318 			return(0);
319 
320 		case CDOT:
321 			if (*lp++)
322 				continue;
323 			return(0);
324 
325 		case CDOL:
326 			if (*lp == '\0')
327 				continue;
328 			return(0);
329 
330 		case CEOF:
331 			return(1);
332 
333 		case CCL:
334 			if (cclass(ep, *lp++, 1)) {
335 				ep += *ep;
336 				continue;
337 			}
338 			return(0);
339 
340 		case NCCL:
341 			if (cclass(ep, *lp++, 0)) {
342 				ep += *ep;
343 				continue;
344 			}
345 			return(0);
346 
347 		case CBRA:
348 			braslist[*ep++] = lp;
349 			continue;
350 
351 		case CKET:
352 			braelist[*ep++] = lp;
353 			continue;
354 
355 		case CBACK:
356 			if (braelist[i = *ep++] == 0)
357 				return(-1);
358 			if (backref(i, lp)) {
359 				lp += braelist[i] - braslist[i];
360 				continue;
361 			}
362 			return(0);
363 
364 		case CBACK|CSTAR:
365 			if (braelist[i = *ep++] == 0)
366 				return(-1);
367 			curlp = lp;
368 			ct = braelist[i] - braslist[i];
369 			while (backref(i, lp))
370 				lp += ct;
371 			while (lp >= curlp) {
372 				if (rv = advance(lp, ep))
373 					return(rv);
374 				lp -= ct;
375 			}
376 			continue;
377 
378 		case CDOT|CSTAR:
379 			curlp = lp;
380 			while (*lp++)
381 				;
382 			goto star;
383 
384 		case CCHR|CSTAR:
385 			curlp = lp;
386 			while (*lp++ == *ep)
387 				;
388 			ep++;
389 			goto star;
390 
391 		case CCL|CSTAR:
392 		case NCCL|CSTAR:
393 			curlp = lp;
394 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
395 				;
396 			ep += *ep;
397 			goto star;
398 
399 		star:
400 			do {
401 				lp--;
402 				if (rv = advance(lp, ep))
403 					return(rv);
404 			} while (lp > curlp);
405 			return(0);
406 
407 		default:
408 			return(-1);
409 		}
410 }
411 
412 static
413 backref(i, lp)
414 	register int	i;
415 	register char	*lp;
416 {
417 	register char	*bp;
418 	register struct re_globals *_re = re_globals;
419 
420 	bp = braslist[i];
421 	while (*bp++ == *lp++)
422 		if (bp >= braelist[i])
423 			return(1);
424 	return(0);
425 }
426 
427 static int
428 cclass(set, c, af)
429 	register char	*set, c;
430 	int	af;
431 {
432 	register int	n;
433 
434 	if (c == 0)
435 		return(0);
436 	n = *set++;
437 	while (--n)
438 		if (*set++ == c)
439 			return(af);
440 	return(! af);
441 }
442