xref: /titanic_52/usr/src/lib/libbc/libc/gen/common/regex.c (revision 59ac0c1669407488b67ae9e273667a340dccc611)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 1987 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * routines to do regular expression matching
31  *
32  * Entry points:
33  *
34  *	re_comp(s)
35  *		char *s;
36  *	 ... returns 0 if the string s was compiled successfully,
37  *		     a pointer to an error message otherwise.
38  *	     If passed 0 or a null string returns without changing
39  *           the currently compiled re (see note 11 below).
40  *
41  *	re_exec(s)
42  *		char *s;
43  *	 ... returns 1 if the string s matches the last compiled regular
44  *		       expression,
45  *		     0 if the string s failed to match the last compiled
46  *		       regular expression, and
47  *		    -1 if the compiled regular expression was invalid
48  *		       (indicating an internal error).
49  *
50  * The strings passed to both re_comp and re_exec may have trailing or
51  * embedded newline characters; they are terminated by nulls.
52  *
53  * The identity of the author of these routines is lost in antiquity;
54  * this is essentially the same as the re code in the original V6 ed.
55  *
56  * The regular expressions recognized are described below. This description
57  * is essentially the same as that for ed.
58  *
59  *	A regular expression specifies a set of strings of characters.
60  *	A member of this set of strings is said to be matched by
61  *	the regular expression.  In the following specification for
62  *	regular expressions the word `character' means any character but NUL.
63  *
64  *	1.  Any character except a special character matches itself.
65  *	    Special characters are the regular expression delimiter plus
66  *	    \ [ . and sometimes ^ * $.
67  *	2.  A . matches any character.
68  *	3.  A \ followed by any character except a digit or ( )
69  *	    matches that character.
70  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
71  *	    character in (or not in) s. In s, \ has no special meaning,
72  *	    and ] may only appear as the first letter. A substring
73  *	    a-b, with a and b in ascending ASCII order, stands for
74  *	    the inclusive range of ASCII characters.
75  *	5.  A regular expression of form 1-4 followed by * matches a
76  *	    sequence of 0 or more matches of the regular expression.
77  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
78  *	    matches what x matches.
79  *	7.  A \ followed by a digit n matches a copy of the string that the
80  *	    bracketed regular expression beginning with the nth \( matched.
81  *	8.  A regular expression of form 1-8, x, followed by a regular
82  *	    expression of form 1-7, y matches a match for x followed by
83  *	    a match for y, with the x match being as long as possible
84  *	    while still permitting a y match.
85  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
86  *	    by $), is constrained to matches that begin at the left
87  *	    (or end at the right) end of a line.
88  *	10. A regular expression of form 1-9 picks out the longest among
89  *	    the leftmost matches in a line.
90  *	11. An empty regular expression stands for a copy of the last
91  *	    regular expression encountered.
92  */
93 
94 /*
95  * constants for re's
96  */
97 #define	CBRA	1
98 #define	CCHR	2
99 #define	CDOT	4
100 #define	CCL	6
101 #define	NCCL	8
102 #define	CDOL	10
103 #define	CEOF	11
104 #define	CKET	12
105 #define	CBACK	18
106 
107 #define	CSTAR	01
108 
109 #define	ESIZE	512
110 #define	NBRA	9
111 
112 static struct re_globals {
113 	char	_expbuf[ESIZE];
114 	char	*_braslist[NBRA], *_braelist[NBRA];
115 	char	_circf;
116 } *re_globals;
117 #define	expbuf (_re->_expbuf)
118 #define	braslist (_re->_braslist)
119 #define	braelist (_re->_braelist)
120 #define	circf (_re->_circf)
121 
122 static int	advance(char *, char *);
123 static int	backref(int, char *);
124 static int	cclass(char *, char, int);
125 
126 /*
127  * compile the regular expression argument into a dfa
128  */
129 char *
130 re_comp(char *sp)
131 {
132 	int	c;
133 	struct re_globals *_re = re_globals;
134 	char	*ep;
135 	int	cclcnt, numbra = 0;
136 	char	*lastep = 0;
137 	char	bracket[NBRA];
138 	char	*bracketp = &bracket[0];
139 	char	*retoolong = "Regular expression too long";
140 
141 	if (_re == 0) {
142 		_re = (struct re_globals *)calloc(1, sizeof (*_re));
143 		if (_re == 0)
144 			return ("Out of memory");
145 		re_globals = _re;
146 	}
147 	ep = expbuf;
148 
149 #define	comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
150 
151 	if (sp == 0 || *sp == '\0') {
152 		if (*ep == 0)
153 			return("No previous regular expression");
154 		return (0);
155 	}
156 	if (*sp == '^') {
157 		circf = 1;
158 		sp++;
159 	}
160 	else
161 		circf = 0;
162 	for (;;) {
163 		if (ep >= &expbuf[ESIZE])
164 			comerr(retoolong);
165 		if ((c = *sp++) == '\0') {
166 			if (bracketp != bracket)
167 				comerr("unmatched \\(");
168 			*ep++ = CEOF;
169 			*ep++ = 0;
170 			return (0);
171 		}
172 		if (c != '*')
173 			lastep = ep;
174 		switch (c) {
175 
176 		case '.':
177 			*ep++ = CDOT;
178 			continue;
179 
180 		case '*':
181 			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
182 				goto defchar;
183 			*lastep |= CSTAR;
184 			continue;
185 
186 		case '$':
187 			if (*sp != '\0')
188 				goto defchar;
189 			*ep++ = CDOL;
190 			continue;
191 
192 		case '[':
193 			*ep++ = CCL;
194 			*ep++ = 0;
195 			cclcnt = 1;
196 			if ((c = *sp++) == '^') {
197 				c = *sp++;
198 				ep[-2] = NCCL;
199 			}
200 			do {
201 				if (c == '\0')
202 					comerr("missing ]");
203 				if (c == '-' && ep [-1] != 0) {
204 					if ((c = *sp++) == ']') {
205 						*ep++ = '-';
206 						cclcnt++;
207 						break;
208 					}
209 					while (ep[-1] < c) {
210 						*ep = ep[-1] + 1;
211 						ep++;
212 						cclcnt++;
213 						if (ep >= &expbuf[ESIZE])
214 							comerr(retoolong);
215 					}
216 				}
217 				*ep++ = c;
218 				cclcnt++;
219 				if (ep >= &expbuf[ESIZE])
220 					comerr(retoolong);
221 			} while ((c = *sp++) != ']');
222 			lastep[1] = cclcnt;
223 			continue;
224 
225 		case '\\':
226 			if ((c = *sp++) == '(') {
227 				if (numbra >= NBRA)
228 					comerr("too many \\(\\) pairs");
229 				*bracketp++ = numbra;
230 				*ep++ = CBRA;
231 				*ep++ = numbra++;
232 				continue;
233 			}
234 			if (c == ')') {
235 				if (bracketp <= bracket)
236 					comerr("unmatched \\)");
237 				*ep++ = CKET;
238 				*ep++ = *--bracketp;
239 				continue;
240 			}
241 			if (c >= '1' && c < ('1' + NBRA)) {
242 				*ep++ = CBACK;
243 				*ep++ = c - '1';
244 				continue;
245 			}
246 			*ep++ = CCHR;
247 			*ep++ = c;
248 			continue;
249 
250 		defchar:
251 		default:
252 			*ep++ = CCHR;
253 			*ep++ = c;
254 		}
255 	}
256 }
257 
258 /*
259  * match the argument string against the compiled re
260  */
261 int
262 re_exec(char *p1)
263 {
264 	struct re_globals *_re = re_globals;
265 	char	*p2;
266 	int	c;
267 	int	rv;
268 
269 	if (_re == 0)
270 		return (0);
271 	p2 = expbuf;
272 	for (c = 0; c < NBRA; c++) {
273 		braslist[c] = 0;
274 		braelist[c] = 0;
275 	}
276 	if (circf)
277 		return((advance(p1, p2)));
278 	/*
279 	 * fast check for first character
280 	 */
281 	if (*p2 == CCHR) {
282 		c = p2[1];
283 		do {
284 			if (*p1 != c)
285 				continue;
286 			if (rv = advance(p1, p2))
287 				return(rv);
288 		} while (*p1++);
289 		return(0);
290 	}
291 	/*
292 	 * regular algorithm
293 	 */
294 	do
295 		if (rv = advance(p1, p2))
296 			return(rv);
297 	while (*p1++);
298 	return(0);
299 }
300 
301 /*
302  * try to match the next thing in the dfa
303  */
304 static	int
305 advance(char *lp, char *ep)
306 {
307 	char	*curlp;
308 	int	ct, i;
309 	int	rv;
310 	struct re_globals *_re = re_globals;
311 
312 	for (;;)
313 		switch (*ep++) {
314 
315 		case CCHR:
316 			if (*ep++ == *lp++)
317 				continue;
318 			return(0);
319 
320 		case CDOT:
321 			if (*lp++)
322 				continue;
323 			return(0);
324 
325 		case CDOL:
326 			if (*lp == '\0')
327 				continue;
328 			return(0);
329 
330 		case CEOF:
331 			return(1);
332 
333 		case CCL:
334 			if (cclass(ep, *lp++, 1)) {
335 				ep += *ep;
336 				continue;
337 			}
338 			return(0);
339 
340 		case NCCL:
341 			if (cclass(ep, *lp++, 0)) {
342 				ep += *ep;
343 				continue;
344 			}
345 			return(0);
346 
347 		case CBRA:
348 			braslist[*ep++] = lp;
349 			continue;
350 
351 		case CKET:
352 			braelist[*ep++] = lp;
353 			continue;
354 
355 		case CBACK:
356 			if (braelist[i = *ep++] == 0)
357 				return(-1);
358 			if (backref(i, lp)) {
359 				lp += braelist[i] - braslist[i];
360 				continue;
361 			}
362 			return(0);
363 
364 		case CBACK|CSTAR:
365 			if (braelist[i = *ep++] == 0)
366 				return(-1);
367 			curlp = lp;
368 			ct = braelist[i] - braslist[i];
369 			while (backref(i, lp))
370 				lp += ct;
371 			while (lp >= curlp) {
372 				if (rv = advance(lp, ep))
373 					return(rv);
374 				lp -= ct;
375 			}
376 			continue;
377 
378 		case CDOT|CSTAR:
379 			curlp = lp;
380 			while (*lp++)
381 				;
382 			goto star;
383 
384 		case CCHR|CSTAR:
385 			curlp = lp;
386 			while (*lp++ == *ep)
387 				;
388 			ep++;
389 			goto star;
390 
391 		case CCL|CSTAR:
392 		case NCCL|CSTAR:
393 			curlp = lp;
394 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
395 				;
396 			ep += *ep;
397 			goto star;
398 
399 		star:
400 			do {
401 				lp--;
402 				if (rv = advance(lp, ep))
403 					return(rv);
404 			} while (lp > curlp);
405 			return(0);
406 
407 		default:
408 			return(-1);
409 		}
410 }
411 
412 static int
413 backref(int i, char *lp)
414 {
415 	char	*bp;
416 	struct re_globals *_re = re_globals;
417 
418 	bp = braslist[i];
419 	while (*bp++ == *lp++)
420 		if (bp >= braelist[i])
421 			return (1);
422 	return (0);
423 }
424 
425 static int
426 cclass(char *set, char c, int af)
427 {
428 	int	n;
429 
430 	if (c == 0)
431 		return(0);
432 	n = *set++;
433 	while (--n)
434 		if (*set++ == c)
435 			return (af);
436 	return (!af);
437 }
438