xref: /illumos-gate/usr/src/lib/libc/port/gen/regexpr.c (revision 2aeafac3612e19716bf8164f89c3c9196342979c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 /*
33  * routines to do regular expression matching
34  *
35  * Entry points:
36  *
37  *	re_comp(s)
38  *		char *s;
39  *	 ... returns 0 if the string s was compiled successfully,
40  *		     a pointer to an error message otherwise.
41  *	     If passed 0 or a null string returns without changing
42  *           the currently compiled re (see note 11 below).
43  *
44  *	re_exec(s)
45  *		char *s;
46  *	 ... returns 1 if the string s matches the last compiled regular
47  *		       expression,
48  *		     0 if the string s failed to match the last compiled
49  *		       regular expression, and
50  *		    -1 if the compiled regular expression was invalid
51  *		       (indicating an internal error).
52  *
53  * The strings passed to both re_comp and re_exec may have trailing or
54  * embedded newline characters; they are terminated by nulls.
55  *
56  * The identity of the author of these routines is lost in antiquity;
57  * this is essentially the same as the re code in the original V6 ed.
58  *
59  * The regular expressions recognized are described below. This description
60  * is essentially the same as that for ed.
61  *
62  *	A regular expression specifies a set of strings of characters.
63  *	A member of this set of strings is said to be matched by
64  *	the regular expression.  In the following specification for
65  *	regular expressions the word `character' means any character but NUL.
66  *
67  *	1.  Any character except a special character matches itself.
68  *	    Special characters are the regular expression delimiter plus
69  *	    \ [ . and sometimes ^ * $.
70  *	2.  A . matches any character.
71  *	3.  A \ followed by any character except a digit or ( )
72  *	    matches that character.
73  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
74  *	    character in (or not in) s. In s, \ has no special meaning,
75  *	    and ] may only appear as the first letter. A substring
76  *	    a-b, with a and b in ascending ASCII order, stands for
77  *	    the inclusive range of ASCII characters.
78  *	5.  A regular expression of form 1-4 followed by * matches a
79  *	    sequence of 0 or more matches of the regular expression.
80  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
81  *	    matches what x matches.
82  *	7.  A \ followed by a digit n matches a copy of the string that the
83  *	    bracketed regular expression beginning with the nth \( matched.
84  *	8.  A regular expression of form 1-8, x, followed by a regular
85  *	    expression of form 1-7, y matches a match for x followed by
86  *	    a match for y, with the x match being as long as possible
87  *	    while still permitting a y match.
88  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
89  *	    by $), is constrained to matches that begin at the left
90  *	    (or end at the right) end of a line.
91  *	10. A regular expression of form 1-9 picks out the longest among
92  *	    the leftmost matches in a line.
93  *	11. An empty regular expression stands for a copy of the last
94  *	    regular expression encountered.
95  */
96 
97 #include "lint.h"
98 
99 #include <stdlib.h>
100 #include <re_comp.h>
101 #include <stddef.h>
102 #include <sys/types.h>
103 
104 /*
105  * constants for re's
106  */
107 #define	CBRA	1
108 #define	CCHR	2
109 #define	CDOT	4
110 #define	CCL	6
111 #define	NCCL	8
112 #define	CDOL	10
113 #define	CEOF	11
114 #define	CKET	12
115 #define	CBACK	18
116 
117 #define	CSTAR	01
118 
119 #define	ESIZE	512
120 #define	NBRA	9
121 
122 static struct re_globals {
123 	char	_expbuf[ESIZE];
124 	char	*_braslist[NBRA], *_braelist[NBRA];
125 	char	_circf;
126 } *re_globals;
127 #define	expbuf (_re->_expbuf)
128 #define	braslist (_re->_braslist)
129 #define	braelist (_re->_braelist)
130 #define	circf (_re->_circf)
131 
132 static int advance(const char *, char *);
133 static int backref(int, const char *);
134 static int cclass(char *, char, int);
135 
136 /*
137  * compile the regular expression argument into a dfa
138  */
139 char *
140 re_comp(const char *sp)
141 {
142 	char	c;
143 	struct re_globals *_re = re_globals;
144 	char	*ep;
145 	char	cclcnt, numbra = 0;
146 	char	*lastep = NULL;
147 	char	bracket[NBRA];
148 	char	*bracketp = &bracket[0];
149 	char	*retoolong = "Regular expression too long";
150 
151 	if (_re == NULL) {
152 		_re = (struct re_globals *)calloc(1, sizeof (*_re));
153 		if (_re == NULL)
154 			return ("Out of memory");
155 		re_globals = _re;
156 	}
157 	ep = expbuf;
158 
159 #define	comerr(msg) {expbuf[0] = 0; return (msg); }
160 
161 	if (sp == NULL || *sp == '\0') {
162 		if (*ep == 0)
163 			return ("No previous regular expression");
164 		return (NULL);
165 	}
166 	if (*sp == '^') {
167 		circf = 1;
168 		sp++;
169 	}
170 	else
171 		circf = 0;
172 	for (;;) {
173 		if (ep >= &expbuf[ESIZE])
174 			comerr(retoolong);
175 		if ((c = *sp++) == '\0') {
176 			if (bracketp != bracket)
177 				comerr("unmatched \\(");
178 			*ep++ = CEOF;
179 			*ep++ = 0;
180 			return (NULL);
181 		}
182 		if (c != '*')
183 			lastep = ep;
184 		switch (c) {
185 
186 		case '.':
187 			*ep++ = CDOT;
188 			continue;
189 
190 		case '*':
191 			if (lastep == NULL || *lastep == CBRA ||
192 			    *lastep == CKET)
193 				goto defchar;
194 			*lastep |= CSTAR;
195 			continue;
196 
197 		case '$':
198 			if (*sp != '\0')
199 				goto defchar;
200 			*ep++ = CDOL;
201 			continue;
202 
203 		case '[':
204 			*ep++ = CCL;
205 			*ep++ = 0;
206 			cclcnt = 1;
207 			if ((c = *sp++) == '^') {
208 				c = *sp++;
209 				ep[-2] = NCCL;
210 			}
211 			do {
212 				if (c == '\0')
213 					comerr("missing ]");
214 				if (c == '-' && ep [-1] != 0) {
215 					if ((c = *sp++) == ']') {
216 						*ep++ = '-';
217 						cclcnt++;
218 						break;
219 					}
220 					while (ep[-1] < c) {
221 						*ep = ep[-1] + 1;
222 						ep++;
223 						cclcnt++;
224 						if (ep >= &expbuf[ESIZE])
225 							comerr(retoolong);
226 					}
227 				}
228 				*ep++ = c;
229 				cclcnt++;
230 				if (ep >= &expbuf[ESIZE])
231 					comerr(retoolong);
232 			} while ((c = *sp++) != ']');
233 			lastep[1] = cclcnt;
234 			continue;
235 
236 		case '\\':
237 			if ((c = *sp++) == '(') {
238 				if (numbra >= NBRA)
239 					comerr("too many \\(\\) pairs");
240 				*bracketp++ = numbra;
241 				*ep++ = CBRA;
242 				*ep++ = numbra++;
243 				continue;
244 			}
245 			if (c == ')') {
246 				if (bracketp <= bracket)
247 					comerr("unmatched \\)");
248 				*ep++ = CKET;
249 				*ep++ = *--bracketp;
250 				continue;
251 			}
252 			if (c >= '1' && c < ('1' + NBRA)) {
253 				*ep++ = CBACK;
254 				*ep++ = c - '1';
255 				continue;
256 			}
257 			*ep++ = CCHR;
258 			*ep++ = c;
259 			continue;
260 
261 		defchar:
262 		default:
263 			*ep++ = CCHR;
264 			*ep++ = c;
265 		}
266 	}
267 }
268 
269 /*
270  * match the argument string against the compiled re
271  */
272 int
273 re_exec(const char *p1)
274 {
275 	struct re_globals *_re = re_globals;
276 	char	*p2;
277 	int	c;
278 	int	rv;
279 
280 	if (_re == NULL)
281 		return (0);
282 	p2 = expbuf;
283 	for (c = 0; c < NBRA; c++) {
284 		braslist[c] = 0;
285 		braelist[c] = 0;
286 	}
287 	if (circf)
288 		return ((advance(p1, p2)));
289 	/*
290 	 * fast check for first character
291 	 */
292 	if (*p2 == CCHR) {
293 		c = p2[1];
294 		do {
295 			if (*p1 != c)
296 				continue;
297 			if (rv = advance(p1, p2))
298 				return (rv);
299 		} while (*p1++);
300 		return (0);
301 	}
302 	/*
303 	 * regular algorithm
304 	 */
305 	do {
306 		if (rv = advance(p1, p2))
307 			return (rv);
308 	} while (*p1++);
309 	return (0);
310 }
311 
312 /*
313  * try to match the next thing in the dfa
314  */
315 static int
316 advance(const char *lp, char *ep)
317 {
318 	const char	*curlp;
319 	ptrdiff_t	ct;
320 	int		i;
321 	int		rv;
322 	struct re_globals *_re = re_globals;
323 
324 	for (;;)
325 		switch (*ep++) {
326 
327 		case CCHR:
328 			if (*ep++ == *lp++)
329 				continue;
330 			return (0);
331 
332 		case CDOT:
333 			if (*lp++)
334 				continue;
335 			return (0);
336 
337 		case CDOL:
338 			if (*lp == '\0')
339 				continue;
340 			return (0);
341 
342 		case CEOF:
343 			return (1);
344 
345 		case CCL:
346 			if (cclass(ep, *lp++, 1)) {
347 				ep += *ep;
348 				continue;
349 			}
350 			return (0);
351 
352 		case NCCL:
353 			if (cclass(ep, *lp++, 0)) {
354 				ep += *ep;
355 				continue;
356 			}
357 			return (0);
358 
359 		case CBRA:
360 			braslist[*ep++] = (char *)lp;
361 			continue;
362 
363 		case CKET:
364 			braelist[*ep++] = (char *)lp;
365 			continue;
366 
367 		case CBACK:
368 			if (braelist[i = *ep++] == NULL)
369 				return (-1);
370 			if (backref(i, lp)) {
371 				lp += braelist[i] - braslist[i];
372 				continue;
373 			}
374 			return (0);
375 
376 		case CBACK|CSTAR:
377 			if (braelist[i = *ep++] == NULL)
378 				return (-1);
379 			curlp = lp;
380 			ct = braelist[i] - braslist[i];
381 			while (backref(i, lp))
382 				lp += ct;
383 			while (lp >= curlp) {
384 				if (rv = advance(lp, ep))
385 					return (rv);
386 				lp -= ct;
387 			}
388 			continue;
389 
390 		case CDOT|CSTAR:
391 			curlp = lp;
392 			while (*lp++)
393 				;
394 			goto star;
395 
396 		case CCHR|CSTAR:
397 			curlp = lp;
398 			while (*lp++ == *ep)
399 				;
400 			ep++;
401 			goto star;
402 
403 		case CCL|CSTAR:
404 		case NCCL|CSTAR:
405 			curlp = lp;
406 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
407 				;
408 			ep += *ep;
409 			goto star;
410 
411 		star:
412 			do {
413 				lp--;
414 				if (rv = advance(lp, ep))
415 					return (rv);
416 			} while (lp > curlp);
417 			return (0);
418 
419 		default:
420 			return (-1);
421 		}
422 }
423 
424 static int
425 backref(int i, const char *lp)
426 {
427 	char	*bp;
428 	struct re_globals *_re = re_globals;
429 
430 	bp = braslist[i];
431 	while (*bp++ == *lp++)
432 		if (bp >= braelist[i])
433 			return (1);
434 	return (0);
435 }
436 
437 static int
438 cclass(char *set, char c, int af)
439 {
440 	int	n;
441 
442 	if (c == 0)
443 		return (0);
444 	n = *set++;
445 	while (--n)
446 		if (*set++ == c)
447 			return (af);
448 	return (! af);
449 }
450