xref: /illumos-gate/usr/src/lib/libc/port/gen/regexpr.c (revision dd72704bd9e794056c558153663c739e2012d721)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved	*/
29 
30 /*
31  * routines to do regular expression matching
32  *
33  * Entry points:
34  *
35  *	re_comp(s)
36  *		char *s;
37  *	 ... returns 0 if the string s was compiled successfully,
38  *		     a pointer to an error message otherwise.
39  *	     If passed 0 or a null string returns without changing
40  *           the currently compiled re (see note 11 below).
41  *
42  *	re_exec(s)
43  *		char *s;
44  *	 ... returns 1 if the string s matches the last compiled regular
45  *		       expression,
46  *		     0 if the string s failed to match the last compiled
47  *		       regular expression, and
48  *		    -1 if the compiled regular expression was invalid
49  *		       (indicating an internal error).
50  *
51  * The strings passed to both re_comp and re_exec may have trailing or
52  * embedded newline characters; they are terminated by nulls.
53  *
54  * The identity of the author of these routines is lost in antiquity;
55  * this is essentially the same as the re code in the original V6 ed.
56  *
57  * The regular expressions recognized are described below. This description
58  * is essentially the same as that for ed.
59  *
60  *	A regular expression specifies a set of strings of characters.
61  *	A member of this set of strings is said to be matched by
62  *	the regular expression.  In the following specification for
63  *	regular expressions the word `character' means any character but NUL.
64  *
65  *	1.  Any character except a special character matches itself.
66  *	    Special characters are the regular expression delimiter plus
67  *	    \ [ . and sometimes ^ * $.
68  *	2.  A . matches any character.
69  *	3.  A \ followed by any character except a digit or ( )
70  *	    matches that character.
71  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
72  *	    character in (or not in) s. In s, \ has no special meaning,
73  *	    and ] may only appear as the first letter. A substring
74  *	    a-b, with a and b in ascending ASCII order, stands for
75  *	    the inclusive range of ASCII characters.
76  *	5.  A regular expression of form 1-4 followed by * matches a
77  *	    sequence of 0 or more matches of the regular expression.
78  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
79  *	    matches what x matches.
80  *	7.  A \ followed by a digit n matches a copy of the string that the
81  *	    bracketed regular expression beginning with the nth \( matched.
82  *	8.  A regular expression of form 1-8, x, followed by a regular
83  *	    expression of form 1-7, y matches a match for x followed by
84  *	    a match for y, with the x match being as long as possible
85  *	    while still permitting a y match.
86  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
87  *	    by $), is constrained to matches that begin at the left
88  *	    (or end at the right) end of a line.
89  *	10. A regular expression of form 1-9 picks out the longest among
90  *	    the leftmost matches in a line.
91  *	11. An empty regular expression stands for a copy of the last
92  *	    regular expression encountered.
93  */
94 
95 #include "lint.h"
96 
97 #include <stdlib.h>
98 #include <re_comp.h>
99 #include <stddef.h>
100 #include <sys/types.h>
101 
102 /*
103  * constants for re's
104  */
105 #define	CBRA	1
106 #define	CCHR	2
107 #define	CDOT	4
108 #define	CCL	6
109 #define	NCCL	8
110 #define	CDOL	10
111 #define	CEOF	11
112 #define	CKET	12
113 #define	CBACK	18
114 
115 #define	CSTAR	01
116 
117 #define	ESIZE	512
118 #define	NBRA	9
119 
120 static struct re_globals {
121 	char	_expbuf[ESIZE];
122 	char	*_braslist[NBRA], *_braelist[NBRA];
123 	char	_circf;
124 } *re_globals;
125 #define	expbuf (_re->_expbuf)
126 #define	braslist (_re->_braslist)
127 #define	braelist (_re->_braelist)
128 #define	circf (_re->_circf)
129 
130 static int advance(const char *, char *);
131 static int backref(int, const char *);
132 static int cclass(char *, char, int);
133 
134 /*
135  * compile the regular expression argument into a dfa
136  */
137 char *
138 re_comp(const char *sp)
139 {
140 	char	c;
141 	struct re_globals *_re = re_globals;
142 	char	*ep;
143 	char	cclcnt, numbra = 0;
144 	char	*lastep = NULL;
145 	char	bracket[NBRA];
146 	char	*bracketp = &bracket[0];
147 	char	*retoolong = "Regular expression too long";
148 
149 	if (_re == NULL) {
150 		_re = (struct re_globals *)calloc(1, sizeof (*_re));
151 		if (_re == NULL)
152 			return ("Out of memory");
153 		re_globals = _re;
154 	}
155 	ep = expbuf;
156 
157 #define	comerr(msg) {expbuf[0] = 0; return (msg); }
158 
159 	if (sp == NULL || *sp == '\0') {
160 		if (*ep == 0)
161 			return ("No previous regular expression");
162 		return (NULL);
163 	}
164 	if (*sp == '^') {
165 		circf = 1;
166 		sp++;
167 	}
168 	else
169 		circf = 0;
170 	for (;;) {
171 		if (ep >= &expbuf[ESIZE])
172 			comerr(retoolong);
173 		if ((c = *sp++) == '\0') {
174 			if (bracketp != bracket)
175 				comerr("unmatched \\(");
176 			*ep++ = CEOF;
177 			*ep++ = 0;
178 			return (NULL);
179 		}
180 		if (c != '*')
181 			lastep = ep;
182 		switch (c) {
183 
184 		case '.':
185 			*ep++ = CDOT;
186 			continue;
187 
188 		case '*':
189 			if (lastep == NULL || *lastep == CBRA ||
190 			    *lastep == CKET)
191 				goto defchar;
192 			*lastep |= CSTAR;
193 			continue;
194 
195 		case '$':
196 			if (*sp != '\0')
197 				goto defchar;
198 			*ep++ = CDOL;
199 			continue;
200 
201 		case '[':
202 			*ep++ = CCL;
203 			*ep++ = 0;
204 			cclcnt = 1;
205 			if ((c = *sp++) == '^') {
206 				c = *sp++;
207 				ep[-2] = NCCL;
208 			}
209 			do {
210 				if (c == '\0')
211 					comerr("missing ]");
212 				if (c == '-' && ep [-1] != 0) {
213 					if ((c = *sp++) == ']') {
214 						*ep++ = '-';
215 						cclcnt++;
216 						break;
217 					}
218 					while (ep[-1] < c) {
219 						*ep = ep[-1] + 1;
220 						ep++;
221 						cclcnt++;
222 						if (ep >= &expbuf[ESIZE])
223 							comerr(retoolong);
224 					}
225 				}
226 				*ep++ = c;
227 				cclcnt++;
228 				if (ep >= &expbuf[ESIZE])
229 					comerr(retoolong);
230 			} while ((c = *sp++) != ']');
231 			lastep[1] = cclcnt;
232 			continue;
233 
234 		case '\\':
235 			if ((c = *sp++) == '(') {
236 				if (numbra >= NBRA)
237 					comerr("too many \\(\\) pairs");
238 				*bracketp++ = numbra;
239 				*ep++ = CBRA;
240 				*ep++ = numbra++;
241 				continue;
242 			}
243 			if (c == ')') {
244 				if (bracketp <= bracket)
245 					comerr("unmatched \\)");
246 				*ep++ = CKET;
247 				*ep++ = *--bracketp;
248 				continue;
249 			}
250 			if (c >= '1' && c < ('1' + NBRA)) {
251 				*ep++ = CBACK;
252 				*ep++ = c - '1';
253 				continue;
254 			}
255 			*ep++ = CCHR;
256 			*ep++ = c;
257 			continue;
258 
259 		defchar:
260 		default:
261 			*ep++ = CCHR;
262 			*ep++ = c;
263 		}
264 	}
265 }
266 
267 /*
268  * match the argument string against the compiled re
269  */
270 int
271 re_exec(const char *p1)
272 {
273 	struct re_globals *_re = re_globals;
274 	char	*p2;
275 	int	c;
276 	int	rv;
277 
278 	if (_re == NULL)
279 		return (0);
280 	p2 = expbuf;
281 	for (c = 0; c < NBRA; c++) {
282 		braslist[c] = 0;
283 		braelist[c] = 0;
284 	}
285 	if (circf)
286 		return ((advance(p1, p2)));
287 	/*
288 	 * fast check for first character
289 	 */
290 	if (*p2 == CCHR) {
291 		c = p2[1];
292 		do {
293 			if (*p1 != c)
294 				continue;
295 			rv = advance(p1, p2);
296 			if (rv != 0)
297 				return (rv);
298 		} while (*p1++);
299 		return (0);
300 	}
301 	/*
302 	 * regular algorithm
303 	 */
304 	do {
305 		rv = advance(p1, p2);
306 		if (rv != 0)
307 			return (rv);
308 	} while (*p1++);
309 	return (0);
310 }
311 
312 /*
313  * try to match the next thing in the dfa
314  */
315 static int
316 advance(const char *lp, char *ep)
317 {
318 	const char	*curlp;
319 	ptrdiff_t	ct;
320 	int		i;
321 	int		rv;
322 	struct re_globals *_re = re_globals;
323 
324 	for (;;)
325 		switch (*ep++) {
326 
327 		case CCHR:
328 			if (*ep++ == *lp++)
329 				continue;
330 			return (0);
331 
332 		case CDOT:
333 			if (*lp++)
334 				continue;
335 			return (0);
336 
337 		case CDOL:
338 			if (*lp == '\0')
339 				continue;
340 			return (0);
341 
342 		case CEOF:
343 			return (1);
344 
345 		case CCL:
346 			if (cclass(ep, *lp++, 1)) {
347 				ep += *ep;
348 				continue;
349 			}
350 			return (0);
351 
352 		case NCCL:
353 			if (cclass(ep, *lp++, 0)) {
354 				ep += *ep;
355 				continue;
356 			}
357 			return (0);
358 
359 		case CBRA:
360 			braslist[*ep++] = (char *)lp;
361 			continue;
362 
363 		case CKET:
364 			braelist[*ep++] = (char *)lp;
365 			continue;
366 
367 		case CBACK:
368 			if (braelist[i = *ep++] == NULL)
369 				return (-1);
370 			if (backref(i, lp)) {
371 				lp += braelist[i] - braslist[i];
372 				continue;
373 			}
374 			return (0);
375 
376 		case CBACK|CSTAR:
377 			if (braelist[i = *ep++] == NULL)
378 				return (-1);
379 			curlp = lp;
380 			ct = braelist[i] - braslist[i];
381 			while (backref(i, lp))
382 				lp += ct;
383 			while (lp >= curlp) {
384 				rv = advance(lp, ep);
385 				if (rv != 0)
386 					return (rv);
387 				lp -= ct;
388 			}
389 			continue;
390 
391 		case CDOT|CSTAR:
392 			curlp = lp;
393 			while (*lp++)
394 				;
395 			goto star;
396 
397 		case CCHR|CSTAR:
398 			curlp = lp;
399 			while (*lp++ == *ep)
400 				;
401 			ep++;
402 			goto star;
403 
404 		case CCL|CSTAR:
405 		case NCCL|CSTAR:
406 			curlp = lp;
407 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
408 				;
409 			ep += *ep;
410 			goto star;
411 
412 		star:
413 			do {
414 				lp--;
415 				rv = advance(lp, ep);
416 				if (rv != 0)
417 					return (rv);
418 			} while (lp > curlp);
419 			return (0);
420 
421 		default:
422 			return (-1);
423 		}
424 }
425 
426 static int
427 backref(int i, const char *lp)
428 {
429 	char	*bp;
430 	struct re_globals *_re = re_globals;
431 
432 	bp = braslist[i];
433 	while (*bp++ == *lp++)
434 		if (bp >= braelist[i])
435 			return (1);
436 	return (0);
437 }
438 
439 static int
440 cclass(char *set, char c, int af)
441 {
442 	int	n;
443 
444 	if (c == 0)
445 		return (0);
446 	n = *set++;
447 	while (--n)
448 		if (*set++ == c)
449 			return (af);
450 	return (! af);
451 }
452