xref: /illumos-gate/usr/src/lib/libc/port/gen/regexpr.c (revision 4de2612967d06c4fdbf524a62556a1e8118a006f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
30 /*	  All Rights Reserved  	*/
31 
32 
33 /*
34  * routines to do regular expression matching
35  *
36  * Entry points:
37  *
38  *	re_comp(s)
39  *		char *s;
40  *	 ... returns 0 if the string s was compiled successfully,
41  *		     a pointer to an error message otherwise.
42  *	     If passed 0 or a null string returns without changing
43  *           the currently compiled re (see note 11 below).
44  *
45  *	re_exec(s)
46  *		char *s;
47  *	 ... returns 1 if the string s matches the last compiled regular
48  *		       expression,
49  *		     0 if the string s failed to match the last compiled
50  *		       regular expression, and
51  *		    -1 if the compiled regular expression was invalid
52  *		       (indicating an internal error).
53  *
54  * The strings passed to both re_comp and re_exec may have trailing or
55  * embedded newline characters; they are terminated by nulls.
56  *
57  * The identity of the author of these routines is lost in antiquity;
58  * this is essentially the same as the re code in the original V6 ed.
59  *
60  * The regular expressions recognized are described below. This description
61  * is essentially the same as that for ed.
62  *
63  *	A regular expression specifies a set of strings of characters.
64  *	A member of this set of strings is said to be matched by
65  *	the regular expression.  In the following specification for
66  *	regular expressions the word `character' means any character but NUL.
67  *
68  *	1.  Any character except a special character matches itself.
69  *	    Special characters are the regular expression delimiter plus
70  *	    \ [ . and sometimes ^ * $.
71  *	2.  A . matches any character.
72  *	3.  A \ followed by any character except a digit or ( )
73  *	    matches that character.
74  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
75  *	    character in (or not in) s. In s, \ has no special meaning,
76  *	    and ] may only appear as the first letter. A substring
77  *	    a-b, with a and b in ascending ASCII order, stands for
78  *	    the inclusive range of ASCII characters.
79  *	5.  A regular expression of form 1-4 followed by * matches a
80  *	    sequence of 0 or more matches of the regular expression.
81  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
82  *	    matches what x matches.
83  *	7.  A \ followed by a digit n matches a copy of the string that the
84  *	    bracketed regular expression beginning with the nth \( matched.
85  *	8.  A regular expression of form 1-8, x, followed by a regular
86  *	    expression of form 1-7, y matches a match for x followed by
87  *	    a match for y, with the x match being as long as possible
88  *	    while still permitting a y match.
89  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
90  *	    by $), is constrained to matches that begin at the left
91  *	    (or end at the right) end of a line.
92  *	10. A regular expression of form 1-9 picks out the longest among
93  *	    the leftmost matches in a line.
94  *	11. An empty regular expression stands for a copy of the last
95  *	    regular expression encountered.
96  */
97 
98 #include "synonyms.h"
99 
100 #include <stdlib.h>
101 #include <re_comp.h>
102 #include <stddef.h>
103 #include <sys/types.h>
104 
105 /*
106  * constants for re's
107  */
108 #define	CBRA	1
109 #define	CCHR	2
110 #define	CDOT	4
111 #define	CCL	6
112 #define	NCCL	8
113 #define	CDOL	10
114 #define	CEOF	11
115 #define	CKET	12
116 #define	CBACK	18
117 
118 #define	CSTAR	01
119 
120 #define	ESIZE	512
121 #define	NBRA	9
122 
123 static struct re_globals {
124 	char	_expbuf[ESIZE];
125 	char	*_braslist[NBRA], *_braelist[NBRA];
126 	char	_circf;
127 } *re_globals;
128 #define	expbuf (_re->_expbuf)
129 #define	braslist (_re->_braslist)
130 #define	braelist (_re->_braelist)
131 #define	circf (_re->_circf)
132 
133 static int advance(const char *, char *);
134 static int backref(int, const char *);
135 static int cclass(char *, char, int);
136 
137 /*
138  * compile the regular expression argument into a dfa
139  */
140 char *
141 re_comp(const char *sp)
142 {
143 	char	c;
144 	struct re_globals *_re = re_globals;
145 	char	*ep;
146 	char	cclcnt, numbra = 0;
147 	char	*lastep = NULL;
148 	char	bracket[NBRA];
149 	char	*bracketp = &bracket[0];
150 	char	*retoolong = "Regular expression too long";
151 
152 	if (_re == NULL) {
153 		_re = (struct re_globals *)calloc(1, sizeof (*_re));
154 		if (_re == NULL)
155 			return ("Out of memory");
156 		re_globals = _re;
157 	}
158 	ep = expbuf;
159 
160 #define	comerr(msg) {expbuf[0] = 0; return (msg); }
161 
162 	if (sp == NULL || *sp == '\0') {
163 		if (*ep == 0)
164 			return ("No previous regular expression");
165 		return (NULL);
166 	}
167 	if (*sp == '^') {
168 		circf = 1;
169 		sp++;
170 	}
171 	else
172 		circf = 0;
173 	for (;;) {
174 		if (ep >= &expbuf[ESIZE])
175 			comerr(retoolong);
176 		if ((c = *sp++) == '\0') {
177 			if (bracketp != bracket)
178 				comerr("unmatched \\(");
179 			*ep++ = CEOF;
180 			*ep++ = 0;
181 			return (NULL);
182 		}
183 		if (c != '*')
184 			lastep = ep;
185 		switch (c) {
186 
187 		case '.':
188 			*ep++ = CDOT;
189 			continue;
190 
191 		case '*':
192 			if (lastep == NULL || *lastep == CBRA ||
193 			    *lastep == CKET)
194 				goto defchar;
195 			*lastep |= CSTAR;
196 			continue;
197 
198 		case '$':
199 			if (*sp != '\0')
200 				goto defchar;
201 			*ep++ = CDOL;
202 			continue;
203 
204 		case '[':
205 			*ep++ = CCL;
206 			*ep++ = 0;
207 			cclcnt = 1;
208 			if ((c = *sp++) == '^') {
209 				c = *sp++;
210 				ep[-2] = NCCL;
211 			}
212 			do {
213 				if (c == '\0')
214 					comerr("missing ]");
215 				if (c == '-' && ep [-1] != 0) {
216 					if ((c = *sp++) == ']') {
217 						*ep++ = '-';
218 						cclcnt++;
219 						break;
220 					}
221 					while (ep[-1] < c) {
222 						*ep = ep[-1] + 1;
223 						ep++;
224 						cclcnt++;
225 						if (ep >= &expbuf[ESIZE])
226 							comerr(retoolong);
227 					}
228 				}
229 				*ep++ = c;
230 				cclcnt++;
231 				if (ep >= &expbuf[ESIZE])
232 					comerr(retoolong);
233 			} while ((c = *sp++) != ']');
234 			lastep[1] = cclcnt;
235 			continue;
236 
237 		case '\\':
238 			if ((c = *sp++) == '(') {
239 				if (numbra >= NBRA)
240 					comerr("too many \\(\\) pairs");
241 				*bracketp++ = numbra;
242 				*ep++ = CBRA;
243 				*ep++ = numbra++;
244 				continue;
245 			}
246 			if (c == ')') {
247 				if (bracketp <= bracket)
248 					comerr("unmatched \\)");
249 				*ep++ = CKET;
250 				*ep++ = *--bracketp;
251 				continue;
252 			}
253 			if (c >= '1' && c < ('1' + NBRA)) {
254 				*ep++ = CBACK;
255 				*ep++ = c - '1';
256 				continue;
257 			}
258 			*ep++ = CCHR;
259 			*ep++ = c;
260 			continue;
261 
262 		defchar:
263 		default:
264 			*ep++ = CCHR;
265 			*ep++ = c;
266 		}
267 	}
268 }
269 
270 /*
271  * match the argument string against the compiled re
272  */
273 int
274 re_exec(const char *p1)
275 {
276 	struct re_globals *_re = re_globals;
277 	char	*p2;
278 	int	c;
279 	int	rv;
280 
281 	if (_re == NULL)
282 		return (0);
283 	p2 = expbuf;
284 	for (c = 0; c < NBRA; c++) {
285 		braslist[c] = 0;
286 		braelist[c] = 0;
287 	}
288 	if (circf)
289 		return ((advance(p1, p2)));
290 	/*
291 	 * fast check for first character
292 	 */
293 	if (*p2 == CCHR) {
294 		c = p2[1];
295 		do {
296 			if (*p1 != c)
297 				continue;
298 			if (rv = advance(p1, p2))
299 				return (rv);
300 		} while (*p1++);
301 		return (0);
302 	}
303 	/*
304 	 * regular algorithm
305 	 */
306 	do
307 		if (rv = advance(p1, p2))
308 			return (rv);
309 	while (*p1++);
310 	return (0);
311 }
312 
313 /*
314  * try to match the next thing in the dfa
315  */
316 static int
317 advance(const char *lp, char *ep)
318 {
319 	const char	*curlp;
320 	ptrdiff_t	ct;
321 	int		i;
322 	int		rv;
323 	struct re_globals *_re = re_globals;
324 
325 	for (;;)
326 		switch (*ep++) {
327 
328 		case CCHR:
329 			if (*ep++ == *lp++)
330 				continue;
331 			return (0);
332 
333 		case CDOT:
334 			if (*lp++)
335 				continue;
336 			return (0);
337 
338 		case CDOL:
339 			if (*lp == '\0')
340 				continue;
341 			return (0);
342 
343 		case CEOF:
344 			return (1);
345 
346 		case CCL:
347 			if (cclass(ep, *lp++, 1)) {
348 				ep += *ep;
349 				continue;
350 			}
351 			return (0);
352 
353 		case NCCL:
354 			if (cclass(ep, *lp++, 0)) {
355 				ep += *ep;
356 				continue;
357 			}
358 			return (0);
359 
360 		case CBRA:
361 			braslist[*ep++] = (char *)lp;
362 			continue;
363 
364 		case CKET:
365 			braelist[*ep++] = (char *)lp;
366 			continue;
367 
368 		case CBACK:
369 			if (braelist[i = *ep++] == NULL)
370 				return (-1);
371 			if (backref(i, lp)) {
372 				lp += braelist[i] - braslist[i];
373 				continue;
374 			}
375 			return (0);
376 
377 		case CBACK|CSTAR:
378 			if (braelist[i = *ep++] == NULL)
379 				return (-1);
380 			curlp = lp;
381 			ct = braelist[i] - braslist[i];
382 			while (backref(i, lp))
383 				lp += ct;
384 			while (lp >= curlp) {
385 				if (rv = advance(lp, ep))
386 					return (rv);
387 				lp -= ct;
388 			}
389 			continue;
390 
391 		case CDOT|CSTAR:
392 			curlp = lp;
393 			while (*lp++)
394 				;
395 			goto star;
396 
397 		case CCHR|CSTAR:
398 			curlp = lp;
399 			while (*lp++ == *ep)
400 				;
401 			ep++;
402 			goto star;
403 
404 		case CCL|CSTAR:
405 		case NCCL|CSTAR:
406 			curlp = lp;
407 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
408 				;
409 			ep += *ep;
410 			goto star;
411 
412 		star:
413 			do {
414 				lp--;
415 				if (rv = advance(lp, ep))
416 					return (rv);
417 			} while (lp > curlp);
418 			return (0);
419 
420 		default:
421 			return (-1);
422 		}
423 }
424 
425 static int
426 backref(int i, const char *lp)
427 {
428 	char	*bp;
429 	struct re_globals *_re = re_globals;
430 
431 	bp = braslist[i];
432 	while (*bp++ == *lp++)
433 		if (bp >= braelist[i])
434 			return (1);
435 	return (0);
436 }
437 
438 static int
439 cclass(char *set, char c, int af)
440 {
441 	int	n;
442 
443 	if (c == 0)
444 		return (0);
445 	n = *set++;
446 	while (--n)
447 		if (*set++ == c)
448 			return (af);
449 	return (! af);
450 }
451