xref: /illumos-gate/usr/src/ucblib/libucb/port/gen/regex.c (revision d48be21240dfd051b689384ce2b23479d757f2d8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 1997, by Sun Microsystems, Inc.
24  * All rights reserved.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 
31 /* 	Portions Copyright(c) 1988, Sun Microsystems Inc.	*/
32 /*	All Rights Reserved					*/
33 
34 /*LINTLIBRARY*/
35 
36 /*
37  * routines to do regular expression matching
38  *
39  * Entry points:
40  *
41  *	re_comp(s)
42  *		char *s;
43  *	 ... returns 0 if the string s was compiled successfully,
44  *		     a pointer to an error message otherwise.
45  *	     If passed 0 or a null string returns without changing
46  *           the currently compiled re (see note 11 below).
47  *
48  *	re_exec(s)
49  *		char *s;
50  *	 ... returns 1 if the string s matches the last compiled regular
51  *		       expression,
52  *		     0 if the string s failed to match the last compiled
53  *		       regular expression, and
54  *		    -1 if the compiled regular expression was invalid
55  *		       (indicating an internal error).
56  *
57  * The strings passed to both re_comp and re_exec may have trailing or
58  * embedded newline characters; they are terminated by nulls.
59  *
60  * The identity of the author of these routines is lost in antiquity;
61  * this is essentially the same as the re code in the original V6 ed.
62  *
63  * The regular expressions recognized are described below. This description
64  * is essentially the same as that for ed.
65  *
66  *	A regular expression specifies a set of strings of characters.
67  *	A member of this set of strings is said to be matched by
68  *	the regular expression.  In the following specification for
69  *	regular expressions the word `character' means any character but NUL.
70  *
71  *	1.  Any character except a special character matches itself.
72  *	    Special characters are the regular expression delimiter plus
73  *	    \ [ . and sometimes ^ * $.
74  *	2.  A . matches any character.
75  *	3.  A \ followed by any character except a digit or ( )
76  *	    matches that character.
77  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
78  *	    character in (or not in) s. In s, \ has no special meaning,
79  *	    and ] may only appear as the first letter. A substring
80  *	    a-b, with a and b in ascending ASCII order, stands for
81  *	    the inclusive range of ASCII characters.
82  *	5.  A regular expression of form 1-4 followed by * matches a
83  *	    sequence of 0 or more matches of the regular expression.
84  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
85  *	    matches what x matches.
86  *	7.  A \ followed by a digit n matches a copy of the string that the
87  *	    bracketed regular expression beginning with the nth \( matched.
88  *	8.  A regular expression of form 1-8, x, followed by a regular
89  *	    expression of form 1-7, y matches a match for x followed by
90  *	    a match for y, with the x match being as long as possible
91  *	    while still permitting a y match.
92  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
93  *	    by $), is constrained to matches that begin at the left
94  *	    (or end at the right) end of a line.
95  *	10. A regular expression of form 1-9 picks out the longest among
96  *	    the leftmost matches in a line.
97  *	11. An empty regular expression stands for a copy of the last
98  *	    regular expression encountered.
99  */
100 
101 #include <sys/types.h>
102 #include <stdlib.h>
103 #include <stddef.h>
104 
105 /*
106  * constants for re's
107  */
108 #define	CBRA	1
109 #define	CCHR	2
110 #define	CDOT	4
111 #define	CCL	6
112 #define	NCCL	8
113 #define	CDOL	10
114 #define	CEOF	11
115 #define	CKET	12
116 #define	CBACK	18
117 
118 #define	CSTAR	01
119 
120 #define	ESIZE	512
121 #define	NBRA	9
122 
123 static struct re_globals {
124 	char	_expbuf[ESIZE];
125 	char	*_braslist[NBRA], *_braelist[NBRA];
126 	char	_circf;
127 } *re_globals;
128 #define	expbuf (_re->_expbuf)
129 #define	braslist (_re->_braslist)
130 #define	braelist (_re->_braelist)
131 #define	circf (_re->_circf)
132 
133 /*
134  * forward declarations
135  */
136 static int backref(int, char *);
137 static int advance(char *, char *);
138 static int cclass(char *, char, int);
139 
140 /*
141  * compile the regular expression argument into a dfa
142  */
143 char *
144 re_comp(char *sp)
145 {
146 	char	c;
147 	struct re_globals *_re = re_globals;
148 	char	*ep;
149 	int	cclcnt, numbra = 0;
150 	char	*lastep = 0;
151 	char	bracket[NBRA];
152 	char	*bracketp = &bracket[0];
153 	char	*retoolong = "Regular expression too long";
154 
155 	if (_re == 0) {
156 		_re = (struct re_globals *)calloc(1, sizeof (*_re));
157 		if (_re == 0)
158 			return ("Out of memory");
159 		re_globals = _re;
160 	}
161 	ep = expbuf;
162 
163 #define	comerr(msg) {expbuf[0] = 0; return (msg); }
164 
165 	if (sp == 0 || *sp == '\0') {
166 		if (*ep == 0)
167 			return ("No previous regular expression");
168 		return (0);
169 	}
170 	if (*sp == '^') {
171 		circf = 1;
172 		sp++;
173 	}
174 	else
175 		circf = 0;
176 	for (;;) {
177 		if (ep >= &expbuf[ESIZE])
178 			comerr(retoolong);
179 		if ((c = *sp++) == '\0') {
180 			if (bracketp != bracket)
181 				comerr("unmatched \\(");
182 			*ep++ = CEOF;
183 			*ep++ = 0;
184 			return (0);
185 		}
186 		if (c != '*')
187 			lastep = ep;
188 		switch (c) {
189 
190 		case '.':
191 			*ep++ = CDOT;
192 			continue;
193 
194 		case '*':
195 			if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
196 				goto defchar;
197 			*lastep |= CSTAR;
198 			continue;
199 
200 		case '$':
201 			if (*sp != '\0')
202 				goto defchar;
203 			*ep++ = CDOL;
204 			continue;
205 
206 		case '[':
207 			*ep++ = CCL;
208 			*ep++ = 0;
209 			cclcnt = 1;
210 			if ((c = *sp++) == '^') {
211 				c = *sp++;
212 				ep[-2] = NCCL;
213 			}
214 			do {
215 				if (c == '\0')
216 					comerr("missing ]");
217 				if (c == '-' && ep [-1] != 0) {
218 					if ((c = *sp++) == ']') {
219 						*ep++ = '-';
220 						cclcnt++;
221 						break;
222 					}
223 					while (ep[-1] < c) {
224 						*ep = ep[-1] + 1;
225 						ep++;
226 						cclcnt++;
227 						if (ep >= &expbuf[ESIZE])
228 							comerr(retoolong);
229 					}
230 				}
231 				*ep++ = c;
232 				cclcnt++;
233 				if (ep >= &expbuf[ESIZE])
234 					comerr(retoolong);
235 			} while ((c = *sp++) != ']');
236 			lastep[1] = (char)cclcnt;
237 			continue;
238 
239 		case '\\':
240 			if ((c = *sp++) == '(') {
241 				if (numbra >= NBRA)
242 					comerr("too many \\(\\) pairs");
243 				*bracketp++ = (char)numbra;
244 				*ep++ = CBRA;
245 				*ep++ = numbra++;
246 				continue;
247 			}
248 			if (c == ')') {
249 				if (bracketp <= bracket)
250 					comerr("unmatched \\)");
251 				*ep++ = CKET;
252 				*ep++ = *--bracketp;
253 				continue;
254 			}
255 			if (c >= '1' && c < ('1' + NBRA)) {
256 				*ep++ = CBACK;
257 				*ep++ = c - '1';
258 				continue;
259 			}
260 			*ep++ = CCHR;
261 			*ep++ = c;
262 			continue;
263 
264 		defchar:
265 		default:
266 			*ep++ = CCHR;
267 			*ep++ = c;
268 		}
269 	}
270 }
271 
272 /*
273  * match the argument string against the compiled re
274  */
275 int
276 re_exec(char *p1)
277 {
278 	struct re_globals *_re = re_globals;
279 	char	*p2;
280 	int	c;
281 	int	rv;
282 
283 	if (_re == 0)
284 		return (0);
285 	p2 = expbuf;
286 	for (c = 0; c < NBRA; c++) {
287 		braslist[c] = 0;
288 		braelist[c] = 0;
289 	}
290 	if (circf)
291 		return ((advance(p1, p2)));
292 	/*
293 	 * fast check for first character
294 	 */
295 	if (*p2 == CCHR) {
296 		c = p2[1];
297 		do {
298 			if (*p1 != c)
299 				continue;
300 			if (rv = advance(p1, p2))
301 				return (rv);
302 		} while (*p1++);
303 		return (0);
304 	}
305 	/*
306 	 * regular algorithm
307 	 */
308 	do
309 		if (rv = advance(p1, p2))
310 			return (rv);
311 	while (*p1++);
312 	return (0);
313 }
314 
315 /*
316  * try to match the next thing in the dfa
317  */
318 static int
319 advance(char *lp, char *ep)
320 {
321 	char	*curlp;
322 	int	i;
323 	ptrdiff_t	ct;
324 	int	rv;
325 	struct re_globals *_re = re_globals;
326 
327 	for (;;)
328 		switch (*ep++) {
329 
330 		case CCHR:
331 			if (*ep++ == *lp++)
332 				continue;
333 			return (0);
334 
335 		case CDOT:
336 			if (*lp++)
337 				continue;
338 			return (0);
339 
340 		case CDOL:
341 			if (*lp == '\0')
342 				continue;
343 			return (0);
344 
345 		case CEOF:
346 			return (1);
347 
348 		case CCL:
349 			if (cclass(ep, *lp++, 1)) {
350 				ep += *ep;
351 				continue;
352 			}
353 			return (0);
354 
355 		case NCCL:
356 			if (cclass(ep, *lp++, 0)) {
357 				ep += *ep;
358 				continue;
359 			}
360 			return (0);
361 
362 		case CBRA:
363 			braslist[*ep++] = lp;
364 			continue;
365 
366 		case CKET:
367 			braelist[*ep++] = lp;
368 			continue;
369 
370 		case CBACK:
371 			if (braelist[i = *ep++] == 0)
372 				return (-1);
373 			if (backref(i, lp)) {
374 				lp += braelist[i] - braslist[i];
375 				continue;
376 			}
377 			return (0);
378 
379 		case CBACK|CSTAR:
380 			if (braelist[i = *ep++] == 0)
381 				return (-1);
382 			curlp = lp;
383 			ct = braelist[i] - braslist[i];
384 			while (backref(i, lp))
385 				lp += ct;
386 			while (lp >= curlp) {
387 				if (rv = advance(lp, ep))
388 					return (rv);
389 				lp -= ct;
390 			}
391 			continue;
392 
393 		case CDOT|CSTAR:
394 			curlp = lp;
395 			while (*lp++)
396 				;
397 			goto star;
398 
399 		case CCHR|CSTAR:
400 			curlp = lp;
401 			while (*lp++ == *ep)
402 				;
403 			ep++;
404 			goto star;
405 
406 		case CCL|CSTAR:
407 		case NCCL|CSTAR:
408 			curlp = lp;
409 			while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
410 				;
411 			ep += *ep;
412 			goto star;
413 
414 		star:
415 			do {
416 				lp--;
417 				if (rv = advance(lp, ep))
418 					return (rv);
419 			} while (lp > curlp);
420 			return (0);
421 
422 		default:
423 			return (-1);
424 		}
425 }
426 
427 static int
428 backref(int i, char *lp)
429 {
430 	char	*bp;
431 	struct re_globals *_re = re_globals;
432 
433 	bp = braslist[i];
434 	while (*bp++ == *lp++)
435 		if (bp >= braelist[i])
436 			return (1);
437 	return (0);
438 }
439 
440 static int
441 cclass(char *set, char c, int af)
442 {
443 	int	n;
444 
445 	if (c == 0)
446 		return (0);
447 	n = *set++;
448 	while (--n)
449 		if (*set++ == c)
450 			return (af);
451 	return (! af);
452 }
453