1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 #pragma ident "%Z%%M% %I% %E% SMI"
31
32 /*
33 * routines to do regular expression matching
34 *
35 * Entry points:
36 *
37 * re_comp(s)
38 * char *s;
39 * ... returns 0 if the string s was compiled successfully,
40 * a pointer to an error message otherwise.
41 * If passed 0 or a null string returns without changing
42 * the currently compiled re (see note 11 below).
43 *
44 * re_exec(s)
45 * char *s;
46 * ... returns 1 if the string s matches the last compiled regular
47 * expression,
48 * 0 if the string s failed to match the last compiled
49 * regular expression, and
50 * -1 if the compiled regular expression was invalid
51 * (indicating an internal error).
52 *
53 * The strings passed to both re_comp and re_exec may have trailing or
54 * embedded newline characters; they are terminated by nulls.
55 *
56 * The identity of the author of these routines is lost in antiquity;
57 * this is essentially the same as the re code in the original V6 ed.
58 *
59 * The regular expressions recognized are described below. This description
60 * is essentially the same as that for ed.
61 *
62 * A regular expression specifies a set of strings of characters.
63 * A member of this set of strings is said to be matched by
64 * the regular expression. In the following specification for
65 * regular expressions the word `character' means any character but NUL.
66 *
67 * 1. Any character except a special character matches itself.
68 * Special characters are the regular expression delimiter plus
69 * \ [ . and sometimes ^ * $.
70 * 2. A . matches any character.
71 * 3. A \ followed by any character except a digit or ( )
72 * matches that character.
73 * 4. A nonempty string s bracketed [s] (or [^s]) matches any
74 * character in (or not in) s. In s, \ has no special meaning,
75 * and ] may only appear as the first letter. A substring
76 * a-b, with a and b in ascending ASCII order, stands for
77 * the inclusive range of ASCII characters.
78 * 5. A regular expression of form 1-4 followed by * matches a
79 * sequence of 0 or more matches of the regular expression.
80 * 6. A regular expression, x, of form 1-8, bracketed \(x\)
81 * matches what x matches.
82 * 7. A \ followed by a digit n matches a copy of the string that the
83 * bracketed regular expression beginning with the nth \( matched.
84 * 8. A regular expression of form 1-8, x, followed by a regular
85 * expression of form 1-7, y matches a match for x followed by
86 * a match for y, with the x match being as long as possible
87 * while still permitting a y match.
88 * 9. A regular expression of form 1-8 preceded by ^ (or followed
89 * by $), is constrained to matches that begin at the left
90 * (or end at the right) end of a line.
91 * 10. A regular expression of form 1-9 picks out the longest among
92 * the leftmost matches in a line.
93 * 11. An empty regular expression stands for a copy of the last
94 * regular expression encountered.
95 */
96
97 #include "lint.h"
98
99 #include <stdlib.h>
100 #include <re_comp.h>
101 #include <stddef.h>
102 #include <sys/types.h>
103
104 /*
105 * constants for re's
106 */
107 #define CBRA 1
108 #define CCHR 2
109 #define CDOT 4
110 #define CCL 6
111 #define NCCL 8
112 #define CDOL 10
113 #define CEOF 11
114 #define CKET 12
115 #define CBACK 18
116
117 #define CSTAR 01
118
119 #define ESIZE 512
120 #define NBRA 9
121
122 static struct re_globals {
123 char _expbuf[ESIZE];
124 char *_braslist[NBRA], *_braelist[NBRA];
125 char _circf;
126 } *re_globals;
127 #define expbuf (_re->_expbuf)
128 #define braslist (_re->_braslist)
129 #define braelist (_re->_braelist)
130 #define circf (_re->_circf)
131
132 static int advance(const char *, char *);
133 static int backref(int, const char *);
134 static int cclass(char *, char, int);
135
136 /*
137 * compile the regular expression argument into a dfa
138 */
139 char *
re_comp(const char * sp)140 re_comp(const char *sp)
141 {
142 char c;
143 struct re_globals *_re = re_globals;
144 char *ep;
145 char cclcnt, numbra = 0;
146 char *lastep = NULL;
147 char bracket[NBRA];
148 char *bracketp = &bracket[0];
149 char *retoolong = "Regular expression too long";
150
151 if (_re == NULL) {
152 _re = (struct re_globals *)calloc(1, sizeof (*_re));
153 if (_re == NULL)
154 return ("Out of memory");
155 re_globals = _re;
156 }
157 ep = expbuf;
158
159 #define comerr(msg) {expbuf[0] = 0; return (msg); }
160
161 if (sp == NULL || *sp == '\0') {
162 if (*ep == 0)
163 return ("No previous regular expression");
164 return (NULL);
165 }
166 if (*sp == '^') {
167 circf = 1;
168 sp++;
169 }
170 else
171 circf = 0;
172 for (;;) {
173 if (ep >= &expbuf[ESIZE])
174 comerr(retoolong);
175 if ((c = *sp++) == '\0') {
176 if (bracketp != bracket)
177 comerr("unmatched \\(");
178 *ep++ = CEOF;
179 *ep++ = 0;
180 return (NULL);
181 }
182 if (c != '*')
183 lastep = ep;
184 switch (c) {
185
186 case '.':
187 *ep++ = CDOT;
188 continue;
189
190 case '*':
191 if (lastep == NULL || *lastep == CBRA ||
192 *lastep == CKET)
193 goto defchar;
194 *lastep |= CSTAR;
195 continue;
196
197 case '$':
198 if (*sp != '\0')
199 goto defchar;
200 *ep++ = CDOL;
201 continue;
202
203 case '[':
204 *ep++ = CCL;
205 *ep++ = 0;
206 cclcnt = 1;
207 if ((c = *sp++) == '^') {
208 c = *sp++;
209 ep[-2] = NCCL;
210 }
211 do {
212 if (c == '\0')
213 comerr("missing ]");
214 if (c == '-' && ep [-1] != 0) {
215 if ((c = *sp++) == ']') {
216 *ep++ = '-';
217 cclcnt++;
218 break;
219 }
220 while (ep[-1] < c) {
221 *ep = ep[-1] + 1;
222 ep++;
223 cclcnt++;
224 if (ep >= &expbuf[ESIZE])
225 comerr(retoolong);
226 }
227 }
228 *ep++ = c;
229 cclcnt++;
230 if (ep >= &expbuf[ESIZE])
231 comerr(retoolong);
232 } while ((c = *sp++) != ']');
233 lastep[1] = cclcnt;
234 continue;
235
236 case '\\':
237 if ((c = *sp++) == '(') {
238 if (numbra >= NBRA)
239 comerr("too many \\(\\) pairs");
240 *bracketp++ = numbra;
241 *ep++ = CBRA;
242 *ep++ = numbra++;
243 continue;
244 }
245 if (c == ')') {
246 if (bracketp <= bracket)
247 comerr("unmatched \\)");
248 *ep++ = CKET;
249 *ep++ = *--bracketp;
250 continue;
251 }
252 if (c >= '1' && c < ('1' + NBRA)) {
253 *ep++ = CBACK;
254 *ep++ = c - '1';
255 continue;
256 }
257 *ep++ = CCHR;
258 *ep++ = c;
259 continue;
260
261 defchar:
262 default:
263 *ep++ = CCHR;
264 *ep++ = c;
265 }
266 }
267 }
268
269 /*
270 * match the argument string against the compiled re
271 */
272 int
re_exec(const char * p1)273 re_exec(const char *p1)
274 {
275 struct re_globals *_re = re_globals;
276 char *p2;
277 int c;
278 int rv;
279
280 if (_re == NULL)
281 return (0);
282 p2 = expbuf;
283 for (c = 0; c < NBRA; c++) {
284 braslist[c] = 0;
285 braelist[c] = 0;
286 }
287 if (circf)
288 return ((advance(p1, p2)));
289 /*
290 * fast check for first character
291 */
292 if (*p2 == CCHR) {
293 c = p2[1];
294 do {
295 if (*p1 != c)
296 continue;
297 if (rv = advance(p1, p2))
298 return (rv);
299 } while (*p1++);
300 return (0);
301 }
302 /*
303 * regular algorithm
304 */
305 do {
306 if (rv = advance(p1, p2))
307 return (rv);
308 } while (*p1++);
309 return (0);
310 }
311
312 /*
313 * try to match the next thing in the dfa
314 */
315 static int
advance(const char * lp,char * ep)316 advance(const char *lp, char *ep)
317 {
318 const char *curlp;
319 ptrdiff_t ct;
320 int i;
321 int rv;
322 struct re_globals *_re = re_globals;
323
324 for (;;)
325 switch (*ep++) {
326
327 case CCHR:
328 if (*ep++ == *lp++)
329 continue;
330 return (0);
331
332 case CDOT:
333 if (*lp++)
334 continue;
335 return (0);
336
337 case CDOL:
338 if (*lp == '\0')
339 continue;
340 return (0);
341
342 case CEOF:
343 return (1);
344
345 case CCL:
346 if (cclass(ep, *lp++, 1)) {
347 ep += *ep;
348 continue;
349 }
350 return (0);
351
352 case NCCL:
353 if (cclass(ep, *lp++, 0)) {
354 ep += *ep;
355 continue;
356 }
357 return (0);
358
359 case CBRA:
360 braslist[*ep++] = (char *)lp;
361 continue;
362
363 case CKET:
364 braelist[*ep++] = (char *)lp;
365 continue;
366
367 case CBACK:
368 if (braelist[i = *ep++] == NULL)
369 return (-1);
370 if (backref(i, lp)) {
371 lp += braelist[i] - braslist[i];
372 continue;
373 }
374 return (0);
375
376 case CBACK|CSTAR:
377 if (braelist[i = *ep++] == NULL)
378 return (-1);
379 curlp = lp;
380 ct = braelist[i] - braslist[i];
381 while (backref(i, lp))
382 lp += ct;
383 while (lp >= curlp) {
384 if (rv = advance(lp, ep))
385 return (rv);
386 lp -= ct;
387 }
388 continue;
389
390 case CDOT|CSTAR:
391 curlp = lp;
392 while (*lp++)
393 ;
394 goto star;
395
396 case CCHR|CSTAR:
397 curlp = lp;
398 while (*lp++ == *ep)
399 ;
400 ep++;
401 goto star;
402
403 case CCL|CSTAR:
404 case NCCL|CSTAR:
405 curlp = lp;
406 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
407 ;
408 ep += *ep;
409 goto star;
410
411 star:
412 do {
413 lp--;
414 if (rv = advance(lp, ep))
415 return (rv);
416 } while (lp > curlp);
417 return (0);
418
419 default:
420 return (-1);
421 }
422 }
423
424 static int
backref(int i,const char * lp)425 backref(int i, const char *lp)
426 {
427 char *bp;
428 struct re_globals *_re = re_globals;
429
430 bp = braslist[i];
431 while (*bp++ == *lp++)
432 if (bp >= braelist[i])
433 return (1);
434 return (0);
435 }
436
437 static int
cclass(char * set,char c,int af)438 cclass(char *set, char c, int af)
439 {
440 int n;
441
442 if (c == 0)
443 return (0);
444 n = *set++;
445 while (--n)
446 if (*set++ == c)
447 return (af);
448 return (! af);
449 }
450