1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * routines to do regular expression matching
32 *
33 * Entry points:
34 *
35 * re_comp(s)
36 * char *s;
37 * ... returns 0 if the string s was compiled successfully,
38 * a pointer to an error message otherwise.
39 * If passed 0 or a null string returns without changing
40 * the currently compiled re (see note 11 below).
41 *
42 * re_exec(s)
43 * char *s;
44 * ... returns 1 if the string s matches the last compiled regular
45 * expression,
46 * 0 if the string s failed to match the last compiled
47 * regular expression, and
48 * -1 if the compiled regular expression was invalid
49 * (indicating an internal error).
50 *
51 * The strings passed to both re_comp and re_exec may have trailing or
52 * embedded newline characters; they are terminated by nulls.
53 *
54 * The identity of the author of these routines is lost in antiquity;
55 * this is essentially the same as the re code in the original V6 ed.
56 *
57 * The regular expressions recognized are described below. This description
58 * is essentially the same as that for ed.
59 *
60 * A regular expression specifies a set of strings of characters.
61 * A member of this set of strings is said to be matched by
62 * the regular expression. In the following specification for
63 * regular expressions the word `character' means any character but NUL.
64 *
65 * 1. Any character except a special character matches itself.
66 * Special characters are the regular expression delimiter plus
67 * \ [ . and sometimes ^ * $.
68 * 2. A . matches any character.
69 * 3. A \ followed by any character except a digit or ( )
70 * matches that character.
71 * 4. A nonempty string s bracketed [s] (or [^s]) matches any
72 * character in (or not in) s. In s, \ has no special meaning,
73 * and ] may only appear as the first letter. A substring
74 * a-b, with a and b in ascending ASCII order, stands for
75 * the inclusive range of ASCII characters.
76 * 5. A regular expression of form 1-4 followed by * matches a
77 * sequence of 0 or more matches of the regular expression.
78 * 6. A regular expression, x, of form 1-8, bracketed \(x\)
79 * matches what x matches.
80 * 7. A \ followed by a digit n matches a copy of the string that the
81 * bracketed regular expression beginning with the nth \( matched.
82 * 8. A regular expression of form 1-8, x, followed by a regular
83 * expression of form 1-7, y matches a match for x followed by
84 * a match for y, with the x match being as long as possible
85 * while still permitting a y match.
86 * 9. A regular expression of form 1-8 preceded by ^ (or followed
87 * by $), is constrained to matches that begin at the left
88 * (or end at the right) end of a line.
89 * 10. A regular expression of form 1-9 picks out the longest among
90 * the leftmost matches in a line.
91 * 11. An empty regular expression stands for a copy of the last
92 * regular expression encountered.
93 */
94
95 #include "lint.h"
96
97 #include <stdlib.h>
98 #include <re_comp.h>
99 #include <stddef.h>
100 #include <sys/types.h>
101
102 /*
103 * constants for re's
104 */
105 #define CBRA 1
106 #define CCHR 2
107 #define CDOT 4
108 #define CCL 6
109 #define NCCL 8
110 #define CDOL 10
111 #define CEOF 11
112 #define CKET 12
113 #define CBACK 18
114
115 #define CSTAR 01
116
117 #define ESIZE 512
118 #define NBRA 9
119
120 static struct re_globals {
121 char _expbuf[ESIZE];
122 char *_braslist[NBRA], *_braelist[NBRA];
123 char _circf;
124 } *re_globals;
125 #define expbuf (_re->_expbuf)
126 #define braslist (_re->_braslist)
127 #define braelist (_re->_braelist)
128 #define circf (_re->_circf)
129
130 static int advance(const char *, char *);
131 static int backref(int, const char *);
132 static int cclass(char *, char, int);
133
134 /*
135 * compile the regular expression argument into a dfa
136 */
137 char *
re_comp(const char * sp)138 re_comp(const char *sp)
139 {
140 char c;
141 struct re_globals *_re = re_globals;
142 char *ep;
143 char cclcnt, numbra = 0;
144 char *lastep = NULL;
145 char bracket[NBRA];
146 char *bracketp = &bracket[0];
147 char *retoolong = "Regular expression too long";
148
149 if (_re == NULL) {
150 _re = (struct re_globals *)calloc(1, sizeof (*_re));
151 if (_re == NULL)
152 return ("Out of memory");
153 re_globals = _re;
154 }
155 ep = expbuf;
156
157 #define comerr(msg) {expbuf[0] = 0; return (msg); }
158
159 if (sp == NULL || *sp == '\0') {
160 if (*ep == 0)
161 return ("No previous regular expression");
162 return (NULL);
163 }
164 if (*sp == '^') {
165 circf = 1;
166 sp++;
167 }
168 else
169 circf = 0;
170 for (;;) {
171 if (ep >= &expbuf[ESIZE])
172 comerr(retoolong);
173 if ((c = *sp++) == '\0') {
174 if (bracketp != bracket)
175 comerr("unmatched \\(");
176 *ep++ = CEOF;
177 *ep++ = 0;
178 return (NULL);
179 }
180 if (c != '*')
181 lastep = ep;
182 switch (c) {
183
184 case '.':
185 *ep++ = CDOT;
186 continue;
187
188 case '*':
189 if (lastep == NULL || *lastep == CBRA ||
190 *lastep == CKET)
191 goto defchar;
192 *lastep |= CSTAR;
193 continue;
194
195 case '$':
196 if (*sp != '\0')
197 goto defchar;
198 *ep++ = CDOL;
199 continue;
200
201 case '[':
202 *ep++ = CCL;
203 *ep++ = 0;
204 cclcnt = 1;
205 if ((c = *sp++) == '^') {
206 c = *sp++;
207 ep[-2] = NCCL;
208 }
209 do {
210 if (c == '\0')
211 comerr("missing ]");
212 if (c == '-' && ep [-1] != 0) {
213 if ((c = *sp++) == ']') {
214 *ep++ = '-';
215 cclcnt++;
216 break;
217 }
218 while (ep[-1] < c) {
219 *ep = ep[-1] + 1;
220 ep++;
221 cclcnt++;
222 if (ep >= &expbuf[ESIZE])
223 comerr(retoolong);
224 }
225 }
226 *ep++ = c;
227 cclcnt++;
228 if (ep >= &expbuf[ESIZE])
229 comerr(retoolong);
230 } while ((c = *sp++) != ']');
231 lastep[1] = cclcnt;
232 continue;
233
234 case '\\':
235 if ((c = *sp++) == '(') {
236 if (numbra >= NBRA)
237 comerr("too many \\(\\) pairs");
238 *bracketp++ = numbra;
239 *ep++ = CBRA;
240 *ep++ = numbra++;
241 continue;
242 }
243 if (c == ')') {
244 if (bracketp <= bracket)
245 comerr("unmatched \\)");
246 *ep++ = CKET;
247 *ep++ = *--bracketp;
248 continue;
249 }
250 if (c >= '1' && c < ('1' + NBRA)) {
251 *ep++ = CBACK;
252 *ep++ = c - '1';
253 continue;
254 }
255 *ep++ = CCHR;
256 *ep++ = c;
257 continue;
258
259 defchar:
260 default:
261 *ep++ = CCHR;
262 *ep++ = c;
263 }
264 }
265 }
266
267 /*
268 * match the argument string against the compiled re
269 */
270 int
re_exec(const char * p1)271 re_exec(const char *p1)
272 {
273 struct re_globals *_re = re_globals;
274 char *p2;
275 int c;
276 int rv;
277
278 if (_re == NULL)
279 return (0);
280 p2 = expbuf;
281 for (c = 0; c < NBRA; c++) {
282 braslist[c] = 0;
283 braelist[c] = 0;
284 }
285 if (circf)
286 return ((advance(p1, p2)));
287 /*
288 * fast check for first character
289 */
290 if (*p2 == CCHR) {
291 c = p2[1];
292 do {
293 if (*p1 != c)
294 continue;
295 rv = advance(p1, p2);
296 if (rv != 0)
297 return (rv);
298 } while (*p1++);
299 return (0);
300 }
301 /*
302 * regular algorithm
303 */
304 do {
305 rv = advance(p1, p2);
306 if (rv != 0)
307 return (rv);
308 } while (*p1++);
309 return (0);
310 }
311
312 /*
313 * try to match the next thing in the dfa
314 */
315 static int
advance(const char * lp,char * ep)316 advance(const char *lp, char *ep)
317 {
318 const char *curlp;
319 ptrdiff_t ct;
320 int i;
321 int rv;
322 struct re_globals *_re = re_globals;
323
324 for (;;)
325 switch (*ep++) {
326
327 case CCHR:
328 if (*ep++ == *lp++)
329 continue;
330 return (0);
331
332 case CDOT:
333 if (*lp++)
334 continue;
335 return (0);
336
337 case CDOL:
338 if (*lp == '\0')
339 continue;
340 return (0);
341
342 case CEOF:
343 return (1);
344
345 case CCL:
346 if (cclass(ep, *lp++, 1)) {
347 ep += *ep;
348 continue;
349 }
350 return (0);
351
352 case NCCL:
353 if (cclass(ep, *lp++, 0)) {
354 ep += *ep;
355 continue;
356 }
357 return (0);
358
359 case CBRA:
360 braslist[*ep++] = (char *)lp;
361 continue;
362
363 case CKET:
364 braelist[*ep++] = (char *)lp;
365 continue;
366
367 case CBACK:
368 if (braelist[i = *ep++] == NULL)
369 return (-1);
370 if (backref(i, lp)) {
371 lp += braelist[i] - braslist[i];
372 continue;
373 }
374 return (0);
375
376 case CBACK|CSTAR:
377 if (braelist[i = *ep++] == NULL)
378 return (-1);
379 curlp = lp;
380 ct = braelist[i] - braslist[i];
381 while (backref(i, lp))
382 lp += ct;
383 while (lp >= curlp) {
384 rv = advance(lp, ep);
385 if (rv != 0)
386 return (rv);
387 lp -= ct;
388 }
389 continue;
390
391 case CDOT|CSTAR:
392 curlp = lp;
393 while (*lp++)
394 ;
395 goto star;
396
397 case CCHR|CSTAR:
398 curlp = lp;
399 while (*lp++ == *ep)
400 ;
401 ep++;
402 goto star;
403
404 case CCL|CSTAR:
405 case NCCL|CSTAR:
406 curlp = lp;
407 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
408 ;
409 ep += *ep;
410 goto star;
411
412 star:
413 do {
414 lp--;
415 rv = advance(lp, ep);
416 if (rv != 0)
417 return (rv);
418 } while (lp > curlp);
419 return (0);
420
421 default:
422 return (-1);
423 }
424 }
425
426 static int
backref(int i,const char * lp)427 backref(int i, const char *lp)
428 {
429 char *bp;
430 struct re_globals *_re = re_globals;
431
432 bp = braslist[i];
433 while (*bp++ == *lp++)
434 if (bp >= braelist[i])
435 return (1);
436 return (0);
437 }
438
439 static int
cclass(char * set,char c,int af)440 cclass(char *set, char c, int af)
441 {
442 int n;
443
444 if (c == 0)
445 return (0);
446 n = *set++;
447 while (--n)
448 if (*set++ == c)
449 return (af);
450 return (! af);
451 }
452