1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright (c) 1997, by Sun Microsystems, Inc.
24 * All rights reserved.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30
31 /* Portions Copyright(c) 1988, Sun Microsystems Inc. */
32 /* All Rights Reserved */
33
34 #pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.1 */
35
36 /*LINTLIBRARY*/
37
38 /*
39 * routines to do regular expression matching
40 *
41 * Entry points:
42 *
43 * re_comp(s)
44 * char *s;
45 * ... returns 0 if the string s was compiled successfully,
46 * a pointer to an error message otherwise.
47 * If passed 0 or a null string returns without changing
48 * the currently compiled re (see note 11 below).
49 *
50 * re_exec(s)
51 * char *s;
52 * ... returns 1 if the string s matches the last compiled regular
53 * expression,
54 * 0 if the string s failed to match the last compiled
55 * regular expression, and
56 * -1 if the compiled regular expression was invalid
57 * (indicating an internal error).
58 *
59 * The strings passed to both re_comp and re_exec may have trailing or
60 * embedded newline characters; they are terminated by nulls.
61 *
62 * The identity of the author of these routines is lost in antiquity;
63 * this is essentially the same as the re code in the original V6 ed.
64 *
65 * The regular expressions recognized are described below. This description
66 * is essentially the same as that for ed.
67 *
68 * A regular expression specifies a set of strings of characters.
69 * A member of this set of strings is said to be matched by
70 * the regular expression. In the following specification for
71 * regular expressions the word `character' means any character but NUL.
72 *
73 * 1. Any character except a special character matches itself.
74 * Special characters are the regular expression delimiter plus
75 * \ [ . and sometimes ^ * $.
76 * 2. A . matches any character.
77 * 3. A \ followed by any character except a digit or ( )
78 * matches that character.
79 * 4. A nonempty string s bracketed [s] (or [^s]) matches any
80 * character in (or not in) s. In s, \ has no special meaning,
81 * and ] may only appear as the first letter. A substring
82 * a-b, with a and b in ascending ASCII order, stands for
83 * the inclusive range of ASCII characters.
84 * 5. A regular expression of form 1-4 followed by * matches a
85 * sequence of 0 or more matches of the regular expression.
86 * 6. A regular expression, x, of form 1-8, bracketed \(x\)
87 * matches what x matches.
88 * 7. A \ followed by a digit n matches a copy of the string that the
89 * bracketed regular expression beginning with the nth \( matched.
90 * 8. A regular expression of form 1-8, x, followed by a regular
91 * expression of form 1-7, y matches a match for x followed by
92 * a match for y, with the x match being as long as possible
93 * while still permitting a y match.
94 * 9. A regular expression of form 1-8 preceded by ^ (or followed
95 * by $), is constrained to matches that begin at the left
96 * (or end at the right) end of a line.
97 * 10. A regular expression of form 1-9 picks out the longest among
98 * the leftmost matches in a line.
99 * 11. An empty regular expression stands for a copy of the last
100 * regular expression encountered.
101 */
102
103 #include <sys/types.h>
104 #include <stdlib.h>
105 #include <stddef.h>
106
107 /*
108 * constants for re's
109 */
110 #define CBRA 1
111 #define CCHR 2
112 #define CDOT 4
113 #define CCL 6
114 #define NCCL 8
115 #define CDOL 10
116 #define CEOF 11
117 #define CKET 12
118 #define CBACK 18
119
120 #define CSTAR 01
121
122 #define ESIZE 512
123 #define NBRA 9
124
125 static struct re_globals {
126 char _expbuf[ESIZE];
127 char *_braslist[NBRA], *_braelist[NBRA];
128 char _circf;
129 } *re_globals;
130 #define expbuf (_re->_expbuf)
131 #define braslist (_re->_braslist)
132 #define braelist (_re->_braelist)
133 #define circf (_re->_circf)
134
135 /*
136 * forward declarations
137 */
138 static int backref(int, char *);
139 static int advance(char *, char *);
140 static int cclass(char *, char, int);
141
142 /*
143 * compile the regular expression argument into a dfa
144 */
145 char *
re_comp(char * sp)146 re_comp(char *sp)
147 {
148 char c;
149 struct re_globals *_re = re_globals;
150 char *ep;
151 int cclcnt, numbra = 0;
152 char *lastep = 0;
153 char bracket[NBRA];
154 char *bracketp = &bracket[0];
155 char *retoolong = "Regular expression too long";
156
157 if (_re == 0) {
158 _re = (struct re_globals *)calloc(1, sizeof (*_re));
159 if (_re == 0)
160 return ("Out of memory");
161 re_globals = _re;
162 }
163 ep = expbuf;
164
165 #define comerr(msg) {expbuf[0] = 0; return (msg); }
166
167 if (sp == 0 || *sp == '\0') {
168 if (*ep == 0)
169 return ("No previous regular expression");
170 return (0);
171 }
172 if (*sp == '^') {
173 circf = 1;
174 sp++;
175 }
176 else
177 circf = 0;
178 for (;;) {
179 if (ep >= &expbuf[ESIZE])
180 comerr(retoolong);
181 if ((c = *sp++) == '\0') {
182 if (bracketp != bracket)
183 comerr("unmatched \\(");
184 *ep++ = CEOF;
185 *ep++ = 0;
186 return (0);
187 }
188 if (c != '*')
189 lastep = ep;
190 switch (c) {
191
192 case '.':
193 *ep++ = CDOT;
194 continue;
195
196 case '*':
197 if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
198 goto defchar;
199 *lastep |= CSTAR;
200 continue;
201
202 case '$':
203 if (*sp != '\0')
204 goto defchar;
205 *ep++ = CDOL;
206 continue;
207
208 case '[':
209 *ep++ = CCL;
210 *ep++ = 0;
211 cclcnt = 1;
212 if ((c = *sp++) == '^') {
213 c = *sp++;
214 ep[-2] = NCCL;
215 }
216 do {
217 if (c == '\0')
218 comerr("missing ]");
219 if (c == '-' && ep [-1] != 0) {
220 if ((c = *sp++) == ']') {
221 *ep++ = '-';
222 cclcnt++;
223 break;
224 }
225 while (ep[-1] < c) {
226 *ep = ep[-1] + 1;
227 ep++;
228 cclcnt++;
229 if (ep >= &expbuf[ESIZE])
230 comerr(retoolong);
231 }
232 }
233 *ep++ = c;
234 cclcnt++;
235 if (ep >= &expbuf[ESIZE])
236 comerr(retoolong);
237 } while ((c = *sp++) != ']');
238 lastep[1] = (char)cclcnt;
239 continue;
240
241 case '\\':
242 if ((c = *sp++) == '(') {
243 if (numbra >= NBRA)
244 comerr("too many \\(\\) pairs");
245 *bracketp++ = (char)numbra;
246 *ep++ = CBRA;
247 *ep++ = numbra++;
248 continue;
249 }
250 if (c == ')') {
251 if (bracketp <= bracket)
252 comerr("unmatched \\)");
253 *ep++ = CKET;
254 *ep++ = *--bracketp;
255 continue;
256 }
257 if (c >= '1' && c < ('1' + NBRA)) {
258 *ep++ = CBACK;
259 *ep++ = c - '1';
260 continue;
261 }
262 *ep++ = CCHR;
263 *ep++ = c;
264 continue;
265
266 defchar:
267 default:
268 *ep++ = CCHR;
269 *ep++ = c;
270 }
271 }
272 }
273
274 /*
275 * match the argument string against the compiled re
276 */
277 int
re_exec(char * p1)278 re_exec(char *p1)
279 {
280 struct re_globals *_re = re_globals;
281 char *p2;
282 int c;
283 int rv;
284
285 if (_re == 0)
286 return (0);
287 p2 = expbuf;
288 for (c = 0; c < NBRA; c++) {
289 braslist[c] = 0;
290 braelist[c] = 0;
291 }
292 if (circf)
293 return ((advance(p1, p2)));
294 /*
295 * fast check for first character
296 */
297 if (*p2 == CCHR) {
298 c = p2[1];
299 do {
300 if (*p1 != c)
301 continue;
302 if (rv = advance(p1, p2))
303 return (rv);
304 } while (*p1++);
305 return (0);
306 }
307 /*
308 * regular algorithm
309 */
310 do
311 if (rv = advance(p1, p2))
312 return (rv);
313 while (*p1++);
314 return (0);
315 }
316
317 /*
318 * try to match the next thing in the dfa
319 */
320 static int
advance(char * lp,char * ep)321 advance(char *lp, char *ep)
322 {
323 char *curlp;
324 int i;
325 ptrdiff_t ct;
326 int rv;
327 struct re_globals *_re = re_globals;
328
329 for (;;)
330 switch (*ep++) {
331
332 case CCHR:
333 if (*ep++ == *lp++)
334 continue;
335 return (0);
336
337 case CDOT:
338 if (*lp++)
339 continue;
340 return (0);
341
342 case CDOL:
343 if (*lp == '\0')
344 continue;
345 return (0);
346
347 case CEOF:
348 return (1);
349
350 case CCL:
351 if (cclass(ep, *lp++, 1)) {
352 ep += *ep;
353 continue;
354 }
355 return (0);
356
357 case NCCL:
358 if (cclass(ep, *lp++, 0)) {
359 ep += *ep;
360 continue;
361 }
362 return (0);
363
364 case CBRA:
365 braslist[*ep++] = lp;
366 continue;
367
368 case CKET:
369 braelist[*ep++] = lp;
370 continue;
371
372 case CBACK:
373 if (braelist[i = *ep++] == 0)
374 return (-1);
375 if (backref(i, lp)) {
376 lp += braelist[i] - braslist[i];
377 continue;
378 }
379 return (0);
380
381 case CBACK|CSTAR:
382 if (braelist[i = *ep++] == 0)
383 return (-1);
384 curlp = lp;
385 ct = braelist[i] - braslist[i];
386 while (backref(i, lp))
387 lp += ct;
388 while (lp >= curlp) {
389 if (rv = advance(lp, ep))
390 return (rv);
391 lp -= ct;
392 }
393 continue;
394
395 case CDOT|CSTAR:
396 curlp = lp;
397 while (*lp++)
398 ;
399 goto star;
400
401 case CCHR|CSTAR:
402 curlp = lp;
403 while (*lp++ == *ep)
404 ;
405 ep++;
406 goto star;
407
408 case CCL|CSTAR:
409 case NCCL|CSTAR:
410 curlp = lp;
411 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
412 ;
413 ep += *ep;
414 goto star;
415
416 star:
417 do {
418 lp--;
419 if (rv = advance(lp, ep))
420 return (rv);
421 } while (lp > curlp);
422 return (0);
423
424 default:
425 return (-1);
426 }
427 }
428
429 static int
backref(int i,char * lp)430 backref(int i, char *lp)
431 {
432 char *bp;
433 struct re_globals *_re = re_globals;
434
435 bp = braslist[i];
436 while (*bp++ == *lp++)
437 if (bp >= braelist[i])
438 return (1);
439 return (0);
440 }
441
442 static int
cclass(char * set,char c,int af)443 cclass(char *set, char c, int af)
444 {
445 int n;
446
447 if (c == 0)
448 return (0);
449 n = *set++;
450 while (--n)
451 if (*set++ == c)
452 return (af);
453 return (! af);
454 }
455