1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright (c) 1997, by Sun Microsystems, Inc.
24 * All rights reserved.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30
31 /* Portions Copyright(c) 1988, Sun Microsystems Inc. */
32 /* All Rights Reserved */
33
34 /*LINTLIBRARY*/
35
36 /*
37 * routines to do regular expression matching
38 *
39 * Entry points:
40 *
41 * re_comp(s)
42 * char *s;
43 * ... returns 0 if the string s was compiled successfully,
44 * a pointer to an error message otherwise.
45 * If passed 0 or a null string returns without changing
46 * the currently compiled re (see note 11 below).
47 *
48 * re_exec(s)
49 * char *s;
50 * ... returns 1 if the string s matches the last compiled regular
51 * expression,
52 * 0 if the string s failed to match the last compiled
53 * regular expression, and
54 * -1 if the compiled regular expression was invalid
55 * (indicating an internal error).
56 *
57 * The strings passed to both re_comp and re_exec may have trailing or
58 * embedded newline characters; they are terminated by nulls.
59 *
60 * The identity of the author of these routines is lost in antiquity;
61 * this is essentially the same as the re code in the original V6 ed.
62 *
63 * The regular expressions recognized are described below. This description
64 * is essentially the same as that for ed.
65 *
66 * A regular expression specifies a set of strings of characters.
67 * A member of this set of strings is said to be matched by
68 * the regular expression. In the following specification for
69 * regular expressions the word `character' means any character but NUL.
70 *
71 * 1. Any character except a special character matches itself.
72 * Special characters are the regular expression delimiter plus
73 * \ [ . and sometimes ^ * $.
74 * 2. A . matches any character.
75 * 3. A \ followed by any character except a digit or ( )
76 * matches that character.
77 * 4. A nonempty string s bracketed [s] (or [^s]) matches any
78 * character in (or not in) s. In s, \ has no special meaning,
79 * and ] may only appear as the first letter. A substring
80 * a-b, with a and b in ascending ASCII order, stands for
81 * the inclusive range of ASCII characters.
82 * 5. A regular expression of form 1-4 followed by * matches a
83 * sequence of 0 or more matches of the regular expression.
84 * 6. A regular expression, x, of form 1-8, bracketed \(x\)
85 * matches what x matches.
86 * 7. A \ followed by a digit n matches a copy of the string that the
87 * bracketed regular expression beginning with the nth \( matched.
88 * 8. A regular expression of form 1-8, x, followed by a regular
89 * expression of form 1-7, y matches a match for x followed by
90 * a match for y, with the x match being as long as possible
91 * while still permitting a y match.
92 * 9. A regular expression of form 1-8 preceded by ^ (or followed
93 * by $), is constrained to matches that begin at the left
94 * (or end at the right) end of a line.
95 * 10. A regular expression of form 1-9 picks out the longest among
96 * the leftmost matches in a line.
97 * 11. An empty regular expression stands for a copy of the last
98 * regular expression encountered.
99 */
100
101 #include <sys/types.h>
102 #include <stdlib.h>
103 #include <stddef.h>
104
105 /*
106 * constants for re's
107 */
108 #define CBRA 1
109 #define CCHR 2
110 #define CDOT 4
111 #define CCL 6
112 #define NCCL 8
113 #define CDOL 10
114 #define CEOF 11
115 #define CKET 12
116 #define CBACK 18
117
118 #define CSTAR 01
119
120 #define ESIZE 512
121 #define NBRA 9
122
123 static struct re_globals {
124 char _expbuf[ESIZE];
125 char *_braslist[NBRA], *_braelist[NBRA];
126 char _circf;
127 } *re_globals;
128 #define expbuf (_re->_expbuf)
129 #define braslist (_re->_braslist)
130 #define braelist (_re->_braelist)
131 #define circf (_re->_circf)
132
133 /*
134 * forward declarations
135 */
136 static int backref(int, char *);
137 static int advance(char *, char *);
138 static int cclass(char *, char, int);
139
140 /*
141 * compile the regular expression argument into a dfa
142 */
143 char *
re_comp(char * sp)144 re_comp(char *sp)
145 {
146 char c;
147 struct re_globals *_re = re_globals;
148 char *ep;
149 int cclcnt, numbra = 0;
150 char *lastep = 0;
151 char bracket[NBRA];
152 char *bracketp = &bracket[0];
153 char *retoolong = "Regular expression too long";
154
155 if (_re == 0) {
156 _re = (struct re_globals *)calloc(1, sizeof (*_re));
157 if (_re == 0)
158 return ("Out of memory");
159 re_globals = _re;
160 }
161 ep = expbuf;
162
163 #define comerr(msg) {expbuf[0] = 0; return (msg); }
164
165 if (sp == 0 || *sp == '\0') {
166 if (*ep == 0)
167 return ("No previous regular expression");
168 return (0);
169 }
170 if (*sp == '^') {
171 circf = 1;
172 sp++;
173 }
174 else
175 circf = 0;
176 for (;;) {
177 if (ep >= &expbuf[ESIZE])
178 comerr(retoolong);
179 if ((c = *sp++) == '\0') {
180 if (bracketp != bracket)
181 comerr("unmatched \\(");
182 *ep++ = CEOF;
183 *ep++ = 0;
184 return (0);
185 }
186 if (c != '*')
187 lastep = ep;
188 switch (c) {
189
190 case '.':
191 *ep++ = CDOT;
192 continue;
193
194 case '*':
195 if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
196 goto defchar;
197 *lastep |= CSTAR;
198 continue;
199
200 case '$':
201 if (*sp != '\0')
202 goto defchar;
203 *ep++ = CDOL;
204 continue;
205
206 case '[':
207 *ep++ = CCL;
208 *ep++ = 0;
209 cclcnt = 1;
210 if ((c = *sp++) == '^') {
211 c = *sp++;
212 ep[-2] = NCCL;
213 }
214 do {
215 if (c == '\0')
216 comerr("missing ]");
217 if (c == '-' && ep [-1] != 0) {
218 if ((c = *sp++) == ']') {
219 *ep++ = '-';
220 cclcnt++;
221 break;
222 }
223 while (ep[-1] < c) {
224 *ep = ep[-1] + 1;
225 ep++;
226 cclcnt++;
227 if (ep >= &expbuf[ESIZE])
228 comerr(retoolong);
229 }
230 }
231 *ep++ = c;
232 cclcnt++;
233 if (ep >= &expbuf[ESIZE])
234 comerr(retoolong);
235 } while ((c = *sp++) != ']');
236 lastep[1] = (char)cclcnt;
237 continue;
238
239 case '\\':
240 if ((c = *sp++) == '(') {
241 if (numbra >= NBRA)
242 comerr("too many \\(\\) pairs");
243 *bracketp++ = (char)numbra;
244 *ep++ = CBRA;
245 *ep++ = numbra++;
246 continue;
247 }
248 if (c == ')') {
249 if (bracketp <= bracket)
250 comerr("unmatched \\)");
251 *ep++ = CKET;
252 *ep++ = *--bracketp;
253 continue;
254 }
255 if (c >= '1' && c < ('1' + NBRA)) {
256 *ep++ = CBACK;
257 *ep++ = c - '1';
258 continue;
259 }
260 *ep++ = CCHR;
261 *ep++ = c;
262 continue;
263
264 defchar:
265 default:
266 *ep++ = CCHR;
267 *ep++ = c;
268 }
269 }
270 }
271
272 /*
273 * match the argument string against the compiled re
274 */
275 int
re_exec(char * p1)276 re_exec(char *p1)
277 {
278 struct re_globals *_re = re_globals;
279 char *p2;
280 int c;
281 int rv;
282
283 if (_re == 0)
284 return (0);
285 p2 = expbuf;
286 for (c = 0; c < NBRA; c++) {
287 braslist[c] = 0;
288 braelist[c] = 0;
289 }
290 if (circf)
291 return ((advance(p1, p2)));
292 /*
293 * fast check for first character
294 */
295 if (*p2 == CCHR) {
296 c = p2[1];
297 do {
298 if (*p1 != c)
299 continue;
300 if (rv = advance(p1, p2))
301 return (rv);
302 } while (*p1++);
303 return (0);
304 }
305 /*
306 * regular algorithm
307 */
308 do
309 if (rv = advance(p1, p2))
310 return (rv);
311 while (*p1++);
312 return (0);
313 }
314
315 /*
316 * try to match the next thing in the dfa
317 */
318 static int
advance(char * lp,char * ep)319 advance(char *lp, char *ep)
320 {
321 char *curlp;
322 int i;
323 ptrdiff_t ct;
324 int rv;
325 struct re_globals *_re = re_globals;
326
327 for (;;)
328 switch (*ep++) {
329
330 case CCHR:
331 if (*ep++ == *lp++)
332 continue;
333 return (0);
334
335 case CDOT:
336 if (*lp++)
337 continue;
338 return (0);
339
340 case CDOL:
341 if (*lp == '\0')
342 continue;
343 return (0);
344
345 case CEOF:
346 return (1);
347
348 case CCL:
349 if (cclass(ep, *lp++, 1)) {
350 ep += *ep;
351 continue;
352 }
353 return (0);
354
355 case NCCL:
356 if (cclass(ep, *lp++, 0)) {
357 ep += *ep;
358 continue;
359 }
360 return (0);
361
362 case CBRA:
363 braslist[*ep++] = lp;
364 continue;
365
366 case CKET:
367 braelist[*ep++] = lp;
368 continue;
369
370 case CBACK:
371 if (braelist[i = *ep++] == 0)
372 return (-1);
373 if (backref(i, lp)) {
374 lp += braelist[i] - braslist[i];
375 continue;
376 }
377 return (0);
378
379 case CBACK|CSTAR:
380 if (braelist[i = *ep++] == 0)
381 return (-1);
382 curlp = lp;
383 ct = braelist[i] - braslist[i];
384 while (backref(i, lp))
385 lp += ct;
386 while (lp >= curlp) {
387 if (rv = advance(lp, ep))
388 return (rv);
389 lp -= ct;
390 }
391 continue;
392
393 case CDOT|CSTAR:
394 curlp = lp;
395 while (*lp++)
396 ;
397 goto star;
398
399 case CCHR|CSTAR:
400 curlp = lp;
401 while (*lp++ == *ep)
402 ;
403 ep++;
404 goto star;
405
406 case CCL|CSTAR:
407 case NCCL|CSTAR:
408 curlp = lp;
409 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
410 ;
411 ep += *ep;
412 goto star;
413
414 star:
415 do {
416 lp--;
417 if (rv = advance(lp, ep))
418 return (rv);
419 } while (lp > curlp);
420 return (0);
421
422 default:
423 return (-1);
424 }
425 }
426
427 static int
backref(int i,char * lp)428 backref(int i, char *lp)
429 {
430 char *bp;
431 struct re_globals *_re = re_globals;
432
433 bp = braslist[i];
434 while (*bp++ == *lp++)
435 if (bp >= braelist[i])
436 return (1);
437 return (0);
438 }
439
440 static int
cclass(char * set,char c,int af)441 cclass(char *set, char c, int af)
442 {
443 int n;
444
445 if (c == 0)
446 return (0);
447 n = *set++;
448 while (--n)
449 if (*set++ == c)
450 return (af);
451 return (! af);
452 }
453