/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * routines to do regular expression matching * * Entry points: * * re_comp(s) * char *s; * ... returns 0 if the string s was compiled successfully, * a pointer to an error message otherwise. * If passed 0 or a null string returns without changing * the currently compiled re (see note 11 below). * * re_exec(s) * char *s; * ... returns 1 if the string s matches the last compiled regular * expression, * 0 if the string s failed to match the last compiled * regular expression, and * -1 if the compiled regular expression was invalid * (indicating an internal error). * * The strings passed to both re_comp and re_exec may have trailing or * embedded newline characters; they are terminated by nulls. * * The identity of the author of these routines is lost in antiquity; * this is essentially the same as the re code in the original V6 ed. * * The regular expressions recognized are described below. This description * is essentially the same as that for ed. * * A regular expression specifies a set of strings of characters. * A member of this set of strings is said to be matched by * the regular expression. In the following specification for * regular expressions the word `character' means any character but NUL. * * 1. Any character except a special character matches itself. * Special characters are the regular expression delimiter plus * \ [ . and sometimes ^ * $. * 2. A . matches any character. * 3. A \ followed by any character except a digit or ( ) * matches that character. * 4. A nonempty string s bracketed [s] (or [^s]) matches any * character in (or not in) s. In s, \ has no special meaning, * and ] may only appear as the first letter. A substring * a-b, with a and b in ascending ASCII order, stands for * the inclusive range of ASCII characters. * 5. A regular expression of form 1-4 followed by * matches a * sequence of 0 or more matches of the regular expression. * 6. A regular expression, x, of form 1-8, bracketed \(x\) * matches what x matches. * 7. A \ followed by a digit n matches a copy of the string that the * bracketed regular expression beginning with the nth \( matched. * 8. A regular expression of form 1-8, x, followed by a regular * expression of form 1-7, y matches a match for x followed by * a match for y, with the x match being as long as possible * while still permitting a y match. * 9. A regular expression of form 1-8 preceded by ^ (or followed * by $), is constrained to matches that begin at the left * (or end at the right) end of a line. * 10. A regular expression of form 1-9 picks out the longest among * the leftmost matches in a line. * 11. An empty regular expression stands for a copy of the last * regular expression encountered. */ #include "lint.h" #include #include #include #include /* * constants for re's */ #define CBRA 1 #define CCHR 2 #define CDOT 4 #define CCL 6 #define NCCL 8 #define CDOL 10 #define CEOF 11 #define CKET 12 #define CBACK 18 #define CSTAR 01 #define ESIZE 512 #define NBRA 9 static struct re_globals { char _expbuf[ESIZE]; char *_braslist[NBRA], *_braelist[NBRA]; char _circf; } *re_globals; #define expbuf (_re->_expbuf) #define braslist (_re->_braslist) #define braelist (_re->_braelist) #define circf (_re->_circf) static int advance(const char *, char *); static int backref(int, const char *); static int cclass(char *, char, int); /* * compile the regular expression argument into a dfa */ char * re_comp(const char *sp) { char c; struct re_globals *_re = re_globals; char *ep; char cclcnt, numbra = 0; char *lastep = NULL; char bracket[NBRA]; char *bracketp = &bracket[0]; char *retoolong = "Regular expression too long"; if (_re == NULL) { _re = (struct re_globals *)calloc(1, sizeof (*_re)); if (_re == NULL) return ("Out of memory"); re_globals = _re; } ep = expbuf; #define comerr(msg) {expbuf[0] = 0; return (msg); } if (sp == NULL || *sp == '\0') { if (*ep == 0) return ("No previous regular expression"); return (NULL); } if (*sp == '^') { circf = 1; sp++; } else circf = 0; for (;;) { if (ep >= &expbuf[ESIZE]) comerr(retoolong); if ((c = *sp++) == '\0') { if (bracketp != bracket) comerr("unmatched \\("); *ep++ = CEOF; *ep++ = 0; return (NULL); } if (c != '*') lastep = ep; switch (c) { case '.': *ep++ = CDOT; continue; case '*': if (lastep == NULL || *lastep == CBRA || *lastep == CKET) goto defchar; *lastep |= CSTAR; continue; case '$': if (*sp != '\0') goto defchar; *ep++ = CDOL; continue; case '[': *ep++ = CCL; *ep++ = 0; cclcnt = 1; if ((c = *sp++) == '^') { c = *sp++; ep[-2] = NCCL; } do { if (c == '\0') comerr("missing ]"); if (c == '-' && ep [-1] != 0) { if ((c = *sp++) == ']') { *ep++ = '-'; cclcnt++; break; } while (ep[-1] < c) { *ep = ep[-1] + 1; ep++; cclcnt++; if (ep >= &expbuf[ESIZE]) comerr(retoolong); } } *ep++ = c; cclcnt++; if (ep >= &expbuf[ESIZE]) comerr(retoolong); } while ((c = *sp++) != ']'); lastep[1] = cclcnt; continue; case '\\': if ((c = *sp++) == '(') { if (numbra >= NBRA) comerr("too many \\(\\) pairs"); *bracketp++ = numbra; *ep++ = CBRA; *ep++ = numbra++; continue; } if (c == ')') { if (bracketp <= bracket) comerr("unmatched \\)"); *ep++ = CKET; *ep++ = *--bracketp; continue; } if (c >= '1' && c < ('1' + NBRA)) { *ep++ = CBACK; *ep++ = c - '1'; continue; } *ep++ = CCHR; *ep++ = c; continue; defchar: default: *ep++ = CCHR; *ep++ = c; } } } /* * match the argument string against the compiled re */ int re_exec(const char *p1) { struct re_globals *_re = re_globals; char *p2; int c; int rv; if (_re == NULL) return (0); p2 = expbuf; for (c = 0; c < NBRA; c++) { braslist[c] = 0; braelist[c] = 0; } if (circf) return ((advance(p1, p2))); /* * fast check for first character */ if (*p2 == CCHR) { c = p2[1]; do { if (*p1 != c) continue; rv = advance(p1, p2); if (rv != 0) return (rv); } while (*p1++); return (0); } /* * regular algorithm */ do { rv = advance(p1, p2); if (rv != 0) return (rv); } while (*p1++); return (0); } /* * try to match the next thing in the dfa */ static int advance(const char *lp, char *ep) { const char *curlp; ptrdiff_t ct; int i; int rv; struct re_globals *_re = re_globals; for (;;) switch (*ep++) { case CCHR: if (*ep++ == *lp++) continue; return (0); case CDOT: if (*lp++) continue; return (0); case CDOL: if (*lp == '\0') continue; return (0); case CEOF: return (1); case CCL: if (cclass(ep, *lp++, 1)) { ep += *ep; continue; } return (0); case NCCL: if (cclass(ep, *lp++, 0)) { ep += *ep; continue; } return (0); case CBRA: braslist[*ep++] = (char *)lp; continue; case CKET: braelist[*ep++] = (char *)lp; continue; case CBACK: if (braelist[i = *ep++] == NULL) return (-1); if (backref(i, lp)) { lp += braelist[i] - braslist[i]; continue; } return (0); case CBACK|CSTAR: if (braelist[i = *ep++] == NULL) return (-1); curlp = lp; ct = braelist[i] - braslist[i]; while (backref(i, lp)) lp += ct; while (lp >= curlp) { rv = advance(lp, ep); if (rv != 0) return (rv); lp -= ct; } continue; case CDOT|CSTAR: curlp = lp; while (*lp++) ; goto star; case CCHR|CSTAR: curlp = lp; while (*lp++ == *ep) ; ep++; goto star; case CCL|CSTAR: case NCCL|CSTAR: curlp = lp; while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) ; ep += *ep; goto star; star: do { lp--; rv = advance(lp, ep); if (rv != 0) return (rv); } while (lp > curlp); return (0); default: return (-1); } } static int backref(int i, const char *lp) { char *bp; struct re_globals *_re = re_globals; bp = braslist[i]; while (*bp++ == *lp++) if (bp >= braelist[i]) return (1); return (0); } static int cclass(char *set, char c, int af) { int n; if (c == 0) return (0); n = *set++; while (--n) if (*set++ == c) return (af); return (! af); }