1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * routines to do regular expression matching 32 * 33 * Entry points: 34 * 35 * re_comp(s) 36 * char *s; 37 * ... returns 0 if the string s was compiled successfully, 38 * a pointer to an error message otherwise. 39 * If passed 0 or a null string returns without changing 40 * the currently compiled re (see note 11 below). 41 * 42 * re_exec(s) 43 * char *s; 44 * ... returns 1 if the string s matches the last compiled regular 45 * expression, 46 * 0 if the string s failed to match the last compiled 47 * regular expression, and 48 * -1 if the compiled regular expression was invalid 49 * (indicating an internal error). 50 * 51 * The strings passed to both re_comp and re_exec may have trailing or 52 * embedded newline characters; they are terminated by nulls. 53 * 54 * The identity of the author of these routines is lost in antiquity; 55 * this is essentially the same as the re code in the original V6 ed. 56 * 57 * The regular expressions recognized are described below. This description 58 * is essentially the same as that for ed. 59 * 60 * A regular expression specifies a set of strings of characters. 61 * A member of this set of strings is said to be matched by 62 * the regular expression. In the following specification for 63 * regular expressions the word `character' means any character but NUL. 64 * 65 * 1. Any character except a special character matches itself. 66 * Special characters are the regular expression delimiter plus 67 * \ [ . and sometimes ^ * $. 68 * 2. A . matches any character. 69 * 3. A \ followed by any character except a digit or ( ) 70 * matches that character. 71 * 4. A nonempty string s bracketed [s] (or [^s]) matches any 72 * character in (or not in) s. In s, \ has no special meaning, 73 * and ] may only appear as the first letter. A substring 74 * a-b, with a and b in ascending ASCII order, stands for 75 * the inclusive range of ASCII characters. 76 * 5. A regular expression of form 1-4 followed by * matches a 77 * sequence of 0 or more matches of the regular expression. 78 * 6. A regular expression, x, of form 1-8, bracketed \(x\) 79 * matches what x matches. 80 * 7. A \ followed by a digit n matches a copy of the string that the 81 * bracketed regular expression beginning with the nth \( matched. 82 * 8. A regular expression of form 1-8, x, followed by a regular 83 * expression of form 1-7, y matches a match for x followed by 84 * a match for y, with the x match being as long as possible 85 * while still permitting a y match. 86 * 9. A regular expression of form 1-8 preceded by ^ (or followed 87 * by $), is constrained to matches that begin at the left 88 * (or end at the right) end of a line. 89 * 10. A regular expression of form 1-9 picks out the longest among 90 * the leftmost matches in a line. 91 * 11. An empty regular expression stands for a copy of the last 92 * regular expression encountered. 93 */ 94 95 #include "lint.h" 96 97 #include <stdlib.h> 98 #include <re_comp.h> 99 #include <stddef.h> 100 #include <sys/types.h> 101 102 /* 103 * constants for re's 104 */ 105 #define CBRA 1 106 #define CCHR 2 107 #define CDOT 4 108 #define CCL 6 109 #define NCCL 8 110 #define CDOL 10 111 #define CEOF 11 112 #define CKET 12 113 #define CBACK 18 114 115 #define CSTAR 01 116 117 #define ESIZE 512 118 #define NBRA 9 119 120 static struct re_globals { 121 char _expbuf[ESIZE]; 122 char *_braslist[NBRA], *_braelist[NBRA]; 123 char _circf; 124 } *re_globals; 125 #define expbuf (_re->_expbuf) 126 #define braslist (_re->_braslist) 127 #define braelist (_re->_braelist) 128 #define circf (_re->_circf) 129 130 static int advance(const char *, char *); 131 static int backref(int, const char *); 132 static int cclass(char *, char, int); 133 134 /* 135 * compile the regular expression argument into a dfa 136 */ 137 char * 138 re_comp(const char *sp) 139 { 140 char c; 141 struct re_globals *_re = re_globals; 142 char *ep; 143 char cclcnt, numbra = 0; 144 char *lastep = NULL; 145 char bracket[NBRA]; 146 char *bracketp = &bracket[0]; 147 char *retoolong = "Regular expression too long"; 148 149 if (_re == NULL) { 150 _re = (struct re_globals *)calloc(1, sizeof (*_re)); 151 if (_re == NULL) 152 return ("Out of memory"); 153 re_globals = _re; 154 } 155 ep = expbuf; 156 157 #define comerr(msg) {expbuf[0] = 0; return (msg); } 158 159 if (sp == NULL || *sp == '\0') { 160 if (*ep == 0) 161 return ("No previous regular expression"); 162 return (NULL); 163 } 164 if (*sp == '^') { 165 circf = 1; 166 sp++; 167 } 168 else 169 circf = 0; 170 for (;;) { 171 if (ep >= &expbuf[ESIZE]) 172 comerr(retoolong); 173 if ((c = *sp++) == '\0') { 174 if (bracketp != bracket) 175 comerr("unmatched \\("); 176 *ep++ = CEOF; 177 *ep++ = 0; 178 return (NULL); 179 } 180 if (c != '*') 181 lastep = ep; 182 switch (c) { 183 184 case '.': 185 *ep++ = CDOT; 186 continue; 187 188 case '*': 189 if (lastep == NULL || *lastep == CBRA || 190 *lastep == CKET) 191 goto defchar; 192 *lastep |= CSTAR; 193 continue; 194 195 case '$': 196 if (*sp != '\0') 197 goto defchar; 198 *ep++ = CDOL; 199 continue; 200 201 case '[': 202 *ep++ = CCL; 203 *ep++ = 0; 204 cclcnt = 1; 205 if ((c = *sp++) == '^') { 206 c = *sp++; 207 ep[-2] = NCCL; 208 } 209 do { 210 if (c == '\0') 211 comerr("missing ]"); 212 if (c == '-' && ep [-1] != 0) { 213 if ((c = *sp++) == ']') { 214 *ep++ = '-'; 215 cclcnt++; 216 break; 217 } 218 while (ep[-1] < c) { 219 *ep = ep[-1] + 1; 220 ep++; 221 cclcnt++; 222 if (ep >= &expbuf[ESIZE]) 223 comerr(retoolong); 224 } 225 } 226 *ep++ = c; 227 cclcnt++; 228 if (ep >= &expbuf[ESIZE]) 229 comerr(retoolong); 230 } while ((c = *sp++) != ']'); 231 lastep[1] = cclcnt; 232 continue; 233 234 case '\\': 235 if ((c = *sp++) == '(') { 236 if (numbra >= NBRA) 237 comerr("too many \\(\\) pairs"); 238 *bracketp++ = numbra; 239 *ep++ = CBRA; 240 *ep++ = numbra++; 241 continue; 242 } 243 if (c == ')') { 244 if (bracketp <= bracket) 245 comerr("unmatched \\)"); 246 *ep++ = CKET; 247 *ep++ = *--bracketp; 248 continue; 249 } 250 if (c >= '1' && c < ('1' + NBRA)) { 251 *ep++ = CBACK; 252 *ep++ = c - '1'; 253 continue; 254 } 255 *ep++ = CCHR; 256 *ep++ = c; 257 continue; 258 259 defchar: 260 default: 261 *ep++ = CCHR; 262 *ep++ = c; 263 } 264 } 265 } 266 267 /* 268 * match the argument string against the compiled re 269 */ 270 int 271 re_exec(const char *p1) 272 { 273 struct re_globals *_re = re_globals; 274 char *p2; 275 int c; 276 int rv; 277 278 if (_re == NULL) 279 return (0); 280 p2 = expbuf; 281 for (c = 0; c < NBRA; c++) { 282 braslist[c] = 0; 283 braelist[c] = 0; 284 } 285 if (circf) 286 return ((advance(p1, p2))); 287 /* 288 * fast check for first character 289 */ 290 if (*p2 == CCHR) { 291 c = p2[1]; 292 do { 293 if (*p1 != c) 294 continue; 295 rv = advance(p1, p2); 296 if (rv != 0) 297 return (rv); 298 } while (*p1++); 299 return (0); 300 } 301 /* 302 * regular algorithm 303 */ 304 do { 305 rv = advance(p1, p2); 306 if (rv != 0) 307 return (rv); 308 } while (*p1++); 309 return (0); 310 } 311 312 /* 313 * try to match the next thing in the dfa 314 */ 315 static int 316 advance(const char *lp, char *ep) 317 { 318 const char *curlp; 319 ptrdiff_t ct; 320 int i; 321 int rv; 322 struct re_globals *_re = re_globals; 323 324 for (;;) 325 switch (*ep++) { 326 327 case CCHR: 328 if (*ep++ == *lp++) 329 continue; 330 return (0); 331 332 case CDOT: 333 if (*lp++) 334 continue; 335 return (0); 336 337 case CDOL: 338 if (*lp == '\0') 339 continue; 340 return (0); 341 342 case CEOF: 343 return (1); 344 345 case CCL: 346 if (cclass(ep, *lp++, 1)) { 347 ep += *ep; 348 continue; 349 } 350 return (0); 351 352 case NCCL: 353 if (cclass(ep, *lp++, 0)) { 354 ep += *ep; 355 continue; 356 } 357 return (0); 358 359 case CBRA: 360 braslist[*ep++] = (char *)lp; 361 continue; 362 363 case CKET: 364 braelist[*ep++] = (char *)lp; 365 continue; 366 367 case CBACK: 368 if (braelist[i = *ep++] == NULL) 369 return (-1); 370 if (backref(i, lp)) { 371 lp += braelist[i] - braslist[i]; 372 continue; 373 } 374 return (0); 375 376 case CBACK|CSTAR: 377 if (braelist[i = *ep++] == NULL) 378 return (-1); 379 curlp = lp; 380 ct = braelist[i] - braslist[i]; 381 while (backref(i, lp)) 382 lp += ct; 383 while (lp >= curlp) { 384 rv = advance(lp, ep); 385 if (rv != 0) 386 return (rv); 387 lp -= ct; 388 } 389 continue; 390 391 case CDOT|CSTAR: 392 curlp = lp; 393 while (*lp++) 394 ; 395 goto star; 396 397 case CCHR|CSTAR: 398 curlp = lp; 399 while (*lp++ == *ep) 400 ; 401 ep++; 402 goto star; 403 404 case CCL|CSTAR: 405 case NCCL|CSTAR: 406 curlp = lp; 407 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 408 ; 409 ep += *ep; 410 goto star; 411 412 star: 413 do { 414 lp--; 415 rv = advance(lp, ep); 416 if (rv != 0) 417 return (rv); 418 } while (lp > curlp); 419 return (0); 420 421 default: 422 return (-1); 423 } 424 } 425 426 static int 427 backref(int i, const char *lp) 428 { 429 char *bp; 430 struct re_globals *_re = re_globals; 431 432 bp = braslist[i]; 433 while (*bp++ == *lp++) 434 if (bp >= braelist[i]) 435 return (1); 436 return (0); 437 } 438 439 static int 440 cclass(char *set, char c, int af) 441 { 442 int n; 443 444 if (c == 0) 445 return (0); 446 n = *set++; 447 while (--n) 448 if (*set++ == c) 449 return (af); 450 return (! af); 451 } 452