1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 #pragma ident "%Z%%M% %I% %E% SMI" 31 32 /* 33 * routines to do regular expression matching 34 * 35 * Entry points: 36 * 37 * re_comp(s) 38 * char *s; 39 * ... returns 0 if the string s was compiled successfully, 40 * a pointer to an error message otherwise. 41 * If passed 0 or a null string returns without changing 42 * the currently compiled re (see note 11 below). 43 * 44 * re_exec(s) 45 * char *s; 46 * ... returns 1 if the string s matches the last compiled regular 47 * expression, 48 * 0 if the string s failed to match the last compiled 49 * regular expression, and 50 * -1 if the compiled regular expression was invalid 51 * (indicating an internal error). 52 * 53 * The strings passed to both re_comp and re_exec may have trailing or 54 * embedded newline characters; they are terminated by nulls. 55 * 56 * The identity of the author of these routines is lost in antiquity; 57 * this is essentially the same as the re code in the original V6 ed. 58 * 59 * The regular expressions recognized are described below. This description 60 * is essentially the same as that for ed. 61 * 62 * A regular expression specifies a set of strings of characters. 63 * A member of this set of strings is said to be matched by 64 * the regular expression. In the following specification for 65 * regular expressions the word `character' means any character but NUL. 66 * 67 * 1. Any character except a special character matches itself. 68 * Special characters are the regular expression delimiter plus 69 * \ [ . and sometimes ^ * $. 70 * 2. A . matches any character. 71 * 3. A \ followed by any character except a digit or ( ) 72 * matches that character. 73 * 4. A nonempty string s bracketed [s] (or [^s]) matches any 74 * character in (or not in) s. In s, \ has no special meaning, 75 * and ] may only appear as the first letter. A substring 76 * a-b, with a and b in ascending ASCII order, stands for 77 * the inclusive range of ASCII characters. 78 * 5. A regular expression of form 1-4 followed by * matches a 79 * sequence of 0 or more matches of the regular expression. 80 * 6. A regular expression, x, of form 1-8, bracketed \(x\) 81 * matches what x matches. 82 * 7. A \ followed by a digit n matches a copy of the string that the 83 * bracketed regular expression beginning with the nth \( matched. 84 * 8. A regular expression of form 1-8, x, followed by a regular 85 * expression of form 1-7, y matches a match for x followed by 86 * a match for y, with the x match being as long as possible 87 * while still permitting a y match. 88 * 9. A regular expression of form 1-8 preceded by ^ (or followed 89 * by $), is constrained to matches that begin at the left 90 * (or end at the right) end of a line. 91 * 10. A regular expression of form 1-9 picks out the longest among 92 * the leftmost matches in a line. 93 * 11. An empty regular expression stands for a copy of the last 94 * regular expression encountered. 95 */ 96 97 #include "lint.h" 98 99 #include <stdlib.h> 100 #include <re_comp.h> 101 #include <stddef.h> 102 #include <sys/types.h> 103 104 /* 105 * constants for re's 106 */ 107 #define CBRA 1 108 #define CCHR 2 109 #define CDOT 4 110 #define CCL 6 111 #define NCCL 8 112 #define CDOL 10 113 #define CEOF 11 114 #define CKET 12 115 #define CBACK 18 116 117 #define CSTAR 01 118 119 #define ESIZE 512 120 #define NBRA 9 121 122 static struct re_globals { 123 char _expbuf[ESIZE]; 124 char *_braslist[NBRA], *_braelist[NBRA]; 125 char _circf; 126 } *re_globals; 127 #define expbuf (_re->_expbuf) 128 #define braslist (_re->_braslist) 129 #define braelist (_re->_braelist) 130 #define circf (_re->_circf) 131 132 static int advance(const char *, char *); 133 static int backref(int, const char *); 134 static int cclass(char *, char, int); 135 136 /* 137 * compile the regular expression argument into a dfa 138 */ 139 char * 140 re_comp(const char *sp) 141 { 142 char c; 143 struct re_globals *_re = re_globals; 144 char *ep; 145 char cclcnt, numbra = 0; 146 char *lastep = NULL; 147 char bracket[NBRA]; 148 char *bracketp = &bracket[0]; 149 char *retoolong = "Regular expression too long"; 150 151 if (_re == NULL) { 152 _re = (struct re_globals *)calloc(1, sizeof (*_re)); 153 if (_re == NULL) 154 return ("Out of memory"); 155 re_globals = _re; 156 } 157 ep = expbuf; 158 159 #define comerr(msg) {expbuf[0] = 0; return (msg); } 160 161 if (sp == NULL || *sp == '\0') { 162 if (*ep == 0) 163 return ("No previous regular expression"); 164 return (NULL); 165 } 166 if (*sp == '^') { 167 circf = 1; 168 sp++; 169 } 170 else 171 circf = 0; 172 for (;;) { 173 if (ep >= &expbuf[ESIZE]) 174 comerr(retoolong); 175 if ((c = *sp++) == '\0') { 176 if (bracketp != bracket) 177 comerr("unmatched \\("); 178 *ep++ = CEOF; 179 *ep++ = 0; 180 return (NULL); 181 } 182 if (c != '*') 183 lastep = ep; 184 switch (c) { 185 186 case '.': 187 *ep++ = CDOT; 188 continue; 189 190 case '*': 191 if (lastep == NULL || *lastep == CBRA || 192 *lastep == CKET) 193 goto defchar; 194 *lastep |= CSTAR; 195 continue; 196 197 case '$': 198 if (*sp != '\0') 199 goto defchar; 200 *ep++ = CDOL; 201 continue; 202 203 case '[': 204 *ep++ = CCL; 205 *ep++ = 0; 206 cclcnt = 1; 207 if ((c = *sp++) == '^') { 208 c = *sp++; 209 ep[-2] = NCCL; 210 } 211 do { 212 if (c == '\0') 213 comerr("missing ]"); 214 if (c == '-' && ep [-1] != 0) { 215 if ((c = *sp++) == ']') { 216 *ep++ = '-'; 217 cclcnt++; 218 break; 219 } 220 while (ep[-1] < c) { 221 *ep = ep[-1] + 1; 222 ep++; 223 cclcnt++; 224 if (ep >= &expbuf[ESIZE]) 225 comerr(retoolong); 226 } 227 } 228 *ep++ = c; 229 cclcnt++; 230 if (ep >= &expbuf[ESIZE]) 231 comerr(retoolong); 232 } while ((c = *sp++) != ']'); 233 lastep[1] = cclcnt; 234 continue; 235 236 case '\\': 237 if ((c = *sp++) == '(') { 238 if (numbra >= NBRA) 239 comerr("too many \\(\\) pairs"); 240 *bracketp++ = numbra; 241 *ep++ = CBRA; 242 *ep++ = numbra++; 243 continue; 244 } 245 if (c == ')') { 246 if (bracketp <= bracket) 247 comerr("unmatched \\)"); 248 *ep++ = CKET; 249 *ep++ = *--bracketp; 250 continue; 251 } 252 if (c >= '1' && c < ('1' + NBRA)) { 253 *ep++ = CBACK; 254 *ep++ = c - '1'; 255 continue; 256 } 257 *ep++ = CCHR; 258 *ep++ = c; 259 continue; 260 261 defchar: 262 default: 263 *ep++ = CCHR; 264 *ep++ = c; 265 } 266 } 267 } 268 269 /* 270 * match the argument string against the compiled re 271 */ 272 int 273 re_exec(const char *p1) 274 { 275 struct re_globals *_re = re_globals; 276 char *p2; 277 int c; 278 int rv; 279 280 if (_re == NULL) 281 return (0); 282 p2 = expbuf; 283 for (c = 0; c < NBRA; c++) { 284 braslist[c] = 0; 285 braelist[c] = 0; 286 } 287 if (circf) 288 return ((advance(p1, p2))); 289 /* 290 * fast check for first character 291 */ 292 if (*p2 == CCHR) { 293 c = p2[1]; 294 do { 295 if (*p1 != c) 296 continue; 297 if (rv = advance(p1, p2)) 298 return (rv); 299 } while (*p1++); 300 return (0); 301 } 302 /* 303 * regular algorithm 304 */ 305 do { 306 if (rv = advance(p1, p2)) 307 return (rv); 308 } while (*p1++); 309 return (0); 310 } 311 312 /* 313 * try to match the next thing in the dfa 314 */ 315 static int 316 advance(const char *lp, char *ep) 317 { 318 const char *curlp; 319 ptrdiff_t ct; 320 int i; 321 int rv; 322 struct re_globals *_re = re_globals; 323 324 for (;;) 325 switch (*ep++) { 326 327 case CCHR: 328 if (*ep++ == *lp++) 329 continue; 330 return (0); 331 332 case CDOT: 333 if (*lp++) 334 continue; 335 return (0); 336 337 case CDOL: 338 if (*lp == '\0') 339 continue; 340 return (0); 341 342 case CEOF: 343 return (1); 344 345 case CCL: 346 if (cclass(ep, *lp++, 1)) { 347 ep += *ep; 348 continue; 349 } 350 return (0); 351 352 case NCCL: 353 if (cclass(ep, *lp++, 0)) { 354 ep += *ep; 355 continue; 356 } 357 return (0); 358 359 case CBRA: 360 braslist[*ep++] = (char *)lp; 361 continue; 362 363 case CKET: 364 braelist[*ep++] = (char *)lp; 365 continue; 366 367 case CBACK: 368 if (braelist[i = *ep++] == NULL) 369 return (-1); 370 if (backref(i, lp)) { 371 lp += braelist[i] - braslist[i]; 372 continue; 373 } 374 return (0); 375 376 case CBACK|CSTAR: 377 if (braelist[i = *ep++] == NULL) 378 return (-1); 379 curlp = lp; 380 ct = braelist[i] - braslist[i]; 381 while (backref(i, lp)) 382 lp += ct; 383 while (lp >= curlp) { 384 if (rv = advance(lp, ep)) 385 return (rv); 386 lp -= ct; 387 } 388 continue; 389 390 case CDOT|CSTAR: 391 curlp = lp; 392 while (*lp++) 393 ; 394 goto star; 395 396 case CCHR|CSTAR: 397 curlp = lp; 398 while (*lp++ == *ep) 399 ; 400 ep++; 401 goto star; 402 403 case CCL|CSTAR: 404 case NCCL|CSTAR: 405 curlp = lp; 406 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 407 ; 408 ep += *ep; 409 goto star; 410 411 star: 412 do { 413 lp--; 414 if (rv = advance(lp, ep)) 415 return (rv); 416 } while (lp > curlp); 417 return (0); 418 419 default: 420 return (-1); 421 } 422 } 423 424 static int 425 backref(int i, const char *lp) 426 { 427 char *bp; 428 struct re_globals *_re = re_globals; 429 430 bp = braslist[i]; 431 while (*bp++ == *lp++) 432 if (bp >= braelist[i]) 433 return (1); 434 return (0); 435 } 436 437 static int 438 cclass(char *set, char c, int af) 439 { 440 int n; 441 442 if (c == 0) 443 return (0); 444 n = *set++; 445 while (--n) 446 if (*set++ == c) 447 return (af); 448 return (! af); 449 } 450