1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 1997, by Sun Microsystems, Inc. 24 * All rights reserved. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 31 /* Portions Copyright(c) 1988, Sun Microsystems Inc. */ 32 /* All Rights Reserved */ 33 34 /*LINTLIBRARY*/ 35 36 /* 37 * routines to do regular expression matching 38 * 39 * Entry points: 40 * 41 * re_comp(s) 42 * char *s; 43 * ... returns 0 if the string s was compiled successfully, 44 * a pointer to an error message otherwise. 45 * If passed 0 or a null string returns without changing 46 * the currently compiled re (see note 11 below). 47 * 48 * re_exec(s) 49 * char *s; 50 * ... returns 1 if the string s matches the last compiled regular 51 * expression, 52 * 0 if the string s failed to match the last compiled 53 * regular expression, and 54 * -1 if the compiled regular expression was invalid 55 * (indicating an internal error). 56 * 57 * The strings passed to both re_comp and re_exec may have trailing or 58 * embedded newline characters; they are terminated by nulls. 59 * 60 * The identity of the author of these routines is lost in antiquity; 61 * this is essentially the same as the re code in the original V6 ed. 62 * 63 * The regular expressions recognized are described below. This description 64 * is essentially the same as that for ed. 65 * 66 * A regular expression specifies a set of strings of characters. 67 * A member of this set of strings is said to be matched by 68 * the regular expression. In the following specification for 69 * regular expressions the word `character' means any character but NUL. 70 * 71 * 1. Any character except a special character matches itself. 72 * Special characters are the regular expression delimiter plus 73 * \ [ . and sometimes ^ * $. 74 * 2. A . matches any character. 75 * 3. A \ followed by any character except a digit or ( ) 76 * matches that character. 77 * 4. A nonempty string s bracketed [s] (or [^s]) matches any 78 * character in (or not in) s. In s, \ has no special meaning, 79 * and ] may only appear as the first letter. A substring 80 * a-b, with a and b in ascending ASCII order, stands for 81 * the inclusive range of ASCII characters. 82 * 5. A regular expression of form 1-4 followed by * matches a 83 * sequence of 0 or more matches of the regular expression. 84 * 6. A regular expression, x, of form 1-8, bracketed \(x\) 85 * matches what x matches. 86 * 7. A \ followed by a digit n matches a copy of the string that the 87 * bracketed regular expression beginning with the nth \( matched. 88 * 8. A regular expression of form 1-8, x, followed by a regular 89 * expression of form 1-7, y matches a match for x followed by 90 * a match for y, with the x match being as long as possible 91 * while still permitting a y match. 92 * 9. A regular expression of form 1-8 preceded by ^ (or followed 93 * by $), is constrained to matches that begin at the left 94 * (or end at the right) end of a line. 95 * 10. A regular expression of form 1-9 picks out the longest among 96 * the leftmost matches in a line. 97 * 11. An empty regular expression stands for a copy of the last 98 * regular expression encountered. 99 */ 100 101 #include <sys/types.h> 102 #include <stdlib.h> 103 #include <stddef.h> 104 105 /* 106 * constants for re's 107 */ 108 #define CBRA 1 109 #define CCHR 2 110 #define CDOT 4 111 #define CCL 6 112 #define NCCL 8 113 #define CDOL 10 114 #define CEOF 11 115 #define CKET 12 116 #define CBACK 18 117 118 #define CSTAR 01 119 120 #define ESIZE 512 121 #define NBRA 9 122 123 static struct re_globals { 124 char _expbuf[ESIZE]; 125 char *_braslist[NBRA], *_braelist[NBRA]; 126 char _circf; 127 } *re_globals; 128 #define expbuf (_re->_expbuf) 129 #define braslist (_re->_braslist) 130 #define braelist (_re->_braelist) 131 #define circf (_re->_circf) 132 133 /* 134 * forward declarations 135 */ 136 static int backref(int, char *); 137 static int advance(char *, char *); 138 static int cclass(char *, char, int); 139 140 /* 141 * compile the regular expression argument into a dfa 142 */ 143 char * 144 re_comp(char *sp) 145 { 146 char c; 147 struct re_globals *_re = re_globals; 148 char *ep; 149 int cclcnt, numbra = 0; 150 char *lastep = 0; 151 char bracket[NBRA]; 152 char *bracketp = &bracket[0]; 153 char *retoolong = "Regular expression too long"; 154 155 if (_re == 0) { 156 _re = (struct re_globals *)calloc(1, sizeof (*_re)); 157 if (_re == 0) 158 return ("Out of memory"); 159 re_globals = _re; 160 } 161 ep = expbuf; 162 163 #define comerr(msg) {expbuf[0] = 0; return (msg); } 164 165 if (sp == 0 || *sp == '\0') { 166 if (*ep == 0) 167 return ("No previous regular expression"); 168 return (0); 169 } 170 if (*sp == '^') { 171 circf = 1; 172 sp++; 173 } 174 else 175 circf = 0; 176 for (;;) { 177 if (ep >= &expbuf[ESIZE]) 178 comerr(retoolong); 179 if ((c = *sp++) == '\0') { 180 if (bracketp != bracket) 181 comerr("unmatched \\("); 182 *ep++ = CEOF; 183 *ep++ = 0; 184 return (0); 185 } 186 if (c != '*') 187 lastep = ep; 188 switch (c) { 189 190 case '.': 191 *ep++ = CDOT; 192 continue; 193 194 case '*': 195 if (lastep == 0 || *lastep == CBRA || *lastep == CKET) 196 goto defchar; 197 *lastep |= CSTAR; 198 continue; 199 200 case '$': 201 if (*sp != '\0') 202 goto defchar; 203 *ep++ = CDOL; 204 continue; 205 206 case '[': 207 *ep++ = CCL; 208 *ep++ = 0; 209 cclcnt = 1; 210 if ((c = *sp++) == '^') { 211 c = *sp++; 212 ep[-2] = NCCL; 213 } 214 do { 215 if (c == '\0') 216 comerr("missing ]"); 217 if (c == '-' && ep [-1] != 0) { 218 if ((c = *sp++) == ']') { 219 *ep++ = '-'; 220 cclcnt++; 221 break; 222 } 223 while (ep[-1] < c) { 224 *ep = ep[-1] + 1; 225 ep++; 226 cclcnt++; 227 if (ep >= &expbuf[ESIZE]) 228 comerr(retoolong); 229 } 230 } 231 *ep++ = c; 232 cclcnt++; 233 if (ep >= &expbuf[ESIZE]) 234 comerr(retoolong); 235 } while ((c = *sp++) != ']'); 236 lastep[1] = (char)cclcnt; 237 continue; 238 239 case '\\': 240 if ((c = *sp++) == '(') { 241 if (numbra >= NBRA) 242 comerr("too many \\(\\) pairs"); 243 *bracketp++ = (char)numbra; 244 *ep++ = CBRA; 245 *ep++ = numbra++; 246 continue; 247 } 248 if (c == ')') { 249 if (bracketp <= bracket) 250 comerr("unmatched \\)"); 251 *ep++ = CKET; 252 *ep++ = *--bracketp; 253 continue; 254 } 255 if (c >= '1' && c < ('1' + NBRA)) { 256 *ep++ = CBACK; 257 *ep++ = c - '1'; 258 continue; 259 } 260 *ep++ = CCHR; 261 *ep++ = c; 262 continue; 263 264 defchar: 265 default: 266 *ep++ = CCHR; 267 *ep++ = c; 268 } 269 } 270 } 271 272 /* 273 * match the argument string against the compiled re 274 */ 275 int 276 re_exec(char *p1) 277 { 278 struct re_globals *_re = re_globals; 279 char *p2; 280 int c; 281 int rv; 282 283 if (_re == 0) 284 return (0); 285 p2 = expbuf; 286 for (c = 0; c < NBRA; c++) { 287 braslist[c] = 0; 288 braelist[c] = 0; 289 } 290 if (circf) 291 return ((advance(p1, p2))); 292 /* 293 * fast check for first character 294 */ 295 if (*p2 == CCHR) { 296 c = p2[1]; 297 do { 298 if (*p1 != c) 299 continue; 300 if (rv = advance(p1, p2)) 301 return (rv); 302 } while (*p1++); 303 return (0); 304 } 305 /* 306 * regular algorithm 307 */ 308 do 309 if (rv = advance(p1, p2)) 310 return (rv); 311 while (*p1++); 312 return (0); 313 } 314 315 /* 316 * try to match the next thing in the dfa 317 */ 318 static int 319 advance(char *lp, char *ep) 320 { 321 char *curlp; 322 int i; 323 ptrdiff_t ct; 324 int rv; 325 struct re_globals *_re = re_globals; 326 327 for (;;) 328 switch (*ep++) { 329 330 case CCHR: 331 if (*ep++ == *lp++) 332 continue; 333 return (0); 334 335 case CDOT: 336 if (*lp++) 337 continue; 338 return (0); 339 340 case CDOL: 341 if (*lp == '\0') 342 continue; 343 return (0); 344 345 case CEOF: 346 return (1); 347 348 case CCL: 349 if (cclass(ep, *lp++, 1)) { 350 ep += *ep; 351 continue; 352 } 353 return (0); 354 355 case NCCL: 356 if (cclass(ep, *lp++, 0)) { 357 ep += *ep; 358 continue; 359 } 360 return (0); 361 362 case CBRA: 363 braslist[*ep++] = lp; 364 continue; 365 366 case CKET: 367 braelist[*ep++] = lp; 368 continue; 369 370 case CBACK: 371 if (braelist[i = *ep++] == 0) 372 return (-1); 373 if (backref(i, lp)) { 374 lp += braelist[i] - braslist[i]; 375 continue; 376 } 377 return (0); 378 379 case CBACK|CSTAR: 380 if (braelist[i = *ep++] == 0) 381 return (-1); 382 curlp = lp; 383 ct = braelist[i] - braslist[i]; 384 while (backref(i, lp)) 385 lp += ct; 386 while (lp >= curlp) { 387 if (rv = advance(lp, ep)) 388 return (rv); 389 lp -= ct; 390 } 391 continue; 392 393 case CDOT|CSTAR: 394 curlp = lp; 395 while (*lp++) 396 ; 397 goto star; 398 399 case CCHR|CSTAR: 400 curlp = lp; 401 while (*lp++ == *ep) 402 ; 403 ep++; 404 goto star; 405 406 case CCL|CSTAR: 407 case NCCL|CSTAR: 408 curlp = lp; 409 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 410 ; 411 ep += *ep; 412 goto star; 413 414 star: 415 do { 416 lp--; 417 if (rv = advance(lp, ep)) 418 return (rv); 419 } while (lp > curlp); 420 return (0); 421 422 default: 423 return (-1); 424 } 425 } 426 427 static int 428 backref(int i, char *lp) 429 { 430 char *bp; 431 struct re_globals *_re = re_globals; 432 433 bp = braslist[i]; 434 while (*bp++ == *lp++) 435 if (bp >= braelist[i]) 436 return (1); 437 return (0); 438 } 439 440 static int 441 cclass(char *set, char c, int af) 442 { 443 int n; 444 445 if (c == 0) 446 return (0); 447 n = *set++; 448 while (--n) 449 if (*set++ == c) 450 return (af); 451 return (! af); 452 } 453