1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 1987 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * routines to do regular expression matching 31 * 32 * Entry points: 33 * 34 * re_comp(s) 35 * char *s; 36 * ... returns 0 if the string s was compiled successfully, 37 * a pointer to an error message otherwise. 38 * If passed 0 or a null string returns without changing 39 * the currently compiled re (see note 11 below). 40 * 41 * re_exec(s) 42 * char *s; 43 * ... returns 1 if the string s matches the last compiled regular 44 * expression, 45 * 0 if the string s failed to match the last compiled 46 * regular expression, and 47 * -1 if the compiled regular expression was invalid 48 * (indicating an internal error). 49 * 50 * The strings passed to both re_comp and re_exec may have trailing or 51 * embedded newline characters; they are terminated by nulls. 52 * 53 * The identity of the author of these routines is lost in antiquity; 54 * this is essentially the same as the re code in the original V6 ed. 55 * 56 * The regular expressions recognized are described below. This description 57 * is essentially the same as that for ed. 58 * 59 * A regular expression specifies a set of strings of characters. 60 * A member of this set of strings is said to be matched by 61 * the regular expression. In the following specification for 62 * regular expressions the word `character' means any character but NUL. 63 * 64 * 1. Any character except a special character matches itself. 65 * Special characters are the regular expression delimiter plus 66 * \ [ . and sometimes ^ * $. 67 * 2. A . matches any character. 68 * 3. A \ followed by any character except a digit or ( ) 69 * matches that character. 70 * 4. A nonempty string s bracketed [s] (or [^s]) matches any 71 * character in (or not in) s. In s, \ has no special meaning, 72 * and ] may only appear as the first letter. A substring 73 * a-b, with a and b in ascending ASCII order, stands for 74 * the inclusive range of ASCII characters. 75 * 5. A regular expression of form 1-4 followed by * matches a 76 * sequence of 0 or more matches of the regular expression. 77 * 6. A regular expression, x, of form 1-8, bracketed \(x\) 78 * matches what x matches. 79 * 7. A \ followed by a digit n matches a copy of the string that the 80 * bracketed regular expression beginning with the nth \( matched. 81 * 8. A regular expression of form 1-8, x, followed by a regular 82 * expression of form 1-7, y matches a match for x followed by 83 * a match for y, with the x match being as long as possible 84 * while still permitting a y match. 85 * 9. A regular expression of form 1-8 preceded by ^ (or followed 86 * by $), is constrained to matches that begin at the left 87 * (or end at the right) end of a line. 88 * 10. A regular expression of form 1-9 picks out the longest among 89 * the leftmost matches in a line. 90 * 11. An empty regular expression stands for a copy of the last 91 * regular expression encountered. 92 */ 93 94 /* 95 * constants for re's 96 */ 97 #define CBRA 1 98 #define CCHR 2 99 #define CDOT 4 100 #define CCL 6 101 #define NCCL 8 102 #define CDOL 10 103 #define CEOF 11 104 #define CKET 12 105 #define CBACK 18 106 107 #define CSTAR 01 108 109 #define ESIZE 512 110 #define NBRA 9 111 112 static struct re_globals { 113 char _expbuf[ESIZE]; 114 char *_braslist[NBRA], *_braelist[NBRA]; 115 char _circf; 116 } *re_globals; 117 #define expbuf (_re->_expbuf) 118 #define braslist (_re->_braslist) 119 #define braelist (_re->_braelist) 120 #define circf (_re->_circf) 121 122 static int advance(char *, char *); 123 static int backref(int, char *); 124 static int cclass(char *, char, int); 125 126 /* 127 * compile the regular expression argument into a dfa 128 */ 129 char * 130 re_comp(char *sp) 131 { 132 int c; 133 struct re_globals *_re = re_globals; 134 char *ep; 135 int cclcnt, numbra = 0; 136 char *lastep = 0; 137 char bracket[NBRA]; 138 char *bracketp = &bracket[0]; 139 char *retoolong = "Regular expression too long"; 140 141 if (_re == 0) { 142 _re = (struct re_globals *)calloc(1, sizeof (*_re)); 143 if (_re == 0) 144 return ("Out of memory"); 145 re_globals = _re; 146 } 147 ep = expbuf; 148 149 #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } 150 151 if (sp == 0 || *sp == '\0') { 152 if (*ep == 0) 153 return("No previous regular expression"); 154 return (0); 155 } 156 if (*sp == '^') { 157 circf = 1; 158 sp++; 159 } 160 else 161 circf = 0; 162 for (;;) { 163 if (ep >= &expbuf[ESIZE]) 164 comerr(retoolong); 165 if ((c = *sp++) == '\0') { 166 if (bracketp != bracket) 167 comerr("unmatched \\("); 168 *ep++ = CEOF; 169 *ep++ = 0; 170 return (0); 171 } 172 if (c != '*') 173 lastep = ep; 174 switch (c) { 175 176 case '.': 177 *ep++ = CDOT; 178 continue; 179 180 case '*': 181 if (lastep == 0 || *lastep == CBRA || *lastep == CKET) 182 goto defchar; 183 *lastep |= CSTAR; 184 continue; 185 186 case '$': 187 if (*sp != '\0') 188 goto defchar; 189 *ep++ = CDOL; 190 continue; 191 192 case '[': 193 *ep++ = CCL; 194 *ep++ = 0; 195 cclcnt = 1; 196 if ((c = *sp++) == '^') { 197 c = *sp++; 198 ep[-2] = NCCL; 199 } 200 do { 201 if (c == '\0') 202 comerr("missing ]"); 203 if (c == '-' && ep [-1] != 0) { 204 if ((c = *sp++) == ']') { 205 *ep++ = '-'; 206 cclcnt++; 207 break; 208 } 209 while (ep[-1] < c) { 210 *ep = ep[-1] + 1; 211 ep++; 212 cclcnt++; 213 if (ep >= &expbuf[ESIZE]) 214 comerr(retoolong); 215 } 216 } 217 *ep++ = c; 218 cclcnt++; 219 if (ep >= &expbuf[ESIZE]) 220 comerr(retoolong); 221 } while ((c = *sp++) != ']'); 222 lastep[1] = cclcnt; 223 continue; 224 225 case '\\': 226 if ((c = *sp++) == '(') { 227 if (numbra >= NBRA) 228 comerr("too many \\(\\) pairs"); 229 *bracketp++ = numbra; 230 *ep++ = CBRA; 231 *ep++ = numbra++; 232 continue; 233 } 234 if (c == ')') { 235 if (bracketp <= bracket) 236 comerr("unmatched \\)"); 237 *ep++ = CKET; 238 *ep++ = *--bracketp; 239 continue; 240 } 241 if (c >= '1' && c < ('1' + NBRA)) { 242 *ep++ = CBACK; 243 *ep++ = c - '1'; 244 continue; 245 } 246 *ep++ = CCHR; 247 *ep++ = c; 248 continue; 249 250 defchar: 251 default: 252 *ep++ = CCHR; 253 *ep++ = c; 254 } 255 } 256 } 257 258 /* 259 * match the argument string against the compiled re 260 */ 261 int 262 re_exec(char *p1) 263 { 264 struct re_globals *_re = re_globals; 265 char *p2; 266 int c; 267 int rv; 268 269 if (_re == 0) 270 return (0); 271 p2 = expbuf; 272 for (c = 0; c < NBRA; c++) { 273 braslist[c] = 0; 274 braelist[c] = 0; 275 } 276 if (circf) 277 return((advance(p1, p2))); 278 /* 279 * fast check for first character 280 */ 281 if (*p2 == CCHR) { 282 c = p2[1]; 283 do { 284 if (*p1 != c) 285 continue; 286 if (rv = advance(p1, p2)) 287 return(rv); 288 } while (*p1++); 289 return(0); 290 } 291 /* 292 * regular algorithm 293 */ 294 do 295 if (rv = advance(p1, p2)) 296 return(rv); 297 while (*p1++); 298 return(0); 299 } 300 301 /* 302 * try to match the next thing in the dfa 303 */ 304 static int 305 advance(char *lp, char *ep) 306 { 307 char *curlp; 308 int ct, i; 309 int rv; 310 struct re_globals *_re = re_globals; 311 312 for (;;) 313 switch (*ep++) { 314 315 case CCHR: 316 if (*ep++ == *lp++) 317 continue; 318 return(0); 319 320 case CDOT: 321 if (*lp++) 322 continue; 323 return(0); 324 325 case CDOL: 326 if (*lp == '\0') 327 continue; 328 return(0); 329 330 case CEOF: 331 return(1); 332 333 case CCL: 334 if (cclass(ep, *lp++, 1)) { 335 ep += *ep; 336 continue; 337 } 338 return(0); 339 340 case NCCL: 341 if (cclass(ep, *lp++, 0)) { 342 ep += *ep; 343 continue; 344 } 345 return(0); 346 347 case CBRA: 348 braslist[*ep++] = lp; 349 continue; 350 351 case CKET: 352 braelist[*ep++] = lp; 353 continue; 354 355 case CBACK: 356 if (braelist[i = *ep++] == 0) 357 return(-1); 358 if (backref(i, lp)) { 359 lp += braelist[i] - braslist[i]; 360 continue; 361 } 362 return(0); 363 364 case CBACK|CSTAR: 365 if (braelist[i = *ep++] == 0) 366 return(-1); 367 curlp = lp; 368 ct = braelist[i] - braslist[i]; 369 while (backref(i, lp)) 370 lp += ct; 371 while (lp >= curlp) { 372 if (rv = advance(lp, ep)) 373 return(rv); 374 lp -= ct; 375 } 376 continue; 377 378 case CDOT|CSTAR: 379 curlp = lp; 380 while (*lp++) 381 ; 382 goto star; 383 384 case CCHR|CSTAR: 385 curlp = lp; 386 while (*lp++ == *ep) 387 ; 388 ep++; 389 goto star; 390 391 case CCL|CSTAR: 392 case NCCL|CSTAR: 393 curlp = lp; 394 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 395 ; 396 ep += *ep; 397 goto star; 398 399 star: 400 do { 401 lp--; 402 if (rv = advance(lp, ep)) 403 return(rv); 404 } while (lp > curlp); 405 return(0); 406 407 default: 408 return(-1); 409 } 410 } 411 412 static int 413 backref(int i, char *lp) 414 { 415 char *bp; 416 struct re_globals *_re = re_globals; 417 418 bp = braslist[i]; 419 while (*bp++ == *lp++) 420 if (bp >= braelist[i]) 421 return (1); 422 return (0); 423 } 424 425 static int 426 cclass(char *set, char c, int af) 427 { 428 int n; 429 430 if (c == 0) 431 return(0); 432 n = *set++; 433 while (--n) 434 if (*set++ == c) 435 return (af); 436 return (!af); 437 } 438