1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 1987 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 /* from UCB 4.1 80/12/21 */ 29 30 /* 31 * routines to do regular expression matching 32 * 33 * Entry points: 34 * 35 * re_comp(s) 36 * char *s; 37 * ... returns 0 if the string s was compiled successfully, 38 * a pointer to an error message otherwise. 39 * If passed 0 or a null string returns without changing 40 * the currently compiled re (see note 11 below). 41 * 42 * re_exec(s) 43 * char *s; 44 * ... returns 1 if the string s matches the last compiled regular 45 * expression, 46 * 0 if the string s failed to match the last compiled 47 * regular expression, and 48 * -1 if the compiled regular expression was invalid 49 * (indicating an internal error). 50 * 51 * The strings passed to both re_comp and re_exec may have trailing or 52 * embedded newline characters; they are terminated by nulls. 53 * 54 * The identity of the author of these routines is lost in antiquity; 55 * this is essentially the same as the re code in the original V6 ed. 56 * 57 * The regular expressions recognized are described below. This description 58 * is essentially the same as that for ed. 59 * 60 * A regular expression specifies a set of strings of characters. 61 * A member of this set of strings is said to be matched by 62 * the regular expression. In the following specification for 63 * regular expressions the word `character' means any character but NUL. 64 * 65 * 1. Any character except a special character matches itself. 66 * Special characters are the regular expression delimiter plus 67 * \ [ . and sometimes ^ * $. 68 * 2. A . matches any character. 69 * 3. A \ followed by any character except a digit or ( ) 70 * matches that character. 71 * 4. A nonempty string s bracketed [s] (or [^s]) matches any 72 * character in (or not in) s. In s, \ has no special meaning, 73 * and ] may only appear as the first letter. A substring 74 * a-b, with a and b in ascending ASCII order, stands for 75 * the inclusive range of ASCII characters. 76 * 5. A regular expression of form 1-4 followed by * matches a 77 * sequence of 0 or more matches of the regular expression. 78 * 6. A regular expression, x, of form 1-8, bracketed \(x\) 79 * matches what x matches. 80 * 7. A \ followed by a digit n matches a copy of the string that the 81 * bracketed regular expression beginning with the nth \( matched. 82 * 8. A regular expression of form 1-8, x, followed by a regular 83 * expression of form 1-7, y matches a match for x followed by 84 * a match for y, with the x match being as long as possible 85 * while still permitting a y match. 86 * 9. A regular expression of form 1-8 preceded by ^ (or followed 87 * by $), is constrained to matches that begin at the left 88 * (or end at the right) end of a line. 89 * 10. A regular expression of form 1-9 picks out the longest among 90 * the leftmost matches in a line. 91 * 11. An empty regular expression stands for a copy of the last 92 * regular expression encountered. 93 */ 94 95 /* 96 * constants for re's 97 */ 98 #define CBRA 1 99 #define CCHR 2 100 #define CDOT 4 101 #define CCL 6 102 #define NCCL 8 103 #define CDOL 10 104 #define CEOF 11 105 #define CKET 12 106 #define CBACK 18 107 108 #define CSTAR 01 109 110 #define ESIZE 512 111 #define NBRA 9 112 113 static struct re_globals { 114 char _expbuf[ESIZE]; 115 char *_braslist[NBRA], *_braelist[NBRA]; 116 char _circf; 117 } *re_globals; 118 #define expbuf (_re->_expbuf) 119 #define braslist (_re->_braslist) 120 #define braelist (_re->_braelist) 121 #define circf (_re->_circf) 122 123 /* 124 * compile the regular expression argument into a dfa 125 */ 126 char * 127 re_comp(sp) 128 register char *sp; 129 { 130 register int c; 131 register struct re_globals *_re = re_globals; 132 register char *ep; 133 int cclcnt, numbra = 0; 134 char *lastep = 0; 135 char bracket[NBRA]; 136 char *bracketp = &bracket[0]; 137 char *retoolong = "Regular expression too long"; 138 139 if (_re == 0) { 140 _re = (struct re_globals *)calloc(1, sizeof (*_re)); 141 if (_re == 0) 142 return ("Out of memory"); 143 re_globals = _re; 144 } 145 ep = expbuf; 146 147 #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } 148 149 if (sp == 0 || *sp == '\0') { 150 if (*ep == 0) 151 return("No previous regular expression"); 152 return(0); 153 } 154 if (*sp == '^') { 155 circf = 1; 156 sp++; 157 } 158 else 159 circf = 0; 160 for (;;) { 161 if (ep >= &expbuf[ESIZE]) 162 comerr(retoolong); 163 if ((c = *sp++) == '\0') { 164 if (bracketp != bracket) 165 comerr("unmatched \\("); 166 *ep++ = CEOF; 167 *ep++ = 0; 168 return(0); 169 } 170 if (c != '*') 171 lastep = ep; 172 switch (c) { 173 174 case '.': 175 *ep++ = CDOT; 176 continue; 177 178 case '*': 179 if (lastep == 0 || *lastep == CBRA || *lastep == CKET) 180 goto defchar; 181 *lastep |= CSTAR; 182 continue; 183 184 case '$': 185 if (*sp != '\0') 186 goto defchar; 187 *ep++ = CDOL; 188 continue; 189 190 case '[': 191 *ep++ = CCL; 192 *ep++ = 0; 193 cclcnt = 1; 194 if ((c = *sp++) == '^') { 195 c = *sp++; 196 ep[-2] = NCCL; 197 } 198 do { 199 if (c == '\0') 200 comerr("missing ]"); 201 if (c == '-' && ep [-1] != 0) { 202 if ((c = *sp++) == ']') { 203 *ep++ = '-'; 204 cclcnt++; 205 break; 206 } 207 while (ep[-1] < c) { 208 *ep = ep[-1] + 1; 209 ep++; 210 cclcnt++; 211 if (ep >= &expbuf[ESIZE]) 212 comerr(retoolong); 213 } 214 } 215 *ep++ = c; 216 cclcnt++; 217 if (ep >= &expbuf[ESIZE]) 218 comerr(retoolong); 219 } while ((c = *sp++) != ']'); 220 lastep[1] = cclcnt; 221 continue; 222 223 case '\\': 224 if ((c = *sp++) == '(') { 225 if (numbra >= NBRA) 226 comerr("too many \\(\\) pairs"); 227 *bracketp++ = numbra; 228 *ep++ = CBRA; 229 *ep++ = numbra++; 230 continue; 231 } 232 if (c == ')') { 233 if (bracketp <= bracket) 234 comerr("unmatched \\)"); 235 *ep++ = CKET; 236 *ep++ = *--bracketp; 237 continue; 238 } 239 if (c >= '1' && c < ('1' + NBRA)) { 240 *ep++ = CBACK; 241 *ep++ = c - '1'; 242 continue; 243 } 244 *ep++ = CCHR; 245 *ep++ = c; 246 continue; 247 248 defchar: 249 default: 250 *ep++ = CCHR; 251 *ep++ = c; 252 } 253 } 254 } 255 256 /* 257 * match the argument string against the compiled re 258 */ 259 int 260 re_exec(p1) 261 register char *p1; 262 { 263 register struct re_globals *_re = re_globals; 264 register char *p2; 265 register int c; 266 int rv; 267 268 if (_re == 0) 269 return (0); 270 p2 = expbuf; 271 for (c = 0; c < NBRA; c++) { 272 braslist[c] = 0; 273 braelist[c] = 0; 274 } 275 if (circf) 276 return((advance(p1, p2))); 277 /* 278 * fast check for first character 279 */ 280 if (*p2 == CCHR) { 281 c = p2[1]; 282 do { 283 if (*p1 != c) 284 continue; 285 if (rv = advance(p1, p2)) 286 return(rv); 287 } while (*p1++); 288 return(0); 289 } 290 /* 291 * regular algorithm 292 */ 293 do 294 if (rv = advance(p1, p2)) 295 return(rv); 296 while (*p1++); 297 return(0); 298 } 299 300 /* 301 * try to match the next thing in the dfa 302 */ 303 static int 304 advance(lp, ep) 305 register char *lp, *ep; 306 { 307 register char *curlp; 308 int ct, i; 309 int rv; 310 register struct re_globals *_re = re_globals; 311 312 for (;;) 313 switch (*ep++) { 314 315 case CCHR: 316 if (*ep++ == *lp++) 317 continue; 318 return(0); 319 320 case CDOT: 321 if (*lp++) 322 continue; 323 return(0); 324 325 case CDOL: 326 if (*lp == '\0') 327 continue; 328 return(0); 329 330 case CEOF: 331 return(1); 332 333 case CCL: 334 if (cclass(ep, *lp++, 1)) { 335 ep += *ep; 336 continue; 337 } 338 return(0); 339 340 case NCCL: 341 if (cclass(ep, *lp++, 0)) { 342 ep += *ep; 343 continue; 344 } 345 return(0); 346 347 case CBRA: 348 braslist[*ep++] = lp; 349 continue; 350 351 case CKET: 352 braelist[*ep++] = lp; 353 continue; 354 355 case CBACK: 356 if (braelist[i = *ep++] == 0) 357 return(-1); 358 if (backref(i, lp)) { 359 lp += braelist[i] - braslist[i]; 360 continue; 361 } 362 return(0); 363 364 case CBACK|CSTAR: 365 if (braelist[i = *ep++] == 0) 366 return(-1); 367 curlp = lp; 368 ct = braelist[i] - braslist[i]; 369 while (backref(i, lp)) 370 lp += ct; 371 while (lp >= curlp) { 372 if (rv = advance(lp, ep)) 373 return(rv); 374 lp -= ct; 375 } 376 continue; 377 378 case CDOT|CSTAR: 379 curlp = lp; 380 while (*lp++) 381 ; 382 goto star; 383 384 case CCHR|CSTAR: 385 curlp = lp; 386 while (*lp++ == *ep) 387 ; 388 ep++; 389 goto star; 390 391 case CCL|CSTAR: 392 case NCCL|CSTAR: 393 curlp = lp; 394 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 395 ; 396 ep += *ep; 397 goto star; 398 399 star: 400 do { 401 lp--; 402 if (rv = advance(lp, ep)) 403 return(rv); 404 } while (lp > curlp); 405 return(0); 406 407 default: 408 return(-1); 409 } 410 } 411 412 static 413 backref(i, lp) 414 register int i; 415 register char *lp; 416 { 417 register char *bp; 418 register struct re_globals *_re = re_globals; 419 420 bp = braslist[i]; 421 while (*bp++ == *lp++) 422 if (bp >= braelist[i]) 423 return(1); 424 return(0); 425 } 426 427 static int 428 cclass(set, c, af) 429 register char *set, c; 430 int af; 431 { 432 register int n; 433 434 if (c == 0) 435 return(0); 436 n = *set++; 437 while (--n) 438 if (*set++ == c) 439 return(af); 440 return(! af); 441 } 442