1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 1997, by Sun Microsystems, Inc. 24 * All rights reserved. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 31 /* Portions Copyright(c) 1988, Sun Microsystems Inc. */ 32 /* All Rights Reserved */ 33 34 #pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.1 */ 35 36 /*LINTLIBRARY*/ 37 38 /* 39 * routines to do regular expression matching 40 * 41 * Entry points: 42 * 43 * re_comp(s) 44 * char *s; 45 * ... returns 0 if the string s was compiled successfully, 46 * a pointer to an error message otherwise. 47 * If passed 0 or a null string returns without changing 48 * the currently compiled re (see note 11 below). 49 * 50 * re_exec(s) 51 * char *s; 52 * ... returns 1 if the string s matches the last compiled regular 53 * expression, 54 * 0 if the string s failed to match the last compiled 55 * regular expression, and 56 * -1 if the compiled regular expression was invalid 57 * (indicating an internal error). 58 * 59 * The strings passed to both re_comp and re_exec may have trailing or 60 * embedded newline characters; they are terminated by nulls. 61 * 62 * The identity of the author of these routines is lost in antiquity; 63 * this is essentially the same as the re code in the original V6 ed. 64 * 65 * The regular expressions recognized are described below. This description 66 * is essentially the same as that for ed. 67 * 68 * A regular expression specifies a set of strings of characters. 69 * A member of this set of strings is said to be matched by 70 * the regular expression. In the following specification for 71 * regular expressions the word `character' means any character but NUL. 72 * 73 * 1. Any character except a special character matches itself. 74 * Special characters are the regular expression delimiter plus 75 * \ [ . and sometimes ^ * $. 76 * 2. A . matches any character. 77 * 3. A \ followed by any character except a digit or ( ) 78 * matches that character. 79 * 4. A nonempty string s bracketed [s] (or [^s]) matches any 80 * character in (or not in) s. In s, \ has no special meaning, 81 * and ] may only appear as the first letter. A substring 82 * a-b, with a and b in ascending ASCII order, stands for 83 * the inclusive range of ASCII characters. 84 * 5. A regular expression of form 1-4 followed by * matches a 85 * sequence of 0 or more matches of the regular expression. 86 * 6. A regular expression, x, of form 1-8, bracketed \(x\) 87 * matches what x matches. 88 * 7. A \ followed by a digit n matches a copy of the string that the 89 * bracketed regular expression beginning with the nth \( matched. 90 * 8. A regular expression of form 1-8, x, followed by a regular 91 * expression of form 1-7, y matches a match for x followed by 92 * a match for y, with the x match being as long as possible 93 * while still permitting a y match. 94 * 9. A regular expression of form 1-8 preceded by ^ (or followed 95 * by $), is constrained to matches that begin at the left 96 * (or end at the right) end of a line. 97 * 10. A regular expression of form 1-9 picks out the longest among 98 * the leftmost matches in a line. 99 * 11. An empty regular expression stands for a copy of the last 100 * regular expression encountered. 101 */ 102 103 #include <sys/types.h> 104 #include <stdlib.h> 105 #include <stddef.h> 106 107 /* 108 * constants for re's 109 */ 110 #define CBRA 1 111 #define CCHR 2 112 #define CDOT 4 113 #define CCL 6 114 #define NCCL 8 115 #define CDOL 10 116 #define CEOF 11 117 #define CKET 12 118 #define CBACK 18 119 120 #define CSTAR 01 121 122 #define ESIZE 512 123 #define NBRA 9 124 125 static struct re_globals { 126 char _expbuf[ESIZE]; 127 char *_braslist[NBRA], *_braelist[NBRA]; 128 char _circf; 129 } *re_globals; 130 #define expbuf (_re->_expbuf) 131 #define braslist (_re->_braslist) 132 #define braelist (_re->_braelist) 133 #define circf (_re->_circf) 134 135 /* 136 * forward declarations 137 */ 138 static int backref(int, char *); 139 static int advance(char *, char *); 140 static int cclass(char *, char, int); 141 142 /* 143 * compile the regular expression argument into a dfa 144 */ 145 char * 146 re_comp(char *sp) 147 { 148 char c; 149 struct re_globals *_re = re_globals; 150 char *ep; 151 int cclcnt, numbra = 0; 152 char *lastep = 0; 153 char bracket[NBRA]; 154 char *bracketp = &bracket[0]; 155 char *retoolong = "Regular expression too long"; 156 157 if (_re == 0) { 158 _re = (struct re_globals *)calloc(1, sizeof (*_re)); 159 if (_re == 0) 160 return ("Out of memory"); 161 re_globals = _re; 162 } 163 ep = expbuf; 164 165 #define comerr(msg) {expbuf[0] = 0; return (msg); } 166 167 if (sp == 0 || *sp == '\0') { 168 if (*ep == 0) 169 return ("No previous regular expression"); 170 return (0); 171 } 172 if (*sp == '^') { 173 circf = 1; 174 sp++; 175 } 176 else 177 circf = 0; 178 for (;;) { 179 if (ep >= &expbuf[ESIZE]) 180 comerr(retoolong); 181 if ((c = *sp++) == '\0') { 182 if (bracketp != bracket) 183 comerr("unmatched \\("); 184 *ep++ = CEOF; 185 *ep++ = 0; 186 return (0); 187 } 188 if (c != '*') 189 lastep = ep; 190 switch (c) { 191 192 case '.': 193 *ep++ = CDOT; 194 continue; 195 196 case '*': 197 if (lastep == 0 || *lastep == CBRA || *lastep == CKET) 198 goto defchar; 199 *lastep |= CSTAR; 200 continue; 201 202 case '$': 203 if (*sp != '\0') 204 goto defchar; 205 *ep++ = CDOL; 206 continue; 207 208 case '[': 209 *ep++ = CCL; 210 *ep++ = 0; 211 cclcnt = 1; 212 if ((c = *sp++) == '^') { 213 c = *sp++; 214 ep[-2] = NCCL; 215 } 216 do { 217 if (c == '\0') 218 comerr("missing ]"); 219 if (c == '-' && ep [-1] != 0) { 220 if ((c = *sp++) == ']') { 221 *ep++ = '-'; 222 cclcnt++; 223 break; 224 } 225 while (ep[-1] < c) { 226 *ep = ep[-1] + 1; 227 ep++; 228 cclcnt++; 229 if (ep >= &expbuf[ESIZE]) 230 comerr(retoolong); 231 } 232 } 233 *ep++ = c; 234 cclcnt++; 235 if (ep >= &expbuf[ESIZE]) 236 comerr(retoolong); 237 } while ((c = *sp++) != ']'); 238 lastep[1] = (char)cclcnt; 239 continue; 240 241 case '\\': 242 if ((c = *sp++) == '(') { 243 if (numbra >= NBRA) 244 comerr("too many \\(\\) pairs"); 245 *bracketp++ = (char)numbra; 246 *ep++ = CBRA; 247 *ep++ = numbra++; 248 continue; 249 } 250 if (c == ')') { 251 if (bracketp <= bracket) 252 comerr("unmatched \\)"); 253 *ep++ = CKET; 254 *ep++ = *--bracketp; 255 continue; 256 } 257 if (c >= '1' && c < ('1' + NBRA)) { 258 *ep++ = CBACK; 259 *ep++ = c - '1'; 260 continue; 261 } 262 *ep++ = CCHR; 263 *ep++ = c; 264 continue; 265 266 defchar: 267 default: 268 *ep++ = CCHR; 269 *ep++ = c; 270 } 271 } 272 } 273 274 /* 275 * match the argument string against the compiled re 276 */ 277 int 278 re_exec(char *p1) 279 { 280 struct re_globals *_re = re_globals; 281 char *p2; 282 int c; 283 int rv; 284 285 if (_re == 0) 286 return (0); 287 p2 = expbuf; 288 for (c = 0; c < NBRA; c++) { 289 braslist[c] = 0; 290 braelist[c] = 0; 291 } 292 if (circf) 293 return ((advance(p1, p2))); 294 /* 295 * fast check for first character 296 */ 297 if (*p2 == CCHR) { 298 c = p2[1]; 299 do { 300 if (*p1 != c) 301 continue; 302 if (rv = advance(p1, p2)) 303 return (rv); 304 } while (*p1++); 305 return (0); 306 } 307 /* 308 * regular algorithm 309 */ 310 do 311 if (rv = advance(p1, p2)) 312 return (rv); 313 while (*p1++); 314 return (0); 315 } 316 317 /* 318 * try to match the next thing in the dfa 319 */ 320 static int 321 advance(char *lp, char *ep) 322 { 323 char *curlp; 324 int i; 325 ptrdiff_t ct; 326 int rv; 327 struct re_globals *_re = re_globals; 328 329 for (;;) 330 switch (*ep++) { 331 332 case CCHR: 333 if (*ep++ == *lp++) 334 continue; 335 return (0); 336 337 case CDOT: 338 if (*lp++) 339 continue; 340 return (0); 341 342 case CDOL: 343 if (*lp == '\0') 344 continue; 345 return (0); 346 347 case CEOF: 348 return (1); 349 350 case CCL: 351 if (cclass(ep, *lp++, 1)) { 352 ep += *ep; 353 continue; 354 } 355 return (0); 356 357 case NCCL: 358 if (cclass(ep, *lp++, 0)) { 359 ep += *ep; 360 continue; 361 } 362 return (0); 363 364 case CBRA: 365 braslist[*ep++] = lp; 366 continue; 367 368 case CKET: 369 braelist[*ep++] = lp; 370 continue; 371 372 case CBACK: 373 if (braelist[i = *ep++] == 0) 374 return (-1); 375 if (backref(i, lp)) { 376 lp += braelist[i] - braslist[i]; 377 continue; 378 } 379 return (0); 380 381 case CBACK|CSTAR: 382 if (braelist[i = *ep++] == 0) 383 return (-1); 384 curlp = lp; 385 ct = braelist[i] - braslist[i]; 386 while (backref(i, lp)) 387 lp += ct; 388 while (lp >= curlp) { 389 if (rv = advance(lp, ep)) 390 return (rv); 391 lp -= ct; 392 } 393 continue; 394 395 case CDOT|CSTAR: 396 curlp = lp; 397 while (*lp++) 398 ; 399 goto star; 400 401 case CCHR|CSTAR: 402 curlp = lp; 403 while (*lp++ == *ep) 404 ; 405 ep++; 406 goto star; 407 408 case CCL|CSTAR: 409 case NCCL|CSTAR: 410 curlp = lp; 411 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 412 ; 413 ep += *ep; 414 goto star; 415 416 star: 417 do { 418 lp--; 419 if (rv = advance(lp, ep)) 420 return (rv); 421 } while (lp > curlp); 422 return (0); 423 424 default: 425 return (-1); 426 } 427 } 428 429 static int 430 backref(int i, char *lp) 431 { 432 char *bp; 433 struct re_globals *_re = re_globals; 434 435 bp = braslist[i]; 436 while (*bp++ == *lp++) 437 if (bp >= braelist[i]) 438 return (1); 439 return (0); 440 } 441 442 static int 443 cclass(char *set, char c, int af) 444 { 445 int n; 446 447 if (c == 0) 448 return (0); 449 n = *set++; 450 while (--n) 451 if (*set++ == c) 452 return (af); 453 return (! af); 454 } 455