1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 30 /* All Rights Reserved */ 31 32 33 /* 34 * routines to do regular expression matching 35 * 36 * Entry points: 37 * 38 * re_comp(s) 39 * char *s; 40 * ... returns 0 if the string s was compiled successfully, 41 * a pointer to an error message otherwise. 42 * If passed 0 or a null string returns without changing 43 * the currently compiled re (see note 11 below). 44 * 45 * re_exec(s) 46 * char *s; 47 * ... returns 1 if the string s matches the last compiled regular 48 * expression, 49 * 0 if the string s failed to match the last compiled 50 * regular expression, and 51 * -1 if the compiled regular expression was invalid 52 * (indicating an internal error). 53 * 54 * The strings passed to both re_comp and re_exec may have trailing or 55 * embedded newline characters; they are terminated by nulls. 56 * 57 * The identity of the author of these routines is lost in antiquity; 58 * this is essentially the same as the re code in the original V6 ed. 59 * 60 * The regular expressions recognized are described below. This description 61 * is essentially the same as that for ed. 62 * 63 * A regular expression specifies a set of strings of characters. 64 * A member of this set of strings is said to be matched by 65 * the regular expression. In the following specification for 66 * regular expressions the word `character' means any character but NUL. 67 * 68 * 1. Any character except a special character matches itself. 69 * Special characters are the regular expression delimiter plus 70 * \ [ . and sometimes ^ * $. 71 * 2. A . matches any character. 72 * 3. A \ followed by any character except a digit or ( ) 73 * matches that character. 74 * 4. A nonempty string s bracketed [s] (or [^s]) matches any 75 * character in (or not in) s. In s, \ has no special meaning, 76 * and ] may only appear as the first letter. A substring 77 * a-b, with a and b in ascending ASCII order, stands for 78 * the inclusive range of ASCII characters. 79 * 5. A regular expression of form 1-4 followed by * matches a 80 * sequence of 0 or more matches of the regular expression. 81 * 6. A regular expression, x, of form 1-8, bracketed \(x\) 82 * matches what x matches. 83 * 7. A \ followed by a digit n matches a copy of the string that the 84 * bracketed regular expression beginning with the nth \( matched. 85 * 8. A regular expression of form 1-8, x, followed by a regular 86 * expression of form 1-7, y matches a match for x followed by 87 * a match for y, with the x match being as long as possible 88 * while still permitting a y match. 89 * 9. A regular expression of form 1-8 preceded by ^ (or followed 90 * by $), is constrained to matches that begin at the left 91 * (or end at the right) end of a line. 92 * 10. A regular expression of form 1-9 picks out the longest among 93 * the leftmost matches in a line. 94 * 11. An empty regular expression stands for a copy of the last 95 * regular expression encountered. 96 */ 97 98 #include "synonyms.h" 99 100 #include <stdlib.h> 101 #include <re_comp.h> 102 #include <stddef.h> 103 #include <sys/types.h> 104 105 /* 106 * constants for re's 107 */ 108 #define CBRA 1 109 #define CCHR 2 110 #define CDOT 4 111 #define CCL 6 112 #define NCCL 8 113 #define CDOL 10 114 #define CEOF 11 115 #define CKET 12 116 #define CBACK 18 117 118 #define CSTAR 01 119 120 #define ESIZE 512 121 #define NBRA 9 122 123 static struct re_globals { 124 char _expbuf[ESIZE]; 125 char *_braslist[NBRA], *_braelist[NBRA]; 126 char _circf; 127 } *re_globals; 128 #define expbuf (_re->_expbuf) 129 #define braslist (_re->_braslist) 130 #define braelist (_re->_braelist) 131 #define circf (_re->_circf) 132 133 static int advance(const char *, char *); 134 static int backref(int, const char *); 135 static int cclass(char *, char, int); 136 137 /* 138 * compile the regular expression argument into a dfa 139 */ 140 char * 141 re_comp(const char *sp) 142 { 143 char c; 144 struct re_globals *_re = re_globals; 145 char *ep; 146 char cclcnt, numbra = 0; 147 char *lastep = NULL; 148 char bracket[NBRA]; 149 char *bracketp = &bracket[0]; 150 char *retoolong = "Regular expression too long"; 151 152 if (_re == NULL) { 153 _re = (struct re_globals *)calloc(1, sizeof (*_re)); 154 if (_re == NULL) 155 return ("Out of memory"); 156 re_globals = _re; 157 } 158 ep = expbuf; 159 160 #define comerr(msg) {expbuf[0] = 0; return (msg); } 161 162 if (sp == NULL || *sp == '\0') { 163 if (*ep == 0) 164 return ("No previous regular expression"); 165 return (NULL); 166 } 167 if (*sp == '^') { 168 circf = 1; 169 sp++; 170 } 171 else 172 circf = 0; 173 for (;;) { 174 if (ep >= &expbuf[ESIZE]) 175 comerr(retoolong); 176 if ((c = *sp++) == '\0') { 177 if (bracketp != bracket) 178 comerr("unmatched \\("); 179 *ep++ = CEOF; 180 *ep++ = 0; 181 return (NULL); 182 } 183 if (c != '*') 184 lastep = ep; 185 switch (c) { 186 187 case '.': 188 *ep++ = CDOT; 189 continue; 190 191 case '*': 192 if (lastep == NULL || *lastep == CBRA || 193 *lastep == CKET) 194 goto defchar; 195 *lastep |= CSTAR; 196 continue; 197 198 case '$': 199 if (*sp != '\0') 200 goto defchar; 201 *ep++ = CDOL; 202 continue; 203 204 case '[': 205 *ep++ = CCL; 206 *ep++ = 0; 207 cclcnt = 1; 208 if ((c = *sp++) == '^') { 209 c = *sp++; 210 ep[-2] = NCCL; 211 } 212 do { 213 if (c == '\0') 214 comerr("missing ]"); 215 if (c == '-' && ep [-1] != 0) { 216 if ((c = *sp++) == ']') { 217 *ep++ = '-'; 218 cclcnt++; 219 break; 220 } 221 while (ep[-1] < c) { 222 *ep = ep[-1] + 1; 223 ep++; 224 cclcnt++; 225 if (ep >= &expbuf[ESIZE]) 226 comerr(retoolong); 227 } 228 } 229 *ep++ = c; 230 cclcnt++; 231 if (ep >= &expbuf[ESIZE]) 232 comerr(retoolong); 233 } while ((c = *sp++) != ']'); 234 lastep[1] = cclcnt; 235 continue; 236 237 case '\\': 238 if ((c = *sp++) == '(') { 239 if (numbra >= NBRA) 240 comerr("too many \\(\\) pairs"); 241 *bracketp++ = numbra; 242 *ep++ = CBRA; 243 *ep++ = numbra++; 244 continue; 245 } 246 if (c == ')') { 247 if (bracketp <= bracket) 248 comerr("unmatched \\)"); 249 *ep++ = CKET; 250 *ep++ = *--bracketp; 251 continue; 252 } 253 if (c >= '1' && c < ('1' + NBRA)) { 254 *ep++ = CBACK; 255 *ep++ = c - '1'; 256 continue; 257 } 258 *ep++ = CCHR; 259 *ep++ = c; 260 continue; 261 262 defchar: 263 default: 264 *ep++ = CCHR; 265 *ep++ = c; 266 } 267 } 268 } 269 270 /* 271 * match the argument string against the compiled re 272 */ 273 int 274 re_exec(const char *p1) 275 { 276 struct re_globals *_re = re_globals; 277 char *p2; 278 int c; 279 int rv; 280 281 if (_re == NULL) 282 return (0); 283 p2 = expbuf; 284 for (c = 0; c < NBRA; c++) { 285 braslist[c] = 0; 286 braelist[c] = 0; 287 } 288 if (circf) 289 return ((advance(p1, p2))); 290 /* 291 * fast check for first character 292 */ 293 if (*p2 == CCHR) { 294 c = p2[1]; 295 do { 296 if (*p1 != c) 297 continue; 298 if (rv = advance(p1, p2)) 299 return (rv); 300 } while (*p1++); 301 return (0); 302 } 303 /* 304 * regular algorithm 305 */ 306 do 307 if (rv = advance(p1, p2)) 308 return (rv); 309 while (*p1++); 310 return (0); 311 } 312 313 /* 314 * try to match the next thing in the dfa 315 */ 316 static int 317 advance(const char *lp, char *ep) 318 { 319 const char *curlp; 320 ptrdiff_t ct; 321 int i; 322 int rv; 323 struct re_globals *_re = re_globals; 324 325 for (;;) 326 switch (*ep++) { 327 328 case CCHR: 329 if (*ep++ == *lp++) 330 continue; 331 return (0); 332 333 case CDOT: 334 if (*lp++) 335 continue; 336 return (0); 337 338 case CDOL: 339 if (*lp == '\0') 340 continue; 341 return (0); 342 343 case CEOF: 344 return (1); 345 346 case CCL: 347 if (cclass(ep, *lp++, 1)) { 348 ep += *ep; 349 continue; 350 } 351 return (0); 352 353 case NCCL: 354 if (cclass(ep, *lp++, 0)) { 355 ep += *ep; 356 continue; 357 } 358 return (0); 359 360 case CBRA: 361 braslist[*ep++] = (char *)lp; 362 continue; 363 364 case CKET: 365 braelist[*ep++] = (char *)lp; 366 continue; 367 368 case CBACK: 369 if (braelist[i = *ep++] == NULL) 370 return (-1); 371 if (backref(i, lp)) { 372 lp += braelist[i] - braslist[i]; 373 continue; 374 } 375 return (0); 376 377 case CBACK|CSTAR: 378 if (braelist[i = *ep++] == NULL) 379 return (-1); 380 curlp = lp; 381 ct = braelist[i] - braslist[i]; 382 while (backref(i, lp)) 383 lp += ct; 384 while (lp >= curlp) { 385 if (rv = advance(lp, ep)) 386 return (rv); 387 lp -= ct; 388 } 389 continue; 390 391 case CDOT|CSTAR: 392 curlp = lp; 393 while (*lp++) 394 ; 395 goto star; 396 397 case CCHR|CSTAR: 398 curlp = lp; 399 while (*lp++ == *ep) 400 ; 401 ep++; 402 goto star; 403 404 case CCL|CSTAR: 405 case NCCL|CSTAR: 406 curlp = lp; 407 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) 408 ; 409 ep += *ep; 410 goto star; 411 412 star: 413 do { 414 lp--; 415 if (rv = advance(lp, ep)) 416 return (rv); 417 } while (lp > curlp); 418 return (0); 419 420 default: 421 return (-1); 422 } 423 } 424 425 static int 426 backref(int i, const char *lp) 427 { 428 char *bp; 429 struct re_globals *_re = re_globals; 430 431 bp = braslist[i]; 432 while (*bp++ == *lp++) 433 if (bp >= braelist[i]) 434 return (1); 435 return (0); 436 } 437 438 static int 439 cclass(char *set, char c, int af) 440 { 441 int n; 442 443 if (c == 0) 444 return (0); 445 n = *set++; 446 while (--n) 447 if (*set++ == c) 448 return (af); 449 return (! af); 450 } 451