1 /* 2 * Copyright (c) 1980, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 4. Neither the name of the University nor the names of its contributors 15 * may be used to endorse or promote products derived from this software 16 * without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/cdefs.h> 32 33 __FBSDID("$FreeBSD$"); 34 35 #ifndef lint 36 static const char copyright[] = 37 "@(#) Copyright (c) 1980, 1993\n\ 38 The Regents of the University of California. All rights reserved.\n"; 39 #endif 40 41 #ifndef lint 42 static const char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93"; 43 #endif 44 45 #include <ctype.h> 46 #include <stdlib.h> 47 #include <stdbool.h> 48 #include <string.h> 49 50 #include "extern.h" 51 52 static void expconv(void); 53 54 bool _escaped; /* true if we are currently x_escaped */ 55 char *s_start; /* start of string */ 56 bool l_onecase; /* true if upper and lower equivalent */ 57 58 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 59 60 /* STRNCMP - like strncmp except that we convert the 61 * first string to lower case before comparing 62 * if l_onecase is set. 63 */ 64 65 int 66 STRNCMP(register char *s1, register char *s2, register int len) 67 { 68 if (l_onecase) { 69 do 70 if (*s2 - makelower(*s1)) 71 return (*s2 - makelower(*s1)); 72 else { 73 s2++; 74 s1++; 75 } 76 while (--len); 77 } else { 78 do 79 if (*s2 - *s1) 80 return (*s2 - *s1); 81 else { 82 s2++; 83 s1++; 84 } 85 while (--len); 86 } 87 return(0); 88 } 89 90 /* The following routine converts an irregular expression to 91 * internal format. 92 * 93 * Either meta symbols (\a \d or \p) or character strings or 94 * operations ( alternation or parenthesizing ) can be 95 * specified. Each starts with a descriptor byte. The descriptor 96 * byte has STR set for strings, META set for meta symbols 97 * and OPER set for operations. 98 * The descriptor byte can also have the OPT bit set if the object 99 * defined is optional. Also ALT can be set to indicate an alternation. 100 * 101 * For metasymbols the byte following the descriptor byte identities 102 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 103 * strings the byte after the descriptor is a character count for 104 * the string: 105 * 106 * meta symbols := descriptor 107 * symbol 108 * 109 * strings := descriptor 110 * character count 111 * the string 112 * 113 * operations := descriptor 114 * symbol 115 * character count 116 */ 117 118 /* 119 * handy macros for accessing parts of match blocks 120 */ 121 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 122 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 123 124 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 125 #define OCNT(A) (*(A+2)) /* character count */ 126 #define ONEXT(A) (A+3) /* next character after the operation */ 127 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 128 129 #define SCNT(A) (*(A+1)) /* byte count of a string */ 130 #define SSTR(A) (A+2) /* address of the string */ 131 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 132 133 /* 134 * bit flags in the descriptor 135 */ 136 #define OPT 1 137 #define STR 2 138 #define META 4 139 #define ALT 8 140 #define OPER 16 141 142 static char *ccre; /* pointer to current position in converted exp*/ 143 static char *ure; /* pointer current position in unconverted exp */ 144 145 /* re: unconverted irregular expression */ 146 char * 147 convexp(char *re) 148 { 149 register char *cre; /* pointer to converted regular expression */ 150 151 /* allocate room for the converted expression */ 152 if (re == NULL) 153 return (NULL); 154 if (*re == '\0') 155 return (NULL); 156 cre = malloc(4 * strlen(re) + 3); 157 ccre = cre; 158 ure = re; 159 160 /* start the conversion with a \a */ 161 *cre = META | OPT; 162 MSYM(cre) = 'a'; 163 ccre = MNEXT(cre); 164 165 /* start the conversion (its recursive) */ 166 expconv (); 167 *ccre = 0; 168 return (cre); 169 } 170 171 static void 172 expconv() 173 { 174 register char *cs; /* pointer to current symbol in converted exp */ 175 register char c; /* character being processed */ 176 register char *acs; /* pinter to last alternate */ 177 register int temp; 178 179 /* let the conversion begin */ 180 acs = NULL; 181 cs = NULL; 182 while (*ure) { 183 switch (c = *ure++) { 184 185 case '\\': 186 switch (c = *ure++) { 187 188 /* escaped characters are just characters */ 189 default: 190 if (cs == NULL || (*cs & STR) == 0) { 191 cs = ccre; 192 *cs = STR; 193 SCNT(cs) = 1; 194 ccre += 2; 195 } else 196 SCNT(cs)++; 197 *ccre++ = c; 198 break; 199 200 /* normal(?) metacharacters */ 201 case 'a': 202 case 'd': 203 case 'e': 204 case 'p': 205 if (acs != NULL && acs != cs) { 206 do { 207 temp = OCNT(acs); 208 OCNT(acs) = ccre - acs; 209 acs -= temp; 210 } while (temp != 0); 211 acs = NULL; 212 } 213 cs = ccre; 214 *cs = META; 215 MSYM(cs) = c; 216 ccre = MNEXT(cs); 217 break; 218 } 219 break; 220 221 /* just put the symbol in */ 222 case '^': 223 case '$': 224 if (acs != NULL && acs != cs) { 225 do { 226 temp = OCNT(acs); 227 OCNT(acs) = ccre - acs; 228 acs -= temp; 229 } while (temp != 0); 230 acs = NULL; 231 } 232 cs = ccre; 233 *cs = META; 234 MSYM(cs) = c; 235 ccre = MNEXT(cs); 236 break; 237 238 /* mark the last match sequence as optional */ 239 case '?': 240 if (cs) 241 *cs = *cs | OPT; 242 break; 243 244 /* recurse and define a subexpression */ 245 case '(': 246 if (acs != NULL && acs != cs) { 247 do { 248 temp = OCNT(acs); 249 OCNT(acs) = ccre - acs; 250 acs -= temp; 251 } while (temp != 0); 252 acs = NULL; 253 } 254 cs = ccre; 255 *cs = OPER; 256 OSYM(cs) = '('; 257 ccre = ONEXT(cs); 258 expconv(); 259 OCNT(cs) = ccre - cs; /* offset to next symbol */ 260 break; 261 262 /* return from a recursion */ 263 case ')': 264 if (acs != NULL) { 265 do { 266 temp = OCNT(acs); 267 OCNT(acs) = ccre - acs; 268 acs -= temp; 269 } while (temp != 0); 270 acs = NULL; 271 } 272 cs = ccre; 273 *cs = META; 274 MSYM(cs) = c; 275 ccre = MNEXT(cs); 276 return; 277 278 /* mark the last match sequence as having an alternate */ 279 /* the third byte will contain an offset to jump over the */ 280 /* alternate match in case the first did not fail */ 281 case '|': 282 if (acs != NULL && acs != cs) 283 OCNT(ccre) = ccre - acs; /* make a back pointer */ 284 else 285 OCNT(ccre) = 0; 286 *cs |= ALT; 287 cs = ccre; 288 *cs = OPER; 289 OSYM(cs) = '|'; 290 ccre = ONEXT(cs); 291 acs = cs; /* remember that the pointer is to be filles */ 292 break; 293 294 /* if its not a metasymbol just build a scharacter string */ 295 default: 296 if (cs == NULL || (*cs & STR) == 0) { 297 cs = ccre; 298 *cs = STR; 299 SCNT(cs) = 1; 300 ccre = SSTR(cs); 301 } else 302 SCNT(cs)++; 303 *ccre++ = c; 304 break; 305 } 306 } 307 if (acs != NULL) { 308 do { 309 temp = OCNT(acs); 310 OCNT(acs) = ccre - acs; 311 acs -= temp; 312 } while (temp != 0); 313 acs = NULL; 314 } 315 return; 316 } 317 /* end of convertre */ 318 319 320 /* 321 * The following routine recognises an irregular expression 322 * with the following special characters: 323 * 324 * \? - means last match was optional 325 * \a - matches any number of characters 326 * \d - matches any number of spaces and tabs 327 * \p - matches any number of alphanumeric 328 * characters. The 329 * characters matched will be copied into 330 * the area pointed to by 'name'. 331 * \| - alternation 332 * \( \) - grouping used mostly for alternation and 333 * optionality 334 * 335 * The irregular expression must be translated to internal form 336 * prior to calling this routine 337 * 338 * The value returned is the pointer to the first non \a 339 * character matched. 340 */ 341 342 /* 343 * s: string to check for a match in 344 * re: a converted irregular expression 345 * mstring: where to put whatever matches a \p 346 */ 347 char * 348 expmatch (register char *s, register char *re, register char *mstring) 349 { 350 register char *cs; /* the current symbol */ 351 register char *ptr,*s1; /* temporary pointer */ 352 bool matched; /* a temporary bool */ 353 354 /* initial conditions */ 355 if (re == NULL) 356 return (NULL); 357 cs = re; 358 matched = false; 359 360 /* loop till expression string is exhausted (or at least pretty tired) */ 361 while (*cs) { 362 switch (*cs & (OPER | STR | META)) { 363 364 /* try to match a string */ 365 case STR: 366 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 367 if (matched) { 368 369 /* hoorah it matches */ 370 s += SCNT(cs); 371 cs = SNEXT(cs); 372 } else if (*cs & ALT) { 373 374 /* alternation, skip to next expression */ 375 cs = SNEXT(cs); 376 } else if (*cs & OPT) { 377 378 /* the match is optional */ 379 cs = SNEXT(cs); 380 matched = 1; /* indicate a successful match */ 381 } else { 382 383 /* no match, error return */ 384 return (NULL); 385 } 386 break; 387 388 /* an operator, do something fancy */ 389 case OPER: 390 switch (OSYM(cs)) { 391 392 /* this is an alternation */ 393 case '|': 394 if (matched) 395 396 /* last thing in the alternation was a match, skip ahead */ 397 cs = OPTR(cs); 398 else 399 400 /* no match, keep trying */ 401 cs = ONEXT(cs); 402 break; 403 404 /* this is a grouping, recurse */ 405 case '(': 406 ptr = expmatch(s, ONEXT(cs), mstring); 407 if (ptr != NULL) { 408 409 /* the subexpression matched */ 410 matched = 1; 411 s = ptr; 412 } else if (*cs & ALT) { 413 414 /* alternation, skip to next expression */ 415 matched = 0; 416 } else if (*cs & OPT) { 417 418 /* the match is optional */ 419 matched = 1; /* indicate a successful match */ 420 } else { 421 422 /* no match, error return */ 423 return (NULL); 424 } 425 cs = OPTR(cs); 426 break; 427 } 428 break; 429 430 /* try to match a metasymbol */ 431 case META: 432 switch (MSYM(cs)) { 433 434 /* try to match anything and remember what was matched */ 435 case 'p': 436 /* 437 * This is really the same as trying the match the 438 * remaining parts of the expression to any subset 439 * of the string. 440 */ 441 s1 = s; 442 do { 443 ptr = expmatch(s1, MNEXT(cs), mstring); 444 if (ptr != NULL && s1 != s) { 445 446 /* we have a match, remember the match */ 447 strncpy (mstring, s, s1 - s); 448 mstring[s1 - s] = '\0'; 449 return (ptr); 450 } else if (ptr != NULL && (*cs & OPT)) { 451 452 /* it was aoptional so no match is ok */ 453 return (ptr); 454 } else if (ptr != NULL) { 455 456 /* not optional and we still matched */ 457 return (NULL); 458 } 459 if (!(isalnum(*s1) || *s1 == '_' || 460 /* C++ destructor */ 461 *s1 == '~' || 462 /* C++ scope operator */ 463 (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' && 464 (s1++, true)))) 465 return (NULL); 466 if (*s1 == '\\') 467 _escaped = _escaped ? false : true; 468 else 469 _escaped = false; 470 } while (*s1++); 471 return (NULL); 472 473 /* try to match anything */ 474 case 'a': 475 /* 476 * This is really the same as trying the match the 477 * remaining parts of the expression to any subset 478 * of the string. 479 */ 480 s1 = s; 481 do { 482 ptr = expmatch(s1, MNEXT(cs), mstring); 483 if (ptr != NULL && s1 != s) { 484 485 /* we have a match */ 486 return (ptr); 487 } else if (ptr != NULL && (*cs & OPT)) { 488 489 /* it was aoptional so no match is ok */ 490 return (ptr); 491 } else if (ptr != NULL) { 492 493 /* not optional and we still matched */ 494 return (NULL); 495 } 496 if (*s1 == '\\') 497 _escaped = _escaped ? false : true; 498 else 499 _escaped = false; 500 } while (*s1++); 501 return (NULL); 502 503 /* fail if we are currently _escaped */ 504 case 'e': 505 if (_escaped) 506 return(NULL); 507 cs = MNEXT(cs); 508 break; 509 510 /* match any number of tabs and spaces */ 511 case 'd': 512 ptr = s; 513 while (*s == ' ' || *s == '\t') 514 s++; 515 if (s != ptr || s == s_start) { 516 517 /* match, be happy */ 518 matched = 1; 519 cs = MNEXT(cs); 520 } else if (*s == '\n' || *s == '\0') { 521 522 /* match, be happy */ 523 matched = 1; 524 cs = MNEXT(cs); 525 } else if (*cs & ALT) { 526 527 /* try the next part */ 528 matched = 0; 529 cs = MNEXT(cs); 530 } else if (*cs & OPT) { 531 532 /* doesn't matter */ 533 matched = 1; 534 cs = MNEXT(cs); 535 } else 536 537 /* no match, error return */ 538 return (NULL); 539 break; 540 541 /* check for end of line */ 542 case '$': 543 if (*s == '\0' || *s == '\n') { 544 545 /* match, be happy */ 546 s++; 547 matched = 1; 548 cs = MNEXT(cs); 549 } else if (*cs & ALT) { 550 551 /* try the next part */ 552 matched = 0; 553 cs = MNEXT(cs); 554 } else if (*cs & OPT) { 555 556 /* doesn't matter */ 557 matched = 1; 558 cs = MNEXT(cs); 559 } else 560 561 /* no match, error return */ 562 return (NULL); 563 break; 564 565 /* check for start of line */ 566 case '^': 567 if (s == s_start) { 568 569 /* match, be happy */ 570 matched = 1; 571 cs = MNEXT(cs); 572 } else if (*cs & ALT) { 573 574 /* try the next part */ 575 matched = 0; 576 cs = MNEXT(cs); 577 } else if (*cs & OPT) { 578 579 /* doesn't matter */ 580 matched = 1; 581 cs = MNEXT(cs); 582 } else 583 584 /* no match, error return */ 585 return (NULL); 586 break; 587 588 /* end of a subexpression, return success */ 589 case ')': 590 return (s); 591 } 592 break; 593 } 594 } 595 return (s); 596 } 597