1 /* 2 * Copyright (c) 1980, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 4. Neither the name of the University nor the names of its contributors 15 * may be used to endorse or promote products derived from this software 16 * without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/cdefs.h> 32 33 __FBSDID("$FreeBSD$"); 34 35 #ifndef lint 36 static const char copyright[] = 37 "@(#) Copyright (c) 1980, 1993\n\ 38 The Regents of the University of California. All rights reserved.\n"; 39 #endif 40 41 #ifndef lint 42 static const char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93"; 43 #endif 44 45 #include <ctype.h> 46 #include <stdlib.h> 47 #include <string.h> 48 49 #include "extern.h" 50 51 #define FALSE 0 52 #define TRUE !(FALSE) 53 #define NIL 0 54 55 static void expconv(void); 56 57 boolean _escaped; /* true if we are currently _escaped */ 58 char *s_start; /* start of string */ 59 boolean l_onecase; /* true if upper and lower equivalent */ 60 61 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 62 63 /* STRNCMP - like strncmp except that we convert the 64 * first string to lower case before comparing 65 * if l_onecase is set. 66 */ 67 68 int 69 STRNCMP(s1, s2, len) 70 register char *s1,*s2; 71 register int len; 72 { 73 if (l_onecase) { 74 do 75 if (*s2 - makelower(*s1)) 76 return (*s2 - makelower(*s1)); 77 else { 78 s2++; 79 s1++; 80 } 81 while (--len); 82 } else { 83 do 84 if (*s2 - *s1) 85 return (*s2 - *s1); 86 else { 87 s2++; 88 s1++; 89 } 90 while (--len); 91 } 92 return(0); 93 } 94 95 /* The following routine converts an irregular expression to 96 * internal format. 97 * 98 * Either meta symbols (\a \d or \p) or character strings or 99 * operations ( alternation or perenthesizing ) can be 100 * specified. Each starts with a descriptor byte. The descriptor 101 * byte has STR set for strings, META set for meta symbols 102 * and OPER set for operations. 103 * The descriptor byte can also have the OPT bit set if the object 104 * defined is optional. Also ALT can be set to indicate an alternation. 105 * 106 * For metasymbols the byte following the descriptor byte identities 107 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 108 * strings the byte after the descriptor is a character count for 109 * the string: 110 * 111 * meta symbols := descriptor 112 * symbol 113 * 114 * strings := descriptor 115 * character count 116 * the string 117 * 118 * operatins := descriptor 119 * symbol 120 * character count 121 */ 122 123 /* 124 * handy macros for accessing parts of match blocks 125 */ 126 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 127 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 128 129 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 130 #define OCNT(A) (*(A+2)) /* character count */ 131 #define ONEXT(A) (A+3) /* next character after the operation */ 132 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 133 134 #define SCNT(A) (*(A+1)) /* byte count of a string */ 135 #define SSTR(A) (A+2) /* address of the string */ 136 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 137 138 /* 139 * bit flags in the descriptor 140 */ 141 #define OPT 1 142 #define STR 2 143 #define META 4 144 #define ALT 8 145 #define OPER 16 146 147 static char *ccre; /* pointer to current position in converted exp*/ 148 static char *ure; /* pointer current position in unconverted exp */ 149 150 char * 151 convexp(re) 152 char *re; /* unconverted irregular expression */ 153 { 154 register char *cre; /* pointer to converted regular expression */ 155 156 /* allocate room for the converted expression */ 157 if (re == NIL) 158 return (NIL); 159 if (*re == '\0') 160 return (NIL); 161 cre = malloc (4 * strlen(re) + 3); 162 ccre = cre; 163 ure = re; 164 165 /* start the conversion with a \a */ 166 *cre = META | OPT; 167 MSYM(cre) = 'a'; 168 ccre = MNEXT(cre); 169 170 /* start the conversion (its recursive) */ 171 expconv (); 172 *ccre = 0; 173 return (cre); 174 } 175 176 static void 177 expconv() 178 { 179 register char *cs; /* pointer to current symbol in converted exp */ 180 register char c; /* character being processed */ 181 register char *acs; /* pinter to last alternate */ 182 register int temp; 183 184 /* let the conversion begin */ 185 acs = NIL; 186 cs = NIL; 187 while (*ure != NIL) { 188 switch (c = *ure++) { 189 190 case '\\': 191 switch (c = *ure++) { 192 193 /* escaped characters are just characters */ 194 default: 195 if (cs == NIL || (*cs & STR) == 0) { 196 cs = ccre; 197 *cs = STR; 198 SCNT(cs) = 1; 199 ccre += 2; 200 } else 201 SCNT(cs)++; 202 *ccre++ = c; 203 break; 204 205 /* normal(?) metacharacters */ 206 case 'a': 207 case 'd': 208 case 'e': 209 case 'p': 210 if (acs != NIL && acs != cs) { 211 do { 212 temp = OCNT(acs); 213 OCNT(acs) = ccre - acs; 214 acs -= temp; 215 } while (temp != 0); 216 acs = NIL; 217 } 218 cs = ccre; 219 *cs = META; 220 MSYM(cs) = c; 221 ccre = MNEXT(cs); 222 break; 223 } 224 break; 225 226 /* just put the symbol in */ 227 case '^': 228 case '$': 229 if (acs != NIL && acs != cs) { 230 do { 231 temp = OCNT(acs); 232 OCNT(acs) = ccre - acs; 233 acs -= temp; 234 } while (temp != 0); 235 acs = NIL; 236 } 237 cs = ccre; 238 *cs = META; 239 MSYM(cs) = c; 240 ccre = MNEXT(cs); 241 break; 242 243 /* mark the last match sequence as optional */ 244 case '?': 245 if (cs) 246 *cs = *cs | OPT; 247 break; 248 249 /* recurse and define a subexpression */ 250 case '(': 251 if (acs != NIL && acs != cs) { 252 do { 253 temp = OCNT(acs); 254 OCNT(acs) = ccre - acs; 255 acs -= temp; 256 } while (temp != 0); 257 acs = NIL; 258 } 259 cs = ccre; 260 *cs = OPER; 261 OSYM(cs) = '('; 262 ccre = ONEXT(cs); 263 expconv (); 264 OCNT(cs) = ccre - cs; /* offset to next symbol */ 265 break; 266 267 /* return from a recursion */ 268 case ')': 269 if (acs != NIL) { 270 do { 271 temp = OCNT(acs); 272 OCNT(acs) = ccre - acs; 273 acs -= temp; 274 } while (temp != 0); 275 acs = NIL; 276 } 277 cs = ccre; 278 *cs = META; 279 MSYM(cs) = c; 280 ccre = MNEXT(cs); 281 return; 282 283 /* mark the last match sequence as having an alternate */ 284 /* the third byte will contain an offset to jump over the */ 285 /* alternate match in case the first did not fail */ 286 case '|': 287 if (acs != NIL && acs != cs) 288 OCNT(ccre) = ccre - acs; /* make a back pointer */ 289 else 290 OCNT(ccre) = 0; 291 *cs |= ALT; 292 cs = ccre; 293 *cs = OPER; 294 OSYM(cs) = '|'; 295 ccre = ONEXT(cs); 296 acs = cs; /* remember that the pointer is to be filles */ 297 break; 298 299 /* if its not a metasymbol just build a scharacter string */ 300 default: 301 if (cs == NIL || (*cs & STR) == 0) { 302 cs = ccre; 303 *cs = STR; 304 SCNT(cs) = 1; 305 ccre = SSTR(cs); 306 } else 307 SCNT(cs)++; 308 *ccre++ = c; 309 break; 310 } 311 } 312 if (acs != NIL) { 313 do { 314 temp = OCNT(acs); 315 OCNT(acs) = ccre - acs; 316 acs -= temp; 317 } while (temp != 0); 318 acs = NIL; 319 } 320 return; 321 } 322 /* end of convertre */ 323 324 325 /* 326 * The following routine recognises an irregular expresion 327 * with the following special characters: 328 * 329 * \? - means last match was optional 330 * \a - matches any number of characters 331 * \d - matches any number of spaces and tabs 332 * \p - matches any number of alphanumeric 333 * characters. The 334 * characters matched will be copied into 335 * the area pointed to by 'name'. 336 * \| - alternation 337 * \( \) - grouping used mostly for alternation and 338 * optionality 339 * 340 * The irregular expression must be translated to internal form 341 * prior to calling this routine 342 * 343 * The value returned is the pointer to the first non \a 344 * character matched. 345 */ 346 347 char * 348 expmatch (s, re, mstring) 349 register char *s; /* string to check for a match in */ 350 register char *re; /* a converted irregular expression */ 351 register char *mstring; /* where to put whatever matches a \p */ 352 { 353 register char *cs; /* the current symbol */ 354 register char *ptr,*s1; /* temporary pointer */ 355 boolean matched; /* a temporary boolean */ 356 357 /* initial conditions */ 358 if (re == NIL) 359 return (NIL); 360 cs = re; 361 matched = FALSE; 362 363 /* loop till expression string is exhausted (or at least pretty tired) */ 364 while (*cs) { 365 switch (*cs & (OPER | STR | META)) { 366 367 /* try to match a string */ 368 case STR: 369 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 370 if (matched) { 371 372 /* hoorah it matches */ 373 s += SCNT(cs); 374 cs = SNEXT(cs); 375 } else if (*cs & ALT) { 376 377 /* alternation, skip to next expression */ 378 cs = SNEXT(cs); 379 } else if (*cs & OPT) { 380 381 /* the match is optional */ 382 cs = SNEXT(cs); 383 matched = 1; /* indicate a successful match */ 384 } else { 385 386 /* no match, error return */ 387 return (NIL); 388 } 389 break; 390 391 /* an operator, do something fancy */ 392 case OPER: 393 switch (OSYM(cs)) { 394 395 /* this is an alternation */ 396 case '|': 397 if (matched) 398 399 /* last thing in the alternation was a match, skip ahead */ 400 cs = OPTR(cs); 401 else 402 403 /* no match, keep trying */ 404 cs = ONEXT(cs); 405 break; 406 407 /* this is a grouping, recurse */ 408 case '(': 409 ptr = expmatch (s, ONEXT(cs), mstring); 410 if (ptr != NIL) { 411 412 /* the subexpression matched */ 413 matched = 1; 414 s = ptr; 415 } else if (*cs & ALT) { 416 417 /* alternation, skip to next expression */ 418 matched = 0; 419 } else if (*cs & OPT) { 420 421 /* the match is optional */ 422 matched = 1; /* indicate a successful match */ 423 } else { 424 425 /* no match, error return */ 426 return (NIL); 427 } 428 cs = OPTR(cs); 429 break; 430 } 431 break; 432 433 /* try to match a metasymbol */ 434 case META: 435 switch (MSYM(cs)) { 436 437 /* try to match anything and remember what was matched */ 438 case 'p': 439 /* 440 * This is really the same as trying the match the 441 * remaining parts of the expression to any subset 442 * of the string. 443 */ 444 s1 = s; 445 do { 446 ptr = expmatch (s1, MNEXT(cs), mstring); 447 if (ptr != NIL && s1 != s) { 448 449 /* we have a match, remember the match */ 450 strncpy (mstring, s, s1 - s); 451 mstring[s1 - s] = '\0'; 452 return (ptr); 453 } else if (ptr != NIL && (*cs & OPT)) { 454 455 /* it was aoptional so no match is ok */ 456 return (ptr); 457 } else if (ptr != NIL) { 458 459 /* not optional and we still matched */ 460 return (NIL); 461 } 462 if (!(isalnum(*s1) || *s1 == '_' || 463 /* C++ destructor */ 464 *s1 == '~' || 465 /* C++ scope operator */ 466 (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' && 467 (s1++, TRUE)))) 468 return (NIL); 469 if (*s1 == '\\') 470 _escaped = _escaped ? FALSE : TRUE; 471 else 472 _escaped = FALSE; 473 } while (*s1++); 474 return (NIL); 475 476 /* try to match anything */ 477 case 'a': 478 /* 479 * This is really the same as trying the match the 480 * remaining parts of the expression to any subset 481 * of the string. 482 */ 483 s1 = s; 484 do { 485 ptr = expmatch (s1, MNEXT(cs), mstring); 486 if (ptr != NIL && s1 != s) { 487 488 /* we have a match */ 489 return (ptr); 490 } else if (ptr != NIL && (*cs & OPT)) { 491 492 /* it was aoptional so no match is ok */ 493 return (ptr); 494 } else if (ptr != NIL) { 495 496 /* not optional and we still matched */ 497 return (NIL); 498 } 499 if (*s1 == '\\') 500 _escaped = _escaped ? FALSE : TRUE; 501 else 502 _escaped = FALSE; 503 } while (*s1++); 504 return (NIL); 505 506 /* fail if we are currently _escaped */ 507 case 'e': 508 if (_escaped) 509 return(NIL); 510 cs = MNEXT(cs); 511 break; 512 513 /* match any number of tabs and spaces */ 514 case 'd': 515 ptr = s; 516 while (*s == ' ' || *s == '\t') 517 s++; 518 if (s != ptr || s == s_start) { 519 520 /* match, be happy */ 521 matched = 1; 522 cs = MNEXT(cs); 523 } else if (*s == '\n' || *s == '\0') { 524 525 /* match, be happy */ 526 matched = 1; 527 cs = MNEXT(cs); 528 } else if (*cs & ALT) { 529 530 /* try the next part */ 531 matched = 0; 532 cs = MNEXT(cs); 533 } else if (*cs & OPT) { 534 535 /* doesn't matter */ 536 matched = 1; 537 cs = MNEXT(cs); 538 } else 539 540 /* no match, error return */ 541 return (NIL); 542 break; 543 544 /* check for end of line */ 545 case '$': 546 if (*s == '\0' || *s == '\n') { 547 548 /* match, be happy */ 549 s++; 550 matched = 1; 551 cs = MNEXT(cs); 552 } else if (*cs & ALT) { 553 554 /* try the next part */ 555 matched = 0; 556 cs = MNEXT(cs); 557 } else if (*cs & OPT) { 558 559 /* doesn't matter */ 560 matched = 1; 561 cs = MNEXT(cs); 562 } else 563 564 /* no match, error return */ 565 return (NIL); 566 break; 567 568 /* check for start of line */ 569 case '^': 570 if (s == s_start) { 571 572 /* match, be happy */ 573 matched = 1; 574 cs = MNEXT(cs); 575 } else if (*cs & ALT) { 576 577 /* try the next part */ 578 matched = 0; 579 cs = MNEXT(cs); 580 } else if (*cs & OPT) { 581 582 /* doesn't matter */ 583 matched = 1; 584 cs = MNEXT(cs); 585 } else 586 587 /* no match, error return */ 588 return (NIL); 589 break; 590 591 /* end of a subexpression, return success */ 592 case ')': 593 return (s); 594 } 595 break; 596 } 597 } 598 return (s); 599 } 600