1 /* 2 * Copyright (c) 1980, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. All advertising materials mentioning features or use of this software 15 * must display the following acknowledgement: 16 * This product includes software developed by the University of 17 * California, Berkeley and its contributors. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include <sys/cdefs.h> 36 37 __FBSDID("$FreeBSD$"); 38 39 #ifndef lint 40 static const char copyright[] = 41 "@(#) Copyright (c) 1980, 1993\n\ 42 The Regents of the University of California. All rights reserved.\n"; 43 #endif 44 45 #ifndef lint 46 static const char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93"; 47 #endif 48 49 #include <ctype.h> 50 #include <stdlib.h> 51 #include <string.h> 52 53 #include "extern.h" 54 55 #define FALSE 0 56 #define TRUE !(FALSE) 57 #define NIL 0 58 59 static void expconv(void); 60 61 boolean _escaped; /* true if we are currently _escaped */ 62 char *s_start; /* start of string */ 63 boolean l_onecase; /* true if upper and lower equivalent */ 64 65 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 66 67 /* STRNCMP - like strncmp except that we convert the 68 * first string to lower case before comparing 69 * if l_onecase is set. 70 */ 71 72 int 73 STRNCMP(s1, s2, len) 74 register char *s1,*s2; 75 register int len; 76 { 77 if (l_onecase) { 78 do 79 if (*s2 - makelower(*s1)) 80 return (*s2 - makelower(*s1)); 81 else { 82 s2++; 83 s1++; 84 } 85 while (--len); 86 } else { 87 do 88 if (*s2 - *s1) 89 return (*s2 - *s1); 90 else { 91 s2++; 92 s1++; 93 } 94 while (--len); 95 } 96 return(0); 97 } 98 99 /* The following routine converts an irregular expression to 100 * internal format. 101 * 102 * Either meta symbols (\a \d or \p) or character strings or 103 * operations ( alternation or perenthesizing ) can be 104 * specified. Each starts with a descriptor byte. The descriptor 105 * byte has STR set for strings, META set for meta symbols 106 * and OPER set for operations. 107 * The descriptor byte can also have the OPT bit set if the object 108 * defined is optional. Also ALT can be set to indicate an alternation. 109 * 110 * For metasymbols the byte following the descriptor byte identities 111 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 112 * strings the byte after the descriptor is a character count for 113 * the string: 114 * 115 * meta symbols := descriptor 116 * symbol 117 * 118 * strings := descriptor 119 * character count 120 * the string 121 * 122 * operatins := descriptor 123 * symbol 124 * character count 125 */ 126 127 /* 128 * handy macros for accessing parts of match blocks 129 */ 130 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 131 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 132 133 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 134 #define OCNT(A) (*(A+2)) /* character count */ 135 #define ONEXT(A) (A+3) /* next character after the operation */ 136 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 137 138 #define SCNT(A) (*(A+1)) /* byte count of a string */ 139 #define SSTR(A) (A+2) /* address of the string */ 140 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 141 142 /* 143 * bit flags in the descriptor 144 */ 145 #define OPT 1 146 #define STR 2 147 #define META 4 148 #define ALT 8 149 #define OPER 16 150 151 static char *ccre; /* pointer to current position in converted exp*/ 152 static char *ure; /* pointer current position in unconverted exp */ 153 154 char * 155 convexp(re) 156 char *re; /* unconverted irregular expression */ 157 { 158 register char *cre; /* pointer to converted regular expression */ 159 160 /* allocate room for the converted expression */ 161 if (re == NIL) 162 return (NIL); 163 if (*re == '\0') 164 return (NIL); 165 cre = malloc (4 * strlen(re) + 3); 166 ccre = cre; 167 ure = re; 168 169 /* start the conversion with a \a */ 170 *cre = META | OPT; 171 MSYM(cre) = 'a'; 172 ccre = MNEXT(cre); 173 174 /* start the conversion (its recursive) */ 175 expconv (); 176 *ccre = 0; 177 return (cre); 178 } 179 180 static void 181 expconv() 182 { 183 register char *cs; /* pointer to current symbol in converted exp */ 184 register char c; /* character being processed */ 185 register char *acs; /* pinter to last alternate */ 186 register int temp; 187 188 /* let the conversion begin */ 189 acs = NIL; 190 cs = NIL; 191 while (*ure != NIL) { 192 switch (c = *ure++) { 193 194 case '\\': 195 switch (c = *ure++) { 196 197 /* escaped characters are just characters */ 198 default: 199 if (cs == NIL || (*cs & STR) == 0) { 200 cs = ccre; 201 *cs = STR; 202 SCNT(cs) = 1; 203 ccre += 2; 204 } else 205 SCNT(cs)++; 206 *ccre++ = c; 207 break; 208 209 /* normal(?) metacharacters */ 210 case 'a': 211 case 'd': 212 case 'e': 213 case 'p': 214 if (acs != NIL && acs != cs) { 215 do { 216 temp = OCNT(acs); 217 OCNT(acs) = ccre - acs; 218 acs -= temp; 219 } while (temp != 0); 220 acs = NIL; 221 } 222 cs = ccre; 223 *cs = META; 224 MSYM(cs) = c; 225 ccre = MNEXT(cs); 226 break; 227 } 228 break; 229 230 /* just put the symbol in */ 231 case '^': 232 case '$': 233 if (acs != NIL && acs != cs) { 234 do { 235 temp = OCNT(acs); 236 OCNT(acs) = ccre - acs; 237 acs -= temp; 238 } while (temp != 0); 239 acs = NIL; 240 } 241 cs = ccre; 242 *cs = META; 243 MSYM(cs) = c; 244 ccre = MNEXT(cs); 245 break; 246 247 /* mark the last match sequence as optional */ 248 case '?': 249 if (cs) 250 *cs = *cs | OPT; 251 break; 252 253 /* recurse and define a subexpression */ 254 case '(': 255 if (acs != NIL && acs != cs) { 256 do { 257 temp = OCNT(acs); 258 OCNT(acs) = ccre - acs; 259 acs -= temp; 260 } while (temp != 0); 261 acs = NIL; 262 } 263 cs = ccre; 264 *cs = OPER; 265 OSYM(cs) = '('; 266 ccre = ONEXT(cs); 267 expconv (); 268 OCNT(cs) = ccre - cs; /* offset to next symbol */ 269 break; 270 271 /* return from a recursion */ 272 case ')': 273 if (acs != NIL) { 274 do { 275 temp = OCNT(acs); 276 OCNT(acs) = ccre - acs; 277 acs -= temp; 278 } while (temp != 0); 279 acs = NIL; 280 } 281 cs = ccre; 282 *cs = META; 283 MSYM(cs) = c; 284 ccre = MNEXT(cs); 285 return; 286 287 /* mark the last match sequence as having an alternate */ 288 /* the third byte will contain an offset to jump over the */ 289 /* alternate match in case the first did not fail */ 290 case '|': 291 if (acs != NIL && acs != cs) 292 OCNT(ccre) = ccre - acs; /* make a back pointer */ 293 else 294 OCNT(ccre) = 0; 295 *cs |= ALT; 296 cs = ccre; 297 *cs = OPER; 298 OSYM(cs) = '|'; 299 ccre = ONEXT(cs); 300 acs = cs; /* remember that the pointer is to be filles */ 301 break; 302 303 /* if its not a metasymbol just build a scharacter string */ 304 default: 305 if (cs == NIL || (*cs & STR) == 0) { 306 cs = ccre; 307 *cs = STR; 308 SCNT(cs) = 1; 309 ccre = SSTR(cs); 310 } else 311 SCNT(cs)++; 312 *ccre++ = c; 313 break; 314 } 315 } 316 if (acs != NIL) { 317 do { 318 temp = OCNT(acs); 319 OCNT(acs) = ccre - acs; 320 acs -= temp; 321 } while (temp != 0); 322 acs = NIL; 323 } 324 return; 325 } 326 /* end of convertre */ 327 328 329 /* 330 * The following routine recognises an irregular expresion 331 * with the following special characters: 332 * 333 * \? - means last match was optional 334 * \a - matches any number of characters 335 * \d - matches any number of spaces and tabs 336 * \p - matches any number of alphanumeric 337 * characters. The 338 * characters matched will be copied into 339 * the area pointed to by 'name'. 340 * \| - alternation 341 * \( \) - grouping used mostly for alternation and 342 * optionality 343 * 344 * The irregular expression must be translated to internal form 345 * prior to calling this routine 346 * 347 * The value returned is the pointer to the first non \a 348 * character matched. 349 */ 350 351 char * 352 expmatch (s, re, mstring) 353 register char *s; /* string to check for a match in */ 354 register char *re; /* a converted irregular expression */ 355 register char *mstring; /* where to put whatever matches a \p */ 356 { 357 register char *cs; /* the current symbol */ 358 register char *ptr,*s1; /* temporary pointer */ 359 boolean matched; /* a temporary boolean */ 360 361 /* initial conditions */ 362 if (re == NIL) 363 return (NIL); 364 cs = re; 365 matched = FALSE; 366 367 /* loop till expression string is exhausted (or at least pretty tired) */ 368 while (*cs) { 369 switch (*cs & (OPER | STR | META)) { 370 371 /* try to match a string */ 372 case STR: 373 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 374 if (matched) { 375 376 /* hoorah it matches */ 377 s += SCNT(cs); 378 cs = SNEXT(cs); 379 } else if (*cs & ALT) { 380 381 /* alternation, skip to next expression */ 382 cs = SNEXT(cs); 383 } else if (*cs & OPT) { 384 385 /* the match is optional */ 386 cs = SNEXT(cs); 387 matched = 1; /* indicate a successful match */ 388 } else { 389 390 /* no match, error return */ 391 return (NIL); 392 } 393 break; 394 395 /* an operator, do something fancy */ 396 case OPER: 397 switch (OSYM(cs)) { 398 399 /* this is an alternation */ 400 case '|': 401 if (matched) 402 403 /* last thing in the alternation was a match, skip ahead */ 404 cs = OPTR(cs); 405 else 406 407 /* no match, keep trying */ 408 cs = ONEXT(cs); 409 break; 410 411 /* this is a grouping, recurse */ 412 case '(': 413 ptr = expmatch (s, ONEXT(cs), mstring); 414 if (ptr != NIL) { 415 416 /* the subexpression matched */ 417 matched = 1; 418 s = ptr; 419 } else if (*cs & ALT) { 420 421 /* alternation, skip to next expression */ 422 matched = 0; 423 } else if (*cs & OPT) { 424 425 /* the match is optional */ 426 matched = 1; /* indicate a successful match */ 427 } else { 428 429 /* no match, error return */ 430 return (NIL); 431 } 432 cs = OPTR(cs); 433 break; 434 } 435 break; 436 437 /* try to match a metasymbol */ 438 case META: 439 switch (MSYM(cs)) { 440 441 /* try to match anything and remember what was matched */ 442 case 'p': 443 /* 444 * This is really the same as trying the match the 445 * remaining parts of the expression to any subset 446 * of the string. 447 */ 448 s1 = s; 449 do { 450 ptr = expmatch (s1, MNEXT(cs), mstring); 451 if (ptr != NIL && s1 != s) { 452 453 /* we have a match, remember the match */ 454 strncpy (mstring, s, s1 - s); 455 mstring[s1 - s] = '\0'; 456 return (ptr); 457 } else if (ptr != NIL && (*cs & OPT)) { 458 459 /* it was aoptional so no match is ok */ 460 return (ptr); 461 } else if (ptr != NIL) { 462 463 /* not optional and we still matched */ 464 return (NIL); 465 } 466 if (!(isalnum(*s1) || *s1 == '_' || 467 /* C++ destructor */ 468 *s1 == '~' || 469 /* C++ scope operator */ 470 (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' && 471 (s1++, TRUE)))) 472 return (NIL); 473 if (*s1 == '\\') 474 _escaped = _escaped ? FALSE : TRUE; 475 else 476 _escaped = FALSE; 477 } while (*s1++); 478 return (NIL); 479 480 /* try to match anything */ 481 case 'a': 482 /* 483 * This is really the same as trying the match the 484 * remaining parts of the expression to any subset 485 * of the string. 486 */ 487 s1 = s; 488 do { 489 ptr = expmatch (s1, MNEXT(cs), mstring); 490 if (ptr != NIL && s1 != s) { 491 492 /* we have a match */ 493 return (ptr); 494 } else if (ptr != NIL && (*cs & OPT)) { 495 496 /* it was aoptional so no match is ok */ 497 return (ptr); 498 } else if (ptr != NIL) { 499 500 /* not optional and we still matched */ 501 return (NIL); 502 } 503 if (*s1 == '\\') 504 _escaped = _escaped ? FALSE : TRUE; 505 else 506 _escaped = FALSE; 507 } while (*s1++); 508 return (NIL); 509 510 /* fail if we are currently _escaped */ 511 case 'e': 512 if (_escaped) 513 return(NIL); 514 cs = MNEXT(cs); 515 break; 516 517 /* match any number of tabs and spaces */ 518 case 'd': 519 ptr = s; 520 while (*s == ' ' || *s == '\t') 521 s++; 522 if (s != ptr || s == s_start) { 523 524 /* match, be happy */ 525 matched = 1; 526 cs = MNEXT(cs); 527 } else if (*s == '\n' || *s == '\0') { 528 529 /* match, be happy */ 530 matched = 1; 531 cs = MNEXT(cs); 532 } else if (*cs & ALT) { 533 534 /* try the next part */ 535 matched = 0; 536 cs = MNEXT(cs); 537 } else if (*cs & OPT) { 538 539 /* doesn't matter */ 540 matched = 1; 541 cs = MNEXT(cs); 542 } else 543 544 /* no match, error return */ 545 return (NIL); 546 break; 547 548 /* check for end of line */ 549 case '$': 550 if (*s == '\0' || *s == '\n') { 551 552 /* match, be happy */ 553 s++; 554 matched = 1; 555 cs = MNEXT(cs); 556 } else if (*cs & ALT) { 557 558 /* try the next part */ 559 matched = 0; 560 cs = MNEXT(cs); 561 } else if (*cs & OPT) { 562 563 /* doesn't matter */ 564 matched = 1; 565 cs = MNEXT(cs); 566 } else 567 568 /* no match, error return */ 569 return (NIL); 570 break; 571 572 /* check for start of line */ 573 case '^': 574 if (s == s_start) { 575 576 /* match, be happy */ 577 matched = 1; 578 cs = MNEXT(cs); 579 } else if (*cs & ALT) { 580 581 /* try the next part */ 582 matched = 0; 583 cs = MNEXT(cs); 584 } else if (*cs & OPT) { 585 586 /* doesn't matter */ 587 matched = 1; 588 cs = MNEXT(cs); 589 } else 590 591 /* no match, error return */ 592 return (NIL); 593 break; 594 595 /* end of a subexpression, return success */ 596 case ')': 597 return (s); 598 } 599 break; 600 } 601 } 602 return (s); 603 } 604