1 /* 2 * Copyright (c) 1980, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. All advertising materials mentioning features or use of this software 15 * must display the following acknowledgement: 16 * This product includes software developed by the University of 17 * California, Berkeley and its contributors. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #ifndef lint 36 static char copyright[] = 37 "@(#) Copyright (c) 1980, 1993\n\ 38 The Regents of the University of California. All rights reserved.\n"; 39 #endif /* not lint */ 40 41 #ifndef lint 42 static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93"; 43 #endif /* not lint */ 44 45 #include <ctype.h> 46 #include <stdlib.h> 47 #include <string.h> 48 #include "extern.h" 49 50 #define FALSE 0 51 #define TRUE !(FALSE) 52 #define NIL 0 53 54 static void expconv __P((void)); 55 56 boolean _escaped; /* true if we are currently _escaped */ 57 char *s_start; /* start of string */ 58 boolean l_onecase; /* true if upper and lower equivalent */ 59 60 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 61 62 /* STRNCMP - like strncmp except that we convert the 63 * first string to lower case before comparing 64 * if l_onecase is set. 65 */ 66 67 int 68 STRNCMP(s1, s2, len) 69 register char *s1,*s2; 70 register int len; 71 { 72 if (l_onecase) { 73 do 74 if (*s2 - makelower(*s1)) 75 return (*s2 - makelower(*s1)); 76 else { 77 s2++; 78 s1++; 79 } 80 while (--len); 81 } else { 82 do 83 if (*s2 - *s1) 84 return (*s2 - *s1); 85 else { 86 s2++; 87 s1++; 88 } 89 while (--len); 90 } 91 return(0); 92 } 93 94 /* The following routine converts an irregular expression to 95 * internal format. 96 * 97 * Either meta symbols (\a \d or \p) or character strings or 98 * operations ( alternation or perenthesizing ) can be 99 * specified. Each starts with a descriptor byte. The descriptor 100 * byte has STR set for strings, META set for meta symbols 101 * and OPER set for operations. 102 * The descriptor byte can also have the OPT bit set if the object 103 * defined is optional. Also ALT can be set to indicate an alternation. 104 * 105 * For metasymbols the byte following the descriptor byte identities 106 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 107 * strings the byte after the descriptor is a character count for 108 * the string: 109 * 110 * meta symbols := descriptor 111 * symbol 112 * 113 * strings := descriptor 114 * character count 115 * the string 116 * 117 * operatins := descriptor 118 * symbol 119 * character count 120 */ 121 122 /* 123 * handy macros for accessing parts of match blocks 124 */ 125 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 126 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 127 128 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 129 #define OCNT(A) (*(A+2)) /* character count */ 130 #define ONEXT(A) (A+3) /* next character after the operation */ 131 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 132 133 #define SCNT(A) (*(A+1)) /* byte count of a string */ 134 #define SSTR(A) (A+2) /* address of the string */ 135 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 136 137 /* 138 * bit flags in the descriptor 139 */ 140 #define OPT 1 141 #define STR 2 142 #define META 4 143 #define ALT 8 144 #define OPER 16 145 146 static char *ccre; /* pointer to current position in converted exp*/ 147 static char *ure; /* pointer current position in unconverted exp */ 148 149 char * 150 convexp(re) 151 char *re; /* unconverted irregular expression */ 152 { 153 register char *cre; /* pointer to converted regular expression */ 154 155 /* allocate room for the converted expression */ 156 if (re == NIL) 157 return (NIL); 158 if (*re == '\0') 159 return (NIL); 160 cre = malloc (4 * strlen(re) + 3); 161 ccre = cre; 162 ure = re; 163 164 /* start the conversion with a \a */ 165 *cre = META | OPT; 166 MSYM(cre) = 'a'; 167 ccre = MNEXT(cre); 168 169 /* start the conversion (its recursive) */ 170 expconv (); 171 *ccre = 0; 172 return (cre); 173 } 174 175 static void 176 expconv() 177 { 178 register char *cs; /* pointer to current symbol in converted exp */ 179 register char c; /* character being processed */ 180 register char *acs; /* pinter to last alternate */ 181 register int temp; 182 183 /* let the conversion begin */ 184 acs = NIL; 185 cs = NIL; 186 while (*ure != NIL) { 187 switch (c = *ure++) { 188 189 case '\\': 190 switch (c = *ure++) { 191 192 /* escaped characters are just characters */ 193 default: 194 if (cs == NIL || (*cs & STR) == 0) { 195 cs = ccre; 196 *cs = STR; 197 SCNT(cs) = 1; 198 ccre += 2; 199 } else 200 SCNT(cs)++; 201 *ccre++ = c; 202 break; 203 204 /* normal(?) metacharacters */ 205 case 'a': 206 case 'd': 207 case 'e': 208 case 'p': 209 if (acs != NIL && acs != cs) { 210 do { 211 temp = OCNT(acs); 212 OCNT(acs) = ccre - acs; 213 acs -= temp; 214 } while (temp != 0); 215 acs = NIL; 216 } 217 cs = ccre; 218 *cs = META; 219 MSYM(cs) = c; 220 ccre = MNEXT(cs); 221 break; 222 } 223 break; 224 225 /* just put the symbol in */ 226 case '^': 227 case '$': 228 if (acs != NIL && acs != cs) { 229 do { 230 temp = OCNT(acs); 231 OCNT(acs) = ccre - acs; 232 acs -= temp; 233 } while (temp != 0); 234 acs = NIL; 235 } 236 cs = ccre; 237 *cs = META; 238 MSYM(cs) = c; 239 ccre = MNEXT(cs); 240 break; 241 242 /* mark the last match sequence as optional */ 243 case '?': 244 if (cs) 245 *cs = *cs | OPT; 246 break; 247 248 /* recurse and define a subexpression */ 249 case '(': 250 if (acs != NIL && acs != cs) { 251 do { 252 temp = OCNT(acs); 253 OCNT(acs) = ccre - acs; 254 acs -= temp; 255 } while (temp != 0); 256 acs = NIL; 257 } 258 cs = ccre; 259 *cs = OPER; 260 OSYM(cs) = '('; 261 ccre = ONEXT(cs); 262 expconv (); 263 OCNT(cs) = ccre - cs; /* offset to next symbol */ 264 break; 265 266 /* return from a recursion */ 267 case ')': 268 if (acs != NIL) { 269 do { 270 temp = OCNT(acs); 271 OCNT(acs) = ccre - acs; 272 acs -= temp; 273 } while (temp != 0); 274 acs = NIL; 275 } 276 cs = ccre; 277 *cs = META; 278 MSYM(cs) = c; 279 ccre = MNEXT(cs); 280 return; 281 282 /* mark the last match sequence as having an alternate */ 283 /* the third byte will contain an offset to jump over the */ 284 /* alternate match in case the first did not fail */ 285 case '|': 286 if (acs != NIL && acs != cs) 287 OCNT(ccre) = ccre - acs; /* make a back pointer */ 288 else 289 OCNT(ccre) = 0; 290 *cs |= ALT; 291 cs = ccre; 292 *cs = OPER; 293 OSYM(cs) = '|'; 294 ccre = ONEXT(cs); 295 acs = cs; /* remember that the pointer is to be filles */ 296 break; 297 298 /* if its not a metasymbol just build a scharacter string */ 299 default: 300 if (cs == NIL || (*cs & STR) == 0) { 301 cs = ccre; 302 *cs = STR; 303 SCNT(cs) = 1; 304 ccre = SSTR(cs); 305 } else 306 SCNT(cs)++; 307 *ccre++ = c; 308 break; 309 } 310 } 311 if (acs != NIL) { 312 do { 313 temp = OCNT(acs); 314 OCNT(acs) = ccre - acs; 315 acs -= temp; 316 } while (temp != 0); 317 acs = NIL; 318 } 319 return; 320 } 321 /* end of convertre */ 322 323 324 /* 325 * The following routine recognises an irregular expresion 326 * with the following special characters: 327 * 328 * \? - means last match was optional 329 * \a - matches any number of characters 330 * \d - matches any number of spaces and tabs 331 * \p - matches any number of alphanumeric 332 * characters. The 333 * characters matched will be copied into 334 * the area pointed to by 'name'. 335 * \| - alternation 336 * \( \) - grouping used mostly for alternation and 337 * optionality 338 * 339 * The irregular expression must be translated to internal form 340 * prior to calling this routine 341 * 342 * The value returned is the pointer to the first non \a 343 * character matched. 344 */ 345 346 char * 347 expmatch (s, re, mstring) 348 register char *s; /* string to check for a match in */ 349 register char *re; /* a converted irregular expression */ 350 register char *mstring; /* where to put whatever matches a \p */ 351 { 352 register char *cs; /* the current symbol */ 353 register char *ptr,*s1; /* temporary pointer */ 354 boolean matched; /* a temporary boolean */ 355 356 /* initial conditions */ 357 if (re == NIL) 358 return (NIL); 359 cs = re; 360 matched = FALSE; 361 362 /* loop till expression string is exhausted (or at least pretty tired) */ 363 while (*cs) { 364 switch (*cs & (OPER | STR | META)) { 365 366 /* try to match a string */ 367 case STR: 368 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 369 if (matched) { 370 371 /* hoorah it matches */ 372 s += SCNT(cs); 373 cs = SNEXT(cs); 374 } else if (*cs & ALT) { 375 376 /* alternation, skip to next expression */ 377 cs = SNEXT(cs); 378 } else if (*cs & OPT) { 379 380 /* the match is optional */ 381 cs = SNEXT(cs); 382 matched = 1; /* indicate a successful match */ 383 } else { 384 385 /* no match, error return */ 386 return (NIL); 387 } 388 break; 389 390 /* an operator, do something fancy */ 391 case OPER: 392 switch (OSYM(cs)) { 393 394 /* this is an alternation */ 395 case '|': 396 if (matched) 397 398 /* last thing in the alternation was a match, skip ahead */ 399 cs = OPTR(cs); 400 else 401 402 /* no match, keep trying */ 403 cs = ONEXT(cs); 404 break; 405 406 /* this is a grouping, recurse */ 407 case '(': 408 ptr = expmatch (s, ONEXT(cs), mstring); 409 if (ptr != NIL) { 410 411 /* the subexpression matched */ 412 matched = 1; 413 s = ptr; 414 } else if (*cs & ALT) { 415 416 /* alternation, skip to next expression */ 417 matched = 0; 418 } else if (*cs & OPT) { 419 420 /* the match is optional */ 421 matched = 1; /* indicate a successful match */ 422 } else { 423 424 /* no match, error return */ 425 return (NIL); 426 } 427 cs = OPTR(cs); 428 break; 429 } 430 break; 431 432 /* try to match a metasymbol */ 433 case META: 434 switch (MSYM(cs)) { 435 436 /* try to match anything and remember what was matched */ 437 case 'p': 438 /* 439 * This is really the same as trying the match the 440 * remaining parts of the expression to any subset 441 * of the string. 442 */ 443 s1 = s; 444 do { 445 ptr = expmatch (s1, MNEXT(cs), mstring); 446 if (ptr != NIL && s1 != s) { 447 448 /* we have a match, remember the match */ 449 strncpy (mstring, s, s1 - s); 450 mstring[s1 - s] = '\0'; 451 return (ptr); 452 } else if (ptr != NIL && (*cs & OPT)) { 453 454 /* it was aoptional so no match is ok */ 455 return (ptr); 456 } else if (ptr != NIL) { 457 458 /* not optional and we still matched */ 459 return (NIL); 460 } 461 if (!(isalnum(*s1) || *s1 == '_' || 462 /* C++ destructor */ 463 *s1 == '~' || 464 /* C++ scope operator */ 465 (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' && 466 (s1++, TRUE)))) 467 return (NIL); 468 if (*s1 == '\\') 469 _escaped = _escaped ? FALSE : TRUE; 470 else 471 _escaped = FALSE; 472 } while (*s1++); 473 return (NIL); 474 475 /* try to match anything */ 476 case 'a': 477 /* 478 * This is really the same as trying the match the 479 * remaining parts of the expression to any subset 480 * of the string. 481 */ 482 s1 = s; 483 do { 484 ptr = expmatch (s1, MNEXT(cs), mstring); 485 if (ptr != NIL && s1 != s) { 486 487 /* we have a match */ 488 return (ptr); 489 } else if (ptr != NIL && (*cs & OPT)) { 490 491 /* it was aoptional so no match is ok */ 492 return (ptr); 493 } else if (ptr != NIL) { 494 495 /* not optional and we still matched */ 496 return (NIL); 497 } 498 if (*s1 == '\\') 499 _escaped = _escaped ? FALSE : TRUE; 500 else 501 _escaped = FALSE; 502 } while (*s1++); 503 return (NIL); 504 505 /* fail if we are currently _escaped */ 506 case 'e': 507 if (_escaped) 508 return(NIL); 509 cs = MNEXT(cs); 510 break; 511 512 /* match any number of tabs and spaces */ 513 case 'd': 514 ptr = s; 515 while (*s == ' ' || *s == '\t') 516 s++; 517 if (s != ptr || s == s_start) { 518 519 /* match, be happy */ 520 matched = 1; 521 cs = MNEXT(cs); 522 } else if (*s == '\n' || *s == '\0') { 523 524 /* match, be happy */ 525 matched = 1; 526 cs = MNEXT(cs); 527 } else if (*cs & ALT) { 528 529 /* try the next part */ 530 matched = 0; 531 cs = MNEXT(cs); 532 } else if (*cs & OPT) { 533 534 /* doesn't matter */ 535 matched = 1; 536 cs = MNEXT(cs); 537 } else 538 539 /* no match, error return */ 540 return (NIL); 541 break; 542 543 /* check for end of line */ 544 case '$': 545 if (*s == '\0' || *s == '\n') { 546 547 /* match, be happy */ 548 s++; 549 matched = 1; 550 cs = MNEXT(cs); 551 } else if (*cs & ALT) { 552 553 /* try the next part */ 554 matched = 0; 555 cs = MNEXT(cs); 556 } else if (*cs & OPT) { 557 558 /* doesn't matter */ 559 matched = 1; 560 cs = MNEXT(cs); 561 } else 562 563 /* no match, error return */ 564 return (NIL); 565 break; 566 567 /* check for start of line */ 568 case '^': 569 if (s == s_start) { 570 571 /* match, be happy */ 572 matched = 1; 573 cs = MNEXT(cs); 574 } else if (*cs & ALT) { 575 576 /* try the next part */ 577 matched = 0; 578 cs = MNEXT(cs); 579 } else if (*cs & OPT) { 580 581 /* doesn't matter */ 582 matched = 1; 583 cs = MNEXT(cs); 584 } else 585 586 /* no match, error return */ 587 return (NIL); 588 break; 589 590 /* end of a subexpression, return success */ 591 case ')': 592 return (s); 593 } 594 break; 595 } 596 } 597 return (s); 598 } 599