1 /* 2 * Copyright (c) 1980, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. All advertising materials mentioning features or use of this software 15 * must display the following acknowledgement: 16 * This product includes software developed by the University of 17 * California, Berkeley and its contributors. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #ifndef lint 36 static char copyright[] = 37 "@(#) Copyright (c) 1980, 1993\n\ 38 The Regents of the University of California. All rights reserved.\n"; 39 #endif /* not lint */ 40 41 #ifndef lint 42 static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93"; 43 #endif /* not lint */ 44 45 #include <ctype.h> 46 #include <stdlib.h> 47 #include <string.h> 48 #include "extern.h" 49 50 #define FALSE 0 51 #define TRUE !(FALSE) 52 #define NIL 0 53 54 static void expconv __P((void)); 55 56 boolean _escaped; /* true if we are currently _escaped */ 57 char *_start; /* start of string */ 58 boolean l_onecase; /* true if upper and lower equivalent */ 59 60 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 61 62 /* STRNCMP - like strncmp except that we convert the 63 * first string to lower case before comparing 64 * if l_onecase is set. 65 */ 66 67 int 68 STRNCMP(s1, s2, len) 69 register char *s1,*s2; 70 register int len; 71 { 72 if (l_onecase) { 73 do 74 if (*s2 - makelower(*s1)) 75 return (*s2 - makelower(*s1)); 76 else { 77 s2++; 78 s1++; 79 } 80 while (--len); 81 } else { 82 do 83 if (*s2 - *s1) 84 return (*s2 - *s1); 85 else { 86 s2++; 87 s1++; 88 } 89 while (--len); 90 } 91 return(0); 92 } 93 94 /* The following routine converts an irregular expression to 95 * internal format. 96 * 97 * Either meta symbols (\a \d or \p) or character strings or 98 * operations ( alternation or perenthesizing ) can be 99 * specified. Each starts with a descriptor byte. The descriptor 100 * byte has STR set for strings, META set for meta symbols 101 * and OPER set for operations. 102 * The descriptor byte can also have the OPT bit set if the object 103 * defined is optional. Also ALT can be set to indicate an alternation. 104 * 105 * For metasymbols the byte following the descriptor byte identities 106 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 107 * strings the byte after the descriptor is a character count for 108 * the string: 109 * 110 * meta symbols := descriptor 111 * symbol 112 * 113 * strings := descriptor 114 * character count 115 * the string 116 * 117 * operatins := descriptor 118 * symbol 119 * character count 120 */ 121 122 /* 123 * handy macros for accessing parts of match blocks 124 */ 125 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 126 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 127 128 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 129 #define OCNT(A) (*(A+2)) /* character count */ 130 #define ONEXT(A) (A+3) /* next character after the operation */ 131 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 132 133 #define SCNT(A) (*(A+1)) /* byte count of a string */ 134 #define SSTR(A) (A+2) /* address of the string */ 135 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 136 137 /* 138 * bit flags in the descriptor 139 */ 140 #define OPT 1 141 #define STR 2 142 #define META 4 143 #define ALT 8 144 #define OPER 16 145 146 static char *ccre; /* pointer to current position in converted exp*/ 147 static char *ure; /* pointer current position in unconverted exp */ 148 149 char * 150 convexp(re) 151 char *re; /* unconverted irregular expression */ 152 { 153 register char *cre; /* pointer to converted regular expression */ 154 155 /* allocate room for the converted expression */ 156 if (re == NIL) 157 return (NIL); 158 if (*re == '\0') 159 return (NIL); 160 cre = malloc (4 * strlen(re) + 3); 161 ccre = cre; 162 ure = re; 163 164 /* start the conversion with a \a */ 165 *cre = META | OPT; 166 MSYM(cre) = 'a'; 167 ccre = MNEXT(cre); 168 169 /* start the conversion (its recursive) */ 170 expconv (); 171 *ccre = 0; 172 return (cre); 173 } 174 175 static void 176 expconv() 177 { 178 register char *cs; /* pointer to current symbol in converted exp */ 179 register char c; /* character being processed */ 180 register char *acs; /* pinter to last alternate */ 181 register int temp; 182 183 /* let the conversion begin */ 184 acs = NIL; 185 cs = NIL; 186 while (*ure != NIL) { 187 switch (c = *ure++) { 188 189 case '\\': 190 switch (c = *ure++) { 191 192 /* escaped characters are just characters */ 193 default: 194 if (cs == NIL || (*cs & STR) == 0) { 195 cs = ccre; 196 *cs = STR; 197 SCNT(cs) = 1; 198 ccre += 2; 199 } else 200 SCNT(cs)++; 201 *ccre++ = c; 202 break; 203 204 /* normal(?) metacharacters */ 205 case 'a': 206 case 'd': 207 case 'e': 208 case 'p': 209 if (acs != NIL && acs != cs) { 210 do { 211 temp = OCNT(acs); 212 OCNT(acs) = ccre - acs; 213 acs -= temp; 214 } while (temp != 0); 215 acs = NIL; 216 } 217 cs = ccre; 218 *cs = META; 219 MSYM(cs) = c; 220 ccre = MNEXT(cs); 221 break; 222 } 223 break; 224 225 /* just put the symbol in */ 226 case '^': 227 case '$': 228 if (acs != NIL && acs != cs) { 229 do { 230 temp = OCNT(acs); 231 OCNT(acs) = ccre - acs; 232 acs -= temp; 233 } while (temp != 0); 234 acs = NIL; 235 } 236 cs = ccre; 237 *cs = META; 238 MSYM(cs) = c; 239 ccre = MNEXT(cs); 240 break; 241 242 /* mark the last match sequence as optional */ 243 case '?': 244 if (cs) 245 *cs = *cs | OPT; 246 break; 247 248 /* recurse and define a subexpression */ 249 case '(': 250 if (acs != NIL && acs != cs) { 251 do { 252 temp = OCNT(acs); 253 OCNT(acs) = ccre - acs; 254 acs -= temp; 255 } while (temp != 0); 256 acs = NIL; 257 } 258 cs = ccre; 259 *cs = OPER; 260 OSYM(cs) = '('; 261 ccre = ONEXT(cs); 262 expconv (); 263 OCNT(cs) = ccre - cs; /* offset to next symbol */ 264 break; 265 266 /* reurn from a recursion */ 267 case ')': 268 if (acs != NIL) { 269 do { 270 temp = OCNT(acs); 271 OCNT(acs) = ccre - acs; 272 acs -= temp; 273 } while (temp != 0); 274 acs = NIL; 275 } 276 cs = ccre; 277 *cs = META; 278 MSYM(cs) = c; 279 ccre = MNEXT(cs); 280 return; 281 282 /* mark the last match sequence as having an alternate */ 283 /* the third byte will contain an offset to jump over the */ 284 /* alternate match in case the first did not fail */ 285 case '|': 286 if (acs != NIL && acs != cs) 287 OCNT(ccre) = ccre - acs; /* make a back pointer */ 288 else 289 OCNT(ccre) = 0; 290 *cs |= ALT; 291 cs = ccre; 292 *cs = OPER; 293 OSYM(cs) = '|'; 294 ccre = ONEXT(cs); 295 acs = cs; /* remember that the pointer is to be filles */ 296 break; 297 298 /* if its not a metasymbol just build a scharacter string */ 299 default: 300 if (cs == NIL || (*cs & STR) == 0) { 301 cs = ccre; 302 *cs = STR; 303 SCNT(cs) = 1; 304 ccre = SSTR(cs); 305 } else 306 SCNT(cs)++; 307 *ccre++ = c; 308 break; 309 } 310 } 311 if (acs != NIL) { 312 do { 313 temp = OCNT(acs); 314 OCNT(acs) = ccre - acs; 315 acs -= temp; 316 } while (temp != 0); 317 acs = NIL; 318 } 319 return; 320 } 321 /* end of convertre */ 322 323 324 /* 325 * The following routine recognises an irregular expresion 326 * with the following special characters: 327 * 328 * \? - means last match was optional 329 * \a - matches any number of characters 330 * \d - matches any number of spaces and tabs 331 * \p - matches any number of alphanumeric 332 * characters. The 333 * characters matched will be copied into 334 * the area pointed to by 'name'. 335 * \| - alternation 336 * \( \) - grouping used mostly for alternation and 337 * optionality 338 * 339 * The irregular expression must be translated to internal form 340 * prior to calling this routine 341 * 342 * The value returned is the pointer to the first non \a 343 * character matched. 344 */ 345 346 char * 347 expmatch (s, re, mstring) 348 register char *s; /* string to check for a match in */ 349 register char *re; /* a converted irregular expression */ 350 register char *mstring; /* where to put whatever matches a \p */ 351 { 352 register char *cs; /* the current symbol */ 353 register char *ptr,*s1; /* temporary pointer */ 354 boolean matched; /* a temporary boolean */ 355 356 /* initial conditions */ 357 if (re == NIL) 358 return (NIL); 359 cs = re; 360 matched = FALSE; 361 362 /* loop till expression string is exhausted (or at least pretty tired) */ 363 while (*cs) { 364 switch (*cs & (OPER | STR | META)) { 365 366 /* try to match a string */ 367 case STR: 368 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 369 if (matched) { 370 371 /* hoorah it matches */ 372 s += SCNT(cs); 373 cs = SNEXT(cs); 374 } else if (*cs & ALT) { 375 376 /* alternation, skip to next expression */ 377 cs = SNEXT(cs); 378 } else if (*cs & OPT) { 379 380 /* the match is optional */ 381 cs = SNEXT(cs); 382 matched = 1; /* indicate a successful match */ 383 } else { 384 385 /* no match, error return */ 386 return (NIL); 387 } 388 break; 389 390 /* an operator, do something fancy */ 391 case OPER: 392 switch (OSYM(cs)) { 393 394 /* this is an alternation */ 395 case '|': 396 if (matched) 397 398 /* last thing in the alternation was a match, skip ahead */ 399 cs = OPTR(cs); 400 else 401 402 /* no match, keep trying */ 403 cs = ONEXT(cs); 404 break; 405 406 /* this is a grouping, recurse */ 407 case '(': 408 ptr = expmatch (s, ONEXT(cs), mstring); 409 if (ptr != NIL) { 410 411 /* the subexpression matched */ 412 matched = 1; 413 s = ptr; 414 } else if (*cs & ALT) { 415 416 /* alternation, skip to next expression */ 417 matched = 0; 418 } else if (*cs & OPT) { 419 420 /* the match is optional */ 421 matched = 1; /* indicate a successful match */ 422 } else { 423 424 /* no match, error return */ 425 return (NIL); 426 } 427 cs = OPTR(cs); 428 break; 429 } 430 break; 431 432 /* try to match a metasymbol */ 433 case META: 434 switch (MSYM(cs)) { 435 436 /* try to match anything and remember what was matched */ 437 case 'p': 438 /* 439 * This is really the same as trying the match the 440 * remaining parts of the expression to any subset 441 * of the string. 442 */ 443 s1 = s; 444 do { 445 ptr = expmatch (s1, MNEXT(cs), mstring); 446 if (ptr != NIL && s1 != s) { 447 448 /* we have a match, remember the match */ 449 strncpy (mstring, s, s1 - s); 450 mstring[s1 - s] = '\0'; 451 return (ptr); 452 } else if (ptr != NIL && (*cs & OPT)) { 453 454 /* it was aoptional so no match is ok */ 455 return (ptr); 456 } else if (ptr != NIL) { 457 458 /* not optional and we still matched */ 459 return (NIL); 460 } 461 if (!isalnum(*s1) && *s1 != '_') 462 return (NIL); 463 if (*s1 == '\\') 464 _escaped = _escaped ? FALSE : TRUE; 465 else 466 _escaped = FALSE; 467 } while (*s1++); 468 return (NIL); 469 470 /* try to match anything */ 471 case 'a': 472 /* 473 * This is really the same as trying the match the 474 * remaining parts of the expression to any subset 475 * of the string. 476 */ 477 s1 = s; 478 do { 479 ptr = expmatch (s1, MNEXT(cs), mstring); 480 if (ptr != NIL && s1 != s) { 481 482 /* we have a match */ 483 return (ptr); 484 } else if (ptr != NIL && (*cs & OPT)) { 485 486 /* it was aoptional so no match is ok */ 487 return (ptr); 488 } else if (ptr != NIL) { 489 490 /* not optional and we still matched */ 491 return (NIL); 492 } 493 if (*s1 == '\\') 494 _escaped = _escaped ? FALSE : TRUE; 495 else 496 _escaped = FALSE; 497 } while (*s1++); 498 return (NIL); 499 500 /* fail if we are currently _escaped */ 501 case 'e': 502 if (_escaped) 503 return(NIL); 504 cs = MNEXT(cs); 505 break; 506 507 /* match any number of tabs and spaces */ 508 case 'd': 509 ptr = s; 510 while (*s == ' ' || *s == '\t') 511 s++; 512 if (s != ptr || s == _start) { 513 514 /* match, be happy */ 515 matched = 1; 516 cs = MNEXT(cs); 517 } else if (*s == '\n' || *s == '\0') { 518 519 /* match, be happy */ 520 matched = 1; 521 cs = MNEXT(cs); 522 } else if (*cs & ALT) { 523 524 /* try the next part */ 525 matched = 0; 526 cs = MNEXT(cs); 527 } else if (*cs & OPT) { 528 529 /* doesn't matter */ 530 matched = 1; 531 cs = MNEXT(cs); 532 } else 533 534 /* no match, error return */ 535 return (NIL); 536 break; 537 538 /* check for end of line */ 539 case '$': 540 if (*s == '\0' || *s == '\n') { 541 542 /* match, be happy */ 543 s++; 544 matched = 1; 545 cs = MNEXT(cs); 546 } else if (*cs & ALT) { 547 548 /* try the next part */ 549 matched = 0; 550 cs = MNEXT(cs); 551 } else if (*cs & OPT) { 552 553 /* doesn't matter */ 554 matched = 1; 555 cs = MNEXT(cs); 556 } else 557 558 /* no match, error return */ 559 return (NIL); 560 break; 561 562 /* check for start of line */ 563 case '^': 564 if (s == _start) { 565 566 /* match, be happy */ 567 matched = 1; 568 cs = MNEXT(cs); 569 } else if (*cs & ALT) { 570 571 /* try the next part */ 572 matched = 0; 573 cs = MNEXT(cs); 574 } else if (*cs & OPT) { 575 576 /* doesn't matter */ 577 matched = 1; 578 cs = MNEXT(cs); 579 } else 580 581 /* no match, error return */ 582 return (NIL); 583 break; 584 585 /* end of a subexpression, return success */ 586 case ')': 587 return (s); 588 } 589 break; 590 } 591 } 592 return (s); 593 } 594