1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1980, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 34 #ifndef lint 35 static const char copyright[] = 36 "@(#) Copyright (c) 1980, 1993\n\ 37 The Regents of the University of California. All rights reserved.\n"; 38 #endif 39 40 #ifndef lint 41 static const char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93"; 42 #endif 43 44 #include <ctype.h> 45 #include <stdlib.h> 46 #include <stdbool.h> 47 #include <string.h> 48 49 #include "extern.h" 50 51 static void expconv(void); 52 53 bool _escaped; /* true if we are currently x_escaped */ 54 char *s_start; /* start of string */ 55 bool l_onecase; /* true if upper and lower equivalent */ 56 57 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 58 59 /* STRNCMP - like strncmp except that we convert the 60 * first string to lower case before comparing 61 * if l_onecase is set. 62 */ 63 64 int 65 STRNCMP(register char *s1, register char *s2, register int len) 66 { 67 if (l_onecase) { 68 do 69 if (*s2 - makelower(*s1)) 70 return (*s2 - makelower(*s1)); 71 else { 72 s2++; 73 s1++; 74 } 75 while (--len); 76 } else { 77 do 78 if (*s2 - *s1) 79 return (*s2 - *s1); 80 else { 81 s2++; 82 s1++; 83 } 84 while (--len); 85 } 86 return(0); 87 } 88 89 /* The following routine converts an irregular expression to 90 * internal format. 91 * 92 * Either meta symbols (\a \d or \p) or character strings or 93 * operations ( alternation or parenthesizing ) can be 94 * specified. Each starts with a descriptor byte. The descriptor 95 * byte has STR set for strings, META set for meta symbols 96 * and OPER set for operations. 97 * The descriptor byte can also have the OPT bit set if the object 98 * defined is optional. Also ALT can be set to indicate an alternation. 99 * 100 * For metasymbols the byte following the descriptor byte identities 101 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 102 * strings the byte after the descriptor is a character count for 103 * the string: 104 * 105 * meta symbols := descriptor 106 * symbol 107 * 108 * strings := descriptor 109 * character count 110 * the string 111 * 112 * operations := descriptor 113 * symbol 114 * character count 115 */ 116 117 /* 118 * handy macros for accessing parts of match blocks 119 */ 120 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 121 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 122 123 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 124 #define OCNT(A) (*(A+2)) /* character count */ 125 #define ONEXT(A) (A+3) /* next character after the operation */ 126 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 127 128 #define SCNT(A) (*(A+1)) /* byte count of a string */ 129 #define SSTR(A) (A+2) /* address of the string */ 130 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 131 132 /* 133 * bit flags in the descriptor 134 */ 135 #define OPT 1 136 #define STR 2 137 #define META 4 138 #define ALT 8 139 #define OPER 16 140 141 static char *ccre; /* pointer to current position in converted exp*/ 142 static char *ure; /* pointer current position in unconverted exp */ 143 144 /* re: unconverted irregular expression */ 145 char * 146 convexp(char *re) 147 { 148 register char *cre; /* pointer to converted regular expression */ 149 150 /* allocate room for the converted expression */ 151 if (re == NULL) 152 return (NULL); 153 if (*re == '\0') 154 return (NULL); 155 cre = malloc(4 * strlen(re) + 3); 156 ccre = cre; 157 ure = re; 158 159 /* start the conversion with a \a */ 160 *cre = META | OPT; 161 MSYM(cre) = 'a'; 162 ccre = MNEXT(cre); 163 164 /* start the conversion (its recursive) */ 165 expconv (); 166 *ccre = 0; 167 return (cre); 168 } 169 170 static void 171 expconv() 172 { 173 register char *cs; /* pointer to current symbol in converted exp */ 174 register char c; /* character being processed */ 175 register char *acs; /* pinter to last alternate */ 176 register int temp; 177 178 /* let the conversion begin */ 179 acs = NULL; 180 cs = NULL; 181 while (*ure) { 182 switch (c = *ure++) { 183 184 case '\\': 185 switch (c = *ure++) { 186 187 /* escaped characters are just characters */ 188 default: 189 if (cs == NULL || (*cs & STR) == 0) { 190 cs = ccre; 191 *cs = STR; 192 SCNT(cs) = 1; 193 ccre += 2; 194 } else 195 SCNT(cs)++; 196 *ccre++ = c; 197 break; 198 199 /* normal(?) metacharacters */ 200 case 'a': 201 case 'd': 202 case 'e': 203 case 'p': 204 if (acs != NULL && acs != cs) { 205 do { 206 temp = OCNT(acs); 207 OCNT(acs) = ccre - acs; 208 acs -= temp; 209 } while (temp != 0); 210 acs = NULL; 211 } 212 cs = ccre; 213 *cs = META; 214 MSYM(cs) = c; 215 ccre = MNEXT(cs); 216 break; 217 } 218 break; 219 220 /* just put the symbol in */ 221 case '^': 222 case '$': 223 if (acs != NULL && acs != cs) { 224 do { 225 temp = OCNT(acs); 226 OCNT(acs) = ccre - acs; 227 acs -= temp; 228 } while (temp != 0); 229 acs = NULL; 230 } 231 cs = ccre; 232 *cs = META; 233 MSYM(cs) = c; 234 ccre = MNEXT(cs); 235 break; 236 237 /* mark the last match sequence as optional */ 238 case '?': 239 if (cs) 240 *cs = *cs | OPT; 241 break; 242 243 /* recurse and define a subexpression */ 244 case '(': 245 if (acs != NULL && acs != cs) { 246 do { 247 temp = OCNT(acs); 248 OCNT(acs) = ccre - acs; 249 acs -= temp; 250 } while (temp != 0); 251 acs = NULL; 252 } 253 cs = ccre; 254 *cs = OPER; 255 OSYM(cs) = '('; 256 ccre = ONEXT(cs); 257 expconv(); 258 OCNT(cs) = ccre - cs; /* offset to next symbol */ 259 break; 260 261 /* return from a recursion */ 262 case ')': 263 if (acs != NULL) { 264 do { 265 temp = OCNT(acs); 266 OCNT(acs) = ccre - acs; 267 acs -= temp; 268 } while (temp != 0); 269 acs = NULL; 270 } 271 cs = ccre; 272 *cs = META; 273 MSYM(cs) = c; 274 ccre = MNEXT(cs); 275 return; 276 277 /* mark the last match sequence as having an alternate */ 278 /* the third byte will contain an offset to jump over the */ 279 /* alternate match in case the first did not fail */ 280 case '|': 281 if (acs != NULL && acs != cs) 282 OCNT(ccre) = ccre - acs; /* make a back pointer */ 283 else 284 OCNT(ccre) = 0; 285 *cs |= ALT; 286 cs = ccre; 287 *cs = OPER; 288 OSYM(cs) = '|'; 289 ccre = ONEXT(cs); 290 acs = cs; /* remember that the pointer is to be filles */ 291 break; 292 293 /* if its not a metasymbol just build a scharacter string */ 294 default: 295 if (cs == NULL || (*cs & STR) == 0) { 296 cs = ccre; 297 *cs = STR; 298 SCNT(cs) = 1; 299 ccre = SSTR(cs); 300 } else 301 SCNT(cs)++; 302 *ccre++ = c; 303 break; 304 } 305 } 306 if (acs != NULL) { 307 do { 308 temp = OCNT(acs); 309 OCNT(acs) = ccre - acs; 310 acs -= temp; 311 } while (temp != 0); 312 acs = NULL; 313 } 314 return; 315 } 316 /* end of convertre */ 317 318 319 /* 320 * The following routine recognises an irregular expression 321 * with the following special characters: 322 * 323 * \? - means last match was optional 324 * \a - matches any number of characters 325 * \d - matches any number of spaces and tabs 326 * \p - matches any number of alphanumeric 327 * characters. The 328 * characters matched will be copied into 329 * the area pointed to by 'name'. 330 * \| - alternation 331 * \( \) - grouping used mostly for alternation and 332 * optionality 333 * 334 * The irregular expression must be translated to internal form 335 * prior to calling this routine 336 * 337 * The value returned is the pointer to the first non \a 338 * character matched. 339 */ 340 341 /* 342 * s: string to check for a match in 343 * re: a converted irregular expression 344 * mstring: where to put whatever matches a \p 345 */ 346 char * 347 expmatch (register char *s, register char *re, register char *mstring) 348 { 349 register char *cs; /* the current symbol */ 350 register char *ptr,*s1; /* temporary pointer */ 351 bool matched; /* a temporary bool */ 352 353 /* initial conditions */ 354 if (re == NULL) 355 return (NULL); 356 cs = re; 357 matched = false; 358 359 /* loop till expression string is exhausted (or at least pretty tired) */ 360 while (*cs) { 361 switch (*cs & (OPER | STR | META)) { 362 363 /* try to match a string */ 364 case STR: 365 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 366 if (matched) { 367 368 /* hoorah it matches */ 369 s += SCNT(cs); 370 cs = SNEXT(cs); 371 } else if (*cs & ALT) { 372 373 /* alternation, skip to next expression */ 374 cs = SNEXT(cs); 375 } else if (*cs & OPT) { 376 377 /* the match is optional */ 378 cs = SNEXT(cs); 379 matched = 1; /* indicate a successful match */ 380 } else { 381 382 /* no match, error return */ 383 return (NULL); 384 } 385 break; 386 387 /* an operator, do something fancy */ 388 case OPER: 389 switch (OSYM(cs)) { 390 391 /* this is an alternation */ 392 case '|': 393 if (matched) 394 395 /* last thing in the alternation was a match, skip ahead */ 396 cs = OPTR(cs); 397 else 398 399 /* no match, keep trying */ 400 cs = ONEXT(cs); 401 break; 402 403 /* this is a grouping, recurse */ 404 case '(': 405 ptr = expmatch(s, ONEXT(cs), mstring); 406 if (ptr != NULL) { 407 408 /* the subexpression matched */ 409 matched = 1; 410 s = ptr; 411 } else if (*cs & ALT) { 412 413 /* alternation, skip to next expression */ 414 matched = 0; 415 } else if (*cs & OPT) { 416 417 /* the match is optional */ 418 matched = 1; /* indicate a successful match */ 419 } else { 420 421 /* no match, error return */ 422 return (NULL); 423 } 424 cs = OPTR(cs); 425 break; 426 } 427 break; 428 429 /* try to match a metasymbol */ 430 case META: 431 switch (MSYM(cs)) { 432 433 /* try to match anything and remember what was matched */ 434 case 'p': 435 /* 436 * This is really the same as trying the match the 437 * remaining parts of the expression to any subset 438 * of the string. 439 */ 440 s1 = s; 441 do { 442 ptr = expmatch(s1, MNEXT(cs), mstring); 443 if (ptr != NULL && s1 != s) { 444 445 /* we have a match, remember the match */ 446 strncpy (mstring, s, s1 - s); 447 mstring[s1 - s] = '\0'; 448 return (ptr); 449 } else if (ptr != NULL && (*cs & OPT)) { 450 451 /* it was aoptional so no match is ok */ 452 return (ptr); 453 } else if (ptr != NULL) { 454 455 /* not optional and we still matched */ 456 return (NULL); 457 } 458 if (!(isalnum(*s1) || *s1 == '_' || 459 /* C++ destructor */ 460 *s1 == '~' || 461 /* C++ scope operator */ 462 (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' && 463 (s1++, true)))) 464 return (NULL); 465 if (*s1 == '\\') 466 _escaped = _escaped ? false : true; 467 else 468 _escaped = false; 469 } while (*s1++); 470 return (NULL); 471 472 /* try to match anything */ 473 case 'a': 474 /* 475 * This is really the same as trying the match the 476 * remaining parts of the expression to any subset 477 * of the string. 478 */ 479 s1 = s; 480 do { 481 ptr = expmatch(s1, MNEXT(cs), mstring); 482 if (ptr != NULL && s1 != s) { 483 484 /* we have a match */ 485 return (ptr); 486 } else if (ptr != NULL && (*cs & OPT)) { 487 488 /* it was aoptional so no match is ok */ 489 return (ptr); 490 } else if (ptr != NULL) { 491 492 /* not optional and we still matched */ 493 return (NULL); 494 } 495 if (*s1 == '\\') 496 _escaped = _escaped ? false : true; 497 else 498 _escaped = false; 499 } while (*s1++); 500 return (NULL); 501 502 /* fail if we are currently _escaped */ 503 case 'e': 504 if (_escaped) 505 return(NULL); 506 cs = MNEXT(cs); 507 break; 508 509 /* match any number of tabs and spaces */ 510 case 'd': 511 ptr = s; 512 while (*s == ' ' || *s == '\t') 513 s++; 514 if (s != ptr || s == s_start) { 515 516 /* match, be happy */ 517 matched = 1; 518 cs = MNEXT(cs); 519 } else if (*s == '\n' || *s == '\0') { 520 521 /* match, be happy */ 522 matched = 1; 523 cs = MNEXT(cs); 524 } else if (*cs & ALT) { 525 526 /* try the next part */ 527 matched = 0; 528 cs = MNEXT(cs); 529 } else if (*cs & OPT) { 530 531 /* doesn't matter */ 532 matched = 1; 533 cs = MNEXT(cs); 534 } else 535 536 /* no match, error return */ 537 return (NULL); 538 break; 539 540 /* check for end of line */ 541 case '$': 542 if (*s == '\0' || *s == '\n') { 543 544 /* match, be happy */ 545 s++; 546 matched = 1; 547 cs = MNEXT(cs); 548 } else if (*cs & ALT) { 549 550 /* try the next part */ 551 matched = 0; 552 cs = MNEXT(cs); 553 } else if (*cs & OPT) { 554 555 /* doesn't matter */ 556 matched = 1; 557 cs = MNEXT(cs); 558 } else 559 560 /* no match, error return */ 561 return (NULL); 562 break; 563 564 /* check for start of line */ 565 case '^': 566 if (s == s_start) { 567 568 /* match, be happy */ 569 matched = 1; 570 cs = MNEXT(cs); 571 } else if (*cs & ALT) { 572 573 /* try the next part */ 574 matched = 0; 575 cs = MNEXT(cs); 576 } else if (*cs & OPT) { 577 578 /* doesn't matter */ 579 matched = 1; 580 cs = MNEXT(cs); 581 } else 582 583 /* no match, error return */ 584 return (NULL); 585 break; 586 587 /* end of a subexpression, return success */ 588 case ')': 589 return (s); 590 } 591 break; 592 } 593 } 594 return (s); 595 } 596