1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1980, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 #include <sys/cdefs.h> 34 35 __FBSDID("$FreeBSD$"); 36 37 #ifndef lint 38 static const char copyright[] = 39 "@(#) Copyright (c) 1980, 1993\n\ 40 The Regents of the University of California. All rights reserved.\n"; 41 #endif 42 43 #ifndef lint 44 static const char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93"; 45 #endif 46 47 #include <ctype.h> 48 #include <stdlib.h> 49 #include <stdbool.h> 50 #include <string.h> 51 52 #include "extern.h" 53 54 static void expconv(void); 55 56 bool _escaped; /* true if we are currently x_escaped */ 57 char *s_start; /* start of string */ 58 bool l_onecase; /* true if upper and lower equivalent */ 59 60 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 61 62 /* STRNCMP - like strncmp except that we convert the 63 * first string to lower case before comparing 64 * if l_onecase is set. 65 */ 66 67 int 68 STRNCMP(register char *s1, register char *s2, register int len) 69 { 70 if (l_onecase) { 71 do 72 if (*s2 - makelower(*s1)) 73 return (*s2 - makelower(*s1)); 74 else { 75 s2++; 76 s1++; 77 } 78 while (--len); 79 } else { 80 do 81 if (*s2 - *s1) 82 return (*s2 - *s1); 83 else { 84 s2++; 85 s1++; 86 } 87 while (--len); 88 } 89 return(0); 90 } 91 92 /* The following routine converts an irregular expression to 93 * internal format. 94 * 95 * Either meta symbols (\a \d or \p) or character strings or 96 * operations ( alternation or parenthesizing ) can be 97 * specified. Each starts with a descriptor byte. The descriptor 98 * byte has STR set for strings, META set for meta symbols 99 * and OPER set for operations. 100 * The descriptor byte can also have the OPT bit set if the object 101 * defined is optional. Also ALT can be set to indicate an alternation. 102 * 103 * For metasymbols the byte following the descriptor byte identities 104 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 105 * strings the byte after the descriptor is a character count for 106 * the string: 107 * 108 * meta symbols := descriptor 109 * symbol 110 * 111 * strings := descriptor 112 * character count 113 * the string 114 * 115 * operations := descriptor 116 * symbol 117 * character count 118 */ 119 120 /* 121 * handy macros for accessing parts of match blocks 122 */ 123 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 124 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 125 126 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 127 #define OCNT(A) (*(A+2)) /* character count */ 128 #define ONEXT(A) (A+3) /* next character after the operation */ 129 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 130 131 #define SCNT(A) (*(A+1)) /* byte count of a string */ 132 #define SSTR(A) (A+2) /* address of the string */ 133 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 134 135 /* 136 * bit flags in the descriptor 137 */ 138 #define OPT 1 139 #define STR 2 140 #define META 4 141 #define ALT 8 142 #define OPER 16 143 144 static char *ccre; /* pointer to current position in converted exp*/ 145 static char *ure; /* pointer current position in unconverted exp */ 146 147 /* re: unconverted irregular expression */ 148 char * 149 convexp(char *re) 150 { 151 register char *cre; /* pointer to converted regular expression */ 152 153 /* allocate room for the converted expression */ 154 if (re == NULL) 155 return (NULL); 156 if (*re == '\0') 157 return (NULL); 158 cre = malloc(4 * strlen(re) + 3); 159 ccre = cre; 160 ure = re; 161 162 /* start the conversion with a \a */ 163 *cre = META | OPT; 164 MSYM(cre) = 'a'; 165 ccre = MNEXT(cre); 166 167 /* start the conversion (its recursive) */ 168 expconv (); 169 *ccre = 0; 170 return (cre); 171 } 172 173 static void 174 expconv() 175 { 176 register char *cs; /* pointer to current symbol in converted exp */ 177 register char c; /* character being processed */ 178 register char *acs; /* pinter to last alternate */ 179 register int temp; 180 181 /* let the conversion begin */ 182 acs = NULL; 183 cs = NULL; 184 while (*ure) { 185 switch (c = *ure++) { 186 187 case '\\': 188 switch (c = *ure++) { 189 190 /* escaped characters are just characters */ 191 default: 192 if (cs == NULL || (*cs & STR) == 0) { 193 cs = ccre; 194 *cs = STR; 195 SCNT(cs) = 1; 196 ccre += 2; 197 } else 198 SCNT(cs)++; 199 *ccre++ = c; 200 break; 201 202 /* normal(?) metacharacters */ 203 case 'a': 204 case 'd': 205 case 'e': 206 case 'p': 207 if (acs != NULL && acs != cs) { 208 do { 209 temp = OCNT(acs); 210 OCNT(acs) = ccre - acs; 211 acs -= temp; 212 } while (temp != 0); 213 acs = NULL; 214 } 215 cs = ccre; 216 *cs = META; 217 MSYM(cs) = c; 218 ccre = MNEXT(cs); 219 break; 220 } 221 break; 222 223 /* just put the symbol in */ 224 case '^': 225 case '$': 226 if (acs != NULL && acs != cs) { 227 do { 228 temp = OCNT(acs); 229 OCNT(acs) = ccre - acs; 230 acs -= temp; 231 } while (temp != 0); 232 acs = NULL; 233 } 234 cs = ccre; 235 *cs = META; 236 MSYM(cs) = c; 237 ccre = MNEXT(cs); 238 break; 239 240 /* mark the last match sequence as optional */ 241 case '?': 242 if (cs) 243 *cs = *cs | OPT; 244 break; 245 246 /* recurse and define a subexpression */ 247 case '(': 248 if (acs != NULL && acs != cs) { 249 do { 250 temp = OCNT(acs); 251 OCNT(acs) = ccre - acs; 252 acs -= temp; 253 } while (temp != 0); 254 acs = NULL; 255 } 256 cs = ccre; 257 *cs = OPER; 258 OSYM(cs) = '('; 259 ccre = ONEXT(cs); 260 expconv(); 261 OCNT(cs) = ccre - cs; /* offset to next symbol */ 262 break; 263 264 /* return from a recursion */ 265 case ')': 266 if (acs != NULL) { 267 do { 268 temp = OCNT(acs); 269 OCNT(acs) = ccre - acs; 270 acs -= temp; 271 } while (temp != 0); 272 acs = NULL; 273 } 274 cs = ccre; 275 *cs = META; 276 MSYM(cs) = c; 277 ccre = MNEXT(cs); 278 return; 279 280 /* mark the last match sequence as having an alternate */ 281 /* the third byte will contain an offset to jump over the */ 282 /* alternate match in case the first did not fail */ 283 case '|': 284 if (acs != NULL && acs != cs) 285 OCNT(ccre) = ccre - acs; /* make a back pointer */ 286 else 287 OCNT(ccre) = 0; 288 *cs |= ALT; 289 cs = ccre; 290 *cs = OPER; 291 OSYM(cs) = '|'; 292 ccre = ONEXT(cs); 293 acs = cs; /* remember that the pointer is to be filles */ 294 break; 295 296 /* if its not a metasymbol just build a scharacter string */ 297 default: 298 if (cs == NULL || (*cs & STR) == 0) { 299 cs = ccre; 300 *cs = STR; 301 SCNT(cs) = 1; 302 ccre = SSTR(cs); 303 } else 304 SCNT(cs)++; 305 *ccre++ = c; 306 break; 307 } 308 } 309 if (acs != NULL) { 310 do { 311 temp = OCNT(acs); 312 OCNT(acs) = ccre - acs; 313 acs -= temp; 314 } while (temp != 0); 315 acs = NULL; 316 } 317 return; 318 } 319 /* end of convertre */ 320 321 322 /* 323 * The following routine recognises an irregular expression 324 * with the following special characters: 325 * 326 * \? - means last match was optional 327 * \a - matches any number of characters 328 * \d - matches any number of spaces and tabs 329 * \p - matches any number of alphanumeric 330 * characters. The 331 * characters matched will be copied into 332 * the area pointed to by 'name'. 333 * \| - alternation 334 * \( \) - grouping used mostly for alternation and 335 * optionality 336 * 337 * The irregular expression must be translated to internal form 338 * prior to calling this routine 339 * 340 * The value returned is the pointer to the first non \a 341 * character matched. 342 */ 343 344 /* 345 * s: string to check for a match in 346 * re: a converted irregular expression 347 * mstring: where to put whatever matches a \p 348 */ 349 char * 350 expmatch (register char *s, register char *re, register char *mstring) 351 { 352 register char *cs; /* the current symbol */ 353 register char *ptr,*s1; /* temporary pointer */ 354 bool matched; /* a temporary bool */ 355 356 /* initial conditions */ 357 if (re == NULL) 358 return (NULL); 359 cs = re; 360 matched = false; 361 362 /* loop till expression string is exhausted (or at least pretty tired) */ 363 while (*cs) { 364 switch (*cs & (OPER | STR | META)) { 365 366 /* try to match a string */ 367 case STR: 368 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 369 if (matched) { 370 371 /* hoorah it matches */ 372 s += SCNT(cs); 373 cs = SNEXT(cs); 374 } else if (*cs & ALT) { 375 376 /* alternation, skip to next expression */ 377 cs = SNEXT(cs); 378 } else if (*cs & OPT) { 379 380 /* the match is optional */ 381 cs = SNEXT(cs); 382 matched = 1; /* indicate a successful match */ 383 } else { 384 385 /* no match, error return */ 386 return (NULL); 387 } 388 break; 389 390 /* an operator, do something fancy */ 391 case OPER: 392 switch (OSYM(cs)) { 393 394 /* this is an alternation */ 395 case '|': 396 if (matched) 397 398 /* last thing in the alternation was a match, skip ahead */ 399 cs = OPTR(cs); 400 else 401 402 /* no match, keep trying */ 403 cs = ONEXT(cs); 404 break; 405 406 /* this is a grouping, recurse */ 407 case '(': 408 ptr = expmatch(s, ONEXT(cs), mstring); 409 if (ptr != NULL) { 410 411 /* the subexpression matched */ 412 matched = 1; 413 s = ptr; 414 } else if (*cs & ALT) { 415 416 /* alternation, skip to next expression */ 417 matched = 0; 418 } else if (*cs & OPT) { 419 420 /* the match is optional */ 421 matched = 1; /* indicate a successful match */ 422 } else { 423 424 /* no match, error return */ 425 return (NULL); 426 } 427 cs = OPTR(cs); 428 break; 429 } 430 break; 431 432 /* try to match a metasymbol */ 433 case META: 434 switch (MSYM(cs)) { 435 436 /* try to match anything and remember what was matched */ 437 case 'p': 438 /* 439 * This is really the same as trying the match the 440 * remaining parts of the expression to any subset 441 * of the string. 442 */ 443 s1 = s; 444 do { 445 ptr = expmatch(s1, MNEXT(cs), mstring); 446 if (ptr != NULL && s1 != s) { 447 448 /* we have a match, remember the match */ 449 strncpy (mstring, s, s1 - s); 450 mstring[s1 - s] = '\0'; 451 return (ptr); 452 } else if (ptr != NULL && (*cs & OPT)) { 453 454 /* it was aoptional so no match is ok */ 455 return (ptr); 456 } else if (ptr != NULL) { 457 458 /* not optional and we still matched */ 459 return (NULL); 460 } 461 if (!(isalnum(*s1) || *s1 == '_' || 462 /* C++ destructor */ 463 *s1 == '~' || 464 /* C++ scope operator */ 465 (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' && 466 (s1++, true)))) 467 return (NULL); 468 if (*s1 == '\\') 469 _escaped = _escaped ? false : true; 470 else 471 _escaped = false; 472 } while (*s1++); 473 return (NULL); 474 475 /* try to match anything */ 476 case 'a': 477 /* 478 * This is really the same as trying the match the 479 * remaining parts of the expression to any subset 480 * of the string. 481 */ 482 s1 = s; 483 do { 484 ptr = expmatch(s1, MNEXT(cs), mstring); 485 if (ptr != NULL && s1 != s) { 486 487 /* we have a match */ 488 return (ptr); 489 } else if (ptr != NULL && (*cs & OPT)) { 490 491 /* it was aoptional so no match is ok */ 492 return (ptr); 493 } else if (ptr != NULL) { 494 495 /* not optional and we still matched */ 496 return (NULL); 497 } 498 if (*s1 == '\\') 499 _escaped = _escaped ? false : true; 500 else 501 _escaped = false; 502 } while (*s1++); 503 return (NULL); 504 505 /* fail if we are currently _escaped */ 506 case 'e': 507 if (_escaped) 508 return(NULL); 509 cs = MNEXT(cs); 510 break; 511 512 /* match any number of tabs and spaces */ 513 case 'd': 514 ptr = s; 515 while (*s == ' ' || *s == '\t') 516 s++; 517 if (s != ptr || s == s_start) { 518 519 /* match, be happy */ 520 matched = 1; 521 cs = MNEXT(cs); 522 } else if (*s == '\n' || *s == '\0') { 523 524 /* match, be happy */ 525 matched = 1; 526 cs = MNEXT(cs); 527 } else if (*cs & ALT) { 528 529 /* try the next part */ 530 matched = 0; 531 cs = MNEXT(cs); 532 } else if (*cs & OPT) { 533 534 /* doesn't matter */ 535 matched = 1; 536 cs = MNEXT(cs); 537 } else 538 539 /* no match, error return */ 540 return (NULL); 541 break; 542 543 /* check for end of line */ 544 case '$': 545 if (*s == '\0' || *s == '\n') { 546 547 /* match, be happy */ 548 s++; 549 matched = 1; 550 cs = MNEXT(cs); 551 } else if (*cs & ALT) { 552 553 /* try the next part */ 554 matched = 0; 555 cs = MNEXT(cs); 556 } else if (*cs & OPT) { 557 558 /* doesn't matter */ 559 matched = 1; 560 cs = MNEXT(cs); 561 } else 562 563 /* no match, error return */ 564 return (NULL); 565 break; 566 567 /* check for start of line */ 568 case '^': 569 if (s == s_start) { 570 571 /* match, be happy */ 572 matched = 1; 573 cs = MNEXT(cs); 574 } else if (*cs & ALT) { 575 576 /* try the next part */ 577 matched = 0; 578 cs = MNEXT(cs); 579 } else if (*cs & OPT) { 580 581 /* doesn't matter */ 582 matched = 1; 583 cs = MNEXT(cs); 584 } else 585 586 /* no match, error return */ 587 return (NULL); 588 break; 589 590 /* end of a subexpression, return success */ 591 case ')': 592 return (s); 593 } 594 break; 595 } 596 } 597 return (s); 598 } 599