1 /* 2 * Copyright (c) 1980 Regents of the University of California. 3 * All rights reserved. The Berkeley software License Agreement 4 * specifies the terms and conditions for redistribution. 5 */ 6 7 #include <ctype.h> 8 9 typedef int boolean; 10 #define TRUE 1 11 #define FALSE 0 12 #define NIL 0 13 14 extern boolean l_onecase; /* true if upper and lower equivalent */ 15 extern char *l_idchars; /* set of characters legal in identifiers 16 in addition to letters and digits */ 17 18 extern char *strchr(); 19 static void expconv(void); 20 21 #define isidchr(c) \ 22 (isalnum(c) || ((c) != NIL && strchr(l_idchars, (c)) != NIL)) 23 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 24 25 /* STRNCMP - like strncmp except that we convert the 26 * first string to lower case before comparing 27 * if l_onecase is set. 28 */ 29 30 int 31 STRNCMP(char *s1, char *s2, int len) 32 { 33 if (l_onecase) { 34 do 35 if (*s2 - makelower(*s1)) 36 return (*s2 - makelower(*s1)); 37 else { 38 s2++; 39 s1++; 40 } 41 while (--len); 42 } else { 43 do 44 if (*s2 - *s1) 45 return (*s2 - *s1); 46 else { 47 s2++; 48 s1++; 49 } 50 while (--len); 51 } 52 return(0); 53 } 54 55 /* The following routine converts an irregular expression to 56 * internal format. 57 * 58 * Either meta symbols (\a \d or \p) or character strings or 59 * operations ( alternation or parenthesizing ) can be 60 * specified. Each starts with a descriptor byte. The descriptor 61 * byte has STR set for strings, META set for meta symbols 62 * and OPER set for operations. 63 * The descriptor byte can also have the OPT bit set if the object 64 * defined is optional. Also ALT can be set to indicate an alternation. 65 * 66 * For metasymbols the byte following the descriptor byte identities 67 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 68 * strings the byte after the descriptor is a character count for 69 * the string: 70 * 71 * meta symbols := descriptor 72 * symbol 73 * 74 * strings := descriptor 75 * character count 76 * the string 77 * 78 * operations := descriptor 79 * symbol 80 * character count 81 */ 82 83 /* 84 * handy macros for accessing parts of match blocks 85 */ 86 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 87 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 88 89 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 90 #define OCNT(A) (*(A+2)) /* character count */ 91 #define ONEXT(A) (A+3) /* next character after the operation */ 92 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 93 94 #define SCNT(A) (*(A+1)) /* byte count of a string */ 95 #define SSTR(A) (A+2) /* address of the string */ 96 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 97 98 /* 99 * bit flags in the descriptor 100 */ 101 #define OPT 1 102 #define STR 2 103 #define META 4 104 #define ALT 8 105 #define OPER 16 106 107 char *ure; /* pointer current position in unconverted exp */ 108 char *ccre; /* pointer to current position in converted exp*/ 109 char *malloc(); 110 111 char * 112 convexp(char *re) 113 /* re - unconverted irregular expression */ 114 { 115 char *cre; /* pointer to converted regular expression */ 116 117 /* allocate room for the converted expression */ 118 if (re == NIL) 119 return (NIL); 120 if (*re == '\0') 121 return (NIL); 122 cre = malloc (4 * strlen(re) + 3); 123 ccre = cre; 124 ure = re; 125 126 /* start the conversion with a \a */ 127 *cre = META | OPT; 128 MSYM(cre) = 'a'; 129 ccre = MNEXT(cre); 130 131 /* start the conversion (its recursive) */ 132 expconv (); 133 *ccre = 0; 134 return (cre); 135 } 136 137 static void 138 expconv(void) 139 { 140 char *cs; /* pointer to current symbol in converted exp */ 141 char c; /* character being processed */ 142 char *acs; /* pinter to last alternate */ 143 int temp; 144 145 /* let the conversion begin */ 146 acs = NIL; 147 cs = NIL; 148 while (*ure != NIL) { 149 switch (c = *ure++) { 150 151 case '\\': 152 switch (c = *ure++) { 153 154 /* escaped characters are just characters */ 155 default: 156 if (cs == NIL || (*cs & STR) == 0) { 157 cs = ccre; 158 *cs = STR; 159 SCNT(cs) = 1; 160 ccre += 2; 161 } else { 162 SCNT(cs)++; 163 } 164 *ccre++ = c; 165 break; 166 167 /* normal(?) metacharacters */ 168 case 'a': 169 case 'd': 170 case 'e': 171 case 'p': 172 if (acs != NIL && acs != cs) { 173 do { 174 temp = OCNT(acs); 175 OCNT(acs) = ccre - acs; 176 acs -= temp; 177 } while (temp != 0); 178 acs = NIL; 179 } 180 cs = ccre; 181 *cs = META; 182 MSYM(cs) = c; 183 ccre = MNEXT(cs); 184 break; 185 } 186 break; 187 188 /* just put the symbol in */ 189 case '^': 190 case '$': 191 if (acs != NIL && acs != cs) { 192 do { 193 temp = OCNT(acs); 194 OCNT(acs) = ccre - acs; 195 acs -= temp; 196 } while (temp != 0); 197 acs = NIL; 198 } 199 cs = ccre; 200 *cs = META; 201 MSYM(cs) = c; 202 ccre = MNEXT(cs); 203 break; 204 205 /* mark the last match sequence as optional */ 206 case '?': 207 if (cs) 208 *cs = *cs | OPT; 209 break; 210 211 /* recurse and define a subexpression */ 212 case '(': 213 if (acs != NIL && acs != cs) { 214 do { 215 temp = OCNT(acs); 216 OCNT(acs) = ccre - acs; 217 acs -= temp; 218 } while (temp != 0); 219 acs = NIL; 220 } 221 cs = ccre; 222 *cs = OPER; 223 OSYM(cs) = '('; 224 ccre = ONEXT(cs); 225 expconv (); 226 OCNT(cs) = ccre - cs; /* offset to next symbol */ 227 break; 228 229 /* return from a recursion */ 230 case ')': 231 if (acs != NIL) { 232 do { 233 temp = OCNT(acs); 234 OCNT(acs) = ccre - acs; 235 acs -= temp; 236 } while (temp != 0); 237 acs = NIL; 238 } 239 cs = ccre; 240 *cs = META; 241 MSYM(cs) = c; 242 ccre = MNEXT(cs); 243 return; 244 245 /* mark the last match sequence as having an alternate */ 246 /* the third byte will contain an offset to jump over the */ 247 /* alternate match in case the first did not fail */ 248 case '|': 249 if (acs != NIL && acs != cs) 250 OCNT(ccre) = ccre - acs; /* make a back pointer */ 251 else 252 OCNT(ccre) = 0; 253 *cs |= ALT; 254 cs = ccre; 255 *cs = OPER; 256 OSYM(cs) = '|'; 257 ccre = ONEXT(cs); 258 acs = cs; /* remember that the pointer is to be filles */ 259 break; 260 261 /* if its not a metasymbol just build a scharacter string */ 262 default: 263 if (cs == NIL || (*cs & STR) == 0) { 264 cs = ccre; 265 *cs = STR; 266 SCNT(cs) = 1; 267 ccre = SSTR(cs); 268 } else 269 SCNT(cs)++; 270 *ccre++ = c; 271 break; 272 } 273 } 274 if (acs != NIL) { 275 do { 276 temp = OCNT(acs); 277 OCNT(acs) = ccre - acs; 278 acs -= temp; 279 } while (temp != 0); 280 acs = NIL; 281 } 282 } 283 /* end of convertre */ 284 285 286 /* 287 * The following routine recognises an irregular expresion 288 * with the following special characters: 289 * 290 * \? - means last match was optional 291 * \a - matches any number of characters 292 * \d - matches any number of spaces and tabs 293 * \p - matches any number of alphanumeric 294 * characters. The 295 * characters matched will be copied into 296 * the area pointed to by 'name'. 297 * \| - alternation 298 * \( \) - grouping used mostly for alternation and 299 * optionality 300 * 301 * The irregular expression must be translated to internal form 302 * prior to calling this routine 303 * 304 * The value returned is the pointer to the first non \a 305 * character matched. 306 */ 307 308 extern boolean _escaped; /* true if we are currently _escaped */ 309 extern char *Start; /* start of string */ 310 311 char * 312 expmatch(char *s, char *re, char *mstring) 313 /* s - string to check for a match in */ 314 /* re - a converted irregular expression */ 315 /* mstring - where to put whatever matches a \p */ 316 { 317 char *cs; /* the current symbol */ 318 char *ptr, *s1; /* temporary pointer */ 319 boolean matched; /* a temporary boolean */ 320 321 /* initial conditions */ 322 if (re == NIL) 323 return (NIL); 324 cs = re; 325 matched = FALSE; 326 327 /* loop till expression string is exhausted (or at least pretty tired) */ 328 while (*cs) { 329 switch (*cs & (OPER | STR | META)) { 330 331 /* try to match a string */ 332 case STR: 333 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 334 if (matched) { 335 336 /* hoorah it matches */ 337 s += SCNT(cs); 338 cs = SNEXT(cs); 339 } else if (*cs & ALT) { 340 341 /* alternation, skip to next expression */ 342 cs = SNEXT(cs); 343 } else if (*cs & OPT) { 344 345 /* the match is optional */ 346 cs = SNEXT(cs); 347 matched = 1; /* indicate a successful match */ 348 } else { 349 350 /* no match, error return */ 351 return (NIL); 352 } 353 break; 354 355 /* an operator, do something fancy */ 356 case OPER: 357 switch (OSYM(cs)) { 358 359 /* this is an alternation */ 360 case '|': 361 if (matched) 362 363 /* last thing in the alternation was a match, skip ahead */ 364 cs = OPTR(cs); 365 else 366 367 /* no match, keep trying */ 368 cs = ONEXT(cs); 369 break; 370 371 /* this is a grouping, recurse */ 372 case '(': 373 ptr = expmatch (s, ONEXT(cs), mstring); 374 if (ptr != NIL) { 375 376 /* the subexpression matched */ 377 matched = 1; 378 s = ptr; 379 } else if (*cs & ALT) { 380 381 /* alternation, skip to next expression */ 382 matched = 0; 383 } else if (*cs & OPT) { 384 385 /* the match is optional */ 386 matched = 1; /* indicate a successful match */ 387 } else { 388 389 /* no match, error return */ 390 return (NIL); 391 } 392 cs = OPTR(cs); 393 break; 394 } 395 break; 396 397 /* try to match a metasymbol */ 398 case META: 399 switch (MSYM(cs)) { 400 401 /* try to match anything and remember what was matched */ 402 case 'p': 403 /* 404 * This is really the same as trying the match the 405 * remaining parts of the expression to any subset 406 * of the string. 407 */ 408 s1 = s; 409 do { 410 ptr = expmatch (s1, MNEXT(cs), mstring); 411 if (ptr != NIL && s1 != s) { 412 413 /* we have a match, remember the match */ 414 strncpy (mstring, s, s1 - s); 415 mstring[s1 - s] = '\0'; 416 return (ptr); 417 } else if (ptr != NIL && (*cs & OPT)) { 418 419 /* it was aoptional so no match is ok */ 420 return (ptr); 421 } else if (ptr != NIL) { 422 423 /* not optional and we still matched */ 424 return (NIL); 425 } 426 if (!isidchr(*s1)) 427 return (NIL); 428 if (*s1 == '\\') 429 _escaped = _escaped ? FALSE : TRUE; 430 else 431 _escaped = FALSE; 432 } while (*s1++); 433 return (NIL); 434 435 /* try to match anything */ 436 case 'a': 437 /* 438 * This is really the same as trying the match the 439 * remaining parts of the expression to any subset 440 * of the string. 441 */ 442 s1 = s; 443 do { 444 ptr = expmatch (s1, MNEXT(cs), mstring); 445 if (ptr != NIL && s1 != s) { 446 447 /* we have a match */ 448 return (ptr); 449 } else if (ptr != NIL && (*cs & OPT)) { 450 451 /* it was aoptional so no match is ok */ 452 return (ptr); 453 } else if (ptr != NIL) { 454 455 /* not optional and we still matched */ 456 return (NIL); 457 } 458 if (*s1 == '\\') 459 _escaped = _escaped ? FALSE : TRUE; 460 else 461 _escaped = FALSE; 462 } while (*s1++); 463 return (NIL); 464 465 /* fail if we are currently _escaped */ 466 case 'e': 467 if (_escaped) 468 return(NIL); 469 cs = MNEXT(cs); 470 break; 471 472 /* match any number of tabs and spaces */ 473 case 'd': 474 ptr = s; 475 while (*s == ' ' || *s == '\t') 476 s++; 477 if (s != ptr || s == Start) { 478 479 /* match, be happy */ 480 matched = 1; 481 cs = MNEXT(cs); 482 } else if (*s == '\n' || *s == '\0') { 483 484 /* match, be happy */ 485 matched = 1; 486 cs = MNEXT(cs); 487 } else if (*cs & ALT) { 488 489 /* try the next part */ 490 matched = 0; 491 cs = MNEXT(cs); 492 } else if (*cs & OPT) { 493 494 /* doesn't matter */ 495 matched = 1; 496 cs = MNEXT(cs); 497 } else 498 499 /* no match, error return */ 500 return (NIL); 501 break; 502 503 /* check for end of line */ 504 case '$': 505 if (*s == '\0' || *s == '\n') { 506 507 /* match, be happy */ 508 s++; 509 matched = 1; 510 cs = MNEXT(cs); 511 } else if (*cs & ALT) { 512 513 /* try the next part */ 514 matched = 0; 515 cs = MNEXT(cs); 516 } else if (*cs & OPT) { 517 518 /* doesn't matter */ 519 matched = 1; 520 cs = MNEXT(cs); 521 } else 522 523 /* no match, error return */ 524 return (NIL); 525 break; 526 527 /* check for start of line */ 528 case '^': 529 if (s == Start) { 530 531 /* match, be happy */ 532 matched = 1; 533 cs = MNEXT(cs); 534 } else if (*cs & ALT) { 535 536 /* try the next part */ 537 matched = 0; 538 cs = MNEXT(cs); 539 } else if (*cs & OPT) { 540 541 /* doesn't matter */ 542 matched = 1; 543 cs = MNEXT(cs); 544 } else 545 546 /* no match, error return */ 547 return (NIL); 548 break; 549 550 /* end of a subexpression, return success */ 551 case ')': 552 return (s); 553 } 554 break; 555 } 556 } 557 return (s); 558 } 559