1 /* 2 * Copyright (c) 1980 Regents of the University of California. 3 * All rights reserved. The Berkeley software License Agreement 4 * specifies the terms and conditions for redistribution. 5 */ 6 7 #pragma ident "%Z%%M% %I% %E% SMI" 8 /* from UCB 5.1 (Berkeley) 6/5/85 */ 9 10 #include <ctype.h> 11 12 typedef int boolean; 13 #define TRUE 1 14 #define FALSE 0 15 #define NIL 0 16 17 extern boolean l_onecase; /* true if upper and lower equivalent */ 18 extern char *l_idchars; /* set of characters legal in identifiers 19 in addition to letters and digits */ 20 21 extern char *strchr(); 22 23 #define isidchr(c) \ 24 (isalnum(c) || ((c) != NIL && strchr(l_idchars, (c)) != NIL)) 25 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) 26 27 /* STRNCMP - like strncmp except that we convert the 28 * first string to lower case before comparing 29 * if l_onecase is set. 30 */ 31 32 STRNCMP(s1, s2, len) 33 register char *s1,*s2; 34 register int len; 35 { 36 if (l_onecase) { 37 do 38 if (*s2 - makelower(*s1)) 39 return (*s2 - makelower(*s1)); 40 else { 41 s2++; 42 s1++; 43 } 44 while (--len); 45 } else { 46 do 47 if (*s2 - *s1) 48 return (*s2 - *s1); 49 else { 50 s2++; 51 s1++; 52 } 53 while (--len); 54 } 55 return(0); 56 } 57 58 /* The following routine converts an irregular expression to 59 * internal format. 60 * 61 * Either meta symbols (\a \d or \p) or character strings or 62 * operations ( alternation or parenthesizing ) can be 63 * specified. Each starts with a descriptor byte. The descriptor 64 * byte has STR set for strings, META set for meta symbols 65 * and OPER set for operations. 66 * The descriptor byte can also have the OPT bit set if the object 67 * defined is optional. Also ALT can be set to indicate an alternation. 68 * 69 * For metasymbols the byte following the descriptor byte identities 70 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For 71 * strings the byte after the descriptor is a character count for 72 * the string: 73 * 74 * meta symbols := descriptor 75 * symbol 76 * 77 * strings := descriptor 78 * character count 79 * the string 80 * 81 * operations := descriptor 82 * symbol 83 * character count 84 */ 85 86 /* 87 * handy macros for accessing parts of match blocks 88 */ 89 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ 90 #define MNEXT(A) (A+2) /* character following a metasymbol block */ 91 92 #define OSYM(A) (*(A+1)) /* symbol in an operation block */ 93 #define OCNT(A) (*(A+2)) /* character count */ 94 #define ONEXT(A) (A+3) /* next character after the operation */ 95 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ 96 97 #define SCNT(A) (*(A+1)) /* byte count of a string */ 98 #define SSTR(A) (A+2) /* address of the string */ 99 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ 100 101 /* 102 * bit flags in the descriptor 103 */ 104 #define OPT 1 105 #define STR 2 106 #define META 4 107 #define ALT 8 108 #define OPER 16 109 110 char *ure; /* pointer current position in unconverted exp */ 111 char *ccre; /* pointer to current position in converted exp*/ 112 char *malloc(); 113 114 char * 115 convexp(re) 116 char *re; /* unconverted irregular expression */ 117 { 118 register char *cre; /* pointer to converted regular expression */ 119 120 /* allocate room for the converted expression */ 121 if (re == NIL) 122 return (NIL); 123 if (*re == '\0') 124 return (NIL); 125 cre = malloc (4 * strlen(re) + 3); 126 ccre = cre; 127 ure = re; 128 129 /* start the conversion with a \a */ 130 *cre = META | OPT; 131 MSYM(cre) = 'a'; 132 ccre = MNEXT(cre); 133 134 /* start the conversion (its recursive) */ 135 expconv (); 136 *ccre = 0; 137 return (cre); 138 } 139 140 expconv() 141 { 142 register char *cs; /* pointer to current symbol in converted exp */ 143 register char c; /* character being processed */ 144 register char *acs; /* pinter to last alternate */ 145 register int temp; 146 147 /* let the conversion begin */ 148 acs = NIL; 149 cs = NIL; 150 while (*ure != NIL) { 151 switch (c = *ure++) { 152 153 case '\\': 154 switch (c = *ure++) { 155 156 /* escaped characters are just characters */ 157 default: 158 if (cs == NIL || (*cs & STR) == 0) { 159 cs = ccre; 160 *cs = STR; 161 SCNT(cs) = 1; 162 ccre += 2; 163 } else 164 SCNT(cs)++; 165 *ccre++ = c; 166 break; 167 168 /* normal(?) metacharacters */ 169 case 'a': 170 case 'd': 171 case 'e': 172 case 'p': 173 if (acs != NIL && acs != cs) { 174 do { 175 temp = OCNT(acs); 176 OCNT(acs) = ccre - acs; 177 acs -= temp; 178 } while (temp != 0); 179 acs = NIL; 180 } 181 cs = ccre; 182 *cs = META; 183 MSYM(cs) = c; 184 ccre = MNEXT(cs); 185 break; 186 } 187 break; 188 189 /* just put the symbol in */ 190 case '^': 191 case '$': 192 if (acs != NIL && acs != cs) { 193 do { 194 temp = OCNT(acs); 195 OCNT(acs) = ccre - acs; 196 acs -= temp; 197 } while (temp != 0); 198 acs = NIL; 199 } 200 cs = ccre; 201 *cs = META; 202 MSYM(cs) = c; 203 ccre = MNEXT(cs); 204 break; 205 206 /* mark the last match sequence as optional */ 207 case '?': 208 if (cs) 209 *cs = *cs | OPT; 210 break; 211 212 /* recurse and define a subexpression */ 213 case '(': 214 if (acs != NIL && acs != cs) { 215 do { 216 temp = OCNT(acs); 217 OCNT(acs) = ccre - acs; 218 acs -= temp; 219 } while (temp != 0); 220 acs = NIL; 221 } 222 cs = ccre; 223 *cs = OPER; 224 OSYM(cs) = '('; 225 ccre = ONEXT(cs); 226 expconv (); 227 OCNT(cs) = ccre - cs; /* offset to next symbol */ 228 break; 229 230 /* return from a recursion */ 231 case ')': 232 if (acs != NIL) { 233 do { 234 temp = OCNT(acs); 235 OCNT(acs) = ccre - acs; 236 acs -= temp; 237 } while (temp != 0); 238 acs = NIL; 239 } 240 cs = ccre; 241 *cs = META; 242 MSYM(cs) = c; 243 ccre = MNEXT(cs); 244 return; 245 246 /* mark the last match sequence as having an alternate */ 247 /* the third byte will contain an offset to jump over the */ 248 /* alternate match in case the first did not fail */ 249 case '|': 250 if (acs != NIL && acs != cs) 251 OCNT(ccre) = ccre - acs; /* make a back pointer */ 252 else 253 OCNT(ccre) = 0; 254 *cs |= ALT; 255 cs = ccre; 256 *cs = OPER; 257 OSYM(cs) = '|'; 258 ccre = ONEXT(cs); 259 acs = cs; /* remember that the pointer is to be filles */ 260 break; 261 262 /* if its not a metasymbol just build a scharacter string */ 263 default: 264 if (cs == NIL || (*cs & STR) == 0) { 265 cs = ccre; 266 *cs = STR; 267 SCNT(cs) = 1; 268 ccre = SSTR(cs); 269 } else 270 SCNT(cs)++; 271 *ccre++ = c; 272 break; 273 } 274 } 275 if (acs != NIL) { 276 do { 277 temp = OCNT(acs); 278 OCNT(acs) = ccre - acs; 279 acs -= temp; 280 } while (temp != 0); 281 acs = NIL; 282 } 283 return; 284 } 285 /* end of convertre */ 286 287 288 /* 289 * The following routine recognises an irregular expresion 290 * with the following special characters: 291 * 292 * \? - means last match was optional 293 * \a - matches any number of characters 294 * \d - matches any number of spaces and tabs 295 * \p - matches any number of alphanumeric 296 * characters. The 297 * characters matched will be copied into 298 * the area pointed to by 'name'. 299 * \| - alternation 300 * \( \) - grouping used mostly for alternation and 301 * optionality 302 * 303 * The irregular expression must be translated to internal form 304 * prior to calling this routine 305 * 306 * The value returned is the pointer to the first non \a 307 * character matched. 308 */ 309 310 boolean _escaped; /* true if we are currently _escaped */ 311 char *Start; /* start of string */ 312 313 char * 314 expmatch (s, re, mstring) 315 register char *s; /* string to check for a match in */ 316 register char *re; /* a converted irregular expression */ 317 register char *mstring; /* where to put whatever matches a \p */ 318 { 319 register char *cs; /* the current symbol */ 320 register char *ptr,*s1; /* temporary pointer */ 321 boolean matched; /* a temporary boolean */ 322 323 /* initial conditions */ 324 if (re == NIL) 325 return (NIL); 326 cs = re; 327 matched = FALSE; 328 329 /* loop till expression string is exhausted (or at least pretty tired) */ 330 while (*cs) { 331 switch (*cs & (OPER | STR | META)) { 332 333 /* try to match a string */ 334 case STR: 335 matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); 336 if (matched) { 337 338 /* hoorah it matches */ 339 s += SCNT(cs); 340 cs = SNEXT(cs); 341 } else if (*cs & ALT) { 342 343 /* alternation, skip to next expression */ 344 cs = SNEXT(cs); 345 } else if (*cs & OPT) { 346 347 /* the match is optional */ 348 cs = SNEXT(cs); 349 matched = 1; /* indicate a successful match */ 350 } else { 351 352 /* no match, error return */ 353 return (NIL); 354 } 355 break; 356 357 /* an operator, do something fancy */ 358 case OPER: 359 switch (OSYM(cs)) { 360 361 /* this is an alternation */ 362 case '|': 363 if (matched) 364 365 /* last thing in the alternation was a match, skip ahead */ 366 cs = OPTR(cs); 367 else 368 369 /* no match, keep trying */ 370 cs = ONEXT(cs); 371 break; 372 373 /* this is a grouping, recurse */ 374 case '(': 375 ptr = expmatch (s, ONEXT(cs), mstring); 376 if (ptr != NIL) { 377 378 /* the subexpression matched */ 379 matched = 1; 380 s = ptr; 381 } else if (*cs & ALT) { 382 383 /* alternation, skip to next expression */ 384 matched = 0; 385 } else if (*cs & OPT) { 386 387 /* the match is optional */ 388 matched = 1; /* indicate a successful match */ 389 } else { 390 391 /* no match, error return */ 392 return (NIL); 393 } 394 cs = OPTR(cs); 395 break; 396 } 397 break; 398 399 /* try to match a metasymbol */ 400 case META: 401 switch (MSYM(cs)) { 402 403 /* try to match anything and remember what was matched */ 404 case 'p': 405 /* 406 * This is really the same as trying the match the 407 * remaining parts of the expression to any subset 408 * of the string. 409 */ 410 s1 = s; 411 do { 412 ptr = expmatch (s1, MNEXT(cs), mstring); 413 if (ptr != NIL && s1 != s) { 414 415 /* we have a match, remember the match */ 416 strncpy (mstring, s, s1 - s); 417 mstring[s1 - s] = '\0'; 418 return (ptr); 419 } else if (ptr != NIL && (*cs & OPT)) { 420 421 /* it was aoptional so no match is ok */ 422 return (ptr); 423 } else if (ptr != NIL) { 424 425 /* not optional and we still matched */ 426 return (NIL); 427 } 428 if (!isidchr(*s1)) 429 return (NIL); 430 if (*s1 == '\\') 431 _escaped = _escaped ? FALSE : TRUE; 432 else 433 _escaped = FALSE; 434 } while (*s1++); 435 return (NIL); 436 437 /* try to match anything */ 438 case 'a': 439 /* 440 * This is really the same as trying the match the 441 * remaining parts of the expression to any subset 442 * of the string. 443 */ 444 s1 = s; 445 do { 446 ptr = expmatch (s1, MNEXT(cs), mstring); 447 if (ptr != NIL && s1 != s) { 448 449 /* we have a match */ 450 return (ptr); 451 } else if (ptr != NIL && (*cs & OPT)) { 452 453 /* it was aoptional so no match is ok */ 454 return (ptr); 455 } else if (ptr != NIL) { 456 457 /* not optional and we still matched */ 458 return (NIL); 459 } 460 if (*s1 == '\\') 461 _escaped = _escaped ? FALSE : TRUE; 462 else 463 _escaped = FALSE; 464 } while (*s1++); 465 return (NIL); 466 467 /* fail if we are currently _escaped */ 468 case 'e': 469 if (_escaped) 470 return(NIL); 471 cs = MNEXT(cs); 472 break; 473 474 /* match any number of tabs and spaces */ 475 case 'd': 476 ptr = s; 477 while (*s == ' ' || *s == '\t') 478 s++; 479 if (s != ptr || s == Start) { 480 481 /* match, be happy */ 482 matched = 1; 483 cs = MNEXT(cs); 484 } else if (*s == '\n' || *s == '\0') { 485 486 /* match, be happy */ 487 matched = 1; 488 cs = MNEXT(cs); 489 } else if (*cs & ALT) { 490 491 /* try the next part */ 492 matched = 0; 493 cs = MNEXT(cs); 494 } else if (*cs & OPT) { 495 496 /* doesn't matter */ 497 matched = 1; 498 cs = MNEXT(cs); 499 } else 500 501 /* no match, error return */ 502 return (NIL); 503 break; 504 505 /* check for end of line */ 506 case '$': 507 if (*s == '\0' || *s == '\n') { 508 509 /* match, be happy */ 510 s++; 511 matched = 1; 512 cs = MNEXT(cs); 513 } else if (*cs & ALT) { 514 515 /* try the next part */ 516 matched = 0; 517 cs = MNEXT(cs); 518 } else if (*cs & OPT) { 519 520 /* doesn't matter */ 521 matched = 1; 522 cs = MNEXT(cs); 523 } else 524 525 /* no match, error return */ 526 return (NIL); 527 break; 528 529 /* check for start of line */ 530 case '^': 531 if (s == Start) { 532 533 /* match, be happy */ 534 matched = 1; 535 cs = MNEXT(cs); 536 } else if (*cs & ALT) { 537 538 /* try the next part */ 539 matched = 0; 540 cs = MNEXT(cs); 541 } else if (*cs & OPT) { 542 543 /* doesn't matter */ 544 matched = 1; 545 cs = MNEXT(cs); 546 } else 547 548 /* no match, error return */ 549 return (NIL); 550 break; 551 552 /* end of a subexpression, return success */ 553 case ')': 554 return (s); 555 } 556 break; 557 } 558 } 559 return (s); 560 } 561