1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 38 */ 39 40 #if defined(LIBC_SCCS) && !defined(lint) 41 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 42 #endif /* LIBC_SCCS and not lint */ 43 44 #include <sys/types.h> 45 #include <stdio.h> 46 #include <string.h> 47 #include <ctype.h> 48 #include <limits.h> 49 #include <locale.h> 50 #include <stdlib.h> 51 #include <regex.h> 52 53 #include "utils.h" 54 #include "regex2.h" 55 56 #include "cclass.h" 57 #include "cname.h" 58 59 /* 60 * parse structure, passed up and down to avoid global variables and 61 * other clumsinesses 62 */ 63 struct parse { 64 char *next; /* next character in RE */ 65 char *end; /* end of string (-> NUL normally) */ 66 int error; /* has an error been seen? */ 67 sop *strip; /* malloced strip */ 68 sopno ssize; /* malloced strip size (allocated) */ 69 sopno slen; /* malloced strip length (used) */ 70 int ncsalloc; /* number of csets allocated */ 71 struct re_guts *g; 72 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 73 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 74 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 75 }; 76 77 /* ========= begin header generated by ./mkh ========= */ 78 #ifdef __cplusplus 79 extern "C" { 80 #endif 81 82 /* === regcomp.c === */ 83 static void p_ere __P((struct parse *p, int stop)); 84 static void p_ere_exp __P((struct parse *p)); 85 static void p_str __P((struct parse *p)); 86 static void p_bre __P((struct parse *p, int end1, int end2)); 87 static int p_simp_re __P((struct parse *p, int starordinary)); 88 static int p_count __P((struct parse *p)); 89 static void p_bracket __P((struct parse *p)); 90 static void p_b_term __P((struct parse *p, cset *cs)); 91 static void p_b_cclass __P((struct parse *p, cset *cs)); 92 static void p_b_eclass __P((struct parse *p, cset *cs)); 93 static char p_b_symbol __P((struct parse *p)); 94 static char p_b_coll_elem __P((struct parse *p, int endc)); 95 static char othercase __P((int ch)); 96 static void bothcases __P((struct parse *p, int ch)); 97 static void ordinary __P((struct parse *p, int ch)); 98 static void nonnewline __P((struct parse *p)); 99 static void repeat __P((struct parse *p, sopno start, int from, int to)); 100 static int seterr __P((struct parse *p, int e)); 101 static cset *allocset __P((struct parse *p)); 102 static void freeset __P((struct parse *p, cset *cs)); 103 static int freezeset __P((struct parse *p, cset *cs)); 104 static int firstch __P((struct parse *p, cset *cs)); 105 static int nch __P((struct parse *p, cset *cs)); 106 static void mcadd __P((struct parse *p, cset *cs, char *cp)); 107 #if used 108 static void mcsub __P((cset *cs, char *cp)); 109 static int mcin __P((cset *cs, char *cp)); 110 static char *mcfind __P((cset *cs, char *cp)); 111 #endif 112 static void mcinvert __P((struct parse *p, cset *cs)); 113 static void mccase __P((struct parse *p, cset *cs)); 114 static int isinsets __P((struct re_guts *g, int c)); 115 static int samesets __P((struct re_guts *g, int c1, int c2)); 116 static void categorize __P((struct parse *p, struct re_guts *g)); 117 static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 118 static void doemit __P((struct parse *p, sop op, size_t opnd)); 119 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 120 static void dofwd __P((struct parse *p, sopno pos, sop value)); 121 static void enlarge __P((struct parse *p, sopno size)); 122 static void stripsnug __P((struct parse *p, struct re_guts *g)); 123 static void findmust __P((struct parse *p, struct re_guts *g)); 124 static sopno pluscount __P((struct parse *p, struct re_guts *g)); 125 126 #ifdef __cplusplus 127 } 128 #endif 129 /* ========= end header generated by ./mkh ========= */ 130 131 static char nuls[10]; /* place to point scanner in event of error */ 132 133 /* 134 * macros for use with parse structure 135 * BEWARE: these know that the parse structure is named `p' !!! 136 */ 137 #define PEEK() (*p->next) 138 #define PEEK2() (*(p->next+1)) 139 #define MORE() (p->next < p->end) 140 #define MORE2() (p->next+1 < p->end) 141 #define SEE(c) (MORE() && PEEK() == (c)) 142 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 143 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 144 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 145 #define NEXT() (p->next++) 146 #define NEXT2() (p->next += 2) 147 #define NEXTn(n) (p->next += (n)) 148 #define GETNEXT() (*p->next++) 149 #define SETERROR(e) seterr(p, (e)) 150 #define REQUIRE(co, e) ((co) || SETERROR(e)) 151 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 152 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 153 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 154 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 155 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 156 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 157 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 158 #define HERE() (p->slen) 159 #define THERE() (p->slen - 1) 160 #define THERETHERE() (p->slen - 2) 161 #define DROP(n) (p->slen -= (n)) 162 163 #ifndef NDEBUG 164 static int never = 0; /* for use in asserts; shuts lint up */ 165 #else 166 #define never 0 /* some <assert.h>s have bugs too */ 167 #endif 168 169 /* 170 - regcomp - interface for parser and compilation 171 = extern int regcomp(regex_t *, const char *, int); 172 = #define REG_BASIC 0000 173 = #define REG_EXTENDED 0001 174 = #define REG_ICASE 0002 175 = #define REG_NOSUB 0004 176 = #define REG_NEWLINE 0010 177 = #define REG_NOSPEC 0020 178 = #define REG_PEND 0040 179 = #define REG_DUMP 0200 180 */ 181 int /* 0 success, otherwise REG_something */ 182 regcomp(preg, pattern, cflags) 183 regex_t *preg; 184 const char *pattern; 185 int cflags; 186 { 187 struct parse pa; 188 register struct re_guts *g; 189 register struct parse *p = &pa; 190 register int i; 191 register size_t len; 192 #ifdef REDEBUG 193 # define GOODFLAGS(f) (f) 194 #else 195 # define GOODFLAGS(f) ((f)&~REG_DUMP) 196 #endif 197 198 cflags = GOODFLAGS(cflags); 199 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 200 return(REG_INVARG); 201 202 if (cflags®_PEND) { 203 if (preg->re_endp < pattern) 204 return(REG_INVARG); 205 len = preg->re_endp - pattern; 206 } else 207 len = strlen((char *)pattern); 208 209 /* do the mallocs early so failure handling is easy */ 210 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 211 (NC-1)*sizeof(cat_t)); 212 if (g == NULL) 213 return(REG_ESPACE); 214 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 215 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 216 p->slen = 0; 217 if (p->strip == NULL) { 218 free((char *)g); 219 return(REG_ESPACE); 220 } 221 222 /* set things up */ 223 p->g = g; 224 p->next = (char *)pattern; /* convenience; we do not modify it */ 225 p->end = p->next + len; 226 p->error = 0; 227 p->ncsalloc = 0; 228 for (i = 0; i < NPAREN; i++) { 229 p->pbegin[i] = 0; 230 p->pend[i] = 0; 231 } 232 g->csetsize = NC; 233 g->sets = NULL; 234 g->setbits = NULL; 235 g->ncsets = 0; 236 g->cflags = cflags; 237 g->iflags = 0; 238 g->nbol = 0; 239 g->neol = 0; 240 g->must = NULL; 241 g->mlen = 0; 242 g->nsub = 0; 243 g->ncategories = 1; /* category 0 is "everything else" */ 244 g->categories = &g->catspace[-(CHAR_MIN)]; 245 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 246 g->backrefs = 0; 247 248 /* do it */ 249 EMIT(OEND, 0); 250 g->firststate = THERE(); 251 if (cflags®_EXTENDED) 252 p_ere(p, OUT); 253 else if (cflags®_NOSPEC) 254 p_str(p); 255 else 256 p_bre(p, OUT, OUT); 257 EMIT(OEND, 0); 258 g->laststate = THERE(); 259 260 /* tidy up loose ends and fill things in */ 261 categorize(p, g); 262 stripsnug(p, g); 263 findmust(p, g); 264 g->nplus = pluscount(p, g); 265 g->magic = MAGIC2; 266 preg->re_nsub = g->nsub; 267 preg->re_g = g; 268 preg->re_magic = MAGIC1; 269 #ifndef REDEBUG 270 /* not debugging, so can't rely on the assert() in regexec() */ 271 if (g->iflags&BAD) 272 SETERROR(REG_ASSERT); 273 #endif 274 275 /* win or lose, we're done */ 276 if (p->error != 0) /* lose */ 277 regfree(preg); 278 return(p->error); 279 } 280 281 /* 282 - p_ere - ERE parser top level, concatenation and alternation 283 == static void p_ere(register struct parse *p, int stop); 284 */ 285 static void 286 p_ere(p, stop) 287 register struct parse *p; 288 int stop; /* character this ERE should end at */ 289 { 290 register char c; 291 register sopno prevback; 292 register sopno prevfwd; 293 register sopno conc; 294 register int first = 1; /* is this the first alternative? */ 295 296 for (;;) { 297 /* do a bunch of concatenated expressions */ 298 conc = HERE(); 299 while (MORE() && (c = PEEK()) != '|' && c != stop) 300 p_ere_exp(p); 301 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 302 303 if (!EAT('|')) 304 break; /* NOTE BREAK OUT */ 305 306 if (first) { 307 INSERT(OCH_, conc); /* offset is wrong */ 308 prevfwd = conc; 309 prevback = conc; 310 first = 0; 311 } 312 ASTERN(OOR1, prevback); 313 prevback = THERE(); 314 AHEAD(prevfwd); /* fix previous offset */ 315 prevfwd = HERE(); 316 EMIT(OOR2, 0); /* offset is very wrong */ 317 } 318 319 if (!first) { /* tail-end fixups */ 320 AHEAD(prevfwd); 321 ASTERN(O_CH, prevback); 322 } 323 324 assert(!MORE() || SEE(stop)); 325 } 326 327 /* 328 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 329 == static void p_ere_exp(register struct parse *p); 330 */ 331 static void 332 p_ere_exp(p) 333 register struct parse *p; 334 { 335 register char c; 336 register sopno pos; 337 register int count; 338 register int count2; 339 register sopno subno; 340 int wascaret = 0; 341 342 assert(MORE()); /* caller should have ensured this */ 343 c = GETNEXT(); 344 345 pos = HERE(); 346 switch (c) { 347 case '(': 348 (void)REQUIRE(MORE(), REG_EPAREN); 349 p->g->nsub++; 350 subno = p->g->nsub; 351 if (subno < NPAREN) 352 p->pbegin[subno] = HERE(); 353 EMIT(OLPAREN, subno); 354 if (!SEE(')')) 355 p_ere(p, ')'); 356 if (subno < NPAREN) { 357 p->pend[subno] = HERE(); 358 assert(p->pend[subno] != 0); 359 } 360 EMIT(ORPAREN, subno); 361 (void)MUSTEAT(')', REG_EPAREN); 362 break; 363 #ifndef POSIX_MISTAKE 364 case ')': /* happens only if no current unmatched ( */ 365 /* 366 * You may ask, why the ifndef? Because I didn't notice 367 * this until slightly too late for 1003.2, and none of the 368 * other 1003.2 regular-expression reviewers noticed it at 369 * all. So an unmatched ) is legal POSIX, at least until 370 * we can get it fixed. 371 */ 372 SETERROR(REG_EPAREN); 373 break; 374 #endif 375 case '^': 376 EMIT(OBOL, 0); 377 p->g->iflags |= USEBOL; 378 p->g->nbol++; 379 wascaret = 1; 380 break; 381 case '$': 382 EMIT(OEOL, 0); 383 p->g->iflags |= USEEOL; 384 p->g->neol++; 385 break; 386 case '|': 387 SETERROR(REG_EMPTY); 388 break; 389 case '*': 390 case '+': 391 case '?': 392 SETERROR(REG_BADRPT); 393 break; 394 case '.': 395 if (p->g->cflags®_NEWLINE) 396 nonnewline(p); 397 else 398 EMIT(OANY, 0); 399 break; 400 case '[': 401 p_bracket(p); 402 break; 403 case '\\': 404 (void)REQUIRE(MORE(), REG_EESCAPE); 405 c = GETNEXT(); 406 ordinary(p, c); 407 break; 408 case '{': /* okay as ordinary except if digit follows */ 409 (void)REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 410 /* FALLTHROUGH */ 411 default: 412 ordinary(p, c); 413 break; 414 } 415 416 if (!MORE()) 417 return; 418 c = PEEK(); 419 /* we call { a repetition if followed by a digit */ 420 if (!( c == '*' || c == '+' || c == '?' || 421 (c == '{' && MORE2() && isdigit(PEEK2())) )) 422 return; /* no repetition, we're done */ 423 NEXT(); 424 425 (void)REQUIRE(!wascaret, REG_BADRPT); 426 switch (c) { 427 case '*': /* implemented as +? */ 428 /* this case does not require the (y|) trick, noKLUDGE */ 429 INSERT(OPLUS_, pos); 430 ASTERN(O_PLUS, pos); 431 INSERT(OQUEST_, pos); 432 ASTERN(O_QUEST, pos); 433 break; 434 case '+': 435 INSERT(OPLUS_, pos); 436 ASTERN(O_PLUS, pos); 437 break; 438 case '?': 439 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 440 INSERT(OCH_, pos); /* offset slightly wrong */ 441 ASTERN(OOR1, pos); /* this one's right */ 442 AHEAD(pos); /* fix the OCH_ */ 443 EMIT(OOR2, 0); /* offset very wrong... */ 444 AHEAD(THERE()); /* ...so fix it */ 445 ASTERN(O_CH, THERETHERE()); 446 break; 447 case '{': 448 count = p_count(p); 449 if (EAT(',')) { 450 if (isdigit(PEEK())) { 451 count2 = p_count(p); 452 (void)REQUIRE(count <= count2, REG_BADBR); 453 } else /* single number with comma */ 454 count2 = INFINITY; 455 } else /* just a single number */ 456 count2 = count; 457 repeat(p, pos, count, count2); 458 if (!EAT('}')) { /* error heuristics */ 459 while (MORE() && PEEK() != '}') 460 NEXT(); 461 (void)REQUIRE(MORE(), REG_EBRACE); 462 SETERROR(REG_BADBR); 463 } 464 break; 465 } 466 467 if (!MORE()) 468 return; 469 c = PEEK(); 470 if (!( c == '*' || c == '+' || c == '?' || 471 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 472 return; 473 SETERROR(REG_BADRPT); 474 } 475 476 /* 477 - p_str - string (no metacharacters) "parser" 478 == static void p_str(register struct parse *p); 479 */ 480 static void 481 p_str(p) 482 register struct parse *p; 483 { 484 (void)REQUIRE(MORE(), REG_EMPTY); 485 while (MORE()) 486 ordinary(p, GETNEXT()); 487 } 488 489 /* 490 - p_bre - BRE parser top level, anchoring and concatenation 491 == static void p_bre(register struct parse *p, register int end1, \ 492 == register int end2); 493 * Giving end1 as OUT essentially eliminates the end1/end2 check. 494 * 495 * This implementation is a bit of a kludge, in that a trailing $ is first 496 * taken as an ordinary character and then revised to be an anchor. The 497 * only undesirable side effect is that '$' gets included as a character 498 * category in such cases. This is fairly harmless; not worth fixing. 499 * The amount of lookahead needed to avoid this kludge is excessive. 500 */ 501 static void 502 p_bre(p, end1, end2) 503 register struct parse *p; 504 register int end1; /* first terminating character */ 505 register int end2; /* second terminating character */ 506 { 507 register sopno start = HERE(); 508 register int first = 1; /* first subexpression? */ 509 register int wasdollar = 0; 510 511 if (EAT('^')) { 512 EMIT(OBOL, 0); 513 p->g->iflags |= USEBOL; 514 p->g->nbol++; 515 } 516 while (MORE() && !SEETWO(end1, end2)) { 517 wasdollar = p_simp_re(p, first); 518 first = 0; 519 } 520 if (wasdollar) { /* oops, that was a trailing anchor */ 521 DROP(1); 522 EMIT(OEOL, 0); 523 p->g->iflags |= USEEOL; 524 p->g->neol++; 525 } 526 527 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 528 } 529 530 /* 531 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 532 == static int p_simp_re(register struct parse *p, int starordinary); 533 */ 534 static int /* was the simple RE an unbackslashed $? */ 535 p_simp_re(p, starordinary) 536 register struct parse *p; 537 int starordinary; /* is a leading * an ordinary character? */ 538 { 539 register int c; 540 register int count; 541 register int count2; 542 register sopno pos; 543 register int i; 544 register sopno subno; 545 # define BACKSL (1<<CHAR_BIT) 546 547 pos = HERE(); /* repetion op, if any, covers from here */ 548 549 assert(MORE()); /* caller should have ensured this */ 550 c = GETNEXT(); 551 if (c == '\\') { 552 (void)REQUIRE(MORE(), REG_EESCAPE); 553 c = BACKSL | (unsigned char)GETNEXT(); 554 } 555 switch (c) { 556 case '.': 557 if (p->g->cflags®_NEWLINE) 558 nonnewline(p); 559 else 560 EMIT(OANY, 0); 561 break; 562 case '[': 563 p_bracket(p); 564 break; 565 case BACKSL|'{': 566 SETERROR(REG_BADRPT); 567 break; 568 case BACKSL|'(': 569 p->g->nsub++; 570 subno = p->g->nsub; 571 if (subno < NPAREN) 572 p->pbegin[subno] = HERE(); 573 EMIT(OLPAREN, subno); 574 /* the MORE here is an error heuristic */ 575 if (MORE() && !SEETWO('\\', ')')) 576 p_bre(p, '\\', ')'); 577 if (subno < NPAREN) { 578 p->pend[subno] = HERE(); 579 assert(p->pend[subno] != 0); 580 } 581 EMIT(ORPAREN, subno); 582 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 583 break; 584 case BACKSL|')': /* should not get here -- must be user */ 585 case BACKSL|'}': 586 SETERROR(REG_EPAREN); 587 break; 588 case BACKSL|'1': 589 case BACKSL|'2': 590 case BACKSL|'3': 591 case BACKSL|'4': 592 case BACKSL|'5': 593 case BACKSL|'6': 594 case BACKSL|'7': 595 case BACKSL|'8': 596 case BACKSL|'9': 597 i = (c&~BACKSL) - '0'; 598 assert(i < NPAREN); 599 if (p->pend[i] != 0) { 600 assert(i <= p->g->nsub); 601 EMIT(OBACK_, i); 602 assert(p->pbegin[i] != 0); 603 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 604 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 605 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 606 EMIT(O_BACK, i); 607 } else 608 SETERROR(REG_ESUBREG); 609 p->g->backrefs = 1; 610 break; 611 case '*': 612 (void)REQUIRE(starordinary, REG_BADRPT); 613 /* FALLTHROUGH */ 614 default: 615 ordinary(p, c &~ BACKSL); 616 break; 617 } 618 619 if (EAT('*')) { /* implemented as +? */ 620 /* this case does not require the (y|) trick, noKLUDGE */ 621 INSERT(OPLUS_, pos); 622 ASTERN(O_PLUS, pos); 623 INSERT(OQUEST_, pos); 624 ASTERN(O_QUEST, pos); 625 } else if (EATTWO('\\', '{')) { 626 count = p_count(p); 627 if (EAT(',')) { 628 if (MORE() && isdigit(PEEK())) { 629 count2 = p_count(p); 630 (void)REQUIRE(count <= count2, REG_BADBR); 631 } else /* single number with comma */ 632 count2 = INFINITY; 633 } else /* just a single number */ 634 count2 = count; 635 repeat(p, pos, count, count2); 636 if (!EATTWO('\\', '}')) { /* error heuristics */ 637 while (MORE() && !SEETWO('\\', '}')) 638 NEXT(); 639 (void)REQUIRE(MORE(), REG_EBRACE); 640 SETERROR(REG_BADBR); 641 } 642 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 643 return(1); 644 645 return(0); 646 } 647 648 /* 649 - p_count - parse a repetition count 650 == static int p_count(register struct parse *p); 651 */ 652 static int /* the value */ 653 p_count(p) 654 register struct parse *p; 655 { 656 register int count = 0; 657 register int ndigits = 0; 658 659 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 660 count = count*10 + (GETNEXT() - '0'); 661 ndigits++; 662 } 663 664 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 665 return(count); 666 } 667 668 /* 669 - p_bracket - parse a bracketed character list 670 == static void p_bracket(register struct parse *p); 671 * 672 * Note a significant property of this code: if the allocset() did SETERROR, 673 * no set operations are done. 674 */ 675 static void 676 p_bracket(p) 677 register struct parse *p; 678 { 679 register cset *cs = allocset(p); 680 register int invert = 0; 681 682 /* Dept of Truly Sickening Special-Case Kludges */ 683 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 684 EMIT(OBOW, 0); 685 NEXTn(6); 686 return; 687 } 688 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 689 EMIT(OEOW, 0); 690 NEXTn(6); 691 return; 692 } 693 694 if (EAT('^')) 695 invert++; /* make note to invert set at end */ 696 if (EAT(']')) 697 CHadd(cs, ']'); 698 else if (EAT('-')) 699 CHadd(cs, '-'); 700 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 701 p_b_term(p, cs); 702 if (EAT('-')) 703 CHadd(cs, '-'); 704 (void)MUSTEAT(']', REG_EBRACK); 705 706 if (p->error != 0) /* don't mess things up further */ 707 return; 708 709 if (p->g->cflags®_ICASE) { 710 register int i; 711 register int ci; 712 713 for (i = p->g->csetsize - 1; i >= 0; i--) 714 if (CHIN(cs, i) && isalpha(i)) { 715 ci = othercase(i); 716 if (ci != i) 717 CHadd(cs, ci); 718 } 719 if (cs->multis != NULL) 720 mccase(p, cs); 721 } 722 if (invert) { 723 register int i; 724 725 for (i = p->g->csetsize - 1; i >= 0; i--) 726 if (CHIN(cs, i)) 727 CHsub(cs, i); 728 else 729 CHadd(cs, i); 730 if (p->g->cflags®_NEWLINE) 731 CHsub(cs, '\n'); 732 if (cs->multis != NULL) 733 mcinvert(p, cs); 734 } 735 736 assert(cs->multis == NULL); /* xxx */ 737 738 if (nch(p, cs) == 1) { /* optimize singleton sets */ 739 ordinary(p, firstch(p, cs)); 740 freeset(p, cs); 741 } else 742 EMIT(OANYOF, freezeset(p, cs)); 743 } 744 745 /* 746 - p_b_term - parse one term of a bracketed character list 747 == static void p_b_term(register struct parse *p, register cset *cs); 748 */ 749 static void 750 p_b_term(p, cs) 751 register struct parse *p; 752 register cset *cs; 753 { 754 register char c; 755 register char start, finish; 756 register int i; 757 758 /* classify what we've got */ 759 switch ((MORE()) ? PEEK() : '\0') { 760 case '[': 761 c = (MORE2()) ? PEEK2() : '\0'; 762 break; 763 case '-': 764 SETERROR(REG_ERANGE); 765 return; /* NOTE RETURN */ 766 break; 767 default: 768 c = '\0'; 769 break; 770 } 771 772 switch (c) { 773 case ':': /* character class */ 774 NEXT2(); 775 (void)REQUIRE(MORE(), REG_EBRACK); 776 c = PEEK(); 777 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 778 p_b_cclass(p, cs); 779 (void)REQUIRE(MORE(), REG_EBRACK); 780 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 781 break; 782 case '=': /* equivalence class */ 783 NEXT2(); 784 (void)REQUIRE(MORE(), REG_EBRACK); 785 c = PEEK(); 786 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 787 p_b_eclass(p, cs); 788 (void)REQUIRE(MORE(), REG_EBRACK); 789 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 790 break; 791 default: /* symbol, ordinary character, or range */ 792 /* xxx revision needed for multichar stuff */ 793 start = p_b_symbol(p); 794 if (SEE('-') && MORE2() && PEEK2() != ']') { 795 /* range */ 796 NEXT(); 797 if (EAT('-')) 798 finish = '-'; 799 else 800 finish = p_b_symbol(p); 801 } else 802 finish = start; 803 if (start == finish) 804 CHadd(cs, start); 805 else { 806 (void)REQUIRE(collate_range_cmp(start, finish) <= 0, REG_ERANGE); 807 for (i = CHAR_MIN; i <= CHAR_MAX; i++) { 808 if ( collate_range_cmp(start, i) <= 0 809 && collate_range_cmp(i, finish) <= 0 810 ) 811 CHadd(cs, i); 812 } 813 } 814 break; 815 } 816 } 817 818 /* 819 - p_b_cclass - parse a character-class name and deal with it 820 == static void p_b_cclass(register struct parse *p, register cset *cs); 821 */ 822 static void 823 p_b_cclass(p, cs) 824 register struct parse *p; 825 register cset *cs; 826 { 827 register int c; 828 register char *sp = p->next; 829 register struct cclass *cp; 830 register size_t len; 831 832 while (MORE() && isalpha((uch)PEEK())) 833 NEXT(); 834 len = p->next - sp; 835 for (cp = cclasses; cp->name != NULL; cp++) 836 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 837 break; 838 if (cp->name == NULL) { 839 /* oops, didn't find it */ 840 SETERROR(REG_ECTYPE); 841 return; 842 } 843 844 switch (cp->fidx) { 845 case CALNUM: 846 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 847 if (isalnum((uch)c)) 848 CHadd(cs, c); 849 break; 850 case CALPHA: 851 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 852 if (isalpha((uch)c)) 853 CHadd(cs, c); 854 break; 855 case CBLANK: 856 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 857 if (isblank((uch)c)) 858 CHadd(cs, c); 859 break; 860 case CCNTRL: 861 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 862 if (iscntrl((uch)c)) 863 CHadd(cs, c); 864 break; 865 case CDIGIT: 866 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 867 if (isdigit((uch)c)) 868 CHadd(cs, c); 869 break; 870 case CGRAPH: 871 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 872 if (isgraph((uch)c)) 873 CHadd(cs, c); 874 break; 875 case CLOWER: 876 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 877 if (islower((uch)c)) 878 CHadd(cs, c); 879 break; 880 case CPRINT: 881 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 882 if (isprint((uch)c)) 883 CHadd(cs, c); 884 break; 885 case CPUNCT: 886 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 887 if (ispunct((uch)c)) 888 CHadd(cs, c); 889 break; 890 case CSPACE: 891 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 892 if (isspace((uch)c)) 893 CHadd(cs, c); 894 break; 895 case CUPPER: 896 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 897 if (isupper((uch)c)) 898 CHadd(cs, c); 899 break; 900 case CXDIGIT: 901 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 902 if (isxdigit((uch)c)) 903 CHadd(cs, c); 904 break; 905 } 906 #if 0 907 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 908 MCadd(p, cs, u); 909 #endif 910 } 911 912 /* 913 - p_b_eclass - parse an equivalence-class name and deal with it 914 == static void p_b_eclass(register struct parse *p, register cset *cs); 915 * 916 * This implementation is incomplete. xxx 917 */ 918 static void 919 p_b_eclass(p, cs) 920 register struct parse *p; 921 register cset *cs; 922 { 923 register char c; 924 925 c = p_b_coll_elem(p, '='); 926 CHadd(cs, c); 927 } 928 929 /* 930 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 931 == static char p_b_symbol(register struct parse *p); 932 */ 933 static char /* value of symbol */ 934 p_b_symbol(p) 935 register struct parse *p; 936 { 937 register char value; 938 939 (void)REQUIRE(MORE(), REG_EBRACK); 940 if (!EATTWO('[', '.')) 941 return(GETNEXT()); 942 943 /* collating symbol */ 944 value = p_b_coll_elem(p, '.'); 945 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 946 return(value); 947 } 948 949 /* 950 - p_b_coll_elem - parse a collating-element name and look it up 951 == static char p_b_coll_elem(register struct parse *p, int endc); 952 */ 953 static char /* value of collating element */ 954 p_b_coll_elem(p, endc) 955 register struct parse *p; 956 int endc; /* name ended by endc,']' */ 957 { 958 register char *sp = p->next; 959 register struct cname *cp; 960 register int len; 961 962 while (MORE() && !SEETWO(endc, ']')) 963 NEXT(); 964 if (!MORE()) { 965 SETERROR(REG_EBRACK); 966 return(0); 967 } 968 len = p->next - sp; 969 for (cp = cnames; cp->name != NULL; cp++) 970 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 971 return(cp->code); /* known name */ 972 if (len == 1) 973 return(*sp); /* single character */ 974 SETERROR(REG_ECOLLATE); /* neither */ 975 return(0); 976 } 977 978 /* 979 - othercase - return the case counterpart of an alphabetic 980 == static char othercase(int ch); 981 */ 982 static char /* if no counterpart, return ch */ 983 othercase(ch) 984 int ch; 985 { 986 ch = (unsigned char)ch; 987 assert(isalpha(ch)); 988 if (isupper(ch)) 989 return(tolower(ch)); 990 else if (islower(ch)) 991 return(toupper(ch)); 992 else /* peculiar, but could happen */ 993 return(ch); 994 } 995 996 /* 997 - bothcases - emit a dualcase version of a two-case character 998 == static void bothcases(register struct parse *p, int ch); 999 * 1000 * Boy, is this implementation ever a kludge... 1001 */ 1002 static void 1003 bothcases(p, ch) 1004 register struct parse *p; 1005 int ch; 1006 { 1007 register char *oldnext = p->next; 1008 register char *oldend = p->end; 1009 char bracket[3]; 1010 1011 ch = (unsigned char)ch; 1012 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1013 p->next = bracket; 1014 p->end = bracket+2; 1015 bracket[0] = ch; 1016 bracket[1] = ']'; 1017 bracket[2] = '\0'; 1018 p_bracket(p); 1019 assert(p->next == bracket+2); 1020 p->next = oldnext; 1021 p->end = oldend; 1022 } 1023 1024 /* 1025 - ordinary - emit an ordinary character 1026 == static void ordinary(register struct parse *p, register int ch); 1027 */ 1028 static void 1029 ordinary(p, ch) 1030 register struct parse *p; 1031 register int ch; 1032 { 1033 register cat_t *cap = p->g->categories; 1034 1035 if ((p->g->cflags®_ICASE) && isalpha((unsigned char)ch) && othercase(ch) != ch) 1036 bothcases(p, ch); 1037 else { 1038 EMIT(OCHAR, (unsigned char)ch); 1039 if (cap[ch] == 0) 1040 cap[ch] = p->g->ncategories++; 1041 } 1042 } 1043 1044 /* 1045 - nonnewline - emit REG_NEWLINE version of OANY 1046 == static void nonnewline(register struct parse *p); 1047 * 1048 * Boy, is this implementation ever a kludge... 1049 */ 1050 static void 1051 nonnewline(p) 1052 register struct parse *p; 1053 { 1054 register char *oldnext = p->next; 1055 register char *oldend = p->end; 1056 char bracket[4]; 1057 1058 p->next = bracket; 1059 p->end = bracket+3; 1060 bracket[0] = '^'; 1061 bracket[1] = '\n'; 1062 bracket[2] = ']'; 1063 bracket[3] = '\0'; 1064 p_bracket(p); 1065 assert(p->next == bracket+3); 1066 p->next = oldnext; 1067 p->end = oldend; 1068 } 1069 1070 /* 1071 - repeat - generate code for a bounded repetition, recursively if needed 1072 == static void repeat(register struct parse *p, sopno start, int from, int to); 1073 */ 1074 static void 1075 repeat(p, start, from, to) 1076 register struct parse *p; 1077 sopno start; /* operand from here to end of strip */ 1078 int from; /* repeated from this number */ 1079 int to; /* to this number of times (maybe INFINITY) */ 1080 { 1081 register sopno finish = HERE(); 1082 # define N 2 1083 # define INF 3 1084 # define REP(f, t) ((f)*8 + (t)) 1085 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1086 register sopno copy; 1087 1088 if (p->error != 0) /* head off possible runaway recursion */ 1089 return; 1090 1091 assert(from <= to); 1092 1093 switch (REP(MAP(from), MAP(to))) { 1094 case REP(0, 0): /* must be user doing this */ 1095 DROP(finish-start); /* drop the operand */ 1096 break; 1097 case REP(0, 1): /* as x{1,1}? */ 1098 case REP(0, N): /* as x{1,n}? */ 1099 case REP(0, INF): /* as x{1,}? */ 1100 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1101 INSERT(OCH_, start); /* offset is wrong... */ 1102 repeat(p, start+1, 1, to); 1103 ASTERN(OOR1, start); 1104 AHEAD(start); /* ... fix it */ 1105 EMIT(OOR2, 0); 1106 AHEAD(THERE()); 1107 ASTERN(O_CH, THERETHERE()); 1108 break; 1109 case REP(1, 1): /* trivial case */ 1110 /* done */ 1111 break; 1112 case REP(1, N): /* as x?x{1,n-1} */ 1113 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1114 INSERT(OCH_, start); 1115 ASTERN(OOR1, start); 1116 AHEAD(start); 1117 EMIT(OOR2, 0); /* offset very wrong... */ 1118 AHEAD(THERE()); /* ...so fix it */ 1119 ASTERN(O_CH, THERETHERE()); 1120 copy = dupl(p, start+1, finish+1); 1121 assert(copy == finish+4); 1122 repeat(p, copy, 1, to-1); 1123 break; 1124 case REP(1, INF): /* as x+ */ 1125 INSERT(OPLUS_, start); 1126 ASTERN(O_PLUS, start); 1127 break; 1128 case REP(N, N): /* as xx{m-1,n-1} */ 1129 copy = dupl(p, start, finish); 1130 repeat(p, copy, from-1, to-1); 1131 break; 1132 case REP(N, INF): /* as xx{n-1,INF} */ 1133 copy = dupl(p, start, finish); 1134 repeat(p, copy, from-1, to); 1135 break; 1136 default: /* "can't happen" */ 1137 SETERROR(REG_ASSERT); /* just in case */ 1138 break; 1139 } 1140 } 1141 1142 /* 1143 - seterr - set an error condition 1144 == static int seterr(register struct parse *p, int e); 1145 */ 1146 static int /* useless but makes type checking happy */ 1147 seterr(p, e) 1148 register struct parse *p; 1149 int e; 1150 { 1151 if (p->error == 0) /* keep earliest error condition */ 1152 p->error = e; 1153 p->next = nuls; /* try to bring things to a halt */ 1154 p->end = nuls; 1155 return(0); /* make the return value well-defined */ 1156 } 1157 1158 /* 1159 - allocset - allocate a set of characters for [] 1160 == static cset *allocset(register struct parse *p); 1161 */ 1162 static cset * 1163 allocset(p) 1164 register struct parse *p; 1165 { 1166 register int no = p->g->ncsets++; 1167 register size_t nc; 1168 register size_t nbytes; 1169 register cset *cs; 1170 register size_t css = (size_t)p->g->csetsize; 1171 register int i; 1172 1173 if (no >= p->ncsalloc) { /* need another column of space */ 1174 p->ncsalloc += CHAR_BIT; 1175 nc = p->ncsalloc; 1176 assert(nc % CHAR_BIT == 0); 1177 nbytes = nc / CHAR_BIT * css; 1178 if (p->g->sets == NULL) 1179 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1180 else 1181 p->g->sets = (cset *)realloc((char *)p->g->sets, 1182 nc * sizeof(cset)); 1183 if (p->g->setbits == NULL) 1184 p->g->setbits = (uch *)malloc(nbytes); 1185 else { 1186 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1187 nbytes); 1188 /* xxx this isn't right if setbits is now NULL */ 1189 for (i = 0; i < no; i++) 1190 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1191 } 1192 if (p->g->sets != NULL && p->g->setbits != NULL) 1193 (void) memset((char *)p->g->setbits + (nbytes - css), 1194 0, css); 1195 else { 1196 no = 0; 1197 SETERROR(REG_ESPACE); 1198 /* caller's responsibility not to do set ops */ 1199 } 1200 } 1201 1202 assert(p->g->sets != NULL); /* xxx */ 1203 cs = &p->g->sets[no]; 1204 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1205 cs->mask = 1 << ((no) % CHAR_BIT); 1206 cs->hash = 0; 1207 cs->smultis = 0; 1208 cs->multis = NULL; 1209 1210 return(cs); 1211 } 1212 1213 /* 1214 - freeset - free a now-unused set 1215 == static void freeset(register struct parse *p, register cset *cs); 1216 */ 1217 static void 1218 freeset(p, cs) 1219 register struct parse *p; 1220 register cset *cs; 1221 { 1222 register int i; 1223 register cset *top = &p->g->sets[p->g->ncsets]; 1224 register size_t css = (size_t)p->g->csetsize; 1225 1226 for (i = 0; i < css; i++) 1227 CHsub(cs, i); 1228 if (cs == top-1) /* recover only the easy case */ 1229 p->g->ncsets--; 1230 } 1231 1232 /* 1233 - freezeset - final processing on a set of characters 1234 == static int freezeset(register struct parse *p, register cset *cs); 1235 * 1236 * The main task here is merging identical sets. This is usually a waste 1237 * of time (although the hash code minimizes the overhead), but can win 1238 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1239 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1240 * the same value! 1241 */ 1242 static int /* set number */ 1243 freezeset(p, cs) 1244 register struct parse *p; 1245 register cset *cs; 1246 { 1247 register short h = cs->hash; 1248 register int i; 1249 register cset *top = &p->g->sets[p->g->ncsets]; 1250 register cset *cs2; 1251 register size_t css = (size_t)p->g->csetsize; 1252 1253 /* look for an earlier one which is the same */ 1254 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1255 if (cs2->hash == h && cs2 != cs) { 1256 /* maybe */ 1257 for (i = 0; i < css; i++) 1258 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1259 break; /* no */ 1260 if (i == css) 1261 break; /* yes */ 1262 } 1263 1264 if (cs2 < top) { /* found one */ 1265 freeset(p, cs); 1266 cs = cs2; 1267 } 1268 1269 return((int)(cs - p->g->sets)); 1270 } 1271 1272 /* 1273 - firstch - return first character in a set (which must have at least one) 1274 == static int firstch(register struct parse *p, register cset *cs); 1275 */ 1276 static int /* character; there is no "none" value */ 1277 firstch(p, cs) 1278 register struct parse *p; 1279 register cset *cs; 1280 { 1281 register int i; 1282 register size_t css = (size_t)p->g->csetsize; 1283 1284 for (i = 0; i < css; i++) 1285 if (CHIN(cs, i)) 1286 return((unsigned char)i); 1287 assert(never); 1288 return(0); /* arbitrary */ 1289 } 1290 1291 /* 1292 - nch - number of characters in a set 1293 == static int nch(register struct parse *p, register cset *cs); 1294 */ 1295 static int 1296 nch(p, cs) 1297 register struct parse *p; 1298 register cset *cs; 1299 { 1300 register int i; 1301 register size_t css = (size_t)p->g->csetsize; 1302 register int n = 0; 1303 1304 for (i = 0; i < css; i++) 1305 if (CHIN(cs, i)) 1306 n++; 1307 return(n); 1308 } 1309 1310 /* 1311 - mcadd - add a collating element to a cset 1312 == static void mcadd(register struct parse *p, register cset *cs, \ 1313 == register char *cp); 1314 */ 1315 static void 1316 mcadd(p, cs, cp) 1317 register struct parse *p; 1318 register cset *cs; 1319 register char *cp; 1320 { 1321 register size_t oldend = cs->smultis; 1322 1323 cs->smultis += strlen(cp) + 1; 1324 if (cs->multis == NULL) 1325 cs->multis = malloc(cs->smultis); 1326 else 1327 cs->multis = realloc(cs->multis, cs->smultis); 1328 if (cs->multis == NULL) { 1329 SETERROR(REG_ESPACE); 1330 return; 1331 } 1332 1333 (void) strcpy(cs->multis + oldend - 1, cp); 1334 cs->multis[cs->smultis - 1] = '\0'; 1335 } 1336 1337 #if used 1338 /* 1339 - mcsub - subtract a collating element from a cset 1340 == static void mcsub(register cset *cs, register char *cp); 1341 */ 1342 static void 1343 mcsub(cs, cp) 1344 register cset *cs; 1345 register char *cp; 1346 { 1347 register char *fp = mcfind(cs, cp); 1348 register size_t len = strlen(fp); 1349 1350 assert(fp != NULL); 1351 (void) memmove(fp, fp + len + 1, 1352 cs->smultis - (fp + len + 1 - cs->multis)); 1353 cs->smultis -= len; 1354 1355 if (cs->smultis == 0) { 1356 free(cs->multis); 1357 cs->multis = NULL; 1358 return; 1359 } 1360 1361 cs->multis = realloc(cs->multis, cs->smultis); 1362 assert(cs->multis != NULL); 1363 } 1364 1365 /* 1366 - mcin - is a collating element in a cset? 1367 == static int mcin(register cset *cs, register char *cp); 1368 */ 1369 static int 1370 mcin(cs, cp) 1371 register cset *cs; 1372 register char *cp; 1373 { 1374 return(mcfind(cs, cp) != NULL); 1375 } 1376 1377 /* 1378 - mcfind - find a collating element in a cset 1379 == static char *mcfind(register cset *cs, register char *cp); 1380 */ 1381 static char * 1382 mcfind(cs, cp) 1383 register cset *cs; 1384 register char *cp; 1385 { 1386 register char *p; 1387 1388 if (cs->multis == NULL) 1389 return(NULL); 1390 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1391 if (strcmp(cp, p) == 0) 1392 return(p); 1393 return(NULL); 1394 } 1395 #endif 1396 1397 /* 1398 - mcinvert - invert the list of collating elements in a cset 1399 == static void mcinvert(register struct parse *p, register cset *cs); 1400 * 1401 * This would have to know the set of possibilities. Implementation 1402 * is deferred. 1403 */ 1404 static void 1405 mcinvert(p, cs) 1406 register struct parse *p; 1407 register cset *cs; 1408 { 1409 assert(cs->multis == NULL); /* xxx */ 1410 } 1411 1412 /* 1413 - mccase - add case counterparts of the list of collating elements in a cset 1414 == static void mccase(register struct parse *p, register cset *cs); 1415 * 1416 * This would have to know the set of possibilities. Implementation 1417 * is deferred. 1418 */ 1419 static void 1420 mccase(p, cs) 1421 register struct parse *p; 1422 register cset *cs; 1423 { 1424 assert(cs->multis == NULL); /* xxx */ 1425 } 1426 1427 /* 1428 - isinsets - is this character in any sets? 1429 == static int isinsets(register struct re_guts *g, int c); 1430 */ 1431 static int /* predicate */ 1432 isinsets(g, c) 1433 register struct re_guts *g; 1434 int c; 1435 { 1436 register uch *col; 1437 register int i; 1438 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1439 register unsigned uc = (unsigned char)c; 1440 1441 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1442 if (col[uc] != 0) 1443 return(1); 1444 return(0); 1445 } 1446 1447 /* 1448 - samesets - are these two characters in exactly the same sets? 1449 == static int samesets(register struct re_guts *g, int c1, int c2); 1450 */ 1451 static int /* predicate */ 1452 samesets(g, c1, c2) 1453 register struct re_guts *g; 1454 int c1; 1455 int c2; 1456 { 1457 register uch *col; 1458 register int i; 1459 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1460 register unsigned uc1 = (unsigned char)c1; 1461 register unsigned uc2 = (unsigned char)c2; 1462 1463 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1464 if (col[uc1] != col[uc2]) 1465 return(0); 1466 return(1); 1467 } 1468 1469 /* 1470 - categorize - sort out character categories 1471 == static void categorize(struct parse *p, register struct re_guts *g); 1472 */ 1473 static void 1474 categorize(p, g) 1475 struct parse *p; 1476 register struct re_guts *g; 1477 { 1478 register cat_t *cats = g->categories; 1479 register int c; 1480 register int c2; 1481 register cat_t cat; 1482 1483 /* avoid making error situations worse */ 1484 if (p->error != 0) 1485 return; 1486 1487 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1488 if (cats[c] == 0 && isinsets(g, c)) { 1489 cat = g->ncategories++; 1490 cats[c] = cat; 1491 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1492 if (cats[c2] == 0 && samesets(g, c, c2)) 1493 cats[c2] = cat; 1494 } 1495 } 1496 1497 /* 1498 - dupl - emit a duplicate of a bunch of sops 1499 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1500 */ 1501 static sopno /* start of duplicate */ 1502 dupl(p, start, finish) 1503 register struct parse *p; 1504 sopno start; /* from here */ 1505 sopno finish; /* to this less one */ 1506 { 1507 register sopno ret = HERE(); 1508 register sopno len = finish - start; 1509 1510 assert(finish >= start); 1511 if (len == 0) 1512 return(ret); 1513 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1514 assert(p->ssize >= p->slen + len); 1515 (void) memcpy((char *)(p->strip + p->slen), 1516 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1517 p->slen += len; 1518 return(ret); 1519 } 1520 1521 /* 1522 - doemit - emit a strip operator 1523 == static void doemit(register struct parse *p, sop op, size_t opnd); 1524 * 1525 * It might seem better to implement this as a macro with a function as 1526 * hard-case backup, but it's just too big and messy unless there are 1527 * some changes to the data structures. Maybe later. 1528 */ 1529 static void 1530 doemit(p, op, opnd) 1531 register struct parse *p; 1532 sop op; 1533 size_t opnd; 1534 { 1535 /* avoid making error situations worse */ 1536 if (p->error != 0) 1537 return; 1538 1539 /* deal with oversize operands ("can't happen", more or less) */ 1540 assert(opnd < 1<<OPSHIFT); 1541 1542 /* deal with undersized strip */ 1543 if (p->slen >= p->ssize) 1544 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1545 assert(p->slen < p->ssize); 1546 1547 /* finally, it's all reduced to the easy case */ 1548 p->strip[p->slen++] = SOP(op, opnd); 1549 } 1550 1551 /* 1552 - doinsert - insert a sop into the strip 1553 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1554 */ 1555 static void 1556 doinsert(p, op, opnd, pos) 1557 register struct parse *p; 1558 sop op; 1559 size_t opnd; 1560 sopno pos; 1561 { 1562 register sopno sn; 1563 register sop s; 1564 register int i; 1565 1566 /* avoid making error situations worse */ 1567 if (p->error != 0) 1568 return; 1569 1570 sn = HERE(); 1571 EMIT(op, opnd); /* do checks, ensure space */ 1572 assert(HERE() == sn+1); 1573 s = p->strip[sn]; 1574 1575 /* adjust paren pointers */ 1576 assert(pos > 0); 1577 for (i = 1; i < NPAREN; i++) { 1578 if (p->pbegin[i] >= pos) { 1579 p->pbegin[i]++; 1580 } 1581 if (p->pend[i] >= pos) { 1582 p->pend[i]++; 1583 } 1584 } 1585 1586 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1587 (HERE()-pos-1)*sizeof(sop)); 1588 p->strip[pos] = s; 1589 } 1590 1591 /* 1592 - dofwd - complete a forward reference 1593 == static void dofwd(register struct parse *p, sopno pos, sop value); 1594 */ 1595 static void 1596 dofwd(p, pos, value) 1597 register struct parse *p; 1598 register sopno pos; 1599 sop value; 1600 { 1601 /* avoid making error situations worse */ 1602 if (p->error != 0) 1603 return; 1604 1605 assert(value < 1<<OPSHIFT); 1606 p->strip[pos] = OP(p->strip[pos]) | value; 1607 } 1608 1609 /* 1610 - enlarge - enlarge the strip 1611 == static void enlarge(register struct parse *p, sopno size); 1612 */ 1613 static void 1614 enlarge(p, size) 1615 register struct parse *p; 1616 register sopno size; 1617 { 1618 register sop *sp; 1619 1620 if (p->ssize >= size) 1621 return; 1622 1623 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1624 if (sp == NULL) { 1625 SETERROR(REG_ESPACE); 1626 return; 1627 } 1628 p->strip = sp; 1629 p->ssize = size; 1630 } 1631 1632 /* 1633 - stripsnug - compact the strip 1634 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1635 */ 1636 static void 1637 stripsnug(p, g) 1638 register struct parse *p; 1639 register struct re_guts *g; 1640 { 1641 g->nstates = p->slen; 1642 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1643 if (g->strip == NULL) { 1644 SETERROR(REG_ESPACE); 1645 g->strip = p->strip; 1646 } 1647 } 1648 1649 /* 1650 - findmust - fill in must and mlen with longest mandatory literal string 1651 == static void findmust(register struct parse *p, register struct re_guts *g); 1652 * 1653 * This algorithm could do fancy things like analyzing the operands of | 1654 * for common subsequences. Someday. This code is simple and finds most 1655 * of the interesting cases. 1656 * 1657 * Note that must and mlen got initialized during setup. 1658 */ 1659 static void 1660 findmust(p, g) 1661 struct parse *p; 1662 register struct re_guts *g; 1663 { 1664 register sop *scan; 1665 sop *start; 1666 register sop *newstart; 1667 register sopno newlen; 1668 register sop s; 1669 register char *cp; 1670 register sopno i; 1671 1672 /* avoid making error situations worse */ 1673 if (p->error != 0) 1674 return; 1675 1676 /* find the longest OCHAR sequence in strip */ 1677 newlen = 0; 1678 scan = g->strip + 1; 1679 do { 1680 s = *scan++; 1681 switch (OP(s)) { 1682 case OCHAR: /* sequence member */ 1683 if (newlen == 0) /* new sequence */ 1684 newstart = scan - 1; 1685 newlen++; 1686 break; 1687 case OPLUS_: /* things that don't break one */ 1688 case OLPAREN: 1689 case ORPAREN: 1690 break; 1691 case OQUEST_: /* things that must be skipped */ 1692 case OCH_: 1693 scan--; 1694 do { 1695 scan += OPND(s); 1696 s = *scan; 1697 /* assert() interferes w debug printouts */ 1698 if (OP(s) != O_QUEST && OP(s) != O_CH && 1699 OP(s) != OOR2) { 1700 g->iflags |= BAD; 1701 return; 1702 } 1703 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1704 /* fallthrough */ 1705 default: /* things that break a sequence */ 1706 if (newlen > g->mlen) { /* ends one */ 1707 start = newstart; 1708 g->mlen = newlen; 1709 } 1710 newlen = 0; 1711 break; 1712 } 1713 } while (OP(s) != OEND); 1714 1715 if (g->mlen == 0) /* there isn't one */ 1716 return; 1717 1718 /* turn it into a character string */ 1719 g->must = malloc((size_t)g->mlen + 1); 1720 if (g->must == NULL) { /* argh; just forget it */ 1721 g->mlen = 0; 1722 return; 1723 } 1724 cp = g->must; 1725 scan = start; 1726 for (i = g->mlen; i > 0; i--) { 1727 while (OP(s = *scan++) != OCHAR) 1728 continue; 1729 assert(cp < g->must + g->mlen); 1730 *cp++ = (char)OPND(s); 1731 } 1732 assert(cp == g->must + g->mlen); 1733 *cp++ = '\0'; /* just on general principles */ 1734 } 1735 1736 /* 1737 - pluscount - count + nesting 1738 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1739 */ 1740 static sopno /* nesting depth */ 1741 pluscount(p, g) 1742 struct parse *p; 1743 register struct re_guts *g; 1744 { 1745 register sop *scan; 1746 register sop s; 1747 register sopno plusnest = 0; 1748 register sopno maxnest = 0; 1749 1750 if (p->error != 0) 1751 return(0); /* there may not be an OEND */ 1752 1753 scan = g->strip + 1; 1754 do { 1755 s = *scan++; 1756 switch (OP(s)) { 1757 case OPLUS_: 1758 plusnest++; 1759 break; 1760 case O_PLUS: 1761 if (plusnest > maxnest) 1762 maxnest = plusnest; 1763 plusnest--; 1764 break; 1765 } 1766 } while (OP(s) != OEND); 1767 if (plusnest != 0) 1768 g->iflags |= BAD; 1769 return(maxnest); 1770 } 1771