1 /*- 2 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 3 * Copyright (c) 1992, 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Henry Spencer. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 38 */ 39 40 #if defined(LIBC_SCCS) && !defined(lint) 41 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 42 #endif /* LIBC_SCCS and not lint */ 43 44 #include <sys/types.h> 45 #include <stdio.h> 46 #include <string.h> 47 #include <ctype.h> 48 #include <limits.h> 49 #include <stdlib.h> 50 #include <regex.h> 51 52 #include "utils.h" 53 #include "regex2.h" 54 55 #include "cclass.h" 56 #include "cname.h" 57 58 /* 59 * parse structure, passed up and down to avoid global variables and 60 * other clumsinesses 61 */ 62 struct parse { 63 char *next; /* next character in RE */ 64 char *end; /* end of string (-> NUL normally) */ 65 int error; /* has an error been seen? */ 66 sop *strip; /* malloced strip */ 67 sopno ssize; /* malloced strip size (allocated) */ 68 sopno slen; /* malloced strip length (used) */ 69 int ncsalloc; /* number of csets allocated */ 70 struct re_guts *g; 71 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 72 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 73 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 74 }; 75 76 /* ========= begin header generated by ./mkh ========= */ 77 #ifdef __cplusplus 78 extern "C" { 79 #endif 80 81 /* === regcomp.c === */ 82 static void p_ere __P((struct parse *p, int stop)); 83 static void p_ere_exp __P((struct parse *p)); 84 static void p_str __P((struct parse *p)); 85 static void p_bre __P((struct parse *p, int end1, int end2)); 86 static int p_simp_re __P((struct parse *p, int starordinary)); 87 static int p_count __P((struct parse *p)); 88 static void p_bracket __P((struct parse *p)); 89 static void p_b_term __P((struct parse *p, cset *cs)); 90 static void p_b_cclass __P((struct parse *p, cset *cs)); 91 static void p_b_eclass __P((struct parse *p, cset *cs)); 92 static char p_b_symbol __P((struct parse *p)); 93 static char p_b_coll_elem __P((struct parse *p, int endc)); 94 static char othercase __P((int ch)); 95 static void bothcases __P((struct parse *p, int ch)); 96 static void ordinary __P((struct parse *p, int ch)); 97 static void nonnewline __P((struct parse *p)); 98 static void repeat __P((struct parse *p, sopno start, int from, int to)); 99 static int seterr __P((struct parse *p, int e)); 100 static cset *allocset __P((struct parse *p)); 101 static void freeset __P((struct parse *p, cset *cs)); 102 static int freezeset __P((struct parse *p, cset *cs)); 103 static int firstch __P((struct parse *p, cset *cs)); 104 static int nch __P((struct parse *p, cset *cs)); 105 static void mcadd __P((struct parse *p, cset *cs, char *cp)); 106 static void mcsub __P((cset *cs, char *cp)); 107 static int mcin __P((cset *cs, char *cp)); 108 static char *mcfind __P((cset *cs, char *cp)); 109 static void mcinvert __P((struct parse *p, cset *cs)); 110 static void mccase __P((struct parse *p, cset *cs)); 111 static int isinsets __P((struct re_guts *g, int c)); 112 static int samesets __P((struct re_guts *g, int c1, int c2)); 113 static void categorize __P((struct parse *p, struct re_guts *g)); 114 static sopno dupl __P((struct parse *p, sopno start, sopno finish)); 115 static void doemit __P((struct parse *p, sop op, size_t opnd)); 116 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos)); 117 static void dofwd __P((struct parse *p, sopno pos, sop value)); 118 static void enlarge __P((struct parse *p, sopno size)); 119 static void stripsnug __P((struct parse *p, struct re_guts *g)); 120 static void findmust __P((struct parse *p, struct re_guts *g)); 121 static sopno pluscount __P((struct parse *p, struct re_guts *g)); 122 123 #ifdef __cplusplus 124 } 125 #endif 126 /* ========= end header generated by ./mkh ========= */ 127 128 static char nuls[10]; /* place to point scanner in event of error */ 129 130 /* 131 * macros for use with parse structure 132 * BEWARE: these know that the parse structure is named `p' !!! 133 */ 134 #define PEEK() (*p->next) 135 #define PEEK2() (*(p->next+1)) 136 #define MORE() (p->next < p->end) 137 #define MORE2() (p->next+1 < p->end) 138 #define SEE(c) (MORE() && PEEK() == (c)) 139 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 140 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 141 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 142 #define NEXT() (p->next++) 143 #define NEXT2() (p->next += 2) 144 #define NEXTn(n) (p->next += (n)) 145 #define GETNEXT() (*p->next++) 146 #define SETERROR(e) seterr(p, (e)) 147 #define REQUIRE(co, e) ((co) || SETERROR(e)) 148 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 149 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 150 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 151 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 152 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 153 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 154 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 155 #define HERE() (p->slen) 156 #define THERE() (p->slen - 1) 157 #define THERETHERE() (p->slen - 2) 158 #define DROP(n) (p->slen -= (n)) 159 160 #ifndef NDEBUG 161 static int never = 0; /* for use in asserts; shuts lint up */ 162 #else 163 #define never 0 /* some <assert.h>s have bugs too */ 164 #endif 165 166 /* 167 - regcomp - interface for parser and compilation 168 = extern int regcomp(regex_t *, const char *, int); 169 = #define REG_BASIC 0000 170 = #define REG_EXTENDED 0001 171 = #define REG_ICASE 0002 172 = #define REG_NOSUB 0004 173 = #define REG_NEWLINE 0010 174 = #define REG_NOSPEC 0020 175 = #define REG_PEND 0040 176 = #define REG_DUMP 0200 177 */ 178 int /* 0 success, otherwise REG_something */ 179 regcomp(preg, pattern, cflags) 180 regex_t *preg; 181 const char *pattern; 182 int cflags; 183 { 184 struct parse pa; 185 register struct re_guts *g; 186 register struct parse *p = &pa; 187 register int i; 188 register size_t len; 189 #ifdef REDEBUG 190 # define GOODFLAGS(f) (f) 191 #else 192 # define GOODFLAGS(f) ((f)&~REG_DUMP) 193 #endif 194 195 cflags = GOODFLAGS(cflags); 196 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 197 return(REG_INVARG); 198 199 if (cflags®_PEND) { 200 if (preg->re_endp < pattern) 201 return(REG_INVARG); 202 len = preg->re_endp - pattern; 203 } else 204 len = strlen((char *)pattern); 205 206 /* do the mallocs early so failure handling is easy */ 207 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 208 (NC-1)*sizeof(cat_t)); 209 if (g == NULL) 210 return(REG_ESPACE); 211 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 212 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); 213 p->slen = 0; 214 if (p->strip == NULL) { 215 free((char *)g); 216 return(REG_ESPACE); 217 } 218 219 /* set things up */ 220 p->g = g; 221 p->next = (char *)pattern; /* convenience; we do not modify it */ 222 p->end = p->next + len; 223 p->error = 0; 224 p->ncsalloc = 0; 225 for (i = 0; i < NPAREN; i++) { 226 p->pbegin[i] = 0; 227 p->pend[i] = 0; 228 } 229 g->csetsize = NC; 230 g->sets = NULL; 231 g->setbits = NULL; 232 g->ncsets = 0; 233 g->cflags = cflags; 234 g->iflags = 0; 235 g->nbol = 0; 236 g->neol = 0; 237 g->must = NULL; 238 g->mlen = 0; 239 g->nsub = 0; 240 g->ncategories = 1; /* category 0 is "everything else" */ 241 g->categories = &g->catspace[-(CHAR_MIN)]; 242 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 243 g->backrefs = 0; 244 245 /* do it */ 246 EMIT(OEND, 0); 247 g->firststate = THERE(); 248 if (cflags®_EXTENDED) 249 p_ere(p, OUT); 250 else if (cflags®_NOSPEC) 251 p_str(p); 252 else 253 p_bre(p, OUT, OUT); 254 EMIT(OEND, 0); 255 g->laststate = THERE(); 256 257 /* tidy up loose ends and fill things in */ 258 categorize(p, g); 259 stripsnug(p, g); 260 findmust(p, g); 261 g->nplus = pluscount(p, g); 262 g->magic = MAGIC2; 263 preg->re_nsub = g->nsub; 264 preg->re_g = g; 265 preg->re_magic = MAGIC1; 266 #ifndef REDEBUG 267 /* not debugging, so can't rely on the assert() in regexec() */ 268 if (g->iflags&BAD) 269 SETERROR(REG_ASSERT); 270 #endif 271 272 /* win or lose, we're done */ 273 if (p->error != 0) /* lose */ 274 regfree(preg); 275 return(p->error); 276 } 277 278 /* 279 - p_ere - ERE parser top level, concatenation and alternation 280 == static void p_ere(register struct parse *p, int stop); 281 */ 282 static void 283 p_ere(p, stop) 284 register struct parse *p; 285 int stop; /* character this ERE should end at */ 286 { 287 register char c; 288 register sopno prevback; 289 register sopno prevfwd; 290 register sopno conc; 291 register int first = 1; /* is this the first alternative? */ 292 293 for (;;) { 294 /* do a bunch of concatenated expressions */ 295 conc = HERE(); 296 while (MORE() && (c = PEEK()) != '|' && c != stop) 297 p_ere_exp(p); 298 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 299 300 if (!EAT('|')) 301 break; /* NOTE BREAK OUT */ 302 303 if (first) { 304 INSERT(OCH_, conc); /* offset is wrong */ 305 prevfwd = conc; 306 prevback = conc; 307 first = 0; 308 } 309 ASTERN(OOR1, prevback); 310 prevback = THERE(); 311 AHEAD(prevfwd); /* fix previous offset */ 312 prevfwd = HERE(); 313 EMIT(OOR2, 0); /* offset is very wrong */ 314 } 315 316 if (!first) { /* tail-end fixups */ 317 AHEAD(prevfwd); 318 ASTERN(O_CH, prevback); 319 } 320 321 assert(!MORE() || SEE(stop)); 322 } 323 324 /* 325 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 326 == static void p_ere_exp(register struct parse *p); 327 */ 328 static void 329 p_ere_exp(p) 330 register struct parse *p; 331 { 332 register char c; 333 register sopno pos; 334 register int count; 335 register int count2; 336 register sopno subno; 337 int wascaret = 0; 338 339 assert(MORE()); /* caller should have ensured this */ 340 c = GETNEXT(); 341 342 pos = HERE(); 343 switch (c) { 344 case '(': 345 REQUIRE(MORE(), REG_EPAREN); 346 p->g->nsub++; 347 subno = p->g->nsub; 348 if (subno < NPAREN) 349 p->pbegin[subno] = HERE(); 350 EMIT(OLPAREN, subno); 351 if (!SEE(')')) 352 p_ere(p, ')'); 353 if (subno < NPAREN) { 354 p->pend[subno] = HERE(); 355 assert(p->pend[subno] != 0); 356 } 357 EMIT(ORPAREN, subno); 358 MUSTEAT(')', REG_EPAREN); 359 break; 360 #ifndef POSIX_MISTAKE 361 case ')': /* happens only if no current unmatched ( */ 362 /* 363 * You may ask, why the ifndef? Because I didn't notice 364 * this until slightly too late for 1003.2, and none of the 365 * other 1003.2 regular-expression reviewers noticed it at 366 * all. So an unmatched ) is legal POSIX, at least until 367 * we can get it fixed. 368 */ 369 SETERROR(REG_EPAREN); 370 break; 371 #endif 372 case '^': 373 EMIT(OBOL, 0); 374 p->g->iflags |= USEBOL; 375 p->g->nbol++; 376 wascaret = 1; 377 break; 378 case '$': 379 EMIT(OEOL, 0); 380 p->g->iflags |= USEEOL; 381 p->g->neol++; 382 break; 383 case '|': 384 SETERROR(REG_EMPTY); 385 break; 386 case '*': 387 case '+': 388 case '?': 389 SETERROR(REG_BADRPT); 390 break; 391 case '.': 392 if (p->g->cflags®_NEWLINE) 393 nonnewline(p); 394 else 395 EMIT(OANY, 0); 396 break; 397 case '[': 398 p_bracket(p); 399 break; 400 case '\\': 401 REQUIRE(MORE(), REG_EESCAPE); 402 c = GETNEXT(); 403 ordinary(p, c); 404 break; 405 case '{': /* okay as ordinary except if digit follows */ 406 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); 407 /* FALLTHROUGH */ 408 default: 409 ordinary(p, c); 410 break; 411 } 412 413 if (!MORE()) 414 return; 415 c = PEEK(); 416 /* we call { a repetition if followed by a digit */ 417 if (!( c == '*' || c == '+' || c == '?' || 418 (c == '{' && MORE2() && isdigit(PEEK2())) )) 419 return; /* no repetition, we're done */ 420 NEXT(); 421 422 REQUIRE(!wascaret, REG_BADRPT); 423 switch (c) { 424 case '*': /* implemented as +? */ 425 /* this case does not require the (y|) trick, noKLUDGE */ 426 INSERT(OPLUS_, pos); 427 ASTERN(O_PLUS, pos); 428 INSERT(OQUEST_, pos); 429 ASTERN(O_QUEST, pos); 430 break; 431 case '+': 432 INSERT(OPLUS_, pos); 433 ASTERN(O_PLUS, pos); 434 break; 435 case '?': 436 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 437 INSERT(OCH_, pos); /* offset slightly wrong */ 438 ASTERN(OOR1, pos); /* this one's right */ 439 AHEAD(pos); /* fix the OCH_ */ 440 EMIT(OOR2, 0); /* offset very wrong... */ 441 AHEAD(THERE()); /* ...so fix it */ 442 ASTERN(O_CH, THERETHERE()); 443 break; 444 case '{': 445 count = p_count(p); 446 if (EAT(',')) { 447 if (isdigit(PEEK())) { 448 count2 = p_count(p); 449 REQUIRE(count <= count2, REG_BADBR); 450 } else /* single number with comma */ 451 count2 = INFINITY; 452 } else /* just a single number */ 453 count2 = count; 454 repeat(p, pos, count, count2); 455 if (!EAT('}')) { /* error heuristics */ 456 while (MORE() && PEEK() != '}') 457 NEXT(); 458 REQUIRE(MORE(), REG_EBRACE); 459 SETERROR(REG_BADBR); 460 } 461 break; 462 } 463 464 if (!MORE()) 465 return; 466 c = PEEK(); 467 if (!( c == '*' || c == '+' || c == '?' || 468 (c == '{' && MORE2() && isdigit(PEEK2())) ) ) 469 return; 470 SETERROR(REG_BADRPT); 471 } 472 473 /* 474 - p_str - string (no metacharacters) "parser" 475 == static void p_str(register struct parse *p); 476 */ 477 static void 478 p_str(p) 479 register struct parse *p; 480 { 481 REQUIRE(MORE(), REG_EMPTY); 482 while (MORE()) 483 ordinary(p, GETNEXT()); 484 } 485 486 /* 487 - p_bre - BRE parser top level, anchoring and concatenation 488 == static void p_bre(register struct parse *p, register int end1, \ 489 == register int end2); 490 * Giving end1 as OUT essentially eliminates the end1/end2 check. 491 * 492 * This implementation is a bit of a kludge, in that a trailing $ is first 493 * taken as an ordinary character and then revised to be an anchor. The 494 * only undesirable side effect is that '$' gets included as a character 495 * category in such cases. This is fairly harmless; not worth fixing. 496 * The amount of lookahead needed to avoid this kludge is excessive. 497 */ 498 static void 499 p_bre(p, end1, end2) 500 register struct parse *p; 501 register int end1; /* first terminating character */ 502 register int end2; /* second terminating character */ 503 { 504 register sopno start = HERE(); 505 register int first = 1; /* first subexpression? */ 506 register int wasdollar = 0; 507 508 if (EAT('^')) { 509 EMIT(OBOL, 0); 510 p->g->iflags |= USEBOL; 511 p->g->nbol++; 512 } 513 while (MORE() && !SEETWO(end1, end2)) { 514 wasdollar = p_simp_re(p, first); 515 first = 0; 516 } 517 if (wasdollar) { /* oops, that was a trailing anchor */ 518 DROP(1); 519 EMIT(OEOL, 0); 520 p->g->iflags |= USEEOL; 521 p->g->neol++; 522 } 523 524 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 525 } 526 527 /* 528 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 529 == static int p_simp_re(register struct parse *p, int starordinary); 530 */ 531 static int /* was the simple RE an unbackslashed $? */ 532 p_simp_re(p, starordinary) 533 register struct parse *p; 534 int starordinary; /* is a leading * an ordinary character? */ 535 { 536 register int c; 537 register int count; 538 register int count2; 539 register sopno pos; 540 register int i; 541 register sopno subno; 542 # define BACKSL (1<<CHAR_BIT) 543 544 pos = HERE(); /* repetion op, if any, covers from here */ 545 546 assert(MORE()); /* caller should have ensured this */ 547 c = GETNEXT(); 548 if (c == '\\') { 549 REQUIRE(MORE(), REG_EESCAPE); 550 c = BACKSL | (unsigned char)GETNEXT(); 551 } 552 switch (c) { 553 case '.': 554 if (p->g->cflags®_NEWLINE) 555 nonnewline(p); 556 else 557 EMIT(OANY, 0); 558 break; 559 case '[': 560 p_bracket(p); 561 break; 562 case BACKSL|'{': 563 SETERROR(REG_BADRPT); 564 break; 565 case BACKSL|'(': 566 p->g->nsub++; 567 subno = p->g->nsub; 568 if (subno < NPAREN) 569 p->pbegin[subno] = HERE(); 570 EMIT(OLPAREN, subno); 571 /* the MORE here is an error heuristic */ 572 if (MORE() && !SEETWO('\\', ')')) 573 p_bre(p, '\\', ')'); 574 if (subno < NPAREN) { 575 p->pend[subno] = HERE(); 576 assert(p->pend[subno] != 0); 577 } 578 EMIT(ORPAREN, subno); 579 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 580 break; 581 case BACKSL|')': /* should not get here -- must be user */ 582 case BACKSL|'}': 583 SETERROR(REG_EPAREN); 584 break; 585 case BACKSL|'1': 586 case BACKSL|'2': 587 case BACKSL|'3': 588 case BACKSL|'4': 589 case BACKSL|'5': 590 case BACKSL|'6': 591 case BACKSL|'7': 592 case BACKSL|'8': 593 case BACKSL|'9': 594 i = (c&~BACKSL) - '0'; 595 assert(i < NPAREN); 596 if (p->pend[i] != 0) { 597 assert(i <= p->g->nsub); 598 EMIT(OBACK_, i); 599 assert(p->pbegin[i] != 0); 600 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 601 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 602 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 603 EMIT(O_BACK, i); 604 } else 605 SETERROR(REG_ESUBREG); 606 p->g->backrefs = 1; 607 break; 608 case '*': 609 REQUIRE(starordinary, REG_BADRPT); 610 /* FALLTHROUGH */ 611 default: 612 ordinary(p, c &~ BACKSL); 613 break; 614 } 615 616 if (EAT('*')) { /* implemented as +? */ 617 /* this case does not require the (y|) trick, noKLUDGE */ 618 INSERT(OPLUS_, pos); 619 ASTERN(O_PLUS, pos); 620 INSERT(OQUEST_, pos); 621 ASTERN(O_QUEST, pos); 622 } else if (EATTWO('\\', '{')) { 623 count = p_count(p); 624 if (EAT(',')) { 625 if (MORE() && isdigit(PEEK())) { 626 count2 = p_count(p); 627 REQUIRE(count <= count2, REG_BADBR); 628 } else /* single number with comma */ 629 count2 = INFINITY; 630 } else /* just a single number */ 631 count2 = count; 632 repeat(p, pos, count, count2); 633 if (!EATTWO('\\', '}')) { /* error heuristics */ 634 while (MORE() && !SEETWO('\\', '}')) 635 NEXT(); 636 REQUIRE(MORE(), REG_EBRACE); 637 SETERROR(REG_BADBR); 638 } 639 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 640 return(1); 641 642 return(0); 643 } 644 645 /* 646 - p_count - parse a repetition count 647 == static int p_count(register struct parse *p); 648 */ 649 static int /* the value */ 650 p_count(p) 651 register struct parse *p; 652 { 653 register int count = 0; 654 register int ndigits = 0; 655 656 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) { 657 count = count*10 + (GETNEXT() - '0'); 658 ndigits++; 659 } 660 661 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 662 return(count); 663 } 664 665 /* 666 - p_bracket - parse a bracketed character list 667 == static void p_bracket(register struct parse *p); 668 * 669 * Note a significant property of this code: if the allocset() did SETERROR, 670 * no set operations are done. 671 */ 672 static void 673 p_bracket(p) 674 register struct parse *p; 675 { 676 register cset *cs = allocset(p); 677 register int invert = 0; 678 679 /* Dept of Truly Sickening Special-Case Kludges */ 680 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 681 EMIT(OBOW, 0); 682 NEXTn(6); 683 return; 684 } 685 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 686 EMIT(OEOW, 0); 687 NEXTn(6); 688 return; 689 } 690 691 if (EAT('^')) 692 invert++; /* make note to invert set at end */ 693 if (EAT(']')) 694 CHadd(cs, ']'); 695 else if (EAT('-')) 696 CHadd(cs, '-'); 697 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 698 p_b_term(p, cs); 699 if (EAT('-')) 700 CHadd(cs, '-'); 701 MUSTEAT(']', REG_EBRACK); 702 703 if (p->error != 0) /* don't mess things up further */ 704 return; 705 706 if (p->g->cflags®_ICASE) { 707 register int i; 708 register int ci; 709 710 for (i = p->g->csetsize - 1; i >= 0; i--) 711 if (CHIN(cs, i) && isalpha(i)) { 712 ci = othercase(i); 713 if (ci != i) 714 CHadd(cs, ci); 715 } 716 if (cs->multis != NULL) 717 mccase(p, cs); 718 } 719 if (invert) { 720 register int i; 721 722 for (i = p->g->csetsize - 1; i >= 0; i--) 723 if (CHIN(cs, i)) 724 CHsub(cs, i); 725 else 726 CHadd(cs, i); 727 if (p->g->cflags®_NEWLINE) 728 CHsub(cs, '\n'); 729 if (cs->multis != NULL) 730 mcinvert(p, cs); 731 } 732 733 assert(cs->multis == NULL); /* xxx */ 734 735 if (nch(p, cs) == 1) { /* optimize singleton sets */ 736 ordinary(p, firstch(p, cs)); 737 freeset(p, cs); 738 } else 739 EMIT(OANYOF, freezeset(p, cs)); 740 } 741 742 /* 743 - p_b_term - parse one term of a bracketed character list 744 == static void p_b_term(register struct parse *p, register cset *cs); 745 */ 746 static void 747 p_b_term(p, cs) 748 register struct parse *p; 749 register cset *cs; 750 { 751 register char c; 752 register char start, finish; 753 register int i; 754 755 /* classify what we've got */ 756 switch ((MORE()) ? PEEK() : '\0') { 757 case '[': 758 c = (MORE2()) ? PEEK2() : '\0'; 759 break; 760 case '-': 761 SETERROR(REG_ERANGE); 762 return; /* NOTE RETURN */ 763 break; 764 default: 765 c = '\0'; 766 break; 767 } 768 769 switch (c) { 770 case ':': /* character class */ 771 NEXT2(); 772 REQUIRE(MORE(), REG_EBRACK); 773 c = PEEK(); 774 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 775 p_b_cclass(p, cs); 776 REQUIRE(MORE(), REG_EBRACK); 777 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 778 break; 779 case '=': /* equivalence class */ 780 NEXT2(); 781 REQUIRE(MORE(), REG_EBRACK); 782 c = PEEK(); 783 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 784 p_b_eclass(p, cs); 785 REQUIRE(MORE(), REG_EBRACK); 786 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 787 break; 788 default: /* symbol, ordinary character, or range */ 789 /* xxx revision needed for multichar stuff */ 790 start = p_b_symbol(p); 791 if (SEE('-') && MORE2() && PEEK2() != ']') { 792 /* range */ 793 NEXT(); 794 if (EAT('-')) 795 finish = '-'; 796 else 797 finish = p_b_symbol(p); 798 } else 799 finish = start; 800 /* xxx what about signed chars here... */ 801 REQUIRE(start <= finish, REG_ERANGE); 802 for (i = start; i <= finish; i++) 803 CHadd(cs, i); 804 break; 805 } 806 } 807 808 /* 809 - p_b_cclass - parse a character-class name and deal with it 810 == static void p_b_cclass(register struct parse *p, register cset *cs); 811 */ 812 static void 813 p_b_cclass(p, cs) 814 register struct parse *p; 815 register cset *cs; 816 { 817 register char *sp = p->next; 818 register struct cclass *cp; 819 register size_t len; 820 register char *u; 821 register char c; 822 823 while (MORE() && isalpha(PEEK())) 824 NEXT(); 825 len = p->next - sp; 826 for (cp = cclasses; cp->name != NULL; cp++) 827 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 828 break; 829 if (cp->name == NULL) { 830 /* oops, didn't find it */ 831 SETERROR(REG_ECTYPE); 832 return; 833 } 834 835 u = cp->chars; 836 while ((c = *u++) != '\0') 837 CHadd(cs, c); 838 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 839 MCadd(p, cs, u); 840 } 841 842 /* 843 - p_b_eclass - parse an equivalence-class name and deal with it 844 == static void p_b_eclass(register struct parse *p, register cset *cs); 845 * 846 * This implementation is incomplete. xxx 847 */ 848 static void 849 p_b_eclass(p, cs) 850 register struct parse *p; 851 register cset *cs; 852 { 853 register char c; 854 855 c = p_b_coll_elem(p, '='); 856 CHadd(cs, c); 857 } 858 859 /* 860 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 861 == static char p_b_symbol(register struct parse *p); 862 */ 863 static char /* value of symbol */ 864 p_b_symbol(p) 865 register struct parse *p; 866 { 867 register char value; 868 869 REQUIRE(MORE(), REG_EBRACK); 870 if (!EATTWO('[', '.')) 871 return(GETNEXT()); 872 873 /* collating symbol */ 874 value = p_b_coll_elem(p, '.'); 875 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 876 return(value); 877 } 878 879 /* 880 - p_b_coll_elem - parse a collating-element name and look it up 881 == static char p_b_coll_elem(register struct parse *p, int endc); 882 */ 883 static char /* value of collating element */ 884 p_b_coll_elem(p, endc) 885 register struct parse *p; 886 int endc; /* name ended by endc,']' */ 887 { 888 register char *sp = p->next; 889 register struct cname *cp; 890 register int len; 891 892 while (MORE() && !SEETWO(endc, ']')) 893 NEXT(); 894 if (!MORE()) { 895 SETERROR(REG_EBRACK); 896 return(0); 897 } 898 len = p->next - sp; 899 for (cp = cnames; cp->name != NULL; cp++) 900 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 901 return(cp->code); /* known name */ 902 if (len == 1) 903 return(*sp); /* single character */ 904 SETERROR(REG_ECOLLATE); /* neither */ 905 return(0); 906 } 907 908 /* 909 - othercase - return the case counterpart of an alphabetic 910 == static char othercase(int ch); 911 */ 912 static char /* if no counterpart, return ch */ 913 othercase(ch) 914 int ch; 915 { 916 assert(isalpha(ch)); 917 if (isupper(ch)) 918 return(tolower(ch)); 919 else if (islower(ch)) 920 return(toupper(ch)); 921 else /* peculiar, but could happen */ 922 return(ch); 923 } 924 925 /* 926 - bothcases - emit a dualcase version of a two-case character 927 == static void bothcases(register struct parse *p, int ch); 928 * 929 * Boy, is this implementation ever a kludge... 930 */ 931 static void 932 bothcases(p, ch) 933 register struct parse *p; 934 int ch; 935 { 936 register char *oldnext = p->next; 937 register char *oldend = p->end; 938 char bracket[3]; 939 940 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 941 p->next = bracket; 942 p->end = bracket+2; 943 bracket[0] = ch; 944 bracket[1] = ']'; 945 bracket[2] = '\0'; 946 p_bracket(p); 947 assert(p->next == bracket+2); 948 p->next = oldnext; 949 p->end = oldend; 950 } 951 952 /* 953 - ordinary - emit an ordinary character 954 == static void ordinary(register struct parse *p, register int ch); 955 */ 956 static void 957 ordinary(p, ch) 958 register struct parse *p; 959 register int ch; 960 { 961 register cat_t *cap = p->g->categories; 962 963 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch) 964 bothcases(p, ch); 965 else { 966 EMIT(OCHAR, (unsigned char)ch); 967 if (cap[ch] == 0) 968 cap[ch] = p->g->ncategories++; 969 } 970 } 971 972 /* 973 - nonnewline - emit REG_NEWLINE version of OANY 974 == static void nonnewline(register struct parse *p); 975 * 976 * Boy, is this implementation ever a kludge... 977 */ 978 static void 979 nonnewline(p) 980 register struct parse *p; 981 { 982 register char *oldnext = p->next; 983 register char *oldend = p->end; 984 char bracket[4]; 985 986 p->next = bracket; 987 p->end = bracket+3; 988 bracket[0] = '^'; 989 bracket[1] = '\n'; 990 bracket[2] = ']'; 991 bracket[3] = '\0'; 992 p_bracket(p); 993 assert(p->next == bracket+3); 994 p->next = oldnext; 995 p->end = oldend; 996 } 997 998 /* 999 - repeat - generate code for a bounded repetition, recursively if needed 1000 == static void repeat(register struct parse *p, sopno start, int from, int to); 1001 */ 1002 static void 1003 repeat(p, start, from, to) 1004 register struct parse *p; 1005 sopno start; /* operand from here to end of strip */ 1006 int from; /* repeated from this number */ 1007 int to; /* to this number of times (maybe INFINITY) */ 1008 { 1009 register sopno finish = HERE(); 1010 # define N 2 1011 # define INF 3 1012 # define REP(f, t) ((f)*8 + (t)) 1013 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1014 register sopno copy; 1015 1016 if (p->error != 0) /* head off possible runaway recursion */ 1017 return; 1018 1019 assert(from <= to); 1020 1021 switch (REP(MAP(from), MAP(to))) { 1022 case REP(0, 0): /* must be user doing this */ 1023 DROP(finish-start); /* drop the operand */ 1024 break; 1025 case REP(0, 1): /* as x{1,1}? */ 1026 case REP(0, N): /* as x{1,n}? */ 1027 case REP(0, INF): /* as x{1,}? */ 1028 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1029 INSERT(OCH_, start); /* offset is wrong... */ 1030 repeat(p, start+1, 1, to); 1031 ASTERN(OOR1, start); 1032 AHEAD(start); /* ... fix it */ 1033 EMIT(OOR2, 0); 1034 AHEAD(THERE()); 1035 ASTERN(O_CH, THERETHERE()); 1036 break; 1037 case REP(1, 1): /* trivial case */ 1038 /* done */ 1039 break; 1040 case REP(1, N): /* as x?x{1,n-1} */ 1041 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1042 INSERT(OCH_, start); 1043 ASTERN(OOR1, start); 1044 AHEAD(start); 1045 EMIT(OOR2, 0); /* offset very wrong... */ 1046 AHEAD(THERE()); /* ...so fix it */ 1047 ASTERN(O_CH, THERETHERE()); 1048 copy = dupl(p, start+1, finish+1); 1049 assert(copy == finish+4); 1050 repeat(p, copy, 1, to-1); 1051 break; 1052 case REP(1, INF): /* as x+ */ 1053 INSERT(OPLUS_, start); 1054 ASTERN(O_PLUS, start); 1055 break; 1056 case REP(N, N): /* as xx{m-1,n-1} */ 1057 copy = dupl(p, start, finish); 1058 repeat(p, copy, from-1, to-1); 1059 break; 1060 case REP(N, INF): /* as xx{n-1,INF} */ 1061 copy = dupl(p, start, finish); 1062 repeat(p, copy, from-1, to); 1063 break; 1064 default: /* "can't happen" */ 1065 SETERROR(REG_ASSERT); /* just in case */ 1066 break; 1067 } 1068 } 1069 1070 /* 1071 - seterr - set an error condition 1072 == static int seterr(register struct parse *p, int e); 1073 */ 1074 static int /* useless but makes type checking happy */ 1075 seterr(p, e) 1076 register struct parse *p; 1077 int e; 1078 { 1079 if (p->error == 0) /* keep earliest error condition */ 1080 p->error = e; 1081 p->next = nuls; /* try to bring things to a halt */ 1082 p->end = nuls; 1083 return(0); /* make the return value well-defined */ 1084 } 1085 1086 /* 1087 - allocset - allocate a set of characters for [] 1088 == static cset *allocset(register struct parse *p); 1089 */ 1090 static cset * 1091 allocset(p) 1092 register struct parse *p; 1093 { 1094 register int no = p->g->ncsets++; 1095 register size_t nc; 1096 register size_t nbytes; 1097 register cset *cs; 1098 register size_t css = (size_t)p->g->csetsize; 1099 register int i; 1100 1101 if (no >= p->ncsalloc) { /* need another column of space */ 1102 p->ncsalloc += CHAR_BIT; 1103 nc = p->ncsalloc; 1104 assert(nc % CHAR_BIT == 0); 1105 nbytes = nc / CHAR_BIT * css; 1106 if (p->g->sets == NULL) 1107 p->g->sets = (cset *)malloc(nc * sizeof(cset)); 1108 else 1109 p->g->sets = (cset *)realloc((char *)p->g->sets, 1110 nc * sizeof(cset)); 1111 if (p->g->setbits == NULL) 1112 p->g->setbits = (uch *)malloc(nbytes); 1113 else { 1114 p->g->setbits = (uch *)realloc((char *)p->g->setbits, 1115 nbytes); 1116 /* xxx this isn't right if setbits is now NULL */ 1117 for (i = 0; i < no; i++) 1118 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1119 } 1120 if (p->g->sets != NULL && p->g->setbits != NULL) 1121 (void) memset((char *)p->g->setbits + (nbytes - css), 1122 0, css); 1123 else { 1124 no = 0; 1125 SETERROR(REG_ESPACE); 1126 /* caller's responsibility not to do set ops */ 1127 } 1128 } 1129 1130 assert(p->g->sets != NULL); /* xxx */ 1131 cs = &p->g->sets[no]; 1132 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1133 cs->mask = 1 << ((no) % CHAR_BIT); 1134 cs->hash = 0; 1135 cs->smultis = 0; 1136 cs->multis = NULL; 1137 1138 return(cs); 1139 } 1140 1141 /* 1142 - freeset - free a now-unused set 1143 == static void freeset(register struct parse *p, register cset *cs); 1144 */ 1145 static void 1146 freeset(p, cs) 1147 register struct parse *p; 1148 register cset *cs; 1149 { 1150 register int i; 1151 register cset *top = &p->g->sets[p->g->ncsets]; 1152 register size_t css = (size_t)p->g->csetsize; 1153 1154 for (i = 0; i < css; i++) 1155 CHsub(cs, i); 1156 if (cs == top-1) /* recover only the easy case */ 1157 p->g->ncsets--; 1158 } 1159 1160 /* 1161 - freezeset - final processing on a set of characters 1162 == static int freezeset(register struct parse *p, register cset *cs); 1163 * 1164 * The main task here is merging identical sets. This is usually a waste 1165 * of time (although the hash code minimizes the overhead), but can win 1166 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1167 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1168 * the same value! 1169 */ 1170 static int /* set number */ 1171 freezeset(p, cs) 1172 register struct parse *p; 1173 register cset *cs; 1174 { 1175 register uch h = cs->hash; 1176 register int i; 1177 register cset *top = &p->g->sets[p->g->ncsets]; 1178 register cset *cs2; 1179 register size_t css = (size_t)p->g->csetsize; 1180 1181 /* look for an earlier one which is the same */ 1182 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1183 if (cs2->hash == h && cs2 != cs) { 1184 /* maybe */ 1185 for (i = 0; i < css; i++) 1186 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1187 break; /* no */ 1188 if (i == css) 1189 break; /* yes */ 1190 } 1191 1192 if (cs2 < top) { /* found one */ 1193 freeset(p, cs); 1194 cs = cs2; 1195 } 1196 1197 return((int)(cs - p->g->sets)); 1198 } 1199 1200 /* 1201 - firstch - return first character in a set (which must have at least one) 1202 == static int firstch(register struct parse *p, register cset *cs); 1203 */ 1204 static int /* character; there is no "none" value */ 1205 firstch(p, cs) 1206 register struct parse *p; 1207 register cset *cs; 1208 { 1209 register int i; 1210 register size_t css = (size_t)p->g->csetsize; 1211 1212 for (i = 0; i < css; i++) 1213 if (CHIN(cs, i)) 1214 return((char)i); 1215 assert(never); 1216 return(0); /* arbitrary */ 1217 } 1218 1219 /* 1220 - nch - number of characters in a set 1221 == static int nch(register struct parse *p, register cset *cs); 1222 */ 1223 static int 1224 nch(p, cs) 1225 register struct parse *p; 1226 register cset *cs; 1227 { 1228 register int i; 1229 register size_t css = (size_t)p->g->csetsize; 1230 register int n = 0; 1231 1232 for (i = 0; i < css; i++) 1233 if (CHIN(cs, i)) 1234 n++; 1235 return(n); 1236 } 1237 1238 /* 1239 - mcadd - add a collating element to a cset 1240 == static void mcadd(register struct parse *p, register cset *cs, \ 1241 == register char *cp); 1242 */ 1243 static void 1244 mcadd(p, cs, cp) 1245 register struct parse *p; 1246 register cset *cs; 1247 register char *cp; 1248 { 1249 register size_t oldend = cs->smultis; 1250 1251 cs->smultis += strlen(cp) + 1; 1252 if (cs->multis == NULL) 1253 cs->multis = malloc(cs->smultis); 1254 else 1255 cs->multis = realloc(cs->multis, cs->smultis); 1256 if (cs->multis == NULL) { 1257 SETERROR(REG_ESPACE); 1258 return; 1259 } 1260 1261 (void) strcpy(cs->multis + oldend - 1, cp); 1262 cs->multis[cs->smultis - 1] = '\0'; 1263 } 1264 1265 /* 1266 - mcsub - subtract a collating element from a cset 1267 == static void mcsub(register cset *cs, register char *cp); 1268 */ 1269 static void 1270 mcsub(cs, cp) 1271 register cset *cs; 1272 register char *cp; 1273 { 1274 register char *fp = mcfind(cs, cp); 1275 register size_t len = strlen(fp); 1276 1277 assert(fp != NULL); 1278 (void) memmove(fp, fp + len + 1, 1279 cs->smultis - (fp + len + 1 - cs->multis)); 1280 cs->smultis -= len; 1281 1282 if (cs->smultis == 0) { 1283 free(cs->multis); 1284 cs->multis = NULL; 1285 return; 1286 } 1287 1288 cs->multis = realloc(cs->multis, cs->smultis); 1289 assert(cs->multis != NULL); 1290 } 1291 1292 /* 1293 - mcin - is a collating element in a cset? 1294 == static int mcin(register cset *cs, register char *cp); 1295 */ 1296 static int 1297 mcin(cs, cp) 1298 register cset *cs; 1299 register char *cp; 1300 { 1301 return(mcfind(cs, cp) != NULL); 1302 } 1303 1304 /* 1305 - mcfind - find a collating element in a cset 1306 == static char *mcfind(register cset *cs, register char *cp); 1307 */ 1308 static char * 1309 mcfind(cs, cp) 1310 register cset *cs; 1311 register char *cp; 1312 { 1313 register char *p; 1314 1315 if (cs->multis == NULL) 1316 return(NULL); 1317 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1318 if (strcmp(cp, p) == 0) 1319 return(p); 1320 return(NULL); 1321 } 1322 1323 /* 1324 - mcinvert - invert the list of collating elements in a cset 1325 == static void mcinvert(register struct parse *p, register cset *cs); 1326 * 1327 * This would have to know the set of possibilities. Implementation 1328 * is deferred. 1329 */ 1330 static void 1331 mcinvert(p, cs) 1332 register struct parse *p; 1333 register cset *cs; 1334 { 1335 assert(cs->multis == NULL); /* xxx */ 1336 } 1337 1338 /* 1339 - mccase - add case counterparts of the list of collating elements in a cset 1340 == static void mccase(register struct parse *p, register cset *cs); 1341 * 1342 * This would have to know the set of possibilities. Implementation 1343 * is deferred. 1344 */ 1345 static void 1346 mccase(p, cs) 1347 register struct parse *p; 1348 register cset *cs; 1349 { 1350 assert(cs->multis == NULL); /* xxx */ 1351 } 1352 1353 /* 1354 - isinsets - is this character in any sets? 1355 == static int isinsets(register struct re_guts *g, int c); 1356 */ 1357 static int /* predicate */ 1358 isinsets(g, c) 1359 register struct re_guts *g; 1360 int c; 1361 { 1362 register uch *col; 1363 register int i; 1364 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1365 register unsigned uc = (unsigned char)c; 1366 1367 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1368 if (col[uc] != 0) 1369 return(1); 1370 return(0); 1371 } 1372 1373 /* 1374 - samesets - are these two characters in exactly the same sets? 1375 == static int samesets(register struct re_guts *g, int c1, int c2); 1376 */ 1377 static int /* predicate */ 1378 samesets(g, c1, c2) 1379 register struct re_guts *g; 1380 int c1; 1381 int c2; 1382 { 1383 register uch *col; 1384 register int i; 1385 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1386 register unsigned uc1 = (unsigned char)c1; 1387 register unsigned uc2 = (unsigned char)c2; 1388 1389 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1390 if (col[uc1] != col[uc2]) 1391 return(0); 1392 return(1); 1393 } 1394 1395 /* 1396 - categorize - sort out character categories 1397 == static void categorize(struct parse *p, register struct re_guts *g); 1398 */ 1399 static void 1400 categorize(p, g) 1401 struct parse *p; 1402 register struct re_guts *g; 1403 { 1404 register cat_t *cats = g->categories; 1405 register int c; 1406 register int c2; 1407 register cat_t cat; 1408 1409 /* avoid making error situations worse */ 1410 if (p->error != 0) 1411 return; 1412 1413 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1414 if (cats[c] == 0 && isinsets(g, c)) { 1415 cat = g->ncategories++; 1416 cats[c] = cat; 1417 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1418 if (cats[c2] == 0 && samesets(g, c, c2)) 1419 cats[c2] = cat; 1420 } 1421 } 1422 1423 /* 1424 - dupl - emit a duplicate of a bunch of sops 1425 == static sopno dupl(register struct parse *p, sopno start, sopno finish); 1426 */ 1427 static sopno /* start of duplicate */ 1428 dupl(p, start, finish) 1429 register struct parse *p; 1430 sopno start; /* from here */ 1431 sopno finish; /* to this less one */ 1432 { 1433 register sopno ret = HERE(); 1434 register sopno len = finish - start; 1435 1436 assert(finish >= start); 1437 if (len == 0) 1438 return(ret); 1439 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1440 assert(p->ssize >= p->slen + len); 1441 (void) memcpy((char *)(p->strip + p->slen), 1442 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1443 p->slen += len; 1444 return(ret); 1445 } 1446 1447 /* 1448 - doemit - emit a strip operator 1449 == static void doemit(register struct parse *p, sop op, size_t opnd); 1450 * 1451 * It might seem better to implement this as a macro with a function as 1452 * hard-case backup, but it's just too big and messy unless there are 1453 * some changes to the data structures. Maybe later. 1454 */ 1455 static void 1456 doemit(p, op, opnd) 1457 register struct parse *p; 1458 sop op; 1459 size_t opnd; 1460 { 1461 /* avoid making error situations worse */ 1462 if (p->error != 0) 1463 return; 1464 1465 /* deal with oversize operands ("can't happen", more or less) */ 1466 assert(opnd < 1<<OPSHIFT); 1467 1468 /* deal with undersized strip */ 1469 if (p->slen >= p->ssize) 1470 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1471 assert(p->slen < p->ssize); 1472 1473 /* finally, it's all reduced to the easy case */ 1474 p->strip[p->slen++] = SOP(op, opnd); 1475 } 1476 1477 /* 1478 - doinsert - insert a sop into the strip 1479 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); 1480 */ 1481 static void 1482 doinsert(p, op, opnd, pos) 1483 register struct parse *p; 1484 sop op; 1485 size_t opnd; 1486 sopno pos; 1487 { 1488 register sopno sn; 1489 register sop s; 1490 register int i; 1491 1492 /* avoid making error situations worse */ 1493 if (p->error != 0) 1494 return; 1495 1496 sn = HERE(); 1497 EMIT(op, opnd); /* do checks, ensure space */ 1498 assert(HERE() == sn+1); 1499 s = p->strip[sn]; 1500 1501 /* adjust paren pointers */ 1502 assert(pos > 0); 1503 for (i = 1; i < NPAREN; i++) { 1504 if (p->pbegin[i] >= pos) { 1505 p->pbegin[i]++; 1506 } 1507 if (p->pend[i] >= pos) { 1508 p->pend[i]++; 1509 } 1510 } 1511 1512 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1513 (HERE()-pos-1)*sizeof(sop)); 1514 p->strip[pos] = s; 1515 } 1516 1517 /* 1518 - dofwd - complete a forward reference 1519 == static void dofwd(register struct parse *p, sopno pos, sop value); 1520 */ 1521 static void 1522 dofwd(p, pos, value) 1523 register struct parse *p; 1524 register sopno pos; 1525 sop value; 1526 { 1527 /* avoid making error situations worse */ 1528 if (p->error != 0) 1529 return; 1530 1531 assert(value < 1<<OPSHIFT); 1532 p->strip[pos] = OP(p->strip[pos]) | value; 1533 } 1534 1535 /* 1536 - enlarge - enlarge the strip 1537 == static void enlarge(register struct parse *p, sopno size); 1538 */ 1539 static void 1540 enlarge(p, size) 1541 register struct parse *p; 1542 register sopno size; 1543 { 1544 register sop *sp; 1545 1546 if (p->ssize >= size) 1547 return; 1548 1549 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1550 if (sp == NULL) { 1551 SETERROR(REG_ESPACE); 1552 return; 1553 } 1554 p->strip = sp; 1555 p->ssize = size; 1556 } 1557 1558 /* 1559 - stripsnug - compact the strip 1560 == static void stripsnug(register struct parse *p, register struct re_guts *g); 1561 */ 1562 static void 1563 stripsnug(p, g) 1564 register struct parse *p; 1565 register struct re_guts *g; 1566 { 1567 g->nstates = p->slen; 1568 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1569 if (g->strip == NULL) { 1570 SETERROR(REG_ESPACE); 1571 g->strip = p->strip; 1572 } 1573 } 1574 1575 /* 1576 - findmust - fill in must and mlen with longest mandatory literal string 1577 == static void findmust(register struct parse *p, register struct re_guts *g); 1578 * 1579 * This algorithm could do fancy things like analyzing the operands of | 1580 * for common subsequences. Someday. This code is simple and finds most 1581 * of the interesting cases. 1582 * 1583 * Note that must and mlen got initialized during setup. 1584 */ 1585 static void 1586 findmust(p, g) 1587 struct parse *p; 1588 register struct re_guts *g; 1589 { 1590 register sop *scan; 1591 sop *start; 1592 register sop *newstart; 1593 register sopno newlen; 1594 register sop s; 1595 register char *cp; 1596 register sopno i; 1597 1598 /* avoid making error situations worse */ 1599 if (p->error != 0) 1600 return; 1601 1602 /* find the longest OCHAR sequence in strip */ 1603 newlen = 0; 1604 scan = g->strip + 1; 1605 do { 1606 s = *scan++; 1607 switch (OP(s)) { 1608 case OCHAR: /* sequence member */ 1609 if (newlen == 0) /* new sequence */ 1610 newstart = scan - 1; 1611 newlen++; 1612 break; 1613 case OPLUS_: /* things that don't break one */ 1614 case OLPAREN: 1615 case ORPAREN: 1616 break; 1617 case OQUEST_: /* things that must be skipped */ 1618 case OCH_: 1619 scan--; 1620 do { 1621 scan += OPND(s); 1622 s = *scan; 1623 /* assert() interferes w debug printouts */ 1624 if (OP(s) != O_QUEST && OP(s) != O_CH && 1625 OP(s) != OOR2) { 1626 g->iflags |= BAD; 1627 return; 1628 } 1629 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1630 /* fallthrough */ 1631 default: /* things that break a sequence */ 1632 if (newlen > g->mlen) { /* ends one */ 1633 start = newstart; 1634 g->mlen = newlen; 1635 } 1636 newlen = 0; 1637 break; 1638 } 1639 } while (OP(s) != OEND); 1640 1641 if (g->mlen == 0) /* there isn't one */ 1642 return; 1643 1644 /* turn it into a character string */ 1645 g->must = malloc((size_t)g->mlen + 1); 1646 if (g->must == NULL) { /* argh; just forget it */ 1647 g->mlen = 0; 1648 return; 1649 } 1650 cp = g->must; 1651 scan = start; 1652 for (i = g->mlen; i > 0; i--) { 1653 while (OP(s = *scan++) != OCHAR) 1654 continue; 1655 assert(cp < g->must + g->mlen); 1656 *cp++ = (char)OPND(s); 1657 } 1658 assert(cp == g->must + g->mlen); 1659 *cp++ = '\0'; /* just on general principles */ 1660 } 1661 1662 /* 1663 - pluscount - count + nesting 1664 == static sopno pluscount(register struct parse *p, register struct re_guts *g); 1665 */ 1666 static sopno /* nesting depth */ 1667 pluscount(p, g) 1668 struct parse *p; 1669 register struct re_guts *g; 1670 { 1671 register sop *scan; 1672 register sop s; 1673 register sopno plusnest = 0; 1674 register sopno maxnest = 0; 1675 1676 if (p->error != 0) 1677 return(0); /* there may not be an OEND */ 1678 1679 scan = g->strip + 1; 1680 do { 1681 s = *scan++; 1682 switch (OP(s)) { 1683 case OPLUS_: 1684 plusnest++; 1685 break; 1686 case O_PLUS: 1687 if (plusnest > maxnest) 1688 maxnest = plusnest; 1689 plusnest--; 1690 break; 1691 } 1692 } while (OP(s) != OEND); 1693 if (plusnest != 0) 1694 g->iflags |= BAD; 1695 return(maxnest); 1696 } 1697